/[webpac2]/trunk/lib/WebPAC/Normalize.pm
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Diff of /trunk/lib/WebPAC/Normalize.pm

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 13 by dpavlin, Sat Jul 16 23:56:14 2005 UTC revision 18 by dpavlin, Sun Jul 17 14:53:37 2005 UTC
# Line 6  use Data::Dumper; Line 6  use Data::Dumper;
6    
7  =head1 NAME  =head1 NAME
8    
9  WebPAC::Normalize - normalisation of source file  WebPAC::Normalize - data mungling for normalisation
10    
11  =head1 VERSION  =head1 VERSION
12    
# Line 18  our $VERSION = '0.01'; Line 18  our $VERSION = '0.01';
18    
19  =head1 SYNOPSIS  =head1 SYNOPSIS
20    
21  This package contains code that could be helpful in implementing different  This package contains code that mungle data to produce normalized format.
22  normalisation front-ends.  
23    It contains several assumptions:
24    
25    =over
26    
27    =item *
28    
29    format of fields is defined using C<v123^a> notation for repeatable fields
30    or C<s123^a> for single (or first) value, where C<123> is field number and
31    C<a> is subfield.
32    
33    =item *
34    
35    source data records (C<$rec>) have unique identifiers in field C<000>
36    
37    =item *
38    
39    optional C<eval{length('v123^a') == 3}> tag at B<beginning of format> will be
40    perl code that is evaluated before producing output (value of field will be
41    interpolated before that)
42    
43    =item *
44    
45    optional C<filter{filter_name}> at B<begining of format> will apply perl
46    code defined as code ref on format after field substitution to producing
47    output
48    
49    =item *
50    
51    optional C<lookup{...}> will be then performed. See C<WebPAC::Lookups>.
52    
53    =item *
54    
55    at end, optional C<format>s rules are resolved. Format rules are similar to
56    C<sprintf> and can also contain C<lookup{...}> which is performed after
57    values are inserted in format.
58    
59    =back
60    
61    This also describes order in which transformations are applied (eval,
62    filter, lookup, format) which is important to undestand when deciding how to
63    solve your data mungling and normalisation process.
64    
65    
66    
67    
68  =head1 FUNCTIONS  =head1 FUNCTIONS
69    
# Line 28  normalisation front-ends. Line 72  normalisation front-ends.
72  Create new normalisation object  Create new normalisation object
73    
74    my $n = new WebPAC::Normalize::Something(    my $n = new WebPAC::Normalize::Something(
75          cache_data_structure => './cache/ds/',          filter => {
76                    'filter_name_1' => sub {
77                            # filter code
78                            return length($_);
79                    }, ...
80            },
81            db => $webpac_db_obj,
82          lookup_regex => $lookup->regex,          lookup_regex => $lookup->regex,
83    );    );
84    
85  Optional parameter C<cache_data_structure> defines path to directory  Parametar C<filter> defines user supplied snippets of perl code which can
86  in which cache file for C<data_structure> call will be created.  be use with C<filter{...}> notation.
87    
88  Recommended parametar C<lookup_regex> is used to enable parsing of lookups  Recommended parametar C<lookup_regex> is used to enable parsing of lookups
89  in structures.  in structures.
# Line 45  sub new { Line 95  sub new {
95          my $self = {@_};          my $self = {@_};
96          bless($self, $class);          bless($self, $class);
97    
         $self->setup_cache_dir( $self->{'cache_data_structure'} );  
   
98          $self ? return $self : return undef;          $self ? return $self : return undef;
99  }  }
100    
 =head2 setup_cache_dir  
   
 Check if specified cache directory exist, and if not, disable caching.  
   
  $setup_cache_dir('./cache/ds/');  
   
 If you pass false or zero value to this function, it will disable  
 cacheing.  
   
 =cut  
   
 sub setup_cache_dir {  
         my $self = shift;  
   
         my $dir = shift;  
   
         my $log = $self->_get_logger();  
   
         if ($dir) {  
                 my $msg;  
                 if (! -e $dir) {  
                         $msg = "doesn't exist";  
                 } elsif (! -d $dir) {  
                         $msg = "is not directory";  
                 } elsif (! -w $dir) {  
                         $msg = "not writable";  
                 }  
   
                 if ($msg) {  
                         undef $self->{'cache_data_structure'};  
                         $log->warn("cache_data_structure $dir $msg, disabling...");  
                 } else {  
                         $log->debug("using cache dir $dir");  
                 }  
         } else {  
                 $log->debug("disabling cache");  
                 undef $self->{'cache_data_structure'};  
         }  
 }  
   
101    
102  =head2 data_structure  =head2 data_structure
103    
# Line 118  sub data_structure { Line 126  sub data_structure {
126    
127          my $cache_file;          my $cache_file;
128    
129          if (my $cache_path = $self->{'cache_data_structure'}) {          if ($self->{'db'}) {
130                  my $id = $rec->{'000'};                  my @ds = $self->{'db'}->get_ds($rec);
131                  $id = $rec->{'000'}->[0] if ($id =~ m/^ARRAY/o);                  return @ds if (@ds);
                 unless (defined($id)) {  
                         $log->warn("Can't use cache_data_structure on records without unique identifier in field 000");  
                         undef $self->{'cache_data_structure'};  
                 } else {  
                         $cache_file = "$cache_path/$id";  
                         if (-r $cache_file) {  
                                 my $ds_ref = retrieve($cache_file);  
                                 if ($ds_ref) {  
                                         $log->debug("cache hit: $cache_file");  
                                         my $ok = 1;  
                                         foreach my $f (qw(current_filename headline)) {  
                                                 if ($ds_ref->{$f}) {  
                                                         $self->{$f} = $ds_ref->{$f};  
                                                 } else {  
                                                         $ok = 0;  
                                                 }  
                                         };  
                                         if ($ok && $ds_ref->{'ds'}) {  
                                                 return @{ $ds_ref->{'ds'} };  
                                         } else {  
                                                 $log->warn("cache_data_structure $cache_path corrupt. Use rm $cache_path/* to re-create it on next run!");  
                                                 undef $self->{'cache_data_structure'};  
                                         }  
                                 }  
                         }  
                 }  
132          }          }
133    
134          undef $self->{'currnet_filename'};          undef $self->{'currnet_filename'};
# Line 254  sub data_structure { Line 236  sub data_structure {
236    
237          }          }
238    
239          if ($cache_file) {          $self->{'db'}->put_gs(
240                  store {                  ds => \@ds,
241                          ds => \@ds,                  current_filename => $self->{'current_filename'},
242                          current_filename => $self->{'current_filename'},                  headline => $self->{'headline'},
243                          headline => $self->{'headline'},          ) if ($self->{'db'});
                 }, $cache_file;  
                 $log->debug("created storable cache file $cache_file");  
         }  
244    
245          return @ds;          return @ds;
246    
247  }  }
248    
 =head2 apply_format  
   
 Apply format specified in tag with C<format_name="name"> and  
 C<format_delimiter=";;">.  
   
  my $text = $webpac->apply_format($format_name,$format_delimiter,$data);  
   
 Formats can contain C<lookup{...}> if you need them.  
   
 =cut  
   
 sub apply_format {  
         my $self = shift;  
   
         my ($name,$delimiter,$data) = @_;  
   
         my $log = $self->_get_logger();  
   
         if (! $self->{'import_xml'}->{'format'}->{$name}) {  
                 $log->warn("<format name=\"$name\"> is not defined in ",$self->{'import_xml_file'});  
                 return $data;  
         }  
   
         $log->warn("no delimiter for format $name") if (! $delimiter);  
   
         my $format = $self->_x($self->{'import_xml'}->{'format'}->{$name}->{'content'}) || $log->logdie("can't find format '$name'");  
   
         my @data = split(/\Q$delimiter\E/, $data);  
   
         my $out = sprintf($format, @data);  
         $log->debug("using format $name [$format] on $data to produce: $out");  
   
         if ($self->{'lookup_regex'} && $out =~ $self->{'lookup_regex'}) {  
                 return $self->lookup($out);  
         } else {  
                 return $out;  
         }  
   
 }  
   
249  =head2 parse  =head2 parse
250    
251  Perform smart parsing of string, skipping delimiters for fields which aren't  Perform smart parsing of string, skipping delimiters for fields which aren't
# Line 426  sub parse_to_arr { Line 365  sub parse_to_arr {
365          return @arr;          return @arr;
366  }  }
367    
368    
369    =head2 fill_in
370    
371    Workhourse of all: takes record from in-memory structure of database and
372    strings with placeholders and returns string or array of with substituted
373    values from record.
374    
375     my $text = $webpac->fill_in($rec,'v250^a');
376    
377    Optional argument is ordinal number for repeatable fields. By default,
378    it's assume to be first repeatable field (fields are perl array, so first
379    element is 0).
380    Following example will read second value from repeatable field.
381    
382     my $text = $webpac->fill_in($rec,'Title: v250^a',1);
383    
384    This function B<does not> perform parsing of format to inteligenty skip
385    delimiters before fields which aren't used.
386    
387    This method will automatically decode UTF-8 string to local code page
388    if needed.
389    
390    =cut
391    
392    sub fill_in {
393            my $self = shift;
394    
395            my $log = $self->_get_logger();
396    
397            my $rec = shift || $log->logconfess("need data record");
398            my $format = shift || $log->logconfess("need format to parse");
399            # iteration (for repeatable fields)
400            my $i = shift || 0;
401    
402            $log->logdie("infitite loop in format $format") if ($i > ($self->{'max_mfn'} || 9999));
403    
404            # FIXME remove for speedup?
405            $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
406    
407            if (utf8::is_utf8($format)) {
408                    $format = $self->_x($format);
409            }
410    
411            my $found = 0;
412    
413            my $eval_code;
414            # remove eval{...} from beginning
415            $eval_code = $1 if ($format =~ s/^eval{([^}]+)}//s);
416    
417            my $filter_name;
418            # remove filter{...} from beginning
419            $filter_name = $1 if ($format =~ s/^filter{([^}]+)}//s);
420    
421            # do actual replacement of placeholders
422            # repeatable fields
423            $format =~ s/v(\d+)(?:\^(\w))?/$self->get_data(\$rec,$1,$2,$i,\$found)/ges;
424            # non-repeatable fields
425            $format =~ s/s(\d+)(?:\^(\w))?/$self->get_data(\$rec,$1,$2,0,\$found)/ges;
426    
427            if ($found) {
428                    $log->debug("format: $format");
429                    if ($eval_code) {
430                            my $eval = $self->fill_in($rec,$eval_code,$i);
431                            return if (! $self->_eval($eval));
432                    }
433                    if ($filter_name && $self->{'filter'}->{$filter_name}) {
434                            $log->debug("filter '$filter_name' for $format");
435                            $format = $self->{'filter'}->{$filter_name}->($format);
436                            return unless(defined($format));
437                            $log->debug("filter result: $format");
438                    }
439                    # do we have lookups?
440                    if ($self->{'lookup'}) {
441                            return $self->lookup($format);
442                    } else {
443                            return $format;
444                    }
445            } else {
446                    return;
447            }
448    }
449    
450    
451  =head2 fill_in_to_arr  =head2 fill_in_to_arr
452    
453  Similar to C<fill_in>, but returns array of all repeatable fields. Usable  Similar to C<fill_in>, but returns array of all repeatable fields. Usable
# Line 458  sub fill_in_to_arr { Line 480  sub fill_in_to_arr {
480          return @arr;          return @arr;
481  }  }
482    
483    
484    =head2 get_data
485    
486    Returns value from record.
487    
488     my $text = $self->get_data(\$rec,$f,$sf,$i,\$found);
489    
490    Arguments are:
491    record reference C<$rec>,
492    field C<$f>,
493    optional subfiled C<$sf>,
494    index for repeatable values C<$i>.
495    
496    Optinal variable C<$found> will be incremeted if there
497    is field.
498    
499    Returns value or empty string.
500    
501    =cut
502    
503    sub get_data {
504            my $self = shift;
505    
506            my ($rec,$f,$sf,$i,$found) = @_;
507    
508            if ($$rec->{$f}) {
509                    return '' if (! $$rec->{$f}->[$i]);
510                    no strict 'refs';
511                    if ($sf && $$rec->{$f}->[$i]->{$sf}) {
512                            $$found++ if (defined($$found));
513                            return $$rec->{$f}->[$i]->{$sf};
514                    } elsif ($$rec->{$f}->[$i]) {
515                            $$found++ if (defined($$found));
516                            # it still might have subfield, just
517                            # not specified, so we'll dump all
518                            if ($$rec->{$f}->[$i] =~ /HASH/o) {
519                                    my $out;
520                                    foreach my $k (keys %{$$rec->{$f}->[$i]}) {
521                                            $out .= $$rec->{$f}->[$i]->{$k}." ";
522                                    }
523                                    return $out;
524                            } else {
525                                    return $$rec->{$f}->[$i];
526                            }
527                    }
528            } else {
529                    return '';
530            }
531    }
532    
533    
534    =head2 apply_format
535    
536    Apply format specified in tag with C<format_name="name"> and
537    C<format_delimiter=";;">.
538    
539     my $text = $webpac->apply_format($format_name,$format_delimiter,$data);
540    
541    Formats can contain C<lookup{...}> if you need them.
542    
543    =cut
544    
545    sub apply_format {
546            my $self = shift;
547    
548            my ($name,$delimiter,$data) = @_;
549    
550            my $log = $self->_get_logger();
551    
552            if (! $self->{'import_xml'}->{'format'}->{$name}) {
553                    $log->warn("<format name=\"$name\"> is not defined in ",$self->{'import_xml_file'});
554                    return $data;
555            }
556    
557            $log->warn("no delimiter for format $name") if (! $delimiter);
558    
559            my $format = $self->_x($self->{'import_xml'}->{'format'}->{$name}->{'content'}) || $log->logdie("can't find format '$name'");
560    
561            my @data = split(/\Q$delimiter\E/, $data);
562    
563            my $out = sprintf($format, @data);
564            $log->debug("using format $name [$format] on $data to produce: $out");
565    
566            if ($self->{'lookup_regex'} && $out =~ $self->{'lookup_regex'}) {
567                    return $self->lookup($out);
568            } else {
569                    return $out;
570            }
571    
572    }
573    
574  =head2 sort_arr  =head2 sort_arr
575    
576  Sort array ignoring case and html in data  Sort array ignoring case and html in data
# Line 484  sub sort_arr { Line 597  sub sort_arr {
597  }  }
598    
599    
600    =head1 INTERNAL METHODS
601    
602  =head2 _sort_by_order  =head2 _sort_by_order
603    
604  Sort xml tags data structure accoding to C<order=""> attribute.  Sort xml tags data structure accoding to C<order=""> attribute.
# Line 503  sub _sort_by_order { Line 618  sub _sort_by_order {
618    
619  =head2 _x  =head2 _x
620    
621  Convert strings from C<conf/normalize> encoding into application specific  Convert strings from C<conf/normalize/*.xml> encoding into application
622  (optinally specified using C<code_page> to C<new> constructor.  specific encoding (optinally specified using C<code_page> to C<new>
623    constructor).
624    
625   my $text = $n->_x('normalize text string');   my $text = $n->_x('normalize text string');
626    

Legend:
Removed from v.13  
changed lines
  Added in v.18

  ViewVC Help
Powered by ViewVC 1.1.26