/[webpac2]/trunk/lib/WebPAC/Normalize.pm
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Diff of /trunk/lib/WebPAC/Normalize.pm

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 14 by dpavlin, Sun Jul 17 00:04:25 2005 UTC revision 15 by dpavlin, Sun Jul 17 10:42:23 2005 UTC
# Line 7  use Storable; Line 7  use Storable;
7    
8  =head1 NAME  =head1 NAME
9    
10  WebPAC::Normalize - normalisation of source file  WebPAC::Normalize - data mungling for normalisation
11    
12  =head1 VERSION  =head1 VERSION
13    
# Line 19  our $VERSION = '0.01'; Line 19  our $VERSION = '0.01';
19    
20  =head1 SYNOPSIS  =head1 SYNOPSIS
21    
22  This package contains code that could be helpful in implementing different  This package contains code that mungle data to produce normalized format.
23  normalisation front-ends.  
24    It contains several assumptions:
25    
26    =over
27    
28    =item *
29    
30    format of fields is defined using C<v123^a> notation for repeatable fields
31    or C<s123^a> for single (or first) value, where C<123> is field number and
32    C<a> is subfield.
33    
34    =item *
35    
36    source data records (C<$rec>) have unique identifiers in field C<000>
37    
38    =item *
39    
40    optional C<eval{length('v123^a') == 3}> tag at B<beginning of format> will be
41    perl code that is evaluated before producing output (value of field will be
42    interpolated before that)
43    
44    =item *
45    
46    optional C<filter{filter_name}> at B<begining of format> will apply perl
47    code defined as code ref on format after field substitution to producing
48    output
49    
50    =item *
51    
52    optional C<lookup{...}> will be then performed. See C<WebPAC::Lookups>.
53    
54    =item *
55    
56    at end, optional C<format>s rules are resolved. Format rules are similar to
57    C<sprintf> and can also contain C<lookup{...}> which is performed after
58    values are inserted in format.
59    
60    =back
61    
62    This also describes order in which transformations are applied (eval,
63    filter, lookup, format) which is important to undestand when deciding how to
64    solve your data mungling and normalisation process.
65    
66    
67    
68    
69  =head1 FUNCTIONS  =head1 FUNCTIONS
70    
# Line 29  normalisation front-ends. Line 73  normalisation front-ends.
73  Create new normalisation object  Create new normalisation object
74    
75    my $n = new WebPAC::Normalize::Something(    my $n = new WebPAC::Normalize::Something(
76            filter => {
77                    'filter_name_1' => sub {
78                            # filter code
79                            return length($_);
80                    }, ...
81            },
82          cache_data_structure => './cache/ds/',          cache_data_structure => './cache/ds/',
83          lookup_regex => $lookup->regex,          lookup_regex => $lookup->regex,
84    );    );
85    
86    Parametar C<filter> defines user supplied snippets of perl code which can
87    be use with C<filter{...}> notation.
88    
89  Optional parameter C<cache_data_structure> defines path to directory  Optional parameter C<cache_data_structure> defines path to directory
90  in which cache file for C<data_structure> call will be created.  in which cache file for C<data_structure> call will be created.
91    
# Line 268  sub data_structure { Line 321  sub data_structure {
321    
322  }  }
323    
 =head2 apply_format  
   
 Apply format specified in tag with C<format_name="name"> and  
 C<format_delimiter=";;">.  
   
  my $text = $webpac->apply_format($format_name,$format_delimiter,$data);  
   
 Formats can contain C<lookup{...}> if you need them.  
   
 =cut  
   
 sub apply_format {  
         my $self = shift;  
   
         my ($name,$delimiter,$data) = @_;  
   
         my $log = $self->_get_logger();  
   
         if (! $self->{'import_xml'}->{'format'}->{$name}) {  
                 $log->warn("<format name=\"$name\"> is not defined in ",$self->{'import_xml_file'});  
                 return $data;  
         }  
   
         $log->warn("no delimiter for format $name") if (! $delimiter);  
   
         my $format = $self->_x($self->{'import_xml'}->{'format'}->{$name}->{'content'}) || $log->logdie("can't find format '$name'");  
   
         my @data = split(/\Q$delimiter\E/, $data);  
   
         my $out = sprintf($format, @data);  
         $log->debug("using format $name [$format] on $data to produce: $out");  
   
         if ($self->{'lookup_regex'} && $out =~ $self->{'lookup_regex'}) {  
                 return $self->lookup($out);  
         } else {  
                 return $out;  
         }  
   
 }  
   
324  =head2 parse  =head2 parse
325    
326  Perform smart parsing of string, skipping delimiters for fields which aren't  Perform smart parsing of string, skipping delimiters for fields which aren't
# Line 427  sub parse_to_arr { Line 440  sub parse_to_arr {
440          return @arr;          return @arr;
441  }  }
442    
443    
444    =head2 fill_in
445    
446    Workhourse of all: takes record from in-memory structure of database and
447    strings with placeholders and returns string or array of with substituted
448    values from record.
449    
450     my $text = $webpac->fill_in($rec,'v250^a');
451    
452    Optional argument is ordinal number for repeatable fields. By default,
453    it's assume to be first repeatable field (fields are perl array, so first
454    element is 0).
455    Following example will read second value from repeatable field.
456    
457     my $text = $webpac->fill_in($rec,'Title: v250^a',1);
458    
459    This function B<does not> perform parsing of format to inteligenty skip
460    delimiters before fields which aren't used.
461    
462    This method will automatically decode UTF-8 string to local code page
463    if needed.
464    
465    =cut
466    
467    sub fill_in {
468            my $self = shift;
469    
470            my $log = $self->_get_logger();
471    
472            my $rec = shift || $log->logconfess("need data record");
473            my $format = shift || $log->logconfess("need format to parse");
474            # iteration (for repeatable fields)
475            my $i = shift || 0;
476    
477            $log->logdie("infitite loop in format $format") if ($i > ($self->{'max_mfn'} || 9999));
478    
479            # FIXME remove for speedup?
480            $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
481    
482            if (utf8::is_utf8($format)) {
483                    $format = $self->_x($format);
484            }
485    
486            my $found = 0;
487    
488            my $eval_code;
489            # remove eval{...} from beginning
490            $eval_code = $1 if ($format =~ s/^eval{([^}]+)}//s);
491    
492            my $filter_name;
493            # remove filter{...} from beginning
494            $filter_name = $1 if ($format =~ s/^filter{([^}]+)}//s);
495    
496            # do actual replacement of placeholders
497            # repeatable fields
498            $format =~ s/v(\d+)(?:\^(\w))?/$self->get_data(\$rec,$1,$2,$i,\$found)/ges;
499            # non-repeatable fields
500            $format =~ s/s(\d+)(?:\^(\w))?/$self->get_data(\$rec,$1,$2,0,\$found)/ges;
501    
502            if ($found) {
503                    $log->debug("format: $format");
504                    if ($eval_code) {
505                            my $eval = $self->fill_in($rec,$eval_code,$i);
506                            return if (! $self->_eval($eval));
507                    }
508                    if ($filter_name && $self->{'filter'}->{$filter_name}) {
509                            $log->debug("filter '$filter_name' for $format");
510                            $format = $self->{'filter'}->{$filter_name}->($format);
511                            return unless(defined($format));
512                            $log->debug("filter result: $format");
513                    }
514                    # do we have lookups?
515                    if ($self->{'lookup'}) {
516                            return $self->lookup($format);
517                    } else {
518                            return $format;
519                    }
520            } else {
521                    return;
522            }
523    }
524    
525    
526  =head2 fill_in_to_arr  =head2 fill_in_to_arr
527    
528  Similar to C<fill_in>, but returns array of all repeatable fields. Usable  Similar to C<fill_in>, but returns array of all repeatable fields. Usable
# Line 459  sub fill_in_to_arr { Line 555  sub fill_in_to_arr {
555          return @arr;          return @arr;
556  }  }
557    
558    
559    =head2 get_data
560    
561    Returns value from record.
562    
563     my $text = $self->get_data(\$rec,$f,$sf,$i,\$found);
564    
565    Arguments are:
566    record reference C<$rec>,
567    field C<$f>,
568    optional subfiled C<$sf>,
569    index for repeatable values C<$i>.
570    
571    Optinal variable C<$found> will be incremeted if there
572    is field.
573    
574    Returns value or empty string.
575    
576    =cut
577    
578    sub get_data {
579            my $self = shift;
580    
581            my ($rec,$f,$sf,$i,$found) = @_;
582    
583            if ($$rec->{$f}) {
584                    return '' if (! $$rec->{$f}->[$i]);
585                    no strict 'refs';
586                    if ($sf && $$rec->{$f}->[$i]->{$sf}) {
587                            $$found++ if (defined($$found));
588                            return $$rec->{$f}->[$i]->{$sf};
589                    } elsif ($$rec->{$f}->[$i]) {
590                            $$found++ if (defined($$found));
591                            # it still might have subfield, just
592                            # not specified, so we'll dump all
593                            if ($$rec->{$f}->[$i] =~ /HASH/o) {
594                                    my $out;
595                                    foreach my $k (keys %{$$rec->{$f}->[$i]}) {
596                                            $out .= $$rec->{$f}->[$i]->{$k}." ";
597                                    }
598                                    return $out;
599                            } else {
600                                    return $$rec->{$f}->[$i];
601                            }
602                    }
603            } else {
604                    return '';
605            }
606    }
607    
608    
609    =head2 apply_format
610    
611    Apply format specified in tag with C<format_name="name"> and
612    C<format_delimiter=";;">.
613    
614     my $text = $webpac->apply_format($format_name,$format_delimiter,$data);
615    
616    Formats can contain C<lookup{...}> if you need them.
617    
618    =cut
619    
620    sub apply_format {
621            my $self = shift;
622    
623            my ($name,$delimiter,$data) = @_;
624    
625            my $log = $self->_get_logger();
626    
627            if (! $self->{'import_xml'}->{'format'}->{$name}) {
628                    $log->warn("<format name=\"$name\"> is not defined in ",$self->{'import_xml_file'});
629                    return $data;
630            }
631    
632            $log->warn("no delimiter for format $name") if (! $delimiter);
633    
634            my $format = $self->_x($self->{'import_xml'}->{'format'}->{$name}->{'content'}) || $log->logdie("can't find format '$name'");
635    
636            my @data = split(/\Q$delimiter\E/, $data);
637    
638            my $out = sprintf($format, @data);
639            $log->debug("using format $name [$format] on $data to produce: $out");
640    
641            if ($self->{'lookup_regex'} && $out =~ $self->{'lookup_regex'}) {
642                    return $self->lookup($out);
643            } else {
644                    return $out;
645            }
646    
647    }
648    
649  =head2 sort_arr  =head2 sort_arr
650    
651  Sort array ignoring case and html in data  Sort array ignoring case and html in data
# Line 485  sub sort_arr { Line 672  sub sort_arr {
672  }  }
673    
674    
675    =head1 INTERNAL METHODS
676    
677  =head2 _sort_by_order  =head2 _sort_by_order
678    
679  Sort xml tags data structure accoding to C<order=""> attribute.  Sort xml tags data structure accoding to C<order=""> attribute.
# Line 504  sub _sort_by_order { Line 693  sub _sort_by_order {
693    
694  =head2 _x  =head2 _x
695    
696  Convert strings from C<conf/normalize> encoding into application specific  Convert strings from C<conf/normalize/*.xml> encoding into application
697  (optinally specified using C<code_page> to C<new> constructor.  specific encoding (optinally specified using C<code_page> to C<new>
698    constructor).
699    
700   my $text = $n->_x('normalize text string');   my $text = $n->_x('normalize text string');
701    

Legend:
Removed from v.14  
changed lines
  Added in v.15

  ViewVC Help
Powered by ViewVC 1.1.26