/[webpac2]/trunk/lib/WebPAC/Normalize.pm
This is a repository of my old source code, which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Diff of /trunk/lib/WebPAC/Normalize.pm

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 786 by dpavlin, Sun Dec 10 12:45:11 2006 UTC revision 1021 by dpavlin, Sat Nov 10 11:11:16 2007 UTC
# Line 1  Line 1 
1  package WebPAC::Normalize;  package WebPAC::Normalize;
2  use Exporter 'import';  use Exporter 'import';
3  @EXPORT = qw/  our @EXPORT = qw/
4          _set_rec _set_lookup          _set_ds _set_lookup
5          _set_load_row          _set_load_row
6          _get_ds _clean_ds          _get_ds _clean_ds
7          _debug          _debug
8          _pack_subfields_hash          _pack_subfields_hash
9    
10          tag search display          search_display search display sorted
11    
12          marc marc_indicators marc_repeatable_subfield          marc marc_indicators marc_repeatable_subfield
13          marc_compose marc_leader          marc_compose marc_leader marc_fixed
14          marc_duplicate marc_remove          marc_duplicate marc_remove marc_count
15          marc_original_order          marc_original_order
16            marc_template
17    
18          rec1 rec2 rec          rec1 rec2 rec
19            frec frec_eq frec_ne
20          regex prefix suffix surround          regex prefix suffix surround
21          first lookup join_with          first lookup join_with
22          save_into_lookup          save_into_lookup
# Line 21  use Exporter 'import'; Line 24  use Exporter 'import';
24          split_rec_on          split_rec_on
25    
26          get set          get set
27            count
28    
29  /;  /;
30    
31  use warnings;  use warnings;
# Line 34  use Carp qw/confess/; Line 39  use Carp qw/confess/;
39  # debugging warn(s)  # debugging warn(s)
40  my $debug = 0;  my $debug = 0;
41    
42    # FIXME
43    use WebPAC::Normalize::ISBN;
44    push @EXPORT, ( 'isbn_10', 'isbn_13' );
45    
46    use WebPAC::Normalize::MARC;
47    push @EXPORT, ( 'marc_template' );
48    
49  =head1 NAME  =head1 NAME
50    
51  WebPAC::Normalize - describe normalisation rules using sets  WebPAC::Normalize - describe normalisation rules using sets
52    
 =head1 VERSION  
   
 Version 0.25  
   
53  =cut  =cut
54    
55  our $VERSION = '0.25';  our $VERSION = '0.35';
56    
57  =head1 SYNOPSIS  =head1 SYNOPSIS
58    
# Line 58  means that you check it's validity befor Line 65  means that you check it's validity befor
65  C<perl -c normalize.pl>.  C<perl -c normalize.pl>.
66    
67  Normalisation can generate multiple output normalized data. For now, supported output  Normalisation can generate multiple output normalized data. For now, supported output
68  types (on the left side of definition) are: C<tag>, C<display>, C<search> and  types (on the left side of definition) are: C<search_display>, C<display>, C<search> and
69  C<marc>.  C<marc>.
70    
71  =head1 FUNCTIONS  =head1 FUNCTIONS
# Line 77  Return data structure Line 84  Return data structure
84          marc_encoding => 'utf-8',          marc_encoding => 'utf-8',
85          config => $config,          config => $config,
86          load_row_coderef => sub {          load_row_coderef => sub {
87                  my ($database,$input,$mfn) = shift;                  my ($database,$input,$mfn) = @_;
88                  $store->load_row( database => $database, input => $input, id => $mfn );                  $store->load_row( database => $database, input => $input, id => $mfn );
89          },          },
90    );    );
# Line 103  sub data_structure { Line 110  sub data_structure {
110          die "need row argument" unless ($arg->{row});          die "need row argument" unless ($arg->{row});
111          die "need normalisation argument" unless ($arg->{rules});          die "need normalisation argument" unless ($arg->{rules});
112    
         no strict 'subs';  
113          _set_lookup( $arg->{lookup} ) if defined($arg->{lookup});          _set_lookup( $arg->{lookup} ) if defined($arg->{lookup});
114          _set_rec( $arg->{row} );          _set_ds( $arg->{row} );
115          _set_config( $arg->{config} ) if defined($arg->{config});          _set_config( $arg->{config} ) if defined($arg->{config});
116          _clean_ds( %{ $arg } );          _clean_ds( %{ $arg } );
117          $load_row_coderef = $arg->{load_row_coderef};          $load_row_coderef = $arg->{load_row_coderef};
118    
119          eval "$arg->{rules}";          no strict 'subs';
120            no warnings 'redefine';
121            eval "$arg->{rules};";
122          die "error evaling $arg->{rules}: $@\n" if ($@);          die "error evaling $arg->{rules}: $@\n" if ($@);
123    
124          return _get_ds();          return _get_ds();
125  }  }
126    
127  =head2 _set_rec  =head2 _set_ds
128    
129  Set current record hash  Set current record hash
130    
131    _set_rec( $rec );    _set_ds( $rec );
132    
133  =cut  =cut
134    
135  my $rec;  my $rec;
136    
137  sub _set_rec {  sub _set_ds {
138          $rec = shift or die "no record hash";          $rec = shift or die "no record hash";
139  }  }
140    
141    =head2 _get_rec
142    
143      my $rec = _get_rec();
144    
145    =cut
146    
147    sub _get_rec { $rec };
148    
149  =head2 _set_config  =head2 _set_config
150    
151  Set current config hash  Set current config hash
# Line 166  Return hash formatted as data structure Line 182  Return hash formatted as data structure
182    
183  =cut  =cut
184    
185  my ($out, $marc_record, $marc_encoding, $marc_repeatable_subfield, $marc_indicators, $leader);  my ($out, $marc_record, $marc_encoding, $marc_repeatable_subfield, $marc_indicators, $marc_leader);
186  my ($marc_record_offset, $marc_fetch_offset) = (0, 0);  my ($marc_record_offset, $marc_fetch_offset) = (0, 0);
187    
188  sub _get_ds {  sub _get_ds {
189    #warn "## out = ",dump($out);
190          return $out;          return $out;
191  }  }
192    
# Line 183  Clean data structure hash for next recor Line 200  Clean data structure hash for next recor
200    
201  sub _clean_ds {  sub _clean_ds {
202          my $a = {@_};          my $a = {@_};
203          ($out,$marc_record, $marc_encoding, $marc_repeatable_subfield, $marc_indicators, $leader) = ();          ($out,$marc_record, $marc_encoding, $marc_repeatable_subfield, $marc_indicators, $marc_leader) = ();
204          ($marc_record_offset, $marc_fetch_offset) = (0,0);          ($marc_record_offset, $marc_fetch_offset) = (0,0);
205          $marc_encoding = $a->{marc_encoding};          $marc_encoding = $a->{marc_encoding};
206  }  }
# Line 288  will return 42th copy record (if it exis Line 305  will return 42th copy record (if it exis
305    
306  =cut  =cut
307    
308    my $fetch_pos;
309    
310  sub _get_marc_fields {  sub _get_marc_fields {
311    
312          my $arg = {@_};          my $arg = {@_};
313          warn "### _get_marc_fields arg: ", dump($arg), $/ if ($debug > 2);          warn "### _get_marc_fields arg: ", dump($arg), $/ if ($debug > 2);
314          my $offset = $marc_fetch_offset;          $fetch_pos = $marc_fetch_offset;
315          if ($arg->{offset}) {          if ($arg->{offset}) {
316                  $offset = $arg->{offset};                  $fetch_pos = $arg->{offset};
317          } elsif($arg->{fetch_next}) {          } elsif($arg->{fetch_next}) {
318                  $marc_fetch_offset++;                  $marc_fetch_offset++;
319          }          }
# Line 303  sub _get_marc_fields { Line 322  sub _get_marc_fields {
322    
323          warn "### full marc_record = ", dump( @{ $marc_record }), $/ if ($debug > 2);          warn "### full marc_record = ", dump( @{ $marc_record }), $/ if ($debug > 2);
324    
325          my $marc_rec = $marc_record->[ $offset ];          my $marc_rec = $marc_record->[ $fetch_pos ];
326    
327          warn "## _get_marc_fields (at offset: $offset) -- marc_record = ", dump( @$marc_rec ), $/ if ($debug > 1);          warn "## _get_marc_fields (at offset: $fetch_pos) -- marc_record = ", dump( @$marc_rec ), $/ if ($debug > 1);
328    
329          return if (! $marc_rec || ref($marc_rec) ne 'ARRAY' || $#{ $marc_rec } < 0);          return if (! $marc_rec || ref($marc_rec) ne 'ARRAY' || $#{ $marc_rec } < 0);
330    
# Line 326  sub _get_marc_fields { Line 345  sub _get_marc_fields {
345    
346          if ($debug) {          if ($debug) {
347                  warn "## marc_repeatable_subfield = ", dump( $marc_repeatable_subfield ), $/ if ( $marc_repeatable_subfield );                  warn "## marc_repeatable_subfield = ", dump( $marc_repeatable_subfield ), $/ if ( $marc_repeatable_subfield );
348                  warn "## marc_record[$offset] = ", dump( $marc_rec ), $/;                  warn "## marc_record[$fetch_pos] = ", dump( $marc_rec ), $/;
349                  warn "## sorted_marc_record = ", dump( \@sorted_marc_record ), $/;                  warn "## sorted_marc_record = ", dump( \@sorted_marc_record ), $/;
350                  warn "## subfield count = ", dump( $u ), $/;                  warn "## subfield count = ", dump( $u ), $/;
351          }          }
# Line 407  sub _get_marc_fields { Line 426  sub _get_marc_fields {
426          return \@m;          return \@m;
427  }  }
428    
429    =head2 _get_marc_leader
430    
431    Return leader from currently fetched record by L</_get_marc_fields>
432    
433      print WebPAC::Normalize::_get_marc_leader();
434    
435    =cut
436    
437    sub _get_marc_leader {
438            die "no fetch_pos, did you called _get_marc_fields first?" unless ( defined( $fetch_pos ) );
439            return $marc_leader->[ $fetch_pos ];
440    }
441    
442  =head2 _debug  =head2 _debug
443    
444  Change level of debug warnings  Change level of debug warnings
# Line 426  sub _debug { Line 458  sub _debug {
458    
459  Those functions generally have to be first in your normalization file.  Those functions generally have to be first in your normalization file.
460    
461  =head2 tag  =head2 search_display
462    
463  Define new tag for I<search> and I<display>.  Define output for L<search> and L<display> at the same time
464    
465    tag('Title', rec('200','a') );    search_display('Title', rec('200','a') );
466    
467    
468  =cut  =cut
469    
470  sub tag {  sub search_display {
471          my $name = shift or die "tag needs name as first argument";          my $name = shift or die "search_display needs name as first argument";
472          my @o = grep { defined($_) && $_ ne '' } @_;          my @o = grep { defined($_) && $_ ne '' } @_;
473          return unless (@o);          return unless (@o);
         $out->{$name}->{tag} = $name;  
474          $out->{$name}->{search} = \@o;          $out->{$name}->{search} = \@o;
475          $out->{$name}->{display} = \@o;          $out->{$name}->{display} = \@o;
476  }  }
477    
478    =head2 tag
479    
480    Old name for L<search_display>, but supported
481    
482    =cut
483    
484    sub tag {
485            search_display( @_ );
486    }
487    
488  =head2 display  =head2 display
489    
490  Define tag just for I<display>  Define output just for I<display>
491    
492    @v = display('Title', rec('200','a') );    @v = display('Title', rec('200','a') );
493    
494  =cut  =cut
495    
496  sub display {  sub _field {
497          my $name = shift or die "display needs name as first argument";          my $type = shift or confess "need type -- BUG?";
498            my $name = shift or confess "needs name as first argument";
499          my @o = grep { defined($_) && $_ ne '' } @_;          my @o = grep { defined($_) && $_ ne '' } @_;
500          return unless (@o);          return unless (@o);
501          $out->{$name}->{tag} = $name;          $out->{$name}->{$type} = \@o;
         $out->{$name}->{display} = \@o;  
502  }  }
503    
504    sub display { _field( 'display', @_ ) }
505    
506  =head2 search  =head2 search
507    
508  Prepare values just for I<search>  Prepare values just for I<search>
# Line 468  Prepare values just for I<search> Line 511  Prepare values just for I<search>
511    
512  =cut  =cut
513    
514  sub search {  sub search { _field( 'search', @_ ) }
515          my $name = shift or die "search needs name as first argument";  
516          my @o = grep { defined($_) && $_ ne '' } @_;  =head2 sorted
517          return unless (@o);  
518          $out->{$name}->{tag} = $name;  Insert into lists which will be automatically sorted
519          $out->{$name}->{search} = \@o;  
520  }   sorted('Title', rec('200','a') );
521    
522    =cut
523    
524    sub sorted { _field( 'sorted', @_ ) }
525    
526    
527  =head2 marc_leader  =head2 marc_leader
528    
# Line 489  sub marc_leader { Line 537  sub marc_leader {
537          my ($offset,$value) = @_;          my ($offset,$value) = @_;
538    
539          if ($offset) {          if ($offset) {
540                  $leader->{ $offset } = $value;                  $marc_leader->[ $marc_record_offset ]->{ $offset } = $value;
541          } else {          } else {
542                  return $leader;                  
543                    if (defined($marc_leader)) {
544                            die "marc_leader not array = ", dump( $marc_leader ) unless (ref($marc_leader) eq 'ARRAY');
545                            return $marc_leader->[ $marc_record_offset ];
546                    } else {
547                            return;
548                    }
549            }
550    }
551    
552    =head2 marc_fixed
553    
554    Create control/identifier fields with values in fixed positions
555    
556      marc_fixed('008', 00, '070402');
557      marc_fixed('008', 39, '|');
558    
559    Positions not specified will be filled with spaces (C<0x20>).
560    
561    There will be no effort to extend last specified value to full length of
562    field in standard.
563    
564    =cut
565    
566    sub marc_fixed {
567            my ($f, $pos, $val) = @_;
568            die "need marc(field, position, value)" unless defined($f) && defined($pos);
569    
570            confess "need val" unless defined $val;
571    
572            my $update = 0;
573    
574            map {
575                    if ($_->[0] eq $f) {
576                            my $old = $_->[1];
577                            if (length($old) <= $pos) {
578                                    $_->[1] .= ' ' x ( $pos - length($old) ) . $val;
579                                    warn "## marc_fixed($f,$pos,'$val') append '$old' -> '$_->[1]'\n" if ($debug > 1);
580                            } else {
581                                    $_->[1] = substr($old, 0, $pos) . $val . substr($old, $pos + length($val));
582                                    warn "## marc_fixed($f,$pos,'$val') update '$old' -> '$_->[1]'\n" if ($debug > 1);
583                            }
584                            $update++;
585                    }
586            } @{ $marc_record->[ $marc_record_offset ] };
587    
588            if (! $update) {
589                    my $v = ' ' x $pos . $val;
590                    push @{ $marc_record->[ $marc_record_offset ] }, [ $f, $v ];
591                    warn "## marc_fixed($f,$pos,'val') created '$v'\n" if ($debug > 1);
592          }          }
593  }  }
594    
# Line 623  sub marc_duplicate { Line 720  sub marc_duplicate {
720           my $m = $marc_record->[ -1 ];           my $m = $marc_record->[ -1 ];
721           die "can't duplicate record which isn't defined" unless ($m);           die "can't duplicate record which isn't defined" unless ($m);
722           push @{ $marc_record }, dclone( $m );           push @{ $marc_record }, dclone( $m );
723           warn "## marc_duplicate = ", dump(@$marc_record), $/ if ($debug > 1);           push @{ $marc_leader }, dclone( marc_leader() );
724             warn "## marc_duplicate = ", dump(@$marc_leader, @$marc_record), $/ if ($debug > 1);
725           $marc_record_offset = $#{ $marc_record };           $marc_record_offset = $#{ $marc_record };
726           warn "## marc_record_offset = $marc_record_offset", $/ if ($debug > 1);           warn "## marc_record_offset = $marc_record_offset", $/ if ($debug > 1);
727    
728  }  }
729    
730  =head2 marc_remove  =head2 marc_remove
# Line 661  sub marc_remove { Line 760  sub marc_remove {
760          if ($f eq '*') {          if ($f eq '*') {
761    
762                  delete( $marc_record->[ $marc_record_offset ] );                  delete( $marc_record->[ $marc_record_offset ] );
763                    warn "## full marc_record = ", dump( @{ $marc_record }), $/ if ($debug > 1);
764    
765          } else {          } else {
766    
# Line 699  sub marc_remove { Line 799  sub marc_remove {
799                  $marc_record->[ $marc_record_offset ] = $marc;                  $marc_record->[ $marc_record_offset ] = $marc;
800          }          }
801    
   
802          warn "## full marc_record = ", dump( @{ $marc_record }), $/ if ($debug > 1);          warn "## full marc_record = ", dump( @{ $marc_record }), $/ if ($debug > 1);
803  }  }
804    
# Line 728  sub marc_original_order { Line 827  sub marc_original_order {
827          return unless defined($rec->{$from});          return unless defined($rec->{$from});
828    
829          my $r = $rec->{$from};          my $r = $rec->{$from};
830          die "record field $from isn't array\n" unless (ref($r) eq 'ARRAY');          die "record field $from isn't array ",dump( $rec ) unless (ref($r) eq 'ARRAY');
831    
832          my ($i1,$i2) = defined($marc_indicators->{$to}) ? @{ $marc_indicators->{$to} } : (' ',' ');          my ($i1,$i2) = defined($marc_indicators->{$to}) ? @{ $marc_indicators->{$to} } : (' ',' ');
833          warn "## marc_original_order($to,$from) source = ", dump( $r ),$/ if ($debug > 1);          warn "## marc_original_order($to,$from) source = ", dump( $r ),$/ if ($debug > 1);
# Line 774  sub marc_original_order { Line 873  sub marc_original_order {
873  }  }
874    
875    
876    =head2 marc_count
877    
878    Return number of MARC records created using L</marc_duplicate>.
879    
880      print "created ", marc_count(), " records";
881    
882    =cut
883    
884    sub marc_count {
885            return $#{ $marc_record };
886    }
887    
888    =head2 _marc_push
889    
890     _marc_push( $marc );
891    
892    =cut
893    
894    sub _marc_push {
895            my $marc = shift || die "no marc?";
896            push @{ $marc_record->[ $marc_record_offset ] }, $marc;
897    }
898    
899    
900  =head1 Functions to extract data from input  =head1 Functions to extract data from input
901    
902  This function should be used inside functions to create C<data_structure> described  This function should be used inside functions to create C<data_structure> described
# Line 795  sub _pack_subfields_hash { Line 918  sub _pack_subfields_hash {
918    
919          my ($h,$include_subfields) = @_;          my ($h,$include_subfields) = @_;
920    
921            # sanity and ease of use
922            return $h if (ref($h) ne 'HASH');
923    
924          if ( defined($h->{subfields}) ) {          if ( defined($h->{subfields}) ) {
925                  my $sfs = delete $h->{subfields} || die "no subfields?";                  my $sfs = delete $h->{subfields} || die "no subfields?";
926                  my @out;                  my @out;
# Line 915  sub rec { Line 1041  sub rec {
1041          }          }
1042  }  }
1043    
1044    =head2 frec
1045    
1046    Returns first value from field
1047    
1048      $v = frec('200');
1049      $v = frec('200','a');
1050    
1051    =cut
1052    
1053    sub frec {
1054            my @out = rec(@_);
1055            warn "rec(",dump(@_),") has more than one return value, ignoring\n" if $#out > 0;
1056            return shift @out;
1057    }
1058    
1059    =head2 frec_eq
1060    
1061    =head2 frec_ne
1062    
1063    Check if first values from two fields are same or different
1064    
1065      if ( frec_eq( 900 => 'a', 910 => 'c' ) ) {
1066            # values are same
1067      } else {
1068        # values are different
1069      }
1070    
1071    Strictly speaking C<frec_eq> and C<frec_ne> wouldn't be needed if you
1072    could write something like:
1073    
1074      if ( frec( '900','a' ) eq frec( '910','c' ) ) {
1075            # yada tada
1076      }
1077    
1078    but you can't since our parser L<WebPAC::Parser> will remove all whitespaces
1079    in order to parse text and create invalid function C<eqfrec>.
1080    
1081    =cut
1082    
1083    sub frec_eq {
1084            my ( $f1,$sf1, $f2, $sf2 ) = @_;
1085            return (rec( $f1, $sf1 ))[0] eq (rec( $f2, $sf2 ))[0];
1086    }
1087    
1088    sub frec_ne {
1089            return ! frec_eq( @_ );
1090    }
1091    
1092  =head2 regex  =head2 regex
1093    
1094  Apply regex to some or all values  Apply regex to some or all values
# Line 944  Prefix all values with a string Line 1118  Prefix all values with a string
1118  =cut  =cut
1119    
1120  sub prefix {  sub prefix {
1121          my $p = shift or return;          my $p = shift;
1122            return @_ unless defined( $p );
1123          return map { $p . $_ } grep { defined($_) } @_;          return map { $p . $_ } grep { defined($_) } @_;
1124  }  }
1125    
# Line 957  suffix all values with a string Line 1132  suffix all values with a string
1132  =cut  =cut
1133    
1134  sub suffix {  sub suffix {
1135          my $s = shift or die "suffix needs string as first argument";          my $s = shift;
1136            return @_ unless defined( $s );
1137          return map { $_ . $s } grep { defined($_) } @_;          return map { $_ . $s } grep { defined($_) } @_;
1138  }  }
1139    
# Line 970  surround all values with a two strings Line 1146  surround all values with a two strings
1146  =cut  =cut
1147    
1148  sub surround {  sub surround {
1149          my $p = shift or die "surround need prefix as first argument";          my $p = shift;
1150          my $s = shift or die "surround needs suffix as second argument";          my $s = shift;
1151            $p = '' unless defined( $p );
1152            $s = '' unless defined( $s );
1153          return map { $p . $_ . $s } grep { defined($_) } @_;          return map { $p . $_ . $s } grep { defined($_) } @_;
1154  }  }
1155    
# Line 1146  Consult config values stored in C<config Line 1324  Consult config values stored in C<config
1324    $database_code = config();    # use _ from hash    $database_code = config();    # use _ from hash
1325    $database_name = config('name');    $database_name = config('name');
1326    $database_input_name = config('input name');    $database_input_name = config('input name');
   $tag = config('input normalize tag');  
1327    
1328  Up to three levels are supported.  Up to three levels are supported.
1329    
# Line 1272  my $hash; Line 1449  my $hash;
1449    
1450  sub set {  sub set {
1451          my ($k,$v) = @_;          my ($k,$v) = @_;
1452          warn "## set ( $k => ", dump($v), " )", $/;          warn "## set ( $k => ", dump($v), " )", $/ if ( $debug );
1453          $hash->{$k} = $v;          $hash->{$k} = $v;
1454  };  };
1455    
# Line 1285  sub set { Line 1462  sub set {
1462  sub get {  sub get {
1463          my $k = shift || return;          my $k = shift || return;
1464          my $v = $hash->{$k};          my $v = $hash->{$k};
1465          warn "## get $k = ", dump( $v ), $/;          warn "## get $k = ", dump( $v ), $/ if ( $debug );
1466          return $v;          return $v;
1467  }  }
1468    
1469    =head2 count
1470    
1471      if ( count( @result ) == 1 ) {
1472            # do something if only 1 result is there
1473      }
1474    
1475    =cut
1476    
1477    sub count {
1478            warn "## count ",dump(@_),$/ if ( $debug );
1479            return @_ . '';
1480    }
1481    
1482  # END  # END
1483  1;  1;

Legend:
Removed from v.786  
changed lines
  Added in v.1021

  ViewVC Help
Powered by ViewVC 1.1.26