--- trunk/lib/WebPAC/Normalize.pm 2006/07/23 17:33:11 601 +++ trunk/lib/WebPAC/Normalize.pm 2006/09/26 10:05:25 711 @@ -4,15 +4,18 @@ _set_rec _set_lookup _get_ds _clean_ds _debug + _pack_subfields_hash tag search display marc marc_indicators marc_repeatable_subfield marc_compose marc_leader marc_duplicate marc_remove + marc_original_order rec1 rec2 rec regex prefix suffix surround first lookup join_with + save_into_lookup split_rec_on /; @@ -22,7 +25,6 @@ #use base qw/WebPAC::Common/; use Data::Dump qw/dump/; -use Encode qw/from_to/; use Storable qw/dclone/; # debugging warn(s) @@ -35,11 +37,11 @@ =head1 VERSION -Version 0.14 +Version 0.20 =cut -our $VERSION = '0.14'; +our $VERSION = '0.20'; =head1 SYNOPSIS @@ -65,14 +67,14 @@ Return data structure my $ds = WebPAC::Normalize::data_structure( - lookup => $lookup->lookup_hash, + lookup => $lookup_variable, row => $row, rules => $normalize_pl_config, marc_encoding => 'utf-8', config => $config, ); -Options C, C, C and C are mandatory while all +Options C, C and C are mandatory while all other are optional. This function will B if normalizastion can't be evaled. @@ -89,9 +91,9 @@ die "need normalisation argument" unless ($arg->{rules}); no strict 'subs'; - _set_lookup( $arg->{lookup} ); + _set_lookup( $arg->{lookup} ) if (defined( $arg->{lookup} )); _set_rec( $arg->{row} ); - _set_config( $arg->{config} ); + _set_config( $arg->{config} ) if (defined( $arg->{config} )); _clean_ds( %{ $arg } ); eval "$arg->{rules}"; die "error evaling $arg->{rules}: $@\n" if ($@); @@ -185,6 +187,18 @@ $lookup = shift; } +=head2 _get_lookup + +Get current lookup hash + + my $lookup = _get_lookup(); + +=cut + +sub _get_lookup { + return $lookup; +} + =head2 _get_marc_fields Get all fields defined by calls to C @@ -468,7 +482,6 @@ foreach (@_) { my $v = $_; # make var read-write for Encode next unless (defined($v) && $v !~ /^\s*$/); - from_to($v, 'iso-8859-2', $marc_encoding) if ($marc_encoding); my ($i1,$i2) = defined($marc_indicators->{$f}) ? @{ $marc_indicators->{$f} } : (' ',' '); if (defined $sf) { push @{ $marc_record->[ $marc_record_offset ] }, [ $f, $i1, $i2, $sf => $v ]; @@ -525,6 +538,9 @@ 'c', rec('200','c') ); +If you specify C<+> for subfield, value will be appended +to previous defined subfield. + =cut sub marc_compose { @@ -536,14 +552,21 @@ warn "### marc_compose input subfields = ", dump(@_),$/ if ($debug > 2); + if ($#_ % 2 != 1) { + die "ERROR: marc_compose",dump($f,@_)," not valid (must be even).\nDo you need to add first() or join() around some argument?\n"; + } + while (@_) { - my $sf = shift or die "marc_compose $f needs subfield"; + my $sf = shift; my $v = shift; next unless (defined($v) && $v !~ /^\s*$/); - from_to($v, 'iso-8859-2', $marc_encoding) if ($marc_encoding); - push @$m, ( $sf, $v ); warn "## ++ marc_compose($f,$sf,$v) ", dump( $m ),$/ if ($debug > 1); + if ($sf ne '+') { + push @$m, ( $sf, $v ); + } else { + $m->[ $#$m ] .= $v; + } } warn "## marc_compose current marc = ", dump( $m ),$/ if ($debug > 1); @@ -634,11 +657,138 @@ warn "## full marc_record = ", dump( @{ $marc_record }), $/ if ($debug > 1); } +=head2 marc_original_order + +Copy all subfields preserving original order to marc field. + + marc_original_order( marc_field_number, original_input_field_number ); + +Please note that field numbers are consistent with other commands (marc +field number first), but somewhat counter-intuitive (destination and then +source). + +You might want to use this command if you are just renaming subfields or +using pre-processing modify_record in C and don't need any +post-processing or want to preserve order of original subfields. + + +=cut + +sub marc_original_order { + + my ($to, $from) = @_; + die "marc_original_order needs from and to fields\n" unless ($from && $to); + + return unless defined($rec->{$from}); + + my $r = $rec->{$from}; + die "record field $from isn't array\n" unless (ref($r) eq 'ARRAY'); + + my ($i1,$i2) = defined($marc_indicators->{$to}) ? @{ $marc_indicators->{$to} } : (' ',' '); + warn "## marc_original_order($to,$from) source = ", dump( $r ),$/ if ($debug > 1); + + foreach my $d (@$r) { + + if (! defined($d->{subfields}) && ref($d->{subfields}) ne 'ARRAY') { + warn "# marc_original_order($to,$from): field $from doesn't have subfields specification\n"; + next; + } + + my @sfs = @{ $d->{subfields} }; + + die "field $from doesn't have even number of subfields specifications\n" unless($#sfs % 2 == 1); + + warn "#--> d: ",dump($d), "\n#--> sfs: ",dump(@sfs),$/ if ($debug > 2); + + my $m = [ $to, $i1, $i2 ]; + + while (my $sf = shift @sfs) { + + warn "#--> sf: ",dump($sf), $/ if ($debug > 2); + my $offset = shift @sfs; + die "corrupted sufields specification for field $from\n" unless defined($offset); + + my $v; + if (ref($d->{$sf}) eq 'ARRAY') { + $v = $d->{$sf}->[$offset] if (defined($d->{$sf}->[$offset])); + } elsif ($offset == 0) { + $v = $d->{$sf}; + } else { + die "field $from subfield '$sf' need occurence $offset which doesn't exist", dump($d->{$sf}); + } + push @$m, ( $sf, $v ) if (defined($v)); + } + + if ($#{$m} > 2) { + push @{ $marc_record->[ $marc_record_offset ] }, $m; + } + } + + warn "## marc_record = ", dump( $marc_record ),$/ if ($debug > 1); +} + + =head1 Functions to extract data from input This function should be used inside functions to create C described above. +=head2 _pack_subfields_hash + + @subfields = _pack_subfields_hash( $h ); + $subfields = _pack_subfields_hash( $h, 1 ); + +Return each subfield value in array or pack them all together and return scalar +with subfields (denoted by C<^>) and values. + +=cut + +sub _pack_subfields_hash { + + warn "## _pack_subfields_hash( ",dump(@_), " )\n" if ($debug > 1); + + my ($h,$include_subfields) = @_; + + if ( defined($h->{subfields}) ) { + my $sfs = delete $h->{subfields} || die "no subfields?"; + my @out; + while (@$sfs) { + my $sf = shift @$sfs; + push @out, '^' . $sf if ($include_subfields); + my $o = shift @$sfs; + if ($o == 0 && ref( $h->{$sf} ) ne 'ARRAY' ) { + # single element subfields are not arrays +#warn "====> $sf $o / $#$sfs ", dump( $sfs, $h->{$sf} ), "\n"; + + push @out, $h->{$sf}; + } else { +#warn "====> $sf $o / $#$sfs ", dump( $sfs, $h->{$sf} ), "\n"; + push @out, $h->{$sf}->[$o]; + } + } + if ($include_subfields) { + return join('', @out); + } else { + return @out; + } + } else { + if ($include_subfields) { + my $out = ''; + foreach my $sf (sort keys %$h) { + if (ref($h->{$sf}) eq 'ARRAY') { + $out .= '^' . $sf . join('^' . $sf, @{ $h->{$sf} }); + } else { + $out .= '^' . $sf . $h->{$sf}; + } + } + return $out; + } else { + # FIXME this should probably be in alphabetical order instead of hash order + values %{$h}; + } + } +} + =head2 rec1 Return all values in some field @@ -655,13 +805,15 @@ return unless (defined($rec) && defined($rec->{$f})); warn "rec1($f) = ", dump( $rec->{$f} ), $/ if ($debug > 1); if (ref($rec->{$f}) eq 'ARRAY') { - return map { - if (ref($_) eq 'HASH') { - values %{$_}; + my @out; + foreach my $h ( @{ $rec->{$f} } ) { + if (ref($h) eq 'HASH') { + push @out, ( _pack_subfields_hash( $h ) ); } else { - $_; + push @out, $h; } - } @{ $rec->{$f} }; + } + return @out; } elsif( defined($rec->{$f}) ) { return $rec->{$f}; } @@ -792,6 +944,8 @@ @v = lookup( $v ); @v = lookup( @v ); +FIXME B + =cut sub lookup { @@ -804,6 +958,29 @@ } } +=head2 save_into_lookup + +Save value into lookup. + + save_into_lookup($key,sub { + # code which produce one or more values + }); + +This function shouldn't be called directly, it's called from code created by L. + +=cut + +sub save_into_lookup { + my ($k,$coderef) = @_; + die "save_into_lookup needs key" unless defined($k); + die "save_into_lookup needs CODE" unless ( defined($coderef) && ref($coderef) eq 'CODE' ); + my $mfn = $rec->{'000'}->[0] || die "mfn not defined or zero"; + foreach my $v ( $coderef->() ) { + $lookup->{$k}->{$v}->{$mfn}++; + warn "# lookup $k $v $mfn saved\n" if ($debug > 1); + } +} + =head2 config Consult config values stored in C @@ -917,7 +1094,7 @@ my $v = shift @r; warn "### first rec($fld,$sf) = ",dump($v),$/ if ($debug > 2); - return '' if( ! defined($v) || $v =~ /^\s*$/); + return '' if ( ! defined($v) || $v =~ /^\s*$/); my @s = split( $regex, $v ); warn "## split_rec_on($fld,$sf,$regex,$part) = ",dump(@s),$/ if ($debug > 1);