lib/WebPAC/Normalize.pm

package WebPAC::Normalize;
use Exporter 'import';
@EXPORT = qw/
        _set_rec _set_lookup
        _get_ds _clean_ds
        _debug

        tag search display
        marc marc_indicators marc_repeatable_subfield
        marc_compose marc_leader
        marc_duplicate marc_remove

        rec1 rec2 rec
        regex prefix suffix surround
        first lookup join_with

        split_rec_on
/;

use warnings;
use strict;

#use base qw/WebPAC::Common/;
use Data::Dump qw/dump/;
use Encode qw/from_to/;
use Storable qw/dclone/;

# debugging warn(s)
my $debug = 0;


=head1 NAME

WebPAC::Normalize - describe normalisaton rules using sets

=head1 VERSION

Version 0.12

=cut

our $VERSION = '0.12';

=head1 SYNOPSIS

This module uses C<conf/normalize/*.pl> files to perform normalisation
from input records using perl functions which are specialized for set
processing.

Sets are implemented as arrays, and normalisation file is valid perl, which
means that you check it's validity before running WebPAC using
C<perl -c normalize.pl>.

Normalisation can generate multiple output normalized data. For now, supported output
types (on the left side of definition) are: C<tag>, C<display>, C<search> and
C<marc>.

=head1 FUNCTIONS

Functions which start with C<_> are private and used by WebPAC internally.
All other functions are available for use within normalisation rules.

=head2 data_structure

Return data structure

  my $ds = WebPAC::Normalize::data_structure(
        lookup => $lookup->lookup_hash,
        row => $row,
        rules => $normalize_pl_config,
        marc_encoding => 'utf-8',
  );

Options C<lookup>, C<row>, C<rules> and C<log> are mandatory while all
other are optional.

This function will B<die> if normalizastion can't be evaled.

Since this function isn't exported you have to call it with 
C<WebPAC::Normalize::data_structure>.

=cut

sub data_structure {
        my $arg = {@_};

        die "need row argument" unless ($arg->{row});
        die "need normalisation argument" unless ($arg->{rules});

        no strict 'subs';
        _set_lookup( $arg->{lookup} );
        _set_rec( $arg->{row} );
        _clean_ds( %{ $arg } );
        eval "$arg->{rules}";
        die "error evaling $arg->{rules}: $@\n" if ($@);

        return _get_ds();
}

=head2 _set_rec

Set current record hash

  _set_rec( $rec );

=cut

my $rec;

sub _set_rec {
        $rec = shift or die "no record hash";
}

=head2 _get_ds

Return hash formatted as data structure

  my $ds = _get_ds();

=cut

my ($out, $marc_record, $marc_encoding, $marc_repeatable_subfield, $marc_indicators);
my ($marc_record_offset, $marc_fetch_offset) = (0, 0);

sub _get_ds {
        return $out;
}

=head2 _clean_ds

Clean data structure hash for next record

  _clean_ds();

=cut

sub _clean_ds {
        my $a = {@_};
        ($out,$marc_record, $marc_encoding, $marc_repeatable_subfield, $marc_indicators) = ();
        ($marc_record_offset, $marc_fetch_offset) = (0,0);
        $marc_encoding = $a->{marc_encoding};
}

=head2 _set_lookup

Set current lookup hash

  _set_lookup( $lookup );

=cut

my $lookup;

sub _set_lookup {
        $lookup = shift;
}

=head2 _get_marc_fields

Get all fields defined by calls to C<marc>

        $marc->add_fields( WebPAC::Normalize:_get_marc_fields() );

We are using I<magic> which detect repeatable fields only from
sequence of field/subfield data generated by normalization.

Repeatable field is created when there is second occurence of same subfield or
if any of indicators are different.

This is sane for most cases. Something like:

  900a-1 900b-1 900c-1
  900a-2 900b-2
  900a-3

will be created from any combination of:

  900a-1 900a-2 900a-3 900b-1 900b-2 900c-1

and following rules:

  marc('900','a', rec('200','a') );
  marc('900','b', rec('200','b') );
  marc('900','c', rec('200','c') );

which might not be what you have in mind. If you need repeatable subfield,
define it using C<marc_repeatable_subfield> like this:

  marc_repeatable_subfield('900','a');
  marc('900','a', rec('200','a') );
  marc('900','b', rec('200','b') );
  marc('900','c', rec('200','c') );

will create:

  900a-1 900a-2 900a-3 900b-1 900c-1
  900b-2

There is also support for returning next or specific using:

  while (my $mf = WebPAC::Normalize:_get_marc_fields( fetch_next => 1 ) ) {
        # do something with $mf
  }

will always return fields from next MARC record or

  my $mf = WebPAC::Normalize::_get_marc_fields( offset => 42 );

will return 42th copy record (if it exists).

=cut

sub _get_marc_fields {

        my $arg = {@_};
        warn "### _get_marc_fields arg: ", dump($arg), $/ if ($debug > 2);
        my $offset = $marc_fetch_offset;
        if ($arg->{offset}) {
                $offset = $arg->{offset};
        } elsif($arg->{fetch_next}) {
                $marc_fetch_offset++;
        }

        return if (! $marc_record || ref($marc_record) ne 'ARRAY');

        warn "### full marc_record = ", dump( @{ $marc_record }), $/ if ($debug > 2);

        my $marc_rec = $marc_record->[ $offset ];

        warn "## _get_marc_fields (at offset: $offset) -- marc_record = ", dump( @$marc_rec ), $/ if ($debug > 1);

        return if (! $marc_rec || ref($marc_rec) ne 'ARRAY' || $#{ $marc_rec } < 0);

        # first, sort all existing fields 
        # XXX might not be needed, but modern perl might randomize elements in hash
        my @sorted_marc_record = sort {
                $a->[0] . ( $a->[3] || '' ) cmp $b->[0] . ( $b->[3] || '')
        } @{ $marc_rec };

        @sorted_marc_record = @{ $marc_rec };   ### FIXME disable sorting
        
        # output marc fields
        my @m;

        # count unique field-subfields (used for offset when walking to next subfield)
        my $u;
        map { $u->{ $_->[0] . ( $_->[3] || '')  }++ } @sorted_marc_record;

        if ($debug) {
                warn "## marc_repeatable_subfield = ", dump( $marc_repeatable_subfield ), $/ if ( $marc_repeatable_subfield );
                warn "## marc_record[$offset] = ", dump( $marc_rec ), $/;
                warn "## sorted_marc_record = ", dump( \@sorted_marc_record ), $/;
                warn "## subfield count = ", dump( $u ), $/;
        }

        my $len = $#sorted_marc_record;
        my $visited;
        my $i = 0;
        my $field;

        foreach ( 0 .. $len ) {

                # find next element which isn't visited
                while ($visited->{$i}) {
                        $i = ($i + 1) % ($len + 1);
                }

                # mark it visited
                $visited->{$i}++;

                my $row = dclone( $sorted_marc_record[$i] );

                # field and subfield which is key for
                # marc_repeatable_subfield and u
                my $fsf = $row->[0] . ( $row->[3] || '' );

                if ($debug > 1) {

                        print "### field so far [", $#$field, "] : ", dump( $field ), " ", $field ? 'T' : 'F', $/;
                        print "### this [$i]: ", dump( $row ),$/;
                        print "### sf: ", $row->[3], " vs ", $field->[3],
                                $marc_repeatable_subfield->{ $row->[0] . $row->[3] } ? ' (repeatable)' : '', $/,
                                if ($#$field >= 0);

                }

                # if field exists
                if ( $#$field >= 0 ) {
                        if (
                                $row->[0] ne $field->[0] ||             # field
                                $row->[1] ne $field->[1] ||             # i1
                                $row->[2] ne $field->[2]                # i2
                        ) {
                                push @m, $field;
                                warn "## saved/1 ", dump( $field ),$/ if ($debug);
                                $field = $row;

                        } elsif (
                                ( $row->[3] lt $field->[-2] )           # subfield which is not next (e.g. a after c)
                                ||
                                ( $row->[3] eq $field->[-2] &&          # same subfield, but not repeatable
                                        ! $marc_repeatable_subfield->{ $fsf }
                                )
                        ) {
                                push @m, $field;
                                warn "## saved/2 ", dump( $field ),$/ if ($debug);
                                $field = $row;

                        } else {
                                # append new subfields to existing field
                                push @$field, ( $row->[3], $row->[4] );
                        }
                } else {
                        # insert first field
                        $field = $row;
                }

                if (! $marc_repeatable_subfield->{ $fsf }) {
                        # make step to next subfield
                        $i = ($i + $u->{ $fsf } ) % ($len + 1);
                }
        }

        if ($#$field >= 0) {
                push @m, $field;
                warn "## saved/3 ", dump( $field ),$/ if ($debug);
        }

        return \@m;
}

=head2 _debug

Change level of debug warnings

  _debug( 2 );

=cut

sub _debug {
        my $l = shift;
        return $debug unless defined($l);
        warn "debug level $l",$/ if ($l > 0);
        $debug = $l;
}

=head1 Functions to create C<data_structure>

Those functions generally have to first in your normalization file.

=head2 tag

Define new tag for I<search> and I<display>.

  tag('Title', rec('200','a') );


=cut

sub tag {
        my $name = shift or die "tag needs name as first argument";
        my @o = grep { defined($_) && $_ ne '' } @_;
        return unless (@o);
        $out->{$name}->{tag} = $name;
        $out->{$name}->{search} = \@o;
        $out->{$name}->{display} = \@o;
}

=head2 display

Define tag just for I<display>

  @v = display('Title', rec('200','a') );

=cut

sub display {
        my $name = shift or die "display needs name as first argument";
        my @o = grep { defined($_) && $_ ne '' } @_;
        return unless (@o);
        $out->{$name}->{tag} = $name;
        $out->{$name}->{display} = \@o;
}

=head2 search

Prepare values just for I<search>

  @v = search('Title', rec('200','a') );

=cut

sub search {
        my $name = shift or die "search needs name as first argument";
        my @o = grep { defined($_) && $_ ne '' } @_;
        return unless (@o);
        $out->{$name}->{tag} = $name;
        $out->{$name}->{search} = \@o;
}

=head2 marc_leader

Setup fields within MARC leader or get leader

  marc_leader('05','c');
  my $leader = marc_leader();

=cut

sub marc_leader {
        my ($offset,$value) = @_;

        if ($offset) {
                $out->{' leader'}->{ $offset } = $value;
        } else {
                return $out->{' leader'};
        }
}

=head2 marc

Save value for MARC field

  marc('900','a', rec('200','a') );
  marc('001', rec('000') );

=cut

sub marc {
        my $f = shift or die "marc needs field";
        die "marc field must be numer" unless ($f =~ /^\d+$/);

        my $sf;
        if ($f >= 10) {
                $sf = shift or die "marc needs subfield";
        }

        foreach (@_) {
                my $v = $_;             # make var read-write for Encode
                next unless (defined($v) && $v !~ /^\s*$/);
                from_to($v, 'iso-8859-2', $marc_encoding) if ($marc_encoding);
                my ($i1,$i2) = defined($marc_indicators->{$f}) ? @{ $marc_indicators->{$f} } : (' ',' ');
                if (defined $sf) {
                        push @{ $marc_record->[ $marc_record_offset ] }, [ $f, $i1, $i2, $sf => $v ];
                } else {
                        push @{ $marc_record->[ $marc_record_offset ] }, [ $f, $v ];
                }
        }
}

=head2 marc_repeatable_subfield

Save values for MARC repetable subfield

  marc_repeatable_subfield('910', 'z', rec('909') );

=cut

sub marc_repeatable_subfield {
        my ($f,$sf) = @_;
        die "marc_repeatable_subfield need field and subfield!\n" unless ($f && $sf);
        $marc_repeatable_subfield->{ $f . $sf }++;
        marc(@_);
}

=head2 marc_indicators

Set both indicators for MARC field

  marc_indicators('900', ' ', 1);

Any indicator value other than C<0-9> will be treated as undefined.

=cut

sub marc_indicators {
        my $f = shift || die "marc_indicators need field!\n";
        my ($i1,$i2) = @_;
        die "marc_indicators($f, ...) need i1!\n" unless(defined($i1));
        die "marc_indicators($f, $i1, ...) need i2!\n" unless(defined($i2));

        $i1 = ' ' if ($i1 !~ /^\d$/);
        $i2 = ' ' if ($i2 !~ /^\d$/);
        @{ $marc_indicators->{$f} } = ($i1,$i2);
}

=head2 marc_compose

Save values for each MARC subfield explicitly

  marc_compose('900',
        'a', rec('200','a')
        'b', rec('201','a')
        'a', rec('200','b')
        'c', rec('200','c')
  );

=cut

sub marc_compose {
        my $f = shift or die "marc_compose needs field";
        die "marc_compose field must be numer" unless ($f =~ /^\d+$/);

        my ($i1,$i2) = defined($marc_indicators->{$f}) ? @{ $marc_indicators->{$f} } : (' ',' ');
        my $m = [ $f, $i1, $i2 ];

        warn "### marc_compose input subfields = ", dump(@_),$/ if ($debug > 2);

        while (@_) {
                my $sf = shift or die "marc_compose $f needs subfield";
                my $v = shift;

                next unless (defined($v) && $v !~ /^\s*$/);
                from_to($v, 'iso-8859-2', $marc_encoding) if ($marc_encoding);
                push @$m, ( $sf, $v );
                warn "## ++ marc_compose($f,$sf,$v) ", dump( $m ),$/ if ($debug > 1);
        }

        warn "## marc_compose current marc = ", dump( $m ),$/ if ($debug > 1);

        push @{ $marc_record->[ $marc_record_offset ] }, $m if ($#{$m} > 2);
}

=head2 marc_duplicate

Generate copy of current MARC record and continue working on copy

  marc_duplicate();

Copies can be accessed using C<< _get_marc_fields( fetch_next => 1 ) >> or
C<< _get_marc_fields( offset => 42 ) >>.

=cut

sub marc_duplicate {
         my $m = $marc_record->[ -1 ];
         die "can't duplicate record which isn't defined" unless ($m);
         push @{ $marc_record }, dclone( $m );
         warn "## marc_duplicate = ", dump(@$marc_record), $/ if ($debug > 1);
         $marc_record_offset = $#{ $marc_record };
         warn "## marc_record_offset = $marc_record_offset", $/ if ($debug > 1);
}

=head2 marc_remove

Remove some field or subfield from MARC record.

  marc_remove('200');
  marc_remove('200','a');

This will erase field C<200> or C<200^a> from current MARC record.

This is useful after calling C<marc_duplicate> or on it's own (but, you
should probably just remove that subfield definition if you are not
using C<marc_duplicate>).

FIXME: support fields < 10.

=cut

sub marc_remove {
        my ($f, $sf) = @_;

        die "marc_remove needs record number" unless defined($f);

        my $marc = $marc_record->[ $marc_record_offset ];

        warn "### marc_remove before = ", dump( $marc ), $/ if ($debug > 2);

        my $i = 0;
        foreach ( 0 .. $#{ $marc } ) {
                last unless (defined $marc->[$i]);
                warn "#### working on ",dump( @{ $marc->[$i] }), $/ if ($debug > 3);
                if ($marc->[$i]->[0] eq $f) {
                        if (! defined $sf) {
                                # remove whole field
                                splice @$marc, $i, 1;
                                warn "#### slice \@\$marc, $i, 1 = ",dump( @{ $marc }), $/ if ($debug > 3);
                                $i--;
                        } else {
                                foreach my $j ( 0 .. (( $#{ $marc->[$i] } - 3 ) / 2) ) {
                                        my $o = ($j * 2) + 3;
                                        if ($marc->[$i]->[$o] eq $sf) {
                                                # remove subfield
                                                splice @{$marc->[$i]}, $o, 2;
                                                warn "#### slice \@{\$marc->[$i]}, $o, 2 = ", dump( @{ $marc }), $/ if ($debug > 3);
                                                # is record now empty?
                                                if ($#{ $marc->[$i] } == 2) {
                                                        splice @$marc, $i, 1;
                                                        warn "#### slice \@\$marc, $i, 1 = ", dump( @{ $marc }), $/ if ($debug > 3);
                                                        $i--;
                                                };
                                        }
                                }
                        }
                }
                $i++;
        }

        warn "### marc_remove($f", $sf ? ",$sf" : "", ") after = ", dump( $marc ), $/ if ($debug > 2);

        $marc_record->[ $marc_record_offset ] = $marc;

        warn "## full marc_record = ", dump( @{ $marc_record }), $/ if ($debug > 1);
}

=head1 Functions to extract data from input

This function should be used inside functions to create C<data_structure> described
above.

=head2 rec1

Return all values in some field

  @v = rec1('200')

TODO: order of values is probably same as in source data, need to investigate that

=cut

sub rec1 {
        my $f = shift;
        warn "rec1($f) = ", dump( $rec->{$f} ), $/ if ($debug > 1);
        return unless (defined($rec) && defined($rec->{$f}));
        warn "rec1($f) = ", dump( $rec->{$f} ), $/ if ($debug > 1);
        if (ref($rec->{$f}) eq 'ARRAY') {
                return map { 
                        if (ref($_) eq 'HASH') {
                                values %{$_};
                        } else {
                                $_;
                        }
                } @{ $rec->{$f} };
        } elsif( defined($rec->{$f}) ) {
                return $rec->{$f};
        }
}

=head2 rec2

Return all values in specific field and subfield

  @v = rec2('200','a')

=cut

sub rec2 {
        my $f = shift;
        return unless (defined($rec && $rec->{$f}));
        my $sf = shift;
        return map {
                if (ref($_->{$sf}) eq 'ARRAY') {
                        @{ $_->{$sf} };
                } else {
                        $_->{$sf};
                }
        } grep { ref($_) eq 'HASH' && $_->{$sf} } @{ $rec->{$f} };
}

=head2 rec

syntaxtic sugar for

  @v = rec('200')
  @v = rec('200','a')

=cut

sub rec {
        my @out;
        if ($#_ == 0) {
                @out = rec1(@_);
        } elsif ($#_ == 1) {
                @out = rec2(@_);
        }
        if (@out) {
                return @out;
        } else {
                return '';
        }
}

=head2 regex

Apply regex to some or all values

  @v = regex( 's/foo/bar/g', @v );

=cut

sub regex {
        my $r = shift;
        my @out;
        #warn "r: $r\n", dump(\@_);
        foreach my $t (@_) {
                next unless ($t);
                eval "\$t =~ $r";
                push @out, $t if ($t && $t ne '');
        }
        return @out;
}

=head2 prefix

Prefix all values with a string

  @v = prefix( 'my_', @v );

=cut

sub prefix {
        my $p = shift or die "prefix needs string as first argument";
        return map { $p . $_ } grep { defined($_) } @_;
}

=head2 suffix

suffix all values with a string

  @v = suffix( '_my', @v );

=cut

sub suffix {
        my $s = shift or die "suffix needs string as first argument";
        return map { $_ . $s } grep { defined($_) } @_;
}

=head2 surround

surround all values with a two strings

  @v = surround( 'prefix_', '_suffix', @v );

=cut

sub surround {
        my $p = shift or die "surround need prefix as first argument";
        my $s = shift or die "surround needs suffix as second argument";
        return map { $p . $_ . $s } grep { defined($_) } @_;
}

=head2 first

Return first element

  $v = first( @v );

=cut

sub first {
        my $r = shift;
        return $r;
}

=head2 lookup

Consult lookup hashes for some value

  @v = lookup( $v );
  @v = lookup( @v );

=cut

sub lookup {
        my $k = shift or return;
        return unless (defined($lookup->{$k}));
        if (ref($lookup->{$k}) eq 'ARRAY') {
                return @{ $lookup->{$k} };
        } else {
                return $lookup->{$k};
        }
}

=head2 join_with

Joins walues with some delimiter

  $v = join_with(", ", @v);

=cut

sub join_with {
        my $d = shift;
        warn "### join_with('$d',",dump(@_),")\n" if ($debug > 2);
        my $v = join($d, grep { defined($_) && $_ ne '' } @_);
        return '' unless defined($v);
        return $v;
}

=head2 split_rec_on

Split record subfield on some regex and take one of parts out

  $a_before_semi_column =
        split_rec_on('200','a', /\s*;\s*/, $part);

C<$part> is optional number of element. First element is
B<1>, not 0!

If there is no C<$part> parameter or C<$part> is 0, this function will
return all values produced by splitting.

=cut

sub split_rec_on {
        die "split_rec_on need (fld,sf,regex[,part]" if ($#_ < 2);

        my ($fld, $sf, $regex, $part) = @_;
        warn "### regex ", ref($regex), $regex, $/ if ($debug > 2);

        my @r = rec( $fld, $sf );
        my $v = shift @r;
        warn "### first rec($fld,$sf) = ",dump($v),$/ if ($debug > 2);

        return '' if( ! defined($v) || $v =~ /^\s*$/);

        my @s = split( $regex, $v );
        warn "## split_rec_on($fld,$sf,$regex,$part) = ",dump(@s),$/ if ($debug > 1);
        if ($part && $part > 0) {
                return $s[ $part - 1 ];
        } else {
                return @s;
        }
}

# END
1;
1	package WebPAC::Normalize;
2	use Exporter 'import';
3	@EXPORT = qw/
4	_set_rec _set_lookup
5	_get_ds _clean_ds
6	_debug
7
8	tag search display
9	marc marc_indicators marc_repeatable_subfield
10	marc_compose marc_leader
11	marc_duplicate marc_remove
12
13	rec1 rec2 rec
14	regex prefix suffix surround
15	first lookup join_with
16
17	split_rec_on
18	/;
19
20	use warnings;
21	use strict;
22
23	#use base qw/WebPAC::Common/;
24	use Data::Dump qw/dump/;
25	use Encode qw/from_to/;
26	use Storable qw/dclone/;
27
28	# debugging warn(s)
29	my $debug = 0;
30
31
32	=head1 NAME
33
34	WebPAC::Normalize - describe normalisaton rules using sets
35
36	=head1 VERSION
37
38	Version 0.12
39
40	=cut
41
42	our $VERSION = '0.12';
43
44	=head1 SYNOPSIS
45
46	This module uses C<conf/normalize/*.pl> files to perform normalisation
47	from input records using perl functions which are specialized for set
48	processing.
49
50	Sets are implemented as arrays, and normalisation file is valid perl, which
51	means that you check it's validity before running WebPAC using
52	C<perl -c normalize.pl>.
53
54	Normalisation can generate multiple output normalized data. For now, supported output
55	types (on the left side of definition) are: C<tag>, C<display>, C<search> and
56	C<marc>.
57
58	=head1 FUNCTIONS
59
60	Functions which start with C<_> are private and used by WebPAC internally.
61	All other functions are available for use within normalisation rules.
62
63	=head2 data_structure
64
65	Return data structure
66
67	my $ds = WebPAC::Normalize::data_structure(
68	lookup => $lookup->lookup_hash,
69	row => $row,
70	rules => $normalize_pl_config,
71	marc_encoding => 'utf-8',
72	);
73
74	Options C<lookup>, C<row>, C<rules> and C<log> are mandatory while all
75	other are optional.
76
77	This function will B<die> if normalizastion can't be evaled.
78
79	Since this function isn't exported you have to call it with
80	C<WebPAC::Normalize::data_structure>.
81
82	=cut
83
84	sub data_structure {
85	my $arg = {@_};
86
87	die "need row argument" unless ($arg->{row});
88	die "need normalisation argument" unless ($arg->{rules});
89
90	no strict 'subs';
91	_set_lookup( $arg->{lookup} );
92	_set_rec( $arg->{row} );
93	_clean_ds( %{ $arg } );
94	eval "$arg->{rules}";
95	die "error evaling $arg->{rules}: $@\n" if ($@);
96
97	return _get_ds();
98	}
99
100	=head2 _set_rec
101
102	Set current record hash
103
104	_set_rec( $rec );
105
106	=cut
107
108	my $rec;
109
110	sub _set_rec {
111	$rec = shift or die "no record hash";
112	}
113
114	=head2 _get_ds
115
116	Return hash formatted as data structure
117
118	my $ds = _get_ds();
119
120	=cut
121
122	my ($out, $marc_record, $marc_encoding, $marc_repeatable_subfield, $marc_indicators);
123	my ($marc_record_offset, $marc_fetch_offset) = (0, 0);
124
125	sub _get_ds {
126	return $out;
127	}
128
129	=head2 _clean_ds
130
131	Clean data structure hash for next record
132
133	_clean_ds();
134
135	=cut
136
137	sub _clean_ds {
138	my $a = {@_};
139	($out,$marc_record, $marc_encoding, $marc_repeatable_subfield, $marc_indicators) = ();
140	($marc_record_offset, $marc_fetch_offset) = (0,0);
141	$marc_encoding = $a->{marc_encoding};
142	}
143
144	=head2 _set_lookup
145
146	Set current lookup hash
147
148	_set_lookup( $lookup );
149
150	=cut
151
152	my $lookup;
153
154	sub _set_lookup {
155	$lookup = shift;
156	}
157
158	=head2 _get_marc_fields
159
160	Get all fields defined by calls to C<marc>
161
162	$marc->add_fields( WebPAC::Normalize:_get_marc_fields() );
163
164	We are using I<magic> which detect repeatable fields only from
165	sequence of field/subfield data generated by normalization.
166
167	Repeatable field is created when there is second occurence of same subfield or
168	if any of indicators are different.
169
170	This is sane for most cases. Something like:
171
172	900a-1 900b-1 900c-1
173	900a-2 900b-2
174	900a-3
175
176	will be created from any combination of:
177
178	900a-1 900a-2 900a-3 900b-1 900b-2 900c-1
179
180	and following rules:
181
182	marc('900','a', rec('200','a') );
183	marc('900','b', rec('200','b') );
184	marc('900','c', rec('200','c') );
185
186	which might not be what you have in mind. If you need repeatable subfield,
187	define it using C<marc_repeatable_subfield> like this:
188
189	marc_repeatable_subfield('900','a');
190	marc('900','a', rec('200','a') );
191	marc('900','b', rec('200','b') );
192	marc('900','c', rec('200','c') );
193
194	will create:
195
196	900a-1 900a-2 900a-3 900b-1 900c-1
197	900b-2
198
199	There is also support for returning next or specific using:
200
201	while (my $mf = WebPAC::Normalize:_get_marc_fields( fetch_next => 1 ) ) {
202	# do something with $mf
203	}
204
205	will always return fields from next MARC record or
206
207	my $mf = WebPAC::Normalize::_get_marc_fields( offset => 42 );
208
209	will return 42th copy record (if it exists).
210
211	=cut
212
213	sub _get_marc_fields {
214
215	my $arg = {@_};
216	warn "### _get_marc_fields arg: ", dump($arg), $/ if ($debug > 2);
217	my $offset = $marc_fetch_offset;
218	if ($arg->{offset}) {
219	$offset = $arg->{offset};
220	} elsif($arg->{fetch_next}) {
221	$marc_fetch_offset++;
222	}
223
224	return if (! $marc_record \|\| ref($marc_record) ne 'ARRAY');
225
226	warn "### full marc_record = ", dump( @{ $marc_record }), $/ if ($debug > 2);
227
228	my $marc_rec = $marc_record->[ $offset ];
229
230	warn "## _get_marc_fields (at offset: $offset) -- marc_record = ", dump( @$marc_rec ), $/ if ($debug > 1);
231
232	return if (! $marc_rec \|\| ref($marc_rec) ne 'ARRAY' \|\| $#{ $marc_rec } < 0);
233
234	# first, sort all existing fields
235	# XXX might not be needed, but modern perl might randomize elements in hash
236	my @sorted_marc_record = sort {
237	$a->[0] . ( $a->[3] \|\| '' ) cmp $b->[0] . ( $b->[3] \|\| '')
238	} @{ $marc_rec };
239
240	@sorted_marc_record = @{ $marc_rec }; ### FIXME disable sorting
241
242	# output marc fields
243	my @m;
244
245	# count unique field-subfields (used for offset when walking to next subfield)
246	my $u;
247	map { $u->{ $_->[0] . ( $_->[3] \|\| '') }++ } @sorted_marc_record;
248
249	if ($debug) {
250	warn "## marc_repeatable_subfield = ", dump( $marc_repeatable_subfield ), $/ if ( $marc_repeatable_subfield );
251	warn "## marc_record[$offset] = ", dump( $marc_rec ), $/;
252	warn "## sorted_marc_record = ", dump( \@sorted_marc_record ), $/;
253	warn "## subfield count = ", dump( $u ), $/;
254	}
255
256	my $len = $#sorted_marc_record;
257	my $visited;
258	my $i = 0;
259	my $field;
260
261	foreach ( 0 .. $len ) {
262
263	# find next element which isn't visited
264	while ($visited->{$i}) {
265	$i = ($i + 1) % ($len + 1);
266	}
267
268	# mark it visited
269	$visited->{$i}++;
270
271	my $row = dclone( $sorted_marc_record[$i] );
272
273	# field and subfield which is key for
274	# marc_repeatable_subfield and u
275	my $fsf = $row->[0] . ( $row->[3] \|\| '' );
276
277	if ($debug > 1) {
278
279	print "### field so far [", $#$field, "] : ", dump( $field ), " ", $field ? 'T' : 'F', $/;
280	print "### this [$i]: ", dump( $row ),$/;
281	print "### sf: ", $row->[3], " vs ", $field->[3],
282	$marc_repeatable_subfield->{ $row->[0] . $row->[3] } ? ' (repeatable)' : '', $/,
283	if ($#$field >= 0);
284
285	}
286
287	# if field exists
288	if ( $#$field >= 0 ) {
289	if (
290	$row->[0] ne $field->[0] \|\| # field
291	$row->[1] ne $field->[1] \|\| # i1
292	$row->[2] ne $field->[2] # i2
293	) {
294	push @m, $field;
295	warn "## saved/1 ", dump( $field ),$/ if ($debug);
296	$field = $row;
297
298	} elsif (
299	( $row->[3] lt $field->[-2] ) # subfield which is not next (e.g. a after c)
300	\|\|
301	( $row->[3] eq $field->[-2] && # same subfield, but not repeatable
302	! $marc_repeatable_subfield->{ $fsf }
303	)
304	) {
305	push @m, $field;
306	warn "## saved/2 ", dump( $field ),$/ if ($debug);
307	$field = $row;
308
309	} else {
310	# append new subfields to existing field
311	push @$field, ( $row->[3], $row->[4] );
312	}
313	} else {
314	# insert first field
315	$field = $row;
316	}
317
318	if (! $marc_repeatable_subfield->{ $fsf }) {
319	# make step to next subfield
320	$i = ($i + $u->{ $fsf } ) % ($len + 1);
321	}
322	}
323
324	if ($#$field >= 0) {
325	push @m, $field;
326	warn "## saved/3 ", dump( $field ),$/ if ($debug);
327	}
328
329	return \@m;
330	}
331
332	=head2 _debug
333
334	Change level of debug warnings
335
336	_debug( 2 );
337
338	=cut
339
340	sub _debug {
341	my $l = shift;
342	return $debug unless defined($l);
343	warn "debug level $l",$/ if ($l > 0);
344	$debug = $l;
345	}
346
347	=head1 Functions to create C<data_structure>
348
349	Those functions generally have to first in your normalization file.
350
351	=head2 tag
352
353	Define new tag for I<search> and I<display>.
354
355	tag('Title', rec('200','a') );
356
357
358	=cut
359
360	sub tag {
361	my $name = shift or die "tag needs name as first argument";
362	my @o = grep { defined($_) && $_ ne '' } @_;
363	return unless (@o);
364	$out->{$name}->{tag} = $name;
365	$out->{$name}->{search} = \@o;
366	$out->{$name}->{display} = \@o;
367	}
368
369	=head2 display
370
371	Define tag just for I<display>
372
373	@v = display('Title', rec('200','a') );
374
375	=cut
376
377	sub display {
378	my $name = shift or die "display needs name as first argument";
379	my @o = grep { defined($_) && $_ ne '' } @_;
380	return unless (@o);
381	$out->{$name}->{tag} = $name;
382	$out->{$name}->{display} = \@o;
383	}
384
385	=head2 search
386
387	Prepare values just for I<search>
388
389	@v = search('Title', rec('200','a') );
390
391	=cut
392
393	sub search {
394	my $name = shift or die "search needs name as first argument";
395	my @o = grep { defined($_) && $_ ne '' } @_;
396	return unless (@o);
397	$out->{$name}->{tag} = $name;
398	$out->{$name}->{search} = \@o;
399	}
400
401	=head2 marc_leader
402
403	Setup fields within MARC leader or get leader
404
405	marc_leader('05','c');
406	my $leader = marc_leader();
407
408	=cut
409
410	sub marc_leader {
411	my ($offset,$value) = @_;
412
413	if ($offset) {
414	$out->{' leader'}->{ $offset } = $value;
415	} else {
416	return $out->{' leader'};
417	}
418	}
419
420	=head2 marc
421
422	Save value for MARC field
423
424	marc('900','a', rec('200','a') );
425	marc('001', rec('000') );
426
427	=cut
428
429	sub marc {
430	my $f = shift or die "marc needs field";
431	die "marc field must be numer" unless ($f =~ /^\d+$/);
432
433	my $sf;
434	if ($f >= 10) {
435	$sf = shift or die "marc needs subfield";
436	}
437
438	foreach (@_) {
439	my $v = $_; # make var read-write for Encode
440	next unless (defined($v) && $v !~ /^\s*$/);
441	from_to($v, 'iso-8859-2', $marc_encoding) if ($marc_encoding);
442	my ($i1,$i2) = defined($marc_indicators->{$f}) ? @{ $marc_indicators->{$f} } : (' ',' ');
443	if (defined $sf) {
444	push @{ $marc_record->[ $marc_record_offset ] }, [ $f, $i1, $i2, $sf => $v ];
445	} else {
446	push @{ $marc_record->[ $marc_record_offset ] }, [ $f, $v ];
447	}
448	}
449	}
450
451	=head2 marc_repeatable_subfield
452
453	Save values for MARC repetable subfield
454
455	marc_repeatable_subfield('910', 'z', rec('909') );
456
457	=cut
458
459	sub marc_repeatable_subfield {
460	my ($f,$sf) = @_;
461	die "marc_repeatable_subfield need field and subfield!\n" unless ($f && $sf);
462	$marc_repeatable_subfield->{ $f . $sf }++;
463	marc(@_);
464	}
465
466	=head2 marc_indicators
467
468	Set both indicators for MARC field
469
470	marc_indicators('900', ' ', 1);
471
472	Any indicator value other than C<0-9> will be treated as undefined.
473
474	=cut
475
476	sub marc_indicators {
477	my $f = shift \|\| die "marc_indicators need field!\n";
478	my ($i1,$i2) = @_;
479	die "marc_indicators($f, ...) need i1!\n" unless(defined($i1));
480	die "marc_indicators($f, $i1, ...) need i2!\n" unless(defined($i2));
481
482	$i1 = ' ' if ($i1 !~ /^\d$/);
483	$i2 = ' ' if ($i2 !~ /^\d$/);
484	@{ $marc_indicators->{$f} } = ($i1,$i2);
485	}
486
487	=head2 marc_compose
488
489	Save values for each MARC subfield explicitly
490
491	marc_compose('900',
492	'a', rec('200','a')
493	'b', rec('201','a')
494	'a', rec('200','b')
495	'c', rec('200','c')
496	);
497
498	=cut
499
500	sub marc_compose {
501	my $f = shift or die "marc_compose needs field";
502	die "marc_compose field must be numer" unless ($f =~ /^\d+$/);
503
504	my ($i1,$i2) = defined($marc_indicators->{$f}) ? @{ $marc_indicators->{$f} } : (' ',' ');
505	my $m = [ $f, $i1, $i2 ];
506
507	warn "### marc_compose input subfields = ", dump(@_),$/ if ($debug > 2);
508
509	while (@_) {
510	my $sf = shift or die "marc_compose $f needs subfield";
511	my $v = shift;
512
513	next unless (defined($v) && $v !~ /^\s*$/);
514	from_to($v, 'iso-8859-2', $marc_encoding) if ($marc_encoding);
515	push @$m, ( $sf, $v );
516	warn "## ++ marc_compose($f,$sf,$v) ", dump( $m ),$/ if ($debug > 1);
517	}
518
519	warn "## marc_compose current marc = ", dump( $m ),$/ if ($debug > 1);
520
521	push @{ $marc_record->[ $marc_record_offset ] }, $m if ($#{$m} > 2);
522	}
523
524	=head2 marc_duplicate
525
526	Generate copy of current MARC record and continue working on copy
527
528	marc_duplicate();
529
530	Copies can be accessed using C<< _get_marc_fields( fetch_next => 1 ) >> or
531	C<< _get_marc_fields( offset => 42 ) >>.
532
533	=cut
534
535	sub marc_duplicate {
536	my $m = $marc_record->[ -1 ];
537	die "can't duplicate record which isn't defined" unless ($m);
538	push @{ $marc_record }, dclone( $m );
539	warn "## marc_duplicate = ", dump(@$marc_record), $/ if ($debug > 1);
540	$marc_record_offset = $#{ $marc_record };
541	warn "## marc_record_offset = $marc_record_offset", $/ if ($debug > 1);
542	}
543
544	=head2 marc_remove
545
546	Remove some field or subfield from MARC record.
547
548	marc_remove('200');
549	marc_remove('200','a');
550
551	This will erase field C<200> or C<200^a> from current MARC record.
552
553	This is useful after calling C<marc_duplicate> or on it's own (but, you
554	should probably just remove that subfield definition if you are not
555	using C<marc_duplicate>).
556
557	FIXME: support fields < 10.
558
559	=cut
560
561	sub marc_remove {
562	my ($f, $sf) = @_;
563
564	die "marc_remove needs record number" unless defined($f);
565
566	my $marc = $marc_record->[ $marc_record_offset ];
567
568	warn "### marc_remove before = ", dump( $marc ), $/ if ($debug > 2);
569
570	my $i = 0;
571	foreach ( 0 .. $#{ $marc } ) {
572	last unless (defined $marc->[$i]);
573	warn "#### working on ",dump( @{ $marc->[$i] }), $/ if ($debug > 3);
574	if ($marc->[$i]->[0] eq $f) {
575	if (! defined $sf) {
576	# remove whole field
577	splice @$marc, $i, 1;
578	warn "#### slice \@\$marc, $i, 1 = ",dump( @{ $marc }), $/ if ($debug > 3);
579	$i--;
580	} else {
581	foreach my $j ( 0 .. (( $#{ $marc->[$i] } - 3 ) / 2) ) {
582	my $o = ($j * 2) + 3;
583	if ($marc->[$i]->[$o] eq $sf) {
584	# remove subfield
585	splice @{$marc->[$i]}, $o, 2;
586	warn "#### slice \@{\$marc->[$i]}, $o, 2 = ", dump( @{ $marc }), $/ if ($debug > 3);
587	# is record now empty?
588	if ($#{ $marc->[$i] } == 2) {
589	splice @$marc, $i, 1;
590	warn "#### slice \@\$marc, $i, 1 = ", dump( @{ $marc }), $/ if ($debug > 3);
591	$i--;
592	};
593	}
594	}
595	}
596	}
597	$i++;
598	}
599
600	warn "### marc_remove($f", $sf ? ",$sf" : "", ") after = ", dump( $marc ), $/ if ($debug > 2);
601
602	$marc_record->[ $marc_record_offset ] = $marc;
603
604	warn "## full marc_record = ", dump( @{ $marc_record }), $/ if ($debug > 1);
605	}
606
607	=head1 Functions to extract data from input
608
609	This function should be used inside functions to create C<data_structure> described
610	above.
611
612	=head2 rec1
613
614	Return all values in some field
615
616	@v = rec1('200')
617
618	TODO: order of values is probably same as in source data, need to investigate that
619
620	=cut
621
622	sub rec1 {
623	my $f = shift;
624	warn "rec1($f) = ", dump( $rec->{$f} ), $/ if ($debug > 1);
625	return unless (defined($rec) && defined($rec->{$f}));
626	warn "rec1($f) = ", dump( $rec->{$f} ), $/ if ($debug > 1);
627	if (ref($rec->{$f}) eq 'ARRAY') {
628	return map {
629	if (ref($_) eq 'HASH') {
630	values %{$_};
631	} else {
632	$_;
633	}
634	} @{ $rec->{$f} };
635	} elsif( defined($rec->{$f}) ) {
636	return $rec->{$f};
637	}
638	}
639
640	=head2 rec2
641
642	Return all values in specific field and subfield
643
644	@v = rec2('200','a')
645
646	=cut
647
648	sub rec2 {
649	my $f = shift;
650	return unless (defined($rec && $rec->{$f}));
651	my $sf = shift;
652	return map {
653	if (ref($_->{$sf}) eq 'ARRAY') {
654	@{ $_->{$sf} };
655	} else {
656	$_->{$sf};
657	}
658	} grep { ref($_) eq 'HASH' && $_->{$sf} } @{ $rec->{$f} };
659	}
660
661	=head2 rec
662
663	syntaxtic sugar for
664
665	@v = rec('200')
666	@v = rec('200','a')
667
668	=cut
669
670	sub rec {
671	my @out;
672	if ($#_ == 0) {
673	@out = rec1(@_);
674	} elsif ($#_ == 1) {
675	@out = rec2(@_);
676	}
677	if (@out) {
678	return @out;
679	} else {
680	return '';
681	}
682	}
683
684	=head2 regex
685
686	Apply regex to some or all values
687
688	@v = regex( 's/foo/bar/g', @v );
689
690	=cut
691
692	sub regex {
693	my $r = shift;
694	my @out;
695	#warn "r: $r\n", dump(\@_);
696	foreach my $t (@_) {
697	next unless ($t);
698	eval "\$t =~ $r";
699	push @out, $t if ($t && $t ne '');
700	}
701	return @out;
702	}
703
704	=head2 prefix
705
706	Prefix all values with a string
707
708	@v = prefix( 'my_', @v );
709
710	=cut
711
712	sub prefix {
713	my $p = shift or die "prefix needs string as first argument";
714	return map { $p . $_ } grep { defined($_) } @_;
715	}
716
717	=head2 suffix
718
719	suffix all values with a string
720
721	@v = suffix( '_my', @v );
722
723	=cut
724
725	sub suffix {
726	my $s = shift or die "suffix needs string as first argument";
727	return map { $_ . $s } grep { defined($_) } @_;
728	}
729
730	=head2 surround
731
732	surround all values with a two strings
733
734	@v = surround( 'prefix_', '_suffix', @v );
735
736	=cut
737
738	sub surround {
739	my $p = shift or die "surround need prefix as first argument";
740	my $s = shift or die "surround needs suffix as second argument";
741	return map { $p . $_ . $s } grep { defined($_) } @_;
742	}
743
744	=head2 first
745
746	Return first element
747
748	$v = first( @v );
749
750	=cut
751
752	sub first {
753	my $r = shift;
754	return $r;
755	}
756
757	=head2 lookup
758
759	Consult lookup hashes for some value
760
761	@v = lookup( $v );
762	@v = lookup( @v );
763
764	=cut
765
766	sub lookup {
767	my $k = shift or return;
768	return unless (defined($lookup->{$k}));
769	if (ref($lookup->{$k}) eq 'ARRAY') {
770	return @{ $lookup->{$k} };
771	} else {
772	return $lookup->{$k};
773	}
774	}
775
776	=head2 join_with
777
778	Joins walues with some delimiter
779
780	$v = join_with(", ", @v);
781
782	=cut
783
784	sub join_with {
785	my $d = shift;
786	warn "### join_with('$d',",dump(@_),")\n" if ($debug > 2);
787	my $v = join($d, grep { defined($_) && $_ ne '' } @_);
788	return '' unless defined($v);
789	return $v;
790	}
791
792	=head2 split_rec_on
793
794	Split record subfield on some regex and take one of parts out
795
796	$a_before_semi_column =
797	split_rec_on('200','a', /\s;\s/, $part);
798
799	C<$part> is optional number of element. First element is
800	B<1>, not 0!
801
802	If there is no C<$part> parameter or C<$part> is 0, this function will
803	return all values produced by splitting.
804
805	=cut
806
807	sub split_rec_on {
808	die "split_rec_on need (fld,sf,regex[,part]" if ($#_ < 2);
809
810	my ($fld, $sf, $regex, $part) = @_;
811	warn "### regex ", ref($regex), $regex, $/ if ($debug > 2);
812
813	my @r = rec( $fld, $sf );
814	my $v = shift @r;
815	warn "### first rec($fld,$sf) = ",dump($v),$/ if ($debug > 2);
816
817	return '' if( ! defined($v) \|\| $v =~ /^\s*$/);
818
819	my @s = split( $regex, $v );
820	warn "## split_rec_on($fld,$sf,$regex,$part) = ",dump(@s),$/ if ($debug > 1);
821	if ($part && $part > 0) {
822	return $s[ $part - 1 ];
823	} else {
824	return @s;
825	}
826	}
827
828	# END
829	1;