lib/WebPAC/Normalize.pm

package WebPAC::Normalize;
use Exporter 'import';
@EXPORT = qw/
        _set_rec _set_lookup
        _get_ds _clean_ds
        _debug

        tag search display
        marc marc_indicators marc_repeatable_subfield
        marc_compose marc_leader
        marc_duplicate marc_remove

        rec1 rec2 rec
        regex prefix suffix surround
        first lookup join_with

        split_rec_on
/;

use warnings;
use strict;

#use base qw/WebPAC::Common/;
use Data::Dump qw/dump/;
use Encode qw/from_to/;
use Storable qw/dclone/;

# debugging warn(s)
my $debug = 0;


=head1 NAME

WebPAC::Normalize - describe normalisaton rules using sets

=head1 VERSION

Version 0.10

=cut

our $VERSION = '0.10';

=head1 SYNOPSIS

This module uses C<conf/normalize/*.pl> files to perform normalisation
from input records using perl functions which are specialized for set
processing.

Sets are implemented as arrays, and normalisation file is valid perl, which
means that you check it's validity before running WebPAC using
C<perl -c normalize.pl>.

Normalisation can generate multiple output normalized data. For now, supported output
types (on the left side of definition) are: C<tag>, C<display>, C<search> and
C<marc>.

=head1 FUNCTIONS

Functions which start with C<_> are private and used by WebPAC internally.
All other functions are available for use within normalisation rules.

=head2 data_structure

Return data structure

  my $ds = WebPAC::Normalize::data_structure(
        lookup => $lookup->lookup_hash,
        row => $row,
        rules => $normalize_pl_config,
        marc_encoding => 'utf-8',
  );

Options C<lookup>, C<row>, C<rules> and C<log> are mandatory while all
other are optional.

This function will B<die> if normalizastion can't be evaled.

Since this function isn't exported you have to call it with 
C<WebPAC::Normalize::data_structure>.

=cut

sub data_structure {
        my $arg = {@_};

        die "need row argument" unless ($arg->{row});
        die "need normalisation argument" unless ($arg->{rules});

        no strict 'subs';
        _set_lookup( $arg->{lookup} );
        _set_rec( $arg->{row} );
        _clean_ds( %{ $arg } );
        eval "$arg->{rules}";
        die "error evaling $arg->{rules}: $@\n" if ($@);

        return _get_ds();
}

=head2 _set_rec

Set current record hash

  _set_rec( $rec );

=cut

my $rec;

sub _set_rec {
        $rec = shift or die "no record hash";
}

=head2 _get_ds

Return hash formatted as data structure

  my $ds = _get_ds();

=cut

my ($out, $marc_record, $marc_encoding, $marc_repeatable_subfield, $marc_indicators);
my ($marc_record_offset, $marc_fetch_offset) = (0, 0);

sub _get_ds {
        return $out;
}

=head2 _clean_ds

Clean data structure hash for next record

  _clean_ds();

=cut

sub _clean_ds {
        my $a = {@_};
        ($out,$marc_record, $marc_encoding, $marc_repeatable_subfield, $marc_indicators) = ();
        ($marc_record_offset, $marc_fetch_offset) = (0,0);
        $marc_encoding = $a->{marc_encoding};
}

=head2 _set_lookup

Set current lookup hash

  _set_lookup( $lookup );

=cut

my $lookup;

sub _set_lookup {
        $lookup = shift;
}

=head2 _get_marc_fields

Get all fields defined by calls to C<marc>

        $marc->add_fields( WebPAC::Normalize:_get_marc_fields() );

We are using I<magic> which detect repeatable fields only from
sequence of field/subfield data generated by normalization.

Repeatable field is created when there is second occurence of same subfield or
if any of indicators are different.

This is sane for most cases. Something like:

  900a-1 900b-1 900c-1
  900a-2 900b-2
  900a-3

will be created from any combination of:

  900a-1 900a-2 900a-3 900b-1 900b-2 900c-1

and following rules:

  marc('900','a', rec('200','a') );
  marc('900','b', rec('200','b') );
  marc('900','c', rec('200','c') );

which might not be what you have in mind. If you need repeatable subfield,
define it using C<marc_repeatable_subfield> like this:

  marc_repeatable_subfield('900','a');
  marc('900','a', rec('200','a') );
  marc('900','b', rec('200','b') );
  marc('900','c', rec('200','c') );

will create:

  900a-1 900a-2 900a-3 900b-1 900c-1
  900b-2

There is also support for returning next or specific using:

  while (my $mf = WebPAC::Normalize:_get_marc_fields( fetch_next => 1 ) ) {
        # do something with $mf
  }

will always return fields from next MARC record or

  my $mf = WebPAC::Normalize::_get_marc_fields( offset => 42 );

will return 42th copy record (if it exists).

=cut

sub _get_marc_fields {

        my $arg = {@_};
        warn "### _get_marc_fields arg: ", dump($arg), $/ if ($debug > 2);
        my $offset = $marc_fetch_offset;
        if ($arg->{offset}) {
                $offset = $arg->{offset};
        } elsif($arg->{fetch_next}) {
                $marc_fetch_offset++;
        }

        return if (! $marc_record || ref($marc_record) ne 'ARRAY');

        warn "### full marc_record = ", dump( @{ $marc_record }), $/ if ($debug > 2);

        my $marc_rec = $marc_record->[ $offset ];

        warn "## _get_marc_fields (at offset: $offset) -- marc_record = ", dump( @$marc_rec ), $/ if ($debug > 1);

        return if (! $marc_rec || ref($marc_rec) ne 'ARRAY' || $#{ $marc_rec } < 0);

        # first, sort all existing fields 
        # XXX might not be needed, but modern perl might randomize elements in hash
        my @sorted_marc_record = sort {
                $a->[0] . ( $a->[3] || '' ) cmp $b->[0] . ( $b->[3] || '')
        } @{ $marc_rec };

        @sorted_marc_record = @{ $marc_rec };   ### FIXME disable sorting
        
        # output marc fields
        my @m;

        # count unique field-subfields (used for offset when walking to next subfield)
        my $u;
        map { $u->{ $_->[0] . ( $_->[3] || '')  }++ } @sorted_marc_record;

        if ($debug) {
                warn "## marc_repeatable_subfield = ", dump( $marc_repeatable_subfield ), $/ if ( $marc_repeatable_subfield );
                warn "## marc_record[$offset] = ", dump( $marc_rec ), $/;
                warn "## sorted_marc_record = ", dump( \@sorted_marc_record ), $/;
                warn "## subfield count = ", dump( $u ), $/;
        }

        my $len = $#sorted_marc_record;
        my $visited;
        my $i = 0;
        my $field;

        foreach ( 0 .. $len ) {

                # find next element which isn't visited
                while ($visited->{$i}) {
                        $i = ($i + 1) % ($len + 1);
                }

                # mark it visited
                $visited->{$i}++;

                my $row = dclone( $sorted_marc_record[$i] );

                # field and subfield which is key for
                # marc_repeatable_subfield and u
                my $fsf = $row->[0] . ( $row->[3] || '' );

                if ($debug > 1) {

                        print "### field so far [", $#$field, "] : ", dump( $field ), " ", $field ? 'T' : 'F', $/;
                        print "### this [$i]: ", dump( $row ),$/;
                        print "### sf: ", $row->[3], " vs ", $field->[3],
                                $marc_repeatable_subfield->{ $row->[0] . $row->[3] } ? ' (repeatable)' : '', $/,
                                if ($#$field >= 0);

                }

                # if field exists
                if ( $#$field >= 0 ) {
                        if (
                                $row->[0] ne $field->[0] ||             # field
                                $row->[1] ne $field->[1] ||             # i1
                                $row->[2] ne $field->[2]                # i2
                        ) {
                                push @m, $field;
                                warn "## saved/1 ", dump( $field ),$/ if ($debug);
                                $field = $row;

                        } elsif (
                                ( $row->[3] lt $field->[-2] )           # subfield which is not next (e.g. a after c)
                                ||
                                ( $row->[3] eq $field->[-2] &&          # same subfield, but not repeatable
                                        ! $marc_repeatable_subfield->{ $fsf }
                                )
                        ) {
                                push @m, $field;
                                warn "## saved/2 ", dump( $field ),$/ if ($debug);
                                $field = $row;

                        } else {
                                # append new subfields to existing field
                                push @$field, ( $row->[3], $row->[4] );
                        }
                } else {
                        # insert first field
                        $field = $row;
                }

                if (! $marc_repeatable_subfield->{ $fsf }) {
                        # make step to next subfield
                        $i = ($i + $u->{ $fsf } ) % ($len + 1);
                }
        }

        if ($#$field >= 0) {
                push @m, $field;
                warn "## saved/3 ", dump( $field ),$/ if ($debug);
        }

        return @m;
}

=head2 _debug

Change level of debug warnings

  _debug( 2 );

=cut

sub _debug {
        my $l = shift;
        return $debug unless defined($l);
        warn "debug level $l",$/ if ($l > 0);
        $debug = $l;
}

=head1 Functions to create C<data_structure>

Those functions generally have to first in your normalization file.

=head2 tag

Define new tag for I<search> and I<display>.

  tag('Title', rec('200','a') );


=cut

sub tag {
        my $name = shift or die "tag needs name as first argument";
        my @o = grep { defined($_) && $_ ne '' } @_;
        return unless (@o);
        $out->{$name}->{tag} = $name;
        $out->{$name}->{search} = \@o;
        $out->{$name}->{display} = \@o;
}

=head2 display

Define tag just for I<display>

  @v = display('Title', rec('200','a') );

=cut

sub display {
        my $name = shift or die "display needs name as first argument";
        my @o = grep { defined($_) && $_ ne '' } @_;
        return unless (@o);
        $out->{$name}->{tag} = $name;
        $out->{$name}->{display} = \@o;
}

=head2 search

Prepare values just for I<search>

  @v = search('Title', rec('200','a') );

=cut

sub search {
        my $name = shift or die "search needs name as first argument";
        my @o = grep { defined($_) && $_ ne '' } @_;
        return unless (@o);
        $out->{$name}->{tag} = $name;
        $out->{$name}->{search} = \@o;
}

=head2 marc_leader

Setup fields within MARC leader or get leader

  marc_leader('05','c');
  my $leader = marc_leader();

=cut

sub marc_leader {
        my ($offset,$value) = @_;

        if ($offset) {
                $out->{' leader'}->{ $offset } = $value;
        } else {
                return $out->{' leader'};
        }
}

=head2 marc

Save value for MARC field

  marc('900','a', rec('200','a') );
  marc('001', rec('000') );

=cut

sub marc {
        my $f = shift or die "marc needs field";
        die "marc field must be numer" unless ($f =~ /^\d+$/);

        my $sf;
        if ($f >= 10) {
                $sf = shift or die "marc needs subfield";
        }

        foreach (@_) {
                my $v = $_;             # make var read-write for Encode
                next unless (defined($v) && $v !~ /^\s*$/);
                from_to($v, 'iso-8859-2', $marc_encoding) if ($marc_encoding);
                my ($i1,$i2) = defined($marc_indicators->{$f}) ? @{ $marc_indicators->{$f} } : (' ',' ');
                if (defined $sf) {
                        push @{ $marc_record->[ $marc_record_offset ] }, [ $f, $i1, $i2, $sf => $v ];
                } else {
                        push @{ $marc_record->[ $marc_record_offset ] }, [ $f, $v ];
                }
        }
}

=head2 marc_repeatable_subfield

Save values for MARC repetable subfield

  marc_repeatable_subfield('910', 'z', rec('909') );

=cut

sub marc_repeatable_subfield {
        my ($f,$sf) = @_;
        die "marc_repeatable_subfield need field and subfield!\n" unless ($f && $sf);
        $marc_repeatable_subfield->{ $f . $sf }++;
        marc(@_);
}

=head2 marc_indicators

Set both indicators for MARC field

  marc_indicators('900', ' ', 1);

Any indicator value other than C<0-9> will be treated as undefined.

=cut

sub marc_indicators {
        my $f = shift || die "marc_indicators need field!\n";
        my ($i1,$i2) = @_;
        die "marc_indicators($f, ...) need i1!\n" unless(defined($i1));
        die "marc_indicators($f, $i1, ...) need i2!\n" unless(defined($i2));

        $i1 = ' ' if ($i1 !~ /^\d$/);
        $i2 = ' ' if ($i2 !~ /^\d$/);
        @{ $marc_indicators->{$f} } = ($i1,$i2);
}

=head2 marc_compose

Save values for each MARC subfield explicitly

  marc_compose('900',
        'a', rec('200','a')
        'b', rec('201','a')
        'a', rec('200','b')
        'c', rec('200','c')
  );

=cut

sub marc_compose {
        my $f = shift or die "marc_compose needs field";
        die "marc_compose field must be numer" unless ($f =~ /^\d+$/);

        my ($i1,$i2) = defined($marc_indicators->{$f}) ? @{ $marc_indicators->{$f} } : (' ',' ');
        my $m = [ $f, $i1, $i2 ];

        while (@_) {
                my $sf = shift or die "marc_compose $f needs subfield";
                my $v = shift;

                next unless (defined($v) && $v !~ /^\s*$/);
                from_to($v, 'iso-8859-2', $marc_encoding) if ($marc_encoding);
                push @$m, ( $sf, $v );
                warn "## ++ marc_compose($f,$sf,$v) ", dump( $m ),$/ if ($debug > 1);
        }

        warn "## marc_compose(d) ", dump( $m ),$/ if ($debug > 1);

        push @{ $marc_record->[ $marc_record_offset ] }, $m if ($#{$m} > 2);
}

=head2 marc_duplicate

Generate copy of current MARC record and continue working on copy

  marc_duplicate();

Copies can be accessed using C<< _get_marc_fields( fetch_next => 1 ) >> or
C<< _get_marc_fields( offset => 42 ) >>.

=cut

sub marc_duplicate {
         my $m = $marc_record->[ -1 ];
         die "can't duplicate record which isn't defined" unless ($m);
         push @{ $marc_record }, dclone( $m );
         warn "## marc_duplicate = ", dump(@$marc_record), $/ if ($debug > 1);
         $marc_record_offset = $#{ $marc_record };
         warn "## marc_record_offset = $marc_record_offset", $/ if ($debug > 1);
}

=head2 marc_remove

Remove some field or subfield from MARC record.

  marc_remove('200');
  marc_remove('200','a');

This will erase field C<200> or C<200^a> from current MARC record.

This is useful after calling C<marc_duplicate> or on it's own (but, you
should probably just remove that subfield definition if you are not
using C<marc_duplicate>).

FIXME: support fields < 10.

=cut

sub marc_remove {
        my ($f, $sf) = @_;

        die "marc_remove needs record number" unless defined($f);

        my $marc = $marc_record->[ $marc_record_offset ];

        warn "### marc_remove before = ", dump( $marc ), $/ if ($debug > 2);

        foreach my $i ( 0 .. $#{ $marc } ) {
                last unless (defined $marc->[$i]);
                warn "#### working on ",dump( @{ $marc->[$i] }), $/ if ($debug > 3);
                if ($marc->[$i]->[0] eq $f) {
                        if (! defined $sf) {
                                # remove whole field
                                splice @$marc, $i, 1;
                                warn "#### slice \@\$marc, $i, 1 = ",dump( @{ $marc }), $/ if ($debug > 3);
                                $i--;
                        } else {
                                foreach my $j ( 0 .. (( $#{ $marc->[$i] } - 3 ) / 2) ) {
                                        my $o = ($j * 2) + 3;
                                        if ($marc->[$i]->[$o] eq $sf) {
                                                # remove subfield
                                                splice @{$marc->[$i]}, $o, 2;
                                                warn "#### slice \@{\$marc->[$i]}, $o, 2 = ", dump( @{ $marc }), $/ if ($debug > 3);
                                                # is record now empty?
                                                if ($#{ $marc->[$i] } == 2) {
                                                        splice @$marc, $i, 1;
                                                        warn "#### slice \@\$marc, $i, 1 = ", dump( @{ $marc }), $/ if ($debug > 3);
                                                        $i--;
                                                };
                                        }
                                }
                        }
                }
        }

        warn "### marc_remove($f", $sf ? ",$sf" : "", ") after = ", dump( $marc ), $/ if ($debug > 2);

        $marc_record->[ $marc_record_offset ] = $marc;

        warn "## full marc_record = ", dump( @{ $marc_record }), $/ if ($debug > 1);
}

=head1 Functions to extract data from input

This function should be used inside functions to create C<data_structure> described
above.

=head2 rec1

Return all values in some field

  @v = rec1('200')

TODO: order of values is probably same as in source data, need to investigate that

=cut

sub rec1 {
        my $f = shift;
        warn "rec1($f) = ", dump( $rec->{$f} ), $/ if ($debug > 1);
        return unless (defined($rec) && defined($rec->{$f}));
        warn "rec1($f) = ", dump( $rec->{$f} ), $/ if ($debug > 1);
        if (ref($rec->{$f}) eq 'ARRAY') {
                return map { 
                        if (ref($_) eq 'HASH') {
                                values %{$_};
                        } else {
                                $_;
                        }
                } @{ $rec->{$f} };
        } elsif( defined($rec->{$f}) ) {
                return $rec->{$f};
        }
}

=head2 rec2

Return all values in specific field and subfield

  @v = rec2('200','a')

=cut

sub rec2 {
        my $f = shift;
        return unless (defined($rec && $rec->{$f}));
        my $sf = shift;
        return map { $_->{$sf} } grep { ref($_) eq 'HASH' && $_->{$sf} } @{ $rec->{$f} };
}

=head2 rec

syntaxtic sugar for

  @v = rec('200')
  @v = rec('200','a')

=cut

sub rec {
        if ($#_ == 0) {
                return rec1(@_);
        } elsif ($#_ == 1) {
                return rec2(@_);
        }
}

=head2 regex

Apply regex to some or all values

  @v = regex( 's/foo/bar/g', @v );

=cut

sub regex {
        my $r = shift;
        my @out;
        #warn "r: $r\n", dump(\@_);
        foreach my $t (@_) {
                next unless ($t);
                eval "\$t =~ $r";
                push @out, $t if ($t && $t ne '');
        }
        return @out;
}

=head2 prefix

Prefix all values with a string

  @v = prefix( 'my_', @v );

=cut

sub prefix {
        my $p = shift or die "prefix needs string as first argument";
        return map { $p . $_ } grep { defined($_) } @_;
}

=head2 suffix

suffix all values with a string

  @v = suffix( '_my', @v );

=cut

sub suffix {
        my $s = shift or die "suffix needs string as first argument";
        return map { $_ . $s } grep { defined($_) } @_;
}

=head2 surround

surround all values with a two strings

  @v = surround( 'prefix_', '_suffix', @v );

=cut

sub surround {
        my $p = shift or die "surround need prefix as first argument";
        my $s = shift or die "surround needs suffix as second argument";
        return map { $p . $_ . $s } grep { defined($_) } @_;
}

=head2 first

Return first element

  $v = first( @v );

=cut

sub first {
        my $r = shift;
        return $r;
}

=head2 lookup

Consult lookup hashes for some value

  @v = lookup( $v );
  @v = lookup( @v );

=cut

sub lookup {
        my $k = shift or return;
        return unless (defined($lookup->{$k}));
        if (ref($lookup->{$k}) eq 'ARRAY') {
                return @{ $lookup->{$k} };
        } else {
                return $lookup->{$k};
        }
}

=head2 join_with

Joins walues with some delimiter

  $v = join_with(", ", @v);

=cut

sub join_with {
        my $d = shift;
        return join($d, grep { defined($_) && $_ ne '' } @_);
}

=head2 split_rec_on

Split record subfield on some regex and take one of parts out

  $a_before_semi_column =
        split_rec_on('200','a', /\s*;\s*/, $part);

C<$part> is optional number of element. First element is
B<1>, not 0!

If there is no C<$part> parameter or C<$part> is 0, this function will
return all values produced by splitting.

=cut

sub split_rec_on {
        die "split_rec_on need (fld,sf,regex[,part]" if ($#_ < 2);

        my ($fld, $sf, $regex, $part) = @_;
        warn "### regex ", ref($regex), $regex, $/ if ($debug > 2);

        my @r = rec( $fld, $sf );
        my $v = shift @r;
        warn "### first rec($fld,$sf) = ",dump($v),$/ if ($debug > 2);

        return '' if( ! defined($v) || $v =~ /^\s*$/);

        my @s = split( $regex, $v );
        warn "## split_rec_on($fld,$sf,$regex,$part) = ",dump(@s),$/ if ($debug > 1);
        if ($part && $part > 0) {
                return $s[ $part - 1 ];
        } else {
                return @s;
        }
}

# END
1;
1	package WebPAC::Normalize;
2	use Exporter 'import';
3	@EXPORT = qw/
4	_set_rec _set_lookup
5	_get_ds _clean_ds
6	_debug
7
8	tag search display
9	marc marc_indicators marc_repeatable_subfield
10	marc_compose marc_leader
11	marc_duplicate marc_remove
12
13	rec1 rec2 rec
14	regex prefix suffix surround
15	first lookup join_with
16
17	split_rec_on
18	/;
19
20	use warnings;
21	use strict;
22
23	#use base qw/WebPAC::Common/;
24	use Data::Dump qw/dump/;
25	use Encode qw/from_to/;
26	use Storable qw/dclone/;
27
28	# debugging warn(s)
29	my $debug = 0;
30
31
32	=head1 NAME
33
34	WebPAC::Normalize - describe normalisaton rules using sets
35
36	=head1 VERSION
37
38	Version 0.10
39
40	=cut
41
42	our $VERSION = '0.10';
43
44	=head1 SYNOPSIS
45
46	This module uses C<conf/normalize/*.pl> files to perform normalisation
47	from input records using perl functions which are specialized for set
48	processing.
49
50	Sets are implemented as arrays, and normalisation file is valid perl, which
51	means that you check it's validity before running WebPAC using
52	C<perl -c normalize.pl>.
53
54	Normalisation can generate multiple output normalized data. For now, supported output
55	types (on the left side of definition) are: C<tag>, C<display>, C<search> and
56	C<marc>.
57
58	=head1 FUNCTIONS
59
60	Functions which start with C<_> are private and used by WebPAC internally.
61	All other functions are available for use within normalisation rules.
62
63	=head2 data_structure
64
65	Return data structure
66
67	my $ds = WebPAC::Normalize::data_structure(
68	lookup => $lookup->lookup_hash,
69	row => $row,
70	rules => $normalize_pl_config,
71	marc_encoding => 'utf-8',
72	);
73
74	Options C<lookup>, C<row>, C<rules> and C<log> are mandatory while all
75	other are optional.
76
77	This function will B<die> if normalizastion can't be evaled.
78
79	Since this function isn't exported you have to call it with
80	C<WebPAC::Normalize::data_structure>.
81
82	=cut
83
84	sub data_structure {
85	my $arg = {@_};
86
87	die "need row argument" unless ($arg->{row});
88	die "need normalisation argument" unless ($arg->{rules});
89
90	no strict 'subs';
91	_set_lookup( $arg->{lookup} );
92	_set_rec( $arg->{row} );
93	_clean_ds( %{ $arg } );
94	eval "$arg->{rules}";
95	die "error evaling $arg->{rules}: $@\n" if ($@);
96
97	return _get_ds();
98	}
99
100	=head2 _set_rec
101
102	Set current record hash
103
104	_set_rec( $rec );
105
106	=cut
107
108	my $rec;
109
110	sub _set_rec {
111	$rec = shift or die "no record hash";
112	}
113
114	=head2 _get_ds
115
116	Return hash formatted as data structure
117
118	my $ds = _get_ds();
119
120	=cut
121
122	my ($out, $marc_record, $marc_encoding, $marc_repeatable_subfield, $marc_indicators);
123	my ($marc_record_offset, $marc_fetch_offset) = (0, 0);
124
125	sub _get_ds {
126	return $out;
127	}
128
129	=head2 _clean_ds
130
131	Clean data structure hash for next record
132
133	_clean_ds();
134
135	=cut
136
137	sub _clean_ds {
138	my $a = {@_};
139	($out,$marc_record, $marc_encoding, $marc_repeatable_subfield, $marc_indicators) = ();
140	($marc_record_offset, $marc_fetch_offset) = (0,0);
141	$marc_encoding = $a->{marc_encoding};
142	}
143
144	=head2 _set_lookup
145
146	Set current lookup hash
147
148	_set_lookup( $lookup );
149
150	=cut
151
152	my $lookup;
153
154	sub _set_lookup {
155	$lookup = shift;
156	}
157
158	=head2 _get_marc_fields
159
160	Get all fields defined by calls to C<marc>
161
162	$marc->add_fields( WebPAC::Normalize:_get_marc_fields() );
163
164	We are using I<magic> which detect repeatable fields only from
165	sequence of field/subfield data generated by normalization.
166
167	Repeatable field is created when there is second occurence of same subfield or
168	if any of indicators are different.
169
170	This is sane for most cases. Something like:
171
172	900a-1 900b-1 900c-1
173	900a-2 900b-2
174	900a-3
175
176	will be created from any combination of:
177
178	900a-1 900a-2 900a-3 900b-1 900b-2 900c-1
179
180	and following rules:
181
182	marc('900','a', rec('200','a') );
183	marc('900','b', rec('200','b') );
184	marc('900','c', rec('200','c') );
185
186	which might not be what you have in mind. If you need repeatable subfield,
187	define it using C<marc_repeatable_subfield> like this:
188
189	marc_repeatable_subfield('900','a');
190	marc('900','a', rec('200','a') );
191	marc('900','b', rec('200','b') );
192	marc('900','c', rec('200','c') );
193
194	will create:
195
196	900a-1 900a-2 900a-3 900b-1 900c-1
197	900b-2
198
199	There is also support for returning next or specific using:
200
201	while (my $mf = WebPAC::Normalize:_get_marc_fields( fetch_next => 1 ) ) {
202	# do something with $mf
203	}
204
205	will always return fields from next MARC record or
206
207	my $mf = WebPAC::Normalize::_get_marc_fields( offset => 42 );
208
209	will return 42th copy record (if it exists).
210
211	=cut
212
213	sub _get_marc_fields {
214
215	my $arg = {@_};
216	warn "### _get_marc_fields arg: ", dump($arg), $/ if ($debug > 2);
217	my $offset = $marc_fetch_offset;
218	if ($arg->{offset}) {
219	$offset = $arg->{offset};
220	} elsif($arg->{fetch_next}) {
221	$marc_fetch_offset++;
222	}
223
224	return if (! $marc_record \|\| ref($marc_record) ne 'ARRAY');
225
226	warn "### full marc_record = ", dump( @{ $marc_record }), $/ if ($debug > 2);
227
228	my $marc_rec = $marc_record->[ $offset ];
229
230	warn "## _get_marc_fields (at offset: $offset) -- marc_record = ", dump( @$marc_rec ), $/ if ($debug > 1);
231
232	return if (! $marc_rec \|\| ref($marc_rec) ne 'ARRAY' \|\| $#{ $marc_rec } < 0);
233
234	# first, sort all existing fields
235	# XXX might not be needed, but modern perl might randomize elements in hash
236	my @sorted_marc_record = sort {
237	$a->[0] . ( $a->[3] \|\| '' ) cmp $b->[0] . ( $b->[3] \|\| '')
238	} @{ $marc_rec };
239
240	@sorted_marc_record = @{ $marc_rec }; ### FIXME disable sorting
241
242	# output marc fields
243	my @m;
244
245	# count unique field-subfields (used for offset when walking to next subfield)
246	my $u;
247	map { $u->{ $_->[0] . ( $_->[3] \|\| '') }++ } @sorted_marc_record;
248
249	if ($debug) {
250	warn "## marc_repeatable_subfield = ", dump( $marc_repeatable_subfield ), $/ if ( $marc_repeatable_subfield );
251	warn "## marc_record[$offset] = ", dump( $marc_rec ), $/;
252	warn "## sorted_marc_record = ", dump( \@sorted_marc_record ), $/;
253	warn "## subfield count = ", dump( $u ), $/;
254	}
255
256	my $len = $#sorted_marc_record;
257	my $visited;
258	my $i = 0;
259	my $field;
260
261	foreach ( 0 .. $len ) {
262
263	# find next element which isn't visited
264	while ($visited->{$i}) {
265	$i = ($i + 1) % ($len + 1);
266	}
267
268	# mark it visited
269	$visited->{$i}++;
270
271	my $row = dclone( $sorted_marc_record[$i] );
272
273	# field and subfield which is key for
274	# marc_repeatable_subfield and u
275	my $fsf = $row->[0] . ( $row->[3] \|\| '' );
276
277	if ($debug > 1) {
278
279	print "### field so far [", $#$field, "] : ", dump( $field ), " ", $field ? 'T' : 'F', $/;
280	print "### this [$i]: ", dump( $row ),$/;
281	print "### sf: ", $row->[3], " vs ", $field->[3],
282	$marc_repeatable_subfield->{ $row->[0] . $row->[3] } ? ' (repeatable)' : '', $/,
283	if ($#$field >= 0);
284
285	}
286
287	# if field exists
288	if ( $#$field >= 0 ) {
289	if (
290	$row->[0] ne $field->[0] \|\| # field
291	$row->[1] ne $field->[1] \|\| # i1
292	$row->[2] ne $field->[2] # i2
293	) {
294	push @m, $field;
295	warn "## saved/1 ", dump( $field ),$/ if ($debug);
296	$field = $row;
297
298	} elsif (
299	( $row->[3] lt $field->[-2] ) # subfield which is not next (e.g. a after c)
300	\|\|
301	( $row->[3] eq $field->[-2] && # same subfield, but not repeatable
302	! $marc_repeatable_subfield->{ $fsf }
303	)
304	) {
305	push @m, $field;
306	warn "## saved/2 ", dump( $field ),$/ if ($debug);
307	$field = $row;
308
309	} else {
310	# append new subfields to existing field
311	push @$field, ( $row->[3], $row->[4] );
312	}
313	} else {
314	# insert first field
315	$field = $row;
316	}
317
318	if (! $marc_repeatable_subfield->{ $fsf }) {
319	# make step to next subfield
320	$i = ($i + $u->{ $fsf } ) % ($len + 1);
321	}
322	}
323
324	if ($#$field >= 0) {
325	push @m, $field;
326	warn "## saved/3 ", dump( $field ),$/ if ($debug);
327	}
328
329	return @m;
330	}
331
332	=head2 _debug
333
334	Change level of debug warnings
335
336	_debug( 2 );
337
338	=cut
339
340	sub _debug {
341	my $l = shift;
342	return $debug unless defined($l);
343	warn "debug level $l",$/ if ($l > 0);
344	$debug = $l;
345	}
346
347	=head1 Functions to create C<data_structure>
348
349	Those functions generally have to first in your normalization file.
350
351	=head2 tag
352
353	Define new tag for I<search> and I<display>.
354
355	tag('Title', rec('200','a') );
356
357
358	=cut
359
360	sub tag {
361	my $name = shift or die "tag needs name as first argument";
362	my @o = grep { defined($_) && $_ ne '' } @_;
363	return unless (@o);
364	$out->{$name}->{tag} = $name;
365	$out->{$name}->{search} = \@o;
366	$out->{$name}->{display} = \@o;
367	}
368
369	=head2 display
370
371	Define tag just for I<display>
372
373	@v = display('Title', rec('200','a') );
374
375	=cut
376
377	sub display {
378	my $name = shift or die "display needs name as first argument";
379	my @o = grep { defined($_) && $_ ne '' } @_;
380	return unless (@o);
381	$out->{$name}->{tag} = $name;
382	$out->{$name}->{display} = \@o;
383	}
384
385	=head2 search
386
387	Prepare values just for I<search>
388
389	@v = search('Title', rec('200','a') );
390
391	=cut
392
393	sub search {
394	my $name = shift or die "search needs name as first argument";
395	my @o = grep { defined($_) && $_ ne '' } @_;
396	return unless (@o);
397	$out->{$name}->{tag} = $name;
398	$out->{$name}->{search} = \@o;
399	}
400
401	=head2 marc_leader
402
403	Setup fields within MARC leader or get leader
404
405	marc_leader('05','c');
406	my $leader = marc_leader();
407
408	=cut
409
410	sub marc_leader {
411	my ($offset,$value) = @_;
412
413	if ($offset) {
414	$out->{' leader'}->{ $offset } = $value;
415	} else {
416	return $out->{' leader'};
417	}
418	}
419
420	=head2 marc
421
422	Save value for MARC field
423
424	marc('900','a', rec('200','a') );
425	marc('001', rec('000') );
426
427	=cut
428
429	sub marc {
430	my $f = shift or die "marc needs field";
431	die "marc field must be numer" unless ($f =~ /^\d+$/);
432
433	my $sf;
434	if ($f >= 10) {
435	$sf = shift or die "marc needs subfield";
436	}
437
438	foreach (@_) {
439	my $v = $_; # make var read-write for Encode
440	next unless (defined($v) && $v !~ /^\s*$/);
441	from_to($v, 'iso-8859-2', $marc_encoding) if ($marc_encoding);
442	my ($i1,$i2) = defined($marc_indicators->{$f}) ? @{ $marc_indicators->{$f} } : (' ',' ');
443	if (defined $sf) {
444	push @{ $marc_record->[ $marc_record_offset ] }, [ $f, $i1, $i2, $sf => $v ];
445	} else {
446	push @{ $marc_record->[ $marc_record_offset ] }, [ $f, $v ];
447	}
448	}
449	}
450
451	=head2 marc_repeatable_subfield
452
453	Save values for MARC repetable subfield
454
455	marc_repeatable_subfield('910', 'z', rec('909') );
456
457	=cut
458
459	sub marc_repeatable_subfield {
460	my ($f,$sf) = @_;
461	die "marc_repeatable_subfield need field and subfield!\n" unless ($f && $sf);
462	$marc_repeatable_subfield->{ $f . $sf }++;
463	marc(@_);
464	}
465
466	=head2 marc_indicators
467
468	Set both indicators for MARC field
469
470	marc_indicators('900', ' ', 1);
471
472	Any indicator value other than C<0-9> will be treated as undefined.
473
474	=cut
475
476	sub marc_indicators {
477	my $f = shift \|\| die "marc_indicators need field!\n";
478	my ($i1,$i2) = @_;
479	die "marc_indicators($f, ...) need i1!\n" unless(defined($i1));
480	die "marc_indicators($f, $i1, ...) need i2!\n" unless(defined($i2));
481
482	$i1 = ' ' if ($i1 !~ /^\d$/);
483	$i2 = ' ' if ($i2 !~ /^\d$/);
484	@{ $marc_indicators->{$f} } = ($i1,$i2);
485	}
486
487	=head2 marc_compose
488
489	Save values for each MARC subfield explicitly
490
491	marc_compose('900',
492	'a', rec('200','a')
493	'b', rec('201','a')
494	'a', rec('200','b')
495	'c', rec('200','c')
496	);
497
498	=cut
499
500	sub marc_compose {
501	my $f = shift or die "marc_compose needs field";
502	die "marc_compose field must be numer" unless ($f =~ /^\d+$/);
503
504	my ($i1,$i2) = defined($marc_indicators->{$f}) ? @{ $marc_indicators->{$f} } : (' ',' ');
505	my $m = [ $f, $i1, $i2 ];
506
507	while (@_) {
508	my $sf = shift or die "marc_compose $f needs subfield";
509	my $v = shift;
510
511	next unless (defined($v) && $v !~ /^\s*$/);
512	from_to($v, 'iso-8859-2', $marc_encoding) if ($marc_encoding);
513	push @$m, ( $sf, $v );
514	warn "## ++ marc_compose($f,$sf,$v) ", dump( $m ),$/ if ($debug > 1);
515	}
516
517	warn "## marc_compose(d) ", dump( $m ),$/ if ($debug > 1);
518
519	push @{ $marc_record->[ $marc_record_offset ] }, $m if ($#{$m} > 2);
520	}
521
522	=head2 marc_duplicate
523
524	Generate copy of current MARC record and continue working on copy
525
526	marc_duplicate();
527
528	Copies can be accessed using C<< _get_marc_fields( fetch_next => 1 ) >> or
529	C<< _get_marc_fields( offset => 42 ) >>.
530
531	=cut
532
533	sub marc_duplicate {
534	my $m = $marc_record->[ -1 ];
535	die "can't duplicate record which isn't defined" unless ($m);
536	push @{ $marc_record }, dclone( $m );
537	warn "## marc_duplicate = ", dump(@$marc_record), $/ if ($debug > 1);
538	$marc_record_offset = $#{ $marc_record };
539	warn "## marc_record_offset = $marc_record_offset", $/ if ($debug > 1);
540	}
541
542	=head2 marc_remove
543
544	Remove some field or subfield from MARC record.
545
546	marc_remove('200');
547	marc_remove('200','a');
548
549	This will erase field C<200> or C<200^a> from current MARC record.
550
551	This is useful after calling C<marc_duplicate> or on it's own (but, you
552	should probably just remove that subfield definition if you are not
553	using C<marc_duplicate>).
554
555	FIXME: support fields < 10.
556
557	=cut
558
559	sub marc_remove {
560	my ($f, $sf) = @_;
561
562	die "marc_remove needs record number" unless defined($f);
563
564	my $marc = $marc_record->[ $marc_record_offset ];
565
566	warn "### marc_remove before = ", dump( $marc ), $/ if ($debug > 2);
567
568	foreach my $i ( 0 .. $#{ $marc } ) {
569	last unless (defined $marc->[$i]);
570	warn "#### working on ",dump( @{ $marc->[$i] }), $/ if ($debug > 3);
571	if ($marc->[$i]->[0] eq $f) {
572	if (! defined $sf) {
573	# remove whole field
574	splice @$marc, $i, 1;
575	warn "#### slice \@\$marc, $i, 1 = ",dump( @{ $marc }), $/ if ($debug > 3);
576	$i--;
577	} else {
578	foreach my $j ( 0 .. (( $#{ $marc->[$i] } - 3 ) / 2) ) {
579	my $o = ($j * 2) + 3;
580	if ($marc->[$i]->[$o] eq $sf) {
581	# remove subfield
582	splice @{$marc->[$i]}, $o, 2;
583	warn "#### slice \@{\$marc->[$i]}, $o, 2 = ", dump( @{ $marc }), $/ if ($debug > 3);
584	# is record now empty?
585	if ($#{ $marc->[$i] } == 2) {
586	splice @$marc, $i, 1;
587	warn "#### slice \@\$marc, $i, 1 = ", dump( @{ $marc }), $/ if ($debug > 3);
588	$i--;
589	};
590	}
591	}
592	}
593	}
594	}
595
596	warn "### marc_remove($f", $sf ? ",$sf" : "", ") after = ", dump( $marc ), $/ if ($debug > 2);
597
598	$marc_record->[ $marc_record_offset ] = $marc;
599
600	warn "## full marc_record = ", dump( @{ $marc_record }), $/ if ($debug > 1);
601	}
602
603	=head1 Functions to extract data from input
604
605	This function should be used inside functions to create C<data_structure> described
606	above.
607
608	=head2 rec1
609
610	Return all values in some field
611
612	@v = rec1('200')
613
614	TODO: order of values is probably same as in source data, need to investigate that
615
616	=cut
617
618	sub rec1 {
619	my $f = shift;
620	warn "rec1($f) = ", dump( $rec->{$f} ), $/ if ($debug > 1);
621	return unless (defined($rec) && defined($rec->{$f}));
622	warn "rec1($f) = ", dump( $rec->{$f} ), $/ if ($debug > 1);
623	if (ref($rec->{$f}) eq 'ARRAY') {
624	return map {
625	if (ref($_) eq 'HASH') {
626	values %{$_};
627	} else {
628	$_;
629	}
630	} @{ $rec->{$f} };
631	} elsif( defined($rec->{$f}) ) {
632	return $rec->{$f};
633	}
634	}
635
636	=head2 rec2
637
638	Return all values in specific field and subfield
639
640	@v = rec2('200','a')
641
642	=cut
643
644	sub rec2 {
645	my $f = shift;
646	return unless (defined($rec && $rec->{$f}));
647	my $sf = shift;
648	return map { $_->{$sf} } grep { ref($_) eq 'HASH' && $_->{$sf} } @{ $rec->{$f} };
649	}
650
651	=head2 rec
652
653	syntaxtic sugar for
654
655	@v = rec('200')
656	@v = rec('200','a')
657
658	=cut
659
660	sub rec {
661	if ($#_ == 0) {
662	return rec1(@_);
663	} elsif ($#_ == 1) {
664	return rec2(@_);
665	}
666	}
667
668	=head2 regex
669
670	Apply regex to some or all values
671
672	@v = regex( 's/foo/bar/g', @v );
673
674	=cut
675
676	sub regex {
677	my $r = shift;
678	my @out;
679	#warn "r: $r\n", dump(\@_);
680	foreach my $t (@_) {
681	next unless ($t);
682	eval "\$t =~ $r";
683	push @out, $t if ($t && $t ne '');
684	}
685	return @out;
686	}
687
688	=head2 prefix
689
690	Prefix all values with a string
691
692	@v = prefix( 'my_', @v );
693
694	=cut
695
696	sub prefix {
697	my $p = shift or die "prefix needs string as first argument";
698	return map { $p . $_ } grep { defined($_) } @_;
699	}
700
701	=head2 suffix
702
703	suffix all values with a string
704
705	@v = suffix( '_my', @v );
706
707	=cut
708
709	sub suffix {
710	my $s = shift or die "suffix needs string as first argument";
711	return map { $_ . $s } grep { defined($_) } @_;
712	}
713
714	=head2 surround
715
716	surround all values with a two strings
717
718	@v = surround( 'prefix_', '_suffix', @v );
719
720	=cut
721
722	sub surround {
723	my $p = shift or die "surround need prefix as first argument";
724	my $s = shift or die "surround needs suffix as second argument";
725	return map { $p . $_ . $s } grep { defined($_) } @_;
726	}
727
728	=head2 first
729
730	Return first element
731
732	$v = first( @v );
733
734	=cut
735
736	sub first {
737	my $r = shift;
738	return $r;
739	}
740
741	=head2 lookup
742
743	Consult lookup hashes for some value
744
745	@v = lookup( $v );
746	@v = lookup( @v );
747
748	=cut
749
750	sub lookup {
751	my $k = shift or return;
752	return unless (defined($lookup->{$k}));
753	if (ref($lookup->{$k}) eq 'ARRAY') {
754	return @{ $lookup->{$k} };
755	} else {
756	return $lookup->{$k};
757	}
758	}
759
760	=head2 join_with
761
762	Joins walues with some delimiter
763
764	$v = join_with(", ", @v);
765
766	=cut
767
768	sub join_with {
769	my $d = shift;
770	return join($d, grep { defined($_) && $_ ne '' } @_);
771	}
772
773	=head2 split_rec_on
774
775	Split record subfield on some regex and take one of parts out
776
777	$a_before_semi_column =
778	split_rec_on('200','a', /\s;\s/, $part);
779
780	C<$part> is optional number of element. First element is
781	B<1>, not 0!
782
783	If there is no C<$part> parameter or C<$part> is 0, this function will
784	return all values produced by splitting.
785
786	=cut
787
788	sub split_rec_on {
789	die "split_rec_on need (fld,sf,regex[,part]" if ($#_ < 2);
790
791	my ($fld, $sf, $regex, $part) = @_;
792	warn "### regex ", ref($regex), $regex, $/ if ($debug > 2);
793
794	my @r = rec( $fld, $sf );
795	my $v = shift @r;
796	warn "### first rec($fld,$sf) = ",dump($v),$/ if ($debug > 2);
797
798	return '' if( ! defined($v) \|\| $v =~ /^\s*$/);
799
800	my @s = split( $regex, $v );
801	warn "## split_rec_on($fld,$sf,$regex,$part) = ",dump(@s),$/ if ($debug > 1);
802	if ($part && $part > 0) {
803	return $s[ $part - 1 ];
804	} else {
805	return @s;
806	}
807	}
808
809	# END
810	1;