lib/WebPAC/Normalize.pm

package WebPAC::Normalize;
use Exporter 'import';
@EXPORT = qw/
        _set_rec _set_lookup
        _get_ds _clean_ds
        _debug

        tag search display
        marc marc_indicators marc_repeatable_subfield
        marc_compose marc_leader
        marc_duplicate marc_remove

        rec1 rec2 rec
        regex prefix suffix surround
        first lookup join_with

        split_rec_on
/;

use warnings;
use strict;

#use base qw/WebPAC::Common/;
use Data::Dump qw/dump/;
use Encode qw/from_to/;
use Storable qw/dclone/;

# debugging warn(s)
my $debug = 0;


=head1 NAME

WebPAC::Normalize - describe normalisaton rules using sets

=head1 VERSION

Version 0.15

=cut

our $VERSION = '0.15';

=head1 SYNOPSIS

This module uses C<conf/normalize/*.pl> files to perform normalisation
from input records using perl functions which are specialized for set
processing.

Sets are implemented as arrays, and normalisation file is valid perl, which
means that you check it's validity before running WebPAC using
C<perl -c normalize.pl>.

Normalisation can generate multiple output normalized data. For now, supported output
types (on the left side of definition) are: C<tag>, C<display>, C<search> and
C<marc>.

=head1 FUNCTIONS

Functions which start with C<_> are private and used by WebPAC internally.
All other functions are available for use within normalisation rules.

=head2 data_structure

Return data structure

  my $ds = WebPAC::Normalize::data_structure(
        lookup => $lookup->lookup_hash,
        row => $row,
        rules => $normalize_pl_config,
        marc_encoding => 'utf-8',
        config => $config,
  );

Options C<lookup>, C<row>, C<rules> and C<log> are mandatory while all
other are optional.

This function will B<die> if normalizastion can't be evaled.

Since this function isn't exported you have to call it with 
C<WebPAC::Normalize::data_structure>.

=cut

sub data_structure {
        my $arg = {@_};

        die "need row argument" unless ($arg->{row});
        die "need normalisation argument" unless ($arg->{rules});

        no strict 'subs';
        _set_lookup( $arg->{lookup} );
        _set_rec( $arg->{row} );
        _set_config( $arg->{config} );
        _clean_ds( %{ $arg } );
        eval "$arg->{rules}";
        die "error evaling $arg->{rules}: $@\n" if ($@);

        return _get_ds();
}

=head2 _set_rec

Set current record hash

  _set_rec( $rec );

=cut

my $rec;

sub _set_rec {
        $rec = shift or die "no record hash";
}

=head2 _set_config

Set current config hash

  _set_config( $config );

Magic keys are:

=over 4

=item _

Code of current database

=item _mfn

Current MFN

=back

=cut

my $config;

sub _set_config {
        $config = shift;
}

=head2 _get_ds

Return hash formatted as data structure

  my $ds = _get_ds();

=cut

my ($out, $marc_record, $marc_encoding, $marc_repeatable_subfield, $marc_indicators);
my ($marc_record_offset, $marc_fetch_offset) = (0, 0);

sub _get_ds {
        return $out;
}

=head2 _clean_ds

Clean data structure hash for next record

  _clean_ds();

=cut

sub _clean_ds {
        my $a = {@_};
        ($out,$marc_record, $marc_encoding, $marc_repeatable_subfield, $marc_indicators) = ();
        ($marc_record_offset, $marc_fetch_offset) = (0,0);
        $marc_encoding = $a->{marc_encoding};
}

=head2 _set_lookup

Set current lookup hash

  _set_lookup( $lookup );

=cut

my $lookup;

sub _set_lookup {
        $lookup = shift;
}

=head2 _get_marc_fields

Get all fields defined by calls to C<marc>

        $marc->add_fields( WebPAC::Normalize:_get_marc_fields() );

We are using I<magic> which detect repeatable fields only from
sequence of field/subfield data generated by normalization.

Repeatable field is created when there is second occurence of same subfield or
if any of indicators are different.

This is sane for most cases. Something like:

  900a-1 900b-1 900c-1
  900a-2 900b-2
  900a-3

will be created from any combination of:

  900a-1 900a-2 900a-3 900b-1 900b-2 900c-1

and following rules:

  marc('900','a', rec('200','a') );
  marc('900','b', rec('200','b') );
  marc('900','c', rec('200','c') );

which might not be what you have in mind. If you need repeatable subfield,
define it using C<marc_repeatable_subfield> like this:

  marc_repeatable_subfield('900','a');
  marc('900','a', rec('200','a') );
  marc('900','b', rec('200','b') );
  marc('900','c', rec('200','c') );

will create:

  900a-1 900a-2 900a-3 900b-1 900c-1
  900b-2

There is also support for returning next or specific using:

  while (my $mf = WebPAC::Normalize:_get_marc_fields( fetch_next => 1 ) ) {
        # do something with $mf
  }

will always return fields from next MARC record or

  my $mf = WebPAC::Normalize::_get_marc_fields( offset => 42 );

will return 42th copy record (if it exists).

=cut

sub _get_marc_fields {

        my $arg = {@_};
        warn "### _get_marc_fields arg: ", dump($arg), $/ if ($debug > 2);
        my $offset = $marc_fetch_offset;
        if ($arg->{offset}) {
                $offset = $arg->{offset};
        } elsif($arg->{fetch_next}) {
                $marc_fetch_offset++;
        }

        return if (! $marc_record || ref($marc_record) ne 'ARRAY');

        warn "### full marc_record = ", dump( @{ $marc_record }), $/ if ($debug > 2);

        my $marc_rec = $marc_record->[ $offset ];

        warn "## _get_marc_fields (at offset: $offset) -- marc_record = ", dump( @$marc_rec ), $/ if ($debug > 1);

        return if (! $marc_rec || ref($marc_rec) ne 'ARRAY' || $#{ $marc_rec } < 0);

        # first, sort all existing fields 
        # XXX might not be needed, but modern perl might randomize elements in hash
        my @sorted_marc_record = sort {
                $a->[0] . ( $a->[3] || '' ) cmp $b->[0] . ( $b->[3] || '')
        } @{ $marc_rec };

        @sorted_marc_record = @{ $marc_rec };   ### FIXME disable sorting
        
        # output marc fields
        my @m;

        # count unique field-subfields (used for offset when walking to next subfield)
        my $u;
        map { $u->{ $_->[0] . ( $_->[3] || '')  }++ } @sorted_marc_record;

        if ($debug) {
                warn "## marc_repeatable_subfield = ", dump( $marc_repeatable_subfield ), $/ if ( $marc_repeatable_subfield );
                warn "## marc_record[$offset] = ", dump( $marc_rec ), $/;
                warn "## sorted_marc_record = ", dump( \@sorted_marc_record ), $/;
                warn "## subfield count = ", dump( $u ), $/;
        }

        my $len = $#sorted_marc_record;
        my $visited;
        my $i = 0;
        my $field;

        foreach ( 0 .. $len ) {

                # find next element which isn't visited
                while ($visited->{$i}) {
                        $i = ($i + 1) % ($len + 1);
                }

                # mark it visited
                $visited->{$i}++;

                my $row = dclone( $sorted_marc_record[$i] );

                # field and subfield which is key for
                # marc_repeatable_subfield and u
                my $fsf = $row->[0] . ( $row->[3] || '' );

                if ($debug > 1) {

                        print "### field so far [", $#$field, "] : ", dump( $field ), " ", $field ? 'T' : 'F', $/;
                        print "### this [$i]: ", dump( $row ),$/;
                        print "### sf: ", $row->[3], " vs ", $field->[3],
                                $marc_repeatable_subfield->{ $row->[0] . $row->[3] } ? ' (repeatable)' : '', $/,
                                if ($#$field >= 0);

                }

                # if field exists
                if ( $#$field >= 0 ) {
                        if (
                                $row->[0] ne $field->[0] ||             # field
                                $row->[1] ne $field->[1] ||             # i1
                                $row->[2] ne $field->[2]                # i2
                        ) {
                                push @m, $field;
                                warn "## saved/1 ", dump( $field ),$/ if ($debug);
                                $field = $row;

                        } elsif (
                                ( $row->[3] lt $field->[-2] )           # subfield which is not next (e.g. a after c)
                                ||
                                ( $row->[3] eq $field->[-2] &&          # same subfield, but not repeatable
                                        ! $marc_repeatable_subfield->{ $fsf }
                                )
                        ) {
                                push @m, $field;
                                warn "## saved/2 ", dump( $field ),$/ if ($debug);
                                $field = $row;

                        } else {
                                # append new subfields to existing field
                                push @$field, ( $row->[3], $row->[4] );
                        }
                } else {
                        # insert first field
                        $field = $row;
                }

                if (! $marc_repeatable_subfield->{ $fsf }) {
                        # make step to next subfield
                        $i = ($i + $u->{ $fsf } ) % ($len + 1);
                }
        }

        if ($#$field >= 0) {
                push @m, $field;
                warn "## saved/3 ", dump( $field ),$/ if ($debug);
        }

        return \@m;
}

=head2 _debug

Change level of debug warnings

  _debug( 2 );

=cut

sub _debug {
        my $l = shift;
        return $debug unless defined($l);
        warn "debug level $l",$/ if ($l > 0);
        $debug = $l;
}

=head1 Functions to create C<data_structure>

Those functions generally have to first in your normalization file.

=head2 tag

Define new tag for I<search> and I<display>.

  tag('Title', rec('200','a') );


=cut

sub tag {
        my $name = shift or die "tag needs name as first argument";
        my @o = grep { defined($_) && $_ ne '' } @_;
        return unless (@o);
        $out->{$name}->{tag} = $name;
        $out->{$name}->{search} = \@o;
        $out->{$name}->{display} = \@o;
}

=head2 display

Define tag just for I<display>

  @v = display('Title', rec('200','a') );

=cut

sub display {
        my $name = shift or die "display needs name as first argument";
        my @o = grep { defined($_) && $_ ne '' } @_;
        return unless (@o);
        $out->{$name}->{tag} = $name;
        $out->{$name}->{display} = \@o;
}

=head2 search

Prepare values just for I<search>

  @v = search('Title', rec('200','a') );

=cut

sub search {
        my $name = shift or die "search needs name as first argument";
        my @o = grep { defined($_) && $_ ne '' } @_;
        return unless (@o);
        $out->{$name}->{tag} = $name;
        $out->{$name}->{search} = \@o;
}

=head2 marc_leader

Setup fields within MARC leader or get leader

  marc_leader('05','c');
  my $leader = marc_leader();

=cut

sub marc_leader {
        my ($offset,$value) = @_;

        if ($offset) {
                $out->{' leader'}->{ $offset } = $value;
        } else {
                return $out->{' leader'};
        }
}

=head2 marc

Save value for MARC field

  marc('900','a', rec('200','a') );
  marc('001', rec('000') );

=cut

sub marc {
        my $f = shift or die "marc needs field";
        die "marc field must be numer" unless ($f =~ /^\d+$/);

        my $sf;
        if ($f >= 10) {
                $sf = shift or die "marc needs subfield";
        }

        foreach (@_) {
                my $v = $_;             # make var read-write for Encode
                next unless (defined($v) && $v !~ /^\s*$/);
                from_to($v, 'iso-8859-2', $marc_encoding) if ($marc_encoding);
                my ($i1,$i2) = defined($marc_indicators->{$f}) ? @{ $marc_indicators->{$f} } : (' ',' ');
                if (defined $sf) {
                        push @{ $marc_record->[ $marc_record_offset ] }, [ $f, $i1, $i2, $sf => $v ];
                } else {
                        push @{ $marc_record->[ $marc_record_offset ] }, [ $f, $v ];
                }
        }
}

=head2 marc_repeatable_subfield

Save values for MARC repetable subfield

  marc_repeatable_subfield('910', 'z', rec('909') );

=cut

sub marc_repeatable_subfield {
        my ($f,$sf) = @_;
        die "marc_repeatable_subfield need field and subfield!\n" unless ($f && $sf);
        $marc_repeatable_subfield->{ $f . $sf }++;
        marc(@_);
}

=head2 marc_indicators

Set both indicators for MARC field

  marc_indicators('900', ' ', 1);

Any indicator value other than C<0-9> will be treated as undefined.

=cut

sub marc_indicators {
        my $f = shift || die "marc_indicators need field!\n";
        my ($i1,$i2) = @_;
        die "marc_indicators($f, ...) need i1!\n" unless(defined($i1));
        die "marc_indicators($f, $i1, ...) need i2!\n" unless(defined($i2));

        $i1 = ' ' if ($i1 !~ /^\d$/);
        $i2 = ' ' if ($i2 !~ /^\d$/);
        @{ $marc_indicators->{$f} } = ($i1,$i2);
}

=head2 marc_compose

Save values for each MARC subfield explicitly

  marc_compose('900',
        'a', rec('200','a')
        'b', rec('201','a')
        'a', rec('200','b')
        'c', rec('200','c')
  );

If you specify C<+> for subfield, value will be appended
to previous defined subfield.

=cut

sub marc_compose {
        my $f = shift or die "marc_compose needs field";
        die "marc_compose field must be numer" unless ($f =~ /^\d+$/);

        my ($i1,$i2) = defined($marc_indicators->{$f}) ? @{ $marc_indicators->{$f} } : (' ',' ');
        my $m = [ $f, $i1, $i2 ];

        warn "### marc_compose input subfields = ", dump(@_),$/ if ($debug > 2);

        while (@_) {
                my $sf = shift or die "marc_compose $f needs subfield";
                my $v = shift;

                next unless (defined($v) && $v !~ /^\s*$/);
                from_to($v, 'iso-8859-2', $marc_encoding) if ($marc_encoding);
                warn "## ++ marc_compose($f,$sf,$v) ", dump( $m ),$/ if ($debug > 1);
                if ($sf ne '+') {
                        push @$m, ( $sf, $v );
                } else {
                        $m->[ $#$m ] .= $v;
                }
        }

        warn "## marc_compose current marc = ", dump( $m ),$/ if ($debug > 1);

        push @{ $marc_record->[ $marc_record_offset ] }, $m if ($#{$m} > 2);
}

=head2 marc_duplicate

Generate copy of current MARC record and continue working on copy

  marc_duplicate();

Copies can be accessed using C<< _get_marc_fields( fetch_next => 1 ) >> or
C<< _get_marc_fields( offset => 42 ) >>.

=cut

sub marc_duplicate {
         my $m = $marc_record->[ -1 ];
         die "can't duplicate record which isn't defined" unless ($m);
         push @{ $marc_record }, dclone( $m );
         warn "## marc_duplicate = ", dump(@$marc_record), $/ if ($debug > 1);
         $marc_record_offset = $#{ $marc_record };
         warn "## marc_record_offset = $marc_record_offset", $/ if ($debug > 1);
}

=head2 marc_remove

Remove some field or subfield from MARC record.

  marc_remove('200');
  marc_remove('200','a');

This will erase field C<200> or C<200^a> from current MARC record.

This is useful after calling C<marc_duplicate> or on it's own (but, you
should probably just remove that subfield definition if you are not
using C<marc_duplicate>).

FIXME: support fields < 10.

=cut

sub marc_remove {
        my ($f, $sf) = @_;

        die "marc_remove needs record number" unless defined($f);

        my $marc = $marc_record->[ $marc_record_offset ];

        warn "### marc_remove before = ", dump( $marc ), $/ if ($debug > 2);

        my $i = 0;
        foreach ( 0 .. $#{ $marc } ) {
                last unless (defined $marc->[$i]);
                warn "#### working on ",dump( @{ $marc->[$i] }), $/ if ($debug > 3);
                if ($marc->[$i]->[0] eq $f) {
                        if (! defined $sf) {
                                # remove whole field
                                splice @$marc, $i, 1;
                                warn "#### slice \@\$marc, $i, 1 = ",dump( @{ $marc }), $/ if ($debug > 3);
                                $i--;
                        } else {
                                foreach my $j ( 0 .. (( $#{ $marc->[$i] } - 3 ) / 2) ) {
                                        my $o = ($j * 2) + 3;
                                        if ($marc->[$i]->[$o] eq $sf) {
                                                # remove subfield
                                                splice @{$marc->[$i]}, $o, 2;
                                                warn "#### slice \@{\$marc->[$i]}, $o, 2 = ", dump( @{ $marc }), $/ if ($debug > 3);
                                                # is record now empty?
                                                if ($#{ $marc->[$i] } == 2) {
                                                        splice @$marc, $i, 1;
                                                        warn "#### slice \@\$marc, $i, 1 = ", dump( @{ $marc }), $/ if ($debug > 3);
                                                        $i--;
                                                };
                                        }
                                }
                        }
                }
                $i++;
        }

        warn "### marc_remove($f", $sf ? ",$sf" : "", ") after = ", dump( $marc ), $/ if ($debug > 2);

        $marc_record->[ $marc_record_offset ] = $marc;

        warn "## full marc_record = ", dump( @{ $marc_record }), $/ if ($debug > 1);
}

=head1 Functions to extract data from input

This function should be used inside functions to create C<data_structure> described
above.

=head2 rec1

Return all values in some field

  @v = rec1('200')

TODO: order of values is probably same as in source data, need to investigate that

=cut

sub rec1 {
        my $f = shift;
        warn "rec1($f) = ", dump( $rec->{$f} ), $/ if ($debug > 1);
        return unless (defined($rec) && defined($rec->{$f}));
        warn "rec1($f) = ", dump( $rec->{$f} ), $/ if ($debug > 1);
        if (ref($rec->{$f}) eq 'ARRAY') {
                return map { 
                        if (ref($_) eq 'HASH') {
                                values %{$_};
                        } else {
                                $_;
                        }
                } @{ $rec->{$f} };
        } elsif( defined($rec->{$f}) ) {
                return $rec->{$f};
        }
}

=head2 rec2

Return all values in specific field and subfield

  @v = rec2('200','a')

=cut

sub rec2 {
        my $f = shift;
        return unless (defined($rec && $rec->{$f}));
        my $sf = shift;
        warn "rec2($f,$sf) = ", dump( $rec->{$f} ), $/ if ($debug > 1);
        return map {
                if (ref($_->{$sf}) eq 'ARRAY') {
                        @{ $_->{$sf} };
                } else {
                        $_->{$sf};
                }
        } grep { ref($_) eq 'HASH' && $_->{$sf} } @{ $rec->{$f} };
}

=head2 rec

syntaxtic sugar for

  @v = rec('200')
  @v = rec('200','a')

=cut

sub rec {
        my @out;
        if ($#_ == 0) {
                @out = rec1(@_);
        } elsif ($#_ == 1) {
                @out = rec2(@_);
        }
        if (@out) {
                return @out;
        } else {
                return '';
        }
}

=head2 regex

Apply regex to some or all values

  @v = regex( 's/foo/bar/g', @v );

=cut

sub regex {
        my $r = shift;
        my @out;
        #warn "r: $r\n", dump(\@_);
        foreach my $t (@_) {
                next unless ($t);
                eval "\$t =~ $r";
                push @out, $t if ($t && $t ne '');
        }
        return @out;
}

=head2 prefix

Prefix all values with a string

  @v = prefix( 'my_', @v );

=cut

sub prefix {
        my $p = shift or return;
        return map { $p . $_ } grep { defined($_) } @_;
}

=head2 suffix

suffix all values with a string

  @v = suffix( '_my', @v );

=cut

sub suffix {
        my $s = shift or die "suffix needs string as first argument";
        return map { $_ . $s } grep { defined($_) } @_;
}

=head2 surround

surround all values with a two strings

  @v = surround( 'prefix_', '_suffix', @v );

=cut

sub surround {
        my $p = shift or die "surround need prefix as first argument";
        my $s = shift or die "surround needs suffix as second argument";
        return map { $p . $_ . $s } grep { defined($_) } @_;
}

=head2 first

Return first element

  $v = first( @v );

=cut

sub first {
        my $r = shift;
        return $r;
}

=head2 lookup

Consult lookup hashes for some value

  @v = lookup( $v );
  @v = lookup( @v );

=cut

sub lookup {
        my $k = shift or return;
        return unless (defined($lookup->{$k}));
        if (ref($lookup->{$k}) eq 'ARRAY') {
                return @{ $lookup->{$k} };
        } else {
                return $lookup->{$k};
        }
}

=head2 config

Consult config values stored in C<config.yml>

  # return database code (key under databases in yaml)
  $database_code = config();    # use _ from hash
  $database_name = config('name');
  $database_input_name = config('input name');
  $tag = config('input normalize tag');

Up to three levels are supported.

=cut

sub config {
        return unless ($config);

        my $p = shift;

        $p ||= '';

        my $v;

        warn "### getting config($p)\n" if ($debug > 1);

        my @p = split(/\s+/,$p);
        if ($#p < 0) {
                $v = $config->{ '_' };  # special, database code
        } else {

                my $c = dclone( $config );

                foreach my $k (@p) {
                        warn "### k: $k c = ",dump($c),$/ if ($debug > 1);
                        if (ref($c) eq 'ARRAY') {
                                $c = shift @$c;
                                warn "config($p) taking first occurence of '$k', probably not what you wanted!\n";
                                last;
                        }

                        if (! defined($c->{$k}) ) {
                                $c = undef;
                                last;
                        } else {
                                $c = $c->{$k};
                        }
                }
                $v = $c if ($c);

        }

        warn "## config( '$p' ) = ",dump( $v ),$/ if ($v && $debug);
        warn "config( '$p' ) is empty\n" if (! $v);

        return $v;
}

=head2 id

Returns unique id of this record

  $id = id();

Returns C<42/2> for 2nd occurence of MFN 42.

=cut

sub id {
        my $mfn = $config->{_mfn} || die "no _mfn in config data";
        return $mfn . $#{$marc_record} ? $#{$marc_record} + 1 : '';
}

=head2 join_with

Joins walues with some delimiter

  $v = join_with(", ", @v);

=cut

sub join_with {
        my $d = shift;
        warn "### join_with('$d',",dump(@_),")\n" if ($debug > 2);
        my $v = join($d, grep { defined($_) && $_ ne '' } @_);
        return '' unless defined($v);
        return $v;
}

=head2 split_rec_on

Split record subfield on some regex and take one of parts out

  $a_before_semi_column =
        split_rec_on('200','a', /\s*;\s*/, $part);

C<$part> is optional number of element. First element is
B<1>, not 0!

If there is no C<$part> parameter or C<$part> is 0, this function will
return all values produced by splitting.

=cut

sub split_rec_on {
        die "split_rec_on need (fld,sf,regex[,part]" if ($#_ < 2);

        my ($fld, $sf, $regex, $part) = @_;
        warn "### regex ", ref($regex), $regex, $/ if ($debug > 2);

        my @r = rec( $fld, $sf );
        my $v = shift @r;
        warn "### first rec($fld,$sf) = ",dump($v),$/ if ($debug > 2);

        return '' if( ! defined($v) || $v =~ /^\s*$/);

        my @s = split( $regex, $v );
        warn "## split_rec_on($fld,$sf,$regex,$part) = ",dump(@s),$/ if ($debug > 1);
        if ($part && $part > 0) {
                return $s[ $part - 1 ];
        } else {
                return @s;
        }
}

# END
1;