lib/WebPAC/Normalize.pm

package WebPAC::Normalize;
use Exporter 'import';
our @EXPORT = qw/
        _set_ds _set_lookup
        _set_load_row
        _get_ds _clean_ds
        _debug
        _pack_subfields_hash

        to
        search_display search display sorted

        rec1 rec2 rec
        frec frec_eq frec_ne
        regex prefix suffix surround
        first lookup join_with
        save_into_lookup

        split_rec_on

        get set
        count

        row
        rec_array

/;

use warnings;
use strict;

#use base qw/WebPAC::Common/;
use Data::Dump qw/dump/;
use Carp qw/confess/;

# debugging warn(s)
my $debug = 0;
_debug( $debug );

# FIXME
use WebPAC::Normalize::ISBN;
push @EXPORT, ( 'isbn_10', 'isbn_13' );

use WebPAC::Normalize::MARC;
push @EXPORT, ( qw/
        marc marc_indicators marc_repeatable_subfield
        marc_compose marc_leader marc_fixed
        marc_duplicate marc_remove marc_count
        marc_original_order
        marc_template
/);

use Storable qw/dclone/;

=head1 NAME

WebPAC::Normalize - describe normalisation rules using sets

=cut

our $VERSION = '0.36';

=head1 SYNOPSIS

This module uses C<conf/normalize/*.pl> files to perform normalisation
from input records using perl functions which are specialized for set
processing.

Sets are implemented as arrays, and a normalisation file is valid perl, which
means that you can check its validity before running WebPAC using
C<perl -c normalize.pl>.

Normalisation can generate multiple output normalized data. For now, supported output
types (on the left side of definition) are: C<search_display>, C<display>, C<search> and
C<marc>.

=head1 FUNCTIONS

Functions which start with C<_> are private and used by WebPAC internally.
All other functions are available for use within normalisation rules.

=head2 data_structure

Return data structure

  my $ds = WebPAC::Normalize::data_structure(
        lookup => $lookup_hash,
        row => $row,
        rules => $normalize_pl_config,
        marc_encoding => 'utf-8',
        config => $config,
        load_row_coderef => sub {
                my ($database,$input,$mfn) = @_;
                $store->load_row( database => $database, input => $input, id => $mfn );
        },
  );

Options C<row> and C<rules> are mandatory while all
others are optional.

C<load_row_coderef> is closure only used when executing lookups, so they will
die if it's not defined.

This function will B<die> if the normalisation rules can't be evaled.

Since this function isn't exported you have to call it with 
C<WebPAC::Normalize::data_structure>.

=cut

# Closure used by lookup() to load a row from the store. It is installed
# either by data_structure( load_row_coderef => ... ) or by _set_load_row().
my $load_row_coderef;

# Normalise one input row and return the resulting data structure.
#
# Takes named arguments. C<row> and C<rules> are mandatory and this function
# dies without them. C<lookup>, C<config> and C<load_row_coderef> are optional;
# when one of them is omitted the value set by a previous call is kept.
#
# Dies if the rules cannot be evaled.
sub data_structure {
        my $arg = {@_};

        die "need row argument" unless ($arg->{row});
        die "need normalisation argument" unless ($arg->{rules});

        _set_lookup( $arg->{lookup} ) if defined($arg->{lookup});
        _set_ds( $arg->{row} );
        _set_config( $arg->{config} ) if defined($arg->{config});
        _clean_ds( %{ $arg } );

        # only replace the closure when one was passed, so a coderef installed
        # earlier with _set_load_row() isn't silently reset to undef
        $load_row_coderef = $arg->{load_row_coderef} if defined($arg->{load_row_coderef});

        # NOTE(review): rules are run with string eval inside this package, so
        # they must come from a trusted source (normalisation config files).
        # Barewords and sub redefinition are allowed for the rule code.
        no strict 'subs';
        no warnings 'redefine';
        eval "$arg->{rules};";
        die "error evaling $arg->{rules}: $@\n" if ($@);

        return _get_ds();
}

=head2 _set_ds

Set current record hash

  _set_ds( $rec );

=cut

# Record (hash of fields) which is currently being normalised.
my $rec;

# Make the first argument the current record and die if it is false.
# The same value is also stored in $WebPAC::Normalize::MARC::rec.
sub _set_ds {
        ($rec) = @_;
        die "no record hash" unless $rec;
        return $WebPAC::Normalize::MARC::rec = $rec;
}

=head2 _get_rec

  my $rec = _get_rec();

=cut

# Return the current record as set by _set_ds().
sub _get_rec {
        return $rec;
}

# Return all elements of the named field from the current record.
# Dies, dumping the offending value, if the field isn't an array reference.
sub rec_array {
        my $d = $rec->{ $_[0] };
        unless ( ref($d) eq 'ARRAY' ) {
                die "field $_[0] not array: ",dump( $d );
        }
        return @$d;
}

=head2 _set_config

Set current config hash

  _set_config( $config );

Magic keys are:

=over 4

=item _

Code of current database

=item _mfn

Current MFN

=back

=cut

# Configuration hash for the current database (see magic keys in POD above).
my $config;

# Remember the passed config hash and return it.
sub _set_config {
        my ($new_config) = @_;
        return $config = $new_config;
}

=head2 _get_ds

Return hash formatted as data structure

  my $ds = _get_ds();

=cut

# Output data structure which is filled in by to(), search_display(), row()...
my $out;

# Return whatever has been collected in the output data structure so far
# (undef if nothing was stored since the last _clean_ds).
sub _get_ds {
        return $out;
}

=head2 _clean_ds

Clean data structure hash for next record

  _clean_ds();

=cut

# Forget the output collected for the previous record and reset MARC state.
# Arguments are accepted for compatibility with existing callers (which pass
# their option hash) but they are not used.
sub _clean_ds {
        # the original copied @_ into an unused "my $a", which masked sort's $a
        # and warned when called with an odd number of arguments
        $out = undef;
        WebPAC::Normalize::MARC::_clean();
}

=head2 _set_lookup

Set current lookup hash

  _set_lookup( $lookup );

=cut

# Lookup hash, indexed as $lookup->{database}->{input}->{key}->{value}->{mfn}.
my $lookup;

# Remember the passed lookup hash and return it.
sub _set_lookup {
        my ($new_lookup) = @_;
        return $lookup = $new_lookup;
}

=head2 _get_lookup

Get current lookup hash

  my $lookup = _get_lookup();

=cut

# Accessor for the lookup hash set with _set_lookup() and extended by
# save_into_lookup().
sub _get_lookup { return $lookup }

=head2 _set_load_row

Setup code reference which will return L<data_structure> from
L<WebPAC::Store>

  _set_load_row(sub {
                my ($database,$input,$mfn) = @_;
                $store->load_row( database => $database, input => $input, id => $mfn );
  });

=cut

# Install the closure which lookup() calls to load a row.
# Confesses unless the argument is a code reference; returns the closure.
sub _set_load_row {
        my ($coderef) = @_;

        ref($coderef) eq 'CODE' or confess "argument isn't CODE";

        return $load_row_coderef = $coderef;
}

=head2 _debug

Change level of debug warnings

  _debug( 2 );

=cut

# Get or set the debug level.
# Without an argument the current level is returned. With an argument the
# level is stored here and in $WebPAC::Normalize::MARC::debug, and a warning
# announcing the new level is printed when it is greater than zero.
sub _debug {
        my ($l) = @_;

        return $debug unless defined($l);

        if ($l > 0) {
                warn "debug level $l",$/;
        }

        $debug = $l;
        return $WebPAC::Normalize::MARC::debug = $debug;
}

=head1 Functions to create C<data_structure>

Those functions generally have to come first in your normalization file.

=head2 to

Generic way to set values for some name

  to('display', 'Title' => rec('200','a') );

There are many helpers defined below which might be easier to use.

=cut

# Store values under $out->{name}->{type}.
# First argument is the type, second the name, the rest are values. Undefined
# and empty values are dropped; nothing is stored if no values remain.
# Confesses if the type or the name is missing.
sub to {
        my ($type, $name, @values) = @_;

        confess "need type -- BUG?" unless $type;
        confess "needs name as first argument" unless $name;

        my @o;
        foreach my $value (@values) {
                next unless defined($value) && $value ne '';
                push @o, $value;
        }
        return unless (@o);

        return $out->{$name}->{$type} = \@o;
}

=head2 search_display

Define output for L<search> and L<display> at the same time

  search_display('Title', rec('200','a') );

=cut

# Store the same values for both search and display output of some name.
# Undefined and empty values are dropped; nothing is stored if none remain.
# Both outputs end up pointing to the same array.
sub search_display {
        my ($name, @values) = @_;
        die "search_display needs name as first argument" unless $name;

        my @o = grep { defined($_) && $_ ne '' } @values;
        return unless (@o);

        foreach my $type (qw/search display/) {
                $out->{$name}->{$type} = \@o;
        }
        return \@o;
}

=head2 tag

Old name for L<search_display>, it will probably be removed at one point.

=cut

# Deprecated alias: hands all arguments over to search_display().
sub tag {
        my @args = @_;
        return search_display( @args );
}

=head2 display

Define output just for I<display>

  @v = display('Title', rec('200','a') );

=cut

# Store values for display output only; thin wrapper around to().
sub display {
        my @args = @_;
        return to( 'display', @args );
}

=head2 search

Prepare values just for I<search>

  @v = search('Title', rec('200','a') );

=cut

# Store values for search output only; thin wrapper around to().
sub search {
        my @args = @_;
        return to( 'search', @args );
}

=head2 sorted

Insert into lists which will be automatically sorted

 sorted('Title', rec('200','a') );

=cut

# Store values for sorted lists; thin wrapper around to().
sub sorted {
        my @args = @_;
        return to( 'sorted', @args );
}

=head2 row

Insert new row of data into output module

  row( 'table', column => 'foo', column2 => 'bar' );

=cut

use Data::Dump qw/dump/;

# Append one row to the list kept in $out->{_rows}->{table}.
# First argument is the table name, followed by column => value pairs, so
# the total number of arguments must be odd; dies otherwise.
sub row {
        # report the real element count (the message used to print the last
        # index, which is one less and therefore looked odd when it was even)
        die "array doesn't have odd number of elements but ", scalar(@_), ": ",dump( @_ ) if @_ % 2 == 0;
        my $table = shift @_;
        push @{ $out->{'_rows'}->{$table} }, {@_};
}


=head1 Functions to extract data from input

These functions should be used inside functions to create C<data_structure> described
above.

=head2 _pack_subfields_hash

 @subfields = _pack_subfields_hash( $h );
 $subfields = _pack_subfields_hash( $h, 1 );

Return each subfield value in array or pack them all together and return scalar
with subfields (denoted by C<^>) and values.

=cut

# Turn one field hash into a list of subfield values or into a packed string.
#
# Arguments: field hash and an optional flag. Anything which isn't a hash
# reference is returned unchanged.
#
# If the hash has a C<subfields> key, it is taken as a flat list of
# (subfield, occurrence) pairs giving the original order, and values are
# returned in that order. Otherwise subfields are processed in alphabetical
# order of their names.
#
# With a true flag the result is a single string in which each value is
# preceded by C<^> and the subfield name; without it a list of values.
sub _pack_subfields_hash {

        warn "## _pack_subfields_hash( ",dump(@_), " )\n" if ($debug > 1);

        my ($hash,$include_subfields) = @_;

        # sanity and ease of use
        return $hash if (ref($hash) ne 'HASH');

        # work on a copy, because the subfields list is consumed below
        my $h = dclone( $hash );

        if ( defined($h->{subfields}) ) {
                my $sfs = delete $h->{subfields} || die "no subfields?";
                my @out;
                while (@$sfs) {
                        my $sf = shift @$sfs;
                        push @out, '^' . $sf if ($include_subfields);
                        my $o = shift @$sfs;
                        if ($o == 0 && ref( $h->{$sf} ) ne 'ARRAY' ) {
                                # single element subfields are not arrays
                                push @out, $h->{$sf};
                        } else {
                                push @out, $h->{$sf}->[$o];
                        }
                }
                return $include_subfields ? join('', @out) : @out;
        }

        if ($include_subfields) {
                my $packed = '';
                foreach my $sf (sort keys %$h) {
                        if (ref($h->{$sf}) eq 'ARRAY') {
                                $packed .= '^' . $sf . join('^' . $sf, @{ $h->{$sf} });
                        } else {
                                $packed .= '^' . $sf . $h->{$sf};
                        }
                }
                return $packed;
        }

        # alphabetical order of subfields instead of (random) hash order, so
        # that the same record always produces the same list
        return map { $h->{$_} } sort keys %$h;
}

=head2 rec1

Return all values in some field

  @v = rec1('200')

TODO: order of values is probably same as in source data, need to investigate that

=cut

# Return all values of one field from the current record.
#
# Returns an empty list if there is no current record or the field isn't
# defined. A field which isn't an array reference is returned as is. For
# array fields each hash element is expanded with _pack_subfields_hash()
# and every other element is returned unchanged.
sub rec1 {
        my $f = shift;

        # don't touch $rec->{$f} while $rec is undef, it would autovivify $rec
        my $field = defined($rec) ? $rec->{$f} : undef;

        # single debug line (it used to be printed twice for existing fields)
        warn "rec1($f) = ", dump( $field ), $/ if ($debug > 1);

        return unless defined($field);
        return $field unless ref($field) eq 'ARRAY';

        my @out;
        foreach my $h ( @$field ) {
                if (ref($h) eq 'HASH') {
                        push @out, ( _pack_subfields_hash( $h ) );
                } else {
                        push @out, $h;
                }
        }
        return @out;
}

=head2 rec2

Return all values in specific field and subfield

  @v = rec2('200','a')

=cut

# Return all values of one subfield from the current record.
#
# Arguments: field and subfield. Only hash occurrences of the field which
# have the subfield defined are used; repeatable subfields (array
# references) are flattened into the result.
#
# Returns an empty list if there is no current record, the field isn't
# defined or the field has no occurrences (isn't an array reference).
sub rec2 {
        my $f = shift;
        return unless ($rec && defined($rec->{$f}));
        my $sf = shift;
        warn "rec2($f,$sf) = ", dump( $rec->{$f} ), $/ if ($debug > 1);

        # fields stored as plain scalars (which rec1 accepts) have no
        # subfields; dereferencing them below would die under strict refs
        return unless ref($rec->{$f}) eq 'ARRAY';

        return map {
                if (ref($_->{$sf}) eq 'ARRAY') {
                        @{ $_->{$sf} };
                } else {
                        $_->{$sf};
                }
        } grep { ref($_) eq 'HASH' && defined $_->{$sf} } @{ $rec->{$f} };
}

=head2 rec

Syntactic sugar for

  @v = rec('200')
  @v = rec('200','a')

If rec() returns just single value, it will
return scalar, not array.

=cut

# Dispatch to rec1() for one argument and to rec2() for two arguments
# (any other number of arguments produces no values).
#
# A single value is returned as scalar unless called in list context,
# several values are returned as list, and no values at all as ''.
sub rec {
        my @out =
                  @_ == 1 ? rec1(@_)
                : @_ == 2 ? rec2(@_)
                :           ();

        return $out[0] if ( @out == 1 && ! wantarray );

        return @out ? @out : '';
}

=head2 frec

Returns first value from field

  $v = frec('200');
  $v = frec('200','a');

=cut

# Return just the first value which rec() produces for the same arguments,
# warning when there were more values than one.
sub frec {
        my ($first, @rest) = rec(@_);
        if (@rest) {
                warn "rec(",dump(@_),") has more than one return value, ignoring\n";
        }
        return $first;
}

=head2 frec_eq

=head2 frec_ne

Check if first values from two fields are same or different

  if ( frec_eq( 900 => 'a', 910 => 'c' ) ) {
        # values are same
  } else {
    # values are different
  }

Strictly speaking C<frec_eq> and C<frec_ne> wouldn't be needed if you
could write something like:

  if ( frec( '900','a' ) eq frec( '910','c' ) ) {
        # yada tada
  }

but you can't since our parser L<WebPAC::Parser> will remove all whitespaces
in order to parse text and create invalid function C<eqfrec>.

=cut

# True if the first value of field/subfield pair one is string-equal to
# the first value of field/subfield pair two.
sub frec_eq {
        my ( $f1,$sf1, $f2, $sf2 ) = @_;
        my ($left)  = rec( $f1, $sf1 );
        my ($right) = rec( $f2, $sf2 );
        return $left eq $right;
}

# Logical negation of frec_eq() called with the same arguments.
sub frec_ne {
        my $same = frec_eq( @_ );
        return ! $same;
}

=head2 regex

Apply regex to some or all values

  @v = regex( 's/foo/bar/g', @v );

=cut

# Apply a regex operation, given as string (e.g. 's/foo/bar/g'), to values.
#
# First argument is the operation, the rest are values. False values are
# skipped, and values which become false after the operation are dropped.
# Returns the list of resulting values; the arguments themselves are left
# untouched.
#
# NOTE(review): the operation is compiled with string eval, so it must come
# from trusted normalisation rules, never from record data.
sub regex {
        my $r = shift;
        my @out;
        foreach my $value (@_) {
                next unless ($value);
                # work on a copy: elements of @_ are aliases of the caller's
                # variables, and literal arguments are read-only, which made
                # the substitution fail silently inside eval
                my $t = $value;
                eval "\$t =~ $r";
                warn "regex($r) failed: $@" if ($@);
                push @out, $t if ($t && $t ne '');
        }
        return @out;
}

=head2 prefix

Prefix all values with a string

  @v = prefix( 'my_', @v );

=cut

# Put a string in front of every defined value; undefined values are dropped.
# When the prefix itself is undefined, values are returned unchanged.
sub prefix {
        my ($p, @values) = @_;
        return @values unless defined( $p );

        my @out;
        foreach my $value (@values) {
                push @out, $p . $value if defined($value);
        }
        return @out;
}

=head2 suffix

suffix all values with a string

  @v = suffix( '_my', @v );

=cut

# Append a string to every defined value; undefined values are dropped.
# When the suffix itself is undefined, values are returned unchanged.
sub suffix {
        my ($s, @values) = @_;
        return @values unless defined( $s );

        my @out;
        foreach my $value (@values) {
                push @out, $value . $s if defined($value);
        }
        return @out;
}

=head2 surround

surround all values with two strings

  @v = surround( 'prefix_', '_suffix', @v );

=cut

# Wrap every defined value between a prefix and a suffix; undefined values
# are dropped. An undefined prefix or suffix counts as empty string.
sub surround {
        my ($p, $s, @values) = @_;
        $p = '' unless defined( $p );
        $s = '' unless defined( $s );

        my @out;
        foreach my $value (@values) {
                next unless defined($value);
                push @out, $p . $value . $s;
        }
        return @out;
}

=head2 first

Return first element

  $v = first( @v );

=cut

# Return the first argument (undef if called without arguments).
sub first {
        return $_[0];
}

=head2 lookup

Consult lookup hashes for some value

  @v = lookup(
        sub {
                'ffkk/peri/mfn'.rec('000')
        },
        'ffkk','peri','200-a-200-e',
        sub {
                first(rec(200,'a')).' '.first(rec('200','e'))
        }
  );

Code like above will be B<automatically generated> using L<WebPAC::Parser> from
normal lookup definition in C<conf/lookup/something.pl> which looks like:

  lookup(
        # which results to return from record recorded in lookup
        sub { 'ffkk/peri/mfn' . rec('000') },
        # from which database and input
        'ffkk','peri',
        # such that following values match
        sub { first(rec(200,'a')) . ' ' . first(rec('200','e')) },
        # if this part is missing, we will try to match same fields
        # from lookup record and current one, or you can override
        # which records to use from current record using
        sub { rec('900','x') . ' ' . rec('900','y') },
  )

You can think about this lookup as SQL (if that helps):

  select
        sub { what }
  from
        database, input
  where
    sub { filter from lookuped record }
  having
    sub { optional filter on current record }

Easy as pie, right?

=cut

# Collect values from records which were saved into lookup.
#
# Takes exactly five arguments (confesses otherwise):
#   $what     - closure which produces values from a looked-up record
#   $database - database part of lookup key
#   $input    - input part of lookup key
#   $key      - name of lookup
#   $having   - closure which produces values to search for in lookup
#
# Returns nothing if the lookup for database/input/key doesn't exist or none
# of the values from $having is found in it. Otherwise returns all values
# produced by $what, as scalar if there is just one and as list if not.
#
# Confesses if the lookup exists but no load_row_coderef has been set.
sub lookup {
        my ($what, $database, $input, $key, $having) = @_;

        confess "lookup needs 5 arguments: what, database, input, key, having\n" unless ($#_ == 4);

        warn "## lookup ($database, $input, $key)", $/ if ($debug > 1);
        return unless (defined($lookup->{$database}->{$input}->{$key}));

        confess "lookup really need load_row_coderef added to data_structure\n" unless ($load_row_coderef);

        # set of MFNs (as hash keys) which match any of the having values
        my $mfns;
        # having closure is evaluated against the current record
        my @having = $having->();

        warn "## having = ", dump( @having ) if ($debug > 2);

        foreach my $h ( @having ) {
                if (defined($lookup->{$database}->{$input}->{$key}->{$h})) {
                        warn "lookup for $database/$input/$key/$h return ",dump($lookup->{$database}->{$input}->{$key}->{$h}),"\n" if ($debug);
                        $mfns->{$_}++ foreach keys %{ $lookup->{$database}->{$input}->{$key}->{$h} };
                }
        }

        return unless ($mfns);

        # NOTE(review): this is a string sort, so 10 comes before 2
        my @mfns = sort keys %$mfns;

        warn "# lookup loading $database/$input/$key mfn ", join(",",@mfns)," having ",dump(@having),"\n" if ($debug);

        # $rec is replaced by each loaded row while $what runs, so that rec()
        # calls inside the closure read the looked-up record
        # NOTE(review): there is no eval around the loop, so $rec is not
        # restored if load_row_coderef or $what dies
        my $old_rec = $rec;
        my @out;

        foreach my $mfn (@mfns) {
                $rec = $load_row_coderef->( $database, $input, $mfn );

                warn "got $database/$input/$mfn = ", dump($rec), $/ if ($debug);

                my @vals = $what->();

                push @out, ( @vals );

                warn "lookup for mfn $mfn returned ", dump(@vals), $/ if ($debug);
        }

#       if (ref($lookup->{$k}) eq 'ARRAY') {
#               return @{ $lookup->{$k} };
#       } else {
#               return $lookup->{$k};
#       }

        # back to the record which is being normalised
        $rec = $old_rec;

        warn "## lookup returns = ", dump(@out), $/ if ($debug);

        # single value is returned as scalar, everything else as list
        if ($#out == 0) {
                return $out[0];
        } else {
                return @out;
        }
}

=head2 save_into_lookup

Save value into lookup. It associates current database, input
and specific keys with one or more values which will be
associated over MFN.

MFN will be extracted from the first occurrence of field 000
in the current record, or if it doesn't exist from L<_set_config> C<_mfn>.

  my $nr = save_into_lookup($database,$input,$key,sub {
        # code which produce one or more values 
  });

It returns number of items saved.

This function shouldn't be called directly, it's called from code created by
L<WebPAC::Parser>. 

=cut

# Save values produced by a closure into lookup under the MFN of the current
# record.
#
# Arguments: database, input, key and code reference; dies if any of them is
# missing or if the last one isn't a code reference.
#
# MFN is the first element of field 000 of the current record if that field
# is an array with a defined first element, otherwise C<_mfn> from config.
# Dies if neither is defined.
#
# Returns the number of values saved.
sub save_into_lookup {
        my ($database,$input,$key,$coderef) = @_;
        die "save_into_lookup needs database" unless defined($database);
        die "save_into_lookup needs input" unless defined($input);
        die "save_into_lookup needs key" unless defined($key);
        die "save_into_lookup needs CODE" unless ( defined($coderef) && ref($coderef) eq 'CODE' );

        warn "## save_into_lookup rec = ", dump($rec), " config = ", dump($config), $/ if ($debug > 2);

        # look at field 000 without dereferencing it blindly: the former
        # $rec->{'000'}->[0] test autovivified an empty 000 field in records
        # which didn't have one (and $config->{_mfn} did the same to $config)
        my $field_000 = $rec ? $rec->{'000'} : undef;

        my $mfn =
                ( ref($field_000) eq 'ARRAY' && defined($field_000->[0]) )     ?       $field_000->[0]         :
                ( $config && defined($config->{_mfn}) )                        ?       $config->{_mfn}         :
                                                                                        die "mfn not defined or zero";

        my $nr = 0;

        foreach my $v ( $coderef->() ) {
                $lookup->{$database}->{$input}->{$key}->{$v}->{$mfn}++;
                warn "# saved lookup $database/$input/$key [$v] $mfn\n" if ($debug > 1);
                $nr++;
        }

        return $nr;
}

=head2 config

Consult config values stored in C<config.yml>

  # return database code (key under databases in yaml)
  $database_code = config();    # use _ from hash
  $database_name = config('name');
  $database_input_name = config('input name');

Up to three levels are supported.

=cut

# Return a value from the config hash set with _set_config().
#
# Argument is a whitespace separated path of keys. Without it (or with an
# empty one) the value of the special key C<_> is returned. Returns nothing
# if no config is set.
#
# The path is followed through a deep copy of config. When an array is found
# on the way, its first element is taken (with a warning) and the walk stops.
# False values are reported as empty and undef is returned for them.
sub config {
        return unless ($config);

        my $p = shift || '';

        warn "### getting config($p)\n" if ($debug > 1);

        my $v;
        my @p = split(/\s+/,$p);

        if (@p) {

                # copy, because shift below would modify the real config
                my $c = dclone( $config );

                KEY: foreach my $k (@p) {
                        warn "### k: $k c = ",dump($c),$/ if ($debug > 1);

                        if (ref($c) eq 'ARRAY') {
                                $c = shift @$c;
                                warn "config($p) taking first occurence of '$k', probably not what you wanted!\n";
                                last KEY;
                        }

                        unless ( defined($c->{$k}) ) {
                                $c = undef;
                                last KEY;
                        }

                        $c = $c->{$k};
                }

                $v = $c if ($c);

        } else {
                $v = $config->{ '_' };  # special, database code
        }

        if ($v) {
                warn "## config( '$p' ) = ",dump( $v ),$/ if ($debug);
        } else {
                warn "config( '$p' ) is empty\n";
        }

        return $v;
}

=head2 id

Returns unique id of this record

  $id = id();

Returns C<42/2> for 2nd occurrence of MFN 42.

=cut

# Build the id of the current record: C<_mfn> from config followed by
# whatever WebPAC::Normalize::MARC::_created_marc_records() returns (nothing
# is appended if that is false). Dies if config has no true C<_mfn>.
sub id {
        my $mfn = $config->{_mfn};
        die "no _mfn in config data" unless $mfn;

        my $created = WebPAC::Normalize::MARC::_created_marc_records() || '';
        return $mfn . $created;
}

=head2 join_with

Joins values with some delimiter

  $v = join_with(", ", @v);

=cut

# Join values with a delimiter (first argument), leaving out undefined and
# empty ones. Returns empty string if nothing is left to join.
sub join_with {
        my ($d, @values) = @_;

        warn "### join_with('$d',",dump(@values),")\n" if ($debug > 2);

        my @kept = grep { defined($_) && $_ ne '' } @values;

        # join never returns undef, so no extra check is needed here
        return join($d, @kept);
}

=head2 split_rec_on

Split record subfield on some regex and take one of parts out

  $a_before_semi_column =
        split_rec_on('200','a', /\s*;\s*/, $part);

C<$part> is optional number of element. First element is
B<1>, not 0!

If there is no C<$part> parameter or C<$part> is 0, this function will
return all values produced by splitting.

=cut

# Split the first value of a field/subfield on a regex.
#
# Arguments: field, subfield, regex and optional part number; dies with
# fewer than three arguments. Returns '' if the value is missing or only
# whitespace. If part is a positive number that part is returned (counting
# from 1), otherwise all parts.
sub split_rec_on {
        die "split_rec_on need (fld,sf,regex[,part]" if (@_ < 3);

        my ($fld, $sf, $regex, $part) = @_;
        warn "### regex ", ref($regex), $regex, $/ if ($debug > 2);

        # only the first value is of interest
        my ($v) = rec( $fld, $sf );
        warn "### first rec($fld,$sf) = ",dump($v),$/ if ($debug > 2);

        return '' unless ( defined($v) && $v !~ /^\s*$/ );

        my @s = split( $regex, $v );
        warn "## split_rec_on($fld,$sf,$regex,$part) = ",dump(@s),$/ if ($debug > 1);

        return @s unless ($part && $part > 0);

        return $s[ $part - 1 ];
}

my $hash;

=head2 set

  set( key => 'value' );

=cut

# Remember a value under some key for later use with get().
# Returns the stored value.
sub set {
        my ($k,$v) = @_;

        if ( $debug ) {
                warn "## set ( $k => ", dump($v), " )", $/;
        }

        return $hash->{$k} = $v;
}

=head2 get

  get( 'key' );

=cut

# Return the value stored under some key with set().
# Returns nothing if the key is undefined or empty string.
sub get {
        my $k = shift;
        # test for defined and non-empty instead of truth, so that key '0'
        # (which set() stores without problems) can be read back
        return unless ( defined($k) && $k ne '' );
        my $v = $hash->{$k};
        warn "## get $k = ", dump( $v ), $/ if ( $debug );
        return $v;
}

=head2 count

  if ( count( @result ) == 1 ) {
        # do something if only 1 result is there
  }

=cut

# Return the number of arguments, as string.
sub count {
        if ( $debug ) {
                warn "## count ",dump(@_),$/;
        }
        return scalar(@_) . '';
}

# END
1;