lib/WebPAC/Normalize.pm

package WebPAC::Normalize;
use Exporter 'import';
our @EXPORT = qw/
        _set_ds _set_lookup
        _set_load_row
        _get_ds _clean_ds
        _debug
        _pack_subfields_hash

        to
        search_display search display sorted

        rec1 rec2 rec
        frec frec_eq frec_ne
        regex prefix suffix surround
        first lookup join_with
        save_into_lookup

        split_rec_on

        get set
        count

        row
        rec_array

/;

use warnings;
use strict;

#use base qw/WebPAC::Common/;
use Data::Dump qw/dump/;
use Carp qw/confess/;

# debugging warn(s)
# File-scoped debug level; the "if ($debug ...)" guards throughout this
# module read it, higher numbers enable more output.
my $debug = 0;
# Route the initial level through _debug() so that it is also copied into
# $WebPAC::Normalize::MARC::debug.
_debug( $debug );

# FIXME
use WebPAC::Normalize::ISBN;
push @EXPORT, ( 'isbn_10', 'isbn_13' );

use WebPAC::Normalize::MARC;
push @EXPORT, ( qw/
        marc marc_indicators marc_repeatable_subfield
        marc_compose marc_leader marc_fixed
        marc_duplicate marc_remove marc_count
        marc_original_order
        marc_template
/);

use Storable qw/dclone/;

=head1 NAME

WebPAC::Normalize - describe normalisation rules using sets

=cut

our $VERSION = '0.36';

=head1 SYNOPSIS

This module uses C<conf/normalize/*.pl> files to perform normalisation
from input records using perl functions which are specialized for set
processing.

Sets are implemented as arrays, and the normalisation file is valid perl, which
means that you can check its validity before running WebPAC using
C<perl -c normalize.pl>.

Normalisation can generate multiple output normalized data. For now, supported output
types (on the left side of definition) are: C<search_display>, C<display>, C<search> and
C<marc>.

=head1 FUNCTIONS

Functions which start with C<_> are private and used by WebPAC internally.
All other functions are available for use within normalisation rules.

=head2 data_structure

Return data structure

  my $ds = WebPAC::Normalize::data_structure(
        lookup => $lookup_hash,
        row => $row,
        rules => $normalize_pl_config,
        marc_encoding => 'utf-8',
        config => $config,
        load_row_coderef => sub {
                my ($database,$input,$mfn) = @_;
                $store->load_row( database => $database, input => $input, id => $mfn );
        },
  );

Options C<row> and C<rules> are mandatory while all
others are optional.

C<load_row_coderef> is closure only used when executing lookups, so they will
die if it's not defined.

This function will B<die> if the normalisation rules can't be evaled.

Since this function isn't exported you have to call it with 
C<WebPAC::Normalize::data_structure>.

=cut

# Code reference used by lookup() to load other records; it is set by
# data_structure() below or by _set_load_row().
my $load_row_coderef;

# Build the output data structure for one input row by evaluating the
# normalisation rules (a string of Perl code) and returning _get_ds().
#
# Arguments are a key => value list; the keys read here are row, rules,
# lookup, config and load_row_coderef.
#
# Dies when row or rules is missing (or false), or when evaluating the
# rules fails.
sub data_structure {
        my $arg = {@_};

        die "need row argument" unless ($arg->{row});
        die "need normalisation argument" unless ($arg->{rules});

        # lookup and config are only replaced when supplied, so values from a
        # previous call stay in effect otherwise
        _set_lookup( $arg->{lookup} ) if defined($arg->{lookup});
        _set_ds( $arg->{row} );
        _set_config( $arg->{config} ) if defined($arg->{config});
        # start from an empty output structure for this record
        _clean_ds( %{ $arg } );
        # NOTE: unlike lookup/config this assignment is unconditional, so a
        # code reference installed earlier with _set_load_row() is replaced
        # by undef when the load_row_coderef key is absent.
        $load_row_coderef = $arg->{load_row_coderef};

        # The rules are evaluated as Perl source inside this package with
        # relaxed strictures: barewords are allowed and sub redefinition
        # doesn't warn.
        # NOTE(review): string eval runs arbitrary code -- rules must come
        # from a trusted source.
        no strict 'subs';
        no warnings 'redefine';
        eval "$arg->{rules};";
        die "error evaling $arg->{rules}: $@\n" if ($@);

        return _get_ds();
}

=head2 _set_ds

Set current record hash

  _set_ds( $rec );

=cut

my $rec;

sub _set_ds {
        # Store the argument as the current record first, then refuse false
        # values, and finally share the record with the MARC helper module.
        ($rec) = @_;
        die "no record hash" unless $rec;
        return $WebPAC::Normalize::MARC::rec = $rec;
}

=head2

  my $rec = _get_rec();

=cut

sub _get_rec {
        # accessor for the current record hash
        return $rec;
}

sub rec_array {
        # Return the elements of a field of the current record; the field
        # has to hold an array reference, anything else is fatal.
        my ($field) = @_;
        my $values = $rec->{$field};
        unless ( ref($values) eq 'ARRAY' ) {
                die "field $field not array: ",dump( $values );
        }
        return @$values;
}

=head2 _set_config

Set current config hash

  _set_config( $config );

Magic keys are:

=over 4

=item _

Code of current database

=item _mfn

Current MFN

=back

=cut

my $config;

sub _set_config {
        # remember the config hash (first argument) and hand it back
        ($config) = @_;
        return $config;
}

=head2 _get_ds

Return hash formatted as data structure

  my $ds = _get_ds();

=cut

my $out;

# accessor for the output structure collected for the current record
sub _get_ds { $out }

=head2 _clean_ds

Clean data structure hash for next record

  _clean_ds();

=cut

sub _clean_ds {
        # Forget the output collected for the previous record and let the MARC
        # helper module reset its own state.
        #
        # Arguments are accepted (data_structure passes its options) but they
        # are not used.  They used to be copied into an unused "my $a" hash,
        # which shadowed sort's $a and warned for odd-sized argument lists.
        $out = undef;
        return WebPAC::Normalize::MARC::_clean();
}

=head2 _set_lookup

Set current lookup hash

  _set_lookup( $lookup );

=cut

my $lookup;

sub _set_lookup {
        # remember the lookup hash (first argument) and hand it back
        ($lookup) = @_;
        return $lookup;
}

=head2 _get_lookup

Get current lookup hash

  my $lookup = _get_lookup();

=cut

# accessor for the current lookup hash
sub _get_lookup { $lookup }

=head2 _set_load_row

Setup code reference which will return L<data_structure> from
L<WebPAC::Store>

  _set_load_row(sub {
                my ($database,$input,$mfn) = @_;
                $store->load_row( database => $database, input => $input, id => $mfn );
  });

=cut

sub _set_load_row {
        # Install the row loader; anything but a CODE reference is rejected
        # with a stack trace.
        my ($loader) = @_;
        if ( ref($loader) ne 'CODE' ) {
                confess "argument isn't CODE";
        }
        return $load_row_coderef = $loader;
}

=head2 _debug

Change level of debug warnings

  _debug( 2 );

=cut

sub _debug {
        # Without an argument: report the current level.
        # With an argument: announce positive levels, store the level and
        # mirror it into the MARC helper module.
        my ($level) = @_;
        defined($level) or return $debug;
        if ($level > 0) {
                warn "debug level $level",$/;
        }
        $debug = $level;
        return $WebPAC::Normalize::MARC::debug = $debug;
}

=head1 Functions to create C<data_structure>

Those functions generally have to come first in your normalization file.

=head2 to

Generic way to set values for some name

  to('display', 'Title', rec('200','a') );

There are many helpers defined below which might be easier to use.

=cut

sub to {
        # Store all non-empty values under $out->{name}->{type}.
        # Both type and name are mandatory; nothing is stored (and nothing
        # returned) when no usable value is left.
        my ($type, $name, @values) = @_;
        $type or confess "need type -- BUG?";
        $name or confess "needs name as first argument";

        my @kept = grep { defined($_) && $_ ne '' } @values;
        return unless @kept;

        return $out->{$name}->{$type} = \@kept;
}

=head2 search_display

Define output for L<search> and L<display> at the same time

  search_display('Title', rec('200','a') );

=cut

sub search_display {
        # Store all non-empty values for both the search and the display
        # output of $name; the two entries share one array reference.
        my ($name, @values) = @_;
        die "search_display needs name as first argument" unless $name;

        my @kept = grep { defined($_) && $_ ne '' } @values;
        return unless @kept;

        my $shared = \@kept;
        $out->{$name}->{search}  = $shared;
        $out->{$name}->{display} = $shared;
        return $shared;
}

=head2 tag

Old name for L<search_display>, it will probably be removed at one point.

=cut

# old name, forwards everything to search_display()
sub tag { return search_display( @_ ) }

=head2 display

Define output just for I<display>

  @v = display('Title', rec('200','a') );

=cut

sub display {
        # thin wrapper: store the values under the 'display' type
        return to( 'display', @_ );
}

=head2 search

Prepare values just for I<search>

  @v = search('Title', rec('200','a') );

=cut

sub search {
        # thin wrapper: store the values under the 'search' type
        return to( 'search', @_ );
}

=head2 sorted

Insert into lists which will be automatically sorted

 sorted('Title', rec('200','a') );

=cut

sub sorted {
        # thin wrapper: store the values under the 'sorted' type
        return to( 'sorted', @_ );
}

=head2 row

Insert new row of data into output module

  row( 'table', column => 'foo', column2 => 'bar' );

=cut

use Data::Dump qw/dump/;

sub row {
        # Arguments are the table name followed by column => value pairs, so a
        # valid call always has an odd number of arguments (at least one).
        # The message reports the real argument count; it used to print $#_,
        # the index of the last argument, which is one less.
        die "array doesn't have odd number of elements but ", scalar(@_), ": ",dump( @_ ) if @_ % 2 == 0;

        my $table = shift @_;

        # append the row to the list kept for this table; returns the new
        # number of rows, as push does
        return push @{ $out->{'_rows'}->{$table} }, {@_};
}


=head1 Functions to extract data from input

This function should be used inside functions to create C<data_structure> described
above.

=head2 _pack_subfields_hash

 @subfields = _pack_subfields_hash( $h );
 $subfields = _pack_subfields_hash( $h, 1 );

Return each subfield value in array or pack them all together and return scalar
with subfields (denoted by C<^>) and values.

=cut

sub _pack_subfields_hash {

        warn "## _pack_subfields_hash( ",dump(@_), " )\n" if ($debug > 1);

        my ($hash,$include_subfields) = @_;

        # sanity and ease of use: anything which isn't a hash is handed back as-is
        return $hash if (ref($hash) ne 'HASH');

        # work on a deep copy, the subfields list is consumed (shifted) below
        my $h = dclone( $hash );

        if ( defined($h->{subfields}) ) {
                # the list is read as consecutive (subfield name, index) pairs
                # and dictates the order of the output
                my $sfs = delete $h->{subfields} || die "no subfields?";
                my @out;
                while (@$sfs) {
                        my $sf = shift @$sfs;
                        push @out, '^' . $sf if ($include_subfields);
                        my $o = shift @$sfs;
                        if ($o == 0 && ref( $h->{$sf} ) ne 'ARRAY' ) {
                                # single element subfields are not arrays
                                push @out, $h->{$sf};
                        } else {
                                push @out, $h->{$sf}->[$o];
                        }
                }
                return $include_subfields ? join('', @out) : @out;
        }

        # No recorded order: use alphabetical subfield order in both modes.
        # The list variant used to return values in hash order, which differs
        # between runs.
        my @sorted_sf = sort keys %$h;

        if ($include_subfields) {
                my $packed = '';
                foreach my $sf (@sorted_sf) {
                        if (ref($h->{$sf}) eq 'ARRAY') {
                                # repeated subfield: every value gets its own ^sf marker
                                $packed .= '^' . $sf . join('^' . $sf, @{ $h->{$sf} });
                        } else {
                                $packed .= '^' . $sf . $h->{$sf};
                        }
                }
                return $packed;
        }

        # like "values", this yields the number of values in scalar context
        return map { $h->{$_} } @sorted_sf;
}

=head2 rec1

Return all values in some field

  @v = rec1('200')

TODO: order of values is probably same as in source data, need to investigate that

=cut

sub rec1 {
        my $f = shift;

        # nothing to return without a current record or without this field
        return unless (defined($rec) && defined($rec->{$f}));

        # Trace only after the guard.  The trace used to be emitted a second
        # time before it, where it dereferenced a possibly undefined $rec.
        warn "rec1($f) = ", dump( $rec->{$f} ), $/ if ($debug > 1);

        my $field = $rec->{$f};

        # plain (non-array) field: return it as it is
        return $field unless (ref($field) eq 'ARRAY');

        my @out;
        foreach my $h ( @$field ) {
                if (ref($h) eq 'HASH') {
                        # occurrence with subfields: flatten into its values
                        push @out, ( _pack_subfields_hash( $h ) );
                } else {
                        push @out, $h;
                }
        }
        return @out;
}

=head2 rec2

Return all values in specific field and subfield

  @v = rec2('200','a')

=cut

sub rec2 {
        my $f = shift;

        # nothing to return without a current record or without this field
        # (the old guard had its parenthesis misplaced: defined($rec && ...))
        return unless ($rec && defined($rec->{$f}));

        my $sf = shift;
        warn "rec2($f,$sf) = ", dump( $rec->{$f} ), $/ if ($debug > 1);

        # A field which isn't an array of occurrences can't carry subfields.
        # rec1 accepts such fields, so return nothing here instead of dying
        # on the array dereference below.
        return unless (ref($rec->{$f}) eq 'ARRAY');

        return map {
                # repeated subfields are stored as arrays and get flattened
                ref($_->{$sf}) eq 'ARRAY' ? @{ $_->{$sf} } : $_->{$sf};
        } grep { ref($_) eq 'HASH' && defined $_->{$sf} } @{ $rec->{$f} };
}

=head2 rec

syntactic sugar for

  @v = rec('200')
  @v = rec('200','a')

If rec() returns just single value, it will
return scalar, not array.

=cut

sub rec {
        # One argument selects a whole field (rec1), two arguments a subfield
        # (rec2); any other argument count yields no values.
        my $argc = scalar @_;
        my @found =
                  $argc == 1 ? rec1(@_)
                : $argc == 2 ? rec2(@_)
                :              ();

        # an empty string stands in for "no values"
        return '' unless @found;

        # a lone value is returned as a scalar unless a list was asked for
        return $found[0] if @found == 1 && ! wantarray;

        return @found;
}

=head2 frec

Returns first value from field

  $v = frec('200');
  $v = frec('200','a');

=cut

sub frec {
        # Keep the first value rec() produces and complain when others are
        # silently dropped.
        my ($head, @rest) = rec(@_);
        if (@rest) {
                warn "rec(",dump(@_),") has more than one return value, ignoring\n";
        }
        return $head;
}

=head2 frec_eq

=head2 frec_ne

Check if first values from two fields are same or different

  if ( frec_eq( 900 => 'a', 910 => 'c' ) ) {
        # values are same
  } else {
    # values are different
  }

Strictly speaking C<frec_eq> and C<frec_ne> wouldn't be needed if you
could write something like:

  if ( frec( '900','a' ) eq frec( '910','c' ) ) {
        # yada tada
  }

but you can't since our parser L<WebPAC::Parser> will remove all whitespaces
in order to parse text and create invalid function C<eqfrec>.

=cut

sub frec_eq {
        # string-compare the first value of field/subfield 1 with the first
        # value of field/subfield 2
        my ( $f1,$sf1, $f2, $sf2 ) = @_;
        my ($left)  = rec( $f1, $sf1 );
        my ($right) = rec( $f2, $sf2 );
        return $left eq $right;
}

sub frec_ne {
        # logical negation of frec_eq() for the same arguments
        my $same = frec_eq( @_ );
        return ! $same;
}

=head2 regex

Apply regex to some or all values

  @v = regex( 's/foo/bar/g', @v );

=cut

sub regex {
        # Apply the operation in $r (Perl source such as 's/foo/bar/g') to a
        # copy of every true value and return the results which are still true.
        #
        # NOTE(review): $r is compiled with string eval, so it must come from
        # trusted normalisation rules, never from record data.
        my $r = shift;
        my @out;
        foreach my $t (@_) {
                next unless ($t);

                # $t aliases the caller's argument.  Working on it directly
                # changed the caller's variables, and for read-only literals
                # the substitution died inside eval and was silently skipped.
                my $value = $t;

                eval "\$value =~ $r";
                # report broken expressions instead of hiding them; the value
                # is still passed through unchanged, as before
                warn "regex( $r ) failed: $@" if ($@);

                push @out, $value if ($value && $value ne '');
        }
        return @out;
}

=head2 prefix

Prefix all values with a string

  @v = prefix( 'my_', @v );

=cut

sub prefix {
        # Put $p in front of every defined value; undefined values are
        # dropped.  Without a prefix the values are returned untouched.
        my ($p, @values) = @_;
        return @values unless defined( $p );

        my @result;
        for my $value (@values) {
                next unless defined($value);
                push @result, $p . $value;
        }
        return @result;
}

=head2 suffix

suffix all values with a string

  @v = suffix( '_my', @v );

=cut

sub suffix {
        # Append $s to every defined value; undefined values are dropped.
        # Without a suffix the values are returned untouched.
        my ($s, @values) = @_;
        return @values unless defined( $s );

        my @result;
        for my $value (@values) {
                next unless defined($value);
                push @result, $value . $s;
        }
        return @result;
}

=head2 surround

surround all values with a two strings

  @v = surround( 'prefix_', '_suffix', @v );

=cut

sub surround {
        # Wrap every defined value into prefix and suffix; a missing prefix
        # or suffix counts as an empty string, undefined values are dropped.
        my ($p, $s, @values) = @_;
        $p = '' unless defined( $p );
        $s = '' unless defined( $s );

        my @result;
        for my $value (@values) {
                next unless defined($value);
                push @result, $p . $value . $s;
        }
        return @result;
}

=head2 first

Return first element

  $v = first( @v );

=cut

sub first {
        # only the leading argument is of interest (undef when there is none)
        my ($head) = @_;
        return $head;
}

=head2 lookup

Consult lookup hashes for some value

  @v = lookup(
        sub {
                'ffkk/peri/mfn'.rec('000')
        },
        'ffkk','peri','200-a-200-e',
        sub {
                first(rec(200,'a')).' '.first(rec('200','e'))
        }
  );

Code like above will be B<automatically generated> using L<WebPAC::Parser> from
normal lookup definition in C<conf/lookup/something.pl> which looks like:

  lookup(
        # which results to return from record recorded in lookup
        sub { 'ffkk/peri/mfn' . rec('000') },
        # from which database and input
        'ffkk','peri',
        # such that following values match
        sub { first(rec(200,'a')) . ' ' . first(rec('200','e')) },
        # if this part is missing, we will try to match same fields
        # from lookup record and current one, or you can override
        # which records to use from current record using
        sub { rec('900','x') . ' ' . rec('900','y') },
  )

You can think about this lookup as SQL (if that helps):

  select
        sub { what }
  from
        database, input
  where
    sub { filter from lookuped record }
  having
    sub { optional filter on current record }

Easy as pie, right?

=cut

# Collect values from other records which are referenced through the
# lookup hash.
#
#   $what     - code ref; called once for every matching record, while that
#               record is the current one, to produce the returned values
#   $database, $input, $key
#             - path into the lookup hash
#   $having   - code ref; called against the current record to produce the
#               values which are searched for under $key
#
# Returns nothing when the key is unknown or nothing matches, a single
# value when exactly one value was produced, otherwise the list of values.
# confess()es unless called with exactly five arguments, or when no
# load_row code reference has been set.
sub lookup {
        my ($what, $database, $input, $key, $having) = @_;

        confess "lookup needs 5 arguments: what, database, input, key, having\n" unless ($#_ == 4);

        warn "## lookup ($database, $input, $key)", $/ if ($debug > 1);
        # nothing was saved under this database/input/key
        return unless (defined($lookup->{$database}->{$input}->{$key}));

        confess "lookup really need load_row_coderef added to data_structure\n" unless ($load_row_coderef);

        # set of MFNs (hash keys) saved for any of the "having" values
        my $mfns;
        my @having = $having->();

        warn "## having = ", dump( @having ) if ($debug > 2);

        foreach my $h ( @having ) {
                if (defined($lookup->{$database}->{$input}->{$key}->{$h})) {
                        warn "lookup for $database/$input/$key/$h return ",dump($lookup->{$database}->{$input}->{$key}->{$h}),"\n" if ($debug);
                        $mfns->{$_}++ foreach keys %{ $lookup->{$database}->{$input}->{$key}->{$h} };
                }
        }

        return unless ($mfns);

        # NOTE: plain sort orders the MFNs as strings, not numerically
        my @mfns = sort keys %$mfns;

        warn "# lookup loading $database/$input/$key mfn ", join(",",@mfns)," having ",dump(@having),"\n" if ($debug);

        # $what->() reads the current record, so every looked-up record is
        # temporarily made current; the original record is restored below.
        # NOTE(review): if the loader or $what dies, $rec is not restored.
        my $old_rec = $rec;
        my @out;

        foreach my $mfn (@mfns) {
                $rec = $load_row_coderef->( $database, $input, $mfn );

                warn "got $database/$input/$mfn = ", dump($rec), $/ if ($debug);

                my @vals = $what->();

                push @out, ( @vals );

                warn "lookup for mfn $mfn returned ", dump(@vals), $/ if ($debug);
        }

#       if (ref($lookup->{$k}) eq 'ARRAY') {
#               return @{ $lookup->{$k} };
#       } else {
#               return $lookup->{$k};
#       }

        $rec = $old_rec;

        warn "## lookup returns = ", dump(@out), $/ if ($debug);

        # a single result is returned as a plain scalar
        if ($#out == 0) {
                return $out[0];
        } else {
                return @out;
        }
}

=head2 save_into_lookup

Save value into lookup. It associates current database, input
and specific keys with one or more values which will be
associated over MFN.

MFN will be extracted from the first occurrence of field 000
in current record, or if it doesn't exist from L<_set_config> C<_mfn>.

  my $nr = save_into_lookup($database,$input,$key,sub {
        # code which produce one or more values 
  });

It returns number of items saved.

This function shouldn't be called directly, it's called from code created by
L<WebPAC::Parser>. 

=cut

sub save_into_lookup {
        # Record every value produced by $coderef in the lookup hash as
        #   $lookup->{database}->{input}->{key}->{value}->{mfn}
        # and return the number of values recorded.
        # Dies when an argument is missing or when no MFN can be determined.
        my ($database,$input,$key,$coderef) = @_;
        die "save_into_lookup needs database" unless defined($database);
        die "save_into_lookup needs input" unless defined($input);
        die "save_into_lookup needs key" unless defined($key);
        die "save_into_lookup needs CODE" unless ( defined($coderef) && ref($coderef) eq 'CODE' );

        warn "## save_into_lookup rec = ", dump($rec), " config = ", dump($config), $/ if ($debug > 2);

        # Fetch field 000 in one step.  Probing $rec->{'000'}->[0] directly
        # autovivified an empty 000 field in records which don't have one
        # (and an empty $rec / $config when those were undefined).
        my $field_000 = $rec ? $rec->{'000'} : undef;

        # MFN comes from the first occurrence of field 000, else from config
        my $mfn =
                ( defined($field_000) && defined($field_000->[0]) )    ?       $field_000->[0] :
                ( $config && defined($config->{_mfn}) )                ?       $config->{_mfn} :
                                                                                die "mfn not defined or zero";

        my $nr = 0;

        foreach my $v ( $coderef->() ) {
                $lookup->{$database}->{$input}->{$key}->{$v}->{$mfn}++;
                warn "# saved lookup $database/$input/$key [$v] $mfn\n" if ($debug > 1);
                $nr++;
        }

        return $nr;
}

=head2 config

Consult config values stored in C<config.yml>

  # return database code (key under databases in yaml)
  $database_code = config();    # use _ from hash
  $database_name = config('name');
  $database_input_name = config('input name');

Up to three levels are supported.

=cut

# Return a value from the current config hash.
#
# $p is a whitespace separated path of keys; with an empty or missing path
# the value stored under the magic key '_' is returned.
#
# Returns nothing when no config is set.  Otherwise returns the value found,
# or undef (after a warning) when the path can't be resolved or leads to a
# false value.
sub config {
        return unless ($config);

        my $p = shift;

        $p ||= '';

        my $v;

        warn "### getting config($p)\n" if ($debug > 1);

        my @p = split(/\s+/,$p);
        if ($#p < 0) {
                $v = $config->{ '_' };  # special, database code
        } else {

                # walk a deep copy, so the shift below can't modify the
                # real config
                my $c = dclone( $config );

                foreach my $k (@p) {
                        warn "### k: $k c = ",dump($c),$/ if ($debug > 1);
                        if (ref($c) eq 'ARRAY') {
                                # an array on the path: its first element becomes
                                # the result and the walk stops here, $k and any
                                # remaining keys are not applied
                                $c = shift @$c;
                                warn "config($p) taking first occurence of '$k', probably not what you wanted!\n";
                                last;
                        }

                        if (! defined($c->{$k}) ) {
                                # path doesn't exist
                                $c = undef;
                                last;
                        } else {
                                $c = $c->{$k};
                        }
                }
                # NOTE: false values (0, '') are treated like missing ones
                $v = $c if ($c);

        }

        warn "## config( '$p' ) = ",dump( $v ),$/ if ($v && $debug);
        warn "config( '$p' ) is empty\n" if (! $v);

        return $v;
}

=head2 id

Returns unique id of this record

  $id = id();

Returns C<42/2> for 2nd occurence of MFN 42.

=cut

sub id {
        # The id is the MFN from config followed by whatever the MARC helper
        # reports for created records (nothing when that is false).
        my $mfn = $config->{_mfn};
        die "no _mfn in config data" unless $mfn;

        my $suffix = WebPAC::Normalize::MARC::_created_marc_records() || '';
        return $mfn . $suffix;
}

=head2 join_with

Joins values with some delimiter

  $v = join_with(", ", @v);

=cut

sub join_with {
        # Glue all non-empty values together with the delimiter; undefined
        # and empty values are skipped.
        my ($d, @values) = @_;
        warn "### join_with('$d',",dump(@values),")\n" if ($debug > 2);

        my @kept = grep { defined($_) && $_ ne '' } @values;
        my $joined = join($d, @kept);

        return defined($joined) ? $joined : '';
}

=head2 split_rec_on

Split record subfield on some regex and take one of parts out

  $a_before_semi_column =
        split_rec_on('200','a', /\s*;\s*/, $part);

C<$part> is optional number of element. First element is
B<1>, not 0!

If there is no C<$part> parameter or C<$part> is 0, this function will
return all values produced by splitting.

=cut

sub split_rec_on {
        # Split the first value of a field/subfield on $regex.  A positive
        # $part selects a single piece (counting from 1), otherwise all pieces
        # are returned.  A missing or blank value gives an empty string.
        if ( @_ < 3 ) {
                die "split_rec_on need (fld,sf,regex[,part]";
        }

        my ($fld, $sf, $regex, $part) = @_;
        warn "### regex ", ref($regex), $regex, $/ if ($debug > 2);

        # only the first value is split, further values are ignored
        my ($v) = rec( $fld, $sf );
        warn "### first rec($fld,$sf) = ",dump($v),$/ if ($debug > 2);

        return '' unless ( defined($v) && $v !~ /^\s*$/ );

        my @s = split( $regex, $v );
        warn "## split_rec_on($fld,$sf,$regex,$part) = ",dump(@s),$/ if ($debug > 1);

        return @s unless ($part && $part > 0);
        return $s[ $part - 1 ];
}

# key/value storage shared by set() and get()
my $hash;

=head2 set

  set( key => 'value' );

=cut

sub set {
        # remember a value under the given key and hand the value back
        my ($key, $value) = @_;
        if ( $debug ) {
                warn "## set ( $key => ", dump($value), " )", $/;
        }
        return $hash->{$key} = $value;
}

=head2 get

  get( 'key' );

=cut

sub get {
        # fetch a value stored by set(); a missing or false key yields nothing
        my ($key) = @_;
        return unless $key;

        my $value = $hash->{$key};
        if ( $debug ) {
                warn "## get $key = ", dump( $value ), $/;
        }
        return $value;
}

=head2 count

  if ( count( @result ) == 1 ) {
        # do something if only 1 result is there
  }

=cut

sub count {
        # number of arguments, returned as a string
        warn "## count ",dump(@_),$/ if ( $debug );
        my $n = scalar @_;
        return "$n";
}

# END
1;
1	package WebPAC::Normalize;
2	use Exporter 'import';
3	our @EXPORT = qw/
4	_set_ds _set_lookup
5	_set_load_row
6	_get_ds _clean_ds
7	_debug
8	_pack_subfields_hash
9
10	to
11	search_display search display sorted
12
13	rec1 rec2 rec
14	frec frec_eq frec_ne
15	regex prefix suffix surround
16	first lookup join_with
17	save_into_lookup
18
19	split_rec_on
20
21	get set
22	count
23
24	row
25	rec_array
26
27	/;
28
29	use warnings;
30	use strict;
31
32	#use base qw/WebPAC::Common/;
33	use Data::Dump qw/dump/;
34	use Carp qw/confess/;
35
36	# debugging warn(s)
37	my $debug = 0;
38	_debug( $debug );
39
40	# FIXME
41	use WebPAC::Normalize::ISBN;
42	push @EXPORT, ( 'isbn_10', 'isbn_13' );
43
44	use WebPAC::Normalize::MARC;
45	push @EXPORT, ( qw/
46	marc marc_indicators marc_repeatable_subfield
47	marc_compose marc_leader marc_fixed
48	marc_duplicate marc_remove marc_count
49	marc_original_order
50	marc_template
51	/);
52
53	use Storable qw/dclone/;
54
55	=head1 NAME
56
57	WebPAC::Normalize - describe normalisaton rules using sets
58
59	=cut
60
61	our $VERSION = '0.36';
62
63	=head1 SYNOPSIS
64
65	This module uses C<conf/normalize/*.pl> files to perform normalisation
66	from input records using perl functions which are specialized for set
67	processing.
68
69	Sets are implemented as arrays, and normalisation file is valid perl, which
70	means that you check it's validity before running WebPAC using
71	C<perl -c normalize.pl>.
72
73	Normalisation can generate multiple output normalized data. For now, supported output
74	types (on the left side of definition) are: C<search_display>, C<display>, C<search> and
75	C<marc>.
76
77	=head1 FUNCTIONS
78
79	Functions which start with C<_> are private and used by WebPAC internally.
80	All other functions are available for use within normalisation rules.
81
82	=head2 data_structure
83
84	Return data structure
85
86	my $ds = WebPAC::Normalize::data_structure(
87	lookup => $lookup_hash,
88	row => $row,
89	rules => $normalize_pl_config,
90	marc_encoding => 'utf-8',
91	config => $config,
92	load_row_coderef => sub {
93	my ($database,$input,$mfn) = @_;
94	$store->load_row( database => $database, input => $input, id => $mfn );
95	},
96	);
97
98	Options C<row>, C<rules> and C<log> are mandatory while all
99	other are optional.
100
101	C<load_row_coderef> is closure only used when executing lookups, so they will
102	die if it's not defined.
103
104	This function will B<die> if normalizastion can't be evaled.
105
106	Since this function isn't exported you have to call it with
107	C<WebPAC::Normalize::data_structure>.
108
109	=cut
110
111	my $load_row_coderef;
112
113	sub data_structure {
114	my $arg = {@_};
115
116	die "need row argument" unless ($arg->{row});
117	die "need normalisation argument" unless ($arg->{rules});
118
119	_set_lookup( $arg->{lookup} ) if defined($arg->{lookup});
120	_set_ds( $arg->{row} );
121	_set_config( $arg->{config} ) if defined($arg->{config});
122	_clean_ds( %{ $arg } );
123	$load_row_coderef = $arg->{load_row_coderef};
124
125	no strict 'subs';
126	no warnings 'redefine';
127	eval "$arg->{rules};";
128	die "error evaling $arg->{rules}: $@\n" if ($@);
129
130	return _get_ds();
131	}
132
133	=head2 _set_ds
134
135	Set current record hash
136
137	_set_ds( $rec );
138
139	=cut
140
141	my $rec;
142
143	sub _set_ds {
144	$rec = shift or die "no record hash";
145	$WebPAC::Normalize::MARC::rec = $rec;
146	}
147
148	=head2
149
150	my $rec = _get_rec();
151
152	=cut
153
154	sub _get_rec { $rec };
155
156	sub rec_array {
157	my $d = $rec->{ $_[0] };
158	return @$d if ref($d) eq 'ARRAY';
159	die "field $_[0] not array: ",dump( $d );
160	}
161
162	=head2 _set_config
163
164	Set current config hash
165
166	_set_config( $config );
167
168	Magic keys are:
169
170	=over 4
171
172	=item _
173
174	Code of current database
175
176	=item _mfn
177
178	Current MFN
179
180	=back
181
182	=cut
183
184	my $config;
185
186	sub _set_config {
187	$config = shift;
188	}
189
190	=head2 _get_ds
191
192	Return hash formatted as data structure
193
194	my $ds = _get_ds();
195
196	=cut
197
198	my $out;
199
200	sub _get_ds {
201	#warn "## out = ",dump($out);
202	return $out;
203	}
204
205	=head2 _clean_ds
206
207	Clean data structure hash for next record
208
209	_clean_ds();
210
211	=cut
212
213	sub _clean_ds {
214	my $a = {@_};
215	$out = undef;
216	WebPAC::Normalize::MARC::_clean();
217	}
218
219	=head2 _set_lookup
220
221	Set current lookup hash
222
223	_set_lookup( $lookup );
224
225	=cut
226
227	my $lookup;
228
229	sub _set_lookup {
230	$lookup = shift;
231	}
232
233	=head2 _get_lookup
234
235	Get current lookup hash
236
237	my $lookup = _get_lookup();
238
239	=cut
240
241	sub _get_lookup {
242	return $lookup;
243	}
244
245	=head2 _set_load_row
246
247	Setup code reference which will return L<data_structure> from
248	L<WebPAC::Store>
249
250	_set_load_row(sub {
251	my ($database,$input,$mfn) = @_;
252	$store->load_row( database => $database, input => $input, id => $mfn );
253	});
254
255	=cut
256
257	sub _set_load_row {
258	my $coderef = shift;
259	confess "argument isn't CODE" unless ref($coderef) eq 'CODE';
260
261	$load_row_coderef = $coderef;
262	}
263
264	=head2 _debug
265
266	Change level of debug warnings
267
268	_debug( 2 );
269
270	=cut
271
272	sub _debug {
273	my $l = shift;
274	return $debug unless defined($l);
275	warn "debug level $l",$/ if ($l > 0);
276	$debug = $l;
277	$WebPAC::Normalize::MARC::debug = $debug;
278	}
279
280	=head1 Functions to create C<data_structure>
281
282	Those functions generally have to first in your normalization file.
283
284	=head2 to
285
286	Generic way to set values for some name
287
288	to('field-name', 'name-value' => rec('200','a') );
289
290	There are many helpers defined below which might be easier to use.
291
292	=cut
293
294	sub to {
295	my $type = shift or confess "need type -- BUG?";
296	my $name = shift or confess "needs name as first argument";
297	my @o = grep { defined($_) && $_ ne '' } @_;
298	return unless (@o);
299	$out->{$name}->{$type} = \@o;
300	}
301
302	=head2 search_display
303
304	Define output for L<search> and L<display> at the same time
305
306	search_display('Title', rec('200','a') );
307
308	=cut
309
310	sub search_display {
311	my $name = shift or die "search_display needs name as first argument";
312	my @o = grep { defined($_) && $_ ne '' } @_;
313	return unless (@o);
314	$out->{$name}->{search} = \@o;
315	$out->{$name}->{display} = \@o;
316	}
317
318	=head2 tag
319
320	Old name for L<search_display>, it will probably be removed at one point.
321
322	=cut
323
324	sub tag {
325	search_display( @_ );
326	}
327
328	=head2 display
329
330	Define output just for I<display>
331
332	@v = display('Title', rec('200','a') );
333
334	=cut
335
336	sub display { to( 'display', @_ ) }
337
338	=head2 search
339
340	Prepare values just for I<search>
341
342	@v = search('Title', rec('200','a') );
343
344	=cut
345
346	sub search { to( 'search', @_ ) }
347
348	=head2 sorted
349
350	Insert into lists which will be automatically sorted
351
352	sorted('Title', rec('200','a') );
353
354	=cut
355
356	sub sorted { to( 'sorted', @_ ) }
357
358	=head2 row
359
360	Insert new row of data into output module
361
362	row( column => 'foo', column2 => 'bar' );
363
364	=cut
365
366	use Data::Dump qw/dump/;
367
368	sub row {
369	die "array doesn't have odd number of elements but $#_: ",dump( @_ ) if $#_ % 2 == 1;
370	my $table = shift @_;
371	push @{ $out->{'_rows'}->{$table} }, {@_};
372	}
373
374
375	=head1 Functions to extract data from input
376
377	This function should be used inside functions to create C<data_structure> described
378	above.
379
380	=head2 _pack_subfields_hash
381
382	@subfields = _pack_subfields_hash( $h );
383	$subfields = _pack_subfields_hash( $h, 1 );
384
385	Return each subfield value in array or pack them all together and return scalar
386	with subfields (denoted by C<^>) and values.
387
388	=cut
389
390	sub _pack_subfields_hash {
391
392	warn "## _pack_subfields_hash( ",dump(@_), " )\n" if ($debug > 1);
393
394	my ($hash,$include_subfields) = @_;
395
396	# sanity and ease of use
397	return $hash if (ref($hash) ne 'HASH');
398
399	my $h = dclone( $hash );
400
401	if ( defined($h->{subfields}) ) {
402	my $sfs = delete $h->{subfields} \|\| die "no subfields?";
403	my @out;
404	while (@$sfs) {
405	my $sf = shift @$sfs;
406	push @out, '^' . $sf if ($include_subfields);
407	my $o = shift @$sfs;
408	if ($o == 0 && ref( $h->{$sf} ) ne 'ARRAY' ) {
409	# single element subfields are not arrays
410	#warn "====> $sf $o / $#$sfs ", dump( $sfs, $h->{$sf} ), "\n";
411
412	push @out, $h->{$sf};
413	} else {
414	#warn "====> $sf $o / $#$sfs ", dump( $sfs, $h->{$sf} ), "\n";
415	push @out, $h->{$sf}->[$o];
416	}
417	}
418	if ($include_subfields) {
419	return join('', @out);
420	} else {
421	return @out;
422	}
423	} else {
424	if ($include_subfields) {
425	my $out = '';
426	foreach my $sf (sort keys %$h) {
427	if (ref($h->{$sf}) eq 'ARRAY') {
428	$out .= '^' . $sf . join('^' . $sf, @{ $h->{$sf} });
429	} else {
430	$out .= '^' . $sf . $h->{$sf};
431	}
432	}
433	return $out;
434	} else {
435	# FIXME this should probably be in alphabetical order instead of hash order
436	values %{$h};
437	}
438	}
439	}
440
441	=head2 rec1
442
443	Return all values in some field
444
445	@v = rec1('200')
446
447	TODO: order of values is probably same as in source data, need to investigate that
448
449	=cut
450
451	sub rec1 {
452	my $f = shift;
453	warn "rec1($f) = ", dump( $rec->{$f} ), $/ if ($debug > 1);
454	return unless (defined($rec) && defined($rec->{$f}));
455	warn "rec1($f) = ", dump( $rec->{$f} ), $/ if ($debug > 1);
456	if (ref($rec->{$f}) eq 'ARRAY') {
457	my @out;
458	foreach my $h ( @{ $rec->{$f} } ) {
459	if (ref($h) eq 'HASH') {
460	push @out, ( _pack_subfields_hash( $h ) );
461	} else {
462	push @out, $h;
463	}
464	}
465	return @out;
466	} elsif( defined($rec->{$f}) ) {
467	return $rec->{$f};
468	}
469	}
470
471	=head2 rec2
472
473	Return all values in specific field and subfield
474
475	@v = rec2('200','a')
476
477	=cut
478
# Return all values of subfield $sf from every occurrence of field $f in
# the current record ($rec).
#
# Occurrences which are not hashes, or which don't define the subfield,
# are skipped. Repeated subfields (stored as arrays) are flattened into
# the result. Returns an empty list when there is no current record, the
# field is missing, or the field has no occurrence list.
sub rec2 {
    my $f = shift;
    return unless (defined($rec) && defined($rec->{$f}));
    my $sf = shift;
    warn "rec2($f,$sf) = ", dump( $rec->{$f} ), $/ if ($debug > 1);

    # rec1() accepts fields stored as plain values; such a field has no
    # subfields, and dereferencing it as an array would die under strict refs
    return unless (ref($rec->{$f}) eq 'ARRAY');

    return map {
        if (ref($_->{$sf}) eq 'ARRAY') {
            @{ $_->{$sf} };
        } else {
            $_->{$sf};
        }
    } grep { ref($_) eq 'HASH' && defined $_->{$sf} } @{ $rec->{$f} };
}
492
493	=head2 rec
494
495	Syntactic sugar for
496
497	@v = rec('200')
498	@v = rec('200','a')
499
500	If rec() returns just single value, it will
501	return scalar, not array.
502
503	=cut
504
# Dispatch on argument count: one argument goes to rec1(), two go to
# rec2(); any other count yields no values.
#
# A single result is returned as a plain scalar when the caller is not in
# list context. Several results are returned as a list, and no results at
# all are reported as an empty string.
sub rec {
    my $argc = scalar @_;

    my @out =
        $argc == 1 ? rec1(@_) :
        $argc == 2 ? rec2(@_) :
                     ();

    return $out[0] if (@out == 1 && ! wantarray);
    return @out if (@out);
    return '';
}
520
521	=head2 frec
522
523	Returns first value from field
524
525	$v = frec('200');
526	$v = frec('200','a');
527
528	=cut
529
# Return only the first value produced by rec() for the given arguments,
# warning when additional values had to be thrown away.
sub frec {
    my ($head, @ignored) = rec(@_);
    warn "rec(",dump(@_),") has more than one return value, ignoring\n" if (@ignored);
    return $head;
}
535
536	=head2 frec_eq
537
538	=head2 frec_ne
539
540	Check if first values from two fields are same or different
541
542	if ( frec_eq( 900 => 'a', 910 => 'c' ) ) {
543	# values are same
544	} else {
545	# values are different
546	}
547
548	Strictly speaking C<frec_eq> and C<frec_ne> wouldn't be needed if you
549	could write something like:
550
551	if ( frec( '900','a' ) eq frec( '910','c' ) ) {
552	# yada tada
553	}
554
555	but you can't, since our parser L<WebPAC::Parser> removes all whitespace
556	when parsing the text, which would produce the invalid function name C<eqfrec>.
557
558	=cut
559
# True when the first value of field/subfield ($f1,$sf1) is string-equal
# to the first value of ($f2,$sf2). Both lookups go through rec() in list
# context.
sub frec_eq {
    my ( $f1, $sf1, $f2, $sf2 ) = @_;

    my ($left)  = rec( $f1, $sf1 );
    my ($right) = rec( $f2, $sf2 );

    return $left eq $right;
}
564
# Logical negation of frec_eq(), called with the same arguments.
sub frec_ne {
    my $same = frec_eq( @_ );
    return ! $same;
}
568
569	=head2 regex
570
571	Apply regex to some or all values
572
573	@v = regex( 's/foo/bar/g', @v );
574
575	=cut
576
# Apply a regex operation, given as source text (e.g. 's/foo/bar/g'), to
# each value and return the results.
#
# False input values (undef, '', 0) are skipped, and so are values which
# are false after the operation was applied.
#
# NOTE: $r is compiled with string eval, so it must come from trusted
# normalisation rules and never from record data.
sub regex {
    my $r = shift;
    my @out;
    #warn "r: $r\n", dump(\@_);
    foreach my $value (@_) {
        next unless ($value);

        # Work on a copy: the loop variable aliases the caller's argument,
        # so operating on it directly would rewrite the caller's data and
        # fail on read-only values such as string literals.
        my $t = $value;

        eval "\$t =~ $r";
        warn "regex($r) failed: $@" if ($@);

        push @out, $t if ($t && $t ne '');
    }
    return @out;
}
588
589	=head2 prefix
590
591	Prefix all values with a string
592
593	@v = prefix( 'my_', @v );
594
595	=cut
596
# Put $p in front of every defined value; undefined values are dropped.
# When $p itself is undefined the values are handed back untouched.
sub prefix {
    my ($p, @values) = @_;
    return @values unless defined( $p );

    my @out;
    foreach my $value (@values) {
        next unless defined($value);
        push @out, $p . $value;
    }
    return @out;
}
602
603	=head2 suffix
604
605	suffix all values with a string
606
607	@v = suffix( '_my', @v );
608
609	=cut
610
# Append $s to every defined value; undefined values are dropped.
# When $s itself is undefined the values are handed back untouched.
sub suffix {
    my ($s, @values) = @_;
    return @values unless defined( $s );

    my @out;
    foreach my $value (@values) {
        next unless defined($value);
        push @out, $value . $s;
    }
    return @out;
}
616
617	=head2 surround
618
619	Surround all values with two strings
620
621	@v = surround( 'prefix_', '_suffix', @v );
622
623	=cut
624
# Wrap every defined value between $p and $s; undefined values are
# dropped. An undefined $p or $s is treated as an empty string.
sub surround {
    my ($p, $s, @values) = @_;

    $p = '' if (! defined( $p ));
    $s = '' if (! defined( $s ));

    my @out;
    foreach my $value (@values) {
        next unless defined($value);
        push @out, $p . $value . $s;
    }
    return @out;
}
632
633	=head2 first
634
635	Return first element
636
637	$v = first( @v );
638
639	=cut
640
# Return the first argument (undef when called without arguments).
sub first {
    my ($head) = @_;
    return $head;
}
645
646	=head2 lookup
647
648	Consult lookup hashes for some value
649
650	@v = lookup(
651	sub {
652	'ffkk/peri/mfn'.rec('000')
653	},
654	'ffkk','peri','200-a-200-e',
655	sub {
656	first(rec(200,'a')).' '.first(rec('200','e'))
657	}
658	);
659
660	Code like above will be B<automatically generated> using L<WebPAC::Parser> from
661	normal lookup definition in C<conf/lookup/something.pl> which looks like:
662
663	lookup(
664	# which results to return from record recorded in lookup
665	sub { 'ffkk/peri/mfn' . rec('000') },
666	# from which database and input
667	'ffkk','peri',
668	# such that following values match
669	sub { first(rec(200,'a')) . ' ' . first(rec('200','e')) },
670	# if this part is missing, we will try to match same fields
671	# from lookup record and current one, or you can override
672	# which records to use from current record using
673	sub { rec('900','x') . ' ' . rec('900','y') },
674	)
675
676	You can think about this lookup as SQL (if that helps):
677
678	select
679	sub { what }
680	from
681	database, input
682	where
683	sub { filter from lookuped record }
684	having
685	sub { optional filter on current record }
686
687	Easy as pie, right?
688
689	=cut
690
# Resolve values from other records through the lookup hashes.
#
#   $what      coderef evaluated against each matching record
#   $database  database which holds the lookup
#   $input     input which holds the lookup
#   $key       name of the lookup
#   $having    coderef producing the value(s) to search for; it is
#              evaluated against the current record
#
# Every MFN saved under one of the $having values is loaded through
# $load_row_coderef, made the current record ($rec) while $what runs, and
# the values returned by $what are collected.
#
# Returns nothing when the lookup or a match doesn't exist, the single
# collected value, or the list of collected values.
# Dies (confess) unless called with exactly 5 arguments, or when no
# load_row coderef has been configured.
sub lookup {
    my ($what, $database, $input, $key, $having) = @_;

    confess "lookup needs 5 arguments: what, database, input, key, having\n" unless ($#_ == 4);

    warn "## lookup ($database, $input, $key)", $/ if ($debug > 1);
    return unless (defined($lookup->{$database}->{$input}->{$key}));

    confess "lookup really need load_row_coderef added to data_structure\n" unless ($load_row_coderef);

    my $index  = $lookup->{$database}->{$input}->{$key};
    my @having = $having->();

    warn "## having = ", dump( @having ) if ($debug > 2);

    # collect the MFNs of all records saved under any of the wanted values
    my $mfns;
    foreach my $h ( @having ) {
        next unless (defined($index->{$h}));
        warn "lookup for $database/$input/$key/$h return ",dump($index->{$h}),"\n" if ($debug);
        $mfns->{$_}++ foreach keys %{ $index->{$h} };
    }

    return unless ($mfns);

    # NOTE: plain sort, so MFNs are visited in string order
    my @mfns = sort keys %$mfns;

    warn "# lookup loading $database/$input/$key mfn ", join(",",@mfns)," having ",dump(@having),"\n" if ($debug);

    my $old_rec = $rec;
    my @out;

    # $what reads the current record through $rec, so each looked-up row is
    # swapped in temporarily. The eval guarantees the original record is put
    # back even when loading a row or running $what dies.
    my $ok = eval {
        foreach my $mfn (@mfns) {
            $rec = $load_row_coderef->( $database, $input, $mfn );

            warn "got $database/$input/$mfn = ", dump($rec), $/ if ($debug);

            my @vals = $what->();

            push @out, ( @vals );

            warn "lookup for mfn $mfn returned ", dump(@vals), $/ if ($debug);
        }
        1;
    };
    my $error = $@;

    $rec = $old_rec;

    # re-raise the original failure now that the state is consistent again
    die $error unless ($ok);

    warn "## lookup returns = ", dump(@out), $/ if ($debug);

    if ($#out == 0) {
        return $out[0];
    } else {
        return @out;
    }
}
750
751	=head2 save_into_lookup
752
753	Save value into lookup. It associates current database, input
754	and specific keys with one or more values which will be
755	associated over MFN.
756
757	MFN will be extracted from the first occurrence of field 000
758	in the current record or, if that doesn't exist, from L<_set_config> C<_mfn>.
759
760	my $nr = save_into_lookup($database,$input,$key,sub {
761	# code which produce one or more values
762	});
763
764	It returns number of items saved.
765
766	This function shouldn't be called directly, it's called from code created by
767	L<WebPAC::Parser>.
768
769	=cut
770
# Record the values produced by $coderef in the lookup hash under
# $database / $input / $key, each associated with the MFN of the current
# record.
#
# The MFN is the first occurrence of field 000 in the current record or,
# failing that, the _mfn entry of the config.
#
# Returns the number of values saved.
# Dies when an argument is missing, $coderef is not a code reference, or
# no MFN can be determined.
sub save_into_lookup {
    my ($database,$input,$key,$coderef) = @_;
    die "save_into_lookup needs database" unless defined($database);
    die "save_into_lookup needs input" unless defined($input);
    die "save_into_lookup needs key" unless defined($key);
    die "save_into_lookup needs CODE" unless ( defined($coderef) && ref($coderef) eq 'CODE' );

    warn "## save_into_lookup rec = ", dump($rec), " config = ", dump($config), $/ if ($debug > 2);

    # Check the shape of field 000 before indexing into it: asking for
    # ->[0] of a missing field would autovivify an empty 000 field into the
    # record, and would die outright if the field held a plain scalar.
    my $mfn;
    if (defined($rec) && ref($rec->{'000'}) eq 'ARRAY' && defined($rec->{'000'}->[0])) {
        $mfn = $rec->{'000'}->[0];
    } elsif (defined($config) && defined($config->{_mfn})) {
        $mfn = $config->{_mfn};
    } else {
        die "mfn not defined or zero";
    }

    my $nr = 0;

    foreach my $v ( $coderef->() ) {
        $lookup->{$database}->{$input}->{$key}->{$v}->{$mfn}++;
        warn "# saved lookup $database/$input/$key [$v] $mfn\n" if ($debug > 1);
        $nr++;
    }

    return $nr;
}
795
796	=head2 config
797
798	Consult config values stored in C<config.yml>
799
800	# return database code (key under databases in yaml)
801	$database_code = config(); # use _ from hash
802	$database_name = config('name');
803	$database_input_name = config('input name');
804
805	Up to three levels are supported.
806
807	=cut
808
# Fetch a value from the configuration ($config).
#
# $p is a whitespace separated path of keys. Without a path the special
# '_' entry is returned. With a path, a deep copy of the config is walked
# one key at a time:
#   - reaching an array stops the walk at its first element (with a warning)
#   - a key which isn't defined ends the walk with no value
# False results are reported as undef, together with an "is empty" warning.
# Returns nothing at all when no config has been set.
sub config {
    return unless ($config);

    my $p = shift || '';
    my $v;

    warn "### getting config($p)\n" if ($debug > 1);

    my @path = split(/\s+/,$p);

    if (@path) {

        # walk a copy, because taking the first array element below shifts it off
        my $node = dclone( $config );

        foreach my $k (@path) {
            warn "### k: $k c = ",dump($node),$/ if ($debug > 1);

            if (ref($node) eq 'ARRAY') {
                $node = shift @$node;
                warn "config($p) taking first occurence of '$k', probably not what you wanted!\n";
                last;
            }

            unless (defined($node->{$k})) {
                $node = undef;
                last;
            }

            $node = $node->{$k};
        }

        $v = $node if ($node);

    } else {
        $v = $config->{ '_' };  # special, database code
    }

    warn "## config( '$p' ) = ",dump( $v ),$/ if ($v && $debug);
    warn "config( '$p' ) is empty\n" if (! $v);

    return $v;
}
851
852	=head2 id
853
854	Returns unique id of this record
855
856	$id = id();
857
858	Returns C<42/2> for 2nd occurrence of MFN 42.
859
860	=cut
861
# Build the id of the current record: the _mfn value from the config,
# followed by whatever WebPAC::Normalize::MARC::_created_marc_records()
# reports (nothing is appended when that is false).
# Dies when the config carries no (true) _mfn.
sub id {
    my $mfn = $config->{_mfn};
    die "no _mfn in config data" unless ($mfn);

    my $marc_part = WebPAC::Normalize::MARC::_created_marc_records() || '';

    return $mfn . $marc_part;
}
866
867	=head2 join_with
868
869	Joins values with some delimiter
870
871	$v = join_with(", ", @v);
872
873	=cut
874
# Join all defined, non-empty values using delimiter $d.
# The result is an empty string when nothing is left to join (join()
# itself never returns undef, so no extra check is needed).
sub join_with {
    my $d = shift;
    warn "### join_with('$d',",dump(@_),")\n" if ($debug > 2);

    my @parts;
    foreach my $item (@_) {
        next unless defined($item);
        next if ($item eq '');
        push @parts, $item;
    }

    return join($d, @parts);
}
882
883	=head2 split_rec_on
884
885	Split record subfield on some regex and take one of parts out
886
887	$a_before_semi_column =
888	split_rec_on('200','a', /\s;\s/, $part);
889
890	C<$part> is the optional number of the part to return. The first part is
891	B<1>, not 0!
892
893	If there is no C<$part> parameter or C<$part> is 0, this function will
894	return all values produced by splitting.
895
896	=cut
897
# Split the first value of field $fld / subfield $sf on $regex.
#
# With a positive $part only that part is returned (counting from 1);
# otherwise all parts are returned. A missing or whitespace-only value
# gives an empty string.
# Dies when called with fewer than three arguments.
sub split_rec_on {
    die "split_rec_on need (fld,sf,regex[,part]" if (@_ < 3);

    my ($fld, $sf, $regex, $part) = @_;
    warn "### regex ", ref($regex), $regex, $/ if ($debug > 2);

    # only the first value of the subfield is considered
    my ($v) = rec( $fld, $sf );
    warn "### first rec($fld,$sf) = ",dump($v),$/ if ($debug > 2);

    return '' unless ( defined($v) && $v !~ /^\s*$/ );

    my @s = split( $regex, $v );
    warn "## split_rec_on($fld,$sf,$regex,$part) = ",dump(@s),$/ if ($debug > 1);

    return @s unless ($part && $part > 0);
    return $s[ $part - 1 ];
}
918
# key/value storage written by set() and read by get()
my $hash;

=head2 set

  set( key => 'value' );

=cut

# Store $v under key $k and return the stored value.
sub set {
    my ($k, $v) = @_;

    if ( $debug ) {
        warn "## set ( $k => ", dump($v), " )", $/;
    }

    return $hash->{$k} = $v;
}
932
933	=head2 get
934
935	get( 'key' );
936
937	=cut
938
# Return the value stored under key $k by set().
# A missing or false key returns nothing.
sub get {
    my ($k) = @_;
    return unless ($k);

    my $v = $hash->{$k};

    if ( $debug ) {
        warn "## get $k = ", dump( $v ), $/;
    }

    return $v;
}
945
946	=head2 count
947
948	if ( count( @result ) == 1 ) {
949	# do something if only 1 result is there
950	}
951
952	=cut
953
# Return the number of arguments, as a string.
sub count {
    if ( $debug ) {
        warn "## count ",dump(@_),$/;
    }

    my $n = scalar @_;
    return "$n";
}
958
959	# END
960	1;