lib/WebPAC/Normalize.pm

package WebPAC::Normalize;
use Exporter 'import';
our @EXPORT = qw/
        _set_ds _set_lookup
        _set_load_row
        _get_ds _clean_ds
        _debug
        _pack_subfields_hash

        search_display search display sorted

        rec1 rec2 rec
        frec frec_eq frec_ne
        regex prefix suffix surround
        first lookup join_with
        save_into_lookup

        split_rec_on

        get set
        count

/;

use warnings;
use strict;

#use base qw/WebPAC::Common/;
use Data::Dump qw/dump/;
use Carp qw/confess/;

# debugging warn(s)
my $debug = 0;
_debug( $debug );

# FIXME
use WebPAC::Normalize::ISBN;
push @EXPORT, ( 'isbn_10', 'isbn_13' );

use WebPAC::Normalize::MARC;
push @EXPORT, ( qw/
        marc marc_indicators marc_repeatable_subfield
        marc_compose marc_leader marc_fixed
        marc_duplicate marc_remove marc_count
        marc_original_order
        marc_template
/);

=head1 NAME

WebPAC::Normalize - describe normalisaton rules using sets

=cut

our $VERSION = '0.35';

=head1 SYNOPSIS

This module uses C<conf/normalize/*.pl> files to perform normalisation
from input records using perl functions which are specialized for set
processing.

Sets are implemented as arrays, and normalisation file is valid perl, which
means that you check it's validity before running WebPAC using
C<perl -c normalize.pl>.

Normalisation can generate multiple output normalized data. For now, supported output
types (on the left side of definition) are: C<search_display>, C<display>, C<search> and
C<marc>.

=head1 FUNCTIONS

Functions which start with C<_> are private and used by WebPAC internally.
All other functions are available for use within normalisation rules.

=head2 data_structure

Return data structure

  my $ds = WebPAC::Normalize::data_structure(
        lookup => $lookup_hash,
        row => $row,
        rules => $normalize_pl_config,
        marc_encoding => 'utf-8',
        config => $config,
        load_row_coderef => sub {
                my ($database,$input,$mfn) = @_;
                $store->load_row( database => $database, input => $input, id => $mfn );
        },
  );

Options C<row>, C<rules> and C<log> are mandatory while all
other are optional.

C<load_row_coderef> is closure only used when executing lookups, so they will
die if it's not defined.

This function will B<die> if normalizastion can't be evaled.

Since this function isn't exported you have to call it with 
C<WebPAC::Normalize::data_structure>.

=cut

my $load_row_coderef;

sub data_structure {
        my $arg = {@_};

        die "need row argument" unless ($arg->{row});
        die "need normalisation argument" unless ($arg->{rules});

        _set_lookup( $arg->{lookup} ) if defined($arg->{lookup});
        _set_ds( $arg->{row} );
        _set_config( $arg->{config} ) if defined($arg->{config});
        _clean_ds( %{ $arg } );
        $load_row_coderef = $arg->{load_row_coderef};

        no strict 'subs';
        no warnings 'redefine';
        eval "$arg->{rules};";
        die "error evaling $arg->{rules}: $@\n" if ($@);

        return _get_ds();
}

=head2 _set_ds

Set current record hash

  _set_ds( $rec );

=cut

my $rec;

sub _set_ds {
        $rec = shift or die "no record hash";
        $WebPAC::Normalize::MARC::rec = $rec;
}

=head2

  my $rec = _get_rec();

=cut

sub _get_rec { $rec };

=head2 _set_config

Set current config hash

  _set_config( $config );

Magic keys are:

=over 4

=item _

Code of current database

=item _mfn

Current MFN

=back

=cut

my $config;

sub _set_config {
        $config = shift;
}

=head2 _get_ds

Return hash formatted as data structure

  my $ds = _get_ds();

=cut

my $out;

sub _get_ds {
#warn "## out = ",dump($out);
        return $out;
}

=head2 _clean_ds

Clean data structure hash for next record

  _clean_ds();

=cut

sub _clean_ds {
        my $a = {@_};
        $out = undef;
        WebPAC::Normalize::MARC::_clean();
}

=head2 _set_lookup

Set current lookup hash

  _set_lookup( $lookup );

=cut

my $lookup;

sub _set_lookup {
        $lookup = shift;
}

=head2 _get_lookup

Get current lookup hash

  my $lookup = _get_lookup();

=cut

sub _get_lookup {
        return $lookup;
}

=head2 _set_load_row

Setup code reference which will return L<data_structure> from
L<WebPAC::Store>

  _set_load_row(sub {
                my ($database,$input,$mfn) = @_;
                $store->load_row( database => $database, input => $input, id => $mfn );
  });

=cut

sub _set_load_row {
        my $coderef = shift;
        confess "argument isn't CODE" unless ref($coderef) eq 'CODE';

        $load_row_coderef = $coderef;
}

=head2 _debug

Change level of debug warnings

  _debug( 2 );

=cut

sub _debug {
        my $l = shift;
        return $debug unless defined($l);
        warn "debug level $l",$/ if ($l > 0);
        $debug = $l;
        $WebPAC::Normalize::MARC::debug = $debug;
warn "#### MARC::debug = ",dump($WebPAC::Normalize::MARC::debug);
}

=head1 Functions to create C<data_structure>

Those functions generally have to first in your normalization file.

=head2 search_display

Define output for L<search> and L<display> at the same time

  search_display('Title', rec('200','a') );


=cut

sub search_display {
        my $name = shift or die "search_display needs name as first argument";
        my @o = grep { defined($_) && $_ ne '' } @_;
        return unless (@o);
        $out->{$name}->{search} = \@o;
        $out->{$name}->{display} = \@o;
}

=head2 tag

Old name for L<search_display>, but supported

=cut

sub tag {
        search_display( @_ );
}

=head2 display

Define output just for I<display>

  @v = display('Title', rec('200','a') );

=cut

sub _field {
        my $type = shift or confess "need type -- BUG?";
        my $name = shift or confess "needs name as first argument";
        my @o = grep { defined($_) && $_ ne '' } @_;
        return unless (@o);
        $out->{$name}->{$type} = \@o;
}

sub display { _field( 'display', @_ ) }

=head2 search

Prepare values just for I<search>

  @v = search('Title', rec('200','a') );

=cut

sub search { _field( 'search', @_ ) }

=head2 sorted

Insert into lists which will be automatically sorted

 sorted('Title', rec('200','a') );

=cut

sub sorted { _field( 'sorted', @_ ) }


=head1 Functions to extract data from input

This function should be used inside functions to create C<data_structure> described
above.

=head2 _pack_subfields_hash

 @subfields = _pack_subfields_hash( $h );
 $subfields = _pack_subfields_hash( $h, 1 );

Return each subfield value in array or pack them all together and return scalar
with subfields (denoted by C<^>) and values.

=cut

sub _pack_subfields_hash {

        warn "## _pack_subfields_hash( ",dump(@_), " )\n" if ($debug > 1);

        my ($h,$include_subfields) = @_;

        # sanity and ease of use
        return $h if (ref($h) ne 'HASH');

        if ( defined($h->{subfields}) ) {
                my $sfs = delete $h->{subfields} || die "no subfields?";
                my @out;
                while (@$sfs) {
                        my $sf = shift @$sfs;
                        push @out, '^' . $sf if ($include_subfields);
                        my $o = shift @$sfs;
                        if ($o == 0 && ref( $h->{$sf} ) ne 'ARRAY' ) {
                                # single element subfields are not arrays
#warn "====> $sf $o / $#$sfs ", dump( $sfs, $h->{$sf} ), "\n";

                                push @out, $h->{$sf};
                        } else {
#warn "====> $sf $o / $#$sfs ", dump( $sfs, $h->{$sf} ), "\n";
                                push @out, $h->{$sf}->[$o];
                        }
                }
                if ($include_subfields) {
                        return join('', @out);
                } else {
                        return @out;
                }
        } else {
                if ($include_subfields) {
                        my $out = '';
                        foreach my $sf (sort keys %$h) {
                                if (ref($h->{$sf}) eq 'ARRAY') {
                                        $out .= '^' . $sf . join('^' . $sf, @{ $h->{$sf} });
                                } else {
                                        $out .= '^' . $sf . $h->{$sf};
                                }
                        }
                        return $out;
                } else {
                        # FIXME this should probably be in alphabetical order instead of hash order
                        values %{$h};
                }
        }
}

=head2 rec1

Return all values in some field

  @v = rec1('200')

TODO: order of values is probably same as in source data, need to investigate that

=cut

sub rec1 {
        my $f = shift;
        warn "rec1($f) = ", dump( $rec->{$f} ), $/ if ($debug > 1);
        return unless (defined($rec) && defined($rec->{$f}));
        warn "rec1($f) = ", dump( $rec->{$f} ), $/ if ($debug > 1);
        if (ref($rec->{$f}) eq 'ARRAY') {
                my @out;
                foreach my $h ( @{ $rec->{$f} } ) {
                        if (ref($h) eq 'HASH') {
                                push @out, ( _pack_subfields_hash( $h ) );
                        } else {
                                push @out, $h;
                        }
                }
                return @out;
        } elsif( defined($rec->{$f}) ) {
                return $rec->{$f};
        }
}

=head2 rec2

Return all values in specific field and subfield

  @v = rec2('200','a')

=cut

sub rec2 {
        my $f = shift;
        return unless (defined($rec && $rec->{$f}));
        my $sf = shift;
        warn "rec2($f,$sf) = ", dump( $rec->{$f} ), $/ if ($debug > 1);
        return map {
                if (ref($_->{$sf}) eq 'ARRAY') {
                        @{ $_->{$sf} };
                } else {
                        $_->{$sf};
                }
        } grep { ref($_) eq 'HASH' && $_->{$sf} } @{ $rec->{$f} };
}

=head2 rec

syntaxtic sugar for

  @v = rec('200')
  @v = rec('200','a')

If rec() returns just single value, it will
return scalar, not array.

=cut

sub rec {
        my @out;
        if ($#_ == 0) {
                @out = rec1(@_);
        } elsif ($#_ == 1) {
                @out = rec2(@_);
        }
        if ($#out == 0 && ! wantarray) {
                return $out[0];
        } elsif (@out) {
                return @out;
        } else {
                return '';
        }
}

=head2 frec

Returns first value from field

  $v = frec('200');
  $v = frec('200','a');

=cut

sub frec {
        my @out = rec(@_);
        warn "rec(",dump(@_),") has more than one return value, ignoring\n" if $#out > 0;
        return shift @out;
}

=head2 frec_eq

=head2 frec_ne

Check if first values from two fields are same or different

  if ( frec_eq( 900 => 'a', 910 => 'c' ) ) {
        # values are same
  } else {
    # values are different
  }

Strictly speaking C<frec_eq> and C<frec_ne> wouldn't be needed if you
could write something like:

  if ( frec( '900','a' ) eq frec( '910','c' ) ) {
        # yada tada
  }

but you can't since our parser L<WebPAC::Parser> will remove all whitespaces
in order to parse text and create invalid function C<eqfrec>.

=cut

sub frec_eq {
        my ( $f1,$sf1, $f2, $sf2 ) = @_;
        return (rec( $f1, $sf1 ))[0] eq (rec( $f2, $sf2 ))[0];
}

sub frec_ne {
        return ! frec_eq( @_ );
}

=head2 regex

Apply regex to some or all values

  @v = regex( 's/foo/bar/g', @v );

=cut

sub regex {
        my $r = shift;
        my @out;
        #warn "r: $r\n", dump(\@_);
        foreach my $t (@_) {
                next unless ($t);
                eval "\$t =~ $r";
                push @out, $t if ($t && $t ne '');
        }
        return @out;
}

=head2 prefix

Prefix all values with a string

  @v = prefix( 'my_', @v );

=cut

sub prefix {
        my $p = shift;
        return @_ unless defined( $p );
        return map { $p . $_ } grep { defined($_) } @_;
}

=head2 suffix

suffix all values with a string

  @v = suffix( '_my', @v );

=cut

sub suffix {
        my $s = shift;
        return @_ unless defined( $s );
        return map { $_ . $s } grep { defined($_) } @_;
}

=head2 surround

surround all values with a two strings

  @v = surround( 'prefix_', '_suffix', @v );

=cut

sub surround {
        my $p = shift;
        my $s = shift;
        $p = '' unless defined( $p );
        $s = '' unless defined( $s );
        return map { $p . $_ . $s } grep { defined($_) } @_;
}

=head2 first

Return first element

  $v = first( @v );

=cut

sub first {
        my $r = shift;
        return $r;
}

=head2 lookup

Consult lookup hashes for some value

  @v = lookup(
        sub {
                'ffkk/peri/mfn'.rec('000')
        },
        'ffkk','peri','200-a-200-e',
        sub {
                first(rec(200,'a')).' '.first(rec('200','e'))
        }
  );

Code like above will be B<automatically generated> using L<WebPAC::Parse> from
normal lookup definition in C<conf/lookup/something.pl> which looks like:

  lookup(
        # which results to return from record recorded in lookup
        sub { 'ffkk/peri/mfn' . rec('000') },
        # from which database and input
        'ffkk','peri',
        # such that following values match
        sub { first(rec(200,'a')) . ' ' . first(rec('200','e')) },
        # if this part is missing, we will try to match same fields
        # from lookup record and current one, or you can override
        # which records to use from current record using
        sub { rec('900','x') . ' ' . rec('900','y') },
  )

You can think about this lookup as SQL (if that helps):

  select
        sub { what }
  from
        database, input
  where
    sub { filter from lookuped record }
  having
    sub { optional filter on current record }

Easy as pie, right?

=cut

sub lookup {
        my ($what, $database, $input, $key, $having) = @_;

        confess "lookup needs 5 arguments: what, database, input, key, having\n" unless ($#_ == 4);

        warn "## lookup ($database, $input, $key)", $/ if ($debug > 1);
        return unless (defined($lookup->{$database}->{$input}->{$key}));

        confess "lookup really need load_row_coderef added to data_structure\n" unless ($load_row_coderef);

        my $mfns;
        my @having = $having->();

        warn "## having = ", dump( @having ) if ($debug > 2);

        foreach my $h ( @having ) {
                if (defined($lookup->{$database}->{$input}->{$key}->{$h})) {
                        warn "lookup for $database/$input/$key/$h return ",dump($lookup->{$database}->{$input}->{$key}->{$h}),"\n" if ($debug);
                        $mfns->{$_}++ foreach keys %{ $lookup->{$database}->{$input}->{$key}->{$h} };
                }
        }

        return unless ($mfns);

        my @mfns = sort keys %$mfns;

        warn "# lookup loading $database/$input/$key mfn ", join(",",@mfns)," having ",dump(@having),"\n" if ($debug);

        my $old_rec = $rec;
        my @out;

        foreach my $mfn (@mfns) {
                $rec = $load_row_coderef->( $database, $input, $mfn );

                warn "got $database/$input/$mfn = ", dump($rec), $/ if ($debug);

                my @vals = $what->();

                push @out, ( @vals );

                warn "lookup for mfn $mfn returned ", dump(@vals), $/ if ($debug);
        }

#       if (ref($lookup->{$k}) eq 'ARRAY') {
#               return @{ $lookup->{$k} };
#       } else {
#               return $lookup->{$k};
#       }

        $rec = $old_rec;

        warn "## lookup returns = ", dump(@out), $/ if ($debug);

        if ($#out == 0) {
                return $out[0];
        } else {
                return @out;
        }
}

=head2 save_into_lookup

Save value into lookup. It associates current database, input
and specific keys with one or more values which will be
associated over MFN.

MFN will be extracted from first occurence current of field 000
in current record, or if it doesn't exist from L<_set_config> C<_mfn>.

  my $nr = save_into_lookup($database,$input,$key,sub {
        # code which produce one or more values 
  });

It returns number of items saved.

This function shouldn't be called directly, it's called from code created by
L<WebPAC::Parser>. 

=cut

sub save_into_lookup {
        my ($database,$input,$key,$coderef) = @_;
        die "save_into_lookup needs database" unless defined($database);
        die "save_into_lookup needs input" unless defined($input);
        die "save_into_lookup needs key" unless defined($key);
        die "save_into_lookup needs CODE" unless ( defined($coderef) && ref($coderef) eq 'CODE' );

        warn "## save_into_lookup rec = ", dump($rec), " config = ", dump($config), $/ if ($debug > 2);

        my $mfn = 
                defined($rec->{'000'}->[0])     ?       $rec->{'000'}->[0]      :
                defined($config->{_mfn})        ?       $config->{_mfn}         :
                                                                                die "mfn not defined or zero";

        my $nr = 0;

        foreach my $v ( $coderef->() ) {
                $lookup->{$database}->{$input}->{$key}->{$v}->{$mfn}++;
                warn "# saved lookup $database/$input/$key [$v] $mfn\n" if ($debug > 1);
                $nr++;
        }

        return $nr;
}

=head2 config

Consult config values stored in C<config.yml>

  # return database code (key under databases in yaml)
  $database_code = config();    # use _ from hash
  $database_name = config('name');
  $database_input_name = config('input name');

Up to three levels are supported.

=cut

sub config {
        return unless ($config);

        my $p = shift;

        $p ||= '';

        my $v;

        warn "### getting config($p)\n" if ($debug > 1);

        my @p = split(/\s+/,$p);
        if ($#p < 0) {
                $v = $config->{ '_' };  # special, database code
        } else {

                my $c = dclone( $config );

                foreach my $k (@p) {
                        warn "### k: $k c = ",dump($c),$/ if ($debug > 1);
                        if (ref($c) eq 'ARRAY') {
                                $c = shift @$c;
                                warn "config($p) taking first occurence of '$k', probably not what you wanted!\n";
                                last;
                        }

                        if (! defined($c->{$k}) ) {
                                $c = undef;
                                last;
                        } else {
                                $c = $c->{$k};
                        }
                }
                $v = $c if ($c);

        }

        warn "## config( '$p' ) = ",dump( $v ),$/ if ($v && $debug);
        warn "config( '$p' ) is empty\n" if (! $v);

        return $v;
}

=head2 id

Returns unique id of this record

  $id = id();

Returns C<42/2> for 2nd occurence of MFN 42.

=cut

sub id {
        my $mfn = $config->{_mfn} || die "no _mfn in config data";
        return $mfn . ( WebPAC::Normalize::MARC::_created_marc_records() || '' );
}

=head2 join_with

Joins walues with some delimiter

  $v = join_with(", ", @v);

=cut

sub join_with {
        my $d = shift;
        warn "### join_with('$d',",dump(@_),")\n" if ($debug > 2);
        my $v = join($d, grep { defined($_) && $_ ne '' } @_);
        return '' unless defined($v);
        return $v;
}

=head2 split_rec_on

Split record subfield on some regex and take one of parts out

  $a_before_semi_column =
        split_rec_on('200','a', /\s*;\s*/, $part);

C<$part> is optional number of element. First element is
B<1>, not 0!

If there is no C<$part> parameter or C<$part> is 0, this function will
return all values produced by splitting.

=cut

sub split_rec_on {
        die "split_rec_on need (fld,sf,regex[,part]" if ($#_ < 2);

        my ($fld, $sf, $regex, $part) = @_;
        warn "### regex ", ref($regex), $regex, $/ if ($debug > 2);

        my @r = rec( $fld, $sf );
        my $v = shift @r;
        warn "### first rec($fld,$sf) = ",dump($v),$/ if ($debug > 2);

        return '' if ( ! defined($v) || $v =~ /^\s*$/);

        my @s = split( $regex, $v );
        warn "## split_rec_on($fld,$sf,$regex,$part) = ",dump(@s),$/ if ($debug > 1);
        if ($part && $part > 0) {
                return $s[ $part - 1 ];
        } else {
                return @s;
        }
}

my $hash;

=head2 set

  set( key => 'value' );

=cut

sub set {
        my ($k,$v) = @_;
        warn "## set ( $k => ", dump($v), " )", $/ if ( $debug );
        $hash->{$k} = $v;
};

=head2 get

  get( 'key' );

=cut

sub get {
        my $k = shift || return;
        my $v = $hash->{$k};
        warn "## get $k = ", dump( $v ), $/ if ( $debug );
        return $v;
}

=head2 count

  if ( count( @result ) == 1 ) {
        # do something if only 1 result is there
  }

=cut

sub count {
        warn "## count ",dump(@_),$/ if ( $debug );
        return @_ . '';
}

# END
1;
1	package WebPAC::Normalize;
2	use Exporter 'import';
3	our @EXPORT = qw/
4	_set_ds _set_lookup
5	_set_load_row
6	_get_ds _clean_ds
7	_debug
8	_pack_subfields_hash
9
10	search_display search display sorted
11
12	rec1 rec2 rec
13	frec frec_eq frec_ne
14	regex prefix suffix surround
15	first lookup join_with
16	save_into_lookup
17
18	split_rec_on
19
20	get set
21	count
22
23	/;
24
25	use warnings;
26	use strict;
27
28	#use base qw/WebPAC::Common/;
29	use Data::Dump qw/dump/;
30	use Carp qw/confess/;
31
32	# debugging warn(s)
33	my $debug = 0;
34	_debug( $debug );
35
36	# FIXME
37	use WebPAC::Normalize::ISBN;
38	push @EXPORT, ( 'isbn_10', 'isbn_13' );
39
40	use WebPAC::Normalize::MARC;
41	push @EXPORT, ( qw/
42	marc marc_indicators marc_repeatable_subfield
43	marc_compose marc_leader marc_fixed
44	marc_duplicate marc_remove marc_count
45	marc_original_order
46	marc_template
47	/);
48
49	=head1 NAME
50
51	WebPAC::Normalize - describe normalisaton rules using sets
52
53	=cut
54
55	our $VERSION = '0.35';
56
57	=head1 SYNOPSIS
58
59	This module uses C<conf/normalize/*.pl> files to perform normalisation
60	from input records using perl functions which are specialized for set
61	processing.
62
63	Sets are implemented as arrays, and normalisation file is valid perl, which
64	means that you check it's validity before running WebPAC using
65	C<perl -c normalize.pl>.
66
67	Normalisation can generate multiple output normalized data. For now, supported output
68	types (on the left side of definition) are: C<search_display>, C<display>, C<search> and
69	C<marc>.
70
71	=head1 FUNCTIONS
72
73	Functions which start with C<_> are private and used by WebPAC internally.
74	All other functions are available for use within normalisation rules.
75
76	=head2 data_structure
77
78	Return data structure
79
80	my $ds = WebPAC::Normalize::data_structure(
81	lookup => $lookup_hash,
82	row => $row,
83	rules => $normalize_pl_config,
84	marc_encoding => 'utf-8',
85	config => $config,
86	load_row_coderef => sub {
87	my ($database,$input,$mfn) = @_;
88	$store->load_row( database => $database, input => $input, id => $mfn );
89	},
90	);
91
92	Options C<row>, C<rules> and C<log> are mandatory while all
93	other are optional.
94
95	C<load_row_coderef> is closure only used when executing lookups, so they will
96	die if it's not defined.
97
98	This function will B<die> if normalizastion can't be evaled.
99
100	Since this function isn't exported you have to call it with
101	C<WebPAC::Normalize::data_structure>.
102
103	=cut
104
105	my $load_row_coderef;
106
107	sub data_structure {
108	my $arg = {@_};
109
110	die "need row argument" unless ($arg->{row});
111	die "need normalisation argument" unless ($arg->{rules});
112
113	_set_lookup( $arg->{lookup} ) if defined($arg->{lookup});
114	_set_ds( $arg->{row} );
115	_set_config( $arg->{config} ) if defined($arg->{config});
116	_clean_ds( %{ $arg } );
117	$load_row_coderef = $arg->{load_row_coderef};
118
119	no strict 'subs';
120	no warnings 'redefine';
121	eval "$arg->{rules};";
122	die "error evaling $arg->{rules}: $@\n" if ($@);
123
124	return _get_ds();
125	}
126
127	=head2 _set_ds
128
129	Set current record hash
130
131	_set_ds( $rec );
132
133	=cut
134
135	my $rec;
136
137	sub _set_ds {
138	$rec = shift or die "no record hash";
139	$WebPAC::Normalize::MARC::rec = $rec;
140	}
141
142	=head2
143
144	my $rec = _get_rec();
145
146	=cut
147
148	sub _get_rec { $rec };
149
150	=head2 _set_config
151
152	Set current config hash
153
154	_set_config( $config );
155
156	Magic keys are:
157
158	=over 4
159
160	=item _
161
162	Code of current database
163
164	=item _mfn
165
166	Current MFN
167
168	=back
169
170	=cut
171
172	my $config;
173
174	sub _set_config {
175	$config = shift;
176	}
177
178	=head2 _get_ds
179
180	Return hash formatted as data structure
181
182	my $ds = _get_ds();
183
184	=cut
185
186	my $out;
187
188	sub _get_ds {
189	#warn "## out = ",dump($out);
190	return $out;
191	}
192
193	=head2 _clean_ds
194
195	Clean data structure hash for next record
196
197	_clean_ds();
198
199	=cut
200
201	sub _clean_ds {
202	my $a = {@_};
203	$out = undef;
204	WebPAC::Normalize::MARC::_clean();
205	}
206
207	=head2 _set_lookup
208
209	Set current lookup hash
210
211	_set_lookup( $lookup );
212
213	=cut
214
215	my $lookup;
216
217	sub _set_lookup {
218	$lookup = shift;
219	}
220
221	=head2 _get_lookup
222
223	Get current lookup hash
224
225	my $lookup = _get_lookup();
226
227	=cut
228
229	sub _get_lookup {
230	return $lookup;
231	}
232
233	=head2 _set_load_row
234
235	Setup code reference which will return L<data_structure> from
236	L<WebPAC::Store>
237
238	_set_load_row(sub {
239	my ($database,$input,$mfn) = @_;
240	$store->load_row( database => $database, input => $input, id => $mfn );
241	});
242
243	=cut
244
245	sub _set_load_row {
246	my $coderef = shift;
247	confess "argument isn't CODE" unless ref($coderef) eq 'CODE';
248
249	$load_row_coderef = $coderef;
250	}
251
252	=head2 _debug
253
254	Change level of debug warnings
255
256	_debug( 2 );
257
258	=cut
259
260	sub _debug {
261	my $l = shift;
262	return $debug unless defined($l);
263	warn "debug level $l",$/ if ($l > 0);
264	$debug = $l;
265	$WebPAC::Normalize::MARC::debug = $debug;
266	warn "#### MARC::debug = ",dump($WebPAC::Normalize::MARC::debug);
267	}
268
269	=head1 Functions to create C<data_structure>
270
271	Those functions generally have to first in your normalization file.
272
273	=head2 search_display
274
275	Define output for L<search> and L<display> at the same time
276
277	search_display('Title', rec('200','a') );
278
279
280	=cut
281
282	sub search_display {
283	my $name = shift or die "search_display needs name as first argument";
284	my @o = grep { defined($_) && $_ ne '' } @_;
285	return unless (@o);
286	$out->{$name}->{search} = \@o;
287	$out->{$name}->{display} = \@o;
288	}
289
290	=head2 tag
291
292	Old name for L<search_display>, but supported
293
294	=cut
295
296	sub tag {
297	search_display( @_ );
298	}
299
300	=head2 display
301
302	Define output just for I<display>
303
304	@v = display('Title', rec('200','a') );
305
306	=cut
307
308	sub _field {
309	my $type = shift or confess "need type -- BUG?";
310	my $name = shift or confess "needs name as first argument";
311	my @o = grep { defined($_) && $_ ne '' } @_;
312	return unless (@o);
313	$out->{$name}->{$type} = \@o;
314	}
315
316	sub display { _field( 'display', @_ ) }
317
318	=head2 search
319
320	Prepare values just for I<search>
321
322	@v = search('Title', rec('200','a') );
323
324	=cut
325
326	sub search { _field( 'search', @_ ) }
327
328	=head2 sorted
329
330	Insert into lists which will be automatically sorted
331
332	sorted('Title', rec('200','a') );
333
334	=cut
335
336	sub sorted { _field( 'sorted', @_ ) }
337
338
339
340	=head1 Functions to extract data from input
341
342	This function should be used inside functions to create C<data_structure> described
343	above.
344
345	=head2 _pack_subfields_hash
346
347	@subfields = _pack_subfields_hash( $h );
348	$subfields = _pack_subfields_hash( $h, 1 );
349
350	Return each subfield value in array or pack them all together and return scalar
351	with subfields (denoted by C<^>) and values.
352
353	=cut
354
355	sub _pack_subfields_hash {
356
357	warn "## _pack_subfields_hash( ",dump(@_), " )\n" if ($debug > 1);
358
359	my ($h,$include_subfields) = @_;
360
361	# sanity and ease of use
362	return $h if (ref($h) ne 'HASH');
363
364	if ( defined($h->{subfields}) ) {
365	my $sfs = delete $h->{subfields} \|\| die "no subfields?";
366	my @out;
367	while (@$sfs) {
368	my $sf = shift @$sfs;
369	push @out, '^' . $sf if ($include_subfields);
370	my $o = shift @$sfs;
371	if ($o == 0 && ref( $h->{$sf} ) ne 'ARRAY' ) {
372	# single element subfields are not arrays
373	#warn "====> $sf $o / $#$sfs ", dump( $sfs, $h->{$sf} ), "\n";
374
375	push @out, $h->{$sf};
376	} else {
377	#warn "====> $sf $o / $#$sfs ", dump( $sfs, $h->{$sf} ), "\n";
378	push @out, $h->{$sf}->[$o];
379	}
380	}
381	if ($include_subfields) {
382	return join('', @out);
383	} else {
384	return @out;
385	}
386	} else {
387	if ($include_subfields) {
388	my $out = '';
389	foreach my $sf (sort keys %$h) {
390	if (ref($h->{$sf}) eq 'ARRAY') {
391	$out .= '^' . $sf . join('^' . $sf, @{ $h->{$sf} });
392	} else {
393	$out .= '^' . $sf . $h->{$sf};
394	}
395	}
396	return $out;
397	} else {
398	# FIXME this should probably be in alphabetical order instead of hash order
399	values %{$h};
400	}
401	}
402	}
403
404	=head2 rec1
405
406	Return all values in some field
407
408	@v = rec1('200')
409
410	TODO: order of values is probably same as in source data, need to investigate that
411
412	=cut
413
414	sub rec1 {
415	my $f = shift;
416	warn "rec1($f) = ", dump( $rec->{$f} ), $/ if ($debug > 1);
417	return unless (defined($rec) && defined($rec->{$f}));
418	warn "rec1($f) = ", dump( $rec->{$f} ), $/ if ($debug > 1);
419	if (ref($rec->{$f}) eq 'ARRAY') {
420	my @out;
421	foreach my $h ( @{ $rec->{$f} } ) {
422	if (ref($h) eq 'HASH') {
423	push @out, ( _pack_subfields_hash( $h ) );
424	} else {
425	push @out, $h;
426	}
427	}
428	return @out;
429	} elsif( defined($rec->{$f}) ) {
430	return $rec->{$f};
431	}
432	}
433
434	=head2 rec2
435
436	Return all values in specific field and subfield
437
438	@v = rec2('200','a')
439
440	=cut
441
442	sub rec2 {
443	my $f = shift;
444	return unless (defined($rec && $rec->{$f}));
445	my $sf = shift;
446	warn "rec2($f,$sf) = ", dump( $rec->{$f} ), $/ if ($debug > 1);
447	return map {
448	if (ref($_->{$sf}) eq 'ARRAY') {
449	@{ $_->{$sf} };
450	} else {
451	$_->{$sf};
452	}
453	} grep { ref($_) eq 'HASH' && $_->{$sf} } @{ $rec->{$f} };
454	}
455
456	=head2 rec
457
458	syntaxtic sugar for
459
460	@v = rec('200')
461	@v = rec('200','a')
462
463	If rec() returns just single value, it will
464	return scalar, not array.
465
466	=cut
467
468	sub rec {
469	my @out;
470	if ($#_ == 0) {
471	@out = rec1(@_);
472	} elsif ($#_ == 1) {
473	@out = rec2(@_);
474	}
475	if ($#out == 0 && ! wantarray) {
476	return $out[0];
477	} elsif (@out) {
478	return @out;
479	} else {
480	return '';
481	}
482	}
483
484	=head2 frec
485
486	Returns first value from field
487
488	$v = frec('200');
489	$v = frec('200','a');
490
491	=cut
492
493	sub frec {
494	my @out = rec(@_);
495	warn "rec(",dump(@_),") has more than one return value, ignoring\n" if $#out > 0;
496	return shift @out;
497	}
498
499	=head2 frec_eq
500
501	=head2 frec_ne
502
503	Check if first values from two fields are same or different
504
505	if ( frec_eq( 900 => 'a', 910 => 'c' ) ) {
506	# values are same
507	} else {
508	# values are different
509	}
510
511	Strictly speaking C<frec_eq> and C<frec_ne> wouldn't be needed if you
512	could write something like:
513
514	if ( frec( '900','a' ) eq frec( '910','c' ) ) {
515	# yada tada
516	}
517
518	but you can't since our parser L<WebPAC::Parser> will remove all whitespaces
519	in order to parse text and create invalid function C<eqfrec>.
520
521	=cut
522
523	sub frec_eq {
524	my ( $f1,$sf1, $f2, $sf2 ) = @_;
525	return (rec( $f1, $sf1 ))[0] eq (rec( $f2, $sf2 ))[0];
526	}
527
528	sub frec_ne {
529	return ! frec_eq( @_ );
530	}
531
532	=head2 regex
533
534	Apply regex to some or all values
535
536	@v = regex( 's/foo/bar/g', @v );
537
538	=cut
539
540	sub regex {
541	my $r = shift;
542	my @out;
543	#warn "r: $r\n", dump(\@_);
544	foreach my $t (@_) {
545	next unless ($t);
546	eval "\$t =~ $r";
547	push @out, $t if ($t && $t ne '');
548	}
549	return @out;
550	}
551
552	=head2 prefix
553
554	Prefix all values with a string
555
556	@v = prefix( 'my_', @v );
557
558	=cut
559
560	sub prefix {
561	my $p = shift;
562	return @_ unless defined( $p );
563	return map { $p . $_ } grep { defined($_) } @_;
564	}
565
566	=head2 suffix
567
568	suffix all values with a string
569
570	@v = suffix( '_my', @v );
571
572	=cut
573
574	sub suffix {
575	my $s = shift;
576	return @_ unless defined( $s );
577	return map { $_ . $s } grep { defined($_) } @_;
578	}
579
580	=head2 surround
581
582	surround all values with a two strings
583
584	@v = surround( 'prefix_', '_suffix', @v );
585
586	=cut
587
588	sub surround {
589	my $p = shift;
590	my $s = shift;
591	$p = '' unless defined( $p );
592	$s = '' unless defined( $s );
593	return map { $p . $_ . $s } grep { defined($_) } @_;
594	}
595
596	=head2 first
597
598	Return first element
599
600	$v = first( @v );
601
602	=cut
603
604	sub first {
605	my $r = shift;
606	return $r;
607	}
608
609	=head2 lookup
610
611	Consult lookup hashes for some value
612
613	@v = lookup(
614	sub {
615	'ffkk/peri/mfn'.rec('000')
616	},
617	'ffkk','peri','200-a-200-e',
618	sub {
619	first(rec(200,'a')).' '.first(rec('200','e'))
620	}
621	);
622
623	Code like above will be B<automatically generated> using L<WebPAC::Parse> from
624	normal lookup definition in C<conf/lookup/something.pl> which looks like:
625
626	lookup(
627	# which results to return from record recorded in lookup
628	sub { 'ffkk/peri/mfn' . rec('000') },
629	# from which database and input
630	'ffkk','peri',
631	# such that following values match
632	sub { first(rec(200,'a')) . ' ' . first(rec('200','e')) },
633	# if this part is missing, we will try to match same fields
634	# from lookup record and current one, or you can override
635	# which records to use from current record using
636	sub { rec('900','x') . ' ' . rec('900','y') },
637	)
638
639	You can think about this lookup as SQL (if that helps):
640
641	select
642	sub { what }
643	from
644	database, input
645	where
646	sub { filter from lookuped record }
647	having
648	sub { optional filter on current record }
649
650	Easy as pie, right?
651
652	=cut
653
654	sub lookup {
655	my ($what, $database, $input, $key, $having) = @_;
656
657	confess "lookup needs 5 arguments: what, database, input, key, having\n" unless ($#_ == 4);
658
659	warn "## lookup ($database, $input, $key)", $/ if ($debug > 1);
660	return unless (defined($lookup->{$database}->{$input}->{$key}));
661
662	confess "lookup really need load_row_coderef added to data_structure\n" unless ($load_row_coderef);
663
664	my $mfns;
665	my @having = $having->();
666
667	warn "## having = ", dump( @having ) if ($debug > 2);
668
669	foreach my $h ( @having ) {
670	if (defined($lookup->{$database}->{$input}->{$key}->{$h})) {
671	warn "lookup for $database/$input/$key/$h return ",dump($lookup->{$database}->{$input}->{$key}->{$h}),"\n" if ($debug);
672	$mfns->{$_}++ foreach keys %{ $lookup->{$database}->{$input}->{$key}->{$h} };
673	}
674	}
675
676	return unless ($mfns);
677
678	my @mfns = sort keys %$mfns;
679
680	warn "# lookup loading $database/$input/$key mfn ", join(",",@mfns)," having ",dump(@having),"\n" if ($debug);
681
682	my $old_rec = $rec;
683	my @out;
684
685	foreach my $mfn (@mfns) {
686	$rec = $load_row_coderef->( $database, $input, $mfn );
687
688	warn "got $database/$input/$mfn = ", dump($rec), $/ if ($debug);
689
690	my @vals = $what->();
691
692	push @out, ( @vals );
693
694	warn "lookup for mfn $mfn returned ", dump(@vals), $/ if ($debug);
695	}
696
697	# if (ref($lookup->{$k}) eq 'ARRAY') {
698	# return @{ $lookup->{$k} };
699	# } else {
700	# return $lookup->{$k};
701	# }
702
703	$rec = $old_rec;
704
705	warn "## lookup returns = ", dump(@out), $/ if ($debug);
706
707	if ($#out == 0) {
708	return $out[0];
709	} else {
710	return @out;
711	}
712	}
713
714	=head2 save_into_lookup
715
716	Save value into lookup. It associates current database, input
717	and specific keys with one or more values which will be
718	associated over MFN.
719
720	MFN will be extracted from first occurence current of field 000
721	in current record, or if it doesn't exist from L<_set_config> C<_mfn>.
722
723	my $nr = save_into_lookup($database,$input,$key,sub {
724	# code which produce one or more values
725	});
726
727	It returns number of items saved.
728
729	This function shouldn't be called directly, it's called from code created by
730	L<WebPAC::Parser>.
731
732	=cut
733
734	sub save_into_lookup {
735	my ($database,$input,$key,$coderef) = @_;
736	die "save_into_lookup needs database" unless defined($database);
737	die "save_into_lookup needs input" unless defined($input);
738	die "save_into_lookup needs key" unless defined($key);
739	die "save_into_lookup needs CODE" unless ( defined($coderef) && ref($coderef) eq 'CODE' );
740
741	warn "## save_into_lookup rec = ", dump($rec), " config = ", dump($config), $/ if ($debug > 2);
742
743	my $mfn =
744	defined($rec->{'000'}->[0]) ? $rec->{'000'}->[0] :
745	defined($config->{_mfn}) ? $config->{_mfn} :
746	die "mfn not defined or zero";
747
748	my $nr = 0;
749
750	foreach my $v ( $coderef->() ) {
751	$lookup->{$database}->{$input}->{$key}->{$v}->{$mfn}++;
752	warn "# saved lookup $database/$input/$key [$v] $mfn\n" if ($debug > 1);
753	$nr++;
754	}
755
756	return $nr;
757	}
758
759	=head2 config
760
761	Consult config values stored in C<config.yml>
762
763	# return database code (key under databases in yaml)
764	$database_code = config(); # use _ from hash
765	$database_name = config('name');
766	$database_input_name = config('input name');
767
768	Up to three levels are supported.
769
770	=cut
771
772	sub config {
773	return unless ($config);
774
775	my $p = shift;
776
777	$p \|\|= '';
778
779	my $v;
780
781	warn "### getting config($p)\n" if ($debug > 1);
782
783	my @p = split(/\s+/,$p);
784	if ($#p < 0) {
785	$v = $config->{ '_' }; # special, database code
786	} else {
787
788	my $c = dclone( $config );
789
790	foreach my $k (@p) {
791	warn "### k: $k c = ",dump($c),$/ if ($debug > 1);
792	if (ref($c) eq 'ARRAY') {
793	$c = shift @$c;
794	warn "config($p) taking first occurence of '$k', probably not what you wanted!\n";
795	last;
796	}
797
798	if (! defined($c->{$k}) ) {
799	$c = undef;
800	last;
801	} else {
802	$c = $c->{$k};
803	}
804	}
805	$v = $c if ($c);
806
807	}
808
809	warn "## config( '$p' ) = ",dump( $v ),$/ if ($v && $debug);
810	warn "config( '$p' ) is empty\n" if (! $v);
811
812	return $v;
813	}
814
815	=head2 id
816
817	Returns unique id of this record
818
819	$id = id();
820
821	Returns C<42/2> for 2nd occurence of MFN 42.
822
823	=cut
824
825	sub id {
826	my $mfn = $config->{_mfn} \|\| die "no _mfn in config data";
827	return $mfn . ( WebPAC::Normalize::MARC::_created_marc_records() \|\| '' );
828	}
829
830	=head2 join_with
831
832	Joins walues with some delimiter
833
834	$v = join_with(", ", @v);
835
836	=cut
837
838	sub join_with {
839	my $d = shift;
840	warn "### join_with('$d',",dump(@_),")\n" if ($debug > 2);
841	my $v = join($d, grep { defined($_) && $_ ne '' } @_);
842	return '' unless defined($v);
843	return $v;
844	}
845
846	=head2 split_rec_on
847
848	Split record subfield on some regex and take one of parts out
849
850	$a_before_semi_column =
851	split_rec_on('200','a', /\s;\s/, $part);
852
853	C<$part> is optional number of element. First element is
854	B<1>, not 0!
855
856	If there is no C<$part> parameter or C<$part> is 0, this function will
857	return all values produced by splitting.
858
859	=cut
860
861	sub split_rec_on {
862	die "split_rec_on need (fld,sf,regex[,part]" if ($#_ < 2);
863
864	my ($fld, $sf, $regex, $part) = @_;
865	warn "### regex ", ref($regex), $regex, $/ if ($debug > 2);
866
867	my @r = rec( $fld, $sf );
868	my $v = shift @r;
869	warn "### first rec($fld,$sf) = ",dump($v),$/ if ($debug > 2);
870
871	return '' if ( ! defined($v) \|\| $v =~ /^\s*$/);
872
873	my @s = split( $regex, $v );
874	warn "## split_rec_on($fld,$sf,$regex,$part) = ",dump(@s),$/ if ($debug > 1);
875	if ($part && $part > 0) {
876	return $s[ $part - 1 ];
877	} else {
878	return @s;
879	}
880	}
881
882	my $hash;
883
884	=head2 set
885
886	set( key => 'value' );
887
888	=cut
889
890	sub set {
891	my ($k,$v) = @_;
892	warn "## set ( $k => ", dump($v), " )", $/ if ( $debug );
893	$hash->{$k} = $v;
894	};
895
896	=head2 get
897
898	get( 'key' );
899
900	=cut
901
902	sub get {
903	my $k = shift \|\| return;
904	my $v = $hash->{$k};
905	warn "## get $k = ", dump( $v ), $/ if ( $debug );
906	return $v;
907	}
908
909	=head2 count
910
911	if ( count( @result ) == 1 ) {
912	# do something if only 1 result is there
913	}
914
915	=cut
916
917	sub count {
918	warn "## count ",dump(@_),$/ if ( $debug );
919	return @_ . '';
920	}
921
922	# END
923	1;