lib/WebPAC/Normalize.pm

package WebPAC::Normalize;
use Exporter 'import';
our @EXPORT = qw/
        _set_ds _set_lookup
        _set_load_row
        _get_ds _clean_ds
        _debug
        _pack_subfields_hash

        to
        search_display search display sorted

        rec1 rec2 rec
        frec frec_eq frec_ne
        regex prefix suffix surround
        first lookup join_with
        save_into_lookup

        split_rec_on

        get set
        count

/;

use warnings;
use strict;

#use base qw/WebPAC::Common/;
use Data::Dump qw/dump/;
use Carp qw/confess/;

# debugging warn(s)
my $debug = 0;
_debug( $debug );

# FIXME
use WebPAC::Normalize::ISBN;
push @EXPORT, ( 'isbn_10', 'isbn_13' );

use WebPAC::Normalize::MARC;
push @EXPORT, ( qw/
        marc marc_indicators marc_repeatable_subfield
        marc_compose marc_leader marc_fixed
        marc_duplicate marc_remove marc_count
        marc_original_order
        marc_template
/);

use Storable qw/dclone/;

=head1 NAME

WebPAC::Normalize - describe normalisation rules using sets

=cut

our $VERSION = '0.36';

=head1 SYNOPSIS

This module uses C<conf/normalize/*.pl> files to perform normalisation
from input records using perl functions which are specialized for set
processing.

Sets are implemented as arrays, and a normalisation file is valid perl, which
means that you can check its validity before running WebPAC using
C<perl -c normalize.pl>.

Normalisation can generate multiple output normalized data. For now, supported output
types (on the left side of definition) are: C<search_display>, C<display>, C<search> and
C<marc>.

=head1 FUNCTIONS

Functions which start with C<_> are private and used by WebPAC internally.
All other functions are available for use within normalisation rules.

=head2 data_structure

Return data structure

  my $ds = WebPAC::Normalize::data_structure(
        lookup => $lookup_hash,
        row => $row,
        rules => $normalize_pl_config,
        marc_encoding => 'utf-8',
        config => $config,
        load_row_coderef => sub {
                my ($database,$input,$mfn) = @_;
                $store->load_row( database => $database, input => $input, id => $mfn );
        },
  );

Options C<row> and C<rules> are mandatory while all
others are optional.

C<load_row_coderef> is closure only used when executing lookups, so they will
die if it's not defined.

This function will B<die> if the normalisation rules can't be evaled.

Since this function isn't exported you have to call it with 
C<WebPAC::Normalize::data_structure>.

=cut

my $load_row_coderef;

# Run one record through the normalisation rules and return the resulting
# data structure.
#
# Arguments are passed as a flat key => value list:
#   row              - record hash to normalise (required, must be true)
#   rules            - perl source of the normalisation rules (required)
#   lookup           - lookup hash, installed only when defined
#   config           - config hash, installed only when defined
#   load_row_coderef - code reference used by lookup() to load other records
#
# Dies when row or rules is missing, or when evaluating the rules fails.
# Returns whatever _get_ds() reports after the rules have run.
sub data_structure {
        my $arg = {@_};

        die "need row argument" unless ($arg->{row});
        die "need normalisation argument" unless ($arg->{rules});

        _set_lookup( $arg->{lookup} ) if defined($arg->{lookup});
        _set_ds( $arg->{row} );
        _set_config( $arg->{config} ) if defined($arg->{config});
        # reset output left over from the previous record
        _clean_ds( %{ $arg } );
        # NOTE(review): assigned unconditionally, so omitting load_row_coderef
        # replaces a coderef installed earlier via _set_load_row() with undef
        $load_row_coderef = $arg->{load_row_coderef};

        # The rules are evaluated as a string inside this scope, with
        # strict 'subs' and 'redefine' warnings switched off for them.
        no strict 'subs';
        no warnings 'redefine';
        eval "$arg->{rules};";
        die "error evaling $arg->{rules}: $@\n" if ($@);

        return _get_ds();
}

=head2 _set_ds

Set current record hash

  _set_ds( $rec );

=cut

my $rec;

# Install the record hash that the rec*() extractors read from.
#
# Dies with "no record hash" when called without a true argument.  The same
# reference is also stored in $WebPAC::Normalize::MARC::rec.  Returns the
# record reference.
sub _set_ds {
        my ($new_rec) = @_;

        die "no record hash" unless $new_rec;

        $rec = $new_rec;
        return $WebPAC::Normalize::MARC::rec = $rec;
}

=head2 _get_rec

  my $rec = _get_rec();

=cut

# Accessor for the record hash currently held in $rec.
sub _get_rec {
        return $rec;
}

=head2 _set_config

Set current config hash

  _set_config( $config );

Magic keys are:

=over 4

=item _

Code of current database

=item _mfn

Current MFN

=back

=cut

my $config;

# Remember the config hash consulted by config(), id() and
# save_into_lookup().  Returns the value that was stored.
sub _set_config {
        my ($new_config) = @_;
        return $config = $new_config;
}

=head2 _get_ds

Return hash formatted as data structure

  my $ds = _get_ds();

=cut

my $out;

# Hand back the output structure accumulated in $out (undef when nothing
# has been stored since the last reset).
sub _get_ds {
        my $ds = $out;
        #warn "## out = ",dump($ds);
        return $ds;
}

=head2 _clean_ds

Clean data structure hash for next record

  _clean_ds();

=cut

# Reset per-record output state: forget the accumulated data structure and
# ask WebPAC::Normalize::MARC to clean its own state.
#
# Any arguments are accepted but ignored (data_structure() passes its whole
# option list here).  Earlier versions copied them into an unused lexical
# named $a, which shadowed sort's special variable and warned about an
# odd-sized hash when called with an odd number of arguments.
#
# Returns whatever WebPAC::Normalize::MARC::_clean() returns.
sub _clean_ds {
        $out = undef;
        return WebPAC::Normalize::MARC::_clean();
}

=head2 _set_lookup

Set current lookup hash

  _set_lookup( $lookup );

=cut

my $lookup;

# Install the lookup hash used by lookup() and save_into_lookup().
# Returns the value that was stored.
sub _set_lookup {
        my ($new_lookup) = @_;
        return $lookup = $new_lookup;
}

=head2 _get_lookup

Get current lookup hash

  my $lookup = _get_lookup();

=cut

# Accessor for the lookup hash currently held in $lookup.
sub _get_lookup {
        my $current = $lookup;
        return $current;
}

=head2 _set_load_row

Setup code reference which will return L<data_structure> from
L<WebPAC::Store>

  _set_load_row(sub {
                my ($database,$input,$mfn) = @_;
                $store->load_row( database => $database, input => $input, id => $mfn );
  });

=cut

# Register the code reference that lookup() calls to load a record.
# Confesses unless the argument is a CODE reference.  Returns the stored
# code reference.
sub _set_load_row {
        my ($coderef) = @_;

        if ( ref($coderef) ne 'CODE' ) {
                confess "argument isn't CODE";
        }

        return $load_row_coderef = $coderef;
}

=head2 _debug

Change level of debug warnings

  _debug( 2 );

=cut

# Get or set the debug level.
#
# Without a defined argument the current level is returned unchanged.
# With one, the level is stored here and copied into
# $WebPAC::Normalize::MARC::debug; a notice is emitted for levels above 0.
sub _debug {
        my ($l) = @_;

        # getter mode
        if ( ! defined($l) ) {
                return $debug;
        }

        if ( $l > 0 ) {
                warn "debug level $l",$/;
        }

        $debug = $l;
        return $WebPAC::Normalize::MARC::debug = $debug;
}

=head1 Functions to create C<data_structure>

Those functions generally have to come first in your normalization file.

=head2 to

Generic way to set values for some name

  to('display', 'field-name' => rec('200','a') );

There are many helpers defined below which might be easier to use.

=cut

# Store values under $out->{name}->{type}.
#
#   to( $type, $name, @values )
#
# Undefined and empty-string values are dropped; when nothing is left the
# call is a no-op and returns nothing.  Confesses when type or name is
# missing (or false).  Returns the stored array reference.
sub to {
        my ( $type, $name, @values ) = @_;

        confess "need type -- BUG?" unless $type;
        confess "needs name as first argument" unless $name;

        my @o;
        foreach my $value (@values) {
                next unless defined($value);
                next if $value eq '';
                push @o, $value;
        }

        return unless (@o);
        return $out->{$name}->{$type} = \@o;
}

=head2 search_display

Define output for L<search> and L<display> at the same time

  search_display('Title', rec('200','a') );

=cut

# Store the same values for both the "search" and the "display" output of
# a name.  Both slots share ONE array reference.
#
# Undefined and empty-string values are dropped; with nothing left the call
# returns without storing anything.  Dies when the name is missing (or
# false).  Returns the shared array reference.
sub search_display {
        my ( $name, @values ) = @_;

        die "search_display needs name as first argument" unless $name;

        my @o = grep { !( ! defined($_) || $_ eq '' ) } @values;
        return unless (@o);

        @{ $out->{$name} }{ qw/search display/ } = ( \@o, \@o );
        return \@o;
}

=head2 tag

Old name for L<search_display>, it will probably be removed at one point.

=cut

# Legacy alias: forwards every argument to search_display() and returns
# its result.
sub tag {
        return search_display( @_ );
}

=head2 display

Define output just for I<display>

  @v = display('Title', rec('200','a') );

=cut

# Shortcut for to() with the output type fixed to "display".
sub display {
        return to( display => @_ );
}

=head2 search

Prepare values just for I<search>

  @v = search('Title', rec('200','a') );

=cut

# Shortcut for to() with the output type fixed to "search".
sub search {
        return to( search => @_ );
}

=head2 sorted

Insert into lists which will be automatically sorted

 sorted('Title', rec('200','a') );

=cut

# Shortcut for to() with the output type fixed to "sorted".
sub sorted {
        return to( sorted => @_ );
}


=head1 Functions to extract data from input

These functions should be used inside the functions that create C<data_structure>, described
above.

=head2 _pack_subfields_hash

 @subfields = _pack_subfields_hash( $h );
 $subfields = _pack_subfields_hash( $h, 1 );

Return each subfield value in array or pack them all together and return scalar
with subfields (denoted by C<^>) and values.

=cut

# Flatten one field occurrence (a hash of subfield => value) into values.
#
#   $hash              - field occurrence; anything that is not a plain HASH
#                        reference is returned unchanged
#   $include_subfields - true:  return ONE string in which every value is
#                               preceded by "^" and its subfield name
#                        false: return the list of values only
#
# When the hash carries a {subfields} entry, that entry is consumed as a
# flat list of (subfield name, occurrence index) pairs and dictates the
# output order.  Without it, subfields are emitted in sorted key order.
# (The value-only form used to follow perl's randomized hash order, which
# made the result differ from run to run.)  In that value-only form a
# repeated subfield stored as an array reference is returned as the
# reference itself, not flattened.
sub _pack_subfields_hash {

        warn "## _pack_subfields_hash( ",dump(@_), " )\n" if ($debug > 1);

        my ($hash,$include_subfields) = @_;

        # sanity and ease of use
        return $hash if (ref($hash) ne 'HASH');

        # work on a deep copy: the code below deletes and shifts elements
        my $h = dclone( $hash );

        if ( defined($h->{subfields}) ) {
                my $sfs = delete $h->{subfields} || die "no subfields?";
                my @out;
                while (@$sfs) {
                        # each pair is: subfield name, index of its occurrence
                        my $sf = shift @$sfs;
                        push @out, '^' . $sf if ($include_subfields);
                        my $o = shift @$sfs;
                        if ($o == 0 && ref( $h->{$sf} ) ne 'ARRAY' ) {
                                # single element subfields are not arrays
                                push @out, $h->{$sf};
                        } else {
                                push @out, $h->{$sf}->[$o];
                        }
                }
                if ($include_subfields) {
                        return join('', @out);
                } else {
                        return @out;
                }
        } else {
                if ($include_subfields) {
                        my $out = '';
                        foreach my $sf (sort keys %$h) {
                                if (ref($h->{$sf}) eq 'ARRAY') {
                                        $out .= '^' . $sf . join('^' . $sf, @{ $h->{$sf} });
                                } else {
                                        $out .= '^' . $sf . $h->{$sf};
                                }
                        }
                        return $out;
                } else {
                        # deterministic: values in sorted subfield order,
                        # matching the packed-string branch above
                        return map { $h->{$_} } sort keys %{$h};
                }
        }
}

=head2 rec1

Return all values in some field

  @v = rec1('200')

TODO: order of values is probably same as in source data, need to investigate that

=cut

# Return every value stored in field $f of the current record.
#
# Returns nothing when there is no current record or the field is not
# defined.  A field that is not an array reference is returned as is.  For
# array fields each element is returned; hash elements are expanded through
# _pack_subfields_hash() first.
sub rec1 {
        my ($f) = @_;

        # NOTE(review): at debug level > 1 this line is printed before the
        # guard and again after it
        warn "rec1($f) = ", dump( $rec->{$f} ), $/ if ($debug > 1);
        if ( ! defined($rec) || ! defined($rec->{$f}) ) {
                return;
        }
        warn "rec1($f) = ", dump( $rec->{$f} ), $/ if ($debug > 1);

        # scalar field: hand it back untouched
        return $rec->{$f} unless ref($rec->{$f}) eq 'ARRAY';

        my @out;
        for my $h ( @{ $rec->{$f} } ) {
                push @out, ref($h) eq 'HASH' ? _pack_subfields_hash( $h ) : $h;
        }
        return @out;
}

=head2 rec2

Return all values in specific field and subfield

  @v = rec2('200','a')

=cut

# Return every value of subfield $sf found in field $f of the current
# record.
#
# Returns nothing when there is no current record or the field is not
# defined.  Only hash occurrences with a defined subfield contribute; a
# subfield holding an array reference contributes all of its elements.
sub rec2 {
        my $f = shift;
        return unless (defined($rec && $rec->{$f}));
        my $sf = shift;
        warn "rec2($f,$sf) = ", dump( $rec->{$f} ), $/ if ($debug > 1);

        my @out;
        for my $occurrence ( @{ $rec->{$f} } ) {
                next unless ref($occurrence) eq 'HASH' && defined $occurrence->{$sf};

                if (ref($occurrence->{$sf}) eq 'ARRAY') {
                        push @out, @{ $occurrence->{$sf} };
                } else {
                        push @out, $occurrence->{$sf};
                }
        }
        return @out;
}

=head2 rec

Syntactic sugar for

  @v = rec('200')
  @v = rec('200','a')

If rec() returns just single value, it will
return scalar, not array.

=cut

# Dispatch to rec1() (one argument) or rec2() (two arguments).
#
# Return value:
#   - no values at all:                  the empty string
#   - exactly one value, scalar context: that value
#   - otherwise:                         the list of values
# Any other argument count yields no values, hence the empty string.
sub rec {
        my @out =
                  @_ == 1 ? rec1(@_)
                : @_ == 2 ? rec2(@_)
                :           ();

        return '' unless @out;
        return $out[0] if ( @out == 1 && ! wantarray );
        return @out;
}

=head2 frec

Returns first value from field

  $v = frec('200');
  $v = frec('200','a');

=cut

# Return only the first value rec() produces for the given field (and
# optional subfield), warning when further values are being discarded.
sub frec {
        my @out = rec(@_);

        if ( @out > 1 ) {
                warn "rec(",dump(@_),") has more than one return value, ignoring\n";
        }

        return $out[0];
}

=head2 frec_eq

=head2 frec_ne

Check if first values from two fields are same or different

  if ( frec_eq( 900 => 'a', 910 => 'c' ) ) {
        # values are same
  } else {
    # values are different
  }

Strictly speaking C<frec_eq> and C<frec_ne> wouldn't be needed if you
could write something like:

  if ( frec( '900','a' ) eq frec( '910','c' ) ) {
        # yada tada
  }

but you can't since our parser L<WebPAC::Parser> will remove all whitespaces
in order to parse text and create invalid function C<eqfrec>.

=cut

# True when the first value of field/subfield ($f1,$sf1) is string-equal to
# the first value of ($f2,$sf2).
sub frec_eq {
        my ( $f1,$sf1, $f2, $sf2 ) = @_;

        # list assignment keeps rec() in list context and picks element 0
        my ($left)  = rec( $f1, $sf1 );
        my ($right) = rec( $f2, $sf2 );

        return $left eq $right;
}

# Logical negation of frec_eq() for the same arguments.
sub frec_ne {
        my $same = frec_eq( @_ );
        return ! $same;
}

=head2 regex

Apply regex to some or all values

  @v = regex( 's/foo/bar/g', @v );

=cut

# Apply a regex operation, given as perl source text such as 's/foo/bar/g',
# to each value and return the values that are still true afterwards.
#
#   $r - source text of the operation; it is bound to each value with =~
#   @_ - values to process; false values (undef, '', 0) are skipped
#
# Every value is copied before the operation runs.  Working on the aliased
# elements of @_ would rewrite the caller's variables, and would silently
# leave string literals untouched, because modifying a read-only value dies
# inside the eval.
#
# NOTE(review): $r is compiled with string eval, so it must only come from
# trusted normalisation rules, never from record data.
sub regex {
        my $r = shift;
        my @out;
        #warn "r: $r\n", dump(\@_);
        foreach my $value (@_) {
                next unless ($value);
                my $t = $value;         # private, writable copy
                eval "\$t =~ $r";
                # report a broken expression instead of silently returning
                # the value unchanged
                warn "regex($r) failed: $@" if ($@);
                push @out, $t if ($t && $t ne '');
        }
        return @out;
}

=head2 prefix

Prefix all values with a string

  @v = prefix( 'my_', @v );

=cut

# Prepend string $p to every defined value.
#
# With an undefined prefix the values are returned untouched (undefined
# ones included); otherwise undefined values are dropped.
sub prefix {
        my ( $p, @values ) = @_;

        return @values unless defined( $p );

        my @prefixed;
        foreach my $value (@values) {
                next unless defined($value);
                push @prefixed, $p . $value;
        }
        return @prefixed;
}

=head2 suffix

suffix all values with a string

  @v = suffix( '_my', @v );

=cut

# Append string $s to every defined value.
#
# With an undefined suffix the values are returned untouched (undefined
# ones included); otherwise undefined values are dropped.
sub suffix {
        my ( $s, @values ) = @_;

        return @values unless defined( $s );

        my @suffixed;
        foreach my $value (@values) {
                next unless defined($value);
                push @suffixed, $value . $s;
        }
        return @suffixed;
}

=head2 surround

surround all values with a two strings

  @v = surround( 'prefix_', '_suffix', @v );

=cut

# Wrap every defined value between prefix $p and suffix $s.  An undefined
# prefix or suffix counts as the empty string; undefined values are dropped.
sub surround {
        my ( $p, $s, @values ) = @_;

        $p = '' if ! defined( $p );
        $s = '' if ! defined( $s );

        my @wrapped;
        foreach my $value (@values) {
                next unless defined($value);
                push @wrapped, $p . $value . $s;
        }
        return @wrapped;
}

=head2 first

Return first element

  $v = first( @v );

=cut

# Return the first argument (undef when called without any).
sub first {
        my ($r) = @_;
        return $r;
}

=head2 lookup

Consult lookup hashes for some value

  @v = lookup(
        sub {
                'ffkk/peri/mfn'.rec('000')
        },
        'ffkk','peri','200-a-200-e',
        sub {
                first(rec(200,'a')).' '.first(rec('200','e'))
        }
  );

Code like above will be B<automatically generated> using L<WebPAC::Parser> from
normal lookup definition in C<conf/lookup/something.pl> which looks like:

  lookup(
        # which results to return from record recorded in lookup
        sub { 'ffkk/peri/mfn' . rec('000') },
        # from which database and input
        'ffkk','peri',
        # such that following values match
        sub { first(rec(200,'a')) . ' ' . first(rec('200','e')) },
        # if this part is missing, we will try to match same fields
        # from lookup record and current one, or you can override
        # which records to use from current record using
        sub { rec('900','x') . ' ' . rec('900','y') },
  )

You can think about this lookup as SQL (if that helps):

  select
        sub { what }
  from
        database, input
  where
    sub { filter from lookuped record }
  having
    sub { optional filter on current record }

Easy as pie, right?

=cut

# Resolve values from other records through the lookup hash.
#
#   $what     - code reference run once per matching record, while that
#               record is installed as the current one; produces the result
#   $database - first-level key into the lookup hash
#   $input    - second-level key into the lookup hash
#   $key      - third-level key into the lookup hash
#   $having   - code reference run against the CURRENT record; its return
#               values are the lookup keys to search for
#
# Confesses unless exactly five arguments are given, or when a lookup is
# needed but no load_row coderef has been registered.  Returns nothing when
# the lookup has no entry for database/input/key or no value matches.
# Otherwise returns the collected values (the single value itself when
# there is exactly one).
sub lookup {
        my ($what, $database, $input, $key, $having) = @_;

        confess "lookup needs 5 arguments: what, database, input, key, having\n" unless ($#_ == 4);

        warn "## lookup ($database, $input, $key)", $/ if ($debug > 1);
        return unless (defined($lookup->{$database}->{$input}->{$key}));

        confess "lookup really need load_row_coderef added to data_structure\n" unless ($load_row_coderef);

        # set of MFNs (as hash keys) whose saved lookup value matched
        my $mfns;
        # evaluated while $rec still points at the current record
        my @having = $having->();

        warn "## having = ", dump( @having ) if ($debug > 2);

        foreach my $h ( @having ) {
                if (defined($lookup->{$database}->{$input}->{$key}->{$h})) {
                        warn "lookup for $database/$input/$key/$h return ",dump($lookup->{$database}->{$input}->{$key}->{$h}),"\n" if ($debug);
                        $mfns->{$_}++ foreach keys %{ $lookup->{$database}->{$input}->{$key}->{$h} };
                }
        }

        return unless ($mfns);

        # plain sort: MFNs are ordered as strings, not numerically
        my @mfns = sort keys %$mfns;

        warn "# lookup loading $database/$input/$key mfn ", join(",",@mfns)," having ",dump(@having),"\n" if ($debug);

        # Temporarily replace the current record so that rec*() calls made
        # by $what read from each looked-up record.
        # NOTE(review): if a callback dies, $rec is not restored.
        my $old_rec = $rec;
        my @out;

        foreach my $mfn (@mfns) {
                $rec = $load_row_coderef->( $database, $input, $mfn );

                warn "got $database/$input/$mfn = ", dump($rec), $/ if ($debug);

                my @vals = $what->();

                push @out, ( @vals );

                warn "lookup for mfn $mfn returned ", dump(@vals), $/ if ($debug);
        }

#       if (ref($lookup->{$k}) eq 'ARRAY') {
#               return @{ $lookup->{$k} };
#       } else {
#               return $lookup->{$k};
#       }

        # put the original record back
        $rec = $old_rec;

        warn "## lookup returns = ", dump(@out), $/ if ($debug);

        if ($#out == 0) {
                return $out[0];
        } else {
                return @out;
        }
}

=head2 save_into_lookup

Save value into lookup. It associates current database, input
and specific keys with one or more values which will be
associated over MFN.

MFN will be extracted from the first occurrence of field 000
in the current record or, if that doesn't exist, from L<_set_config> C<_mfn>.

  my $nr = save_into_lookup($database,$input,$key,sub {
        # code which produce one or more values 
  });

It returns number of items saved.

This function shouldn't be called directly, it's called from code created by
L<WebPAC::Parser>. 

=cut

# Record lookup values for the current record.
#
#   $database, $input, $key - path under which the values are stored
#   $coderef                - code reference producing one or more values
#
# Each value is stored as $lookup->{database}->{input}->{key}->{value}->{mfn}.
# The MFN is the first element of field 000 of the current record when that
# is defined, otherwise the _mfn entry of the config hash.
#
# Dies when an argument is missing, $coderef is not a CODE reference, or no
# MFN can be determined.  Returns the number of values saved.
sub save_into_lookup {
        my ($database,$input,$key,$coderef) = @_;
        die "save_into_lookup needs database" unless defined($database);
        die "save_into_lookup needs input" unless defined($input);
        die "save_into_lookup needs key" unless defined($key);
        die "save_into_lookup needs CODE" unless ( defined($coderef) && ref($coderef) eq 'CODE' );

        warn "## save_into_lookup rec = ", dump($rec), " config = ", dump($config), $/ if ($debug > 2);

        # Fetch field 000 once and inspect it before indexing into it.
        # Testing defined($rec->{'000'}->[0]) directly would autovivify an
        # empty '000' field in the record (and $rec itself when it is undef).
        my $f000 = defined($rec) ? $rec->{'000'} : undef;

        my $mfn =
                ( ref($f000) eq 'ARRAY' && defined($f000->[0]) )       ?       $f000->[0]              :
                ( defined($config) && defined($config->{_mfn}) )       ?       $config->{_mfn}         :
                                                                                die "mfn not defined or zero";

        my $nr = 0;

        foreach my $v ( $coderef->() ) {
                $lookup->{$database}->{$input}->{$key}->{$v}->{$mfn}++;
                warn "# saved lookup $database/$input/$key [$v] $mfn\n" if ($debug > 1);
                $nr++;
        }

        return $nr;
}

=head2 config

Consult config values stored in C<config.yml>

  # return database code (key under databases in yaml)
  $database_code = config();    # use _ from hash
  $database_name = config('name');
  $database_input_name = config('input name');

Up to three levels are supported.

=cut

# Read a value from the config hash.
#
#   config()             - value stored under the key "_"
#   config('a b c')      - walk the whitespace-separated keys one level at
#                          a time, i.e. $config->{a}->{b}->{c}
#
# Returns nothing when no config hash is set.  A missing key, or a final
# value that is false, yields undef and an "is empty" warning.  When an
# array reference is met on the way, its first element is taken, a warning
# is printed and the walk stops.
sub config {
        return unless ($config);

        my $p = shift || '';

        warn "### getting config($p)\n" if ($debug > 1);

        my @p = split(/\s+/,$p);
        my $v;

        if (@p) {
                # walk a private copy: the array case below shifts elements
                my $c = dclone( $config );

                foreach my $k (@p) {
                        warn "### k: $k c = ",dump($c),$/ if ($debug > 1);

                        if (ref($c) eq 'ARRAY') {
                                $c = shift @$c;
                                warn "config($p) taking first occurence of '$k', probably not what you wanted!\n";
                                last;
                        }

                        # descend one level; an undefined entry ends the walk
                        $c = $c->{$k};
                        last unless defined($c);
                }

                $v = $c if ($c);
        } else {
                # special, database code
                $v = $config->{ '_' };
        }

        warn "## config( '$p' ) = ",dump( $v ),$/ if ($v && $debug);
        warn "config( '$p' ) is empty\n" if (! $v);

        return $v;
}

=head2 id

Returns unique id of this record

  $id = id();

Returns C<42/2> for 2nd occurence of MFN 42.

=cut

# Build the id of the current record: the _mfn config entry followed by
# whatever WebPAC::Normalize::MARC::_created_marc_records() returns (or
# nothing when that is false).  Dies when _mfn is missing or false.
sub id {
        my $mfn = $config->{_mfn};
        die "no _mfn in config data" unless $mfn;

        my $created = WebPAC::Normalize::MARC::_created_marc_records() || '';
        return $mfn . $created;
}

=head2 join_with

Joins values with some delimiter

  $v = join_with(", ", @v);

=cut

# Join all defined, non-empty values with delimiter $d and return the
# resulting string (the empty string when nothing is left to join).
sub join_with {
        my $d = shift;
        warn "### join_with('$d',",dump(@_),")\n" if ($debug > 2);

        my @kept;
        foreach my $value (@_) {
                push @kept, $value if ( defined($value) && $value ne '' );
        }

        my $v = join($d, @kept);
        return defined($v) ? $v : '';
}

=head2 split_rec_on

Split record subfield on some regex and take one of parts out

  $a_before_semi_column =
        split_rec_on('200','a', /\s*;\s*/, $part);

C<$part> is optional number of element. First element is
B<1>, not 0!

If there is no C<$part> parameter or C<$part> is 0, this function will
return all values produced by splitting.

=cut

# Split the first value of field $fld / subfield $sf on $regex.
#
#   $part - optional 1-based index of the piece to return; when missing,
#           0 or negative, all pieces are returned
#
# Dies when fewer than three arguments are given.  Returns the empty string
# when the value is undefined or contains only whitespace.
sub split_rec_on {
        die "split_rec_on need (fld,sf,regex[,part]" if (@_ < 3);

        my ($fld, $sf, $regex, $part) = @_;
        warn "### regex ", ref($regex), $regex, $/ if ($debug > 2);

        # only the first value rec() returns is used
        my ($v) = rec( $fld, $sf );
        warn "### first rec($fld,$sf) = ",dump($v),$/ if ($debug > 2);

        return '' if ( ! defined($v) || $v =~ /^\s*$/);

        my @s = split( $regex, $v );
        warn "## split_rec_on($fld,$sf,$regex,$part) = ",dump(@s),$/ if ($debug > 1);

        return @s unless ($part && $part > 0);
        return $s[ $part - 1 ];
}

my $hash;

=head2 set

  set( key => 'value' );

=cut

# Store value $v under key $k in the module-level scratch hash and return
# the stored value.
sub set {
        my ($k,$v) = @_;

        if ( $debug ) {
                warn "## set ( $k => ", dump($v), " )", $/;
        }

        return $hash->{$k} = $v;
}

=head2 get

  get( 'key' );

=cut

# Fetch the value stored under key $k in the module-level scratch hash.
# Returns nothing when the key is missing or false.
sub get {
        my ($k) = @_;
        return unless $k;

        my $v = $hash->{$k};
        if ( $debug ) {
                warn "## get $k = ", dump( $v ), $/;
        }
        return $v;
}

=head2 count

  if ( count( @result ) == 1 ) {
        # do something if only 1 result is there
  }

=cut

# Return the number of arguments, as a string.
sub count {
        if ( $debug ) {
                warn "## count ",dump(@_),$/;
        }

        my $n = scalar @_;
        return "$n";
}

# END
1;
1	package WebPAC::Normalize;
2	use Exporter 'import';
3	our @EXPORT = qw/
4	_set_ds _set_lookup
5	_set_load_row
6	_get_ds _clean_ds
7	_debug
8	_pack_subfields_hash
9
10	to
11	search_display search display sorted
12
13	rec1 rec2 rec
14	frec frec_eq frec_ne
15	regex prefix suffix surround
16	first lookup join_with
17	save_into_lookup
18
19	split_rec_on
20
21	get set
22	count
23
24	/;
25
26	use warnings;
27	use strict;
28
29	#use base qw/WebPAC::Common/;
30	use Data::Dump qw/dump/;
31	use Carp qw/confess/;
32
33	# debugging warn(s)
34	my $debug = 0;
35	_debug( $debug );
36
37	# FIXME
38	use WebPAC::Normalize::ISBN;
39	push @EXPORT, ( 'isbn_10', 'isbn_13' );
40
41	use WebPAC::Normalize::MARC;
42	push @EXPORT, ( qw/
43	marc marc_indicators marc_repeatable_subfield
44	marc_compose marc_leader marc_fixed
45	marc_duplicate marc_remove marc_count
46	marc_original_order
47	marc_template
48	/);
49
50	use Storable qw/dclone/;
51
52	=head1 NAME
53
54	WebPAC::Normalize - describe normalisaton rules using sets
55
56	=cut
57
58	our $VERSION = '0.36';
59
60	=head1 SYNOPSIS
61
62	This module uses C<conf/normalize/*.pl> files to perform normalisation
63	from input records using perl functions which are specialized for set
64	processing.
65
66	Sets are implemented as arrays, and normalisation file is valid perl, which
67	means that you check it's validity before running WebPAC using
68	C<perl -c normalize.pl>.
69
70	Normalisation can generate multiple output normalized data. For now, supported output
71	types (on the left side of definition) are: C<search_display>, C<display>, C<search> and
72	C<marc>.
73
74	=head1 FUNCTIONS
75
76	Functions which start with C<_> are private and used by WebPAC internally.
77	All other functions are available for use within normalisation rules.
78
79	=head2 data_structure
80
81	Return data structure
82
83	my $ds = WebPAC::Normalize::data_structure(
84	lookup => $lookup_hash,
85	row => $row,
86	rules => $normalize_pl_config,
87	marc_encoding => 'utf-8',
88	config => $config,
89	load_row_coderef => sub {
90	my ($database,$input,$mfn) = @_;
91	$store->load_row( database => $database, input => $input, id => $mfn );
92	},
93	);
94
95	Options C<row>, C<rules> and C<log> are mandatory while all
96	other are optional.
97
98	C<load_row_coderef> is closure only used when executing lookups, so they will
99	die if it's not defined.
100
101	This function will B<die> if normalizastion can't be evaled.
102
103	Since this function isn't exported you have to call it with
104	C<WebPAC::Normalize::data_structure>.
105
106	=cut
107
108	my $load_row_coderef;
109
110	sub data_structure {
111	my $arg = {@_};
112
113	die "need row argument" unless ($arg->{row});
114	die "need normalisation argument" unless ($arg->{rules});
115
116	_set_lookup( $arg->{lookup} ) if defined($arg->{lookup});
117	_set_ds( $arg->{row} );
118	_set_config( $arg->{config} ) if defined($arg->{config});
119	_clean_ds( %{ $arg } );
120	$load_row_coderef = $arg->{load_row_coderef};
121
122	no strict 'subs';
123	no warnings 'redefine';
124	eval "$arg->{rules};";
125	die "error evaling $arg->{rules}: $@\n" if ($@);
126
127	return _get_ds();
128	}
129
130	=head2 _set_ds
131
132	Set current record hash
133
134	_set_ds( $rec );
135
136	=cut
137
138	my $rec;
139
140	sub _set_ds {
141	$rec = shift or die "no record hash";
142	$WebPAC::Normalize::MARC::rec = $rec;
143	}
144
145	=head2
146
147	my $rec = _get_rec();
148
149	=cut
150
151	sub _get_rec { $rec };
152
153	=head2 _set_config
154
155	Set current config hash
156
157	_set_config( $config );
158
159	Magic keys are:
160
161	=over 4
162
163	=item _
164
165	Code of current database
166
167	=item _mfn
168
169	Current MFN
170
171	=back
172
173	=cut
174
175	my $config;
176
177	sub _set_config {
178	$config = shift;
179	}
180
181	=head2 _get_ds
182
183	Return hash formatted as data structure
184
185	my $ds = _get_ds();
186
187	=cut
188
189	my $out;
190
191	sub _get_ds {
192	#warn "## out = ",dump($out);
193	return $out;
194	}
195
196	=head2 _clean_ds
197
198	Clean data structure hash for next record
199
200	_clean_ds();
201
202	=cut
203
204	sub _clean_ds {
205	my $a = {@_};
206	$out = undef;
207	WebPAC::Normalize::MARC::_clean();
208	}
209
210	=head2 _set_lookup
211
212	Set current lookup hash
213
214	_set_lookup( $lookup );
215
216	=cut
217
218	my $lookup;
219
220	sub _set_lookup {
221	$lookup = shift;
222	}
223
224	=head2 _get_lookup
225
226	Get current lookup hash
227
228	my $lookup = _get_lookup();
229
230	=cut
231
232	sub _get_lookup {
233	return $lookup;
234	}
235
236	=head2 _set_load_row
237
238	Setup code reference which will return L<data_structure> from
239	L<WebPAC::Store>
240
241	_set_load_row(sub {
242	my ($database,$input,$mfn) = @_;
243	$store->load_row( database => $database, input => $input, id => $mfn );
244	});
245
246	=cut
247
248	sub _set_load_row {
249	my $coderef = shift;
250	confess "argument isn't CODE" unless ref($coderef) eq 'CODE';
251
252	$load_row_coderef = $coderef;
253	}
254
255	=head2 _debug
256
257	Change level of debug warnings
258
259	_debug( 2 );
260
261	=cut
262
263	sub _debug {
264	my $l = shift;
265	return $debug unless defined($l);
266	warn "debug level $l",$/ if ($l > 0);
267	$debug = $l;
268	$WebPAC::Normalize::MARC::debug = $debug;
269	}
270
271	=head1 Functions to create C<data_structure>
272
273	Those functions generally have to first in your normalization file.
274
275	=head2 to
276
277	Generic way to set values for some name
278
279	to('field-name', 'name-value' => rec('200','a') );
280
281	There are many helpers defined below which might be easier to use.
282
283	=cut
284
285	sub to {
286	my $type = shift or confess "need type -- BUG?";
287	my $name = shift or confess "needs name as first argument";
288	my @o = grep { defined($_) && $_ ne '' } @_;
289	return unless (@o);
290	$out->{$name}->{$type} = \@o;
291	}
292
293	=head2 search_display
294
295	Define output for L<search> and L<display> at the same time
296
297	search_display('Title', rec('200','a') );
298
299	=cut
300
301	sub search_display {
302	my $name = shift or die "search_display needs name as first argument";
303	my @o = grep { defined($_) && $_ ne '' } @_;
304	return unless (@o);
305	$out->{$name}->{search} = \@o;
306	$out->{$name}->{display} = \@o;
307	}
308
309	=head2 tag
310
311	Old name for L<search_display>, it will probably be removed at one point.
312
313	=cut
314
315	sub tag {
316	search_display( @_ );
317	}
318
319	=head2 display
320
321	Define output just for I<display>
322
323	@v = display('Title', rec('200','a') );
324
325	=cut
326
327	sub display { to( 'display', @_ ) }
328
329	=head2 search
330
331	Prepare values just for I<search>
332
333	@v = search('Title', rec('200','a') );
334
335	=cut
336
337	sub search { to( 'search', @_ ) }
338
339	=head2 sorted
340
341	Insert into lists which will be automatically sorted
342
343	sorted('Title', rec('200','a') );
344
345	=cut
346
347	sub sorted { to( 'sorted', @_ ) }
348
349
350	=head1 Functions to extract data from input
351
352	This function should be used inside functions to create C<data_structure> described
353	above.
354
355	=head2 _pack_subfields_hash
356
357	@subfields = _pack_subfields_hash( $h );
358	$subfields = _pack_subfields_hash( $h, 1 );
359
360	Return each subfield value in array or pack them all together and return scalar
361	with subfields (denoted by C<^>) and values.
362
363	=cut
364
365	sub _pack_subfields_hash {
366
367	warn "## _pack_subfields_hash( ",dump(@_), " )\n" if ($debug > 1);
368
369	my ($hash,$include_subfields) = @_;
370
371	# sanity and ease of use
372	return $hash if (ref($hash) ne 'HASH');
373
374	my $h = dclone( $hash );
375
376	if ( defined($h->{subfields}) ) {
377	my $sfs = delete $h->{subfields} \|\| die "no subfields?";
378	my @out;
379	while (@$sfs) {
380	my $sf = shift @$sfs;
381	push @out, '^' . $sf if ($include_subfields);
382	my $o = shift @$sfs;
383	if ($o == 0 && ref( $h->{$sf} ) ne 'ARRAY' ) {
384	# single element subfields are not arrays
385	#warn "====> $sf $o / $#$sfs ", dump( $sfs, $h->{$sf} ), "\n";
386
387	push @out, $h->{$sf};
388	} else {
389	#warn "====> $sf $o / $#$sfs ", dump( $sfs, $h->{$sf} ), "\n";
390	push @out, $h->{$sf}->[$o];
391	}
392	}
393	if ($include_subfields) {
394	return join('', @out);
395	} else {
396	return @out;
397	}
398	} else {
399	if ($include_subfields) {
400	my $out = '';
401	foreach my $sf (sort keys %$h) {
402	if (ref($h->{$sf}) eq 'ARRAY') {
403	$out .= '^' . $sf . join('^' . $sf, @{ $h->{$sf} });
404	} else {
405	$out .= '^' . $sf . $h->{$sf};
406	}
407	}
408	return $out;
409	} else {
410	# FIXME this should probably be in alphabetical order instead of hash order
411	values %{$h};
412	}
413	}
414	}
415
416	=head2 rec1
417
418	Return all values in some field
419
420	@v = rec1('200')
421
422	TODO: order of values is probably same as in source data, need to investigate that
423
424	=cut
425
426	sub rec1 {
427	my $f = shift;
428	warn "rec1($f) = ", dump( $rec->{$f} ), $/ if ($debug > 1);
429	return unless (defined($rec) && defined($rec->{$f}));
430	warn "rec1($f) = ", dump( $rec->{$f} ), $/ if ($debug > 1);
431	if (ref($rec->{$f}) eq 'ARRAY') {
432	my @out;
433	foreach my $h ( @{ $rec->{$f} } ) {
434	if (ref($h) eq 'HASH') {
435	push @out, ( _pack_subfields_hash( $h ) );
436	} else {
437	push @out, $h;
438	}
439	}
440	return @out;
441	} elsif( defined($rec->{$f}) ) {
442	return $rec->{$f};
443	}
444	}
445
446	=head2 rec2
447
448	Return all values in specific field and subfield
449
450	@v = rec2('200','a')
451
452	=cut
453
454	sub rec2 {
455	my $f = shift;
456	return unless (defined($rec && $rec->{$f}));
457	my $sf = shift;
458	warn "rec2($f,$sf) = ", dump( $rec->{$f} ), $/ if ($debug > 1);
459	return map {
460	if (ref($_->{$sf}) eq 'ARRAY') {
461	@{ $_->{$sf} };
462	} else {
463	$_->{$sf};
464	}
465	} grep { ref($_) eq 'HASH' && defined $_->{$sf} } @{ $rec->{$f} };
466	}
467
468	=head2 rec
469
470	syntaxtic sugar for
471
472	@v = rec('200')
473	@v = rec('200','a')
474
475	If rec() returns just single value, it will
476	return scalar, not array.
477
478	=cut
479
480	sub rec {
481	my @out;
482	if ($#_ == 0) {
483	@out = rec1(@_);
484	} elsif ($#_ == 1) {
485	@out = rec2(@_);
486	}
487	if ($#out == 0 && ! wantarray) {
488	return $out[0];
489	} elsif (@out) {
490	return @out;
491	} else {
492	return '';
493	}
494	}
495
496	=head2 frec
497
498	Returns first value from field
499
500	$v = frec('200');
501	$v = frec('200','a');
502
503	=cut
504
505	sub frec {
506	my @out = rec(@_);
507	warn "rec(",dump(@_),") has more than one return value, ignoring\n" if $#out > 0;
508	return shift @out;
509	}
510
511	=head2 frec_eq
512
513	=head2 frec_ne
514
515	Check if first values from two fields are same or different
516
517	if ( frec_eq( 900 => 'a', 910 => 'c' ) ) {
518	# values are same
519	} else {
520	# values are different
521	}
522
523	Strictly speaking C<frec_eq> and C<frec_ne> wouldn't be needed if you
524	could write something like:
525
526	if ( frec( '900','a' ) eq frec( '910','c' ) ) {
527	# yada tada
528	}
529
530	but you can't since our parser L<WebPAC::Parser> will remove all whitespaces
531	in order to parse text and create invalid function C<eqfrec>.
532
533	=cut
534
# True when the first value of field/subfield pair ($f1,$sf1) is string-equal
# to the first value of pair ($f2,$sf2).
sub frec_eq {
	my ( $f1, $sf1, $f2, $sf2 ) = @_;

	# list assignment keeps rec() in list context and picks the first value
	my ($left)  = rec( $f1, $sf1 );
	my ($right) = rec( $f2, $sf2 );

	return $left eq $right;
}
539
# Logical negation of frec_eq() for the same four arguments.
sub frec_ne {
	my $same = frec_eq( @_ );
	return ! $same;
}
543
544	=head2 regex
545
546	Apply regex to some or all values
547
548	@v = regex( 's/foo/bar/g', @v );
549
550	=cut
551
# Apply a regex operation, given as a string of perl code such as
# 's/foo/bar/g', to each value and return the values which are still true
# afterwards.
#
# False input values (undef, '', 0) are skipped, and values which become
# false after the operation are dropped from the result.
#
# NOTE(review): $r is compiled with string eval, so it must only ever come
# from trusted normalisation rules, never from record data.
sub regex {
	my $r = shift;
	my @out;
	foreach my $value (@_) {
		next unless ($value);

		# Work on a copy: the loop variable aliases the caller's argument, so
		# modifying it directly would change the caller's data and would fail
		# with a read-only error when a literal string was passed in.
		my $t = $value;

		eval "\$t =~ $r";
		warn "regex($r) failed: $@" if ($@);

		push @out, $t if ($t);
	}
	return @out;
}
563
564	=head2 prefix
565
566	Prefix all values with a string
567
568	@v = prefix( 'my_', @v );
569
570	=cut
571
# Prepend a string to every defined value.
# With an undefined prefix the values are returned untouched.
sub prefix {
	my ( $lead, @values ) = @_;
	return @values unless ( defined $lead );

	my @out;
	foreach my $value (@values) {
		next unless ( defined $value );
		push @out, $lead . $value;
	}
	return @out;
}
577
578	=head2 suffix
579
580	suffix all values with a string
581
582	@v = suffix( '_my', @v );
583
584	=cut
585
# Append a string to every defined value.
# With an undefined suffix the values are returned untouched.
sub suffix {
	my ( $tail, @values ) = @_;
	return @values unless ( defined $tail );

	my @out;
	foreach my $value (@values) {
		next unless ( defined $value );
		push @out, $value . $tail;
	}
	return @out;
}
591
592	=head2 surround
593
594	Surround all values with two strings
595
596	@v = surround( 'prefix_', '_suffix', @v );
597
598	=cut
599
# Wrap every defined value between two strings.
# An undefined leading or trailing string counts as an empty string.
sub surround {
	my ( $before, $after, @values ) = @_;
	$before = '' if ( ! defined $before );
	$after  = '' if ( ! defined $after );

	my @out;
	foreach my $value (@values) {
		next unless ( defined $value );
		push @out, join( '', $before, $value, $after );
	}
	return @out;
}
607
608	=head2 first
609
610	Return first element
611
612	$v = first( @v );
613
614	=cut
615
# Return the first argument (undef when called without arguments).
sub first {
	my ($head) = @_;
	return $head;
}
620
621	=head2 lookup
622
623	Consult lookup hashes for some value
624
625	@v = lookup(
626	sub {
627	'ffkk/peri/mfn'.rec('000')
628	},
629	'ffkk','peri','200-a-200-e',
630	sub {
631	first(rec(200,'a')).' '.first(rec('200','e'))
632	}
633	);
634
635	Code like above will be B<automatically generated> using L<WebPAC::Parser> from
636	normal lookup definition in C<conf/lookup/something.pl> which looks like:
637
638	lookup(
639	# which results to return from record recorded in lookup
640	sub { 'ffkk/peri/mfn' . rec('000') },
641	# from which database and input
642	'ffkk','peri',
643	# such that following values match
644	sub { first(rec(200,'a')) . ' ' . first(rec('200','e')) },
645	# if this part is missing, we will try to match same fields
646	# from lookup record and current one, or you can override
647	# which records to use from current record using
648	sub { rec('900','x') . ' ' . rec('900','y') },
649	)
650
651	You can think about this lookup as SQL (if that helps):
652
653	select
654	sub { what }
655	from
656	database, input
657	where
658	sub { filter from lookuped record }
659	having
660	sub { optional filter on current record }
661
662	Easy as pie, right?
663
664	=cut
665
# Resolve values through the lookup index.
#
#   $what     - coderef run against every matching record; yields results
#   $database - database the lookup was recorded for
#   $input    - input the lookup was recorded for
#   $key      - name of the lookup index
#   $having   - coderef run against the current record; yields the values
#               to search for in the index
#
# Matching records are loaded through $load_row_coderef with $rec temporarily
# pointing at each of them. Returns nothing when the index or any match is
# missing, a single value when there is exactly one result, else the list.
sub lookup {
	confess "lookup needs 5 arguments: what, database, input, key, having\n" if ( scalar(@_) != 5 );

	my ( $what, $database, $input, $key, $having ) = @_;

	warn "## lookup ($database, $input, $key)", $/ if ($debug > 1);

	# index layout as written by save_into_lookup: value => { mfn => count }
	my $index = $lookup->{$database}->{$input}->{$key};
	return unless ( defined($index) );

	confess "lookup really need load_row_coderef added to data_structure\n" unless ($load_row_coderef);

	my @wanted = $having->();

	warn "## having = ", dump( @wanted ) if ($debug > 2);

	# collect the MFNs of all records recorded under any of the wanted values
	my %matching_mfn;
	foreach my $value ( @wanted ) {
		my $hits = $index->{$value};
		next unless ( defined($hits) );
		warn "lookup for $database/$input/$key/$value return ",dump($hits),"\n" if ($debug);
		$matching_mfn{$_}++ foreach keys %$hits;
	}

	return unless (%matching_mfn);

	my @mfns = sort keys %matching_mfn;

	warn "# lookup loading $database/$input/$key mfn ", join(",",@mfns)," having ",dump(@wanted),"\n" if ($debug);

	# $what reads the current record through $rec, so swap each looked-up
	# record in and put the original back once all of them are processed
	my $current_rec = $rec;
	my @out;

	foreach my $mfn (@mfns) {
		$rec = $load_row_coderef->( $database, $input, $mfn );

		warn "got $database/$input/$mfn = ", dump($rec), $/ if ($debug);

		my @vals = $what->();
		push @out, ( @vals );

		warn "lookup for mfn $mfn returned ", dump(@vals), $/ if ($debug);
	}

	$rec = $current_rec;

	warn "## lookup returns = ", dump(@out), $/ if ($debug);

	return ( @out == 1 ) ? $out[0] : @out;
}
725
726	=head2 save_into_lookup
727
728	Save value into lookup. It associates current database, input
729	and specific keys with one or more values which will be
730	associated over MFN.
731
732	MFN will be extracted from the first occurrence of field 000
733	in the current record or, if that doesn't exist, from L<_set_config> C<_mfn>.
734
735	my $nr = save_into_lookup($database,$input,$key,sub {
736	# code which produce one or more values
737	});
738
739	It returns number of items saved.
740
741	This function shouldn't be called directly, it's called from code created by
742	L<WebPAC::Parser>.
743
744	=cut
745
# Record values in the lookup index for the current record.
#
#   $database, $input, $key - where in the lookup structure to store
#   $coderef                - code producing one or more values to index
#
# Every value is stored as $lookup->{$database}{$input}{$key}{$value}{$mfn}.
# The MFN is the first occurrence of field 000 in the current record, falling
# back to the _mfn entry of $config. Dies if an argument is missing or no MFN
# can be determined. Returns the number of values saved.
sub save_into_lookup {
	my ($database,$input,$key,$coderef) = @_;
	die "save_into_lookup needs database" unless defined($database);
	die "save_into_lookup needs input" unless defined($input);
	die "save_into_lookup needs key" unless defined($key);
	die "save_into_lookup needs CODE" unless ( defined($coderef) && ref($coderef) eq 'CODE' );

	warn "## save_into_lookup rec = ", dump($rec), " config = ", dump($config), $/ if ($debug > 2);

	# Check the shape before dereferencing: a bare $rec->{'000'}->[0] would
	# autovivify an empty 000 field inside the current record just by looking.
	my $mfn;
	if ( ref($rec) eq 'HASH' && ref( $rec->{'000'} ) eq 'ARRAY' ) {
		$mfn = $rec->{'000'}->[0];
	}
	if ( ! defined($mfn) && ref($config) eq 'HASH' ) {
		$mfn = $config->{_mfn};
	}
	die "mfn not defined or zero" unless defined($mfn);

	my $nr = 0;

	foreach my $v ( $coderef->() ) {
		$lookup->{$database}->{$input}->{$key}->{$v}->{$mfn}++;
		warn "# saved lookup $database/$input/$key [$v] $mfn\n" if ($debug > 1);
		$nr++;
	}

	return $nr;
}
770
771	=head2 config
772
773	Consult config values stored in C<config.yml>
774
775	# return database code (key under databases in yaml)
776	$database_code = config(); # use _ from hash
777	$database_name = config('name');
778	$database_input_name = config('input name');
779
780	Up to three levels are supported.
781
782	=cut
783
# Look up a value in the configuration ($config).
#
#   $p - whitespace separated path of keys, e.g. 'input name';
#        empty or missing returns the special '_' entry (database code)
#
# The walk is done on a deep copy, so the returned structure is never part of
# the live configuration. When an array is met on the way, its first element
# is taken and the walk stops there (with a warning). Returns undef, with a
# warning, when the path does not lead to a true value. Returns nothing when
# no configuration is loaded.
sub config {
	return unless ($config);

	my $p = shift;

	$p ||= '';

	my $v;

	warn "### getting config($p)\n" if ($debug > 1);

	# split(' ', ...) also ignores leading whitespace, which would otherwise
	# produce an empty first key
	my @p = split(' ', $p);
	if ($#p < 0) {
		$v = $config->{ '_' };	# special, database code
	} else {

		my $c = dclone( $config );

		foreach my $k (@p) {
			warn "### k: $k c = ",dump($c),$/ if ($debug > 1);
			if (ref($c) eq 'ARRAY') {
				$c = shift @$c;
				warn "config($p) taking first occurence of '$k', probably not what you wanted!\n";
				last;
			}

			# a scalar leaf has nothing below it; treat it like a missing key
			# instead of dereferencing it (fatal under strict refs)
			if ( ref($c) ne 'HASH' || ! defined($c->{$k}) ) {
				$c = undef;
				last;
			}

			$c = $c->{$k};
		}
		$v = $c if ($c);

	}

	warn "## config( '$p' ) = ",dump( $v ),$/ if ($v && $debug);
	warn "config( '$p' ) is empty\n" if (! $v);

	return $v;
}
826
827	=head2 id
828
829	Returns unique id of this record
830
831	$id = id();
832
833	Returns C<42/2> for 2nd occurrence of MFN 42.
834
835	=cut
836
# Build the id of the current record: the _mfn entry of $config followed by
# whatever WebPAC::Normalize::MARC::_created_marc_records() returns (nothing
# is appended when that is false). Dies when _mfn is missing or false.
#
# NOTE(review): no separator is added here, so the documented C<42/2> form
# presumably relies on _created_marc_records() supplying it - confirm.
sub id {
	my $mfn = $config->{_mfn} || die "no _mfn in config data";
	my $marc_suffix = WebPAC::Normalize::MARC::_created_marc_records() || '';
	return $mfn . $marc_suffix;
}
841
842	=head2 join_with
843
844	Joins values with some delimiter
845
846	$v = join_with(", ", @v);
847
848	=cut
849
# Join all defined, non-empty values with delimiter $d.
# An undefined delimiter is treated as an empty string.
# Returns an empty string when nothing is left to join.
sub join_with {
	my $d = shift;
	$d = '' unless defined($d);
	warn "### join_with('$d',",dump(@_),")\n" if ($debug > 2);

	# join() always returns a defined string ('' for an empty list), so no
	# further check of the result is needed
	return join( $d, grep { defined($_) && $_ ne '' } @_ );
}
857
858	=head2 split_rec_on
859
860	Split record subfield on some regex and take one of parts out
861
862	$a_before_semi_column =
863	split_rec_on('200','a', /\s;\s/, $part);
864
865	C<$part> is optional number of element. First element is
866	B<1>, not 0!
867
868	If there is no C<$part> parameter or C<$part> is 0, this function will
869	return all values produced by splitting.
870
871	=cut
872
# Split the first value of a record subfield on a regex.
#
#   $fld, $sf - field and subfield, read through rec()
#   $regex    - pattern handed to split()
#   $part     - optional 1-based number of the piece to return
#
# Returns the requested piece, or all pieces when $part is missing or 0.
# Returns an empty string when the value is undefined or only whitespace.
# Dies when called with fewer than three arguments.
sub split_rec_on {
	die "split_rec_on need (fld,sf,regex[,part]" if ($#_ < 2);

	my ($fld, $sf, $regex, $part) = @_;
	warn "### regex ", ref($regex), $regex, $/ if ($debug > 2);

	# only the first value of the subfield is split
	my ($v) = rec( $fld, $sf );
	warn "### first rec($fld,$sf) = ",dump($v),$/ if ($debug > 2);

	return '' if ( ! defined($v) || $v =~ /^\s*$/ );

	my @s = split( $regex, $v );

	# $part is optional, so don't interpolate it while it is undefined
	my $part_label = defined($part) ? $part : '';
	warn "## split_rec_on($fld,$sf,$regex,$part_label) = ",dump(@s),$/ if ($debug > 1);

	return $s[ $part - 1 ] if ( $part && $part > 0 );
	return @s;
}
893
# Private key/value storage written by set() and read by get().
894	my $hash;
895
896	=head2 set
897
898	set( key => 'value' );
899
900	=cut
901
# Remember $v under key $k in the private storage hash and return the
# stored value.
sub set {
	my ( $key, $value ) = @_;
	if ( $debug ) {
		warn "## set ( $key => ", dump($value), " )", $/;
	}
	return ( $hash->{$key} = $value );
}
907
908	=head2 get
909
910	get( 'key' );
911
912	=cut
913
# Return the value previously stored with set() under key $k.
# Returns nothing when called without a key.
sub get {
	my $k = shift;

	# test for definedness, not truth: keys such as '0' are accepted by set()
	# and must be readable here as well
	return unless ( defined($k) );

	my $v = $hash->{$k};
	warn "## get $k = ", dump( $v ), $/ if ( $debug );
	return $v;
}
920
921	=head2 count
922
923	if ( count( @result ) == 1 ) {
924	# do something if only 1 result is there
925	}
926
927	=cut
928
# Return the number of arguments, as a string.
sub count {
	my $how_many = scalar @_;
	if ( $debug ) {
		warn "## count ",dump(@_),$/;
	}
	return "$how_many";
}
933
934	# END
935	1;