lib/WebPAC/Normalize.pm

package WebPAC::Normalize;
use Exporter 'import';
our @EXPORT = qw/
        _set_ds _set_lookup
        _set_load_row
        _get_ds _clean_ds
        _debug
        _pack_subfields_hash

        search_display search display sorted

        rec1 rec2 rec
        frec frec_eq frec_ne
        regex prefix suffix surround
        first lookup join_with
        save_into_lookup

        split_rec_on

        get set
        count

/;

use warnings;
use strict;

#use base qw/WebPAC::Common/;
use Data::Dump qw/dump/;
use Storable qw/dclone/;
use Carp qw/confess/;

# debugging warn(s)
my $debug = 0;

# FIXME
use WebPAC::Normalize::ISBN;
push @EXPORT, ( 'isbn_10', 'isbn_13' );

use WebPAC::Normalize::MARC;
push @EXPORT, ( qw/
        marc marc_indicators marc_repeatable_subfield
        marc_compose marc_leader marc_fixed
        marc_duplicate marc_remove marc_count
        marc_original_order
        marc_template
/);

=head1 NAME

WebPAC::Normalize - describe normalisaton rules using sets

=cut

our $VERSION = '0.35';

=head1 SYNOPSIS

This module uses C<conf/normalize/*.pl> files to perform normalisation
from input records using perl functions which are specialized for set
processing.

Sets are implemented as arrays, and normalisation file is valid perl, which
means that you check it's validity before running WebPAC using
C<perl -c normalize.pl>.

Normalisation can generate multiple output normalized data. For now, supported output
types (on the left side of definition) are: C<search_display>, C<display>, C<search> and
C<marc>.

=head1 FUNCTIONS

Functions which start with C<_> are private and used by WebPAC internally.
All other functions are available for use within normalisation rules.

=head2 data_structure

Return data structure

  my $ds = WebPAC::Normalize::data_structure(
        lookup => $lookup_hash,
        row => $row,
        rules => $normalize_pl_config,
        marc_encoding => 'utf-8',
        config => $config,
        load_row_coderef => sub {
                my ($database,$input,$mfn) = @_;
                $store->load_row( database => $database, input => $input, id => $mfn );
        },
  );

Options C<row>, C<rules> and C<log> are mandatory while all
other are optional.

C<load_row_coderef> is closure only used when executing lookups, so they will
die if it's not defined.

This function will B<die> if normalizastion can't be evaled.

Since this function isn't exported you have to call it with 
C<WebPAC::Normalize::data_structure>.

=cut

my $load_row_coderef;

sub data_structure {
        my $arg = {@_};

        die "need row argument" unless ($arg->{row});
        die "need normalisation argument" unless ($arg->{rules});

        _set_lookup( $arg->{lookup} ) if defined($arg->{lookup});
        _set_ds( $arg->{row} );
        _set_config( $arg->{config} ) if defined($arg->{config});
        _clean_ds( %{ $arg } );
        $load_row_coderef = $arg->{load_row_coderef};

        no strict 'subs';
        no warnings 'redefine';
        eval "$arg->{rules};";
        die "error evaling $arg->{rules}: $@\n" if ($@);

        return _get_ds();
}

=head2 _set_ds

Set current record hash

  _set_ds( $rec );

=cut

my $rec;

sub _set_ds {
        $rec = shift or die "no record hash";
        $WebPAC::Normalize::MARC::rec = $rec;
}

=head2

  my $rec = _get_rec();

=cut

sub _get_rec { $rec };

=head2 _set_config

Set current config hash

  _set_config( $config );

Magic keys are:

=over 4

=item _

Code of current database

=item _mfn

Current MFN

=back

=cut

my $config;

sub _set_config {
        $config = shift;
}

=head2 _get_ds

Return hash formatted as data structure

  my $ds = _get_ds();

=cut

my $out;

sub _get_ds {
#warn "## out = ",dump($out);
        return $out;
}

=head2 _clean_ds

Clean data structure hash for next record

  _clean_ds();

=cut

sub _clean_ds {
        my $a = {@_};
        $out = undef;
        WebPAC::Normalize::MARC::_clean();
}

=head2 _set_lookup

Set current lookup hash

  _set_lookup( $lookup );

=cut

my $lookup;

sub _set_lookup {
        $lookup = shift;
}

=head2 _get_lookup

Get current lookup hash

  my $lookup = _get_lookup();

=cut

sub _get_lookup {
        return $lookup;
}

=head2 _set_load_row

Setup code reference which will return L<data_structure> from
L<WebPAC::Store>

  _set_load_row(sub {
                my ($database,$input,$mfn) = @_;
                $store->load_row( database => $database, input => $input, id => $mfn );
  });

=cut

sub _set_load_row {
        my $coderef = shift;
        confess "argument isn't CODE" unless ref($coderef) eq 'CODE';

        $load_row_coderef = $coderef;
}

=head2 _debug

Change level of debug warnings

  _debug( 2 );

=cut

sub _debug {
        my $l = shift;
        return $debug unless defined($l);
        warn "debug level $l",$/ if ($l > 0);
        $debug = $l;
        $WebPAC::Normalize::MARC::debug = $debug;
}

=head1 Functions to create C<data_structure>

Those functions generally have to first in your normalization file.

=head2 search_display

Define output for L<search> and L<display> at the same time

  search_display('Title', rec('200','a') );


=cut

sub search_display {
        my $name = shift or die "search_display needs name as first argument";
        my @o = grep { defined($_) && $_ ne '' } @_;
        return unless (@o);
        $out->{$name}->{search} = \@o;
        $out->{$name}->{display} = \@o;
}

=head2 tag

Old name for L<search_display>, but supported

=cut

sub tag {
        search_display( @_ );
}

=head2 display

Define output just for I<display>

  @v = display('Title', rec('200','a') );

=cut

sub _field {
        my $type = shift or confess "need type -- BUG?";
        my $name = shift or confess "needs name as first argument";
        my @o = grep { defined($_) && $_ ne '' } @_;
        return unless (@o);
        $out->{$name}->{$type} = \@o;
}

sub display { _field( 'display', @_ ) }

=head2 search

Prepare values just for I<search>

  @v = search('Title', rec('200','a') );

=cut

sub search { _field( 'search', @_ ) }

=head2 sorted

Insert into lists which will be automatically sorted

 sorted('Title', rec('200','a') );

=cut

sub sorted { _field( 'sorted', @_ ) }


=head1 Functions to extract data from input

This function should be used inside functions to create C<data_structure> described
above.

=head2 _pack_subfields_hash

 @subfields = _pack_subfields_hash( $h );
 $subfields = _pack_subfields_hash( $h, 1 );

Return each subfield value in array or pack them all together and return scalar
with subfields (denoted by C<^>) and values.

=cut

sub _pack_subfields_hash {

        warn "## _pack_subfields_hash( ",dump(@_), " )\n" if ($debug > 1);

        my ($h,$include_subfields) = @_;

        # sanity and ease of use
        return $h if (ref($h) ne 'HASH');

        if ( defined($h->{subfields}) ) {
                my $sfs = delete $h->{subfields} || die "no subfields?";
                my @out;
                while (@$sfs) {
                        my $sf = shift @$sfs;
                        push @out, '^' . $sf if ($include_subfields);
                        my $o = shift @$sfs;
                        if ($o == 0 && ref( $h->{$sf} ) ne 'ARRAY' ) {
                                # single element subfields are not arrays
#warn "====> $sf $o / $#$sfs ", dump( $sfs, $h->{$sf} ), "\n";

                                push @out, $h->{$sf};
                        } else {
#warn "====> $sf $o / $#$sfs ", dump( $sfs, $h->{$sf} ), "\n";
                                push @out, $h->{$sf}->[$o];
                        }
                }
                if ($include_subfields) {
                        return join('', @out);
                } else {
                        return @out;
                }
        } else {
                if ($include_subfields) {
                        my $out = '';
                        foreach my $sf (sort keys %$h) {
                                if (ref($h->{$sf}) eq 'ARRAY') {
                                        $out .= '^' . $sf . join('^' . $sf, @{ $h->{$sf} });
                                } else {
                                        $out .= '^' . $sf . $h->{$sf};
                                }
                        }
                        return $out;
                } else {
                        # FIXME this should probably be in alphabetical order instead of hash order
                        values %{$h};
                }
        }
}

=head2 rec1

Return all values in some field

  @v = rec1('200')

TODO: order of values is probably same as in source data, need to investigate that

=cut

sub rec1 {
        my $f = shift;
        warn "rec1($f) = ", dump( $rec->{$f} ), $/ if ($debug > 1);
        return unless (defined($rec) && defined($rec->{$f}));
        warn "rec1($f) = ", dump( $rec->{$f} ), $/ if ($debug > 1);
        if (ref($rec->{$f}) eq 'ARRAY') {
                my @out;
                foreach my $h ( @{ $rec->{$f} } ) {
                        if (ref($h) eq 'HASH') {
                                push @out, ( _pack_subfields_hash( $h ) );
                        } else {
                                push @out, $h;
                        }
                }
                return @out;
        } elsif( defined($rec->{$f}) ) {
                return $rec->{$f};
        }
}

=head2 rec2

Return all values in specific field and subfield

  @v = rec2('200','a')

=cut

sub rec2 {
        my $f = shift;
        return unless (defined($rec && $rec->{$f}));
        my $sf = shift;
        warn "rec2($f,$sf) = ", dump( $rec->{$f} ), $/ if ($debug > 1);
        return map {
                if (ref($_->{$sf}) eq 'ARRAY') {
                        @{ $_->{$sf} };
                } else {
                        $_->{$sf};
                }
        } grep { ref($_) eq 'HASH' && $_->{$sf} } @{ $rec->{$f} };
}

=head2 rec

syntaxtic sugar for

  @v = rec('200')
  @v = rec('200','a')

If rec() returns just single value, it will
return scalar, not array.

=cut

sub rec {
        my @out;
        if ($#_ == 0) {
                @out = rec1(@_);
        } elsif ($#_ == 1) {
                @out = rec2(@_);
        }
        if ($#out == 0 && ! wantarray) {
                return $out[0];
        } elsif (@out) {
                return @out;
        } else {
                return '';
        }
}

=head2 frec

Returns first value from field

  $v = frec('200');
  $v = frec('200','a');

=cut

sub frec {
        my @out = rec(@_);
        warn "rec(",dump(@_),") has more than one return value, ignoring\n" if $#out > 0;
        return shift @out;
}

=head2 frec_eq

=head2 frec_ne

Check if first values from two fields are same or different

  if ( frec_eq( 900 => 'a', 910 => 'c' ) ) {
        # values are same
  } else {
    # values are different
  }

Strictly speaking C<frec_eq> and C<frec_ne> wouldn't be needed if you
could write something like:

  if ( frec( '900','a' ) eq frec( '910','c' ) ) {
        # yada tada
  }

but you can't since our parser L<WebPAC::Parser> will remove all whitespaces
in order to parse text and create invalid function C<eqfrec>.

=cut

sub frec_eq {
        my ( $f1,$sf1, $f2, $sf2 ) = @_;
        return (rec( $f1, $sf1 ))[0] eq (rec( $f2, $sf2 ))[0];
}

sub frec_ne {
        return ! frec_eq( @_ );
}

=head2 regex

Apply regex to some or all values

  @v = regex( 's/foo/bar/g', @v );

=cut

sub regex {
        my $r = shift;
        my @out;
        #warn "r: $r\n", dump(\@_);
        foreach my $t (@_) {
                next unless ($t);
                eval "\$t =~ $r";
                push @out, $t if ($t && $t ne '');
        }
        return @out;
}

=head2 prefix

Prefix all values with a string

  @v = prefix( 'my_', @v );

=cut

sub prefix {
        my $p = shift;
        return @_ unless defined( $p );
        return map { $p . $_ } grep { defined($_) } @_;
}

=head2 suffix

suffix all values with a string

  @v = suffix( '_my', @v );

=cut

sub suffix {
        my $s = shift;
        return @_ unless defined( $s );
        return map { $_ . $s } grep { defined($_) } @_;
}

=head2 surround

surround all values with a two strings

  @v = surround( 'prefix_', '_suffix', @v );

=cut

sub surround {
        my $p = shift;
        my $s = shift;
        $p = '' unless defined( $p );
        $s = '' unless defined( $s );
        return map { $p . $_ . $s } grep { defined($_) } @_;
}

=head2 first

Return first element

  $v = first( @v );

=cut

sub first {
        my $r = shift;
        return $r;
}

=head2 lookup

Consult lookup hashes for some value

  @v = lookup(
        sub {
                'ffkk/peri/mfn'.rec('000')
        },
        'ffkk','peri','200-a-200-e',
        sub {
                first(rec(200,'a')).' '.first(rec('200','e'))
        }
  );

Code like above will be B<automatically generated> using L<WebPAC::Parse> from
normal lookup definition in C<conf/lookup/something.pl> which looks like:

  lookup(
        # which results to return from record recorded in lookup
        sub { 'ffkk/peri/mfn' . rec('000') },
        # from which database and input
        'ffkk','peri',
        # such that following values match
        sub { first(rec(200,'a')) . ' ' . first(rec('200','e')) },
        # if this part is missing, we will try to match same fields
        # from lookup record and current one, or you can override
        # which records to use from current record using
        sub { rec('900','x') . ' ' . rec('900','y') },
  )

You can think about this lookup as SQL (if that helps):

  select
        sub { what }
  from
        database, input
  where
    sub { filter from lookuped record }
  having
    sub { optional filter on current record }

Easy as pie, right?

=cut

sub lookup {
        my ($what, $database, $input, $key, $having) = @_;

        confess "lookup needs 5 arguments: what, database, input, key, having\n" unless ($#_ == 4);

        warn "## lookup ($database, $input, $key)", $/ if ($debug > 1);
        return unless (defined($lookup->{$database}->{$input}->{$key}));

        confess "lookup really need load_row_coderef added to data_structure\n" unless ($load_row_coderef);

        my $mfns;
        my @having = $having->();

        warn "## having = ", dump( @having ) if ($debug > 2);

        foreach my $h ( @having ) {
                if (defined($lookup->{$database}->{$input}->{$key}->{$h})) {
                        warn "lookup for $database/$input/$key/$h return ",dump($lookup->{$database}->{$input}->{$key}->{$h}),"\n" if ($debug);
                        $mfns->{$_}++ foreach keys %{ $lookup->{$database}->{$input}->{$key}->{$h} };
                }
        }

        return unless ($mfns);

        my @mfns = sort keys %$mfns;

        warn "# lookup loading $database/$input/$key mfn ", join(",",@mfns)," having ",dump(@having),"\n" if ($debug);

        my $old_rec = $rec;
        my @out;

        foreach my $mfn (@mfns) {
                $rec = $load_row_coderef->( $database, $input, $mfn );

                warn "got $database/$input/$mfn = ", dump($rec), $/ if ($debug);

                my @vals = $what->();

                push @out, ( @vals );

                warn "lookup for mfn $mfn returned ", dump(@vals), $/ if ($debug);
        }

#       if (ref($lookup->{$k}) eq 'ARRAY') {
#               return @{ $lookup->{$k} };
#       } else {
#               return $lookup->{$k};
#       }

        $rec = $old_rec;

        warn "## lookup returns = ", dump(@out), $/ if ($debug);

        if ($#out == 0) {
                return $out[0];
        } else {
                return @out;
        }
}

=head2 save_into_lookup

Save value into lookup. It associates current database, input
and specific keys with one or more values which will be
associated over MFN.

MFN will be extracted from first occurence current of field 000
in current record, or if it doesn't exist from L<_set_config> C<_mfn>.

  my $nr = save_into_lookup($database,$input,$key,sub {
        # code which produce one or more values 
  });

It returns number of items saved.

This function shouldn't be called directly, it's called from code created by
L<WebPAC::Parser>. 

=cut

sub save_into_lookup {
        my ($database,$input,$key,$coderef) = @_;
        die "save_into_lookup needs database" unless defined($database);
        die "save_into_lookup needs input" unless defined($input);
        die "save_into_lookup needs key" unless defined($key);
        die "save_into_lookup needs CODE" unless ( defined($coderef) && ref($coderef) eq 'CODE' );

        warn "## save_into_lookup rec = ", dump($rec), " config = ", dump($config), $/ if ($debug > 2);

        my $mfn = 
                defined($rec->{'000'}->[0])     ?       $rec->{'000'}->[0]      :
                defined($config->{_mfn})        ?       $config->{_mfn}         :
                                                                                die "mfn not defined or zero";

        my $nr = 0;

        foreach my $v ( $coderef->() ) {
                $lookup->{$database}->{$input}->{$key}->{$v}->{$mfn}++;
                warn "# saved lookup $database/$input/$key [$v] $mfn\n" if ($debug > 1);
                $nr++;
        }

        return $nr;
}

=head2 config

Consult config values stored in C<config.yml>

  # return database code (key under databases in yaml)
  $database_code = config();    # use _ from hash
  $database_name = config('name');
  $database_input_name = config('input name');

Up to three levels are supported.

=cut

sub config {
        return unless ($config);

        my $p = shift;

        $p ||= '';

        my $v;

        warn "### getting config($p)\n" if ($debug > 1);

        my @p = split(/\s+/,$p);
        if ($#p < 0) {
                $v = $config->{ '_' };  # special, database code
        } else {

                my $c = dclone( $config );

                foreach my $k (@p) {
                        warn "### k: $k c = ",dump($c),$/ if ($debug > 1);
                        if (ref($c) eq 'ARRAY') {
                                $c = shift @$c;
                                warn "config($p) taking first occurence of '$k', probably not what you wanted!\n";
                                last;
                        }

                        if (! defined($c->{$k}) ) {
                                $c = undef;
                                last;
                        } else {
                                $c = $c->{$k};
                        }
                }
                $v = $c if ($c);

        }

        warn "## config( '$p' ) = ",dump( $v ),$/ if ($v && $debug);
        warn "config( '$p' ) is empty\n" if (! $v);

        return $v;
}

=head2 id

Returns unique id of this record

  $id = id();

Returns C<42/2> for 2nd occurence of MFN 42.

=cut

sub id {
        my $mfn = $config->{_mfn} || die "no _mfn in config data";
        return $mfn . ( WebPAC::Normalize::MARC::_created_marc_records() || '' );
}

=head2 join_with

Joins walues with some delimiter

  $v = join_with(", ", @v);

=cut

sub join_with {
        my $d = shift;
        warn "### join_with('$d',",dump(@_),")\n" if ($debug > 2);
        my $v = join($d, grep { defined($_) && $_ ne '' } @_);
        return '' unless defined($v);
        return $v;
}

=head2 split_rec_on

Split record subfield on some regex and take one of parts out

  $a_before_semi_column =
        split_rec_on('200','a', /\s*;\s*/, $part);

C<$part> is optional number of element. First element is
B<1>, not 0!

If there is no C<$part> parameter or C<$part> is 0, this function will
return all values produced by splitting.

=cut

sub split_rec_on {
        die "split_rec_on need (fld,sf,regex[,part]" if ($#_ < 2);

        my ($fld, $sf, $regex, $part) = @_;
        warn "### regex ", ref($regex), $regex, $/ if ($debug > 2);

        my @r = rec( $fld, $sf );
        my $v = shift @r;
        warn "### first rec($fld,$sf) = ",dump($v),$/ if ($debug > 2);

        return '' if ( ! defined($v) || $v =~ /^\s*$/);

        my @s = split( $regex, $v );
        warn "## split_rec_on($fld,$sf,$regex,$part) = ",dump(@s),$/ if ($debug > 1);
        if ($part && $part > 0) {
                return $s[ $part - 1 ];
        } else {
                return @s;
        }
}

my $hash;

=head2 set

  set( key => 'value' );

=cut

sub set {
        my ($k,$v) = @_;
        warn "## set ( $k => ", dump($v), " )", $/ if ( $debug );
        $hash->{$k} = $v;
};

=head2 get

  get( 'key' );

=cut

sub get {
        my $k = shift || return;
        my $v = $hash->{$k};
        warn "## get $k = ", dump( $v ), $/ if ( $debug );
        return $v;
}

=head2 count

  if ( count( @result ) == 1 ) {
        # do something if only 1 result is there
  }

=cut

sub count {
        warn "## count ",dump(@_),$/ if ( $debug );
        return @_ . '';
}

# END
1;
1	package WebPAC::Normalize;
2	use Exporter 'import';
3	our @EXPORT = qw/
4	_set_ds _set_lookup
5	_set_load_row
6	_get_ds _clean_ds
7	_debug
8	_pack_subfields_hash
9
10	search_display search display sorted
11
12	rec1 rec2 rec
13	frec frec_eq frec_ne
14	regex prefix suffix surround
15	first lookup join_with
16	save_into_lookup
17
18	split_rec_on
19
20	get set
21	count
22
23	/;
24
25	use warnings;
26	use strict;
27
28	#use base qw/WebPAC::Common/;
29	use Data::Dump qw/dump/;
30	use Storable qw/dclone/;
31	use Carp qw/confess/;
32
33	# debugging warn(s)
34	my $debug = 0;
35
36	# FIXME
37	use WebPAC::Normalize::ISBN;
38	push @EXPORT, ( 'isbn_10', 'isbn_13' );
39
40	use WebPAC::Normalize::MARC;
41	push @EXPORT, ( qw/
42	marc marc_indicators marc_repeatable_subfield
43	marc_compose marc_leader marc_fixed
44	marc_duplicate marc_remove marc_count
45	marc_original_order
46	marc_template
47	/);
48
49	=head1 NAME
50
51	WebPAC::Normalize - describe normalisaton rules using sets
52
53	=cut
54
55	our $VERSION = '0.35';
56
57	=head1 SYNOPSIS
58
59	This module uses C<conf/normalize/*.pl> files to perform normalisation
60	from input records using perl functions which are specialized for set
61	processing.
62
63	Sets are implemented as arrays, and normalisation file is valid perl, which
64	means that you check it's validity before running WebPAC using
65	C<perl -c normalize.pl>.
66
67	Normalisation can generate multiple output normalized data. For now, supported output
68	types (on the left side of definition) are: C<search_display>, C<display>, C<search> and
69	C<marc>.
70
71	=head1 FUNCTIONS
72
73	Functions which start with C<_> are private and used by WebPAC internally.
74	All other functions are available for use within normalisation rules.
75
76	=head2 data_structure
77
78	Return data structure
79
80	my $ds = WebPAC::Normalize::data_structure(
81	lookup => $lookup_hash,
82	row => $row,
83	rules => $normalize_pl_config,
84	marc_encoding => 'utf-8',
85	config => $config,
86	load_row_coderef => sub {
87	my ($database,$input,$mfn) = @_;
88	$store->load_row( database => $database, input => $input, id => $mfn );
89	},
90	);
91
92	Options C<row>, C<rules> and C<log> are mandatory while all
93	other are optional.
94
95	C<load_row_coderef> is closure only used when executing lookups, so they will
96	die if it's not defined.
97
98	This function will B<die> if normalizastion can't be evaled.
99
100	Since this function isn't exported you have to call it with
101	C<WebPAC::Normalize::data_structure>.
102
103	=cut
104
105	my $load_row_coderef;
106
107	sub data_structure {
108	my $arg = {@_};
109
110	die "need row argument" unless ($arg->{row});
111	die "need normalisation argument" unless ($arg->{rules});
112
113	_set_lookup( $arg->{lookup} ) if defined($arg->{lookup});
114	_set_ds( $arg->{row} );
115	_set_config( $arg->{config} ) if defined($arg->{config});
116	_clean_ds( %{ $arg } );
117	$load_row_coderef = $arg->{load_row_coderef};
118
119	no strict 'subs';
120	no warnings 'redefine';
121	eval "$arg->{rules};";
122	die "error evaling $arg->{rules}: $@\n" if ($@);
123
124	return _get_ds();
125	}
126
127	=head2 _set_ds
128
129	Set current record hash
130
131	_set_ds( $rec );
132
133	=cut
134
135	my $rec;
136
137	sub _set_ds {
138	$rec = shift or die "no record hash";
139	$WebPAC::Normalize::MARC::rec = $rec;
140	}
141
142	=head2
143
144	my $rec = _get_rec();
145
146	=cut
147
148	sub _get_rec { $rec };
149
150	=head2 _set_config
151
152	Set current config hash
153
154	_set_config( $config );
155
156	Magic keys are:
157
158	=over 4
159
160	=item _
161
162	Code of current database
163
164	=item _mfn
165
166	Current MFN
167
168	=back
169
170	=cut
171
172	my $config;
173
174	sub _set_config {
175	$config = shift;
176	}
177
178	=head2 _get_ds
179
180	Return hash formatted as data structure
181
182	my $ds = _get_ds();
183
184	=cut
185
186	my $out;
187
188	sub _get_ds {
189	#warn "## out = ",dump($out);
190	return $out;
191	}
192
193	=head2 _clean_ds
194
195	Clean data structure hash for next record
196
197	_clean_ds();
198
199	=cut
200
201	sub _clean_ds {
202	my $a = {@_};
203	$out = undef;
204	WebPAC::Normalize::MARC::_clean();
205	}
206
207	=head2 _set_lookup
208
209	Set current lookup hash
210
211	_set_lookup( $lookup );
212
213	=cut
214
215	my $lookup;
216
217	sub _set_lookup {
218	$lookup = shift;
219	}
220
221	=head2 _get_lookup
222
223	Get current lookup hash
224
225	my $lookup = _get_lookup();
226
227	=cut
228
229	sub _get_lookup {
230	return $lookup;
231	}
232
233	=head2 _set_load_row
234
235	Setup code reference which will return L<data_structure> from
236	L<WebPAC::Store>
237
238	_set_load_row(sub {
239	my ($database,$input,$mfn) = @_;
240	$store->load_row( database => $database, input => $input, id => $mfn );
241	});
242
243	=cut
244
245	sub _set_load_row {
246	my $coderef = shift;
247	confess "argument isn't CODE" unless ref($coderef) eq 'CODE';
248
249	$load_row_coderef = $coderef;
250	}
251
252	=head2 _debug
253
254	Change level of debug warnings
255
256	_debug( 2 );
257
258	=cut
259
260	sub _debug {
261	my $l = shift;
262	return $debug unless defined($l);
263	warn "debug level $l",$/ if ($l > 0);
264	$debug = $l;
265	$WebPAC::Normalize::MARC::debug = $debug;
266	}
267
268	=head1 Functions to create C<data_structure>
269
270	Those functions generally have to first in your normalization file.
271
272	=head2 search_display
273
274	Define output for L<search> and L<display> at the same time
275
276	search_display('Title', rec('200','a') );
277
278
279	=cut
280
281	sub search_display {
282	my $name = shift or die "search_display needs name as first argument";
283	my @o = grep { defined($_) && $_ ne '' } @_;
284	return unless (@o);
285	$out->{$name}->{search} = \@o;
286	$out->{$name}->{display} = \@o;
287	}
288
289	=head2 tag
290
291	Old name for L<search_display>, but supported
292
293	=cut
294
295	sub tag {
296	search_display( @_ );
297	}
298
299	=head2 display
300
301	Define output just for I<display>
302
303	@v = display('Title', rec('200','a') );
304
305	=cut
306
307	sub _field {
308	my $type = shift or confess "need type -- BUG?";
309	my $name = shift or confess "needs name as first argument";
310	my @o = grep { defined($_) && $_ ne '' } @_;
311	return unless (@o);
312	$out->{$name}->{$type} = \@o;
313	}
314
315	sub display { _field( 'display', @_ ) }
316
317	=head2 search
318
319	Prepare values just for I<search>
320
321	@v = search('Title', rec('200','a') );
322
323	=cut
324
325	sub search { _field( 'search', @_ ) }
326
327	=head2 sorted
328
329	Insert into lists which will be automatically sorted
330
331	sorted('Title', rec('200','a') );
332
333	=cut
334
335	sub sorted { _field( 'sorted', @_ ) }
336
337
338
339	=head1 Functions to extract data from input
340
341	This function should be used inside functions to create C<data_structure> described
342	above.
343
344	=head2 _pack_subfields_hash
345
346	@subfields = _pack_subfields_hash( $h );
347	$subfields = _pack_subfields_hash( $h, 1 );
348
349	Return each subfield value in array or pack them all together and return scalar
350	with subfields (denoted by C<^>) and values.
351
352	=cut
353
354	sub _pack_subfields_hash {
355
356	warn "## _pack_subfields_hash( ",dump(@_), " )\n" if ($debug > 1);
357
358	my ($h,$include_subfields) = @_;
359
360	# sanity and ease of use
361	return $h if (ref($h) ne 'HASH');
362
363	if ( defined($h->{subfields}) ) {
364	my $sfs = delete $h->{subfields} \|\| die "no subfields?";
365	my @out;
366	while (@$sfs) {
367	my $sf = shift @$sfs;
368	push @out, '^' . $sf if ($include_subfields);
369	my $o = shift @$sfs;
370	if ($o == 0 && ref( $h->{$sf} ) ne 'ARRAY' ) {
371	# single element subfields are not arrays
372	#warn "====> $sf $o / $#$sfs ", dump( $sfs, $h->{$sf} ), "\n";
373
374	push @out, $h->{$sf};
375	} else {
376	#warn "====> $sf $o / $#$sfs ", dump( $sfs, $h->{$sf} ), "\n";
377	push @out, $h->{$sf}->[$o];
378	}
379	}
380	if ($include_subfields) {
381	return join('', @out);
382	} else {
383	return @out;
384	}
385	} else {
386	if ($include_subfields) {
387	my $out = '';
388	foreach my $sf (sort keys %$h) {
389	if (ref($h->{$sf}) eq 'ARRAY') {
390	$out .= '^' . $sf . join('^' . $sf, @{ $h->{$sf} });
391	} else {
392	$out .= '^' . $sf . $h->{$sf};
393	}
394	}
395	return $out;
396	} else {
397	# FIXME this should probably be in alphabetical order instead of hash order
398	values %{$h};
399	}
400	}
401	}
402
403	=head2 rec1
404
405	Return all values in some field
406
407	@v = rec1('200')
408
409	TODO: order of values is probably same as in source data, need to investigate that
410
411	=cut
412
413	sub rec1 {
414	my $f = shift;
415	warn "rec1($f) = ", dump( $rec->{$f} ), $/ if ($debug > 1);
416	return unless (defined($rec) && defined($rec->{$f}));
417	warn "rec1($f) = ", dump( $rec->{$f} ), $/ if ($debug > 1);
418	if (ref($rec->{$f}) eq 'ARRAY') {
419	my @out;
420	foreach my $h ( @{ $rec->{$f} } ) {
421	if (ref($h) eq 'HASH') {
422	push @out, ( _pack_subfields_hash( $h ) );
423	} else {
424	push @out, $h;
425	}
426	}
427	return @out;
428	} elsif( defined($rec->{$f}) ) {
429	return $rec->{$f};
430	}
431	}
432
433	=head2 rec2
434
435	Return all values in specific field and subfield
436
437	@v = rec2('200','a')
438
439	=cut
440
441	sub rec2 {
442	my $f = shift;
443	return unless (defined($rec && $rec->{$f}));
444	my $sf = shift;
445	warn "rec2($f,$sf) = ", dump( $rec->{$f} ), $/ if ($debug > 1);
446	return map {
447	if (ref($_->{$sf}) eq 'ARRAY') {
448	@{ $_->{$sf} };
449	} else {
450	$_->{$sf};
451	}
452	} grep { ref($_) eq 'HASH' && $_->{$sf} } @{ $rec->{$f} };
453	}
454
455	=head2 rec
456
457	syntaxtic sugar for
458
459	@v = rec('200')
460	@v = rec('200','a')
461
462	If rec() returns just single value, it will
463	return scalar, not array.
464
465	=cut
466
467	sub rec {
468	my @out;
469	if ($#_ == 0) {
470	@out = rec1(@_);
471	} elsif ($#_ == 1) {
472	@out = rec2(@_);
473	}
474	if ($#out == 0 && ! wantarray) {
475	return $out[0];
476	} elsif (@out) {
477	return @out;
478	} else {
479	return '';
480	}
481	}
482
483	=head2 frec
484
485	Returns first value from field
486
487	$v = frec('200');
488	$v = frec('200','a');
489
490	=cut
491
492	sub frec {
493	my @out = rec(@_);
494	warn "rec(",dump(@_),") has more than one return value, ignoring\n" if $#out > 0;
495	return shift @out;
496	}
497
498	=head2 frec_eq
499
500	=head2 frec_ne
501
502	Check if first values from two fields are same or different
503
504	if ( frec_eq( 900 => 'a', 910 => 'c' ) ) {
505	# values are same
506	} else {
507	# values are different
508	}
509
510	Strictly speaking C<frec_eq> and C<frec_ne> wouldn't be needed if you
511	could write something like:
512
513	if ( frec( '900','a' ) eq frec( '910','c' ) ) {
514	# yada tada
515	}
516
517	but you can't since our parser L<WebPAC::Parser> will remove all whitespaces
518	in order to parse text and create invalid function C<eqfrec>.
519
520	=cut
521
522	sub frec_eq {
523	my ( $f1,$sf1, $f2, $sf2 ) = @_;
524	return (rec( $f1, $sf1 ))[0] eq (rec( $f2, $sf2 ))[0];
525	}
526
527	sub frec_ne {
528	return ! frec_eq( @_ );
529	}
530
531	=head2 regex
532
533	Apply regex to some or all values
534
535	@v = regex( 's/foo/bar/g', @v );
536
537	=cut
538
539	sub regex {
540	my $r = shift;
541	my @out;
542	#warn "r: $r\n", dump(\@_);
543	foreach my $t (@_) {
544	next unless ($t);
545	eval "\$t =~ $r";
546	push @out, $t if ($t && $t ne '');
547	}
548	return @out;
549	}
550
551	=head2 prefix
552
553	Prefix all values with a string
554
555	@v = prefix( 'my_', @v );
556
557	=cut
558
559	sub prefix {
560	my $p = shift;
561	return @_ unless defined( $p );
562	return map { $p . $_ } grep { defined($_) } @_;
563	}
564
565	=head2 suffix
566
567	suffix all values with a string
568
569	@v = suffix( '_my', @v );
570
571	=cut
572
573	sub suffix {
574	my $s = shift;
575	return @_ unless defined( $s );
576	return map { $_ . $s } grep { defined($_) } @_;
577	}
578
579	=head2 surround
580
581	surround all values with a two strings
582
583	@v = surround( 'prefix_', '_suffix', @v );
584
585	=cut
586
587	sub surround {
588	my $p = shift;
589	my $s = shift;
590	$p = '' unless defined( $p );
591	$s = '' unless defined( $s );
592	return map { $p . $_ . $s } grep { defined($_) } @_;
593	}
594
595	=head2 first
596
597	Return first element
598
599	$v = first( @v );
600
601	=cut
602
603	sub first {
604	my $r = shift;
605	return $r;
606	}
607
608	=head2 lookup
609
610	Consult lookup hashes for some value
611
612	@v = lookup(
613	sub {
614	'ffkk/peri/mfn'.rec('000')
615	},
616	'ffkk','peri','200-a-200-e',
617	sub {
618	first(rec(200,'a')).' '.first(rec('200','e'))
619	}
620	);
621
622	Code like above will be B<automatically generated> using L<WebPAC::Parse> from
623	normal lookup definition in C<conf/lookup/something.pl> which looks like:
624
625	lookup(
626	# which results to return from record recorded in lookup
627	sub { 'ffkk/peri/mfn' . rec('000') },
628	# from which database and input
629	'ffkk','peri',
630	# such that following values match
631	sub { first(rec(200,'a')) . ' ' . first(rec('200','e')) },
632	# if this part is missing, we will try to match same fields
633	# from lookup record and current one, or you can override
634	# which records to use from current record using
635	sub { rec('900','x') . ' ' . rec('900','y') },
636	)
637
638	You can think about this lookup as SQL (if that helps):
639
640	select
641	sub { what }
642	from
643	database, input
644	where
645	sub { filter from lookuped record }
646	having
647	sub { optional filter on current record }
648
649	Easy as pie, right?
650
651	=cut
652
653	sub lookup {
654	my ($what, $database, $input, $key, $having) = @_;
655
656	confess "lookup needs 5 arguments: what, database, input, key, having\n" unless ($#_ == 4);
657
658	warn "## lookup ($database, $input, $key)", $/ if ($debug > 1);
659	return unless (defined($lookup->{$database}->{$input}->{$key}));
660
661	confess "lookup really need load_row_coderef added to data_structure\n" unless ($load_row_coderef);
662
663	my $mfns;
664	my @having = $having->();
665
666	warn "## having = ", dump( @having ) if ($debug > 2);
667
668	foreach my $h ( @having ) {
669	if (defined($lookup->{$database}->{$input}->{$key}->{$h})) {
670	warn "lookup for $database/$input/$key/$h return ",dump($lookup->{$database}->{$input}->{$key}->{$h}),"\n" if ($debug);
671	$mfns->{$_}++ foreach keys %{ $lookup->{$database}->{$input}->{$key}->{$h} };
672	}
673	}
674
675	return unless ($mfns);
676
677	my @mfns = sort keys %$mfns;
678
679	warn "# lookup loading $database/$input/$key mfn ", join(",",@mfns)," having ",dump(@having),"\n" if ($debug);
680
681	my $old_rec = $rec;
682	my @out;
683
684	foreach my $mfn (@mfns) {
685	$rec = $load_row_coderef->( $database, $input, $mfn );
686
687	warn "got $database/$input/$mfn = ", dump($rec), $/ if ($debug);
688
689	my @vals = $what->();
690
691	push @out, ( @vals );
692
693	warn "lookup for mfn $mfn returned ", dump(@vals), $/ if ($debug);
694	}
695
696	# if (ref($lookup->{$k}) eq 'ARRAY') {
697	# return @{ $lookup->{$k} };
698	# } else {
699	# return $lookup->{$k};
700	# }
701
702	$rec = $old_rec;
703
704	warn "## lookup returns = ", dump(@out), $/ if ($debug);
705
706	if ($#out == 0) {
707	return $out[0];
708	} else {
709	return @out;
710	}
711	}
712
713	=head2 save_into_lookup
714
715	Save value into lookup. It associates current database, input
716	and specific keys with one or more values which will be
717	associated over MFN.
718
719	MFN will be extracted from first occurence current of field 000
720	in current record, or if it doesn't exist from L<_set_config> C<_mfn>.
721
722	my $nr = save_into_lookup($database,$input,$key,sub {
723	# code which produce one or more values
724	});
725
726	It returns number of items saved.
727
728	This function shouldn't be called directly, it's called from code created by
729	L<WebPAC::Parser>.
730
731	=cut
732
733	sub save_into_lookup {
734	my ($database,$input,$key,$coderef) = @_;
735	die "save_into_lookup needs database" unless defined($database);
736	die "save_into_lookup needs input" unless defined($input);
737	die "save_into_lookup needs key" unless defined($key);
738	die "save_into_lookup needs CODE" unless ( defined($coderef) && ref($coderef) eq 'CODE' );
739
740	warn "## save_into_lookup rec = ", dump($rec), " config = ", dump($config), $/ if ($debug > 2);
741
742	my $mfn =
743	defined($rec->{'000'}->[0]) ? $rec->{'000'}->[0] :
744	defined($config->{_mfn}) ? $config->{_mfn} :
745	die "mfn not defined or zero";
746
747	my $nr = 0;
748
749	foreach my $v ( $coderef->() ) {
750	$lookup->{$database}->{$input}->{$key}->{$v}->{$mfn}++;
751	warn "# saved lookup $database/$input/$key [$v] $mfn\n" if ($debug > 1);
752	$nr++;
753	}
754
755	return $nr;
756	}
757
758	=head2 config
759
760	Consult config values stored in C<config.yml>
761
762	# return database code (key under databases in yaml)
763	$database_code = config(); # use _ from hash
764	$database_name = config('name');
765	$database_input_name = config('input name');
766
767	Up to three levels are supported.
768
769	=cut
770
771	sub config {
772	return unless ($config);
773
774	my $p = shift;
775
776	$p \|\|= '';
777
778	my $v;
779
780	warn "### getting config($p)\n" if ($debug > 1);
781
782	my @p = split(/\s+/,$p);
783	if ($#p < 0) {
784	$v = $config->{ '_' }; # special, database code
785	} else {
786
787	my $c = dclone( $config );
788
789	foreach my $k (@p) {
790	warn "### k: $k c = ",dump($c),$/ if ($debug > 1);
791	if (ref($c) eq 'ARRAY') {
792	$c = shift @$c;
793	warn "config($p) taking first occurence of '$k', probably not what you wanted!\n";
794	last;
795	}
796
797	if (! defined($c->{$k}) ) {
798	$c = undef;
799	last;
800	} else {
801	$c = $c->{$k};
802	}
803	}
804	$v = $c if ($c);
805
806	}
807
808	warn "## config( '$p' ) = ",dump( $v ),$/ if ($v && $debug);
809	warn "config( '$p' ) is empty\n" if (! $v);
810
811	return $v;
812	}
813
814	=head2 id
815
816	Returns unique id of this record
817
818	$id = id();
819
820	Returns C<42/2> for 2nd occurence of MFN 42.
821
822	=cut
823
824	sub id {
825	my $mfn = $config->{_mfn} \|\| die "no _mfn in config data";
826	return $mfn . ( WebPAC::Normalize::MARC::_created_marc_records() \|\| '' );
827	}
828
829	=head2 join_with
830
831	Joins walues with some delimiter
832
833	$v = join_with(", ", @v);
834
835	=cut
836
837	sub join_with {
838	my $d = shift;
839	warn "### join_with('$d',",dump(@_),")\n" if ($debug > 2);
840	my $v = join($d, grep { defined($_) && $_ ne '' } @_);
841	return '' unless defined($v);
842	return $v;
843	}
844
845	=head2 split_rec_on
846
847	Split record subfield on some regex and take one of parts out
848
849	$a_before_semi_column =
850	split_rec_on('200','a', /\s;\s/, $part);
851
852	C<$part> is optional number of element. First element is
853	B<1>, not 0!
854
855	If there is no C<$part> parameter or C<$part> is 0, this function will
856	return all values produced by splitting.
857
858	=cut
859
860	sub split_rec_on {
861	die "split_rec_on need (fld,sf,regex[,part]" if ($#_ < 2);
862
863	my ($fld, $sf, $regex, $part) = @_;
864	warn "### regex ", ref($regex), $regex, $/ if ($debug > 2);
865
866	my @r = rec( $fld, $sf );
867	my $v = shift @r;
868	warn "### first rec($fld,$sf) = ",dump($v),$/ if ($debug > 2);
869
870	return '' if ( ! defined($v) \|\| $v =~ /^\s*$/);
871
872	my @s = split( $regex, $v );
873	warn "## split_rec_on($fld,$sf,$regex,$part) = ",dump(@s),$/ if ($debug > 1);
874	if ($part && $part > 0) {
875	return $s[ $part - 1 ];
876	} else {
877	return @s;
878	}
879	}
880
881	my $hash;
882
883	=head2 set
884
885	set( key => 'value' );
886
887	=cut
888
889	sub set {
890	my ($k,$v) = @_;
891	warn "## set ( $k => ", dump($v), " )", $/ if ( $debug );
892	$hash->{$k} = $v;
893	};
894
895	=head2 get
896
897	get( 'key' );
898
899	=cut
900
901	sub get {
902	my $k = shift \|\| return;
903	my $v = $hash->{$k};
904	warn "## get $k = ", dump( $v ), $/ if ( $debug );
905	return $v;
906	}
907
908	=head2 count
909
910	if ( count( @result ) == 1 ) {
911	# do something if only 1 result is there
912	}
913
914	=cut
915
916	sub count {
917	warn "## count ",dump(@_),$/ if ( $debug );
918	return @_ . '';
919	}
920
921	# END
922	1;