lib/WebPAC/Normalize.pm

package WebPAC::Normalize;
use Exporter 'import';
our @EXPORT = qw/
        _set_ds _set_lookup
        _set_load_row
        _get_ds _clean_ds
        _debug
        _pack_subfields_hash

        to
        search_display search display sorted

        rec1 rec2 rec
        frec frec_eq frec_ne
        regex prefix suffix surround
        first lookup join_with
        save_into_lookup

        split_rec_on

        get set
        count

/;

use warnings;
use strict;

#use base qw/WebPAC::Common/;
use Data::Dump qw/dump/;
use Carp qw/confess/;

# debugging warn(s)
my $debug = 0;
_debug( $debug );

# FIXME
use WebPAC::Normalize::ISBN;
push @EXPORT, ( 'isbn_10', 'isbn_13' );

use WebPAC::Normalize::MARC;
push @EXPORT, ( qw/
        marc marc_indicators marc_repeatable_subfield
        marc_compose marc_leader marc_fixed
        marc_duplicate marc_remove marc_count
        marc_original_order
        marc_template
/);

=head1 NAME

WebPAC::Normalize - describe normalisation rules using sets

=cut

our $VERSION = '0.36';

=head1 SYNOPSIS

This module uses C<conf/normalize/*.pl> files to perform normalisation
from input records using perl functions which are specialized for set
processing.

Sets are implemented as arrays, and a normalisation file is valid perl, which
means that you can check its validity before running WebPAC using
C<perl -c normalize.pl>.

Normalisation can generate multiple output normalized data. For now, supported output
types (on the left side of definition) are: C<search_display>, C<display>, C<search> and
C<marc>.

=head1 FUNCTIONS

Functions which start with C<_> are private and used by WebPAC internally.
All other functions are available for use within normalisation rules.

=head2 data_structure

Return data structure

  my $ds = WebPAC::Normalize::data_structure(
        lookup => $lookup_hash,
        row => $row,
        rules => $normalize_pl_config,
        marc_encoding => 'utf-8',
        config => $config,
        load_row_coderef => sub {
                my ($database,$input,$mfn) = @_;
                $store->load_row( database => $database, input => $input, id => $mfn );
        },
  );

Options C<row> and C<rules> are mandatory while all
others are optional.

C<load_row_coderef> is closure only used when executing lookups, so they will
die if it's not defined.

This function will B<die> if the normalisation rules can't be evaled.

Since this function isn't exported you have to call it with 
C<WebPAC::Normalize::data_structure>.

=cut

my $load_row_coderef;

# Normalise one input row: install the row (and, when given, lookup, config
# and load_row_coderef) as current state, clear the previous output, eval the
# rules source text and return the resulting data structure.
#
# Arguments are name => value pairs. row and rules are mandatory; lookup,
# config and load_row_coderef are optional and, when omitted, leave whatever
# was installed earlier in place. Dies when row or rules is missing/false or
# when evaluating the rules raises an error.
sub data_structure {
        my $arg = {@_};

        die "need row argument" unless ($arg->{row});
        die "need normalisation argument" unless ($arg->{rules});

        _set_lookup( $arg->{lookup} ) if defined($arg->{lookup});
        _set_ds( $arg->{row} );
        _set_config( $arg->{config} ) if defined($arg->{config});
        _clean_ds( %{ $arg } );

        # keep a loader installed through _set_load_row() unless the caller
        # explicitly supplies a new one (same rule as lookup and config above)
        $load_row_coderef = $arg->{load_row_coderef} if defined($arg->{load_row_coderef});

        # rules are compiled as perl source with relaxed strictures; being a
        # string eval, the evaluated code can see the lexicals in scope here
        no strict 'subs';
        no warnings 'redefine';
        eval "$arg->{rules};";
        die "error evaling $arg->{rules}: $@\n" if ($@);

        return _get_ds();
}

=head2 _set_ds

Set current record hash

  _set_ds( $rec );

=cut

my $rec;

# Install the record hash used by all rec*() helpers and mirror it into
# WebPAC::Normalize::MARC. Dies when no (true) record is passed.
sub _set_ds {
        my ($record) = @_;
        die "no record hash" unless ($rec = $record);
        return $WebPAC::Normalize::MARC::rec = $rec;
}

=head2 _get_rec

Return current record hash

  my $rec = _get_rec();

=cut

# Return the record hash currently installed by _set_ds().
sub _get_rec {
        return $rec;
}

=head2 _set_config

Set current config hash

  _set_config( $config );

Magic keys are:

=over 4

=item _

Code of current database

=item _mfn

Current MFN

=back

=cut

my $config;

# Install the config hash consulted by config(), id() and
# save_into_lookup(); returns the value that was stored.
sub _set_config {
        my ($new_config) = @_;
        return $config = $new_config;
}

=head2 _get_ds

Return hash formatted as data structure

  my $ds = _get_ds();

=cut

my $out;

# Return the output data structure collected so far (undef when nothing
# has been stored since the last _clean_ds()).
sub _get_ds {
        #warn "## out = ",dump($out);
        $out;
}

=head2 _clean_ds

Clean data structure hash for next record

  _clean_ds();

=cut

# Forget the output collected for the previous record and reset the MARC
# helper state. Arguments are accepted for backward compatibility but are
# not used.
sub _clean_ds {
        $out = undef;
        WebPAC::Normalize::MARC::_clean();
}

=head2 _set_lookup

Set current lookup hash

  _set_lookup( $lookup );

=cut

my $lookup;

# Install the lookup hash used by lookup() and save_into_lookup();
# returns the value that was stored.
sub _set_lookup {
        my ($new_lookup) = @_;
        return $lookup = $new_lookup;
}

=head2 _get_lookup

Get current lookup hash

  my $lookup = _get_lookup();

=cut

# Return the lookup hash currently in use.
sub _get_lookup { $lookup }

=head2 _set_load_row

Setup code reference which will return L<data_structure> from
L<WebPAC::Store>

  _set_load_row(sub {
                my ($database,$input,$mfn) = @_;
                $store->load_row( database => $database, input => $input, id => $mfn );
  });

=cut

# Install the code reference which lookup() calls as
# $coderef->( $database, $input, $mfn ) to load a row.
# Confesses unless the argument is a CODE reference.
sub _set_load_row {
        my ($coderef) = @_;

        if (ref($coderef) ne 'CODE') {
                confess "argument isn't CODE";
        }

        return $load_row_coderef = $coderef;
}

=head2 _debug

Change level of debug warnings

  _debug( 2 );

=cut

# Get or set the debug level. Without an argument the current level is
# returned; with one, the level is stored here and in
# WebPAC::Normalize::MARC, and announced with a warning when positive.
sub _debug {
        my ($level) = @_;

        if (! defined($level)) {
                return $debug;
        }

        if ($level > 0) {
                warn "debug level $level", $/;
        }

        $debug = $level;
        return $WebPAC::Normalize::MARC::debug = $debug;
}

=head1 Functions to create C<data_structure>

Those functions generally have to come first in your normalization file.

=head2 to

Generic way to set values for some name

  to('display', 'Title' => rec('200','a') );

There are many helpers defined below which might be easier to use.

=cut

# Store values in the output under $out->{$name}->{$type}.
# The first argument is the output type, the second the field name and
# the rest are values; undefined and empty-string values are dropped.
# Returns nothing when no value remains, otherwise the stored array
# reference. Confesses when type or name is missing or false.
sub to {
        my ($type, $name, @values) = @_;
        confess "need type -- BUG?" unless ($type);
        confess "needs name as first argument" unless ($name);

        my @kept = grep { ! ( ! defined($_) || $_ eq '' ) } @values;
        return if (! @kept);

        return $out->{$name}->{$type} = \@kept;
}

=head2 search_display

Define output for L<search> and L<display> at the same time

  search_display('Title', rec('200','a') );

=cut

# Store the non-empty values for $name under both the 'search' and the
# 'display' output type. Both slots receive the very same array
# reference. Dies when the name is missing or false; returns nothing
# when no value remains.
sub search_display {
        my ($name, @values) = @_;
        die "search_display needs name as first argument" unless ($name);

        my @kept = grep { ! ( ! defined($_) || $_ eq '' ) } @values;
        return if (! @kept);

        my $shared = \@kept;
        $out->{$name}->{$_} = $shared foreach (qw/search display/);
        return $shared;
}

=head2 tag

Old name for L<search_display>, it will probably be removed at one point.

=cut

# Deprecated alias: forwards all arguments to search_display().
sub tag { return search_display( @_ ) }

=head2 display

Define output just for I<display>

  @v = display('Title', rec('200','a') );

=cut

# Shortcut for to() with the output type fixed to 'display'.
sub display {
        return to( 'display', @_ );
}

=head2 search

Prepare values just for I<search>

  @v = search('Title', rec('200','a') );

=cut

# Shortcut for to() with the output type fixed to 'search'.
sub search {
        return to( 'search', @_ );
}

=head2 sorted

Insert into lists which will be automatically sorted

 sorted('Title', rec('200','a') );

=cut

# Shortcut for to() with the output type fixed to 'sorted'.
sub sorted {
        return to( 'sorted', @_ );
}


=head1 Functions to extract data from input

These functions should be used inside the functions which create the
C<data_structure> described above.

=head2 _pack_subfields_hash

 @subfields = _pack_subfields_hash( $h );
 $subfields = _pack_subfields_hash( $h, 1 );

Return each subfield value in array or pack them all together and return scalar
with subfields (denoted by C<^>) and values.

=cut

# Flatten one field hash (subfield name => value, or array of values).
#
# Arguments: the field hash and an optional include_subfields flag.
# Without the flag the subfield values are returned as a list; with it a
# single string is returned in which every value is preceded by '^' and
# its subfield name. Anything which is not a HASH reference is returned
# unchanged.
#
# When the hash has a 'subfields' entry it is read as a flat list of
# ( name, occurrence index ) pairs which defines the output order;
# otherwise values are emitted sorted by subfield name. The hash passed in
# is not modified by reading that entry, so repeated calls on the same
# record give the same result.
sub _pack_subfields_hash {

        warn "## _pack_subfields_hash( ",dump(@_), " )\n" if ($debug > 1);

        my ($h,$include_subfields) = @_;

        # sanity and ease of use
        return $h if (ref($h) ne 'HASH');

        if ( defined($h->{subfields}) ) {
                my $sfs = $h->{subfields} || die "no subfields?";

                # walk a copy: consuming the original array would destroy
                # the ordering information stored in the record
                my @pairs = @$sfs;
                my @out;
                while (@pairs) {
                        my ($sf, $o) = splice(@pairs, 0, 2);
                        push @out, '^' . $sf if ($include_subfields);
                        if ($o == 0 && ref( $h->{$sf} ) ne 'ARRAY' ) {
                                # single element subfields are not arrays
                                push @out, $h->{$sf};
                        } else {
                                push @out, $h->{$sf}->[$o];
                        }
                }
                return $include_subfields ? join('', @out) : @out;
        }

        if ($include_subfields) {
                my $packed = '';
                foreach my $sf (sort keys %$h) {
                        if (ref($h->{$sf}) eq 'ARRAY') {
                                $packed .= '^' . $sf . join('^' . $sf, @{ $h->{$sf} });
                        } else {
                                $packed .= '^' . $sf . $h->{$sf};
                        }
                }
                return $packed;
        }

        # no ordering information available: use subfield name order so the
        # result does not depend on perl's hash ordering
        return map { $h->{$_} } sort keys %$h;
}

=head2 rec1

Return all values in some field

  @v = rec1('200')

TODO: order of values is probably same as in source data, need to investigate that

=cut

# Return all values of field $f from the current record.
# For a field stored as an array every element is returned; elements which
# are hashes are expanded through _pack_subfields_hash(). A field stored as
# anything else is returned as is. Returns nothing when there is no current
# record or the field is not defined.
sub rec1 {
        my ($field) = @_;
        warn "rec1($field) = ", dump( $rec->{$field} ), $/ if ($debug > 1);

        if (! defined($rec) || ! defined($rec->{$field})) {
                return;
        }
        warn "rec1($field) = ", dump( $rec->{$field} ), $/ if ($debug > 1);

        my $value = $rec->{$field};
        return $value if (ref($value) ne 'ARRAY');

        return map {
                ref($_) eq 'HASH' ? _pack_subfields_hash( $_ ) : $_
        } @{ $value };
}

=head2 rec2

Return all values in specific field and subfield

  @v = rec2('200','a')

=cut

# Return all values of subfield $sf from every occurrence of field $f in the
# current record. Occurrences which are not hashes, or which don't define
# the subfield, are skipped; a subfield stored as an array contributes all
# of its elements. Returns nothing when the record or field is missing.
sub rec2 {
        my ($f, $sf) = @_;
        return unless (defined($rec && $rec->{$f}));

        warn "rec2($f,$sf) = ", dump( $rec->{$f} ), $/ if ($debug > 1);

        my @found;
        foreach my $occurrence ( @{ $rec->{$f} } ) {
                next unless (ref($occurrence) eq 'HASH' && defined $occurrence->{$sf});
                my $value = $occurrence->{$sf};
                push @found, ( ref($value) eq 'ARRAY' ? @{ $value } : $value );
        }
        return @found;
}

=head2 rec

Syntactic sugar for

  @v = rec('200')
  @v = rec('200','a')

If rec() returns just single value, it will
return scalar, not array.

=cut

# Dispatch to rec1() (one argument) or rec2() (two arguments).
# Returns an empty string when nothing was found, the value itself when
# exactly one value was found and the caller is not in list context, and
# the list of values otherwise.
sub rec {
        my $argc = scalar @_;
        my @values =
                  $argc == 1 ? rec1(@_)
                : $argc == 2 ? rec2(@_)
                :              ();

        return ''         unless (@values);
        return $values[0] if (@values == 1 && ! wantarray);
        return @values;
}

=head2 frec

Returns first value from field

  $v = frec('200');
  $v = frec('200','a');

=cut

# Return only the first value produced by rec() for the same arguments,
# warning when further values had to be ignored.
sub frec {
        my ($first_value, @ignored) = rec(@_);
        if (@ignored) {
                warn "rec(",dump(@_),") has more than one return value, ignoring\n";
        }
        return $first_value;
}

=head2 frec_eq

=head2 frec_ne

Check if first values from two fields are same or different

  if ( frec_eq( 900 => 'a', 910 => 'c' ) ) {
        # values are same
  } else {
    # values are different
  }

Strictly speaking C<frec_eq> and C<frec_ne> wouldn't be needed if you
could write something like:

  if ( frec( '900','a' ) eq frec( '910','c' ) ) {
        # yada tada
  }

but you can't since our parser L<WebPAC::Parser> will remove all whitespaces
in order to parse text and create invalid function C<eqfrec>.

=cut

# True when the first value of field/subfield ( $f1, $sf1 ) is string-equal
# to the first value of ( $f2, $sf2 ).
sub frec_eq {
        my ( $f1,$sf1, $f2, $sf2 ) = @_;
        my ($left)  = rec( $f1, $sf1 );
        my ($right) = rec( $f2, $sf2 );
        return $left eq $right;
}

# Logical negation of frec_eq() for the same arguments.
sub frec_ne {
        my $same = frec_eq( @_ );
        return ! $same;
}

=head2 regex

Apply regex to some or all values

  @v = regex( 's/foo/bar/g', @v );

=cut

# Apply a regex operation, given as perl source text such as 's/foo/bar/g',
# to every value and return the results.
#
# False values (undef, '', 0) are skipped on input and dropped from the
# result. Each value is processed on a private copy, so the caller's
# variables are never modified and read-only arguments (string literals)
# can be processed. A failing operation is reported with a warning and
# leaves that value unchanged.
#
# NOTE(review): $r is compiled with string eval, so it must only ever come
# from trusted normalisation rules, never from record data.
sub regex {
        my $r = shift;
        my @out;
        #warn "r: $r\n", dump(\@_);
        foreach my $value (@_) {
                next unless ($value);

                # the evaluated source refers to $t, which must be a copy:
                # the loop variable is an alias to the caller's argument
                my $t = $value;
                eval "\$t =~ $r";
                warn "regex( '$r' ) failed: $@" if ($@);

                push @out, $t if ($t);
        }
        return @out;
}

=head2 prefix

Prefix all values with a string

  @v = prefix( 'my_', @v );

=cut

# Prepend a string to every defined value; undefined values are dropped.
# With an undefined prefix the values are returned untouched.
sub prefix {
        my ($head, @values) = @_;
        return @values unless defined( $head );

        my @prefixed;
        foreach my $value (@values) {
                next unless defined($value);
                push @prefixed, $head . $value;
        }
        return @prefixed;
}

=head2 suffix

suffix all values with a string

  @v = suffix( '_my', @v );

=cut

# Append a string to every defined value; undefined values are dropped.
# With an undefined suffix the values are returned untouched.
sub suffix {
        my ($tail, @values) = @_;
        return @values unless defined( $tail );

        my @suffixed;
        foreach my $value (@values) {
                next unless defined($value);
                push @suffixed, $value . $tail;
        }
        return @suffixed;
}

=head2 surround

Surround all values with two strings

  @v = surround( 'prefix_', '_suffix', @v );

=cut

# Wrap every defined value between a leading and a trailing string;
# undefined values are dropped and an undefined leading or trailing string
# counts as empty.
sub surround {
        my ($head, $tail, @values) = @_;
        $head = defined( $head ) ? $head : '';
        $tail = defined( $tail ) ? $tail : '';

        my @wrapped;
        foreach my $value (@values) {
                next unless defined($value);
                push @wrapped, $head . $value . $tail;
        }
        return @wrapped;
}

=head2 first

Return first element

  $v = first( @v );

=cut

# Return the first argument (undef when called without arguments).
sub first {
        my ($head) = @_;
        return $head;
}

=head2 lookup

Consult lookup hashes for some value

  @v = lookup(
        sub {
                'ffkk/peri/mfn'.rec('000')
        },
        'ffkk','peri','200-a-200-e',
        sub {
                first(rec(200,'a')).' '.first(rec('200','e'))
        }
  );

Code like above will be B<automatically generated> using L<WebPAC::Parser> from
normal lookup definition in C<conf/lookup/something.pl> which looks like:

  lookup(
        # which results to return from record recorded in lookup
        sub { 'ffkk/peri/mfn' . rec('000') },
        # from which database and input
        'ffkk','peri',
        # such that following values match
        sub { first(rec(200,'a')) . ' ' . first(rec('200','e')) },
        # if this part is missing, we will try to match same fields
        # from lookup record and current one, or you can override
        # which records to use from current record using
        sub { rec('900','x') . ' ' . rec('900','y') },
  )

You can think about this lookup as SQL (if that helps):

  select
        sub { what }
  from
        database, input
  where
    sub { filter from lookuped record }
  having
    sub { optional filter on current record }

Easy as pie, right?

=cut

# Resolve values through the lookup hash.
#
# Takes exactly five arguments: $what and $having are code references,
# $database, $input and $key select a branch of the lookup hash. The values
# returned by $having->() are used as keys into that branch to collect MFNs;
# for every MFN (in string-sorted order) the row is loaded with the
# load_row coderef, temporarily installed as current record, and
# $what->() is called to produce output values. The previous current record
# is restored afterwards.
#
# Returns nothing when the branch or any matching MFN is missing, the single
# value when exactly one was produced, and the list of values otherwise.
# Confesses on a wrong argument count or when no load_row coderef is set.
sub lookup {
        confess "lookup needs 5 arguments: what, database, input, key, having\n" unless (scalar(@_) == 5);

        my ($what, $database, $input, $key, $having) = @_;

        warn "## lookup ($database, $input, $key)", $/ if ($debug > 1);
        return unless (defined($lookup->{$database}->{$input}->{$key}));
        my $by_value = $lookup->{$database}->{$input}->{$key};

        confess "lookup really need load_row_coderef added to data_structure\n" unless ($load_row_coderef);

        my @having = $having->();

        warn "## having = ", dump( @having ) if ($debug > 2);

        # collect the distinct MFNs recorded under any of the having values
        my %seen_mfn;
        foreach my $h ( @having ) {
                my $hits = $by_value->{$h};
                next unless (defined($hits));
                warn "lookup for $database/$input/$key/$h return ",dump($hits),"\n" if ($debug);
                $seen_mfn{$_}++ foreach keys %{ $hits };
        }

        return unless (%seen_mfn);

        my @mfns = sort keys %seen_mfn;

        warn "# lookup loading $database/$input/$key mfn ", join(",",@mfns)," having ",dump(@having),"\n" if ($debug);

        # $what reads the current record, so swap each loaded row in
        my $old_rec = $rec;
        my @out;

        foreach my $mfn (@mfns) {
                $rec = $load_row_coderef->( $database, $input, $mfn );

                warn "got $database/$input/$mfn = ", dump($rec), $/ if ($debug);

                my @vals = $what->();
                push @out, @vals;

                warn "lookup for mfn $mfn returned ", dump(@vals), $/ if ($debug);
        }

        $rec = $old_rec;

        warn "## lookup returns = ", dump(@out), $/ if ($debug);

        return @out == 1 ? $out[0] : @out;
}

=head2 save_into_lookup

Save value into lookup. It associates current database, input
and specific keys with one or more values which will be
associated over MFN.

MFN will be extracted from the first occurrence of field 000
in the current record, or if it doesn't exist from L<_set_config> C<_mfn>.

  my $nr = save_into_lookup($database,$input,$key,sub {
        # code which produce one or more values 
  });

It returns number of items saved.

This function shouldn't be called directly, it's called from code created by
L<WebPAC::Parser>. 

=cut

# Record the current row's MFN in the lookup hash under every value that
# $coderef returns: $lookup->{$database}->{$input}->{$key}->{$value}->{$mfn}
# is incremented for each of them.
#
# The MFN is the first element of field 000 of the current record when that
# is defined, otherwise the _mfn entry of the config hash. Dies when any
# argument is missing, when $coderef is not a CODE reference or when no MFN
# can be determined. Returns the number of values saved.
sub save_into_lookup {
        my ($database,$input,$key,$coderef) = @_;
        die "save_into_lookup needs database" unless defined($database);
        die "save_into_lookup needs input" unless defined($input);
        die "save_into_lookup needs key" unless defined($key);
        die "save_into_lookup needs CODE" unless ( defined($coderef) && ref($coderef) eq 'CODE' );

        warn "## save_into_lookup rec = ", dump($rec), " config = ", dump($config), $/ if ($debug > 2);

        # test the field before indexing into it: a bare
        # $rec->{'000'}->[0] would create an empty 000 field in the record
        my $mfn;
        if (defined($rec) && ref($rec->{'000'}) eq 'ARRAY' && defined($rec->{'000'}->[0])) {
                $mfn = $rec->{'000'}->[0];
        } elsif (defined($config) && defined($config->{_mfn})) {
                $mfn = $config->{_mfn};
        } else {
                die "mfn not defined or zero";
        }

        my $nr = 0;

        foreach my $v ( $coderef->() ) {
                $lookup->{$database}->{$input}->{$key}->{$v}->{$mfn}++;
                warn "# saved lookup $database/$input/$key [$v] $mfn\n" if ($debug > 1);
                $nr++;
        }

        return $nr;
}

=head2 config

Consult config values stored in C<config.yml>

  # return database code (key under databases in yaml)
  $database_code = config();    # use _ from hash
  $database_name = config('name');
  $database_input_name = config('input name');

Up to three levels are supported.

=cut

# Look up a value in the config hash.
#
# The argument is a whitespace separated path of keys. Without an argument
# (or with a false one) the value stored under '_' is returned. When an
# array is met while descending, its first element is taken, a warning is
# issued and the remaining keys are ignored. A missing key, or a path which
# descends into a plain scalar, yields undef.
#
# Returns nothing when no config is set. A false result is reported with a
# "is empty" warning.
sub config {
        return unless ($config);

        my $p = shift;

        $p ||= '';

        my $v;

        warn "### getting config($p)\n" if ($debug > 1);

        my @p = split(/\s+/,$p);
        if ($#p < 0) {
                $v = $config->{ '_' };  # special, database code
        } else {

                # walk a deep copy, because the ARRAY branch below shifts
                # the array it finds; Storable is loaded here since dclone
                # is not imported at file level
                require Storable;
                my $c = Storable::dclone( $config );

                foreach my $k (@p) {
                        warn "### k: $k c = ",dump($c),$/ if ($debug > 1);
                        if (ref($c) eq 'ARRAY') {
                                $c = shift @$c;
                                warn "config($p) taking first occurence of '$k', probably not what you wanted!\n";
                                last;
                        }

                        # a plain scalar can't be descended into
                        if (! ref($c) || ! defined($c->{$k}) ) {
                                $c = undef;
                                last;
                        }

                        $c = $c->{$k};
                }
                $v = $c if ($c);

        }

        warn "## config( '$p' ) = ",dump( $v ),$/ if ($v && $debug);
        warn "config( '$p' ) is empty\n" if (! $v);

        return $v;
}

=head2 id

Returns unique id of this record

  $id = id();

Returns C<42/2> for 2nd occurrence of MFN 42.

=cut

# Build the record id: the config _mfn followed by whatever
# WebPAC::Normalize::MARC::_created_marc_records() returns (nothing is
# appended when that is false). Dies when config has no true _mfn.
sub id {
        my $mfn = $config->{_mfn};
        die "no _mfn in config data" unless ($mfn);

        my $marc_part = WebPAC::Normalize::MARC::_created_marc_records() || '';
        return $mfn . $marc_part;
}

=head2 join_with

Joins values with some delimiter

  $v = join_with(", ", @v);

=cut

# Join all defined, non-empty values with the delimiter given as first
# argument; returns an empty string when nothing is left to join.
sub join_with {
        my ($d, @values) = @_;
        warn "### join_with('$d',",dump(@values),")\n" if ($debug > 2);

        my @kept = grep { ! ( ! defined($_) || $_ eq '' ) } @values;
        my $joined = join($d, @kept);

        return defined($joined) ? $joined : '';
}

=head2 split_rec_on

Split record subfield on some regex and take one of parts out

  $a_before_semi_column =
        split_rec_on('200','a', qr/\s*;\s*/, $part);

C<$part> is optional number of element. First element is
B<1>, not 0!

If there is no C<$part> parameter or C<$part> is 0, this function will
return all values produced by splitting.

=cut

# Split the first value of field $fld, subfield $sf on $regex.
# With a positive $part the $part-th piece (counting from 1) is returned,
# otherwise all pieces. Returns an empty string when the value is undefined
# or contains only whitespace. Dies when fewer than three arguments are
# passed.
sub split_rec_on {
        die "split_rec_on need (fld,sf,regex[,part]" if (scalar(@_) < 3);

        my ($fld, $sf, $regex, $part) = @_;
        warn "### regex ", ref($regex), $regex, $/ if ($debug > 2);

        # only the first value returned by rec() is considered
        my ($v) = rec( $fld, $sf );
        warn "### first rec($fld,$sf) = ",dump($v),$/ if ($debug > 2);

        if ( ! defined($v) || $v =~ /^\s*$/ ) {
                return '';
        }

        my @s = split( $regex, $v );
        warn "## split_rec_on($fld,$sf,$regex,$part) = ",dump(@s),$/ if ($debug > 1);

        return @s unless ($part && $part > 0);
        return $s[ $part - 1 ];
}

my $hash;

=head2 set

  set( key => 'value' );

=cut

# Remember a value under a key for later retrieval with get();
# returns the stored value.
sub set {
        my ($key, $value) = @_;
        if ( $debug ) {
                warn "## set ( $key => ", dump($value), " )", $/;
        }
        return $hash->{$key} = $value;
}

=head2 get

  get( 'key' );

=cut

# Return the value stored under a key by set(); returns nothing when the
# key is missing or false.
sub get {
        my ($key) = @_;
        return unless ($key);

        my $value = $hash->{$key};
        if ( $debug ) {
                warn "## get $key = ", dump( $value ), $/;
        }
        return $value;
}

=head2 count

  if ( count( @result ) == 1 ) {
        # do something if only 1 result is there
  }

=cut

# Return the number of arguments, as a string.
sub count {
        warn "## count ",dump(@_),$/ if ( $debug );
        my $how_many = scalar @_;
        return "$how_many";
}

# END
1;
1	package WebPAC::Normalize;
2	use Exporter 'import';
3	our @EXPORT = qw/
4	_set_ds _set_lookup
5	_set_load_row
6	_get_ds _clean_ds
7	_debug
8	_pack_subfields_hash
9
10	to
11	search_display search display sorted
12
13	rec1 rec2 rec
14	frec frec_eq frec_ne
15	regex prefix suffix surround
16	first lookup join_with
17	save_into_lookup
18
19	split_rec_on
20
21	get set
22	count
23
24	/;
25
26	use warnings;
27	use strict;
28
29	#use base qw/WebPAC::Common/;
30	use Data::Dump qw/dump/;
31	use Carp qw/confess/;
32
33	# debugging warn(s)
34	my $debug = 0;
35	_debug( $debug );
36
37	# FIXME
38	use WebPAC::Normalize::ISBN;
39	push @EXPORT, ( 'isbn_10', 'isbn_13' );
40
41	use WebPAC::Normalize::MARC;
42	push @EXPORT, ( qw/
43	marc marc_indicators marc_repeatable_subfield
44	marc_compose marc_leader marc_fixed
45	marc_duplicate marc_remove marc_count
46	marc_original_order
47	marc_template
48	/);
49
50	=head1 NAME
51
52	WebPAC::Normalize - describe normalisaton rules using sets
53
54	=cut
55
56	our $VERSION = '0.36';
57
58	=head1 SYNOPSIS
59
60	This module uses C<conf/normalize/*.pl> files to perform normalisation
61	from input records using perl functions which are specialized for set
62	processing.
63
64	Sets are implemented as arrays, and normalisation file is valid perl, which
65	means that you check it's validity before running WebPAC using
66	C<perl -c normalize.pl>.
67
68	Normalisation can generate multiple output normalized data. For now, supported output
69	types (on the left side of definition) are: C<search_display>, C<display>, C<search> and
70	C<marc>.
71
72	=head1 FUNCTIONS
73
74	Functions which start with C<_> are private and used by WebPAC internally.
75	All other functions are available for use within normalisation rules.
76
77	=head2 data_structure
78
79	Return data structure
80
81	my $ds = WebPAC::Normalize::data_structure(
82	lookup => $lookup_hash,
83	row => $row,
84	rules => $normalize_pl_config,
85	marc_encoding => 'utf-8',
86	config => $config,
87	load_row_coderef => sub {
88	my ($database,$input,$mfn) = @_;
89	$store->load_row( database => $database, input => $input, id => $mfn );
90	},
91	);
92
93	Options C<row>, C<rules> and C<log> are mandatory while all
94	other are optional.
95
96	C<load_row_coderef> is closure only used when executing lookups, so they will
97	die if it's not defined.
98
99	This function will B<die> if normalizastion can't be evaled.
100
101	Since this function isn't exported you have to call it with
102	C<WebPAC::Normalize::data_structure>.
103
104	=cut
105
106	my $load_row_coderef;
107
108	sub data_structure {
109	my $arg = {@_};
110
111	die "need row argument" unless ($arg->{row});
112	die "need normalisation argument" unless ($arg->{rules});
113
114	_set_lookup( $arg->{lookup} ) if defined($arg->{lookup});
115	_set_ds( $arg->{row} );
116	_set_config( $arg->{config} ) if defined($arg->{config});
117	_clean_ds( %{ $arg } );
118	$load_row_coderef = $arg->{load_row_coderef};
119
120	no strict 'subs';
121	no warnings 'redefine';
122	eval "$arg->{rules};";
123	die "error evaling $arg->{rules}: $@\n" if ($@);
124
125	return _get_ds();
126	}
127
128	=head2 _set_ds
129
130	Set current record hash
131
132	_set_ds( $rec );
133
134	=cut
135
136	my $rec;
137
138	sub _set_ds {
139	$rec = shift or die "no record hash";
140	$WebPAC::Normalize::MARC::rec = $rec;
141	}
142
143	=head2
144
145	my $rec = _get_rec();
146
147	=cut
148
149	sub _get_rec { $rec };
150
151	=head2 _set_config
152
153	Set current config hash
154
155	_set_config( $config );
156
157	Magic keys are:
158
159	=over 4
160
161	=item _
162
163	Code of current database
164
165	=item _mfn
166
167	Current MFN
168
169	=back
170
171	=cut
172
173	my $config;
174
175	sub _set_config {
176	$config = shift;
177	}
178
179	=head2 _get_ds
180
181	Return hash formatted as data structure
182
183	my $ds = _get_ds();
184
185	=cut
186
187	my $out;
188
189	sub _get_ds {
190	#warn "## out = ",dump($out);
191	return $out;
192	}
193
194	=head2 _clean_ds
195
196	Clean data structure hash for next record
197
198	_clean_ds();
199
200	=cut
201
202	sub _clean_ds {
203	my $a = {@_};
204	$out = undef;
205	WebPAC::Normalize::MARC::_clean();
206	}
207
208	=head2 _set_lookup
209
210	Set current lookup hash
211
212	_set_lookup( $lookup );
213
214	=cut
215
216	my $lookup;
217
218	sub _set_lookup {
219	$lookup = shift;
220	}
221
222	=head2 _get_lookup
223
224	Get current lookup hash
225
226	my $lookup = _get_lookup();
227
228	=cut
229
230	sub _get_lookup {
231	return $lookup;
232	}
233
234	=head2 _set_load_row
235
236	Setup code reference which will return L<data_structure> from
237	L<WebPAC::Store>
238
239	_set_load_row(sub {
240	my ($database,$input,$mfn) = @_;
241	$store->load_row( database => $database, input => $input, id => $mfn );
242	});
243
244	=cut
245
246	sub _set_load_row {
247	my $coderef = shift;
248	confess "argument isn't CODE" unless ref($coderef) eq 'CODE';
249
250	$load_row_coderef = $coderef;
251	}
252
253	=head2 _debug
254
255	Change level of debug warnings
256
257	_debug( 2 );
258
259	=cut
260
261	sub _debug {
262	my $l = shift;
263	return $debug unless defined($l);
264	warn "debug level $l",$/ if ($l > 0);
265	$debug = $l;
266	$WebPAC::Normalize::MARC::debug = $debug;
267	}
268
269	=head1 Functions to create C<data_structure>
270
271	Those functions generally have to first in your normalization file.
272
273	=head2 to
274
275	Generic way to set values for some name
276
277	to('field-name', 'name-value' => rec('200','a') );
278
279	There are many helpers defined below which might be easier to use.
280
281	=cut
282
283	sub to {
284	my $type = shift or confess "need type -- BUG?";
285	my $name = shift or confess "needs name as first argument";
286	my @o = grep { defined($_) && $_ ne '' } @_;
287	return unless (@o);
288	$out->{$name}->{$type} = \@o;
289	}
290
291	=head2 search_display
292
293	Define output for L<search> and L<display> at the same time
294
295	search_display('Title', rec('200','a') );
296
297	=cut
298
299	sub search_display {
300	my $name = shift or die "search_display needs name as first argument";
301	my @o = grep { defined($_) && $_ ne '' } @_;
302	return unless (@o);
303	$out->{$name}->{search} = \@o;
304	$out->{$name}->{display} = \@o;
305	}
306
307	=head2 tag
308
309	Old name for L<search_display>, it will probably be removed at one point.
310
311	=cut
312
313	sub tag {
314	search_display( @_ );
315	}
316
317	=head2 display
318
319	Define output just for I<display>
320
321	@v = display('Title', rec('200','a') );
322
323	=cut
324
325	sub display { to( 'display', @_ ) }
326
327	=head2 search
328
329	Prepare values just for I<search>
330
331	@v = search('Title', rec('200','a') );
332
333	=cut
334
335	sub search { to( 'search', @_ ) }
336
337	=head2 sorted
338
339	Insert into lists which will be automatically sorted
340
341	sorted('Title', rec('200','a') );
342
343	=cut
344
345	sub sorted { to( 'sorted', @_ ) }
346
347
348	=head1 Functions to extract data from input
349
350	This function should be used inside functions to create C<data_structure> described
351	above.
352
353	=head2 _pack_subfields_hash
354
355	@subfields = _pack_subfields_hash( $h );
356	$subfields = _pack_subfields_hash( $h, 1 );
357
358	Return each subfield value in array or pack them all together and return scalar
359	with subfields (denoted by C<^>) and values.
360
361	=cut
362
363	sub _pack_subfields_hash {
364
365	warn "## _pack_subfields_hash( ",dump(@_), " )\n" if ($debug > 1);
366
367	my ($h,$include_subfields) = @_;
368
369	# sanity and ease of use
370	return $h if (ref($h) ne 'HASH');
371
372	if ( defined($h->{subfields}) ) {
373	my $sfs = delete $h->{subfields} \|\| die "no subfields?";
374	my @out;
375	while (@$sfs) {
376	my $sf = shift @$sfs;
377	push @out, '^' . $sf if ($include_subfields);
378	my $o = shift @$sfs;
379	if ($o == 0 && ref( $h->{$sf} ) ne 'ARRAY' ) {
380	# single element subfields are not arrays
381	#warn "====> $sf $o / $#$sfs ", dump( $sfs, $h->{$sf} ), "\n";
382
383	push @out, $h->{$sf};
384	} else {
385	#warn "====> $sf $o / $#$sfs ", dump( $sfs, $h->{$sf} ), "\n";
386	push @out, $h->{$sf}->[$o];
387	}
388	}
389	if ($include_subfields) {
390	return join('', @out);
391	} else {
392	return @out;
393	}
394	} else {
395	if ($include_subfields) {
396	my $out = '';
397	foreach my $sf (sort keys %$h) {
398	if (ref($h->{$sf}) eq 'ARRAY') {
399	$out .= '^' . $sf . join('^' . $sf, @{ $h->{$sf} });
400	} else {
401	$out .= '^' . $sf . $h->{$sf};
402	}
403	}
404	return $out;
405	} else {
406	# FIXME this should probably be in alphabetical order instead of hash order
407	values %{$h};
408	}
409	}
410	}
411
412	=head2 rec1
413
414	Return all values in some field
415
416	@v = rec1('200')
417
418	TODO: order of values is probably same as in source data, need to investigate that
419
420	=cut
421
422	sub rec1 {
423	my $f = shift;
424	warn "rec1($f) = ", dump( $rec->{$f} ), $/ if ($debug > 1);
425	return unless (defined($rec) && defined($rec->{$f}));
426	warn "rec1($f) = ", dump( $rec->{$f} ), $/ if ($debug > 1);
427	if (ref($rec->{$f}) eq 'ARRAY') {
428	my @out;
429	foreach my $h ( @{ $rec->{$f} } ) {
430	if (ref($h) eq 'HASH') {
431	push @out, ( _pack_subfields_hash( $h ) );
432	} else {
433	push @out, $h;
434	}
435	}
436	return @out;
437	} elsif( defined($rec->{$f}) ) {
438	return $rec->{$f};
439	}
440	}
441
442	=head2 rec2
443
444	Return all values in specific field and subfield
445
446	@v = rec2('200','a')
447
448	=cut
449
450	sub rec2 {
451	my $f = shift;
452	return unless (defined($rec && $rec->{$f}));
453	my $sf = shift;
454	warn "rec2($f,$sf) = ", dump( $rec->{$f} ), $/ if ($debug > 1);
455	return map {
456	if (ref($_->{$sf}) eq 'ARRAY') {
457	@{ $_->{$sf} };
458	} else {
459	$_->{$sf};
460	}
461	} grep { ref($_) eq 'HASH' && defined $_->{$sf} } @{ $rec->{$f} };
462	}
463
464	=head2 rec
465
466	syntaxtic sugar for
467
468	@v = rec('200')
469	@v = rec('200','a')
470
471	If rec() returns just single value, it will
472	return scalar, not array.
473
474	=cut
475
476	sub rec {
477	my @out;
478	if ($#_ == 0) {
479	@out = rec1(@_);
480	} elsif ($#_ == 1) {
481	@out = rec2(@_);
482	}
483	if ($#out == 0 && ! wantarray) {
484	return $out[0];
485	} elsif (@out) {
486	return @out;
487	} else {
488	return '';
489	}
490	}
491
492	=head2 frec
493
494	Returns first value from field
495
496	$v = frec('200');
497	$v = frec('200','a');
498
499	=cut
500
501	sub frec {
502	my @out = rec(@_);
503	warn "rec(",dump(@_),") has more than one return value, ignoring\n" if $#out > 0;
504	return shift @out;
505	}
506
507	=head2 frec_eq
508
509	=head2 frec_ne
510
511	Check if first values from two fields are same or different
512
513	if ( frec_eq( 900 => 'a', 910 => 'c' ) ) {
514	# values are same
515	} else {
516	# values are different
517	}
518
519	Strictly speaking C<frec_eq> and C<frec_ne> wouldn't be needed if you
520	could write something like:
521
522	if ( frec( '900','a' ) eq frec( '910','c' ) ) {
523	# yada tada
524	}
525
but you can't, since our parser L<WebPAC::Parser> removes all whitespace
in order to parse the text, which would create the invalid function C<eqfrec>.
528
529	=cut
530
# True when the first value of field/subfield ($f1,$sf1) is string-equal to
# the first value of ($f2,$sf2).  Both lookups go through rec() in list
# context and only their first element is compared.
sub frec_eq {
	my ( $f1, $sf1, $f2, $sf2 ) = @_;
	my ($left)  = rec( $f1, $sf1 );
	my ($right) = rec( $f2, $sf2 );
	return $left eq $right;
}
535
# Logical negation of frec_eq() for the same four arguments.
sub frec_ne {
	my $same = frec_eq( @_ );
	return ! $same;
}
539
540	=head2 regex
541
542	Apply regex to some or all values
543
544	@v = regex( 's/foo/bar/g', @v );
545
546	=cut
547
# Apply the regex operation given as source text in $r (for example
# 's/foo/bar/g') to each remaining argument and return the results.
#
# False values ('', 0, undef) are skipped on input, and values which are
# false after the operation are dropped from the output.
#
# NOTE(review): $r is compiled with string eval, so it must only ever come
# from trusted normalisation rules, never from record data.
sub regex {
	my $r = shift;
	my @out;
	#warn "r: $r\n", dump(\@_);
	foreach my $value (@_) {
		next unless ($value);

		# Operate on a private copy: the loop variable aliases the caller's
		# argument, so modifying it in place would change the caller's data
		# and fails outright for read-only literals.
		my $t = $value;

		eval "\$t =~ $r";
		# A broken rule used to be ignored silently; report it instead.
		warn "regex($r) failed: $@" if ($@);

		push @out, $t if ($t);
	}
	return @out;
}
559
560	=head2 prefix
561
562	Prefix all values with a string
563
564	@v = prefix( 'my_', @v );
565
566	=cut
567
# Prepend $p to every defined value.  Undefined values are dropped; when $p
# itself is undefined the values are handed back unchanged.
sub prefix {
	my ( $p, @values ) = @_;
	return @values unless defined( $p );

	my @out;
	foreach my $value (@values) {
		next unless defined( $value );
		push @out, $p . $value;
	}
	return @out;
}
573
574	=head2 suffix
575
576	suffix all values with a string
577
578	@v = suffix( '_my', @v );
579
580	=cut
581
# Append $s to every defined value.  Undefined values are dropped; when $s
# itself is undefined the values are handed back unchanged.
sub suffix {
	my ( $s, @values ) = @_;
	return @values unless defined( $s );

	my @out;
	foreach my $value (@values) {
		next unless defined( $value );
		push @out, $value . $s;
	}
	return @out;
}
587
588	=head2 surround
589
Surround all values with two strings
591
592	@v = surround( 'prefix_', '_suffix', @v );
593
594	=cut
595
# Wrap every defined value between $p and $s.  An undefined $p or $s counts
# as the empty string; undefined values are dropped.
sub surround {
	my ( $p, $s, @values ) = @_;
	$p = '' if ( ! defined( $p ) );
	$s = '' if ( ! defined( $s ) );

	my @out;
	foreach my $value (@values) {
		next unless defined( $value );
		push @out, $p . $value . $s;
	}
	return @out;
}
603
604	=head2 first
605
606	Return first element
607
608	$v = first( @v );
609
610	=cut
611
# Return the first argument (undef when called without arguments).
sub first {
	my ($head) = @_;
	return $head;
}
616
617	=head2 lookup
618
619	Consult lookup hashes for some value
620
621	@v = lookup(
622	sub {
623	'ffkk/peri/mfn'.rec('000')
624	},
625	'ffkk','peri','200-a-200-e',
626	sub {
627	first(rec(200,'a')).' '.first(rec('200','e'))
628	}
629	);
630
Code like above will be B<automatically generated> using L<WebPAC::Parser> from
normal lookup definition in C<conf/lookup/something.pl> which looks like:
633
634	lookup(
635	# which results to return from record recorded in lookup
636	sub { 'ffkk/peri/mfn' . rec('000') },
637	# from which database and input
638	'ffkk','peri',
639	# such that following values match
640	sub { first(rec(200,'a')) . ' ' . first(rec('200','e')) },
641	# if this part is missing, we will try to match same fields
642	# from lookup record and current one, or you can override
643	# which records to use from current record using
644	sub { rec('900','x') . ' ' . rec('900','y') },
645	)
646
647	You can think about this lookup as SQL (if that helps):
648
649	select
650	sub { what }
651	from
652	database, input
653	where
654	sub { filter from lookuped record }
655	having
656	sub { optional filter on current record }
657
658	Easy as pie, right?
659
660	=cut
661
# Resolve values through the lookup index built by save_into_lookup().
#
# Arguments (all five are required):
#   $what     - coderef evaluated against each matching record; its values
#               make up the result
#   $database - database name the index was stored under
#   $input    - input name the index was stored under
#   $key      - lookup key name within that database/input
#   $having   - coderef evaluated against the current record; it yields the
#               values to search the index for
#
# Returns nothing when no index exists for $database/$input/$key or when
# none of the $having values is present in it.  A single resulting value is
# returned on its own, otherwise the list of values is returned.
#
# Dies (via confess) on a wrong argument count or when no load_row_coderef
# is available, and re-throws any error raised while loading or evaluating
# a matching record.
sub lookup {
	my ($what, $database, $input, $key, $having) = @_;

	confess "lookup needs 5 arguments: what, database, input, key, having\n" unless ($#_ == 4);

	warn "## lookup ($database, $input, $key)", $/ if ($debug > 1);
	my $index = $lookup->{$database}->{$input}->{$key};
	return unless (defined($index));

	confess "lookup really need load_row_coderef added to data_structure\n" unless ($load_row_coderef);

	my @having = $having->();

	warn "## having = ", dump( @having ) if ($debug > 2);

	# collect the MFNs of every record indexed under one of the wanted values
	my %wanted_mfn;
	foreach my $h ( @having ) {
		my $hits = $index->{$h};
		next unless (defined($hits));
		warn "lookup for $database/$input/$key/$h return ",dump($hits),"\n" if ($debug);
		$wanted_mfn{$_}++ foreach keys %{ $hits };
	}

	return unless (%wanted_mfn);

	my @mfns = sort keys %wanted_mfn;

	warn "# lookup loading $database/$input/$key mfn ", join(",",@mfns)," having ",dump(@having),"\n" if ($debug);

	my $old_rec = $rec;
	my @out;

	# $what reads the current record through $rec, so each matching row
	# temporarily replaces it.  The loop runs inside eval so that the
	# caller's record is put back even when loading or evaluating a row
	# dies; the error is re-thrown afterwards.
	my $ok = eval {
		foreach my $mfn (@mfns) {
			$rec = $load_row_coderef->( $database, $input, $mfn );

			warn "got $database/$input/$mfn = ", dump($rec), $/ if ($debug);

			my @vals = $what->();

			push @out, ( @vals );

			warn "lookup for mfn $mfn returned ", dump(@vals), $/ if ($debug);
		}
		1;
	};
	my $error = $@;

	$rec = $old_rec;

	die $error unless ($ok);

	warn "## lookup returns = ", dump(@out), $/ if ($debug);

	if ($#out == 0) {
		return $out[0];
	} else {
		return @out;
	}
}
721
722	=head2 save_into_lookup
723
724	Save value into lookup. It associates current database, input
725	and specific keys with one or more values which will be
726	associated over MFN.
727
MFN will be extracted from the first occurrence of field 000
in the current record, or if it doesn't exist, from L<_set_config> C<_mfn>.
730
731	my $nr = save_into_lookup($database,$input,$key,sub {
732	# code which produce one or more values
733	});
734
735	It returns number of items saved.
736
737	This function shouldn't be called directly, it's called from code created by
738	L<WebPAC::Parser>.
739
740	=cut
741
# Record the values produced by $coderef in the lookup index under
# $database/$input/$key, associating each value with the MFN of the current
# record.
#
# The MFN is the first occurrence of field 000 in the current record when
# that is defined, otherwise the C<_mfn> entry of the config.
#
# Returns the number of values saved.  Dies when a required argument is
# missing, when $coderef is not a code reference, or when no MFN can be
# determined.
sub save_into_lookup {
	my ($database,$input,$key,$coderef) = @_;
	die "save_into_lookup needs database" unless defined($database);
	die "save_into_lookup needs input" unless defined($input);
	die "save_into_lookup needs key" unless defined($key);
	die "save_into_lookup needs CODE" unless ( defined($coderef) && ref($coderef) eq 'CODE' );

	warn "## save_into_lookup rec = ", dump($rec), " config = ", dump($config), $/ if ($debug > 2);

	# Check that field 000 really is an array before looking inside it.
	# Probing $rec->{'000'}->[0] directly would autovivify an empty 000
	# field in the current record whenever the field is absent.
	my $mfn;
	if ( $rec && ref($rec->{'000'}) eq 'ARRAY' && defined($rec->{'000'}->[0]) ) {
		$mfn = $rec->{'000'}->[0];
	} elsif ( $config && defined($config->{_mfn}) ) {
		$mfn = $config->{_mfn};
	} else {
		die "mfn not defined or zero";
	}

	my $nr = 0;

	foreach my $v ( $coderef->() ) {
		$lookup->{$database}->{$input}->{$key}->{$v}->{$mfn}++;
		warn "# saved lookup $database/$input/$key [$v] $mfn\n" if ($debug > 1);
		$nr++;
	}

	return $nr;
}
766
767	=head2 config
768
769	Consult config values stored in C<config.yml>
770
771	# return database code (key under databases in yaml)
772	$database_code = config(); # use _ from hash
773	$database_name = config('name');
774	$database_input_name = config('input name');
775
776	Up to three levels are supported.
777
778	=cut
779
# Look up a value in the config structure.
#
# $p is a whitespace-separated path of hash keys.  Without a path (or with
# an empty one) the special C<_> entry is returned.  Returns nothing when no
# config is set, and undef (with a warning) when the path leads to no value
# or to a false one.
sub config {
	return unless ($config);

	my $p = shift;

	$p ||= '';

	my $v;

	warn "### getting config($p)\n" if ($debug > 1);

	my @p = split(/\s+/,$p);
	if ($#p < 0) {
		$v = $config->{ '_' };	# special, database code
	} else {

		# walk a copy, because taking the first element of an array below
		# shifts it off
		my $c = dclone( $config );

		foreach my $k (@p) {
			warn "### k: $k c = ",dump($c),$/ if ($debug > 1);
			if (ref($c) eq 'ARRAY') {
				# NOTE(review): the walk stops here, so $k and any later
				# keys are never applied to the element taken -- confirm
				# that this is intended
				$c = shift @$c;
				warn "config($p) taking first occurence of '$k', probably not what you wanted!\n";
				last;
			}

			# A path that continues past a scalar leaf has nothing left to
			# descend into; indexing the scalar as a hash would die under
			# "strict refs", so report it as a missing value instead.
			if ( ! ref($c) || ! defined($c->{$k}) ) {
				$c = undef;
				last;
			}

			$c = $c->{$k};
		}
		$v = $c if ($c);

	}

	warn "## config( '$p' ) = ",dump( $v ),$/ if ($v && $debug);
	warn "config( '$p' ) is empty\n" if (! $v);

	return $v;
}
822
823	=head2 id
824
825	Returns unique id of this record
826
827	$id = id();
828
Returns C<42/2> for 2nd occurrence of MFN 42.
830
831	=cut
832
# Build the record id: the C<_mfn> value from the config followed by whatever
# WebPAC::Normalize::MARC::_created_marc_records() reports (nothing is
# appended when that is false).  Dies when the config has no true C<_mfn>.
sub id {
	my $mfn = $config->{_mfn};
	die "no _mfn in config data" unless ($mfn);

	my $occurrence = WebPAC::Normalize::MARC::_created_marc_records() || '';
	return $mfn . $occurrence;
}
837
838	=head2 join_with
839
Joins values with some delimiter
841
842	$v = join_with(", ", @v);
843
844	=cut
845
# Join the defined, non-empty values with delimiter $d.  With nothing left
# to join the result is the empty string.
sub join_with {
	my ( $d, @values ) = @_;
	warn "### join_with('$d',",dump(@values),")\n" if ($debug > 2);

	my @kept;
	foreach my $value (@values) {
		next unless ( defined($value) && $value ne '' );
		push @kept, $value;
	}

	# join() always yields a defined string ('' for an empty list)
	return join( $d, @kept );
}
853
854	=head2 split_rec_on
855
856	Split record subfield on some regex and take one of parts out
857
858	$a_before_semi_column =
859	split_rec_on('200','a', /\s;\s/, $part);
860
861	C<$part> is optional number of element. First element is
862	B<1>, not 0!
863
864	If there is no C<$part> parameter or C<$part> is 0, this function will
865	return all values produced by splitting.
866
867	=cut
868
# Split the first value of rec($fld,$sf) on $regex.
#
# With a positive $part the element at that 1-based position is returned;
# otherwise all parts are returned.  A missing or whitespace-only value
# yields ''.  Dies when fewer than three arguments are given.
sub split_rec_on {
	die "split_rec_on need (fld,sf,regex[,part]" if ($#_ < 2);

	my ($fld, $sf, $regex, $part) = @_;
	warn "### regex ", ref($regex), $regex, $/ if ($debug > 2);

	# only the first value of the subfield is considered
	my ($v) = rec( $fld, $sf );
	warn "### first rec($fld,$sf) = ",dump($v),$/ if ($debug > 2);

	return '' unless ( defined($v) && $v !~ /^\s*$/ );

	my @s = split( $regex, $v );
	warn "## split_rec_on($fld,$sf,$regex,$part) = ",dump(@s),$/ if ($debug > 1);

	return @s unless ($part && $part > 0);

	# $part counts from 1
	return $s[ $part - 1 ];
}
889
# key/value storage shared by set() and get()
my $hash;

=head2 set

  set( key => 'value' );

=cut

# Store $v under key $k and return the stored value.
sub set {
	my ( $k, $v ) = @_;
	if ( $debug ) {
		warn "## set ( $k => ", dump($v), " )", $/;
	}
	return $hash->{$k} = $v;
}
903
904	=head2 get
905
906	get( 'key' );
907
908	=cut
909
# Fetch the value stored under key $k by set().  A missing or false key
# returns nothing; an unknown key returns undef.
sub get {
	my ($k) = @_;
	return unless ($k);

	my $v = $hash->{$k};
	if ( $debug ) {
		warn "## get $k = ", dump( $v ), $/;
	}
	return $v;
}
916
917	=head2 count
918
919	if ( count( @result ) == 1 ) {
920	# do something if only 1 result is there
921	}
922
923	=cut
924
# Return the number of arguments, as a string.
sub count {
	if ( $debug ) {
		warn "## count ",dump(@_),$/;
	}
	my $total = scalar @_;
	return "$total";
}
929
930	# END
931	1;