WebPAC/Lookup/Normalize.pm

package WebPAC::Lookup::Normalize;

use warnings;
use strict;
use blib;
use WebPAC::Common;
use base 'WebPAC::Common';
use Data::Dumper;

=head1 NAME

WebPAC::Lookup::Normalize - data mungling for normalisation

=head1 VERSION

Version 0.09

=cut

our $VERSION = '0.09';

=head1 SYNOPSIS

This package contains code that mungle data to produce normalized format.

B<WARNING:>

This code is obsolete. It moved to here so that I don't have to re-write
L<WebPAC::Lookup> to use set configuration files (using L<WebPAC::Normalize>)
just yet. But it will dissapear real soon!

It contains several assumptions:

=over

=item *

format of fields is defined using C<v123^a> notation for repeatable fields
or C<s123^a> for single (or first) value, where C<123> is field number and
C<a> is subfield.

=item *

source data records (C<$rec>) have unique identifiers in field C<000>

=item *

optional C<eval{length('v123^a') == 3}> tag at B<beginning of format> will be
perl code that is evaluated before producing output (value of field will be
interpolated before that)

=item *

optional C<filter{filter_name}> at B<begining of format> will apply perl
code defined as code ref on format after field substitution to producing
output

There is one built-in filter called C<regex> which can be use like this:

  filter{regex(s/foo/bar/)}

=item *

optional C<lookup{...}> will be then performed. See C<WebPAC::Lookups>.

=item *

at end, optional C<format>s rules are resolved. Format rules are similar to
C<sprintf> and can also contain C<lookup{...}> which is performed after
values are inserted in format.

=back

This also describes order in which transformations are applied (eval,
filter, lookup, format) which is important to undestand when deciding how to
solve your data mungling and normalisation process.


=head1 FUNCTIONS

=head2 new

Create new normalisation object

  my $n = new WebPAC::Lookup::Normalize::Something(
        filter => {
                'filter_name_1' => sub {
                        # filter code
                        return length($_);
                }, ...
        },
        db => $db_obj,
        lookup_regex => $lookup->regex,
        lookup => $lookup_obj,
        prefix => 'foobar',
  );

Parametar C<filter> defines user supplied snippets of perl code which can
be use with C<filter{...}> notation.

C<prefix> is used to form filename for database record (to support multiple
source files which are joined in one database).

Recommended parametar C<lookup_regex> is used to enable parsing of lookups
in structures. If you pass this parametar, you must also pass C<lookup>
which is C<WebPAC::Lookup> object.

=cut

sub new {
        my $class = shift;
        my $self = {@_};
        bless($self, $class);

        my $r = $self->{'lookup_regex'} ? 1 : 0;
        my $l = $self->{'lookup'} ? 1 : 0;

        my $log = $self->_get_logger();

        # those two must be in pair
        if ( ($r & $l) != ($r || $l) ) {
                my $log = $self->_get_logger();
                $log->logdie("lookup_regex and lookup must be in pair");
        }

        $log->logdie("lookup must be WebPAC::Lookup object") if ($self->{'lookup'} && ! $self->{'lookup'}->isa('WebPAC::Lookup'));

        $log->warn("no prefix defined. please check that!") unless ($self->{'prefix'});

        $log->debug("using lookup regex: ", $self->{lookup_regex}) if ($r && $l);

        if (! $self->{filter} || ! $self->{filter}->{regex}) {
                $log->debug("adding built-in filter regex");
                $self->{filter}->{regex} = sub {
                        my ($val, $regex) = @_;
                        eval "\$val =~ $regex";
                        return $val;
                };
        }

        $self ? return $self : return undef;
}

=head2 all_tags

Returns all tags in document in specified order

  my $sorted_tags = $self->all_tags();

=cut

sub all_tags {
        my $self = shift;

        if (! $self->{_tags_by_order}) {

                my $log = $self->_get_logger;
                # sanity check
                $log->logdie("can't find self->{inport_xml}->{indexer}") unless ($self->{import_xml}->{indexer});

                my @tags = keys %{ $self->{'import_xml'}->{'indexer'}};
                $log->debug("unsorted tags: " . join(", ", @tags));

                @tags = sort { $self->_sort_by_order } @tags;

                $log->debug("sorted tags: " . join(",", @tags) );

                $self->{_tags_by_order} = \@tags;
        }

        return $self->{_tags_by_order};
}


=head2 data_structure

Create in-memory data structure which represents normalized layout from
C<conf/normalize/*.xml>.

This structures are used to produce output.

 my $ds = $webpac->data_structure($rec);

=cut

sub data_structure {
        my $self = shift;

        my $log = $self->_get_logger();

        my $rec = shift;
        $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);

        $log->debug("data_structure rec = ", sub { Dumper($rec) });

        $log->logdie("need unique ID (mfn) in field 000 of record " . Dumper($rec) ) unless (defined($rec->{'000'}));

        my $id = $rec->{'000'}->[0] || $log->logdie("field 000 isn't array!");

        my $cache_file;

        if ($self->{'db'}) {
                my $ds = $self->{'db'}->load_ds( id => $id, prefix => $self->{prefix} );
                $log->debug("load_ds( rec = ", sub { Dumper($rec) }, ") = ", sub { Dumper($ds) });
                return $ds if ($ds);
                $log->debug("cache miss, creating");
        }

        my $tags = $self->all_tags();

        $log->debug("tags: ",sub { join(", ",@{ $tags }) });

        my $ds;

        foreach my $field (@{ $tags }) {

                my $row;

#print "field $field [",$self->{'tag'},"] = ",Dumper($self->{'import_xml'}->{'indexer'}->{$field}->{$self->{'tag'}});

                foreach my $tag (@{$self->{'import_xml'}->{'indexer'}->{$field}->{$self->{'tag'}}}) {
                        my $format;

                        $log->logdie("expected tag HASH and got $tag") unless (ref($tag) eq 'HASH');
                        $format = $tag->{'value'} || $tag->{'content'};

                        my @v;
                        if ($self->{'lookup_regex'} && $format =~ $self->{'lookup_regex'}) {
                                @v = $self->_rec_to_arr($rec,$format,'fill_in');
                        } else {
                                @v = $self->_rec_to_arr($rec,$format,'parse');
                        }
                        if (! @v) {
                                $log->debug("$field <",$self->{tag},"> format: $format no values");
                                next;
                        } else {
                                $log->debug("$field <",$self->{tag},"> format: $format values: ", join(",", @v));
                        }

                        if ($tag->{'sort'}) {
                                @v = $self->sort_arr(@v);
                        }

                        # use format?
                        if ($tag->{'format_name'}) {
                                @v = map { $self->apply_format($tag->{'format_name'},$tag->{'format_delimiter'},$_) } @v;
                        }

                        # delimiter will join repeatable fields
                        if ($tag->{'delimiter'}) {
                                @v = ( join($tag->{'delimiter'}, @v) );
                        }

                        # default types 
                        my @types = qw(display search);
                        # override by type attribute
                        @types = ( $tag->{'type'} ) if ($tag->{'type'});

                        foreach my $type (@types) {
                                # append to previous line?
                                $log->debug("tag $field / $type [",sub { join(",",@v) }, "] ", $row->{'append'} || 'no append');
                                if ($tag->{'append'}) {

                                        # I will delimit appended part with
                                        # delimiter (or ,)
                                        my $d = $tag->{'delimiter'};
                                        # default delimiter
                                        $d ||= " ";

                                        my $last = pop @{$row->{$type}};
                                        $d = "" if (! $last);
                                        $last .= $d . join($d, @v);
                                        push @{$row->{$type}}, $last;

                                } else {
                                        push @{$row->{$type}}, @v;
                                }
                        }


                }

                if ($row) {
                        $row->{'tag'} = $field;

                        # TODO: name_sigular, name_plural
                        my $name = $self->{'import_xml'}->{'indexer'}->{$field}->{'name'};
                        my $row_name = $name ? $self->_x($name) : $field;

                        # post-sort all values in field
                        if ($self->{'import_xml'}->{'indexer'}->{$field}->{'sort'}) {
                                $log->warn("sort at field tag not implemented");
                        }

                        $ds->{$row_name} = $row;

                        $log->debug("row $field: ",sub { Dumper($row) });
                }

        }

        $self->{'db'}->save_ds(
                id => $id,
                ds => $ds,
                prefix => $self->{prefix},
        ) if ($self->{'db'});

        $log->debug("ds: ", sub { Dumper($ds) });

        $log->logconfess("data structure returned is not array any more!") if wantarray;

        return $ds;

}

=head2 parse

Perform smart parsing of string, skipping delimiters for fields which aren't
defined. It can also eval code in format starting with C<eval{...}> and
return output or nothing depending on eval code.

 my $text = $webpac->parse($rec,'eval{"v901^a" eq "Deskriptor"}descriptor: v250^a', $i);

Filters are implemented here. While simple form of filters looks like this:

  filter{name_of_filter}

but, filters can also have variable number of parametars like this:

  filter{name_of_filter(param,param,param)}

=cut

my $warn_once;

sub parse {
        my $self = shift;

        my ($rec, $format_utf8, $i, $rec_size) = @_;

        return if (! $format_utf8);

        my $log = $self->_get_logger();

        $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);

        $i = 0 if (! $i);

        my $format = $self->_x($format_utf8) || $log->logconfess("can't convert '$format_utf8' from UTF-8 to ",$self->{'code_page'});

        my @out;

        $log->debug("format: $format [$i]");

        my $eval_code;
        # remove eval{...} from beginning
        $eval_code = $1 if ($format =~ s/^eval{([^}]+)}//s);

        my $filter_name;
        # remove filter{...} from beginning
        $filter_name = $1 if ($format =~ s/^filter{([^}]+)}//s);

        # did we found any (att all) field from format in row?
        my $found_any;
        # prefix before first field which we preserve it $found_any
        my $prefix;

        my $f_step = 1;

        while ($format =~ s/^(.*?)(v|s)(\d+)(?:\^(\w))?//s) {

                my $del = $1 || '';
                $prefix = $del if ($f_step == 1);

                my $fld_type = lc($2);

                # repeatable index
                my $r = $i;
                if ($fld_type eq 's') {
                        if ($found_any->{'v'}) {
                                $r = 0;
                        } else {
                                return;
                        }
                }

                my $found = 0;
                my $tmp = $self->get_data(\$rec,$3,$4,$r,\$found,$rec_size);

                if ($found) {
                        $found_any->{$fld_type} += $found;

                        # we will skip delimiter before first occurence of field!
                        push @out, $del unless($found_any->{$fld_type} == 1);
                        push @out, $tmp if ($tmp);
                }
                $f_step++;
        }

        # test if any fields found?
        return if (! $found_any->{'v'} && ! $found_any->{'s'});

        my $out = join('',@out);

        if ($out) {
                # add rest of format (suffix)
                $out .= $format;

                # add prefix if not there
                $out = $prefix . $out if ($out !~ m/^\Q$prefix\E/);

                $log->debug("result: $out");
        }

        if ($eval_code) {
                my $eval = $self->fill_in($rec,$eval_code,$i) || return;
                $log->debug("about to eval{$eval} format: $out");
                return if (! $self->_eval($eval));
        }
        
        if ($filter_name) {
                my @filter_args;
                if ($filter_name =~ s/(\w+)\((.*)\)/$1/) {
                        @filter_args = split(/,/, $2);
                }
                if ($self->{'filter'}->{$filter_name}) {
                        $log->debug("about to filter{$filter_name} format: $out with arguments: ", join(",", @filter_args));
                        unshift @filter_args, $out;
                        $out = $self->{'filter'}->{$filter_name}->(@filter_args);
                        return unless(defined($out));
                        $log->debug("filter result: $out");
                } elsif (! $warn_once->{$filter_name}) {
                        $log->warn("trying to use undefined filter $filter_name");
                        $warn_once->{$filter_name}++;
                }
        }

        return $out;
}

=head2 fill_in

Workhourse of all: takes record from in-memory structure of database and
strings with placeholders and returns string or array of with substituted
values from record.

 my $text = $webpac->fill_in($rec,'v250^a');

Optional argument is ordinal number for repeatable fields. By default,
it's assume to be first repeatable field (fields are perl array, so first
element is 0).
Following example will read second value from repeatable field.

 my $text = $webpac->fill_in($rec,'Title: v250^a',1);

This function B<does not> perform parsing of format to inteligenty skip
delimiters before fields which aren't used.

This method will automatically decode UTF-8 string to local code page
if needed.

There is optional parametar C<$record_size> which can be used to get sizes of
all C<field^subfield> combinations in this format.

 my $text = $webpac->fill_in($rec,'got: v900^a v900^x',0,\$rec_size); 

=cut

sub fill_in {
        my $self = shift;

        my $log = $self->_get_logger();

        my ($rec,$format,$i,$rec_size) = @_;

        $log->logconfess("need data record") unless ($rec);
        $log->logconfess("need format to parse") unless($format);

        # iteration (for repeatable fields)
        $i ||= 0;

        $log->logdie("infitite loop in format $format") if ($i > ($self->{'max_mfn'} || 9999));

        # FIXME remove for speedup?
        $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);

        if (utf8::is_utf8($format)) {
                $format = $self->_x($format);
        }

        my $found = 0;
        my $just_single = 1;

        my $eval_code;
        # remove eval{...} from beginning
        $eval_code = $1 if ($format =~ s/^eval{([^}]+)}//s);

        my $filter_name;
        # remove filter{...} from beginning
        $filter_name = $1 if ($format =~ s/^filter{([^}]+)}//s);

        {
                # fix warnings
                no warnings 'uninitialized';

                # do actual replacement of placeholders
                # repeatable fields
                if ($format =~ s/v(\d+)(?:\^(\w))?/$self->get_data(\$rec,$1,$2,$i,\$found,$rec_size)/ges) {
                        $just_single = 0;
                }

                # non-repeatable fields
                if ($format =~ s/s(\d+)(?:\^(\w))?/$self->get_data(\$rec,$1,$2,0,\$found,$rec_size)/ges) {
                        return if ($i > 0 && $just_single);
                }
        }

        if ($found) {
                $log->debug("format: $format");
                if ($eval_code) {
                        my $eval = $self->fill_in($rec,$eval_code,$i);
                        return if (! $self->_eval($eval));
                }
                if ($filter_name && $self->{'filter'}->{$filter_name}) {
                        $log->debug("filter '$filter_name' for $format");
                        $format = $self->{'filter'}->{$filter_name}->($format);
                        return unless(defined($format));
                        $log->debug("filter result: $format");
                }
                # do we have lookups?
                if ($self->{'lookup'}) {
                        if ($self->{'lookup'}->can('lookup')) {
                                my @lookup = $self->{lookup}->lookup($format);
                                $log->debug("lookup $format", join(", ", @lookup));
                                return @lookup;
                        } else {
                                $log->warn("Have lookup object but can't invoke lookup method");
                        }
                } else {
                        return $format;
                }
        } else {
                return;
        }
}


=head2 _rec_to_arr

Similar to C<parse> and C<fill_in>, but returns array of all repeatable fields. Usable
for fields which have lookups, so they shouldn't be parsed but rather
C<paste>d or C<fill_id>ed. Last argument is name of operation: C<paste> or C<fill_in>.

 my @arr = $webpac->fill_in_to_arr($rec,'[v900];;[v250^a]','paste');

=cut

sub _rec_to_arr {
        my $self = shift;

        my ($rec, $format_utf8, $code) = @_;

        my $log = $self->_get_logger();

        $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
        return if (! $format_utf8);

        $log->debug("using $code on $format_utf8");

        my $i = 0;
        my $max = 0;
        my @arr;
        my $rec_size = {};

        while ($i <= $max) {
                my @v = $self->$code($rec,$format_utf8,$i++,\$rec_size);
                if ($rec_size) {
                        foreach my $f (keys %{ $rec_size }) {
                                $max = $rec_size->{$f} if ($rec_size->{$f} > $max);
                        }
                        $log->debug("max set to $max");
                        undef $rec_size;
                }
                if (@v) {
                        push @arr, @v;
                } else {
                        push @arr, '' if ($max > $i);
                }
        }

        $log->debug("format '$format_utf8' returned ",--$i," elements: ", sub { join(" | ",@arr) }) if (@arr);

        return @arr;
}


=head2 get_data

Returns value from record.

 my $text = $self->get_data(\$rec,$f,$sf,$i,\$found,\$rec_size);

Required arguments are:

=over 8

=item C<$rec>

record reference

=item C<$f>

field

=item C<$sf>

optional subfield

=item C<$i>

index offset for repeatable values ( 0 ... $rec_size->{'400^a'} )

=item C<$found>

optional variable that will be incremeted if preset

=item C<$rec_size>

hash to hold maximum occurances of C<field^subfield> combinations
(which can be accessed using keys in same format)

=back

Returns value or empty string, updates C<$found> and C<rec_size>
if present.

=cut

sub get_data {
        my $self = shift;

        my ($rec,$f,$sf,$i,$found,$cache) = @_;

        return '' unless ($$rec->{$f} && ref($$rec->{$f}) eq 'ARRAY');

        if (defined($$cache)) {
                $$cache->{ $f . ( $sf ? '^' . $sf : '' ) } ||= scalar @{ $$rec->{$f} };
        }

        return '' unless ($$rec->{$f}->[$i]);

        {
                no strict 'refs';
                if (defined($sf)) {
                        $$found++ if (defined($$found) && $$rec->{$f}->[$i]->{$sf});
                        return $$rec->{$f}->[$i]->{$sf};
                } else {
                        $$found++ if (defined($$found));
                        # it still might have subfields, just
                        # not specified, so we'll dump some debug info
                        if ($$rec->{$f}->[$i] =~ /HASH/o) {
                                my $out;
                                foreach my $k (keys %{$$rec->{$f}->[$i]}) {
                                        my $v = $$rec->{$f}->[$i]->{$k};
                                        $out .= '$' . $k .':' . $v if ($v);
                                }
                                return $out;
                        } else {
                                return $$rec->{$f}->[$i];
                        }
                }
        }
}


=head2 apply_format

Apply format specified in tag with C<format_name="name"> and
C<format_delimiter=";;">.

 my $text = $webpac->apply_format($format_name,$format_delimiter,$data);

Formats can contain C<lookup{...}> if you need them.

=cut

sub apply_format {
        my $self = shift;

        my ($name,$delimiter,$data) = @_;

        my $log = $self->_get_logger();

        if (! $self->{'import_xml'}->{'format'}->{$name}) {
                $log->warn("<format name=\"$name\"> is not defined in ",$self->{'import_xml_file'});
                return $data;
        }

        $log->warn("no delimiter for format $name") if (! $delimiter);

        my $format = $self->_x($self->{'import_xml'}->{'format'}->{$name}->{'content'}) || $log->logdie("can't find format '$name'");

        my @data = split(/\Q$delimiter\E/, $data);

        my $out = sprintf($format, @data);
        $log->debug("using format $name [$format] on $data to produce: $out");

        if ($self->{'lookup_regex'} && $out =~ $self->{'lookup_regex'}) {
                return $self->{'lookup'}->lookup($out);
        } else {
                return $out;
        }

}

=head2 sort_arr

Sort array ignoring case and html in data

 my @sorted = $webpac->sort_arr(@unsorted);

=cut

sub sort_arr {
        my $self = shift;

        my $log = $self->_get_logger();

        # FIXME add Schwartzian Transformation?

        my @sorted = sort {
                $a =~ s#<[^>]+/*>##;
                $b =~ s#<[^>]+/*>##;
                lc($b) cmp lc($a)
        } @_;
        $log->debug("sorted values: ",sub { join(", ",@sorted) });

        return @sorted;
}


=head1 INTERNAL METHODS

=head2 _sort_by_order

Sort xml tags data structure accoding to C<order=""> attribute.

=cut

sub _sort_by_order {
        my $self = shift;

        my $va = $self->{'import_xml'}->{'indexer'}->{$a}->{'order'} ||
                $self->{'import_xml'}->{'indexer'}->{$a};
        my $vb = $self->{'import_xml'}->{'indexer'}->{$b}->{'order'} ||
                $self->{'import_xml'}->{'indexer'}->{$b};

        return $va <=> $vb;
}

=head2 _x

Convert strings from C<conf/normalize/*.xml> encoding into application
specific encoding (optinally specified using C<code_page> to C<new>
constructor).

 my $text = $n->_x('normalize text string');

This is a stub so that other modules doesn't have to implement it.

=cut

sub _x {
        my $self = shift;
        return shift;
}


=head1 AUTHOR

Dobrica Pavlinusic, C<< <dpavlin@rot13.org> >>

=head1 COPYRIGHT & LICENSE

Copyright 2005 Dobrica Pavlinusic, All Rights Reserved.

This program is free software; you can redistribute it and/or modify it
under the same terms as Perl itself.

=cut

1; # End of WebPAC::Lookup::Normalize
1	package WebPAC::Lookup::Normalize;
2
3	use warnings;
4	use strict;
5	use blib;
6	use WebPAC::Common;
7	use base 'WebPAC::Common';
8	use Data::Dumper;
9
10	=head1 NAME
11
12	WebPAC::Lookup::Normalize - data mungling for normalisation
13
14	=head1 VERSION
15
16	Version 0.09
17
18	=cut
19
20	our $VERSION = '0.09';
21
22	=head1 SYNOPSIS
23
24	This package contains code that mungle data to produce normalized format.
25
26	B<WARNING:>
27
28	This code is obsolete. It moved to here so that I don't have to re-write
29	L<WebPAC::Lookup> to use set configuration files (using L<WebPAC::Normalize>)
30	just yet. But it will dissapear real soon!
31
32	It contains several assumptions:
33
34	=over
35
36	=item *
37
38	format of fields is defined using C<v123^a> notation for repeatable fields
39	or C<s123^a> for single (or first) value, where C<123> is field number and
40	C<a> is subfield.
41
42	=item *
43
44	source data records (C<$rec>) have unique identifiers in field C<000>
45
46	=item *
47
48	optional C<eval{length('v123^a') == 3}> tag at B<beginning of format> will be
49	perl code that is evaluated before producing output (value of field will be
50	interpolated before that)
51
52	=item *
53
54	optional C<filter{filter_name}> at B<begining of format> will apply perl
55	code defined as code ref on format after field substitution to producing
56	output
57
58	There is one built-in filter called C<regex> which can be use like this:
59
60	filter{regex(s/foo/bar/)}
61
62	=item *
63
64	optional C<lookup{...}> will be then performed. See C<WebPAC::Lookups>.
65
66	=item *
67
68	at end, optional C<format>s rules are resolved. Format rules are similar to
69	C<sprintf> and can also contain C<lookup{...}> which is performed after
70	values are inserted in format.
71
72	=back
73
74	This also describes order in which transformations are applied (eval,
75	filter, lookup, format) which is important to undestand when deciding how to
76	solve your data mungling and normalisation process.
77
78
79
80
81	=head1 FUNCTIONS
82
83	=head2 new
84
85	Create new normalisation object
86
87	my $n = new WebPAC::Lookup::Normalize::Something(
88	filter => {
89	'filter_name_1' => sub {
90	# filter code
91	return length($_);
92	}, ...
93	},
94	db => $db_obj,
95	lookup_regex => $lookup->regex,
96	lookup => $lookup_obj,
97	prefix => 'foobar',
98	);
99
100	Parametar C<filter> defines user supplied snippets of perl code which can
101	be use with C<filter{...}> notation.
102
103	C<prefix> is used to form filename for database record (to support multiple
104	source files which are joined in one database).
105
106	Recommended parametar C<lookup_regex> is used to enable parsing of lookups
107	in structures. If you pass this parametar, you must also pass C<lookup>
108	which is C<WebPAC::Lookup> object.
109
110	=cut
111
112	sub new {
113	my $class = shift;
114	my $self = {@_};
115	bless($self, $class);
116
117	my $r = $self->{'lookup_regex'} ? 1 : 0;
118	my $l = $self->{'lookup'} ? 1 : 0;
119
120	my $log = $self->_get_logger();
121
122	# those two must be in pair
123	if ( ($r & $l) != ($r \|\| $l) ) {
124	my $log = $self->_get_logger();
125	$log->logdie("lookup_regex and lookup must be in pair");
126	}
127
128	$log->logdie("lookup must be WebPAC::Lookup object") if ($self->{'lookup'} && ! $self->{'lookup'}->isa('WebPAC::Lookup'));
129
130	$log->warn("no prefix defined. please check that!") unless ($self->{'prefix'});
131
132	$log->debug("using lookup regex: ", $self->{lookup_regex}) if ($r && $l);
133
134	if (! $self->{filter} \|\| ! $self->{filter}->{regex}) {
135	$log->debug("adding built-in filter regex");
136	$self->{filter}->{regex} = sub {
137	my ($val, $regex) = @_;
138	eval "\$val =~ $regex";
139	return $val;
140	};
141	}
142
143	$self ? return $self : return undef;
144	}
145
146	=head2 all_tags
147
148	Returns all tags in document in specified order
149
150	my $sorted_tags = $self->all_tags();
151
152	=cut
153
154	sub all_tags {
155	my $self = shift;
156
157	if (! $self->{_tags_by_order}) {
158
159	my $log = $self->_get_logger;
160	# sanity check
161	$log->logdie("can't find self->{inport_xml}->{indexer}") unless ($self->{import_xml}->{indexer});
162
163	my @tags = keys %{ $self->{'import_xml'}->{'indexer'}};
164	$log->debug("unsorted tags: " . join(", ", @tags));
165
166	@tags = sort { $self->_sort_by_order } @tags;
167
168	$log->debug("sorted tags: " . join(",", @tags) );
169
170	$self->{_tags_by_order} = \@tags;
171	}
172
173	return $self->{_tags_by_order};
174	}
175
176
177
178	=head2 data_structure
179
180	Create in-memory data structure which represents normalized layout from
181	C<conf/normalize/*.xml>.
182
183	This structures are used to produce output.
184
185	my $ds = $webpac->data_structure($rec);
186
187	=cut
188
189	sub data_structure {
190	my $self = shift;
191
192	my $log = $self->_get_logger();
193
194	my $rec = shift;
195	$log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
196
197	$log->debug("data_structure rec = ", sub { Dumper($rec) });
198
199	$log->logdie("need unique ID (mfn) in field 000 of record " . Dumper($rec) ) unless (defined($rec->{'000'}));
200
201	my $id = $rec->{'000'}->[0] \|\| $log->logdie("field 000 isn't array!");
202
203	my $cache_file;
204
205	if ($self->{'db'}) {
206	my $ds = $self->{'db'}->load_ds( id => $id, prefix => $self->{prefix} );
207	$log->debug("load_ds( rec = ", sub { Dumper($rec) }, ") = ", sub { Dumper($ds) });
208	return $ds if ($ds);
209	$log->debug("cache miss, creating");
210	}
211
212	my $tags = $self->all_tags();
213
214	$log->debug("tags: ",sub { join(", ",@{ $tags }) });
215
216	my $ds;
217
218	foreach my $field (@{ $tags }) {
219
220	my $row;
221
222	#print "field $field [",$self->{'tag'},"] = ",Dumper($self->{'import_xml'}->{'indexer'}->{$field}->{$self->{'tag'}});
223
224	foreach my $tag (@{$self->{'import_xml'}->{'indexer'}->{$field}->{$self->{'tag'}}}) {
225	my $format;
226
227	$log->logdie("expected tag HASH and got $tag") unless (ref($tag) eq 'HASH');
228	$format = $tag->{'value'} \|\| $tag->{'content'};
229
230	my @v;
231	if ($self->{'lookup_regex'} && $format =~ $self->{'lookup_regex'}) {
232	@v = $self->_rec_to_arr($rec,$format,'fill_in');
233	} else {
234	@v = $self->_rec_to_arr($rec,$format,'parse');
235	}
236	if (! @v) {
237	$log->debug("$field <",$self->{tag},"> format: $format no values");
238	next;
239	} else {
240	$log->debug("$field <",$self->{tag},"> format: $format values: ", join(",", @v));
241	}
242
243	if ($tag->{'sort'}) {
244	@v = $self->sort_arr(@v);
245	}
246
247	# use format?
248	if ($tag->{'format_name'}) {
249	@v = map { $self->apply_format($tag->{'format_name'},$tag->{'format_delimiter'},$_) } @v;
250	}
251
252	# delimiter will join repeatable fields
253	if ($tag->{'delimiter'}) {
254	@v = ( join($tag->{'delimiter'}, @v) );
255	}
256
257	# default types
258	my @types = qw(display search);
259	# override by type attribute
260	@types = ( $tag->{'type'} ) if ($tag->{'type'});
261
262	foreach my $type (@types) {
263	# append to previous line?
264	$log->debug("tag $field / $type [",sub { join(",",@v) }, "] ", $row->{'append'} \|\| 'no append');
265	if ($tag->{'append'}) {
266
267	# I will delimit appended part with
268	# delimiter (or ,)
269	my $d = $tag->{'delimiter'};
270	# default delimiter
271	$d \|\|= " ";
272
273	my $last = pop @{$row->{$type}};
274	$d = "" if (! $last);
275	$last .= $d . join($d, @v);
276	push @{$row->{$type}}, $last;
277
278	} else {
279	push @{$row->{$type}}, @v;
280	}
281	}
282
283
284	}
285
286	if ($row) {
287	$row->{'tag'} = $field;
288
289	# TODO: name_sigular, name_plural
290	my $name = $self->{'import_xml'}->{'indexer'}->{$field}->{'name'};
291	my $row_name = $name ? $self->_x($name) : $field;
292
293	# post-sort all values in field
294	if ($self->{'import_xml'}->{'indexer'}->{$field}->{'sort'}) {
295	$log->warn("sort at field tag not implemented");
296	}
297
298	$ds->{$row_name} = $row;
299
300	$log->debug("row $field: ",sub { Dumper($row) });
301	}
302
303	}
304
305	$self->{'db'}->save_ds(
306	id => $id,
307	ds => $ds,
308	prefix => $self->{prefix},
309	) if ($self->{'db'});
310
311	$log->debug("ds: ", sub { Dumper($ds) });
312
313	$log->logconfess("data structure returned is not array any more!") if wantarray;
314
315	return $ds;
316
317	}
318
319	=head2 parse
320
321	Perform smart parsing of string, skipping delimiters for fields which aren't
322	defined. It can also eval code in format starting with C<eval{...}> and
323	return output or nothing depending on eval code.
324
325	my $text = $webpac->parse($rec,'eval{"v901^a" eq "Deskriptor"}descriptor: v250^a', $i);
326
327	Filters are implemented here. While simple form of filters looks like this:
328
329	filter{name_of_filter}
330
331	but, filters can also have variable number of parametars like this:
332
333	filter{name_of_filter(param,param,param)}
334
335	=cut
336
337	my $warn_once;
338
339	sub parse {
340	my $self = shift;
341
342	my ($rec, $format_utf8, $i, $rec_size) = @_;
343
344	return if (! $format_utf8);
345
346	my $log = $self->_get_logger();
347
348	$log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
349
350	$i = 0 if (! $i);
351
352	my $format = $self->_x($format_utf8) \|\| $log->logconfess("can't convert '$format_utf8' from UTF-8 to ",$self->{'code_page'});
353
354	my @out;
355
356	$log->debug("format: $format [$i]");
357
358	my $eval_code;
359	# remove eval{...} from beginning
360	$eval_code = $1 if ($format =~ s/^eval{([^}]+)}//s);
361
362	my $filter_name;
363	# remove filter{...} from beginning
364	$filter_name = $1 if ($format =~ s/^filter{([^}]+)}//s);
365
366	# did we found any (att all) field from format in row?
367	my $found_any;
368	# prefix before first field which we preserve it $found_any
369	my $prefix;
370
371	my $f_step = 1;
372
373	while ($format =~ s/^(.*?)(v\|s)(\d+)(?:\^(\w))?//s) {
374
375	my $del = $1 \|\| '';
376	$prefix = $del if ($f_step == 1);
377
378	my $fld_type = lc($2);
379
380	# repeatable index
381	my $r = $i;
382	if ($fld_type eq 's') {
383	if ($found_any->{'v'}) {
384	$r = 0;
385	} else {
386	return;
387	}
388	}
389
390	my $found = 0;
391	my $tmp = $self->get_data(\$rec,$3,$4,$r,\$found,$rec_size);
392
393	if ($found) {
394	$found_any->{$fld_type} += $found;
395
396	# we will skip delimiter before first occurence of field!
397	push @out, $del unless($found_any->{$fld_type} == 1);
398	push @out, $tmp if ($tmp);
399	}
400	$f_step++;
401	}
402
403	# test if any fields found?
404	return if (! $found_any->{'v'} && ! $found_any->{'s'});
405
406	my $out = join('',@out);
407
408	if ($out) {
409	# add rest of format (suffix)
410	$out .= $format;
411
412	# add prefix if not there
413	$out = $prefix . $out if ($out !~ m/^\Q$prefix\E/);
414
415	$log->debug("result: $out");
416	}
417
418	if ($eval_code) {
419	my $eval = $self->fill_in($rec,$eval_code,$i) \|\| return;
420	$log->debug("about to eval{$eval} format: $out");
421	return if (! $self->_eval($eval));
422	}
423
424	if ($filter_name) {
425	my @filter_args;
426	if ($filter_name =~ s/(\w+)\((.*)\)/$1/) {
427	@filter_args = split(/,/, $2);
428	}
429	if ($self->{'filter'}->{$filter_name}) {
430	$log->debug("about to filter{$filter_name} format: $out with arguments: ", join(",", @filter_args));
431	unshift @filter_args, $out;
432	$out = $self->{'filter'}->{$filter_name}->(@filter_args);
433	return unless(defined($out));
434	$log->debug("filter result: $out");
435	} elsif (! $warn_once->{$filter_name}) {
436	$log->warn("trying to use undefined filter $filter_name");
437	$warn_once->{$filter_name}++;
438	}
439	}
440
441	return $out;
442	}
443
444	=head2 fill_in
445
446	Workhourse of all: takes record from in-memory structure of database and
447	strings with placeholders and returns string or array of with substituted
448	values from record.
449
450	my $text = $webpac->fill_in($rec,'v250^a');
451
452	Optional argument is ordinal number for repeatable fields. By default,
453	it's assume to be first repeatable field (fields are perl array, so first
454	element is 0).
455	Following example will read second value from repeatable field.
456
457	my $text = $webpac->fill_in($rec,'Title: v250^a',1);
458
459	This function B<does not> perform parsing of format to inteligenty skip
460	delimiters before fields which aren't used.
461
462	This method will automatically decode UTF-8 string to local code page
463	if needed.
464
465	There is optional parametar C<$record_size> which can be used to get sizes of
466	all C<field^subfield> combinations in this format.
467
468	my $text = $webpac->fill_in($rec,'got: v900^a v900^x',0,\$rec_size);
469
470	=cut
471
472	sub fill_in {
473	my $self = shift;
474
475	my $log = $self->_get_logger();
476
477	my ($rec,$format,$i,$rec_size) = @_;
478
479	$log->logconfess("need data record") unless ($rec);
480	$log->logconfess("need format to parse") unless($format);
481
482	# iteration (for repeatable fields)
483	$i \|\|= 0;
484
485	$log->logdie("infitite loop in format $format") if ($i > ($self->{'max_mfn'} \|\| 9999));
486
487	# FIXME remove for speedup?
488	$log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
489
490	if (utf8::is_utf8($format)) {
491	$format = $self->_x($format);
492	}
493
494	my $found = 0;
495	my $just_single = 1;
496
497	my $eval_code;
498	# remove eval{...} from beginning
499	$eval_code = $1 if ($format =~ s/^eval{([^}]+)}//s);
500
501	my $filter_name;
502	# remove filter{...} from beginning
503	$filter_name = $1 if ($format =~ s/^filter{([^}]+)}//s);
504
505	{
506	# fix warnings
507	no warnings 'uninitialized';
508
509	# do actual replacement of placeholders
510	# repeatable fields
511	if ($format =~ s/v(\d+)(?:\^(\w))?/$self->get_data(\$rec,$1,$2,$i,\$found,$rec_size)/ges) {
512	$just_single = 0;
513	}
514
515	# non-repeatable fields
516	if ($format =~ s/s(\d+)(?:\^(\w))?/$self->get_data(\$rec,$1,$2,0,\$found,$rec_size)/ges) {
517	return if ($i > 0 && $just_single);
518	}
519	}
520
521	if ($found) {
522	$log->debug("format: $format");
523	if ($eval_code) {
524	my $eval = $self->fill_in($rec,$eval_code,$i);
525	return if (! $self->_eval($eval));
526	}
527	if ($filter_name && $self->{'filter'}->{$filter_name}) {
528	$log->debug("filter '$filter_name' for $format");
529	$format = $self->{'filter'}->{$filter_name}->($format);
530	return unless(defined($format));
531	$log->debug("filter result: $format");
532	}
533	# do we have lookups?
534	if ($self->{'lookup'}) {
535	if ($self->{'lookup'}->can('lookup')) {
536	my @lookup = $self->{lookup}->lookup($format);
537	$log->debug("lookup $format", join(", ", @lookup));
538	return @lookup;
539	} else {
540	$log->warn("Have lookup object but can't invoke lookup method");
541	}
542	} else {
543	return $format;
544	}
545	} else {
546	return;
547	}
548	}
549
550
551	=head2 _rec_to_arr
552
553	Similar to C<parse> and C<fill_in>, but returns array of all repeatable fields. Usable
554	for fields which have lookups, so they shouldn't be parsed but rather
555	C<paste>d or C<fill_id>ed. Last argument is name of operation: C<paste> or C<fill_in>.
556
557	my @arr = $webpac->fill_in_to_arr($rec,'[v900];;[v250^a]','paste');
558
559	=cut
560
561	sub _rec_to_arr {
562	my $self = shift;
563
564	my ($rec, $format_utf8, $code) = @_;
565
566	my $log = $self->_get_logger();
567
568	$log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
569	return if (! $format_utf8);
570
571	$log->debug("using $code on $format_utf8");
572
573	my $i = 0;
574	my $max = 0;
575	my @arr;
576	my $rec_size = {};
577
578	while ($i <= $max) {
579	my @v = $self->$code($rec,$format_utf8,$i++,\$rec_size);
580	if ($rec_size) {
581	foreach my $f (keys %{ $rec_size }) {
582	$max = $rec_size->{$f} if ($rec_size->{$f} > $max);
583	}
584	$log->debug("max set to $max");
585	undef $rec_size;
586	}
587	if (@v) {
588	push @arr, @v;
589	} else {
590	push @arr, '' if ($max > $i);
591	}
592	}
593
594	$log->debug("format '$format_utf8' returned ",--$i," elements: ", sub { join(" \| ",@arr) }) if (@arr);
595
596	return @arr;
597	}
598
599
600	=head2 get_data
601
602	Returns value from record.
603
604	my $text = $self->get_data(\$rec,$f,$sf,$i,\$found,\$rec_size);
605
606	Required arguments are:
607
608	=over 8
609
610	=item C<$rec>
611
612	record reference
613
614	=item C<$f>
615
616	field
617
618	=item C<$sf>
619
620	optional subfield
621
622	=item C<$i>
623
624	index offset for repeatable values ( 0 ... $rec_size->{'400^a'} )
625
626	=item C<$found>
627
628	optional variable that will be incremeted if preset
629
630	=item C<$rec_size>
631
632	hash to hold maximum occurances of C<field^subfield> combinations
633	(which can be accessed using keys in same format)
634
635	=back
636
637	Returns value or empty string, updates C<$found> and C<rec_size>
638	if present.
639
640	=cut
641
642	sub get_data {
643	my $self = shift;
644
645	my ($rec,$f,$sf,$i,$found,$cache) = @_;
646
647	return '' unless ($$rec->{$f} && ref($$rec->{$f}) eq 'ARRAY');
648
649	if (defined($$cache)) {
650	$$cache->{ $f . ( $sf ? '^' . $sf : '' ) } \|\|= scalar @{ $$rec->{$f} };
651	}
652
653	return '' unless ($$rec->{$f}->[$i]);
654
655	{
656	no strict 'refs';
657	if (defined($sf)) {
658	$$found++ if (defined($$found) && $$rec->{$f}->[$i]->{$sf});
659	return $$rec->{$f}->[$i]->{$sf};
660	} else {
661	$$found++ if (defined($$found));
662	# it still might have subfields, just
663	# not specified, so we'll dump some debug info
664	if ($$rec->{$f}->[$i] =~ /HASH/o) {
665	my $out;
666	foreach my $k (keys %{$$rec->{$f}->[$i]}) {
667	my $v = $$rec->{$f}->[$i]->{$k};
668	$out .= '$' . $k .':' . $v if ($v);
669	}
670	return $out;
671	} else {
672	return $$rec->{$f}->[$i];
673	}
674	}
675	}
676	}
677
678
679	=head2 apply_format
680
681	Apply format specified in tag with C<format_name="name"> and
682	C<format_delimiter=";;">.
683
684	my $text = $webpac->apply_format($format_name,$format_delimiter,$data);
685
686	Formats can contain C<lookup{...}> if you need them.
687
688	=cut
689
690	sub apply_format {
691	my $self = shift;
692
693	my ($name,$delimiter,$data) = @_;
694
695	my $log = $self->_get_logger();
696
697	if (! $self->{'import_xml'}->{'format'}->{$name}) {
698	$log->warn("<format name=\"$name\"> is not defined in ",$self->{'import_xml_file'});
699	return $data;
700	}
701
702	$log->warn("no delimiter for format $name") if (! $delimiter);
703
704	my $format = $self->_x($self->{'import_xml'}->{'format'}->{$name}->{'content'}) \|\| $log->logdie("can't find format '$name'");
705
706	my @data = split(/\Q$delimiter\E/, $data);
707
708	my $out = sprintf($format, @data);
709	$log->debug("using format $name [$format] on $data to produce: $out");
710
711	if ($self->{'lookup_regex'} && $out =~ $self->{'lookup_regex'}) {
712	return $self->{'lookup'}->lookup($out);
713	} else {
714	return $out;
715	}
716
717	}
718
719	=head2 sort_arr
720
721	Sort array ignoring case and html in data
722
723	my @sorted = $webpac->sort_arr(@unsorted);
724
725	=cut
726
727	sub sort_arr {
728	my $self = shift;
729
730	my $log = $self->_get_logger();
731
732	# FIXME add Schwartzian Transformation?
733
734	my @sorted = sort {
735	$a =~ s#<[^>]+/*>##;
736	$b =~ s#<[^>]+/*>##;
737	lc($b) cmp lc($a)
738	} @_;
739	$log->debug("sorted values: ",sub { join(", ",@sorted) });
740
741	return @sorted;
742	}
743
744
745	=head1 INTERNAL METHODS
746
747	=head2 _sort_by_order
748
749	Sort xml tags data structure accoding to C<order=""> attribute.
750
751	=cut
752
753	sub _sort_by_order {
754	my $self = shift;
755
756	my $va = $self->{'import_xml'}->{'indexer'}->{$a}->{'order'} \|\|
757	$self->{'import_xml'}->{'indexer'}->{$a};
758	my $vb = $self->{'import_xml'}->{'indexer'}->{$b}->{'order'} \|\|
759	$self->{'import_xml'}->{'indexer'}->{$b};
760
761	return $va <=> $vb;
762	}
763
764	=head2 _x
765
766	Convert strings from C<conf/normalize/*.xml> encoding into application
767	specific encoding (optinally specified using C<code_page> to C<new>
768	constructor).
769
770	my $text = $n->_x('normalize text string');
771
772	This is a stub so that other modules doesn't have to implement it.
773
774	=cut
775
776	sub _x {
777	my $self = shift;
778	return shift;
779	}
780
781
782	=head1 AUTHOR
783
784	Dobrica Pavlinusic, C<< <dpavlin@rot13.org> >>
785
786	=head1 COPYRIGHT & LICENSE
787
788	Copyright 2005 Dobrica Pavlinusic, All Rights Reserved.
789
790	This program is free software; you can redistribute it and/or modify it
791	under the same terms as Perl itself.
792
793	=cut
794
795	1; # End of WebPAC::Lookup::Normalize