lib/WebPAC/Normalize.pm

package WebPAC::Normalize;

use warnings;
use strict;
use blib;
use WebPAC::Common;
use base 'WebPAC::Common';
use Data::Dumper;

=head1 NAME

WebPAC::Normalize - data mungling for normalisation

=head1 VERSION

Version 0.09

=cut

our $VERSION = '0.09';

=head1 SYNOPSIS

This package contains code that munges data to produce a normalized format.

It contains several assumptions:

=over

=item *

format of fields is defined using C<v123^a> notation for repeatable fields
or C<s123^a> for single (or first) value, where C<123> is field number and
C<a> is subfield.

=item *

source data records (C<$rec>) have unique identifiers in field C<000>

=item *

optional C<eval{length('v123^a') == 3}> tag at B<beginning of format> will be
perl code that is evaluated before producing output (value of field will be
interpolated before that)

=item *

optional C<filter{filter_name}> at B<beginning of format> will apply perl
code, defined as a code ref, to the format after field substitution to
produce output

There is one built-in filter called C<regex> which can be use like this:

  filter{regex(s/foo/bar/)}

=item *

optional C<lookup{...}> will be then performed. See C<WebPAC::Lookups>.

=item *

at end, optional C<format>s rules are resolved. Format rules are similar to
C<sprintf> and can also contain C<lookup{...}> which is performed after
values are inserted in format.

=back

This also describes the order in which transformations are applied (eval,
filter, lookup, format), which is important to understand when deciding how
to solve your data munging and normalisation process.


=head1 FUNCTIONS

=head2 new

Create new normalisation object

  my $n = new WebPAC::Normalize::Something(
        filter => {
                'filter_name_1' => sub {
                        # filter code
                        return length($_);
                }, ...
        },
        db => $db_obj,
        lookup_regex => $lookup->regex,
        lookup => $lookup_obj,
        prefix => 'foobar',
  );

Parameter C<filter> defines user-supplied snippets of perl code which can
be used with C<filter{...}> notation.

C<prefix> is used to form the filename for a database record (to support
multiple source files which are joined in one database).

Recommended parameter C<lookup_regex> is used to enable parsing of lookups
in structures. If you pass this parameter, you must also pass C<lookup>,
which is a C<WebPAC::Lookup> object.

=cut

# Constructor: blesses the supplied key/value pairs into an object, validates
# the lookup configuration and installs the built-in "regex" filter unless
# the caller already supplied a filter with that name.
sub new {
        my $class = shift;
        my $self = {@_};
        bless($self, $class);

        my $log = $self->_get_logger();

        my $has_regex  = $self->{'lookup_regex'} ? 1 : 0;
        my $has_lookup = $self->{'lookup'} ? 1 : 0;

        # those two must be in pair
        $log->logdie("lookup_regex and lookup must be in pair") if ($has_regex != $has_lookup);

        # eval guards the ->isa call: an unblessed reference would otherwise
        # die with a raw perl error instead of the message below
        $log->logdie("lookup must be WebPAC::Lookup object")
                if ($has_lookup && ! eval { $self->{'lookup'}->isa('WebPAC::Lookup') });

        $log->warn("no prefix defined. please check that!") unless ($self->{'prefix'});

        $log->debug("using lookup regex: ", $self->{lookup_regex}) if ($has_regex && $has_lookup);

        if (! $self->{filter} || ! $self->{filter}->{regex}) {
                $log->debug("adding built-in filter regex");
                $self->{filter}->{regex} = sub {
                        my ($val, $regex) = @_;

                        # nothing to apply (e.g. filter{regex} without arguments)
                        return $val unless (defined($regex) && length($regex));

                        # NOTE(review): string eval of an expression taken from
                        # the format definition -- only safe while normalisation
                        # rules come from a trusted source
                        eval "\$val =~ $regex";
                        $log->warn("filter regex '$regex' failed: $@") if ($@);

                        return $val;
                };
        }

        return $self;
}

=head2 all_tags

Returns all tags in document in specified order

  my $sorted_tags = $self->all_tags();

=cut

# Returns an array ref with the names of all indexer tags, ordered by
# _sort_by_order. The list is computed on first use and cached in
# $self->{_tags_by_order}.
sub all_tags {
        my $self = shift;

        # already computed for this object
        return $self->{_tags_by_order} if ($self->{_tags_by_order});

        my $log = $self->_get_logger;

        # sanity check
        $log->logdie("can't find self->{import_xml}->{indexer}") unless ($self->{import_xml}->{indexer});

        my @tags = keys %{ $self->{'import_xml'}->{'indexer'}};
        $log->debug("unsorted tags: " . join(", ", @tags));

        # the comparator reads $a and $b directly, so it has to be called
        # from within this sort block
        @tags = sort { $self->_sort_by_order } @tags;

        $log->debug("sorted tags: " . join(",", @tags) );

        $self->{_tags_by_order} = \@tags;

        return $self->{_tags_by_order};
}


=head2 data_structure

Create in-memory data structure which represents normalized layout from
C<conf/normalize/*.xml>.

This structures are used to produce output.

 my $ds = $webpac->data_structure($rec);

=cut

# Builds the normalized data structure for one record.
#
# Takes a record hash ref which must carry its unique id in field 000.
# When $self->{db} is set, a previously stored structure is returned from
# it, and a freshly built one is saved back. Returns a hash ref keyed by
# row name; must not be called in list context.
sub data_structure {
        my $self = shift;

        my $log = $self->_get_logger();

        my $rec = shift;
        $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);

        $log->debug("data_structure rec = ", sub { Dumper($rec) });

        $log->logdie("need unique ID (mfn) in field 000 of record " . Dumper($rec) ) unless (defined($rec->{'000'}));

        # NOTE(review): a false id (e.g. 0) is reported as "isn't array" too
        my $id = $rec->{'000'}->[0] || $log->logdie("field 000 isn't array!");

        # try stored copy first
        if ($self->{'db'}) {
                my $ds = $self->{'db'}->load_ds( id => $id, prefix => $self->{prefix} );
                $log->debug("load_ds( rec = ", sub { Dumper($rec) }, ") = ", sub { Dumper($ds) });
                return $ds if ($ds);
                $log->debug("cache miss, creating");
        }

        my $tags = $self->all_tags();

        $log->debug("tags: ",sub { join(", ",@{ $tags }) });

        my $ds;

        foreach my $field (@{ $tags }) {

                # stays undef unless at least one tag produced values
                my $row;

                foreach my $tag (@{$self->{'import_xml'}->{'indexer'}->{$field}->{$self->{'tag'}}}) {
                        my $format;

                        $log->logdie("expected tag HASH and got $tag") unless (ref($tag) eq 'HASH');
                        $format = $tag->{'value'} || $tag->{'content'};

                        # formats containing lookups are filled in, all others
                        # go through the delimiter-aware parser
                        my @v;
                        if ($self->{'lookup_regex'} && $format =~ $self->{'lookup_regex'}) {
                                @v = $self->_rec_to_arr($rec,$format,'fill_in');
                        } else {
                                @v = $self->_rec_to_arr($rec,$format,'parse');
                        }
                        if (! @v) {
                                $log->debug("$field <",$self->{tag},"> format: $format no values");
                                next;
                        } else {
                                $log->debug("$field <",$self->{tag},"> format: $format values: ", join(",", @v));
                        }

                        if ($tag->{'sort'}) {
                                @v = $self->sort_arr(@v);
                        }

                        # use format?
                        if ($tag->{'format_name'}) {
                                @v = map { $self->apply_format($tag->{'format_name'},$tag->{'format_delimiter'},$_) } @v;
                        }

                        # delimiter will join repeatable fields
                        if ($tag->{'delimiter'}) {
                                @v = ( join($tag->{'delimiter'}, @v) );
                        }

                        # default types 
                        my @types = qw(display search);
                        # override by type attribute
                        @types = ( $tag->{'type'} ) if ($tag->{'type'});

                        foreach my $type (@types) {
                                # append to previous line?
                                $log->debug("tag $field / $type [",sub { join(",",@v) }, "] ", $tag->{'append'} || 'no append');
                                if ($tag->{'append'}) {

                                        # I will delimit appended part with
                                        # delimiter (or ,)
                                        my $d = $tag->{'delimiter'};
                                        # default delimiter
                                        $d ||= " ";

                                        # glue new values onto the last value
                                        # already collected for this type
                                        my $last = pop @{$row->{$type}};
                                        $d = "" if (! $last);
                                        $last .= $d . join($d, @v);
                                        push @{$row->{$type}}, $last;

                                } else {
                                        push @{$row->{$type}}, @v;
                                }
                        }


                }

                if ($row) {
                        $row->{'tag'} = $field;

                        # TODO: name_sigular, name_plural
                        my $name = $self->{'import_xml'}->{'indexer'}->{$field}->{'name'};
                        my $row_name = $name ? $self->_x($name) : $field;

                        # post-sort all values in field
                        if ($self->{'import_xml'}->{'indexer'}->{$field}->{'sort'}) {
                                $log->warn("sort at field tag not implemented");
                        }

                        $ds->{$row_name} = $row;

                        $log->debug("row $field: ",sub { Dumper($row) });
                }

        }

        $self->{'db'}->save_ds(
                id => $id,
                ds => $ds,
                prefix => $self->{prefix},
        ) if ($self->{'db'});

        $log->debug("ds: ", sub { Dumper($ds) });

        $log->logconfess("data structure returned is not array any more!") if wantarray;

        return $ds;

}

=head2 parse

Perform smart parsing of string, skipping delimiters for fields which aren't
defined. It can also eval code in format starting with C<eval{...}> and
return output or nothing depending on eval code.

 my $text = $webpac->parse($rec,'eval{"v901^a" eq "Deskriptor"}descriptor: v250^a', $i);

Filters are implemented here. While simple form of filters looks like this:

  filter{name_of_filter}

but, filters can also have variable number of parametars like this:

  filter{name_of_filter(param,param,param)}

=cut

# names of undefined filters which were already reported, so that each one
# is warned about only once
my $warn_once;

# Parses one format for repetition $i of a record and returns the resulting
# string, or nothing if no field from the format was found, the eval{...}
# check failed or the filter returned undef.
#
# Arguments: record hash ref, format string, optional repetition index
# (defaults to 0) and optional reference to a rec_size hash which is handed
# on to get_data.
sub parse {
        my $self = shift;

        my ($rec, $format_utf8, $i, $rec_size) = @_;

        return if (! $format_utf8);

        my $log = $self->_get_logger();

        $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);

        $i = 0 if (! $i);

        my $format = $self->_x($format_utf8) || $log->logconfess("can't convert '$format_utf8' from UTF-8 to ",$self->{'code_page'});

        my @out;

        $log->debug("format: $format [$i]");

        # literal braces are escaped: newer perls warn about or reject an
        # unescaped "{" inside a pattern
        my $eval_code;
        # remove eval{...} from beginning
        $eval_code = $1 if ($format =~ s/^eval\{([^}]+)\}//s);

        my $filter_name;
        # remove filter{...} from beginning
        $filter_name = $1 if ($format =~ s/^filter\{([^}]+)\}//s);

        # did we found any (att all) field from format in row?
        my $found_any;
        # prefix before first field which we preserve it $found_any
        my $prefix;

        my $f_step = 1;

        # consume format one field placeholder at a time; whatever remains
        # in $format after the loop is the suffix
        while ($format =~ s/^(.*?)(v|s)(\d+)(?:\^(\w))?//s) {

                my $del = $1 || '';
                $prefix = $del if ($f_step == 1);

                my $fld_type = lc($2);

                # repeatable index
                my $r = $i;
                if ($fld_type eq 's') {
                        # single fields are only used together with an
                        # already found repeatable field
                        if ($found_any->{'v'}) {
                                $r = 0;
                        } else {
                                return;
                        }
                }

                my $found = 0;
                my $tmp = $self->get_data(\$rec,$3,$4,$r,\$found,$rec_size);

                if ($found) {
                        $found_any->{$fld_type} += $found;

                        # we will skip delimiter before first occurence of field!
                        push @out, $del unless($found_any->{$fld_type} == 1);
                        push @out, $tmp;
                }
                $f_step++;
        }

        # test if any fields found?
        return if (! $found_any->{'v'} && ! $found_any->{'s'});

        my $out = join('',@out);

        if ($out) {
                # add rest of format (suffix)
                $out .= $format;

                # add prefix if not there
                $out = $prefix . $out if ($out !~ m/^\Q$prefix\E/);

                $log->debug("result: $out");
        }

        if ($eval_code) {
                # list assignment: fill_in returns a list when a lookup object
                # is configured, and scalar context would yield its element
                # count instead of the code to evaluate
                my ($eval) = $self->fill_in($rec,$eval_code,$i);
                return unless ($eval);
                $log->debug("about to eval{$eval} format: $out");
                return if (! $self->_eval($eval));
        }
        
        if ($filter_name) {
                # split "name(arg,arg)" into filter name and its arguments
                my @filter_args;
                if ($filter_name =~ s/(\w+)\((.*)\)/$1/) {
                        @filter_args = split(/,/, $2);
                }
                if ($self->{'filter'}->{$filter_name}) {
                        $log->debug("about to filter{$filter_name} format: $out with arguments: ", join(",", @filter_args));
                        # filter receives current output as first argument
                        unshift @filter_args, $out;
                        $out = $self->{'filter'}->{$filter_name}->(@filter_args);
                        return unless(defined($out));
                        $log->debug("filter result: $out");
                } elsif (! $warn_once->{$filter_name}) {
                        $log->warn("trying to use undefined filter $filter_name");
                        $warn_once->{$filter_name}++;
                }
        }

        return $out;
}

=head2 fill_in

Workhorse of all: takes a record from the in-memory structure of the database
and a string with placeholders, and returns a string or an array with
substituted values from the record.

 my $text = $webpac->fill_in($rec,'v250^a');

Optional argument is ordinal number for repeatable fields. By default,
it's assume to be first repeatable field (fields are perl array, so first
element is 0).
Following example will read second value from repeatable field.

 my $text = $webpac->fill_in($rec,'Title: v250^a',1);

This function B<does not> perform parsing of format to intelligently skip
delimiters before fields which aren't used.

This method will automatically decode UTF-8 string to local code page
if needed.

There is optional parametar C<$record_size> which can be used to get sizes of
all C<field^subfield> combinations in this format.

 my $text = $webpac->fill_in($rec,'got: v900^a v900^x',0,\$rec_size); 

=cut

# Substitutes all field placeholders in a format with values from a record.
#
# Arguments: record hash ref, format string, optional repetition index
# (defaults to 0) and optional reference to a rec_size hash which is handed
# on to get_data.
#
# Returns nothing when no field was found, when the eval{...} check fails or
# when the filter returns undef. Otherwise returns the list produced by the
# lookup object (if one is configured and usable) or the filled-in format.
# Filters are called with the filled-in format only; filter arguments are
# not parsed here.
sub fill_in {
        my $self = shift;

        my $log = $self->_get_logger();

        my ($rec,$format,$i,$rec_size) = @_;

        $log->logconfess("need data record") unless ($rec);
        $log->logconfess("need format to parse") unless($format);

        # iteration (for repeatable fields)
        $i ||= 0;

        $log->logdie("infitite loop in format $format") if ($i > ($self->{'max_mfn'} || 9999));

        # FIXME remove for speedup?
        $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);

        if (utf8::is_utf8($format)) {
                $format = $self->_x($format);
        }

        my $found = 0;
        my $just_single = 1;

        # literal braces are escaped: newer perls warn about or reject an
        # unescaped "{" inside a pattern
        my $eval_code;
        # remove eval{...} from beginning
        $eval_code = $1 if ($format =~ s/^eval\{([^}]+)\}//s);

        my $filter_name;
        # remove filter{...} from beginning
        $filter_name = $1 if ($format =~ s/^filter\{([^}]+)\}//s);

        {
                # fix warnings
                no warnings 'uninitialized';

                # do actual replacement of placeholders
                # repeatable fields
                if ($format =~ s/v(\d+)(?:\^(\w))?/$self->get_data(\$rec,$1,$2,$i,\$found,$rec_size)/ges) {
                        $just_single = 0;
                }

                # non-repeatable fields
                if ($format =~ s/s(\d+)(?:\^(\w))?/$self->get_data(\$rec,$1,$2,0,\$found,$rec_size)/ges) {
                        # format with single fields only yields a value just
                        # for the first repetition
                        return if ($i > 0 && $just_single);
                }
        }

        return unless ($found);

        $log->debug("format: $format");

        if ($eval_code) {
                # list assignment: the recursive call returns a list when a
                # lookup object is configured, and scalar context would yield
                # its element count instead of the code to evaluate
                my ($eval) = $self->fill_in($rec,$eval_code,$i);
                return if (! $self->_eval($eval));
        }

        if ($filter_name && $self->{'filter'}->{$filter_name}) {
                $log->debug("filter '$filter_name' for $format");
                $format = $self->{'filter'}->{$filter_name}->($format);
                return unless(defined($format));
                $log->debug("filter result: $format");
        }

        # do we have lookups?
        if ($self->{'lookup'}) {
                if ($self->{'lookup'}->can('lookup')) {
                        my @lookup = $self->{lookup}->lookup($format);
                        $log->debug("lookup $format", join(", ", @lookup));
                        return @lookup;
                }
                # unusable lookup object: fall back to the plain value
                # instead of returning whatever the logger call returned
                $log->warn("Have lookup object but can't invoke lookup method");
        }

        return $format;
}


=head2 _rec_to_arr

Similar to C<parse> and C<fill_in>, but returns array of all repeatable fields. Usable
for fields which have lookups, so they shouldn't be parsed but rather
C<fill_in>ed. Last argument is name of operation: C<parse> or C<fill_in>.

 my @arr = $webpac->_rec_to_arr($rec,'[v900];;[v250^a]','fill_in');

=cut

# Calls method $code (given by name) once for every repetition of the fields
# used in the format and collects everything it returns into one list.
# The number of repetitions is taken from the rec_size hash which the first
# call fills in.
sub _rec_to_arr {
        my ($self, $rec, $format_utf8, $code) = @_;

        my $log = $self->_get_logger();

        $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
        return if (! $format_utf8);

        $log->debug("using $code on $format_utf8");

        my @arr;
        my $rec_size = {};
        my $max = 0;
        my $i = 0;

        while ($i <= $max) {
                my $current = $i++;
                my @v = $self->$code($rec,$format_utf8,$current,\$rec_size);

                # sizes are collected during the first call only; after
                # that the hash is dropped so it isn't filled again
                if ($rec_size) {
                        foreach my $size (values %{ $rec_size }) {
                                $max = $size if ($size > $max);
                        }
                        $log->debug("max set to $max");
                        $rec_size = undef;
                }

                if (@v) {
                        push @arr, @v;
                } elsif ($max > $i) {
                        # keep position of missing value
                        push @arr, '';
                }
        }

        if (@arr) {
                $log->debug("format '$format_utf8' returned ",$i - 1," elements: ", sub { join(" | ",@arr) });
        }

        return @arr;
}


=head2 get_data

Returns value from record.

 my $text = $self->get_data(\$rec,$f,$sf,$i,\$found,\$rec_size);

Required arguments are:

=over 8

=item C<$rec>

record reference

=item C<$f>

field

=item C<$sf>

optional subfield

=item C<$i>

index offset for repeatable values ( 0 ... $rec_size->{'400^a'} )

=item C<$found>

optional variable that will be incremented if present

=item C<$rec_size>

hash to hold maximum occurances of C<field^subfield> combinations
(which can be accessed using keys in same format)

=back

Returns value or empty string, updates C<$found> and C<rec_size>
if present.

=cut

# Returns value of field $f (optionally subfield $sf) for repetition $i.
#
# Arguments:
#   $rec   - reference to the record hash ref
#   $f     - field
#   $sf    - optional subfield
#   $i     - index of repetition
#   $found - optional reference to a counter which is incremented when a
#            value is found (only if the counter is defined)
#   $cache - optional reference to a rec_size hash ref which receives the
#            number of repetitions under key "field" or "field^subfield"
#
# Returns empty string if the field is missing, isn't an array or has no
# true value at $i. When a subfield is requested from a value which has no
# subfields, undef is returned.
sub get_data {
        my $self = shift;

        my ($rec,$f,$sf,$i,$found,$cache) = @_;

        my $values = $$rec->{$f};
        return '' unless ($values && ref($values) eq 'ARRAY');

        # $cache is optional: check the reference itself before looking
        # through it, dereferencing undef dies under strict refs
        if ($cache && defined($$cache)) {
                $$cache->{ $f . ( $sf ? '^' . $sf : '' ) } ||= scalar @{ $values };
        }

        my $value = $values->[$i];
        return '' unless ($value);

        # only real hashes have subfields; plain strings must never be
        # dereferenced (that would be a symbolic reference)
        my $has_subfields = (ref($value) && UNIVERSAL::isa($value, 'HASH')) ? 1 : 0;

        # $found is optional as well
        my $count_found = ($found && defined($$found)) ? 1 : 0;

        if (defined($sf)) {
                my $sf_value = $has_subfields ? $value->{$sf} : undef;
                $$found++ if ($count_found && $sf_value);
                return $sf_value;
        }

        $$found++ if ($count_found);

        return $value unless ($has_subfields);

        # it still might have subfields, just
        # not specified, so we'll dump some debug info
        my $out = '';
        foreach my $k (keys %{ $value }) {
                $out .= '$' . $k .':' . $value->{$k}." ";
        }
        return $out;
}


=head2 apply_format

Apply format specified in tag with C<format_name="name"> and
C<format_delimiter=";;">.

 my $text = $webpac->apply_format($format_name,$format_delimiter,$data);

Formats can contain C<lookup{...}> if you need them.

=cut

# Applies named sprintf-style format to data.
#
# Arguments: format name, delimiter on which $data is split into sprintf
# arguments, and the data itself. Without a delimiter the data is passed
# to the format as a single value.
#
# Returns $data unchanged if the format isn't defined. If the result
# matches lookup_regex, whatever the lookup object returns for it is
# returned instead.
sub apply_format {
        my $self = shift;

        my ($name,$delimiter,$data) = @_;

        my $log = $self->_get_logger();

        if (! $self->{'import_xml'}->{'format'}->{$name}) {
                $log->warn("<format name=\"$name\"> is not defined in ",$self->{'import_xml_file'});
                return $data;
        }

        $log->warn("no delimiter for format $name") if (! $delimiter);

        my $format = $self->_x($self->{'import_xml'}->{'format'}->{$name}->{'content'}) || $log->logdie("can't find format '$name'");

        # an empty pattern would make split return single characters, so
        # data without delimiter is kept in one piece
        my @data = (defined($delimiter) && length($delimiter))
                ? split(/\Q$delimiter\E/, $data)
                : ( $data );

        my $out = sprintf($format, @data);
        $log->debug("using format $name [$format] on $data to produce: $out");

        if ($self->{'lookup_regex'} && $out =~ $self->{'lookup_regex'}) {
                return $self->{'lookup'}->lookup($out);
        } else {
                return $out;
        }

}

=head2 sort_arr

Sort array ignoring case and html in data

 my @sorted = $webpac->sort_arr(@unsorted);

=cut

# Returns the values sorted by their lowercased text with html tags removed.
# Tags are removed only from the sort key: returned values and the
# caller's data are left untouched.
#
# NOTE(review): the comparison puts $b before $a, so the result is in
# descending order -- kept as it was, confirm that this is intended.
sub sort_arr {
        my $self = shift;

        my $log = $self->_get_logger();

        # sort keys are computed once per value (Schwartzian transform);
        # $a and $b alias the sorted elements, so they must not be modified
        my @sorted =
                map  { $_->[1] }
                sort { $b->[0] cmp $a->[0] }
                map  {
                        my $key = $_;
                        $key =~ s#<[^>]+/*>##g;
                        [ lc($key), $_ ]
                } @_;

        $log->debug("sorted values: ",sub { join(", ",@sorted) });

        return @sorted;
}


=head1 INTERNAL METHODS

=head2 _sort_by_order

Sort xml tags data structure according to C<order=""> attribute.

=cut

# Comparator for sort: compares the two indexer entries named by $a and $b
# numerically by their 'order' value. If an entry has no true 'order', the
# entry itself is used in the numeric comparison.
sub _sort_by_order {
        my $self = shift;

        my $indexer = $self->{'import_xml'}->{'indexer'};

        my ($left, $right) = map {
                $indexer->{$_}->{'order'} || $indexer->{$_}
        } ($a, $b);

        return $left <=> $right;
}

=head2 _x

Convert strings from C<conf/normalize/*.xml> encoding into application
specific encoding (optionally specified using C<code_page> to C<new>
constructor).

 my $text = $n->_x('normalize text string');

This is a stub so that other modules doesn't have to implement it.

=cut

# Encoding conversion stub: hands back its argument unchanged.
sub _x {
        my ($self, $text) = @_;
        return $text;
}


=head1 AUTHOR

Dobrica Pavlinusic, C<< <dpavlin@rot13.org> >>

=head1 COPYRIGHT & LICENSE

Copyright 2005 Dobrica Pavlinusic, All Rights Reserved.

This program is free software; you can redistribute it and/or modify it
under the same terms as Perl itself.

=cut

1; # End of WebPAC::Normalize
1	package WebPAC::Normalize;
2
3	use warnings;
4	use strict;
5	use blib;
6	use WebPAC::Common;
7	use base 'WebPAC::Common';
8	use Data::Dumper;
9
10	=head1 NAME
11
12	WebPAC::Normalize - data mungling for normalisation
13
14	=head1 VERSION
15
16	Version 0.09
17
18	=cut
19
20	our $VERSION = '0.09';
21
22	=head1 SYNOPSIS
23
24	This package contains code that mungle data to produce normalized format.
25
26	It contains several assumptions:
27
28	=over
29
30	=item *
31
32	format of fields is defined using C<v123^a> notation for repeatable fields
33	or C<s123^a> for single (or first) value, where C<123> is field number and
34	C<a> is subfield.
35
36	=item *
37
38	source data records (C<$rec>) have unique identifiers in field C<000>
39
40	=item *
41
42	optional C<eval{length('v123^a') == 3}> tag at B<beginning of format> will be
43	perl code that is evaluated before producing output (value of field will be
44	interpolated before that)
45
46	=item *
47
48	optional C<filter{filter_name}> at B<begining of format> will apply perl
49	code defined as code ref on format after field substitution to producing
50	output
51
52	There is one built-in filter called C<regex> which can be use like this:
53
54	filter{regex(s/foo/bar/)}
55
56	=item *
57
58	optional C<lookup{...}> will be then performed. See C<WebPAC::Lookups>.
59
60	=item *
61
62	at end, optional C<format>s rules are resolved. Format rules are similar to
63	C<sprintf> and can also contain C<lookup{...}> which is performed after
64	values are inserted in format.
65
66	=back
67
68	This also describes order in which transformations are applied (eval,
69	filter, lookup, format) which is important to undestand when deciding how to
70	solve your data mungling and normalisation process.
71
72
73
74
75	=head1 FUNCTIONS
76
77	=head2 new
78
79	Create new normalisation object
80
81	my $n = new WebPAC::Normalize::Something(
82	filter => {
83	'filter_name_1' => sub {
84	# filter code
85	return length($_);
86	}, ...
87	},
88	db => $db_obj,
89	lookup_regex => $lookup->regex,
90	lookup => $lookup_obj,
91	prefix => 'foobar',
92	);
93
94	Parametar C<filter> defines user supplied snippets of perl code which can
95	be use with C<filter{...}> notation.
96
97	C<prefix> is used to form filename for database record (to support multiple
98	source files which are joined in one database).
99
100	Recommended parametar C<lookup_regex> is used to enable parsing of lookups
101	in structures. If you pass this parametar, you must also pass C<lookup>
102	which is C<WebPAC::Lookup> object.
103
104	=cut
105
106	sub new {
107	my $class = shift;
108	my $self = {@_};
109	bless($self, $class);
110
111	my $r = $self->{'lookup_regex'} ? 1 : 0;
112	my $l = $self->{'lookup'} ? 1 : 0;
113
114	my $log = $self->_get_logger();
115
116	# those two must be in pair
117	if ( ($r & $l) != ($r \|\| $l) ) {
118	my $log = $self->_get_logger();
119	$log->logdie("lookup_regex and lookup must be in pair");
120	}
121
122	$log->logdie("lookup must be WebPAC::Lookup object") if ($self->{'lookup'} && ! $self->{'lookup'}->isa('WebPAC::Lookup'));
123
124	$log->warn("no prefix defined. please check that!") unless ($self->{'prefix'});
125
126	$log->debug("using lookup regex: ", $self->{lookup_regex}) if ($r && $l);
127
128	if (! $self->{filter} \|\| ! $self->{filter}->{regex}) {
129	$log->debug("adding built-in filter regex");
130	$self->{filter}->{regex} = sub {
131	my ($val, $regex) = @_;
132	eval "\$val =~ $regex";
133	return $val;
134	};
135	}
136
137	$self ? return $self : return undef;
138	}
139
140	=head2 all_tags
141
142	Returns all tags in document in specified order
143
144	my $sorted_tags = $self->all_tags();
145
146	=cut
147
148	sub all_tags {
149	my $self = shift;
150
151	if (! $self->{_tags_by_order}) {
152
153	my $log = $self->_get_logger;
154	# sanity check
155	$log->logdie("can't find self->{inport_xml}->{indexer}") unless ($self->{import_xml}->{indexer});
156
157	my @tags = keys %{ $self->{'import_xml'}->{'indexer'}};
158	$log->debug("unsorted tags: " . join(", ", @tags));
159
160	@tags = sort { $self->_sort_by_order } @tags;
161
162	$log->debug("sorted tags: " . join(",", @tags) );
163
164	$self->{_tags_by_order} = \@tags;
165	}
166
167	return $self->{_tags_by_order};
168	}
169
170
171
172	=head2 data_structure
173
174	Create in-memory data structure which represents normalized layout from
175	C<conf/normalize/*.xml>.
176
177	This structures are used to produce output.
178
179	my $ds = $webpac->data_structure($rec);
180
181	=cut
182
183	sub data_structure {
184	my $self = shift;
185
186	my $log = $self->_get_logger();
187
188	my $rec = shift;
189	$log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
190
191	$log->debug("data_structure rec = ", sub { Dumper($rec) });
192
193	$log->logdie("need unique ID (mfn) in field 000 of record " . Dumper($rec) ) unless (defined($rec->{'000'}));
194
195	my $id = $rec->{'000'}->[0] \|\| $log->logdie("field 000 isn't array!");
196
197	my $cache_file;
198
199	if ($self->{'db'}) {
200	my $ds = $self->{'db'}->load_ds( id => $id, prefix => $self->{prefix} );
201	$log->debug("load_ds( rec = ", sub { Dumper($rec) }, ") = ", sub { Dumper($ds) });
202	return $ds if ($ds);
203	$log->debug("cache miss, creating");
204	}
205
206	my $tags = $self->all_tags();
207
208	$log->debug("tags: ",sub { join(", ",@{ $tags }) });
209
210	my $ds;
211
212	foreach my $field (@{ $tags }) {
213
214	my $row;
215
216	#print "field $field [",$self->{'tag'},"] = ",Dumper($self->{'import_xml'}->{'indexer'}->{$field}->{$self->{'tag'}});
217
218	foreach my $tag (@{$self->{'import_xml'}->{'indexer'}->{$field}->{$self->{'tag'}}}) {
219	my $format;
220
221	$log->logdie("expected tag HASH and got $tag") unless (ref($tag) eq 'HASH');
222	$format = $tag->{'value'} \|\| $tag->{'content'};
223
224	my @v;
225	if ($self->{'lookup_regex'} && $format =~ $self->{'lookup_regex'}) {
226	@v = $self->_rec_to_arr($rec,$format,'fill_in');
227	} else {
228	@v = $self->_rec_to_arr($rec,$format,'parse');
229	}
230	if (! @v) {
231	$log->debug("$field <",$self->{tag},"> format: $format no values");
232	next;
233	} else {
234	$log->debug("$field <",$self->{tag},"> format: $format values: ", join(",", @v));
235	}
236
237	if ($tag->{'sort'}) {
238	@v = $self->sort_arr(@v);
239	}
240
241	# use format?
242	if ($tag->{'format_name'}) {
243	@v = map { $self->apply_format($tag->{'format_name'},$tag->{'format_delimiter'},$_) } @v;
244	}
245
246	# delimiter will join repeatable fields
247	if ($tag->{'delimiter'}) {
248	@v = ( join($tag->{'delimiter'}, @v) );
249	}
250
251	# default types
252	my @types = qw(display search);
253	# override by type attribute
254	@types = ( $tag->{'type'} ) if ($tag->{'type'});
255
256	foreach my $type (@types) {
257	# append to previous line?
258	$log->debug("tag $field / $type [",sub { join(",",@v) }, "] ", $row->{'append'} \|\| 'no append');
259	if ($tag->{'append'}) {
260
261	# I will delimit appended part with
262	# delimiter (or ,)
263	my $d = $tag->{'delimiter'};
264	# default delimiter
265	$d \|\|= " ";
266
267	my $last = pop @{$row->{$type}};
268	$d = "" if (! $last);
269	$last .= $d . join($d, @v);
270	push @{$row->{$type}}, $last;
271
272	} else {
273	push @{$row->{$type}}, @v;
274	}
275	}
276
277
278	}
279
280	if ($row) {
281	$row->{'tag'} = $field;
282
283	# TODO: name_sigular, name_plural
284	my $name = $self->{'import_xml'}->{'indexer'}->{$field}->{'name'};
285	my $row_name = $name ? $self->_x($name) : $field;
286
287	# post-sort all values in field
288	if ($self->{'import_xml'}->{'indexer'}->{$field}->{'sort'}) {
289	$log->warn("sort at field tag not implemented");
290	}
291
292	$ds->{$row_name} = $row;
293
294	$log->debug("row $field: ",sub { Dumper($row) });
295	}
296
297	}
298
299	$self->{'db'}->save_ds(
300	id => $id,
301	ds => $ds,
302	prefix => $self->{prefix},
303	) if ($self->{'db'});
304
305	$log->debug("ds: ", sub { Dumper($ds) });
306
307	$log->logconfess("data structure returned is not array any more!") if wantarray;
308
309	return $ds;
310
311	}
312
313	=head2 parse
314
315	Perform smart parsing of string, skipping delimiters for fields which aren't
316	defined. It can also eval code in format starting with C<eval{...}> and
317	return output or nothing depending on eval code.
318
319	my $text = $webpac->parse($rec,'eval{"v901^a" eq "Deskriptor"}descriptor: v250^a', $i);
320
321	Filters are implemented here. While simple form of filters looks like this:
322
323	filter{name_of_filter}
324
325	but, filters can also have variable number of parametars like this:
326
327	filter{name_of_filter(param,param,param)}
328
329	=cut
330
# names of undefined filters which were already reported, so that each
# of them produces just one warning
my $warn_once;

# Smart parsing of a format: substitutes v123^a / s123^a placeholders with
# values from $rec and skips delimiters in front of fields which have no
# value.
#
# Arguments: $rec (hash with record), $format_utf8 (format string),
# $i (optional occurrence of repeatable fields, defaults to 0) and
# $rec_size (optional, passed unchanged to get_data).
#
# Returns resulting string, or nothing if format is empty, no field was
# found, the eval{...} test is false or filter returned undef.
sub parse {
	my $self = shift;

	my ($rec, $format_utf8, $i, $rec_size) = @_;

	return if (! $format_utf8);

	my $log = $self->_get_logger();

	$log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);

	$i ||= 0;

	my $format = $self->_x($format_utf8) || $log->logconfess("can't convert '$format_utf8' from UTF-8 to ",$self->{'code_page'});

	my @out;

	$log->debug("format: $format [$i]");

	# remove eval{...} from beginning (braces are escaped because an
	# unescaped literal { in a pattern is rejected by some perl versions)
	my $eval_code;
	$eval_code = $1 if ($format =~ s/^eval\{([^}]+)\}//s);

	# remove filter{...} from beginning
	my $filter_name;
	$filter_name = $1 if ($format =~ s/^filter\{([^}]+)\}//s);

	# number of values found so far, per field type ('v' or 's')
	my $found_any = {};
	# text before first field, put back in front of output at the end
	my $prefix;

	my $f_step = 1;

	# consume format from the left, one placeholder at a time
	while ($format =~ s/^(.*?)(v|s)(\d+)(?:\^(\w))?//s) {

		# copy captures right away, before anything can reset them
		my ($del, $fld_type, $fld, $sf) = ($1, $2, $3, $4);
		# keep delimiter as is, even if it is a literal "0"
		$del = '' unless (defined($del));
		$prefix = $del if ($f_step == 1);

		# repeatable index
		my $r = $i;
		if ($fld_type eq 's') {
			# single value fields are used only after some repeatable
			# field produced a value, and always take first occurrence
			return unless ($found_any->{'v'});
			$r = 0;
		}

		my $found = 0;
		my $tmp = $self->get_data(\$rec,$fld,$sf,$r,\$found,$rec_size);

		if ($found) {
			$found_any->{$fld_type} += $found;

			# we will skip delimiter before first occurrence of field!
			push @out, $del unless($found_any->{$fld_type} == 1);
			push @out, $tmp;
		}
		$f_step++;
	}

	# test if any fields found?
	return if (! $found_any->{'v'} && ! $found_any->{'s'});

	my $out = join('',@out);

	if ($out) {
		# add rest of format (suffix)
		$out .= $format;

		# add prefix if not there
		$out = $prefix . $out if ($out !~ m/^\Q$prefix\E/);

		$log->debug("result: $out");
	}

	if ($eval_code) {
		# eval code has its placeholders filled in before it is evaluated
		my $eval = $self->fill_in($rec,$eval_code,$i) || return;
		$log->debug("about to eval{$eval} format: $out");
		return if (! $self->_eval($eval));
	}

	if ($filter_name) {
		# filter{name(a,b,c)} is split into filter name and its arguments
		my @filter_args;
		if ($filter_name =~ s/(\w+)\((.*)\)/$1/) {
			@filter_args = split(/,/, $2);
		}
		if ($self->{'filter'}->{$filter_name}) {
			$log->debug("about to filter{$filter_name} format: $out with arguments: ", join(",", @filter_args));
			# filter receives output so far as its first argument
			unshift @filter_args, $out;
			$out = $self->{'filter'}->{$filter_name}->(@filter_args);
			return unless(defined($out));
			$log->debug("filter result: $out");
		} elsif (! $warn_once->{$filter_name}) {
			$log->warn("trying to use undefined filter $filter_name");
			$warn_once->{$filter_name}++;
		}
	}

	return $out;
}
437
438	=head2 fill_in
439
440	Workhorse of all: takes record from in-memory structure of database and
441	strings with placeholders and returns string or array with substituted
442	values from record.
443
444	my $text = $webpac->fill_in($rec,'v250^a');
445
446	Optional argument is ordinal number for repeatable fields. By default,
447	it's assumed to be the first repeatable field (fields are perl arrays, so
448	the first element is 0).
449	Following example will read second value from repeatable field.
450
451	my $text = $webpac->fill_in($rec,'Title: v250^a',1);
452
453	This function B<does not> perform parsing of format to intelligently skip
454	delimiters before fields which aren't used.
455
456	This method will automatically decode UTF-8 string to local code page
457	if needed.
458
459	There is optional parameter C<$record_size> which can be used to get sizes of
460	all C<field^subfield> combinations in this format.
461
462	my $text = $webpac->fill_in($rec,'got: v900^a v900^x',0,\$rec_size);
463
464	=cut
465
# Replaces all v123^a / s123^a placeholders in format with values from
# $rec, without any special handling of delimiters.
#
# Arguments: $rec (hash with record), $format, $i (optional occurrence of
# repeatable fields, defaults to 0) and $rec_size (optional, passed
# unchanged to get_data).
#
# Returns nothing if no value was found (or eval{...} test is false, or
# filter returned undef), results of lookup if lookup object is usable,
# filled-in format otherwise.
sub fill_in {
	my $self = shift;

	my $log = $self->_get_logger();

	my ($rec,$format,$i,$rec_size) = @_;

	$log->logconfess("need data record") unless ($rec);
	$log->logconfess("need format to parse") unless($format);

	# iteration (for repeatable fields)
	$i ||= 0;

	$log->logdie("infitite loop in format $format") if ($i > ($self->{'max_mfn'} || 9999));

	# FIXME remove for speedup?
	$log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);

	if (utf8::is_utf8($format)) {
		$format = $self->_x($format);
	}

	my $found = 0;

	# remove eval{...} from beginning (braces are escaped because an
	# unescaped literal { in a pattern is rejected by some perl versions)
	my $eval_code;
	$eval_code = $1 if ($format =~ s/^eval\{([^}]+)\}//s);

	# remove filter{...} from beginning
	my $filter_name;
	$filter_name = $1 if ($format =~ s/^filter\{([^}]+)\}//s);

	my $has_repeatable = 0;
	my $has_single = 0;

	{
		# get_data can return undef for subfield which doesn't exist
		no warnings 'uninitialized';

		# Both kinds of placeholders are replaced in a single pass, so
		# that values which were just inserted are never scanned for
		# placeholders again (data like "vols12" contains "s12").
		$format =~ s{([vs])(\d+)(?:\^(\w))?}{
			my ($type, $fld, $sf) = ($1, $2, $3);
			# non-repeatable fields always use first occurrence
			my $occurrence = 0;
			if ($type eq 'v') {
				$has_repeatable = 1;
				$occurrence = $i;
			} else {
				$has_single = 1;
			}
			$self->get_data(\$rec,$fld,$sf,$occurrence,\$found,$rec_size);
		}ges;
	}

	# format with just non-repeatable fields produces output only once
	return if ($has_single && ! $has_repeatable && $i > 0);

	return unless ($found);

	$log->debug("format: $format");

	if ($eval_code) {
		my $eval = $self->fill_in($rec,$eval_code,$i);
		return if (! $self->_eval($eval));
	}

	if ($filter_name && $self->{'filter'}->{$filter_name}) {
		$log->debug("filter '$filter_name' for $format");
		$format = $self->{'filter'}->{$filter_name}->($format);
		return unless(defined($format));
		$log->debug("filter result: $format");
	}

	# do we have lookups?
	if ($self->{'lookup'}) {
		if ($self->{'lookup'}->can('lookup')) {
			my @lookup = $self->{lookup}->lookup($format);
			$log->debug("lookup $format", join(", ", @lookup));
			return @lookup;
		}
		# unusable lookup object: fall through and return format itself
		# instead of whatever the logger call returns
		$log->warn("Have lookup object but can't invoke lookup method");
	}

	return $format;
}
543
544
545	=head2 _rec_to_arr
546
547	Similar to C<parse> and C<fill_in>, but returns array of all repeatable fields. Usable
548	for fields which have lookups, so they shouldn't be parsed but rather
549	C<paste>d or C<fill_in>ed. Last argument is name of operation: C<paste> or C<fill_in>.
550
551	my @arr = $webpac->_rec_to_arr($rec,'[v900];;[v250^a]','paste');
552
553	=cut
554
# Calls method named $code on ($rec, $format_utf8, occurrence, \$rec_size)
# for every occurrence and collects everything it returns in one array.
#
# Number of iterations is taken from biggest value which the first call
# left in $rec_size hash.
sub _rec_to_arr {
	my $self = shift;

	my ($rec, $format_utf8, $code) = @_;

	my $log = $self->_get_logger();

	$log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
	return if (! $format_utf8);

	$log->debug("using $code on $format_utf8");

	my $i = 0;
	my $max = 0;
	my @arr;
	my $rec_size = {};

	while ($i <= $max) {
		# $i is already incremented when the checks below are made
		my @v = $self->$code($rec,$format_utf8,$i++,\$rec_size);

		# sizes are collected only once, on first iteration
		if ($rec_size) {
			foreach my $count (values %{ $rec_size }) {
				$max = $count if ($count > $max);
			}
			$log->debug("max set to $max");
			undef $rec_size;
		}

		if (@v) {
			push @arr, @v;
		} elsif ($max > $i) {
			# empty placeholder for occurrence without value
			push @arr, '';
		}
	}

	# $i - 1 is the same number the old --$i logged, without changing
	# the counter inside a logging call
	$log->debug("format '$format_utf8' returned ",$i - 1," elements: ", sub { join(" | ",@arr) }) if (@arr);

	return @arr;
}
592
593
594	=head2 get_data
595
596	Returns value from record.
597
598	my $text = $self->get_data(\$rec,$f,$sf,$i,\$found,\$rec_size);
599
600	Required arguments are:
601
602	=over 8
603
604	=item C<$rec>
605
606	record reference
607
608	=item C<$f>
609
610	field
611
612	=item C<$sf>
613
614	optional subfield
615
616	=item C<$i>
617
618	index offset for repeatable values ( 0 ... $rec_size->{'400^a'} )
619
620	=item C<$found>
621
622	optional variable that will be incremented if present
623
624	=item C<$rec_size>
625
626	hash to hold maximum occurrences of C<field^subfield> combinations
627	(which can be accessed using keys in same format)
628
629	=back
630
631	Returns value or empty string, updates C<$found> and C<rec_size>
632	if present.
633
634	=cut
635
# Returns value of field $f (and optionally subfield $sf) from occurrence
# $i of record.
#
# Arguments:
#   $rec   - reference to record (reference to hash reference)
#   $f     - field
#   $sf    - subfield (optional)
#   $i     - index of occurrence
#   $found - optional reference to counter, incremented when value is found
#   $cache - optional reference to scalar holding hash in which number of
#            occurrences is stored under key "field" or "field^subfield"
#
# Returns empty string if field or occurrence doesn't exist. Requested
# subfield which doesn't exist returns undef (and isn't counted as found).
sub get_data {
	my $self = shift;

	my ($rec,$f,$sf,$i,$found,$cache) = @_;

	my $occurrences = $$rec->{$f};
	return '' unless ($occurrences && ref($occurrences) eq 'ARRAY');

	# $found and $cache are optional, so check that they are references
	# before dereferencing them (dereferencing undef dies under strict)
	my $size_of;
	$size_of = $$cache if (ref($cache) eq 'REF' || ref($cache) eq 'SCALAR');
	my $count_found = (ref($found) eq 'SCALAR' && defined($$found)) ? 1 : 0;

	if (ref($size_of) eq 'HASH') {
		# length() instead of truth test, so that subfield 0 gets its own key
		my $key = $f . ( (defined($sf) && length($sf)) ? '^' . $sf : '' );
		$size_of->{$key} ||= scalar @{ $occurrences };
	}

	my $value = $occurrences->[$i];
	return '' unless ($value);

	# UNIVERSAL::isa is true for plain and for blessed hashes, but not
	# for strings which just happen to contain text HASH
	my $has_subfields = UNIVERSAL::isa($value, 'HASH');

	if (defined($sf)) {
		# occurrence without subfields can't have requested subfield
		my $sf_value = $has_subfields ? $value->{$sf} : undef;
		$$found++ if ($count_found && $sf_value);
		return $sf_value;
	}

	$$found++ if ($count_found);

	# it still might have subfields, just not specified, so we'll dump
	# all of them (sorted, to get the same output every time)
	if ($has_subfields) {
		return join('', map { '$' . $_ . ':' . $value->{$_} . ' ' } sort keys %{ $value });
	}

	return $value;
}
670
671
672	=head2 apply_format
673
674	Apply format specified in tag with C<format_name="name"> and
675	C<format_delimiter=";;">.
676
677	my $text = $webpac->apply_format($format_name,$format_delimiter,$data);
678
679	Formats can contain C<lookup{...}> if you need them.
680
681	=cut
682
# Applies sprintf-like format named $name to $data, which is first split
# on $delimiter into arguments for the format.
#
# Returns $data unchanged if format isn't defined. If result matches
# lookup_regex, result of lookup is returned instead of formatted string.
sub apply_format {
	my $self = shift;

	my ($name,$delimiter,$data) = @_;

	my $log = $self->_get_logger();

	if (! $self->{'import_xml'}->{'format'}->{$name}) {
		$log->warn("<format name=\"$name\"> is not defined in ",$self->{'import_xml_file'});
		return $data;
	}

	$log->warn("no delimiter for format $name") if (! $delimiter);

	my $format = $self->_x($self->{'import_xml'}->{'format'}->{$name}->{'content'}) || $log->logdie("can't find format '$name'");

	# Without delimiter there is nothing to split on: use data as single
	# value (empty pattern would split it into separate characters).
	my @data = (defined($delimiter) && length($delimiter))
		? split(/\Q$delimiter\E/, $data)
		: ($data);

	my $out = sprintf($format, @data);
	$log->debug("using format $name [$format] on $data to produce: $out");

	if ($self->{'lookup_regex'} && $out =~ $self->{'lookup_regex'}) {
		return $self->{'lookup'}->lookup($out);
	}

	return $out;
}
711
712	=head2 sort_arr
713
714	Sort array ignoring case and html in data
715
716	my @sorted = $webpac->sort_arr(@unsorted);
717
718	=cut
719
# Sorts values ignoring case and html tags in them.
#
# Values are returned unchanged, in descending order (same direction of
# comparison as before). Tags are removed only from keys used for
# comparison: modifying $a and $b inside of sort block would change
# caller's data, because they are aliases to elements being sorted.
sub sort_arr {
	my $self = shift;

	my $log = $self->_get_logger();

	my @sorted =
		map  { $_->[1] }
		sort { $b->[0] cmp $a->[0] }
		map  {
			my $key = defined($_) ? $_ : '';
			$key =~ s#<[^>]+/*>##g;
			[ lc($key), $_ ]
		} @_;

	$log->debug("sorted values: ",sub { join(", ",@sorted) });

	return @sorted;
}
736
737
738	=head1 INTERNAL METHODS
739
740	=head2 _sort_by_order
741
742	Sort xml tags data structure according to C<order=""> attribute.
743
744	=cut
745
# Comparison for sort: compares tags in $a and $b (package variables set
# by sort) numerically by their order attribute in indexer part of
# import_xml. Tag without order (or with order 0) is compared using
# numeric value of its indexer entry instead.
sub _sort_by_order {
	my $self = shift;

	my $indexer = $self->{'import_xml'}->{'indexer'};

	my $rank = sub {
		my $tag = shift;
		return $indexer->{$tag}->{'order'} || $indexer->{$tag};
	};

	return $rank->($a) <=> $rank->($b);
}
756
757	=head2 _x
758
759	Convert strings from C<conf/normalize/*.xml> encoding into application
760	specific encoding (optionally specified using C<code_page> to C<new>
761	constructor).
762
763	my $text = $n->_x('normalize text string');
764
765	This is a stub so that other modules don't have to implement it.
766
767	=cut
768
# Stub for encoding conversion: hands back its argument untouched.
sub _x {
	my ($self, $text) = @_;

	return $text;
}
773
774
775	=head1 AUTHOR
776
777	Dobrica Pavlinusic, C<< <dpavlin@rot13.org> >>
778
779	=head1 COPYRIGHT & LICENSE
780
781	Copyright 2005 Dobrica Pavlinusic, All Rights Reserved.
782
783	This program is free software; you can redistribute it and/or modify it
784	under the same terms as Perl itself.
785
786	=cut
787
788	1; # End of WebPAC::Normalize