lib/WebPAC/Normalize.pm

package WebPAC::Normalize;

use warnings;
use strict;
use base 'WebPAC::Common';
use Data::Dumper;

=head1 NAME

WebPAC::Normalize - data mungling for normalisation

=head1 VERSION

Version 0.08

=cut

our $VERSION = '0.08';

=head1 SYNOPSIS

This package contains code that mungle data to produce normalized format.

It contains several assumptions:

=over

=item *

format of fields is defined using C<v123^a> notation for repeatable fields
or C<s123^a> for single (or first) value, where C<123> is field number and
C<a> is subfield.

=item *

source data records (C<$rec>) have unique identifiers in field C<000>

=item *

optional C<eval{length('v123^a') == 3}> tag at B<beginning of format> will be
perl code that is evaluated before producing output (value of field will be
interpolated before that)

=item *

optional C<filter{filter_name}> at B<begining of format> will apply perl
code defined as code ref on format after field substitution to producing
output

There is one built-in filter called C<regex> which can be use like this:

  filter{regex(s/foo/bar/)}

=item *

optional C<lookup{...}> will be then performed. See C<WebPAC::Lookups>.

=item *

at end, optional C<format>s rules are resolved. Format rules are similar to
C<sprintf> and can also contain C<lookup{...}> which is performed after
values are inserted in format.

=back

This also describes order in which transformations are applied (eval,
filter, lookup, format) which is important to undestand when deciding how to
solve your data mungling and normalisation process.


=head1 FUNCTIONS

=head2 new

Create new normalisation object

  my $n = new WebPAC::Normalize::Something(
        filter => {
                'filter_name_1' => sub {
                        # filter code
                        return length($_);
                }, ...
        },
        db => $db_obj,
        lookup_regex => $lookup->regex,
        lookup => $lookup_obj,
        prefix => 'foobar',
  );

Parametar C<filter> defines user supplied snippets of perl code which can
be use with C<filter{...}> notation.

C<prefix> is used to form filename for database record (to support multiple
source files which are joined in one database).

Recommended parametar C<lookup_regex> is used to enable parsing of lookups
in structures. If you pass this parametar, you must also pass C<lookup>
which is C<WebPAC::Lookup> object.

=cut

sub new {
        my $class = shift;
        my $self = {@_};
        bless($self, $class);

        my $r = $self->{'lookup_regex'} ? 1 : 0;
        my $l = $self->{'lookup'} ? 1 : 0;

        my $log = $self->_get_logger();

        # those two must be in pair
        if ( ($r & $l) != ($r || $l) ) {
                my $log = $self->_get_logger();
                $log->logdie("lookup_regex and lookup must be in pair");
        }

        $log->logdie("lookup must be WebPAC::Lookup object") if ($self->{'lookup'} && ! $self->{'lookup'}->isa('WebPAC::Lookup'));

        $log->warn("no prefix defined. please check that!") unless ($self->{'prefix'});

        $log->debug("using lookup regex: ", $self->{lookup_regex}) if ($r && $l);

        if (! $self->{filter} || ! $self->{filter}->{regex}) {
                $log->debug("adding built-in filter regex");
                $self->{filter}->{regex} = sub {
                        my ($val, $regex) = @_;
                        eval "\$val =~ $regex";
                        return $val;
                };
        }

        $self ? return $self : return undef;
}


=head2 data_structure

Create in-memory data structure which represents normalized layout from
C<conf/normalize/*.xml>.

This structures are used to produce output.

 my $ds = $webpac->data_structure($rec);

=cut

sub data_structure {
        my $self = shift;

        my $log = $self->_get_logger();

        my $rec = shift;
        $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);

        $log->debug("data_structure rec = ", sub { Dumper($rec) });

        $log->logdie("need unique ID (mfn) in field 000 of record " . Dumper($rec) ) unless (defined($rec->{'000'}));

        my $id = $rec->{'000'}->[0] || $log->logdie("field 000 isn't array!");

        my $cache_file;

        if ($self->{'db'}) {
                my $ds = $self->{'db'}->load_ds( id => $id, prefix => $self->{prefix} );
                $log->debug("load_ds( rec = ", sub { Dumper($rec) }, ") = ", sub { Dumper($ds) });
                return $ds if ($ds);
                $log->debug("cache miss, creating");
        }

        my @sorted_tags;
        if ($self->{tags_by_order}) {
                @sorted_tags = @{$self->{tags_by_order}};
        } else {
                @sorted_tags = sort { $self->_sort_by_order } keys %{$self->{'import_xml'}->{'indexer'}};
                $self->{tags_by_order} = \@sorted_tags;
        }

        my $ds;

        $log->debug("tags: ",sub { join(", ",@sorted_tags) });

        foreach my $field (@sorted_tags) {

                my $row;

#print "field $field [",$self->{'tag'},"] = ",Dumper($self->{'import_xml'}->{'indexer'}->{$field}->{$self->{'tag'}});

                foreach my $tag (@{$self->{'import_xml'}->{'indexer'}->{$field}->{$self->{'tag'}}}) {
                        my $format;

                        $log->logdie("expected tag HASH and got $tag") unless (ref($tag) eq 'HASH');
                        $format = $tag->{'value'} || $tag->{'content'};

                        my @v;
                        if ($self->{'lookup_regex'} && $format =~ $self->{'lookup_regex'}) {
                                @v = $self->fill_in_to_arr($rec,$format);
                        } else {
                                @v = $self->parse_to_arr($rec,$format);
                        }
                        if (! @v) {
                                $log->debug("$field <",$self->{tag},"> format: $format no values");
#                               next;
                        } else {
                                $log->debug("$field <",$self->{tag},"> format: $format values: ", join(",", @v));
                        }

                        if ($tag->{'sort'}) {
                                @v = $self->sort_arr(@v);
                        }

                        # use format?
                        if ($tag->{'format_name'}) {
                                @v = map { $self->apply_format($tag->{'format_name'},$tag->{'format_delimiter'},$_) } @v;
                        }

                        # delimiter will join repeatable fields
                        if ($tag->{'delimiter'}) {
                                @v = ( join($tag->{'delimiter'}, @v) );
                        }

                        # default types 
                        my @types = qw(display search);
                        # override by type attribute
                        @types = ( $tag->{'type'} ) if ($tag->{'type'});

                        foreach my $type (@types) {
                                # append to previous line?
                                $log->debug("tag $field / $type [",sub { join(",",@v) }, "] ", $row->{'append'} || 'no append');
                                if ($tag->{'append'}) {

                                        # I will delimit appended part with
                                        # delimiter (or ,)
                                        my $d = $tag->{'delimiter'};
                                        # default delimiter
                                        $d ||= " ";

                                        my $last = pop @{$row->{$type}};
                                        $d = "" if (! $last);
                                        $last .= $d . join($d, @v);
                                        push @{$row->{$type}}, $last;

                                } else {
                                        push @{$row->{$type}}, @v;
                                }
                        }


                }

                if ($row) {
                        $row->{'tag'} = $field;

                        # TODO: name_sigular, name_plural
                        my $name = $self->{'import_xml'}->{'indexer'}->{$field}->{'name'};
                        my $row_name = $name ? $self->_x($name) : $field;

                        # post-sort all values in field
                        if ($self->{'import_xml'}->{'indexer'}->{$field}->{'sort'}) {
                                $log->warn("sort at field tag not implemented");
                        }

                        $ds->{$row_name} = $row;

                        $log->debug("row $field: ",sub { Dumper($row) });
                }

        }

        $self->{'db'}->save_ds(
                id => $id,
                ds => $ds,
                prefix => $self->{prefix},
        ) if ($self->{'db'});

        $log->debug("ds: ", sub { Dumper($ds) });

        $log->logconfess("data structure returned is not array any more!") if wantarray;

        return $ds;

}

=head2 parse

Perform smart parsing of string, skipping delimiters for fields which aren't
defined. It can also eval code in format starting with C<eval{...}> and
return output or nothing depending on eval code.

 my $text = $webpac->parse($rec,'eval{"v901^a" eq "Deskriptor"}descriptor: v250^a', $i);

Filters are implemented here. While simple form of filters looks like this:

  filter{name_of_filter}

but, filters can also have variable number of parametars like this:

  filter{name_of_filter(param,param,param)}

=cut

my $warn_once;

sub parse {
        my $self = shift;

        my ($rec, $format_utf8, $i) = @_;

        return if (! $format_utf8);

        my $log = $self->_get_logger();

        $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);

        $i = 0 if (! $i);

        my $format = $self->_x($format_utf8) || $log->logconfess("can't convert '$format_utf8' from UTF-8 to ",$self->{'code_page'});

        my @out;

        $log->debug("format: $format [$i]");

        my $eval_code;
        # remove eval{...} from beginning
        $eval_code = $1 if ($format =~ s/^eval{([^}]+)}//s);

        my $filter_name;
        # remove filter{...} from beginning
        $filter_name = $1 if ($format =~ s/^filter{([^}]+)}//s);

        # did we found any (att all) field from format in row?
        my $found_any;
        # prefix before first field which we preserve it $found_any
        my $prefix;

        my $f_step = 1;

        while ($format =~ s/^(.*?)(v|s)(\d+)(?:\^(\w))?//s) {

                my $del = $1 || '';
                $prefix = $del if ($f_step == 1);

                my $fld_type = lc($2);

                # repeatable index
                my $r = $i;
                if ($fld_type eq 's') {
                        if ($found_any->{'v'}) {
                                $r = 0;
                        } else {
                                return;
                        }
                }

                my $found = 0;
                my $tmp = $self->get_data(\$rec,$3,$4,$r,\$found);

                if ($found) {
                        $found_any->{$fld_type} += $found;

                        # we will skip delimiter before first occurence of field!
                        push @out, $del unless($found_any->{$fld_type} == 1);
                        push @out, $tmp;
                }
                $f_step++;
        }

        # test if any fields found?
        return if (! $found_any->{'v'} && ! $found_any->{'s'});

        my $out = join('',@out);

        if ($out) {
                # add rest of format (suffix)
                $out .= $format;

                # add prefix if not there
                $out = $prefix . $out if ($out !~ m/^\Q$prefix\E/);

                $log->debug("result: $out");
        }

        if ($eval_code) {
                my $eval = $self->fill_in($rec,$eval_code,$i) || return;
                $log->debug("about to eval{$eval} format: $out");
                return if (! $self->_eval($eval));
        }
        
        if ($filter_name) {
                my @filter_args;
                if ($filter_name =~ s/(\w+)\((.*)\)/$1/) {
                        @filter_args = split(/,/, $2);
                }
                if ($self->{'filter'}->{$filter_name}) {
                        $log->debug("about to filter{$filter_name} format: $out with arguments: ", join(",", @filter_args));
                        unshift @filter_args, $out;
                        $out = $self->{'filter'}->{$filter_name}->(@filter_args);
                        return unless(defined($out));
                        $log->debug("filter result: $out");
                } elsif (! $warn_once->{$filter_name}) {
                        $log->warn("trying to use undefined filter $filter_name");
                        $warn_once->{$filter_name}++;
                }
        }

        return $out;
}

=head2 parse_to_arr

Similar to C<parse>, but returns array of all repeatable fields

 my @arr = $webpac->parse_to_arr($rec,'v250^a');

=cut

sub parse_to_arr {
        my $self = shift;

        my ($rec, $format_utf8) = @_;

        my $log = $self->_get_logger();

        $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
        return if (! $format_utf8);

        my $i = 0;
        my @arr;

        while (my $v = $self->parse($rec,$format_utf8,$i++)) {
                push @arr, $v;
        }

        $log->debug("format '$format_utf8' returned ",--$i," elements: ", sub { join(" | ",@arr) }) if (@arr);

        return @arr;
}


=head2 fill_in

Workhourse of all: takes record from in-memory structure of database and
strings with placeholders and returns string or array of with substituted
values from record.

 my $text = $webpac->fill_in($rec,'v250^a');

Optional argument is ordinal number for repeatable fields. By default,
it's assume to be first repeatable field (fields are perl array, so first
element is 0).
Following example will read second value from repeatable field.

 my $text = $webpac->fill_in($rec,'Title: v250^a',1);

This function B<does not> perform parsing of format to inteligenty skip
delimiters before fields which aren't used.

This method will automatically decode UTF-8 string to local code page
if needed.

=cut

sub fill_in {
        my $self = shift;

        my $log = $self->_get_logger();

        my $rec = shift || $log->logconfess("need data record");
        my $format = shift || $log->logconfess("need format to parse");
        # iteration (for repeatable fields)
        my $i = shift || 0;

        $log->logdie("infitite loop in format $format") if ($i > ($self->{'max_mfn'} || 9999));

        # FIXME remove for speedup?
        $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);

        if (utf8::is_utf8($format)) {
                $format = $self->_x($format);
        }

        my $found = 0;
        my $just_single = 1;

        my $eval_code;
        # remove eval{...} from beginning
        $eval_code = $1 if ($format =~ s/^eval{([^}]+)}//s);

        my $filter_name;
        # remove filter{...} from beginning
        $filter_name = $1 if ($format =~ s/^filter{([^}]+)}//s);

        # do actual replacement of placeholders
        # repeatable fields
        if ($format =~ s/v(\d+)(?:\^(\w))?/$self->get_data(\$rec,$1,$2,$i,\$found)/ges) {
                $just_single = 0;
        }

        # non-repeatable fields
        if ($format =~ s/s(\d+)(?:\^(\w))?/$self->get_data(\$rec,$1,$2,0,\$found)/ges) {
                return if ($i > 0 && $just_single);
        }

        if ($found) {
                $log->debug("format: $format");
                if ($eval_code) {
                        my $eval = $self->fill_in($rec,$eval_code,$i);
                        return if (! $self->_eval($eval));
                }
                if ($filter_name && $self->{'filter'}->{$filter_name}) {
                        $log->debug("filter '$filter_name' for $format");
                        $format = $self->{'filter'}->{$filter_name}->($format);
                        return unless(defined($format));
                        $log->debug("filter result: $format");
                }
                # do we have lookups?
                if ($self->{'lookup'}) {
                        if ($self->{'lookup'}->can('lookup')) {
                                my @lookup = $self->{lookup}->lookup($format);
                                $log->debug("lookup $format", join(", ", @lookup));
                                return @lookup;
                        } else {
                                $log->warn("Have lookup object but can't invoke lookup method");
                        }
                } else {
                        return $format;
                }
        } else {
                return;
        }
}


=head2 fill_in_to_arr

Similar to C<fill_in>, but returns array of all repeatable fields. Usable
for fields which have lookups, so they shouldn't be parsed but rather
C<fill_id>ed.

 my @arr = $webpac->fill_in_to_arr($rec,'[v900];;[v250^a]');

=cut

sub fill_in_to_arr {
        my $self = shift;

        my ($rec, $format_utf8) = @_;

        my $log = $self->_get_logger();

        $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
        return if (! $format_utf8);

        my $i = 0;
        my @arr;

        while (my $v = $self->fill_in($rec,$format_utf8,$i++)) {
                push @arr, $v;
        }

        $log->debug("format '$format_utf8' returned ",--$i," elements: ", sub { join(" | ",@arr) }) if (@arr);

        return @arr;
}


=head2 get_data

Returns value from record.

 my $text = $self->get_data(\$rec,$f,$sf,$i,\$found);

Arguments are:
record reference C<$rec>,
field C<$f>,
optional subfiled C<$sf>,
index for repeatable values C<$i>.

Optinal variable C<$found> will be incremeted if there
is field.

Returns value or empty string.

=cut

sub get_data {
        my $self = shift;

        my ($rec,$f,$sf,$i,$found) = @_;

        if ($$rec->{$f}) {
                return '' if (! $$rec->{$f}->[$i]);
                no strict 'refs';
                if ($sf && $$rec->{$f}->[$i]->{$sf}) {
                        $$found++ if (defined($$found));
                        return $$rec->{$f}->[$i]->{$sf};
                } elsif (! $sf && $$rec->{$f}->[$i]) {
                        $$found++ if (defined($$found));
                        # it still might have subfield, just
                        # not specified, so we'll dump all
                        if ($$rec->{$f}->[$i] =~ /HASH/o) {
                                my $out;
                                foreach my $k (keys %{$$rec->{$f}->[$i]}) {
                                        my $v = $$rec->{$f}->[$i]->{$k};
                                        $out .= "$v " if ($v);
                                }
                                return $out;
                        } else {
                                return $$rec->{$f}->[$i];
                        }
                } else {
                        return '';
                }
        } else {
                return '';
        }
}


=head2 apply_format

Apply format specified in tag with C<format_name="name"> and
C<format_delimiter=";;">.

 my $text = $webpac->apply_format($format_name,$format_delimiter,$data);

Formats can contain C<lookup{...}> if you need them.

=cut

sub apply_format {
        my $self = shift;

        my ($name,$delimiter,$data) = @_;

        my $log = $self->_get_logger();

        if (! $self->{'import_xml'}->{'format'}->{$name}) {
                $log->warn("<format name=\"$name\"> is not defined in ",$self->{'import_xml_file'});
                return $data;
        }

        $log->warn("no delimiter for format $name") if (! $delimiter);

        my $format = $self->_x($self->{'import_xml'}->{'format'}->{$name}->{'content'}) || $log->logdie("can't find format '$name'");

        my @data = split(/\Q$delimiter\E/, $data);

        my $out = sprintf($format, @data);
        $log->debug("using format $name [$format] on $data to produce: $out");

        if ($self->{'lookup_regex'} && $out =~ $self->{'lookup_regex'}) {
                return $self->{'lookup'}->lookup($out);
        } else {
                return $out;
        }

}

=head2 sort_arr

Sort array ignoring case and html in data

 my @sorted = $webpac->sort_arr(@unsorted);

=cut

sub sort_arr {
        my $self = shift;

        my $log = $self->_get_logger();

        # FIXME add Schwartzian Transformation?

        my @sorted = sort {
                $a =~ s#<[^>]+/*>##;
                $b =~ s#<[^>]+/*>##;
                lc($b) cmp lc($a)
        } @_;
        $log->debug("sorted values: ",sub { join(", ",@sorted) });

        return @sorted;
}


=head1 INTERNAL METHODS

=head2 _sort_by_order

Sort xml tags data structure accoding to C<order=""> attribute.

=cut

sub _sort_by_order {
        my $self = shift;

        my $va = $self->{'import_xml'}->{'indexer'}->{$a}->{'order'} ||
                $self->{'import_xml'}->{'indexer'}->{$a};
        my $vb = $self->{'import_xml'}->{'indexer'}->{$b}->{'order'} ||
                $self->{'import_xml'}->{'indexer'}->{$b};

        return $va <=> $vb;
}

=head2 _x

Convert strings from C<conf/normalize/*.xml> encoding into application
specific encoding (optinally specified using C<code_page> to C<new>
constructor).

 my $text = $n->_x('normalize text string');

This is a stub so that other modules doesn't have to implement it.

=cut

sub _x {
        my $self = shift;
        return shift;
}


=head1 AUTHOR

Dobrica Pavlinusic, C<< <dpavlin@rot13.org> >>

=head1 COPYRIGHT & LICENSE

Copyright 2005 Dobrica Pavlinusic, All Rights Reserved.

This program is free software; you can redistribute it and/or modify it
under the same terms as Perl itself.

=cut

1; # End of WebPAC::Normalize
1	dpavlin	10	package WebPAC::Normalize;
2
3			use warnings;
4			use strict;
5	dpavlin	29	use base 'WebPAC::Common';
6	dpavlin	13	use Data::Dumper;
7	dpavlin	10
8			=head1 NAME
9
10	dpavlin	15	WebPAC::Normalize - data mungling for normalisation
11	dpavlin	10
12			=head1 VERSION
13
14	dpavlin	317	Version 0.08
15	dpavlin	10
16			=cut
17
18	dpavlin	317	our $VERSION = '0.08';
19	dpavlin	10
20			=head1 SYNOPSIS
21
22	dpavlin	15	This package contains code that mungle data to produce normalized format.
23	dpavlin	10
24	dpavlin	15	It contains several assumptions:
25
26			=over
27
28			=item *
29
30			format of fields is defined using C<v123^a> notation for repeatable fields
31			or C<s123^a> for single (or first) value, where C<123> is field number and
32			C<a> is subfield.
33
34			=item *
35
36			source data records (C<$rec>) have unique identifiers in field C<000>
37
38			=item *
39
40			optional C<eval{length('v123^a') == 3}> tag at B<beginning of format> will be
41			perl code that is evaluated before producing output (value of field will be
42			interpolated before that)
43
44			=item *
45
46			optional C<filter{filter_name}> at B<begining of format> will apply perl
47			code defined as code ref on format after field substitution to producing
48			output
49
50	dpavlin	260	There is one built-in filter called C<regex> which can be use like this:
51
52			filter{regex(s/foo/bar/)}
53
54	dpavlin	15	=item *
55
56			optional C<lookup{...}> will be then performed. See C<WebPAC::Lookups>.
57
58			=item *
59
60			at end, optional C<format>s rules are resolved. Format rules are similar to
61			C<sprintf> and can also contain C<lookup{...}> which is performed after
62			values are inserted in format.
63
64			=back
65
66			This also describes order in which transformations are applied (eval,
67			filter, lookup, format) which is important to undestand when deciding how to
68			solve your data mungling and normalisation process.
69
70
71
72
73	dpavlin	10	=head1 FUNCTIONS
74
75	dpavlin	13	=head2 new
76	dpavlin	10
77	dpavlin	13	Create new normalisation object
78
79			my $n = new WebPAC::Normalize::Something(
80	dpavlin	15	filter => {
81			'filter_name_1' => sub {
82			# filter code
83			return length($_);
84			}, ...
85			},
86	dpavlin	29	db => $db_obj,
87	dpavlin	13	lookup_regex => $lookup->regex,
88	dpavlin	31	lookup => $lookup_obj,
89	dpavlin	219	prefix => 'foobar',
90	dpavlin	13	);
91
92	dpavlin	15	Parametar C<filter> defines user supplied snippets of perl code which can
93			be use with C<filter{...}> notation.
94
95	dpavlin	219	C<prefix> is used to form filename for database record (to support multiple
96			source files which are joined in one database).
97
98	dpavlin	13	Recommended parametar C<lookup_regex> is used to enable parsing of lookups
99	dpavlin	31	in structures. If you pass this parametar, you must also pass C<lookup>
100			which is C<WebPAC::Lookup> object.
101	dpavlin	13
102	dpavlin	10	=cut
103
104	dpavlin	13	sub new {
105			my $class = shift;
106			my $self = {@_};
107			bless($self, $class);
108
109	dpavlin	31	my $r = $self->{'lookup_regex'} ? 1 : 0;
110			my $l = $self->{'lookup'} ? 1 : 0;
111
112			my $log = $self->_get_logger();
113
114			# those two must be in pair
115			if ( ($r & $l) != ($r \|\| $l) ) {
116			my $log = $self->_get_logger();
117			$log->logdie("lookup_regex and lookup must be in pair");
118			}
119
120			$log->logdie("lookup must be WebPAC::Lookup object") if ($self->{'lookup'} && ! $self->{'lookup'}->isa('WebPAC::Lookup'));
121
122	dpavlin	219	$log->warn("no prefix defined. please check that!") unless ($self->{'prefix'});
123
124	dpavlin	252	$log->debug("using lookup regex: ", $self->{lookup_regex}) if ($r && $l);
125
126	dpavlin	295	if (! $self->{filter} \|\| ! $self->{filter}->{regex}) {
127	dpavlin	260	$log->debug("adding built-in filter regex");
128			$self->{filter}->{regex} = sub {
129			my ($val, $regex) = @_;
130			eval "\$val =~ $regex";
131			return $val;
132			};
133			}
134
135	dpavlin	13	$self ? return $self : return undef;
136	dpavlin	10	}
137
138	dpavlin	13
139			=head2 data_structure
140
141			Create in-memory data structure which represents normalized layout from
142			C<conf/normalize/*.xml>.
143
144			This structures are used to produce output.
145
146	dpavlin	70	my $ds = $webpac->data_structure($rec);
147	dpavlin	13
148			=cut
149
150			sub data_structure {
151			my $self = shift;
152
153			my $log = $self->_get_logger();
154
155			my $rec = shift;
156			$log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
157
158	dpavlin	125	$log->debug("data_structure rec = ", sub { Dumper($rec) });
159
160	dpavlin	312	$log->logdie("need unique ID (mfn) in field 000 of record " . Dumper($rec) ) unless (defined($rec->{'000'}));
161	dpavlin	125
162	dpavlin	219	my $id = $rec->{'000'}->[0] \|\| $log->logdie("field 000 isn't array!");
163	dpavlin	125
164	dpavlin	13	my $cache_file;
165
166	dpavlin	18	if ($self->{'db'}) {
167	dpavlin	219	my $ds = $self->{'db'}->load_ds( id => $id, prefix => $self->{prefix} );
168	dpavlin	70	$log->debug("load_ds( rec = ", sub { Dumper($rec) }, ") = ", sub { Dumper($ds) });
169			return $ds if ($ds);
170	dpavlin	29	$log->debug("cache miss, creating");
171	dpavlin	13	}
172
173			my @sorted_tags;
174			if ($self->{tags_by_order}) {
175			@sorted_tags = @{$self->{tags_by_order}};
176			} else {
177			@sorted_tags = sort { $self->_sort_by_order } keys %{$self->{'import_xml'}->{'indexer'}};
178			$self->{tags_by_order} = \@sorted_tags;
179			}
180
181	dpavlin	70	my $ds;
182	dpavlin	13
183			$log->debug("tags: ",sub { join(", ",@sorted_tags) });
184
185			foreach my $field (@sorted_tags) {
186
187			my $row;
188
189			#print "field $field [",$self->{'tag'},"] = ",Dumper($self->{'import_xml'}->{'indexer'}->{$field}->{$self->{'tag'}});
190
191			foreach my $tag (@{$self->{'import_xml'}->{'indexer'}->{$field}->{$self->{'tag'}}}) {
192	dpavlin	38	my $format;
193	dpavlin	13
194	dpavlin	38	$log->logdie("expected tag HASH and got $tag") unless (ref($tag) eq 'HASH');
195			$format = $tag->{'value'} \|\| $tag->{'content'};
196
197	dpavlin	13	my @v;
198			if ($self->{'lookup_regex'} && $format =~ $self->{'lookup_regex'}) {
199			@v = $self->fill_in_to_arr($rec,$format);
200			} else {
201			@v = $self->parse_to_arr($rec,$format);
202			}
203	dpavlin	364	if (! @v) {
204			$log->debug("$field <",$self->{tag},"> format: $format no values");
205			# next;
206			} else {
207			$log->debug("$field <",$self->{tag},"> format: $format values: ", join(",", @v));
208			}
209	dpavlin	13
210			if ($tag->{'sort'}) {
211			@v = $self->sort_arr(@v);
212			}
213
214			# use format?
215			if ($tag->{'format_name'}) {
216			@v = map { $self->apply_format($tag->{'format_name'},$tag->{'format_delimiter'},$_) } @v;
217			}
218
219			# delimiter will join repeatable fields
220			if ($tag->{'delimiter'}) {
221			@v = ( join($tag->{'delimiter'}, @v) );
222			}
223
224			# default types
225	dpavlin	74	my @types = qw(display search);
226	dpavlin	13	# override by type attribute
227			@types = ( $tag->{'type'} ) if ($tag->{'type'});
228
229			foreach my $type (@types) {
230			# append to previous line?
231	dpavlin	364	$log->debug("tag $field / $type [",sub { join(",",@v) }, "] ", $row->{'append'} \|\| 'no append');
232	dpavlin	13	if ($tag->{'append'}) {
233
234			# I will delimit appended part with
235			# delimiter (or ,)
236			my $d = $tag->{'delimiter'};
237			# default delimiter
238			$d \|\|= " ";
239
240			my $last = pop @{$row->{$type}};
241			$d = "" if (! $last);
242			$last .= $d . join($d, @v);
243			push @{$row->{$type}}, $last;
244
245			} else {
246			push @{$row->{$type}}, @v;
247			}
248			}
249
250
251			}
252
253			if ($row) {
254			$row->{'tag'} = $field;
255
256			# TODO: name_sigular, name_plural
257			my $name = $self->{'import_xml'}->{'indexer'}->{$field}->{'name'};
258	dpavlin	70	my $row_name = $name ? $self->_x($name) : $field;
259	dpavlin	13
260			# post-sort all values in field
261			if ($self->{'import_xml'}->{'indexer'}->{$field}->{'sort'}) {
262			$log->warn("sort at field tag not implemented");
263			}
264
265	dpavlin	70	$ds->{$row_name} = $row;
266	dpavlin	13
267			$log->debug("row $field: ",sub { Dumper($row) });
268			}
269
270			}
271
272	dpavlin	22	$self->{'db'}->save_ds(
273	dpavlin	219	id => $id,
274	dpavlin	70	ds => $ds,
275	dpavlin	219	prefix => $self->{prefix},
276	dpavlin	18	) if ($self->{'db'});
277	dpavlin	13
278	dpavlin	70	$log->debug("ds: ", sub { Dumper($ds) });
279	dpavlin	29
280	dpavlin	70	$log->logconfess("data structure returned is not array any more!") if wantarray;
281	dpavlin	13
282	dpavlin	70	return $ds;
283
284	dpavlin	13	}
285
286			=head2 parse
287
288			Perform smart parsing of string, skipping delimiters for fields which aren't
289			defined. It can also eval code in format starting with C<eval{...}> and
290			return output or nothing depending on eval code.
291
292			my $text = $webpac->parse($rec,'eval{"v901^a" eq "Deskriptor"}descriptor: v250^a', $i);
293
294	dpavlin	260	Filters are implemented here. While simple form of filters looks like this:
295
296			filter{name_of_filter}
297
298			but, filters can also have variable number of parametars like this:
299
300			filter{name_of_filter(param,param,param)}
301
302	dpavlin	13	=cut
303
304	dpavlin	261	my $warn_once;
305
306	dpavlin	13	sub parse {
307			my $self = shift;
308
309			my ($rec, $format_utf8, $i) = @_;
310
311			return if (! $format_utf8);
312
313			my $log = $self->_get_logger();
314
315			$log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
316
317			$i = 0 if (! $i);
318
319			my $format = $self->_x($format_utf8) \|\| $log->logconfess("can't convert '$format_utf8' from UTF-8 to ",$self->{'code_page'});
320
321			my @out;
322
323	dpavlin	340	$log->debug("format: $format [$i]");
324	dpavlin	13
325			my $eval_code;
326			# remove eval{...} from beginning
327			$eval_code = $1 if ($format =~ s/^eval{([^}]+)}//s);
328
329			my $filter_name;
330			# remove filter{...} from beginning
331			$filter_name = $1 if ($format =~ s/^filter{([^}]+)}//s);
332
333	dpavlin	317	# did we found any (att all) field from format in row?
334	dpavlin	340	my $found_any;
335	dpavlin	317	# prefix before first field which we preserve it $found_any
336	dpavlin	13	my $prefix;
337
338	dpavlin	317	my $f_step = 1;
339
340	dpavlin	13	while ($format =~ s/^(.*?)(v\|s)(\d+)(?:\^(\w))?//s) {
341
342			my $del = $1 \|\| '';
343	dpavlin	317	$prefix = $del if ($f_step == 1);
344	dpavlin	13
345	dpavlin	340	my $fld_type = lc($2);
346
347	dpavlin	13	# repeatable index
348			my $r = $i;
349	dpavlin	340	if ($fld_type eq 's') {
350			if ($found_any->{'v'}) {
351			$r = 0;
352			} else {
353			return;
354			}
355			}
356	dpavlin	13
357			my $found = 0;
358			my $tmp = $self->get_data(\$rec,$3,$4,$r,\$found);
359
360			if ($found) {
361	dpavlin	340	$found_any->{$fld_type} += $found;
362	dpavlin	317
363			# we will skip delimiter before first occurence of field!
364	dpavlin	344	push @out, $del unless($found_any->{$fld_type} == 1);
365	dpavlin	13	push @out, $tmp;
366			}
367	dpavlin	317	$f_step++;
368	dpavlin	13	}
369
370	dpavlin	340	# test if any fields found?
371			return if (! $found_any->{'v'} && ! $found_any->{'s'});
372	dpavlin	13
373			my $out = join('',@out);
374
375			if ($out) {
376			# add rest of format (suffix)
377			$out .= $format;
378
379			# add prefix if not there
380			$out = $prefix . $out if ($out !~ m/^\Q$prefix\E/);
381
382			$log->debug("result: $out");
383			}
384
385			if ($eval_code) {
386			my $eval = $self->fill_in($rec,$eval_code,$i) \|\| return;
387			$log->debug("about to eval{$eval} format: $out");
388			return if (! $self->_eval($eval));
389			}
390
391	dpavlin	260	if ($filter_name) {
392			my @filter_args;
393			if ($filter_name =~ s/(\w+)\((.*)\)/$1/) {
394			@filter_args = split(/,/, $2);
395			}
396			if ($self->{'filter'}->{$filter_name}) {
397			$log->debug("about to filter{$filter_name} format: $out with arguments: ", join(",", @filter_args));
398			unshift @filter_args, $out;
399			$out = $self->{'filter'}->{$filter_name}->(@filter_args);
400			return unless(defined($out));
401			$log->debug("filter result: $out");
402	dpavlin	261	} elsif (! $warn_once->{$filter_name}) {
403	dpavlin	260	$log->warn("trying to use undefined filter $filter_name");
404	dpavlin	261	$warn_once->{$filter_name}++;
405	dpavlin	260	}
406	dpavlin	13	}
407
408			return $out;
409			}
410
411			=head2 parse_to_arr
412
413			Similar to C<parse>, but returns array of all repeatable fields
414
415			my @arr = $webpac->parse_to_arr($rec,'v250^a');
416
417			=cut
418
419			sub parse_to_arr {
420			my $self = shift;
421
422			my ($rec, $format_utf8) = @_;
423
424			my $log = $self->_get_logger();
425
426			$log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
427			return if (! $format_utf8);
428
429			my $i = 0;
430			my @arr;
431
432			while (my $v = $self->parse($rec,$format_utf8,$i++)) {
433			push @arr, $v;
434			}
435
436			$log->debug("format '$format_utf8' returned ",--$i," elements: ", sub { join(" \| ",@arr) }) if (@arr);
437
438			return @arr;
439			}
440
441	dpavlin	15
442			=head2 fill_in
443
444			Workhourse of all: takes record from in-memory structure of database and
445			strings with placeholders and returns string or array of with substituted
446			values from record.
447
448			my $text = $webpac->fill_in($rec,'v250^a');
449
450			Optional argument is ordinal number for repeatable fields. By default,
451			it's assume to be first repeatable field (fields are perl array, so first
452			element is 0).
453			Following example will read second value from repeatable field.
454
455			my $text = $webpac->fill_in($rec,'Title: v250^a',1);
456
457			This function B<does not> perform parsing of format to inteligenty skip
458			delimiters before fields which aren't used.
459
460			This method will automatically decode UTF-8 string to local code page
461			if needed.
462
463			=cut
464
465			sub fill_in {
466			my $self = shift;
467
468			my $log = $self->_get_logger();
469
470			my $rec = shift \|\| $log->logconfess("need data record");
471			my $format = shift \|\| $log->logconfess("need format to parse");
472			# iteration (for repeatable fields)
473			my $i = shift \|\| 0;
474
475			$log->logdie("infitite loop in format $format") if ($i > ($self->{'max_mfn'} \|\| 9999));
476
477			# FIXME remove for speedup?
478			$log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
479
480			if (utf8::is_utf8($format)) {
481			$format = $self->_x($format);
482			}
483
484			my $found = 0;
485	dpavlin	333	my $just_single = 1;
486	dpavlin	15
487			my $eval_code;
488			# remove eval{...} from beginning
489			$eval_code = $1 if ($format =~ s/^eval{([^}]+)}//s);
490
491			my $filter_name;
492			# remove filter{...} from beginning
493			$filter_name = $1 if ($format =~ s/^filter{([^}]+)}//s);
494
495			# do actual replacement of placeholders
496			# repeatable fields
497	dpavlin	333	if ($format =~ s/v(\d+)(?:\^(\w))?/$self->get_data(\$rec,$1,$2,$i,\$found)/ges) {
498			$just_single = 0;
499			}
500	dpavlin	364
501	dpavlin	15	# non-repeatable fields
502	dpavlin	333	if ($format =~ s/s(\d+)(?:\^(\w))?/$self->get_data(\$rec,$1,$2,0,\$found)/ges) {
503			return if ($i > 0 && $just_single);
504			}
505	dpavlin	15
506			if ($found) {
507			$log->debug("format: $format");
508			if ($eval_code) {
509			my $eval = $self->fill_in($rec,$eval_code,$i);
510			return if (! $self->_eval($eval));
511			}
512			if ($filter_name && $self->{'filter'}->{$filter_name}) {
513			$log->debug("filter '$filter_name' for $format");
514			$format = $self->{'filter'}->{$filter_name}->($format);
515			return unless(defined($format));
516			$log->debug("filter result: $format");
517			}
518			# do we have lookups?
519			if ($self->{'lookup'}) {
520	dpavlin	31	if ($self->{'lookup'}->can('lookup')) {
521	dpavlin	252	my @lookup = $self->{lookup}->lookup($format);
522	dpavlin	253	$log->debug("lookup $format", join(", ", @lookup));
523	dpavlin	252	return @lookup;
524	dpavlin	31	} else {
525			$log->warn("Have lookup object but can't invoke lookup method");
526			}
527	dpavlin	15	} else {
528			return $format;
529			}
530			} else {
531			return;
532			}
533			}
534
535
536	dpavlin	13	=head2 fill_in_to_arr
537
538			Similar to C<fill_in>, but returns array of all repeatable fields. Usable
539			for fields which have lookups, so they shouldn't be parsed but rather
540			C<fill_id>ed.
541
542			my @arr = $webpac->fill_in_to_arr($rec,'[v900];;[v250^a]');
543
544			=cut
545
546			sub fill_in_to_arr {
547			my $self = shift;
548
549			my ($rec, $format_utf8) = @_;
550
551			my $log = $self->_get_logger();
552
553			$log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
554			return if (! $format_utf8);
555
556			my $i = 0;
557			my @arr;
558
559	dpavlin	364	while (my $v = $self->fill_in($rec,$format_utf8,$i++)) {
560			push @arr, $v;
561	dpavlin	13	}
562
563			$log->debug("format '$format_utf8' returned ",--$i," elements: ", sub { join(" \| ",@arr) }) if (@arr);
564
565			return @arr;
566			}
567
568	dpavlin	15
569			=head2 get_data
570
571			Returns value from record.
572
573			my $text = $self->get_data(\$rec,$f,$sf,$i,\$found);
574
575			Arguments are:
576			record reference C<$rec>,
577			field C<$f>,
578			optional subfiled C<$sf>,
579			index for repeatable values C<$i>.
580
581			Optinal variable C<$found> will be incremeted if there
582			is field.
583
584			Returns value or empty string.
585
586			=cut
587
588			sub get_data {
589			my $self = shift;
590
591			my ($rec,$f,$sf,$i,$found) = @_;
592
593			if ($$rec->{$f}) {
594			return '' if (! $$rec->{$f}->[$i]);
595			no strict 'refs';
596			if ($sf && $$rec->{$f}->[$i]->{$sf}) {
597			$$found++ if (defined($$found));
598			return $$rec->{$f}->[$i]->{$sf};
599	dpavlin	64	} elsif (! $sf && $$rec->{$f}->[$i]) {
600	dpavlin	15	$$found++ if (defined($$found));
601			# it still might have subfield, just
602			# not specified, so we'll dump all
603			if ($$rec->{$f}->[$i] =~ /HASH/o) {
604			my $out;
605			foreach my $k (keys %{$$rec->{$f}->[$i]}) {
606	dpavlin	346	my $v = $$rec->{$f}->[$i]->{$k};
607			$out .= "$v " if ($v);
608	dpavlin	15	}
609			return $out;
610			} else {
611			return $$rec->{$f}->[$i];
612			}
613	dpavlin	64	} else {
614			return '';
615	dpavlin	15	}
616			} else {
617			return '';
618			}
619			}
620
621
622			=head2 apply_format
623
624			Apply format specified in tag with C<format_name="name"> and
625			C<format_delimiter=";;">.
626
627			my $text = $webpac->apply_format($format_name,$format_delimiter,$data);
628
629			Formats can contain C<lookup{...}> if you need them.
630
631			=cut
632
633			sub apply_format {
634			my $self = shift;
635
636			my ($name,$delimiter,$data) = @_;
637
638			my $log = $self->_get_logger();
639
640			if (! $self->{'import_xml'}->{'format'}->{$name}) {
641			$log->warn("<format name=\"$name\"> is not defined in ",$self->{'import_xml_file'});
642			return $data;
643			}
644
645			$log->warn("no delimiter for format $name") if (! $delimiter);
646
647			my $format = $self->_x($self->{'import_xml'}->{'format'}->{$name}->{'content'}) \|\| $log->logdie("can't find format '$name'");
648
649			my @data = split(/\Q$delimiter\E/, $data);
650
651			my $out = sprintf($format, @data);
652			$log->debug("using format $name [$format] on $data to produce: $out");
653
654			if ($self->{'lookup_regex'} && $out =~ $self->{'lookup_regex'}) {
655	dpavlin	31	return $self->{'lookup'}->lookup($out);
656	dpavlin	15	} else {
657			return $out;
658			}
659
660			}
661
662	dpavlin	13	=head2 sort_arr
663
664			Sort array ignoring case and html in data
665
666			my @sorted = $webpac->sort_arr(@unsorted);
667
668			=cut
669
670			sub sort_arr {
671			my $self = shift;
672
673			my $log = $self->_get_logger();
674
675			# FIXME add Schwartzian Transformation?
676
677			my @sorted = sort {
678			$a =~ s#<[^>]+/*>##;
679			$b =~ s#<[^>]+/*>##;
680			lc($b) cmp lc($a)
681			} @_;
682			$log->debug("sorted values: ",sub { join(", ",@sorted) });
683
684			return @sorted;
685			}
686
687
688	dpavlin	15	=head1 INTERNAL METHODS
689
690	dpavlin	13	=head2 _sort_by_order
691
692			Sort xml tags data structure accoding to C<order=""> attribute.
693
694			=cut
695
696			sub _sort_by_order {
697			my $self = shift;
698
699			my $va = $self->{'import_xml'}->{'indexer'}->{$a}->{'order'} \|\|
700			$self->{'import_xml'}->{'indexer'}->{$a};
701			my $vb = $self->{'import_xml'}->{'indexer'}->{$b}->{'order'} \|\|
702			$self->{'import_xml'}->{'indexer'}->{$b};
703
704			return $va <=> $vb;
705			}
706
707			=head2 _x
708
709	dpavlin	15	Convert strings from C<conf/normalize/*.xml> encoding into application
710			specific encoding (optinally specified using C<code_page> to C<new>
711			constructor).
712	dpavlin	13
713			my $text = $n->_x('normalize text string');
714
715			This is a stub so that other modules doesn't have to implement it.
716
717			=cut
718
719			sub _x {
720			my $self = shift;
721			return shift;
722			}
723
724
725	dpavlin	10	=head1 AUTHOR
726
727			Dobrica Pavlinusic, C<< <dpavlin@rot13.org> >>
728
729			=head1 COPYRIGHT & LICENSE
730
731			Copyright 2005 Dobrica Pavlinusic, All Rights Reserved.
732
733			This program is free software; you can redistribute it and/or modify it
734			under the same terms as Perl itself.
735
736			=cut
737
738	dpavlin	209	1; # End of WebPAC::Normalize