lib/WebPAC/Normalize.pm

package WebPAC::Normalize;

use warnings;
use strict;
use base 'WebPAC::Common';
use Data::Dumper;

=head1 NAME

WebPAC::Normalize - data mungling for normalisation

=head1 VERSION

Version 0.07

=cut

our $VERSION = '0.07';

=head1 SYNOPSIS

This package contains code that munges data to produce a normalized format.

It contains several assumptions:

=over

=item *

format of fields is defined using C<v123^a> notation for repeatable fields
or C<s123^a> for single (or first) value, where C<123> is field number and
C<a> is subfield.

=item *

source data records (C<$rec>) have unique identifiers in field C<000>

=item *

optional C<eval{length('v123^a') == 3}> tag at B<beginning of format> will be
perl code that is evaluated before producing output (value of field will be
interpolated before that)

=item *

optional C<filter{filter_name}> at B<beginning of format> will apply perl
code defined as code ref to format after field substitution to produce
output

There is one built-in filter called C<regex> which can be used like this:

  filter{regex(s/foo/bar/)}

=item *

optional C<lookup{...}> will be then performed. See C<WebPAC::Lookups>.

=item *

at end, optional C<format>s rules are resolved. Format rules are similar to
C<sprintf> and can also contain C<lookup{...}> which is performed after
values are inserted in format.

=back

This also describes the order in which transformations are applied (eval,
filter, lookup, format) which is important to understand when deciding how to
solve your data munging and normalisation process.


=head1 FUNCTIONS

=head2 new

Create new normalisation object

  my $n = new WebPAC::Normalize::Something(
        filter => {
                'filter_name_1' => sub {
                        # filter code
                        return length($_);
                }, ...
        },
        db => $db_obj,
        lookup_regex => $lookup->regex,
        lookup => $lookup_obj,
        prefix => 'foobar',
  );

Parameter C<filter> defines user supplied snippets of perl code which can
be used with C<filter{...}> notation.

C<prefix> is used to form filename for database record (to support multiple
source files which are joined in one database).

Recommended parameter C<lookup_regex> is used to enable parsing of lookups
in structures. If you pass this parameter, you must also pass C<lookup>
which is a C<WebPAC::Lookup> object.

=cut

sub new {
        # Constructor: blesses the named arguments into a hash-based object,
        # validates lookup configuration and installs the built-in C<regex>
        # filter unless caller supplied its own.
        #
        # Arguments (all optional): filter, db, lookup_regex, lookup, prefix.
        # Dies (via logger) if lookup_regex and lookup are not passed as a
        # pair or if lookup is not a WebPAC::Lookup object.
        # Returns the new object.
        my $class = shift;
        my $self = {@_};
        bless($self, $class);

        my $log = $self->_get_logger();

        # normalise both options to 0/1 so they can be compared directly
        my $r = $self->{'lookup_regex'} ? 1 : 0;
        my $l = $self->{'lookup'} ? 1 : 0;

        # those two must be in pair
        $log->logdie("lookup_regex and lookup must be in pair") if ($r != $l);

        # isa() is wrapped in eval so that an unblessed value (e.g. plain
        # hash ref) ends up in our error message instead of perl's
        # "Can't call method isa on unblessed reference"
        $log->logdie("lookup must be WebPAC::Lookup object")
                if ($l && ! eval { $self->{'lookup'}->isa('WebPAC::Lookup') });

        $log->warn("no prefix defined. please check that!") unless ($self->{'prefix'});

        $log->debug("using lookup regex: ", $self->{lookup_regex}) if ($r && $l);

        if (! $self->{filter} || ! $self->{filter}->{regex}) {
                $log->debug("adding built-in filter regex");
                # NOTE(security): expression comes from normalisation
                # configuration and is executed with string eval, so
                # configuration files must be trusted.
                $self->{filter}->{regex} = sub {
                        my ($val, $regex) = @_;

                        if (! defined($regex)) {
                                $log->warn("filter regex called without expression");
                                return $val;
                        }

                        local $@;
                        # newline before "; 1" keeps it alive even if
                        # expression ends with comment; eval returns undef
                        # only when expression failed to compile or died
                        if (! defined(eval "\$val =~ $regex\n; 1")) {
                                $log->warn("filter regex($regex) failed: $@");
                        }
                        return $val;
                };
        }

        return $self;
}


=head2 data_structure

Create in-memory data structure which represents normalized layout from
C<conf/normalize/*.xml>.

These structures are used to produce output.

 my $ds = $webpac->data_structure($rec);

=cut

sub data_structure {
        # Build (or load from cache) normalized data structure for one record.
        #
        # Argument: $rec - hash ref with record, must have field '000' which
        # is an array whose first element is the unique record id.
        # Returns hash ref keyed by field name (or tag name if field has no
        # name); each value holds arrays of values per type (by default
        # 'display' and 'search') and key 'tag'.
        # Must be called in scalar context.
        my $self = shift;

        my $log = $self->_get_logger();

        my $rec = shift;
        $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);

        $log->debug("data_structure rec = ", sub { Dumper($rec) });

        $log->logdie("need unique ID (mfn) in field 000 of record " . Dumper($rec) ) unless (defined($rec->{'000'}));

        my $id = $rec->{'000'}->[0] || $log->logdie("field 000 isn't array!");

        # already normalized and stored? then we are done
        if ($self->{'db'}) {
                my $ds = $self->{'db'}->load_ds( id => $id, prefix => $self->{prefix} );
                $log->debug("load_ds( rec = ", sub { Dumper($rec) }, ") = ", sub { Dumper($ds) });
                return $ds if ($ds);
                $log->debug("cache miss, creating");
        }

        # order of tags is computed once and cached in object
        my @sorted_tags;
        if ($self->{tags_by_order}) {
                @sorted_tags = @{$self->{tags_by_order}};
        } else {
                @sorted_tags = sort { $self->_sort_by_order } keys %{$self->{'import_xml'}->{'indexer'}};
                $self->{tags_by_order} = \@sorted_tags;
        }

        my $ds;

        $log->debug("tags: ",sub { join(", ",@sorted_tags) });

        foreach my $field (@sorted_tags) {

                # stays undef (and field is omitted from output) unless at
                # least one tag produces a value
                my $row;

                foreach my $tag (@{$self->{'import_xml'}->{'indexer'}->{$field}->{$self->{'tag'}}}) {

                        $log->logdie("expected tag HASH and got $tag") unless (ref($tag) eq 'HASH');
                        my $format = $tag->{'value'} || $tag->{'content'};

                        # tag without format can't produce values; skip it
                        # here instead of generating uninitialized warnings
                        if (! $format) {
                                $log->debug("field $field: tag without format, skipping");
                                next;
                        }

                        $log->debug("format: $format");

                        # formats with lookups are filled in, others are parsed
                        my @v;
                        if ($self->{'lookup_regex'} && $format =~ $self->{'lookup_regex'}) {
                                @v = $self->fill_in_to_arr($rec,$format);
                        } else {
                                @v = $self->parse_to_arr($rec,$format);
                        }
                        next if (! @v);

                        if ($tag->{'sort'}) {
                                @v = $self->sort_arr(@v);
                        }

                        # use format?
                        if ($tag->{'format_name'}) {
                                @v = map { $self->apply_format($tag->{'format_name'},$tag->{'format_delimiter'},$_) } @v;
                        }

                        # delimiter will join repeatable fields
                        if ($tag->{'delimiter'}) {
                                @v = ( join($tag->{'delimiter'}, @v) );
                        }

                        # default types
                        my @types = qw(display search);
                        # override by type attribute
                        @types = ( $tag->{'type'} ) if ($tag->{'type'});

                        foreach my $type (@types) {
                                # append to previous line?
                                $log->debug("type: $type ",sub { join(" ",@v) }, " ", $tag->{'append'} || 'no append');
                                if ($tag->{'append'}) {

                                        # I will delimit appended part with
                                        # delimiter (or space)
                                        my $d = $tag->{'delimiter'};
                                        # default delimiter
                                        $d ||= " ";

                                        # nothing to append to: no delimiter
                                        my $last = pop @{$row->{$type}};
                                        $d = "" if (! $last);
                                        $last .= $d . join($d, @v);
                                        push @{$row->{$type}}, $last;

                                } else {
                                        push @{$row->{$type}}, @v;
                                }
                        }

                }

                if ($row) {
                        $row->{'tag'} = $field;

                        # TODO: name_sigular, name_plural
                        my $name = $self->{'import_xml'}->{'indexer'}->{$field}->{'name'};
                        my $row_name = $name ? $self->_x($name) : $field;

                        # post-sort all values in field
                        if ($self->{'import_xml'}->{'indexer'}->{$field}->{'sort'}) {
                                $log->warn("sort at field tag not implemented");
                        }

                        $ds->{$row_name} = $row;

                        $log->debug("row $field: ",sub { Dumper($row) });
                }

        }

        $self->{'db'}->save_ds(
                id => $id,
                ds => $ds,
                prefix => $self->{prefix},
        ) if ($self->{'db'});

        $log->debug("ds: ", sub { Dumper($ds) });

        $log->logconfess("data structure returned is not array any more!") if wantarray;

        return $ds;

}

=head2 parse

Perform smart parsing of string, skipping delimiters for fields which aren't
defined. It can also eval code in format starting with C<eval{...}> and
return output or nothing depending on eval code.

 my $text = $webpac->parse($rec,'eval{"v901^a" eq "Deskriptor"}descriptor: v250^a', $i);

Filters are implemented here. The simple form of a filter looks like this:

  filter{name_of_filter}

but filters can also have a variable number of parameters, like this:

  filter{name_of_filter(param,param,param)}

=cut

# names of undefined filters which we already complained about
my $warn_once;

sub parse {
        # Expand C<vNNN^x> / C<sNNN^x> placeholders in format, dropping
        # delimiters in front of fields which have no data.
        #
        # Arguments: $rec (record hash ref), $format_utf8 (format string),
        # $i (optional index of repeatable value, default 0).
        # Returns resulting string, or nothing if format is empty, no field
        # was found, eval{...} condition failed or filter returned undef.
        my $self = shift;

        my ($rec, $format_utf8, $i) = @_;

        return if (! $format_utf8);

        my $log = $self->_get_logger();

        $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);

        $i = 0 if (! $i);

        my $format = $self->_x($format_utf8) || $log->logconfess("can't convert '$format_utf8' from UTF-8 to ",$self->{'code_page'});

        my @out;

        $log->debug("format: $format");

        # braces are escaped because unescaped literal "{" in regex
        # produces warning on recent perl versions

        my $eval_code;
        # remove eval{...} from beginning
        $eval_code = $1 if ($format =~ s/^eval\{([^}]+)\}//s);

        my $filter_name;
        # remove filter{...} from beginning
        $filter_name = $1 if ($format =~ s/^filter\{([^}]+)\}//s);

        my $prefix;
        my $all_found=0;
        # number of values which came from repeatable (v) fields
        my $repeatable_found=0;

        while ($format =~ s/^(.*?)(v|s)(\d+)(?:\^(\w))?//s) {

                # copy captures right away, calls below may run other regexes
                my ($del, $kind, $f, $sf) = ($1, $2, $3, $4);
                $prefix ||= $del if ($all_found == 0);

                # repeatable index (single fields always use first value)
                my $r = ($kind eq 's') ? 0 : $i;

                my $found = 0;
                my $tmp = $self->get_data(\$rec,$f,$sf,$r,\$found);

                if ($found) {
                        push @out, $del;
                        push @out, $tmp;
                        $all_found += $found;
                        $repeatable_found += $found if ($kind ne 's');
                }
        }

        return if (! $all_found);

        # single (s) fields are found for every $i, so result built only
        # from them would repeat forever in parse_to_arr; for $i > 0 at
        # least one repeatable field has to contribute
        return if ($i > 0 && ! $repeatable_found);

        my $out = join('',@out);

        if ($out) {
                # add rest of format (suffix)
                $out .= $format;

                # add prefix if not there
                $out = $prefix . $out if ($out !~ m/^\Q$prefix\E/);

                $log->debug("result: $out");
        }

        if ($eval_code) {
                my $eval = $self->fill_in($rec,$eval_code,$i) || return;
                $log->debug("about to eval{$eval} format: $out");
                return if (! $self->_eval($eval));
        }

        if ($filter_name) {
                # split name_of_filter(arg,arg) into name and arguments
                my @filter_args;
                if ($filter_name =~ s/(\w+)\((.*)\)/$1/) {
                        @filter_args = split(/,/, $2);
                }
                if ($self->{'filter'}->{$filter_name}) {
                        $log->debug("about to filter{$filter_name} format: $out with arguments: ", join(",", @filter_args));
                        # value to filter is always first argument
                        unshift @filter_args, $out;
                        $out = $self->{'filter'}->{$filter_name}->(@filter_args);
                        return unless(defined($out));
                        $log->debug("filter result: $out");
                } elsif (! $warn_once->{$filter_name}) {
                        $log->warn("trying to use undefined filter $filter_name");
                        $warn_once->{$filter_name}++;
                }
        }

        return $out;
}

=head2 parse_to_arr

Similar to C<parse>, but returns array of all repeatable fields

 my @arr = $webpac->parse_to_arr($rec,'v250^a');

=cut

sub parse_to_arr {
        # Call parse() with increasing index of repeatable value until it
        # returns false value and return everything collected so far.
        my ($self, $rec, $format_utf8) = @_;

        my $log = $self->_get_logger();

        $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
        return unless ($format_utf8);

        my @values;
        my $idx = 0;

        for (;;) {
                my $value = $self->parse($rec, $format_utf8, $idx++);
                last unless ($value);
                push @values, $value;
        }

        if (@values) {
                # $idx was incremented for the failed call too
                $log->debug("format '$format_utf8' returned ", $idx - 1, " elements: ", sub { join(" | ",@values) });
        }

        return @values;
}


=head2 fill_in

Workhorse of all: takes record from in-memory structure of database and
string with placeholders and returns string or array with substituted
values from record.

 my $text = $webpac->fill_in($rec,'v250^a');

Optional argument is ordinal number for repeatable fields. By default,
it's assumed to be first repeatable field (fields are perl array, so first
element is 0).
Following example will read second value from repeatable field.

 my $text = $webpac->fill_in($rec,'Title: v250^a',1);

This function B<does not> perform parsing of format to intelligently skip
delimiters before fields which aren't used.

This method will automatically decode UTF-8 string to local code page
if needed.

=cut

sub fill_in {
        # Replace all C<vNNN^x> / C<sNNN^x> placeholders in format with
        # values from record (delimiters are always kept).
        #
        # Arguments: $rec (record hash ref), $format, $i (optional index of
        # repeatable value, default 0).
        # Returns nothing if no field was found, eval{...} condition failed
        # or filter returned undef. Without lookup object returns filled in
        # string; with lookup object returns list of lookup results (first
        # result in scalar context).
        my $self = shift;

        my $log = $self->_get_logger();

        my $rec = shift || $log->logconfess("need data record");
        my $format = shift || $log->logconfess("need format to parse");
        # iteration (for repeatable fields)
        my $i = shift || 0;

        # guard for callers which iterate until we return nothing
        $log->logdie("infitite loop in format $format") if ($i > ($self->{'max_mfn'} || 9999));

        # FIXME remove for speedup?
        $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);

        if (utf8::is_utf8($format)) {
                $format = $self->_x($format);
        }

        my $found = 0;

        # braces are escaped because unescaped literal "{" in regex
        # produces warning on recent perl versions

        my $eval_code;
        # remove eval{...} from beginning
        $eval_code = $1 if ($format =~ s/^eval\{([^}]+)\}//s);

        my $filter_name;
        # remove filter{...} from beginning
        $filter_name = $1 if ($format =~ s/^filter\{([^}]+)\}//s);

        # do actual replacement of placeholders: repeatable (v) fields use
        # index $i, single (s) fields always use first value. This is done
        # in one pass so that data which was just inserted is never scanned
        # for placeholders again (value like "vols2" contains "s2").
        $format =~ s/(v|s)(\d+)(?:\^(\w))?/$self->get_data(\$rec,$2,$3,($1 eq 's' ? 0 : $i),\$found)/ges;

        return if (! $found);

        $log->debug("format: $format");

        if ($eval_code) {
                my $eval = $self->fill_in($rec,$eval_code,$i);
                return if (! $self->_eval($eval));
        }

        # NOTE(review): unlike parse(), filter arguments in form
        # filter{name(arg,arg)} are not supported here - confirm if needed
        if ($filter_name && $self->{'filter'}->{$filter_name}) {
                $log->debug("filter '$filter_name' for $format");
                $format = $self->{'filter'}->{$filter_name}->($format);
                return unless(defined($format));
                $log->debug("filter result: $format");
        }

        # no lookups? filled in format is the result
        return $format if (! $self->{'lookup'});

        if (! $self->{'lookup'}->can('lookup')) {
                $log->warn("Have lookup object but can't invoke lookup method");
                return;
        }

        my @lookup = $self->{lookup}->lookup($format);
        $log->debug("lookup $format", join(", ", @lookup));

        # scalar context callers (eval code, documented
        # my $text = fill_in(...)) want value and not number of results
        return wantarray ? @lookup : $lookup[0];
}


=head2 fill_in_to_arr

Similar to C<fill_in>, but returns array of all repeatable fields. Usable
for fields which have lookups, so they shouldn't be parsed but rather
C<fill_in>ed.

 my @arr = $webpac->fill_in_to_arr($rec,'[v900];;[v250^a]');

=cut

sub fill_in_to_arr {
        # Call fill_in() with increasing index of repeatable value until it
        # returns empty list and return everything collected so far.
        my ($self, $rec, $format_utf8) = @_;

        my $log = $self->_get_logger();

        $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
        return unless ($format_utf8);

        my @collected;
        my $pos = 0;

        for (;;) {
                my @values = $self->fill_in($rec, $format_utf8, $pos++);
                last if (! @values);
                push @collected, @values;
        }

        if (@collected) {
                # $pos was incremented for the failed call too
                $log->debug("format '$format_utf8' returned ", $pos - 1, " elements: ", sub { join(" | ",@collected) });
        }

        return @collected;
}


=head2 get_data

Returns value from record.

 my $text = $self->get_data(\$rec,$f,$sf,$i,\$found);

Arguments are:
record reference C<$rec>,
field C<$f>,
optional subfield C<$sf>,
index for repeatable values C<$i>.

Optional variable C<$found> will be incremented if there
is field.

Returns value or empty string.

=cut

sub get_data {
        # Fetch one value from record.
        #
        # Arguments: $rec (reference to record hash ref), $f (field),
        # $sf (subfield, optional), $i (index of repeatable value),
        # $found (optional reference to counter, incremented when value
        # exists).
        # Returns value of subfield, whole value if subfield isn't
        # specified (all subfields joined if value has them) or empty
        # string if there is nothing.
        my $self = shift;

        my ($rec,$f,$sf,$i,$found) = @_;

        my $field = $$rec->{$f};
        return '' if (! $field);

        my $val = $field->[$i];
        return '' if (! $val);

        # real type check: plain string which happens to contain "HASH"
        # must not be treated as value with subfields
        my $has_subfields = (ref($val) eq 'HASH');

        # $found is optional, and counter has to be defined to be used
        my $count_it = (ref($found) && defined($$found));

        if ($sf) {
                # subfield requested: value without subfields can't have it
                return '' if (! $has_subfields || ! $val->{$sf});
                $$found++ if ($count_it);
                return $val->{$sf};
        }

        $$found++ if ($count_it);

        return $val if (! $has_subfields);

        # it has subfields, just not specified, so we'll dump all of them
        # (sorted by subfield so that output is always the same); every
        # value is followed by space
        return join('', map { $val->{$_} . " " } sort keys %{$val});
}


=head2 apply_format

Apply format specified in tag with C<format_name="name"> and
C<format_delimiter=";;">.

 my $text = $webpac->apply_format($format_name,$format_delimiter,$data);

Formats can contain C<lookup{...}> if you need them.

=cut

sub apply_format {
        # Apply named sprintf-like format to data.
        #
        # Arguments: $name (name of format from import_xml), $delimiter
        # (string on which $data is split into sprintf arguments),
        # $data (value to format).
        # Returns $data unchanged if format isn't defined, otherwise
        # formatted string or, if it matches lookup_regex, result of lookup.
        my $self = shift;

        my ($name,$delimiter,$data) = @_;

        my $log = $self->_get_logger();

        if (! $self->{'import_xml'}->{'format'}->{$name}) {
                $log->warn("<format name=\"$name\"> is not defined in ",$self->{'import_xml_file'});
                return $data;
        }

        my $have_delimiter = (defined($delimiter) && length($delimiter)) ? 1 : 0;
        $log->warn("no delimiter for format $name") if (! $have_delimiter);

        my $format = $self->_x($self->{'import_xml'}->{'format'}->{$name}->{'content'}) || $log->logdie("can't find format '$name'");

        # without delimiter whole value is single argument; split on empty
        # pattern would break it into characters
        my @data = $have_delimiter ? split(/\Q$delimiter\E/, $data) : ($data);

        my $out = sprintf($format, @data);
        $log->debug("using format $name [$format] on $data to produce: $out");

        if ($self->{'lookup_regex'} && $out =~ $self->{'lookup_regex'}) {
                return $self->{'lookup'}->lookup($out);
        }

        return $out;
}

=head2 sort_arr

Sort array ignoring case and html in data

 my @sorted = $webpac->sort_arr(@unsorted);

=cut

sub sort_arr {
        # Sort values ignoring case and html tags in them.
        #
        # Arguments: list of values.
        # Returns the same values, unmodified, ordered by their lower-cased
        # text with tags removed.
        # NOTE(review): order is descending (as it always was in this
        # code) - confirm that this is intended.
        my $self = shift;

        my $log = $self->_get_logger();

        # sort key is computed once for each value, on a copy: $a and $b
        # inside sort are aliases of caller's data, so stripping tags
        # there would damage values themselves
        my @sorted =
                map  { $_->[1] }
                sort { $b->[0] cmp $a->[0] }
                map  {
                        (my $key = $_) =~ s#<[^>]+/*>##g;
                        [ lc($key), $_ ];
                } @_;

        $log->debug("sorted values: ",sub { join(", ",@sorted) });

        return @sorted;
}


=head1 INTERNAL METHODS

=head2 _sort_by_order

Sort xml tags data structure according to C<order=""> attribute.

=cut

sub _sort_by_order {
        # Comparison function for sort: compares numerically tags in
        # package variables $a and $b using their 'order' attribute, or
        # indexer entry itself if order is false.
        my $self = shift;

        my @rank;
        foreach my $tag ($a, $b) {
                push @rank,
                        $self->{'import_xml'}->{'indexer'}->{$tag}->{'order'} ||
                        $self->{'import_xml'}->{'indexer'}->{$tag};
        }

        return $rank[0] <=> $rank[1];
}

=head2 _x

Convert strings from C<conf/normalize/*.xml> encoding into application
specific encoding (optionally specified using C<code_page> to C<new>
constructor).

 my $text = $n->_x('normalize text string');

This is a stub so that other modules don't have to implement it.

=cut

sub _x {
        # stub: hands back its argument without any conversion
        my ($self, $text) = @_;
        return $text;
}


=head1 AUTHOR

Dobrica Pavlinusic, C<< <dpavlin@rot13.org> >>

=head1 COPYRIGHT & LICENSE

Copyright 2005 Dobrica Pavlinusic, All Rights Reserved.

This program is free software; you can redistribute it and/or modify it
under the same terms as Perl itself.

=cut

1; # End of WebPAC::Normalize
1	dpavlin	10	package WebPAC::Normalize;
2
3			use warnings;
4			use strict;
5	dpavlin	29	use base 'WebPAC::Common';
6	dpavlin	13	use Data::Dumper;
7	dpavlin	10
8			=head1 NAME
9
10	dpavlin	15	WebPAC::Normalize - data mungling for normalisation
11	dpavlin	10
12			=head1 VERSION
13
14	dpavlin	295	Version 0.07
15	dpavlin	10
16			=cut
17
18	dpavlin	295	our $VERSION = '0.07';
19	dpavlin	10
20			=head1 SYNOPSIS
21
22	dpavlin	15	This package contains code that mungle data to produce normalized format.
23	dpavlin	10
24	dpavlin	15	It contains several assumptions:
25
26			=over
27
28			=item *
29
30			format of fields is defined using C<v123^a> notation for repeatable fields
31			or C<s123^a> for single (or first) value, where C<123> is field number and
32			C<a> is subfield.
33
34			=item *
35
36			source data records (C<$rec>) have unique identifiers in field C<000>
37
38			=item *
39
40			optional C<eval{length('v123^a') == 3}> tag at B<beginning of format> will be
41			perl code that is evaluated before producing output (value of field will be
42			interpolated before that)
43
44			=item *
45
46			optional C<filter{filter_name}> at B<begining of format> will apply perl
47			code defined as code ref on format after field substitution to producing
48			output
49
50	dpavlin	260	There is one built-in filter called C<regex> which can be use like this:
51
52			filter{regex(s/foo/bar/)}
53
54	dpavlin	15	=item *
55
56			optional C<lookup{...}> will be then performed. See C<WebPAC::Lookups>.
57
58			=item *
59
60			at end, optional C<format>s rules are resolved. Format rules are similar to
61			C<sprintf> and can also contain C<lookup{...}> which is performed after
62			values are inserted in format.
63
64			=back
65
66			This also describes order in which transformations are applied (eval,
67			filter, lookup, format) which is important to undestand when deciding how to
68			solve your data mungling and normalisation process.
69
70
71
72
73	dpavlin	10	=head1 FUNCTIONS
74
75	dpavlin	13	=head2 new
76	dpavlin	10
77	dpavlin	13	Create new normalisation object
78
79			my $n = new WebPAC::Normalize::Something(
80	dpavlin	15	filter => {
81			'filter_name_1' => sub {
82			# filter code
83			return length($_);
84			}, ...
85			},
86	dpavlin	29	db => $db_obj,
87	dpavlin	13	lookup_regex => $lookup->regex,
88	dpavlin	31	lookup => $lookup_obj,
89	dpavlin	219	prefix => 'foobar',
90	dpavlin	13	);
91
92	dpavlin	15	Parametar C<filter> defines user supplied snippets of perl code which can
93			be use with C<filter{...}> notation.
94
95	dpavlin	219	C<prefix> is used to form filename for database record (to support multiple
96			source files which are joined in one database).
97
98	dpavlin	13	Recommended parametar C<lookup_regex> is used to enable parsing of lookups
99	dpavlin	31	in structures. If you pass this parametar, you must also pass C<lookup>
100			which is C<WebPAC::Lookup> object.
101	dpavlin	13
102	dpavlin	10	=cut
103
104	dpavlin	13	sub new {
105			my $class = shift;
106			my $self = {@_};
107			bless($self, $class);
108
109	dpavlin	31	my $r = $self->{'lookup_regex'} ? 1 : 0;
110			my $l = $self->{'lookup'} ? 1 : 0;
111
112			my $log = $self->_get_logger();
113
114			# those two must be in pair
115			if ( ($r & $l) != ($r \|\| $l) ) {
116			my $log = $self->_get_logger();
117			$log->logdie("lookup_regex and lookup must be in pair");
118			}
119
120			$log->logdie("lookup must be WebPAC::Lookup object") if ($self->{'lookup'} && ! $self->{'lookup'}->isa('WebPAC::Lookup'));
121
122	dpavlin	219	$log->warn("no prefix defined. please check that!") unless ($self->{'prefix'});
123
124	dpavlin	252	$log->debug("using lookup regex: ", $self->{lookup_regex}) if ($r && $l);
125
126	dpavlin	295	if (! $self->{filter} \|\| ! $self->{filter}->{regex}) {
127	dpavlin	260	$log->debug("adding built-in filter regex");
128			$self->{filter}->{regex} = sub {
129			my ($val, $regex) = @_;
130			eval "\$val =~ $regex";
131			return $val;
132			};
133			}
134
135	dpavlin	13	$self ? return $self : return undef;
136	dpavlin	10	}
137
138	dpavlin	13
139			=head2 data_structure
140
141			Create in-memory data structure which represents normalized layout from
142			C<conf/normalize/*.xml>.
143
144			This structures are used to produce output.
145
146	dpavlin	70	my $ds = $webpac->data_structure($rec);
147	dpavlin	13
148			=cut
149
150			sub data_structure {
151			my $self = shift;
152
153			my $log = $self->_get_logger();
154
155			my $rec = shift;
156			$log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
157
158	dpavlin	125	$log->debug("data_structure rec = ", sub { Dumper($rec) });
159
160	dpavlin	312	$log->logdie("need unique ID (mfn) in field 000 of record " . Dumper($rec) ) unless (defined($rec->{'000'}));
161	dpavlin	125
162	dpavlin	219	my $id = $rec->{'000'}->[0] \|\| $log->logdie("field 000 isn't array!");
163	dpavlin	125
164	dpavlin	13	my $cache_file;
165
166	dpavlin	18	if ($self->{'db'}) {
167	dpavlin	219	my $ds = $self->{'db'}->load_ds( id => $id, prefix => $self->{prefix} );
168	dpavlin	70	$log->debug("load_ds( rec = ", sub { Dumper($rec) }, ") = ", sub { Dumper($ds) });
169			return $ds if ($ds);
170	dpavlin	29	$log->debug("cache miss, creating");
171	dpavlin	13	}
172
173			my @sorted_tags;
174			if ($self->{tags_by_order}) {
175			@sorted_tags = @{$self->{tags_by_order}};
176			} else {
177			@sorted_tags = sort { $self->_sort_by_order } keys %{$self->{'import_xml'}->{'indexer'}};
178			$self->{tags_by_order} = \@sorted_tags;
179			}
180
181	dpavlin	70	my $ds;
182	dpavlin	13
183			$log->debug("tags: ",sub { join(", ",@sorted_tags) });
184
185			foreach my $field (@sorted_tags) {
186
187			my $row;
188
189			#print "field $field [",$self->{'tag'},"] = ",Dumper($self->{'import_xml'}->{'indexer'}->{$field}->{$self->{'tag'}});
190
191			foreach my $tag (@{$self->{'import_xml'}->{'indexer'}->{$field}->{$self->{'tag'}}}) {
192	dpavlin	38	my $format;
193	dpavlin	13
194	dpavlin	38	$log->logdie("expected tag HASH and got $tag") unless (ref($tag) eq 'HASH');
195			$format = $tag->{'value'} \|\| $tag->{'content'};
196
197	dpavlin	13	$log->debug("format: $format");
198
199			my @v;
200			if ($self->{'lookup_regex'} && $format =~ $self->{'lookup_regex'}) {
201			@v = $self->fill_in_to_arr($rec,$format);
202			} else {
203			@v = $self->parse_to_arr($rec,$format);
204			}
205			next if (! @v);
206
207			if ($tag->{'sort'}) {
208			@v = $self->sort_arr(@v);
209			}
210
211			# use format?
212			if ($tag->{'format_name'}) {
213			@v = map { $self->apply_format($tag->{'format_name'},$tag->{'format_delimiter'},$_) } @v;
214			}
215
216			# delimiter will join repeatable fields
217			if ($tag->{'delimiter'}) {
218			@v = ( join($tag->{'delimiter'}, @v) );
219			}
220
221			# default types
222	dpavlin	74	my @types = qw(display search);
223	dpavlin	13	# override by type attribute
224			@types = ( $tag->{'type'} ) if ($tag->{'type'});
225
226			foreach my $type (@types) {
227			# append to previous line?
228	dpavlin	252	$log->debug("type: $type ",sub { join(" ",@v) }, " ", $row->{'append'} \|\| 'no append');
229	dpavlin	13	if ($tag->{'append'}) {
230
231			# I will delimit appended part with
232			# delimiter (or ,)
233			my $d = $tag->{'delimiter'};
234			# default delimiter
235			$d \|\|= " ";
236
237			my $last = pop @{$row->{$type}};
238			$d = "" if (! $last);
239			$last .= $d . join($d, @v);
240			push @{$row->{$type}}, $last;
241
242			} else {
243			push @{$row->{$type}}, @v;
244			}
245			}
246
247
248			}
249
250			if ($row) {
251			$row->{'tag'} = $field;
252
253			# TODO: name_sigular, name_plural
254			my $name = $self->{'import_xml'}->{'indexer'}->{$field}->{'name'};
255	dpavlin	70	my $row_name = $name ? $self->_x($name) : $field;
256	dpavlin	13
257			# post-sort all values in field
258			if ($self->{'import_xml'}->{'indexer'}->{$field}->{'sort'}) {
259			$log->warn("sort at field tag not implemented");
260			}
261
262	dpavlin	70	$ds->{$row_name} = $row;
263	dpavlin	13
264			$log->debug("row $field: ",sub { Dumper($row) });
265			}
266
267			}
268
269	dpavlin	22	$self->{'db'}->save_ds(
270	dpavlin	219	id => $id,
271	dpavlin	70	ds => $ds,
272	dpavlin	219	prefix => $self->{prefix},
273	dpavlin	18	) if ($self->{'db'});
274	dpavlin	13
275	dpavlin	70	$log->debug("ds: ", sub { Dumper($ds) });
276	dpavlin	29
277	dpavlin	70	$log->logconfess("data structure returned is not array any more!") if wantarray;
278	dpavlin	13
279	dpavlin	70	return $ds;
280
281	dpavlin	13	}
282
283			=head2 parse
284
285			Perform smart parsing of string, skipping delimiters for fields which aren't
286			defined. It can also eval code in format starting with C<eval{...}> and
287			return output or nothing depending on eval code.
288
289			my $text = $webpac->parse($rec,'eval{"v901^a" eq "Deskriptor"}descriptor: v250^a', $i);
290
291	dpavlin	260	Filters are implemented here. While simple form of filters looks like this:
292
293			filter{name_of_filter}
294
295			but, filters can also have variable number of parametars like this:
296
297			filter{name_of_filter(param,param,param)}
298
299	dpavlin	13	=cut
300
301	dpavlin	261	my $warn_once;
302
# Build output string for one repetition of a record according to format.
#
# Arguments:
#   $rec         - hash ref with record data
#   $format_utf8 - format with v123^a (repeatable) and s123^a (single value)
#                  placeholders, optionally starting with eval{...} and/or
#                  filter{...}
#   $i           - optional index of repetition (defaults to 0)
#
# Delimiters in front of placeholders which have no data are skipped.
#
# Returns resulting string, or nothing if format is empty, no placeholder
# produced data, eval{...} is false or filter returned undef.
sub parse {
	my $self = shift;

	my ($rec, $format_utf8, $i) = @_;

	return if (! $format_utf8);

	my $log = $self->_get_logger();

	$log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);

	$i = 0 if (! $i);

	# same guard as in fill_in: a format with s123 placeholder finds data
	# for every $i, so parse_to_arr would otherwise never terminate
	$log->logdie("infinite loop in format $format_utf8") if ($i > ($self->{'max_mfn'} || 9999));

	my $format = $self->_x($format_utf8) || $log->logconfess("can't convert '$format_utf8' from UTF-8 to ",$self->{'code_page'});

	my @out;

	$log->debug("format: $format");

	# braces are escaped because unescaped literal { in regex is
	# deprecated (and in some contexts fatal) on newer perls

	my $eval_code;
	# remove eval{...} from beginning
	$eval_code = $1 if ($format =~ s/^eval\{([^}]+)\}//s);

	my $filter_name;
	# remove filter{...} from beginning
	$filter_name = $1 if ($format =~ s/^filter\{([^}]+)\}//s);

	my $prefix;
	my $all_found = 0;

	while ($format =~ s/^(.*?)(v|s)(\d+)(?:\^(\w))?//s) {

		# copy captures before anything else can clobber them; $1 is
		# always defined here, so delimiter "0" is kept as it is
		my ($del, $type, $field, $subfield) = ($1, $2, $3, $4);

		# delimiter in front of first placeholder(s) is prefix
		$prefix ||= $del if ($all_found == 0);

		# repeatable index (single value fields always use first one)
		my $r = ($type eq 's') ? 0 : $i;

		my $found = 0;
		my $tmp = $self->get_data(\$rec, $field, $subfield, $r, \$found);

		if ($found) {
			push @out, $del, $tmp;
			$all_found += $found;
		}
	}

	return if (! $all_found);

	my $out = join('',@out);

	if ($out) {
		# add rest of format (suffix)
		$out .= $format;

		# add prefix if not there
		$out = $prefix . $out if ($out !~ m/^\Q$prefix\E/);

		$log->debug("result: $out");
	}

	if ($eval_code) {
		my $eval = $self->fill_in($rec,$eval_code,$i) || return;
		$log->debug("about to eval{$eval} format: $out");
		return if (! $self->_eval($eval));
	}

	if ($filter_name) {
		my @filter_args;
		# filter_name(arg1,arg2) => name + list of arguments
		if ($filter_name =~ s/(\w+)\((.*)\)/$1/) {
			@filter_args = split(/,/, $2);
		}
		if ($self->{'filter'}->{$filter_name}) {
			$log->debug("about to filter{$filter_name} format: $out with arguments: ", join(",", @filter_args));
			# filter gets output as first argument
			unshift @filter_args, $out;
			$out = $self->{'filter'}->{$filter_name}->(@filter_args);
			return unless(defined($out));
			$log->debug("filter result: $out");
		} elsif (! $warn_once->{$filter_name}) {
			# complain about each unknown filter only once
			$log->warn("trying to use undefined filter $filter_name");
			$warn_once->{$filter_name}++;
		}
	}

	return $out;
}
391
392			=head2 parse_to_arr
393
394			Similar to C<parse>, but returns array of all repeatable fields
395
396			my @arr = $webpac->parse_to_arr($rec,'v250^a');
397
398			=cut
399
# Collect results of parse() for successive repetitions (0, 1, 2, ...)
# until parse() returns false value; returns list of collected strings.
sub parse_to_arr {
	my ($self, $rec, $format_utf8) = @_;

	my $log = $self->_get_logger();

	if ($rec !~ /HASH/o) {
		$log->logconfess("need HASH as first argument!");
	}
	return unless ($format_utf8);

	my @arr;

	for (my $nr = 0; ; $nr++) {
		my $value = $self->parse($rec, $format_utf8, $nr);
		# first false value marks end of repeatable field
		last unless ($value);
		push @arr, $value;
	}

	if (@arr) {
		$log->debug("format '$format_utf8' returned ", scalar(@arr), " elements: ", sub { join(" | ",@arr) });
	}

	return @arr;
}
421
422	dpavlin	15
423			=head2 fill_in
424
425			Workhorse of all: takes record from in-memory structure of database and
426			string with placeholders and returns string or array with substituted
427			values from record.
428
429			my $text = $webpac->fill_in($rec,'v250^a');
430
431			Optional argument is ordinal number for repeatable fields. By default,
432			it's assumed to be first repeatable field (fields are perl array, so first
433			element is 0).
434			Following example will read second value from repeatable field.
435
436			my $text = $webpac->fill_in($rec,'Title: v250^a',1);
437
438			This function B<does not> perform parsing of format to intelligently skip
439			delimiters before fields which aren't used.
440
441			This method will automatically decode UTF-8 string to local code page
442			if needed.
443
444			=cut
445
# Replace all v123^a / s123^a placeholders in format with values from record.
#
# Arguments:
#   $rec    - hash ref with record data (required)
#   $format - string with placeholders, optionally starting with eval{...}
#             and/or filter{...} (required)
#   $i      - optional index of repetition for v placeholders (defaults to 0)
#
# Returns nothing if no placeholder produced data, eval{...} is false or
# filter returned undef. If lookup object is configured and has lookup
# method, returns list produced by it; otherwise returns filled-in string.
sub fill_in {
	my $self = shift;

	my $log = $self->_get_logger();

	my $rec = shift || $log->logconfess("need data record");
	my $format = shift || $log->logconfess("need format to parse");
	# iteration (for repeatable fields)
	my $i = shift || 0;

	$log->logdie("infitite loop in format $format") if ($i > ($self->{'max_mfn'} || 9999));

	# FIXME remove for speedup?
	$log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);

	if (utf8::is_utf8($format)) {
		$format = $self->_x($format);
	}

	my $found = 0;

	# braces are escaped because unescaped literal { in regex is
	# deprecated (and in some contexts fatal) on newer perls

	my $eval_code;
	# remove eval{...} from beginning
	$eval_code = $1 if ($format =~ s/^eval\{([^}]+)\}//s);

	my $filter_name;
	# remove filter{...} from beginning
	$filter_name = $1 if ($format =~ s/^filter\{([^}]+)\}//s);

	# do actual replacement of placeholders in single pass, so that
	# values which were just inserted (e.g. "class5") are never
	# re-scanned as if they were placeholders themselves;
	# v = repeatable field (index $i), s = single value (index 0)
	$format =~ s/([vs])(\d+)(?:\^(\w))?/$self->get_data(\$rec, $2, $3, ($1 eq 's' ? 0 : $i), \$found)/ges;

	return unless ($found);

	$log->debug("format: $format");

	if ($eval_code) {
		my $eval = $self->fill_in($rec,$eval_code,$i);
		return if (! $self->_eval($eval));
	}

	if ($filter_name && $self->{'filter'}->{$filter_name}) {
		$log->debug("filter '$filter_name' for $format");
		$format = $self->{'filter'}->{$filter_name}->($format);
		return unless(defined($format));
		$log->debug("filter result: $format");
	}

	# do we have lookups?
	if ($self->{'lookup'}) {
		if ($self->{'lookup'}->can('lookup')) {
			my @lookup = $self->{lookup}->lookup($format);
			$log->debug("lookup $format", join(", ", @lookup));
			return @lookup;
		}
		$log->warn("Have lookup object but can't invoke lookup method");
	}

	# no usable lookup: return filled-in format itself (and not
	# whatever the last executed statement happened to return)
	return $format;
}
509
510
511	dpavlin	13	=head2 fill_in_to_arr
512
513			Similar to C<fill_in>, but returns array of all repeatable fields. Usable
514			for fields which have lookups, so they shouldn't be parsed but rather
515			C<fill_in>ed.
516
517			my @arr = $webpac->fill_in_to_arr($rec,'[v900];;[v250^a]');
518
519			=cut
520
# Collect results of fill_in() for successive repetitions (0, 1, 2, ...)
# until fill_in() returns empty list; returns all collected values.
sub fill_in_to_arr {
	my ($self, $rec, $format_utf8) = @_;

	my $log = $self->_get_logger();

	if ($rec !~ /HASH/o) {
		$log->logconfess("need HASH as first argument!");
	}
	return unless ($format_utf8);

	my @arr;
	# number of repetitions which returned something
	my $rounds = 0;

	for (;;) {
		my @values = $self->fill_in($rec, $format_utf8, $rounds);
		# empty list marks end of repeatable field
		last unless (@values);
		push @arr, @values;
		$rounds++;
	}

	if (@arr) {
		$log->debug("format '$format_utf8' returned ", $rounds, " elements: ", sub { join(" | ",@arr) });
	}

	return @arr;
}
542
543	dpavlin	15
544			=head2 get_data
545
546			Returns value from record.
547
548			my $text = $self->get_data(\$rec,$f,$sf,$i,\$found);
549
550			Arguments are:
551			record reference C<$rec>,
552			field C<$f>,
553			optional subfield C<$sf>,
554			index for repeatable values C<$i>.
555
556			Optional variable C<$found> will be incremented if there
557			is field.
558
559			Returns value or empty string.
560
561			=cut
562
# Fetch value of field (and optional subfield) from record.
#
# Arguments:
#   $rec   - reference to record hash ref (record is used as
#            $$rec->{field}->[index], optionally ->{subfield})
#   $f     - field
#   $sf    - optional subfield
#   $i     - index of repetition
#   $found - optional reference to counter which is incremented when
#            value is found (only if counter is defined)
#
# Returns value or empty string. If repetition is hash of subfields and no
# subfield is requested, values of all subfields are returned, ordered by
# subfield name, each followed by single space.
#
# False values (including "0") are treated as missing.
sub get_data {
	my $self = shift;

	my ($rec,$f,$sf,$i,$found) = @_;

	# core module, loaded here so this sub doesn't depend on file header
	require Scalar::Util;

	my $field = $$rec->{$f} or return '';
	my $value = $field->[$i] or return '';

	# reftype sees through blessed hashes and, unlike matching
	# stringified value against /HASH/, can't be fooled by plain
	# string which just contains "HASH"
	my $is_hash = ((Scalar::Util::reftype($value) || '') eq 'HASH');

	my $out;

	if (defined($sf) && length($sf)) {
		# subfield "0" is valid subfield name, so test for
		# definedness and not truth.
		# Subfield can be found only in hash of subfields; scalar
		# value must never be dereferenced as symbolic reference.
		return '' unless ($is_hash && $value->{$sf});
		$out = $value->{$sf};
	} elsif ($is_hash) {
		# it still might have subfield, just
		# not specified, so we'll dump all
		# (sorted, so that output doesn't depend on hash order)
		$out = join('', map { $value->{$_} . ' ' } sort keys %$value);
	} else {
		$out = $value;
	}

	$$found++ if (ref($found) && defined($$found));

	return $out;
}
594
595
596			=head2 apply_format
597
598			Apply format specified in tag with C<format_name="name"> and
599			C<format_delimiter=";;">.
600
601			my $text = $webpac->apply_format($format_name,$format_delimiter,$data);
602
603			Formats can contain C<lookup{...}> if you need them.
604
605			=cut
606
# Apply named sprintf-like format to data.
#
# Arguments:
#   $name      - name of format (key in $self->{'import_xml'}->{'format'})
#   $delimiter - delimiter on which $data is split into sprintf arguments
#   $data      - string with values
#
# Returns $data unchanged (after warning) if format isn't defined.
# Otherwise returns formatted string, or result of lookup on it if
# it matches $self->{'lookup_regex'}.
sub apply_format {
	my $self = shift;

	my ($name,$delimiter,$data) = @_;

	my $log = $self->_get_logger();

	if (! $self->{'import_xml'}->{'format'}->{$name}) {
		$log->warn("<format name=\"$name\"> is not defined in ",$self->{'import_xml_file'});
		return $data;
	}

	my $has_delimiter = (defined($delimiter) && length($delimiter));

	$log->warn("no delimiter for format $name") if (! $has_delimiter);

	my $format = $self->_x($self->{'import_xml'}->{'format'}->{$name}->{'content'}) || $log->logdie("can't find format '$name'");

	# splitting on empty pattern would break data into single
	# characters, so without delimiter whole data is the only value
	my @data = $has_delimiter
		? split(/\Q$delimiter\E/, $data)
		: ($data);

	my $out = sprintf($format, @data);
	$log->debug("using format $name [$format] on $data to produce: $out");

	if ($self->{'lookup_regex'} && $out =~ $self->{'lookup_regex'}) {
		return $self->{'lookup'}->lookup($out);
	}

	return $out;
}
635
636	dpavlin	13	=head2 sort_arr
637
638			Sort array ignoring case and html in data
639
640			my @sorted = $webpac->sort_arr(@unsorted);
641
642			=cut
643
# Sort values ignoring case and html tags.
#
# Takes list of values and returns new list in descending order (same
# direction as the original comparison, lc($b) cmp lc($a)).
#
# Tags are ignored only while comparing: returned values are the same as
# passed ones and arguments of caller are never modified.
sub sort_arr {
	my $self = shift;

	my $log = $self->_get_logger();

	# Schwartzian transform: sort key is computed once, on a copy of
	# each value. Stripping tags from $a and $b inside of sort block
	# would modify values of caller (and die on read-only ones), because
	# they are aliases and not copies.
	my @sorted =
		map  { $_->[1] }
		sort { $b->[0] cmp $a->[0] }
		map  {
			(my $key = $_) =~ s#<[^>]+/*>##g;
			[ lc($key), $_ ];
		} @_;

	$log->debug("sorted values: ",sub { join(", ",@sorted) });

	return @sorted;
}
660
661
662	dpavlin	15	=head1 INTERNAL METHODS
663
664	dpavlin	13	=head2 _sort_by_order
665
666			Sort xml tags data structure according to C<order=""> attribute.
667
668			=cut
669
# Comparison helper: numerically compares indexer entries named by package
# variables $a and $b. Entry is represented by its 'order' value or, if
# that is false, by the entry itself.
sub _sort_by_order {
	my $self = shift;

	# value by which single indexer entry is ordered
	my $order_of = sub {
		my $tag = shift;
		return $self->{'import_xml'}->{'indexer'}->{$tag}->{'order'}
			|| $self->{'import_xml'}->{'indexer'}->{$tag};
	};

	my $va = $order_of->($a);
	my $vb = $order_of->($b);

	return $va <=> $vb;
}
680
681			=head2 _x
682
683	dpavlin	15	Convert strings from C<conf/normalize/*.xml> encoding into application
684			specific encoding (optionally specified using C<code_page> to C<new>
685			constructor).
686	dpavlin	13
687			my $text = $n->_x('normalize text string');
688
689			This is a stub so that other modules don't have to implement it.
690
691			=cut
692
# Stub of encoding conversion: returns passed string unchanged.
sub _x {
	my ($self, $text) = @_;

	return $text;
}
697
698
699	dpavlin	10	=head1 AUTHOR
700
701			Dobrica Pavlinusic, C<< <dpavlin@rot13.org> >>
702
703			=head1 COPYRIGHT & LICENSE
704
705			Copyright 2005 Dobrica Pavlinusic, All Rights Reserved.
706
707			This program is free software; you can redistribute it and/or modify it
708			under the same terms as Perl itself.
709
710			=cut
711
712	dpavlin	209	1; # End of WebPAC::Normalize