lib/WebPAC/Normalize.pm

package WebPAC::Normalize;

use warnings;
use strict;
use base 'WebPAC::Common';
use Data::Dumper;

=head1 NAME

WebPAC::Normalize - data mungling for normalisation

=head1 VERSION

Version 0.08

=cut

our $VERSION = '0.08';

=head1 SYNOPSIS

This package contains code that mungle data to produce normalized format.

It contains several assumptions:

=over

=item *

format of fields is defined using C<v123^a> notation for repeatable fields
or C<s123^a> for single (or first) value, where C<123> is field number and
C<a> is subfield.

=item *

source data records (C<$rec>) have unique identifiers in field C<000>

=item *

optional C<eval{length('v123^a') == 3}> tag at B<beginning of format> will be
perl code that is evaluated before producing output (value of field will be
interpolated before that)

=item *

optional C<filter{filter_name}> at B<begining of format> will apply perl
code defined as code ref on format after field substitution to producing
output

There is one built-in filter called C<regex> which can be use like this:

  filter{regex(s/foo/bar/)}

=item *

optional C<lookup{...}> will be then performed. See C<WebPAC::Lookups>.

=item *

at end, optional C<format>s rules are resolved. Format rules are similar to
C<sprintf> and can also contain C<lookup{...}> which is performed after
values are inserted in format.

=back

This also describes order in which transformations are applied (eval,
filter, lookup, format) which is important to undestand when deciding how to
solve your data mungling and normalisation process.


=head1 FUNCTIONS

=head2 new

Create new normalisation object

  my $n = new WebPAC::Normalize::Something(
        filter => {
                'filter_name_1' => sub {
                        # filter code
                        return length($_);
                }, ...
        },
        db => $db_obj,
        lookup_regex => $lookup->regex,
        lookup => $lookup_obj,
        prefix => 'foobar',
  );

Parametar C<filter> defines user supplied snippets of perl code which can
be use with C<filter{...}> notation.

C<prefix> is used to form filename for database record (to support multiple
source files which are joined in one database).

Recommended parametar C<lookup_regex> is used to enable parsing of lookups
in structures. If you pass this parametar, you must also pass C<lookup>
which is C<WebPAC::Lookup> object.

=cut

sub new {
        my $class = shift;
        my $self = {@_};
        bless($self, $class);

        my $r = $self->{'lookup_regex'} ? 1 : 0;
        my $l = $self->{'lookup'} ? 1 : 0;

        my $log = $self->_get_logger();

        # those two must be in pair
        if ( ($r & $l) != ($r || $l) ) {
                my $log = $self->_get_logger();
                $log->logdie("lookup_regex and lookup must be in pair");
        }

        $log->logdie("lookup must be WebPAC::Lookup object") if ($self->{'lookup'} && ! $self->{'lookup'}->isa('WebPAC::Lookup'));

        $log->warn("no prefix defined. please check that!") unless ($self->{'prefix'});

        $log->debug("using lookup regex: ", $self->{lookup_regex}) if ($r && $l);

        if (! $self->{filter} || ! $self->{filter}->{regex}) {
                $log->debug("adding built-in filter regex");
                $self->{filter}->{regex} = sub {
                        my ($val, $regex) = @_;
                        eval "\$val =~ $regex";
                        return $val;
                };
        }

        $self ? return $self : return undef;
}


=head2 data_structure

Create in-memory data structure which represents normalized layout from
C<conf/normalize/*.xml>.

This structures are used to produce output.

 my $ds = $webpac->data_structure($rec);

=cut

sub data_structure {
        my $self = shift;

        my $log = $self->_get_logger();

        my $rec = shift;
        $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);

        $log->debug("data_structure rec = ", sub { Dumper($rec) });

        $log->logdie("need unique ID (mfn) in field 000 of record " . Dumper($rec) ) unless (defined($rec->{'000'}));

        my $id = $rec->{'000'}->[0] || $log->logdie("field 000 isn't array!");

        my $cache_file;

        if ($self->{'db'}) {
                my $ds = $self->{'db'}->load_ds( id => $id, prefix => $self->{prefix} );
                $log->debug("load_ds( rec = ", sub { Dumper($rec) }, ") = ", sub { Dumper($ds) });
                return $ds if ($ds);
                $log->debug("cache miss, creating");
        }

        my @sorted_tags;
        if ($self->{tags_by_order}) {
                @sorted_tags = @{$self->{tags_by_order}};
        } else {
                @sorted_tags = sort { $self->_sort_by_order } keys %{$self->{'import_xml'}->{'indexer'}};
                $self->{tags_by_order} = \@sorted_tags;
        }

        my $ds;

        $log->debug("tags: ",sub { join(", ",@sorted_tags) });

        foreach my $field (@sorted_tags) {

                my $row;

#print "field $field [",$self->{'tag'},"] = ",Dumper($self->{'import_xml'}->{'indexer'}->{$field}->{$self->{'tag'}});

                foreach my $tag (@{$self->{'import_xml'}->{'indexer'}->{$field}->{$self->{'tag'}}}) {
                        my $format;

                        $log->logdie("expected tag HASH and got $tag") unless (ref($tag) eq 'HASH');
                        $format = $tag->{'value'} || $tag->{'content'};

                        my @v;
                        if ($self->{'lookup_regex'} && $format =~ $self->{'lookup_regex'}) {
                                @v = $self->fill_in_to_arr($rec,$format);
                        } else {
                                @v = $self->parse_to_arr($rec,$format);
                        }
                        if (! @v) {
                                $log->debug("$field <",$self->{tag},"> format: $format no values");
#                               next;
                        } else {
                                $log->debug("$field <",$self->{tag},"> format: $format values: ", join(",", @v));
                        }

                        if ($tag->{'sort'}) {
                                @v = $self->sort_arr(@v);
                        }

                        # use format?
                        if ($tag->{'format_name'}) {
                                @v = map { $self->apply_format($tag->{'format_name'},$tag->{'format_delimiter'},$_) } @v;
                        }

                        # delimiter will join repeatable fields
                        if ($tag->{'delimiter'}) {
                                @v = ( join($tag->{'delimiter'}, @v) );
                        }

                        # default types 
                        my @types = qw(display search);
                        # override by type attribute
                        @types = ( $tag->{'type'} ) if ($tag->{'type'});

                        foreach my $type (@types) {
                                # append to previous line?
                                $log->debug("tag $field / $type [",sub { join(",",@v) }, "] ", $row->{'append'} || 'no append');
                                if ($tag->{'append'}) {

                                        # I will delimit appended part with
                                        # delimiter (or ,)
                                        my $d = $tag->{'delimiter'};
                                        # default delimiter
                                        $d ||= " ";

                                        my $last = pop @{$row->{$type}};
                                        $d = "" if (! $last);
                                        $last .= $d . join($d, @v);
                                        push @{$row->{$type}}, $last;

                                } else {
                                        push @{$row->{$type}}, @v;
                                }
                        }


                }

                if ($row) {
                        $row->{'tag'} = $field;

                        # TODO: name_sigular, name_plural
                        my $name = $self->{'import_xml'}->{'indexer'}->{$field}->{'name'};
                        my $row_name = $name ? $self->_x($name) : $field;

                        # post-sort all values in field
                        if ($self->{'import_xml'}->{'indexer'}->{$field}->{'sort'}) {
                                $log->warn("sort at field tag not implemented");
                        }

                        $ds->{$row_name} = $row;

                        $log->debug("row $field: ",sub { Dumper($row) });
                }

        }

        $self->{'db'}->save_ds(
                id => $id,
                ds => $ds,
                prefix => $self->{prefix},
        ) if ($self->{'db'});

        $log->debug("ds: ", sub { Dumper($ds) });

        $log->logconfess("data structure returned is not array any more!") if wantarray;

        return $ds;

}

=head2 parse

Perform smart parsing of string, skipping delimiters for fields which aren't
defined. It can also eval code in format starting with C<eval{...}> and
return output or nothing depending on eval code.

 my $text = $webpac->parse($rec,'eval{"v901^a" eq "Deskriptor"}descriptor: v250^a', $i);

Filters are implemented here. While simple form of filters looks like this:

  filter{name_of_filter}

but, filters can also have variable number of parametars like this:

  filter{name_of_filter(param,param,param)}

=cut

my $warn_once;

sub parse {
        my $self = shift;

        my ($rec, $format_utf8, $i) = @_;

        return if (! $format_utf8);

        my $log = $self->_get_logger();

        $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);

        $i = 0 if (! $i);

        my $format = $self->_x($format_utf8) || $log->logconfess("can't convert '$format_utf8' from UTF-8 to ",$self->{'code_page'});

        my @out;

        $log->debug("format: $format [$i]");

        my $eval_code;
        # remove eval{...} from beginning
        $eval_code = $1 if ($format =~ s/^eval{([^}]+)}//s);

        my $filter_name;
        # remove filter{...} from beginning
        $filter_name = $1 if ($format =~ s/^filter{([^}]+)}//s);

        # did we found any (att all) field from format in row?
        my $found_any;
        # prefix before first field which we preserve it $found_any
        my $prefix;

        my $f_step = 1;

        while ($format =~ s/^(.*?)(v|s)(\d+)(?:\^(\w))?//s) {

                my $del = $1 || '';
                $prefix = $del if ($f_step == 1);

                my $fld_type = lc($2);

                # repeatable index
                my $r = $i;
                if ($fld_type eq 's') {
                        if ($found_any->{'v'}) {
                                $r = 0;
                        } else {
                                return;
                        }
                }

                my $found = 0;
                my $tmp = $self->get_data(\$rec,$3,$4,$r,\$found);

                if ($found) {
                        $found_any->{$fld_type} += $found;

                        # we will skip delimiter before first occurence of field!
                        push @out, $del unless($found_any->{$fld_type} == 1);
                        push @out, $tmp;
                }
                $f_step++;
        }

        # test if any fields found?
        return if (! $found_any->{'v'} && ! $found_any->{'s'});

        my $out = join('',@out);

        if ($out) {
                # add rest of format (suffix)
                $out .= $format;

                # add prefix if not there
                $out = $prefix . $out if ($out !~ m/^\Q$prefix\E/);

                $log->debug("result: $out");
        }

        if ($eval_code) {
                my $eval = $self->fill_in($rec,$eval_code,$i) || return;
                $log->debug("about to eval{$eval} format: $out");
                return if (! $self->_eval($eval));
        }
        
        if ($filter_name) {
                my @filter_args;
                if ($filter_name =~ s/(\w+)\((.*)\)/$1/) {
                        @filter_args = split(/,/, $2);
                }
                if ($self->{'filter'}->{$filter_name}) {
                        $log->debug("about to filter{$filter_name} format: $out with arguments: ", join(",", @filter_args));
                        unshift @filter_args, $out;
                        $out = $self->{'filter'}->{$filter_name}->(@filter_args);
                        return unless(defined($out));
                        $log->debug("filter result: $out");
                } elsif (! $warn_once->{$filter_name}) {
                        $log->warn("trying to use undefined filter $filter_name");
                        $warn_once->{$filter_name}++;
                }
        }

        return $out;
}

=head2 parse_to_arr

Similar to C<parse>, but returns array of all repeatable fields

 my @arr = $webpac->parse_to_arr($rec,'v250^a');

=cut

sub parse_to_arr {
        my $self = shift;

        my ($rec, $format_utf8) = @_;

        my $log = $self->_get_logger();

        $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
        return if (! $format_utf8);

        my $i = 0;
        my @arr;

        while (my $v = $self->parse($rec,$format_utf8,$i++)) {
                push @arr, $v;
        }

        $log->debug("format '$format_utf8' returned ",--$i," elements: ", sub { join(" | ",@arr) }) if (@arr);

        return @arr;
}


=head2 fill_in

Workhourse of all: takes record from in-memory structure of database and
strings with placeholders and returns string or array of with substituted
values from record.

 my $text = $webpac->fill_in($rec,'v250^a');

Optional argument is ordinal number for repeatable fields. By default,
it's assume to be first repeatable field (fields are perl array, so first
element is 0).
Following example will read second value from repeatable field.

 my $text = $webpac->fill_in($rec,'Title: v250^a',1);

This function B<does not> perform parsing of format to inteligenty skip
delimiters before fields which aren't used.

This method will automatically decode UTF-8 string to local code page
if needed.

=cut

sub fill_in {
        my $self = shift;

        my $log = $self->_get_logger();

        my $rec = shift || $log->logconfess("need data record");
        my $format = shift || $log->logconfess("need format to parse");
        # iteration (for repeatable fields)
        my $i = shift || 0;

        $log->logdie("infitite loop in format $format") if ($i > ($self->{'max_mfn'} || 9999));

        # FIXME remove for speedup?
        $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);

        if (utf8::is_utf8($format)) {
                $format = $self->_x($format);
        }

        my $found = 0;
        my $just_single = 1;

        my $eval_code;
        # remove eval{...} from beginning
        $eval_code = $1 if ($format =~ s/^eval{([^}]+)}//s);

        my $filter_name;
        # remove filter{...} from beginning
        $filter_name = $1 if ($format =~ s/^filter{([^}]+)}//s);

        # do actual replacement of placeholders
        # repeatable fields
        if ($format =~ s/v(\d+)(?:\^(\w))?/$self->get_data(\$rec,$1,$2,$i,\$found)/ges) {
                $just_single = 0;
        }

        # non-repeatable fields
        if ($format =~ s/s(\d+)(?:\^(\w))?/$self->get_data(\$rec,$1,$2,0,\$found)/ges) {
                return if ($i > 0 && $just_single);
        }

        if ($found) {
                $log->debug("format: $format");
                if ($eval_code) {
                        my $eval = $self->fill_in($rec,$eval_code,$i);
                        return if (! $self->_eval($eval));
                }
                if ($filter_name && $self->{'filter'}->{$filter_name}) {
                        $log->debug("filter '$filter_name' for $format");
                        $format = $self->{'filter'}->{$filter_name}->($format);
                        return unless(defined($format));
                        $log->debug("filter result: $format");
                }
                # do we have lookups?
                if ($self->{'lookup'}) {
                        if ($self->{'lookup'}->can('lookup')) {
                                my @lookup = $self->{lookup}->lookup($format);
                                $log->debug("lookup $format", join(", ", @lookup));
                                return @lookup;
                        } else {
                                $log->warn("Have lookup object but can't invoke lookup method");
                        }
                } else {
                        return $format;
                }
        } else {
                return;
        }
}


=head2 fill_in_to_arr

Similar to C<fill_in>, but returns array of all repeatable fields. Usable
for fields which have lookups, so they shouldn't be parsed but rather
C<fill_id>ed.

 my @arr = $webpac->fill_in_to_arr($rec,'[v900];;[v250^a]');

=cut

sub fill_in_to_arr {
        my $self = shift;

        my ($rec, $format_utf8) = @_;

        my $log = $self->_get_logger();

        $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
        return if (! $format_utf8);

        my $i = 0;
        my @arr;

        while (my $v = $self->fill_in($rec,$format_utf8,$i++)) {
                push @arr, $v;
        }

        $log->debug("format '$format_utf8' returned ",--$i," elements: ", sub { join(" | ",@arr) }) if (@arr);

        return @arr;
}


=head2 get_data

Returns value from record.

 my $text = $self->get_data(\$rec,$f,$sf,$i,\$found);

Arguments are:
record reference C<$rec>,
field C<$f>,
optional subfiled C<$sf>,
index for repeatable values C<$i>.

Optinal variable C<$found> will be incremeted if there
is field.

Returns value or empty string.

=cut

sub get_data {
        my $self = shift;

        my ($rec,$f,$sf,$i,$found) = @_;

        if ($$rec->{$f}) {
                return '' if (! $$rec->{$f}->[$i]);
                no strict 'refs';
                if ($sf && $$rec->{$f}->[$i]->{$sf}) {
                        $$found++ if (defined($$found));
                        return $$rec->{$f}->[$i]->{$sf};
                } elsif (! $sf && $$rec->{$f}->[$i]) {
                        $$found++ if (defined($$found));
                        # it still might have subfield, just
                        # not specified, so we'll dump all
                        if ($$rec->{$f}->[$i] =~ /HASH/o) {
                                my $out;
                                foreach my $k (keys %{$$rec->{$f}->[$i]}) {
                                        my $v = $$rec->{$f}->[$i]->{$k};
                                        $out .= "$v " if ($v);
                                }
                                return $out;
                        } else {
                                return $$rec->{$f}->[$i];
                        }
                } else {
                        return '';
                }
        } else {
                return '';
        }
}


=head2 apply_format

Apply format specified in tag with C<format_name="name"> and
C<format_delimiter=";;">.

 my $text = $webpac->apply_format($format_name,$format_delimiter,$data);

Formats can contain C<lookup{...}> if you need them.

=cut

sub apply_format {
        my $self = shift;

        my ($name,$delimiter,$data) = @_;

        my $log = $self->_get_logger();

        if (! $self->{'import_xml'}->{'format'}->{$name}) {
                $log->warn("<format name=\"$name\"> is not defined in ",$self->{'import_xml_file'});
                return $data;
        }

        $log->warn("no delimiter for format $name") if (! $delimiter);

        my $format = $self->_x($self->{'import_xml'}->{'format'}->{$name}->{'content'}) || $log->logdie("can't find format '$name'");

        my @data = split(/\Q$delimiter\E/, $data);

        my $out = sprintf($format, @data);
        $log->debug("using format $name [$format] on $data to produce: $out");

        if ($self->{'lookup_regex'} && $out =~ $self->{'lookup_regex'}) {
                return $self->{'lookup'}->lookup($out);
        } else {
                return $out;
        }

}

=head2 sort_arr

Sort array ignoring case and html in data

 my @sorted = $webpac->sort_arr(@unsorted);

=cut

sub sort_arr {
        my $self = shift;

        my $log = $self->_get_logger();

        # FIXME add Schwartzian Transformation?

        my @sorted = sort {
                $a =~ s#<[^>]+/*>##;
                $b =~ s#<[^>]+/*>##;
                lc($b) cmp lc($a)
        } @_;
        $log->debug("sorted values: ",sub { join(", ",@sorted) });

        return @sorted;
}


=head1 INTERNAL METHODS

=head2 _sort_by_order

Sort xml tags data structure accoding to C<order=""> attribute.

=cut

sub _sort_by_order {
        my $self = shift;

        my $va = $self->{'import_xml'}->{'indexer'}->{$a}->{'order'} ||
                $self->{'import_xml'}->{'indexer'}->{$a};
        my $vb = $self->{'import_xml'}->{'indexer'}->{$b}->{'order'} ||
                $self->{'import_xml'}->{'indexer'}->{$b};

        return $va <=> $vb;
}

=head2 _x

Convert strings from C<conf/normalize/*.xml> encoding into application
specific encoding (optinally specified using C<code_page> to C<new>
constructor).

 my $text = $n->_x('normalize text string');

This is a stub so that other modules doesn't have to implement it.

=cut

sub _x {
        my $self = shift;
        return shift;
}


=head1 AUTHOR

Dobrica Pavlinusic, C<< <dpavlin@rot13.org> >>

=head1 COPYRIGHT & LICENSE

Copyright 2005 Dobrica Pavlinusic, All Rights Reserved.

This program is free software; you can redistribute it and/or modify it
under the same terms as Perl itself.

=cut

1; # End of WebPAC::Normalize
1	package WebPAC::Normalize;
2
3	use warnings;
4	use strict;
5	use base 'WebPAC::Common';
6	use Data::Dumper;
7
8	=head1 NAME
9
10	WebPAC::Normalize - data mungling for normalisation
11
12	=head1 VERSION
13
14	Version 0.08
15
16	=cut
17
18	our $VERSION = '0.08';
19
20	=head1 SYNOPSIS
21
22	This package contains code that mungle data to produce normalized format.
23
24	It contains several assumptions:
25
26	=over
27
28	=item *
29
30	format of fields is defined using C<v123^a> notation for repeatable fields
31	or C<s123^a> for single (or first) value, where C<123> is field number and
32	C<a> is subfield.
33
34	=item *
35
36	source data records (C<$rec>) have unique identifiers in field C<000>
37
38	=item *
39
40	optional C<eval{length('v123^a') == 3}> tag at B<beginning of format> will be
41	perl code that is evaluated before producing output (value of field will be
42	interpolated before that)
43
44	=item *
45
46	optional C<filter{filter_name}> at B<begining of format> will apply perl
47	code defined as code ref on format after field substitution to producing
48	output
49
50	There is one built-in filter called C<regex> which can be use like this:
51
52	filter{regex(s/foo/bar/)}
53
54	=item *
55
56	optional C<lookup{...}> will be then performed. See C<WebPAC::Lookups>.
57
58	=item *
59
60	at end, optional C<format>s rules are resolved. Format rules are similar to
61	C<sprintf> and can also contain C<lookup{...}> which is performed after
62	values are inserted in format.
63
64	=back
65
66	This also describes order in which transformations are applied (eval,
67	filter, lookup, format) which is important to undestand when deciding how to
68	solve your data mungling and normalisation process.
69
70
71
72
73	=head1 FUNCTIONS
74
75	=head2 new
76
77	Create new normalisation object
78
79	my $n = new WebPAC::Normalize::Something(
80	filter => {
81	'filter_name_1' => sub {
82	# filter code
83	return length($_);
84	}, ...
85	},
86	db => $db_obj,
87	lookup_regex => $lookup->regex,
88	lookup => $lookup_obj,
89	prefix => 'foobar',
90	);
91
92	Parametar C<filter> defines user supplied snippets of perl code which can
93	be use with C<filter{...}> notation.
94
95	C<prefix> is used to form filename for database record (to support multiple
96	source files which are joined in one database).
97
98	Recommended parametar C<lookup_regex> is used to enable parsing of lookups
99	in structures. If you pass this parametar, you must also pass C<lookup>
100	which is C<WebPAC::Lookup> object.
101
102	=cut
103
104	sub new {
105	my $class = shift;
106	my $self = {@_};
107	bless($self, $class);
108
109	my $r = $self->{'lookup_regex'} ? 1 : 0;
110	my $l = $self->{'lookup'} ? 1 : 0;
111
112	my $log = $self->_get_logger();
113
114	# those two must be in pair
115	if ( ($r & $l) != ($r \|\| $l) ) {
116	my $log = $self->_get_logger();
117	$log->logdie("lookup_regex and lookup must be in pair");
118	}
119
120	$log->logdie("lookup must be WebPAC::Lookup object") if ($self->{'lookup'} && ! $self->{'lookup'}->isa('WebPAC::Lookup'));
121
122	$log->warn("no prefix defined. please check that!") unless ($self->{'prefix'});
123
124	$log->debug("using lookup regex: ", $self->{lookup_regex}) if ($r && $l);
125
126	if (! $self->{filter} \|\| ! $self->{filter}->{regex}) {
127	$log->debug("adding built-in filter regex");
128	$self->{filter}->{regex} = sub {
129	my ($val, $regex) = @_;
130	eval "\$val =~ $regex";
131	return $val;
132	};
133	}
134
135	$self ? return $self : return undef;
136	}
137
138
139	=head2 data_structure
140
141	Create in-memory data structure which represents normalized layout from
142	C<conf/normalize/*.xml>.
143
144	This structures are used to produce output.
145
146	my $ds = $webpac->data_structure($rec);
147
148	=cut
149
150	sub data_structure {
151	my $self = shift;
152
153	my $log = $self->_get_logger();
154
155	my $rec = shift;
156	$log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
157
158	$log->debug("data_structure rec = ", sub { Dumper($rec) });
159
160	$log->logdie("need unique ID (mfn) in field 000 of record " . Dumper($rec) ) unless (defined($rec->{'000'}));
161
162	my $id = $rec->{'000'}->[0] \|\| $log->logdie("field 000 isn't array!");
163
164	my $cache_file;
165
166	if ($self->{'db'}) {
167	my $ds = $self->{'db'}->load_ds( id => $id, prefix => $self->{prefix} );
168	$log->debug("load_ds( rec = ", sub { Dumper($rec) }, ") = ", sub { Dumper($ds) });
169	return $ds if ($ds);
170	$log->debug("cache miss, creating");
171	}
172
173	my @sorted_tags;
174	if ($self->{tags_by_order}) {
175	@sorted_tags = @{$self->{tags_by_order}};
176	} else {
177	@sorted_tags = sort { $self->_sort_by_order } keys %{$self->{'import_xml'}->{'indexer'}};
178	$self->{tags_by_order} = \@sorted_tags;
179	}
180
181	my $ds;
182
183	$log->debug("tags: ",sub { join(", ",@sorted_tags) });
184
185	foreach my $field (@sorted_tags) {
186
187	my $row;
188
189	#print "field $field [",$self->{'tag'},"] = ",Dumper($self->{'import_xml'}->{'indexer'}->{$field}->{$self->{'tag'}});
190
191	foreach my $tag (@{$self->{'import_xml'}->{'indexer'}->{$field}->{$self->{'tag'}}}) {
192	my $format;
193
194	$log->logdie("expected tag HASH and got $tag") unless (ref($tag) eq 'HASH');
195	$format = $tag->{'value'} \|\| $tag->{'content'};
196
197	my @v;
198	if ($self->{'lookup_regex'} && $format =~ $self->{'lookup_regex'}) {
199	@v = $self->fill_in_to_arr($rec,$format);
200	} else {
201	@v = $self->parse_to_arr($rec,$format);
202	}
203	if (! @v) {
204	$log->debug("$field <",$self->{tag},"> format: $format no values");
205	# next;
206	} else {
207	$log->debug("$field <",$self->{tag},"> format: $format values: ", join(",", @v));
208	}
209
210	if ($tag->{'sort'}) {
211	@v = $self->sort_arr(@v);
212	}
213
214	# use format?
215	if ($tag->{'format_name'}) {
216	@v = map { $self->apply_format($tag->{'format_name'},$tag->{'format_delimiter'},$_) } @v;
217	}
218
219	# delimiter will join repeatable fields
220	if ($tag->{'delimiter'}) {
221	@v = ( join($tag->{'delimiter'}, @v) );
222	}
223
224	# default types
225	my @types = qw(display search);
226	# override by type attribute
227	@types = ( $tag->{'type'} ) if ($tag->{'type'});
228
229	foreach my $type (@types) {
230	# append to previous line?
231	$log->debug("tag $field / $type [",sub { join(",",@v) }, "] ", $row->{'append'} \|\| 'no append');
232	if ($tag->{'append'}) {
233
234	# I will delimit appended part with
235	# delimiter (or ,)
236	my $d = $tag->{'delimiter'};
237	# default delimiter
238	$d \|\|= " ";
239
240	my $last = pop @{$row->{$type}};
241	$d = "" if (! $last);
242	$last .= $d . join($d, @v);
243	push @{$row->{$type}}, $last;
244
245	} else {
246	push @{$row->{$type}}, @v;
247	}
248	}
249
250
251	}
252
253	if ($row) {
254	$row->{'tag'} = $field;
255
256	# TODO: name_sigular, name_plural
257	my $name = $self->{'import_xml'}->{'indexer'}->{$field}->{'name'};
258	my $row_name = $name ? $self->_x($name) : $field;
259
260	# post-sort all values in field
261	if ($self->{'import_xml'}->{'indexer'}->{$field}->{'sort'}) {
262	$log->warn("sort at field tag not implemented");
263	}
264
265	$ds->{$row_name} = $row;
266
267	$log->debug("row $field: ",sub { Dumper($row) });
268	}
269
270	}
271
272	$self->{'db'}->save_ds(
273	id => $id,
274	ds => $ds,
275	prefix => $self->{prefix},
276	) if ($self->{'db'});
277
278	$log->debug("ds: ", sub { Dumper($ds) });
279
280	$log->logconfess("data structure returned is not array any more!") if wantarray;
281
282	return $ds;
283
284	}
285
286	=head2 parse
287
288	Perform smart parsing of string, skipping delimiters for fields which aren't
289	defined. It can also eval code in format starting with C<eval{...}> and
290	return output or nothing depending on eval code.
291
292	my $text = $webpac->parse($rec,'eval{"v901^a" eq "Deskriptor"}descriptor: v250^a', $i);
293
294	Filters are implemented here. While simple form of filters looks like this:
295
296	filter{name_of_filter}
297
298	but, filters can also have variable number of parametars like this:
299
300	filter{name_of_filter(param,param,param)}
301
302	=cut
303
304	my $warn_once;
305
306	sub parse {
307	my $self = shift;
308
309	my ($rec, $format_utf8, $i) = @_;
310
311	return if (! $format_utf8);
312
313	my $log = $self->_get_logger();
314
315	$log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
316
317	$i = 0 if (! $i);
318
319	my $format = $self->_x($format_utf8) \|\| $log->logconfess("can't convert '$format_utf8' from UTF-8 to ",$self->{'code_page'});
320
321	my @out;
322
323	$log->debug("format: $format [$i]");
324
325	my $eval_code;
326	# remove eval{...} from beginning
327	$eval_code = $1 if ($format =~ s/^eval{([^}]+)}//s);
328
329	my $filter_name;
330	# remove filter{...} from beginning
331	$filter_name = $1 if ($format =~ s/^filter{([^}]+)}//s);
332
333	# did we found any (att all) field from format in row?
334	my $found_any;
335	# prefix before first field which we preserve it $found_any
336	my $prefix;
337
338	my $f_step = 1;
339
340	while ($format =~ s/^(.*?)(v\|s)(\d+)(?:\^(\w))?//s) {
341
342	my $del = $1 \|\| '';
343	$prefix = $del if ($f_step == 1);
344
345	my $fld_type = lc($2);
346
347	# repeatable index
348	my $r = $i;
349	if ($fld_type eq 's') {
350	if ($found_any->{'v'}) {
351	$r = 0;
352	} else {
353	return;
354	}
355	}
356
357	my $found = 0;
358	my $tmp = $self->get_data(\$rec,$3,$4,$r,\$found);
359
360	if ($found) {
361	$found_any->{$fld_type} += $found;
362
363	# we will skip delimiter before first occurence of field!
364	push @out, $del unless($found_any->{$fld_type} == 1);
365	push @out, $tmp;
366	}
367	$f_step++;
368	}
369
370	# test if any fields found?
371	return if (! $found_any->{'v'} && ! $found_any->{'s'});
372
373	my $out = join('',@out);
374
375	if ($out) {
376	# add rest of format (suffix)
377	$out .= $format;
378
379	# add prefix if not there
380	$out = $prefix . $out if ($out !~ m/^\Q$prefix\E/);
381
382	$log->debug("result: $out");
383	}
384
385	if ($eval_code) {
386	my $eval = $self->fill_in($rec,$eval_code,$i) \|\| return;
387	$log->debug("about to eval{$eval} format: $out");
388	return if (! $self->_eval($eval));
389	}
390
391	if ($filter_name) {
392	my @filter_args;
393	if ($filter_name =~ s/(\w+)\((.*)\)/$1/) {
394	@filter_args = split(/,/, $2);
395	}
396	if ($self->{'filter'}->{$filter_name}) {
397	$log->debug("about to filter{$filter_name} format: $out with arguments: ", join(",", @filter_args));
398	unshift @filter_args, $out;
399	$out = $self->{'filter'}->{$filter_name}->(@filter_args);
400	return unless(defined($out));
401	$log->debug("filter result: $out");
402	} elsif (! $warn_once->{$filter_name}) {
403	$log->warn("trying to use undefined filter $filter_name");
404	$warn_once->{$filter_name}++;
405	}
406	}
407
408	return $out;
409	}
410
411	=head2 parse_to_arr
412
413	Similar to C<parse>, but returns array of all repeatable fields
414
415	my @arr = $webpac->parse_to_arr($rec,'v250^a');
416
417	=cut
418
419	sub parse_to_arr {
420	my $self = shift;
421
422	my ($rec, $format_utf8) = @_;
423
424	my $log = $self->_get_logger();
425
426	$log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
427	return if (! $format_utf8);
428
429	my $i = 0;
430	my @arr;
431
432	while (my $v = $self->parse($rec,$format_utf8,$i++)) {
433	push @arr, $v;
434	}
435
436	$log->debug("format '$format_utf8' returned ",--$i," elements: ", sub { join(" \| ",@arr) }) if (@arr);
437
438	return @arr;
439	}
440
441
442	=head2 fill_in
443
444	Workhourse of all: takes record from in-memory structure of database and
445	strings with placeholders and returns string or array of with substituted
446	values from record.
447
448	my $text = $webpac->fill_in($rec,'v250^a');
449
450	Optional argument is ordinal number for repeatable fields. By default,
451	it's assume to be first repeatable field (fields are perl array, so first
452	element is 0).
453	Following example will read second value from repeatable field.
454
455	my $text = $webpac->fill_in($rec,'Title: v250^a',1);
456
457	This function B<does not> perform parsing of format to inteligenty skip
458	delimiters before fields which aren't used.
459
460	This method will automatically decode UTF-8 string to local code page
461	if needed.
462
463	=cut
464
465	sub fill_in {
466	my $self = shift;
467
468	my $log = $self->_get_logger();
469
470	my $rec = shift \|\| $log->logconfess("need data record");
471	my $format = shift \|\| $log->logconfess("need format to parse");
472	# iteration (for repeatable fields)
473	my $i = shift \|\| 0;
474
475	$log->logdie("infitite loop in format $format") if ($i > ($self->{'max_mfn'} \|\| 9999));
476
477	# FIXME remove for speedup?
478	$log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
479
480	if (utf8::is_utf8($format)) {
481	$format = $self->_x($format);
482	}
483
484	my $found = 0;
485	my $just_single = 1;
486
487	my $eval_code;
488	# remove eval{...} from beginning
489	$eval_code = $1 if ($format =~ s/^eval{([^}]+)}//s);
490
491	my $filter_name;
492	# remove filter{...} from beginning
493	$filter_name = $1 if ($format =~ s/^filter{([^}]+)}//s);
494
495	# do actual replacement of placeholders
496	# repeatable fields
497	if ($format =~ s/v(\d+)(?:\^(\w))?/$self->get_data(\$rec,$1,$2,$i,\$found)/ges) {
498	$just_single = 0;
499	}
500
501	# non-repeatable fields
502	if ($format =~ s/s(\d+)(?:\^(\w))?/$self->get_data(\$rec,$1,$2,0,\$found)/ges) {
503	return if ($i > 0 && $just_single);
504	}
505
506	if ($found) {
507	$log->debug("format: $format");
508	if ($eval_code) {
509	my $eval = $self->fill_in($rec,$eval_code,$i);
510	return if (! $self->_eval($eval));
511	}
512	if ($filter_name && $self->{'filter'}->{$filter_name}) {
513	$log->debug("filter '$filter_name' for $format");
514	$format = $self->{'filter'}->{$filter_name}->($format);
515	return unless(defined($format));
516	$log->debug("filter result: $format");
517	}
518	# do we have lookups?
519	if ($self->{'lookup'}) {
520	if ($self->{'lookup'}->can('lookup')) {
521	my @lookup = $self->{lookup}->lookup($format);
522	$log->debug("lookup $format", join(", ", @lookup));
523	return @lookup;
524	} else {
525	$log->warn("Have lookup object but can't invoke lookup method");
526	}
527	} else {
528	return $format;
529	}
530	} else {
531	return;
532	}
533	}
534
535
536	=head2 fill_in_to_arr
537
538	Similar to C<fill_in>, but returns array of all repeatable fields. Usable
539	for fields which have lookups, so they shouldn't be parsed but rather
540	C<fill_id>ed.
541
542	my @arr = $webpac->fill_in_to_arr($rec,'[v900];;[v250^a]');
543
544	=cut
545
546	sub fill_in_to_arr {
547	my $self = shift;
548
549	my ($rec, $format_utf8) = @_;
550
551	my $log = $self->_get_logger();
552
553	$log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
554	return if (! $format_utf8);
555
556	my $i = 0;
557	my @arr;
558
559	while (my $v = $self->fill_in($rec,$format_utf8,$i++)) {
560	push @arr, $v;
561	}
562
563	$log->debug("format '$format_utf8' returned ",--$i," elements: ", sub { join(" \| ",@arr) }) if (@arr);
564
565	return @arr;
566	}
567
568
569	=head2 get_data
570
571	Returns value from record.
572
573	my $text = $self->get_data(\$rec,$f,$sf,$i,\$found);
574
575	Arguments are:
576	record reference C<$rec>,
577	field C<$f>,
578	optional subfiled C<$sf>,
579	index for repeatable values C<$i>.
580
581	Optinal variable C<$found> will be incremeted if there
582	is field.
583
584	Returns value or empty string.
585
586	=cut
587
588	sub get_data {
589	my $self = shift;
590
591	my ($rec,$f,$sf,$i,$found) = @_;
592
593	if ($$rec->{$f}) {
594	return '' if (! $$rec->{$f}->[$i]);
595	no strict 'refs';
596	if ($sf && $$rec->{$f}->[$i]->{$sf}) {
597	$$found++ if (defined($$found));
598	return $$rec->{$f}->[$i]->{$sf};
599	} elsif (! $sf && $$rec->{$f}->[$i]) {
600	$$found++ if (defined($$found));
601	# it still might have subfield, just
602	# not specified, so we'll dump all
603	if ($$rec->{$f}->[$i] =~ /HASH/o) {
604	my $out;
605	foreach my $k (keys %{$$rec->{$f}->[$i]}) {
606	my $v = $$rec->{$f}->[$i]->{$k};
607	$out .= "$v " if ($v);
608	}
609	return $out;
610	} else {
611	return $$rec->{$f}->[$i];
612	}
613	} else {
614	return '';
615	}
616	} else {
617	return '';
618	}
619	}
620
621
622	=head2 apply_format
623
624	Apply format specified in tag with C<format_name="name"> and
625	C<format_delimiter=";;">.
626
627	my $text = $webpac->apply_format($format_name,$format_delimiter,$data);
628
629	Formats can contain C<lookup{...}> if you need them.
630
631	=cut
632
633	sub apply_format {
634	my $self = shift;
635
636	my ($name,$delimiter,$data) = @_;
637
638	my $log = $self->_get_logger();
639
640	if (! $self->{'import_xml'}->{'format'}->{$name}) {
641	$log->warn("<format name=\"$name\"> is not defined in ",$self->{'import_xml_file'});
642	return $data;
643	}
644
645	$log->warn("no delimiter for format $name") if (! $delimiter);
646
647	my $format = $self->_x($self->{'import_xml'}->{'format'}->{$name}->{'content'}) \|\| $log->logdie("can't find format '$name'");
648
649	my @data = split(/\Q$delimiter\E/, $data);
650
651	my $out = sprintf($format, @data);
652	$log->debug("using format $name [$format] on $data to produce: $out");
653
654	if ($self->{'lookup_regex'} && $out =~ $self->{'lookup_regex'}) {
655	return $self->{'lookup'}->lookup($out);
656	} else {
657	return $out;
658	}
659
660	}
661
662	=head2 sort_arr
663
664	Sort array ignoring case and html in data
665
666	my @sorted = $webpac->sort_arr(@unsorted);
667
668	=cut
669
670	sub sort_arr {
671	my $self = shift;
672
673	my $log = $self->_get_logger();
674
675	# FIXME add Schwartzian Transformation?
676
677	my @sorted = sort {
678	$a =~ s#<[^>]+/*>##;
679	$b =~ s#<[^>]+/*>##;
680	lc($b) cmp lc($a)
681	} @_;
682	$log->debug("sorted values: ",sub { join(", ",@sorted) });
683
684	return @sorted;
685	}
686
687
688	=head1 INTERNAL METHODS
689
690	=head2 _sort_by_order
691
692	Sort xml tags data structure accoding to C<order=""> attribute.
693
694	=cut
695
696	sub _sort_by_order {
697	my $self = shift;
698
699	my $va = $self->{'import_xml'}->{'indexer'}->{$a}->{'order'} \|\|
700	$self->{'import_xml'}->{'indexer'}->{$a};
701	my $vb = $self->{'import_xml'}->{'indexer'}->{$b}->{'order'} \|\|
702	$self->{'import_xml'}->{'indexer'}->{$b};
703
704	return $va <=> $vb;
705	}
706
707	=head2 _x
708
709	Convert strings from C<conf/normalize/*.xml> encoding into application
710	specific encoding (optinally specified using C<code_page> to C<new>
711	constructor).
712
713	my $text = $n->_x('normalize text string');
714
715	This is a stub so that other modules doesn't have to implement it.
716
717	=cut
718
719	sub _x {
720	my $self = shift;
721	return shift;
722	}
723
724
725	=head1 AUTHOR
726
727	Dobrica Pavlinusic, C<< <dpavlin@rot13.org> >>
728
729	=head1 COPYRIGHT & LICENSE
730
731	Copyright 2005 Dobrica Pavlinusic, All Rights Reserved.
732
733	This program is free software; you can redistribute it and/or modify it
734	under the same terms as Perl itself.
735
736	=cut
737
738	1; # End of WebPAC::Normalize