lib/WebPAC/Normalize.pm

package WebPAC::Normalize;

use warnings;
use strict;
use base 'WebPAC::Common';
use Data::Dumper;

=head1 NAME

WebPAC::Normalize - data mungling for normalisation

=head1 VERSION

Version 0.08

=cut

our $VERSION = '0.08';

=head1 SYNOPSIS

This package contains code that mungle data to produce normalized format.

It contains several assumptions:

=over

=item *

format of fields is defined using C<v123^a> notation for repeatable fields
or C<s123^a> for single (or first) value, where C<123> is field number and
C<a> is subfield.

=item *

source data records (C<$rec>) have unique identifiers in field C<000>

=item *

optional C<eval{length('v123^a') == 3}> tag at B<beginning of format> will be
perl code that is evaluated before producing output (value of field will be
interpolated before that)

=item *

optional C<filter{filter_name}> at B<begining of format> will apply perl
code defined as code ref on format after field substitution to producing
output

There is one built-in filter called C<regex> which can be use like this:

  filter{regex(s/foo/bar/)}

=item *

optional C<lookup{...}> will be then performed. See C<WebPAC::Lookups>.

=item *

at end, optional C<format>s rules are resolved. Format rules are similar to
C<sprintf> and can also contain C<lookup{...}> which is performed after
values are inserted in format.

=back

This also describes order in which transformations are applied (eval,
filter, lookup, format) which is important to undestand when deciding how to
solve your data mungling and normalisation process.


=head1 FUNCTIONS

=head2 new

Create new normalisation object

  my $n = new WebPAC::Normalize::Something(
        filter => {
                'filter_name_1' => sub {
                        # filter code
                        return length($_);
                }, ...
        },
        db => $db_obj,
        lookup_regex => $lookup->regex,
        lookup => $lookup_obj,
        prefix => 'foobar',
  );

Parametar C<filter> defines user supplied snippets of perl code which can
be use with C<filter{...}> notation.

C<prefix> is used to form filename for database record (to support multiple
source files which are joined in one database).

Recommended parametar C<lookup_regex> is used to enable parsing of lookups
in structures. If you pass this parametar, you must also pass C<lookup>
which is C<WebPAC::Lookup> object.

=cut

sub new {
        my $class = shift;
        my $self = {@_};
        bless($self, $class);

        my $r = $self->{'lookup_regex'} ? 1 : 0;
        my $l = $self->{'lookup'} ? 1 : 0;

        my $log = $self->_get_logger();

        # those two must be in pair
        if ( ($r & $l) != ($r || $l) ) {
                my $log = $self->_get_logger();
                $log->logdie("lookup_regex and lookup must be in pair");
        }

        $log->logdie("lookup must be WebPAC::Lookup object") if ($self->{'lookup'} && ! $self->{'lookup'}->isa('WebPAC::Lookup'));

        $log->warn("no prefix defined. please check that!") unless ($self->{'prefix'});

        $log->debug("using lookup regex: ", $self->{lookup_regex}) if ($r && $l);

        if (! $self->{filter} || ! $self->{filter}->{regex}) {
                $log->debug("adding built-in filter regex");
                $self->{filter}->{regex} = sub {
                        my ($val, $regex) = @_;
                        eval "\$val =~ $regex";
                        return $val;
                };
        }

        $self ? return $self : return undef;
}


=head2 data_structure

Create in-memory data structure which represents normalized layout from
C<conf/normalize/*.xml>.

This structures are used to produce output.

 my $ds = $webpac->data_structure($rec);

=cut

sub data_structure {
        my $self = shift;

        my $log = $self->_get_logger();

        my $rec = shift;
        $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);

        $log->debug("data_structure rec = ", sub { Dumper($rec) });

        $log->logdie("need unique ID (mfn) in field 000 of record " . Dumper($rec) ) unless (defined($rec->{'000'}));

        my $id = $rec->{'000'}->[0] || $log->logdie("field 000 isn't array!");

        my $cache_file;

        if ($self->{'db'}) {
                my $ds = $self->{'db'}->load_ds( id => $id, prefix => $self->{prefix} );
                $log->debug("load_ds( rec = ", sub { Dumper($rec) }, ") = ", sub { Dumper($ds) });
                return $ds if ($ds);
                $log->debug("cache miss, creating");
        }

        my @sorted_tags;
        if ($self->{tags_by_order}) {
                @sorted_tags = @{$self->{tags_by_order}};
        } else {
                @sorted_tags = sort { $self->_sort_by_order } keys %{$self->{'import_xml'}->{'indexer'}};
                $self->{tags_by_order} = \@sorted_tags;
        }

        my $ds;

        $log->debug("tags: ",sub { join(", ",@sorted_tags) });

        foreach my $field (@sorted_tags) {

                my $row;

#print "field $field [",$self->{'tag'},"] = ",Dumper($self->{'import_xml'}->{'indexer'}->{$field}->{$self->{'tag'}});

                foreach my $tag (@{$self->{'import_xml'}->{'indexer'}->{$field}->{$self->{'tag'}}}) {
                        my $format;

                        $log->logdie("expected tag HASH and got $tag") unless (ref($tag) eq 'HASH');
                        $format = $tag->{'value'} || $tag->{'content'};

                        $log->debug("format: $format");

                        my @v;
                        if ($self->{'lookup_regex'} && $format =~ $self->{'lookup_regex'}) {
                                @v = $self->fill_in_to_arr($rec,$format);
                        } else {
                                @v = $self->parse_to_arr($rec,$format);
                        }
                        next if (! @v);

                        if ($tag->{'sort'}) {
                                @v = $self->sort_arr(@v);
                        }

                        # use format?
                        if ($tag->{'format_name'}) {
                                @v = map { $self->apply_format($tag->{'format_name'},$tag->{'format_delimiter'},$_) } @v;
                        }

                        # delimiter will join repeatable fields
                        if ($tag->{'delimiter'}) {
                                @v = ( join($tag->{'delimiter'}, @v) );
                        }

                        # default types 
                        my @types = qw(display search);
                        # override by type attribute
                        @types = ( $tag->{'type'} ) if ($tag->{'type'});

                        foreach my $type (@types) {
                                # append to previous line?
                                $log->debug("type: $type ",sub { join(" ",@v) }, " ", $row->{'append'} || 'no append');
                                if ($tag->{'append'}) {

                                        # I will delimit appended part with
                                        # delimiter (or ,)
                                        my $d = $tag->{'delimiter'};
                                        # default delimiter
                                        $d ||= " ";

                                        my $last = pop @{$row->{$type}};
                                        $d = "" if (! $last);
                                        $last .= $d . join($d, @v);
                                        push @{$row->{$type}}, $last;

                                } else {
                                        push @{$row->{$type}}, @v;
                                }
                        }


                }

                if ($row) {
                        $row->{'tag'} = $field;

                        # TODO: name_sigular, name_plural
                        my $name = $self->{'import_xml'}->{'indexer'}->{$field}->{'name'};
                        my $row_name = $name ? $self->_x($name) : $field;

                        # post-sort all values in field
                        if ($self->{'import_xml'}->{'indexer'}->{$field}->{'sort'}) {
                                $log->warn("sort at field tag not implemented");
                        }

                        $ds->{$row_name} = $row;

                        $log->debug("row $field: ",sub { Dumper($row) });
                }

        }

        $self->{'db'}->save_ds(
                id => $id,
                ds => $ds,
                prefix => $self->{prefix},
        ) if ($self->{'db'});

        $log->debug("ds: ", sub { Dumper($ds) });

        $log->logconfess("data structure returned is not array any more!") if wantarray;

        return $ds;

}

=head2 parse

Perform smart parsing of string, skipping delimiters for fields which aren't
defined. It can also eval code in format starting with C<eval{...}> and
return output or nothing depending on eval code.

 my $text = $webpac->parse($rec,'eval{"v901^a" eq "Deskriptor"}descriptor: v250^a', $i);

Filters are implemented here. While simple form of filters looks like this:

  filter{name_of_filter}

but, filters can also have variable number of parametars like this:

  filter{name_of_filter(param,param,param)}

=cut

my $warn_once;

sub parse {
        my $self = shift;

        my ($rec, $format_utf8, $i) = @_;

        return if (! $format_utf8);

        my $log = $self->_get_logger();

        $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);

        $i = 0 if (! $i);

        my $format = $self->_x($format_utf8) || $log->logconfess("can't convert '$format_utf8' from UTF-8 to ",$self->{'code_page'});

        my @out;

        $log->debug("format: $format [$i]");

        my $eval_code;
        # remove eval{...} from beginning
        $eval_code = $1 if ($format =~ s/^eval{([^}]+)}//s);

        my $filter_name;
        # remove filter{...} from beginning
        $filter_name = $1 if ($format =~ s/^filter{([^}]+)}//s);

        # did we found any (att all) field from format in row?
        my $found_any;
        # prefix before first field which we preserve it $found_any
        my $prefix;

        my $f_step = 1;

        while ($format =~ s/^(.*?)(v|s)(\d+)(?:\^(\w))?//s) {

                my $del = $1 || '';
                $prefix = $del if ($f_step == 1);

                my $fld_type = lc($2);

                # repeatable index
                my $r = $i;
                if ($fld_type eq 's') {
                        if ($found_any->{'v'}) {
                                $r = 0;
                        } else {
                                return;
                        }
                }

                my $found = 0;
                my $tmp = $self->get_data(\$rec,$3,$4,$r,\$found);

                if ($found) {
                        $found_any->{$fld_type} += $found;

                        # we will skip delimiter before first occurence of field!
                        push @out, $del unless($found_any->{$fld_type} == 1);
                        push @out, $tmp;
                }
                $f_step++;
        }

        # test if any fields found?
        return if (! $found_any->{'v'} && ! $found_any->{'s'});

        my $out = join('',@out);

        if ($out) {
                # add rest of format (suffix)
                $out .= $format;

                # add prefix if not there
                $out = $prefix . $out if ($out !~ m/^\Q$prefix\E/);

                $log->debug("result: $out");
        }

        if ($eval_code) {
                my $eval = $self->fill_in($rec,$eval_code,$i) || return;
                $log->debug("about to eval{$eval} format: $out");
                return if (! $self->_eval($eval));
        }
        
        if ($filter_name) {
                my @filter_args;
                if ($filter_name =~ s/(\w+)\((.*)\)/$1/) {
                        @filter_args = split(/,/, $2);
                }
                if ($self->{'filter'}->{$filter_name}) {
                        $log->debug("about to filter{$filter_name} format: $out with arguments: ", join(",", @filter_args));
                        unshift @filter_args, $out;
                        $out = $self->{'filter'}->{$filter_name}->(@filter_args);
                        return unless(defined($out));
                        $log->debug("filter result: $out");
                } elsif (! $warn_once->{$filter_name}) {
                        $log->warn("trying to use undefined filter $filter_name");
                        $warn_once->{$filter_name}++;
                }
        }

        return $out;
}

=head2 parse_to_arr

Similar to C<parse>, but returns array of all repeatable fields

 my @arr = $webpac->parse_to_arr($rec,'v250^a');

=cut

sub parse_to_arr {
        my $self = shift;

        my ($rec, $format_utf8) = @_;

        my $log = $self->_get_logger();

        $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
        return if (! $format_utf8);

        my $i = 0;
        my @arr;

        while (my $v = $self->parse($rec,$format_utf8,$i++)) {
                push @arr, $v;
        }

        $log->debug("format '$format_utf8' returned ",--$i," elements: ", sub { join(" | ",@arr) }) if (@arr);

        return @arr;
}


=head2 fill_in

Workhourse of all: takes record from in-memory structure of database and
strings with placeholders and returns string or array of with substituted
values from record.

 my $text = $webpac->fill_in($rec,'v250^a');

Optional argument is ordinal number for repeatable fields. By default,
it's assume to be first repeatable field (fields are perl array, so first
element is 0).
Following example will read second value from repeatable field.

 my $text = $webpac->fill_in($rec,'Title: v250^a',1);

This function B<does not> perform parsing of format to inteligenty skip
delimiters before fields which aren't used.

This method will automatically decode UTF-8 string to local code page
if needed.

=cut

sub fill_in {
        my $self = shift;

        my $log = $self->_get_logger();

        my $rec = shift || $log->logconfess("need data record");
        my $format = shift || $log->logconfess("need format to parse");
        # iteration (for repeatable fields)
        my $i = shift || 0;

        $log->logdie("infitite loop in format $format") if ($i > ($self->{'max_mfn'} || 9999));

        # FIXME remove for speedup?
        $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);

        if (utf8::is_utf8($format)) {
                $format = $self->_x($format);
        }

        my $found = 0;
        my $just_single = 1;

        my $eval_code;
        # remove eval{...} from beginning
        $eval_code = $1 if ($format =~ s/^eval{([^}]+)}//s);

        my $filter_name;
        # remove filter{...} from beginning
        $filter_name = $1 if ($format =~ s/^filter{([^}]+)}//s);

        # do actual replacement of placeholders
        # repeatable fields
        if ($format =~ s/v(\d+)(?:\^(\w))?/$self->get_data(\$rec,$1,$2,$i,\$found)/ges) {
                $just_single = 0;
        }
        
        # non-repeatable fields
        if ($format =~ s/s(\d+)(?:\^(\w))?/$self->get_data(\$rec,$1,$2,0,\$found)/ges) {
                return if ($i > 0 && $just_single);
        }

        if ($found) {
                $log->debug("format: $format");
                if ($eval_code) {
                        my $eval = $self->fill_in($rec,$eval_code,$i);
                        return if (! $self->_eval($eval));
                }
                if ($filter_name && $self->{'filter'}->{$filter_name}) {
                        $log->debug("filter '$filter_name' for $format");
                        $format = $self->{'filter'}->{$filter_name}->($format);
                        return unless(defined($format));
                        $log->debug("filter result: $format");
                }
                # do we have lookups?
                if ($self->{'lookup'}) {
                        if ($self->{'lookup'}->can('lookup')) {
                                my @lookup = $self->{lookup}->lookup($format);
                                $log->debug("lookup $format", join(", ", @lookup));
                                return @lookup;
                        } else {
                                $log->warn("Have lookup object but can't invoke lookup method");
                        }
                } else {
                        return $format;
                }
        } else {
                return;
        }
}


=head2 fill_in_to_arr

Similar to C<fill_in>, but returns array of all repeatable fields. Usable
for fields which have lookups, so they shouldn't be parsed but rather
C<fill_id>ed.

 my @arr = $webpac->fill_in_to_arr($rec,'[v900];;[v250^a]');

=cut

sub fill_in_to_arr {
        my $self = shift;

        my ($rec, $format_utf8) = @_;

        my $log = $self->_get_logger();

        $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
        return if (! $format_utf8);

        my $i = 0;
        my @arr;

        while (my @v = $self->fill_in($rec,$format_utf8,$i++)) {
                push @arr, @v;
        }

        $log->debug("format '$format_utf8' returned ",--$i," elements: ", sub { join(" | ",@arr) }) if (@arr);

        return @arr;
}


=head2 get_data

Returns value from record.

 my $text = $self->get_data(\$rec,$f,$sf,$i,\$found);

Arguments are:
record reference C<$rec>,
field C<$f>,
optional subfiled C<$sf>,
index for repeatable values C<$i>.

Optinal variable C<$found> will be incremeted if there
is field.

Returns value or empty string.

=cut

sub get_data {
        my $self = shift;

        my ($rec,$f,$sf,$i,$found) = @_;

        if ($$rec->{$f}) {
                return '' if (! $$rec->{$f}->[$i]);
                no strict 'refs';
                if ($sf && $$rec->{$f}->[$i]->{$sf}) {
                        $$found++ if (defined($$found));
                        return $$rec->{$f}->[$i]->{$sf};
                } elsif (! $sf && $$rec->{$f}->[$i]) {
                        $$found++ if (defined($$found));
                        # it still might have subfield, just
                        # not specified, so we'll dump all
                        if ($$rec->{$f}->[$i] =~ /HASH/o) {
                                my $out;
                                foreach my $k (keys %{$$rec->{$f}->[$i]}) {
                                        $out .= $$rec->{$f}->[$i]->{$k}." ";
                                }
                                return $out;
                        } else {
                                return $$rec->{$f}->[$i];
                        }
                } else {
                        return '';
                }
        } else {
                return '';
        }
}


=head2 apply_format

Apply format specified in tag with C<format_name="name"> and
C<format_delimiter=";;">.

 my $text = $webpac->apply_format($format_name,$format_delimiter,$data);

Formats can contain C<lookup{...}> if you need them.

=cut

sub apply_format {
        my $self = shift;

        my ($name,$delimiter,$data) = @_;

        my $log = $self->_get_logger();

        if (! $self->{'import_xml'}->{'format'}->{$name}) {
                $log->warn("<format name=\"$name\"> is not defined in ",$self->{'import_xml_file'});
                return $data;
        }

        $log->warn("no delimiter for format $name") if (! $delimiter);

        my $format = $self->_x($self->{'import_xml'}->{'format'}->{$name}->{'content'}) || $log->logdie("can't find format '$name'");

        my @data = split(/\Q$delimiter\E/, $data);

        my $out = sprintf($format, @data);
        $log->debug("using format $name [$format] on $data to produce: $out");

        if ($self->{'lookup_regex'} && $out =~ $self->{'lookup_regex'}) {
                return $self->{'lookup'}->lookup($out);
        } else {
                return $out;
        }

}

=head2 sort_arr

Sort array ignoring case and html in data

 my @sorted = $webpac->sort_arr(@unsorted);

=cut

sub sort_arr {
        my $self = shift;

        my $log = $self->_get_logger();

        # FIXME add Schwartzian Transformation?

        my @sorted = sort {
                $a =~ s#<[^>]+/*>##;
                $b =~ s#<[^>]+/*>##;
                lc($b) cmp lc($a)
        } @_;
        $log->debug("sorted values: ",sub { join(", ",@sorted) });

        return @sorted;
}


=head1 INTERNAL METHODS

=head2 _sort_by_order

Sort xml tags data structure accoding to C<order=""> attribute.

=cut

sub _sort_by_order {
        my $self = shift;

        my $va = $self->{'import_xml'}->{'indexer'}->{$a}->{'order'} ||
                $self->{'import_xml'}->{'indexer'}->{$a};
        my $vb = $self->{'import_xml'}->{'indexer'}->{$b}->{'order'} ||
                $self->{'import_xml'}->{'indexer'}->{$b};

        return $va <=> $vb;
}

=head2 _x

Convert strings from C<conf/normalize/*.xml> encoding into application
specific encoding (optinally specified using C<code_page> to C<new>
constructor).

 my $text = $n->_x('normalize text string');

This is a stub so that other modules doesn't have to implement it.

=cut

sub _x {
        my $self = shift;
        return shift;
}


=head1 AUTHOR

Dobrica Pavlinusic, C<< <dpavlin@rot13.org> >>

=head1 COPYRIGHT & LICENSE

Copyright 2005 Dobrica Pavlinusic, All Rights Reserved.

This program is free software; you can redistribute it and/or modify it
under the same terms as Perl itself.

=cut

1; # End of WebPAC::Normalize
1	package WebPAC::Normalize;
2
3	use warnings;
4	use strict;
5	use base 'WebPAC::Common';
6	use Data::Dumper;
7
8	=head1 NAME
9
10	WebPAC::Normalize - data mungling for normalisation
11
12	=head1 VERSION
13
14	Version 0.08
15
16	=cut
17
18	our $VERSION = '0.08';
19
20	=head1 SYNOPSIS
21
22	This package contains code that mungle data to produce normalized format.
23
24	It contains several assumptions:
25
26	=over
27
28	=item *
29
30	format of fields is defined using C<v123^a> notation for repeatable fields
31	or C<s123^a> for single (or first) value, where C<123> is field number and
32	C<a> is subfield.
33
34	=item *
35
36	source data records (C<$rec>) have unique identifiers in field C<000>
37
38	=item *
39
40	optional C<eval{length('v123^a') == 3}> tag at B<beginning of format> will be
41	perl code that is evaluated before producing output (value of field will be
42	interpolated before that)
43
44	=item *
45
46	optional C<filter{filter_name}> at B<begining of format> will apply perl
47	code defined as code ref on format after field substitution to producing
48	output
49
50	There is one built-in filter called C<regex> which can be use like this:
51
52	filter{regex(s/foo/bar/)}
53
54	=item *
55
56	optional C<lookup{...}> will be then performed. See C<WebPAC::Lookups>.
57
58	=item *
59
60	at end, optional C<format>s rules are resolved. Format rules are similar to
61	C<sprintf> and can also contain C<lookup{...}> which is performed after
62	values are inserted in format.
63
64	=back
65
66	This also describes order in which transformations are applied (eval,
67	filter, lookup, format) which is important to undestand when deciding how to
68	solve your data mungling and normalisation process.
69
70
71
72
73	=head1 FUNCTIONS
74
75	=head2 new
76
77	Create new normalisation object
78
79	my $n = new WebPAC::Normalize::Something(
80	filter => {
81	'filter_name_1' => sub {
82	# filter code
83	return length($_);
84	}, ...
85	},
86	db => $db_obj,
87	lookup_regex => $lookup->regex,
88	lookup => $lookup_obj,
89	prefix => 'foobar',
90	);
91
92	Parametar C<filter> defines user supplied snippets of perl code which can
93	be use with C<filter{...}> notation.
94
95	C<prefix> is used to form filename for database record (to support multiple
96	source files which are joined in one database).
97
98	Recommended parametar C<lookup_regex> is used to enable parsing of lookups
99	in structures. If you pass this parametar, you must also pass C<lookup>
100	which is C<WebPAC::Lookup> object.
101
102	=cut
103
104	sub new {
105	my $class = shift;
106	my $self = {@_};
107	bless($self, $class);
108
109	my $r = $self->{'lookup_regex'} ? 1 : 0;
110	my $l = $self->{'lookup'} ? 1 : 0;
111
112	my $log = $self->_get_logger();
113
114	# those two must be in pair
115	if ( ($r & $l) != ($r \|\| $l) ) {
116	my $log = $self->_get_logger();
117	$log->logdie("lookup_regex and lookup must be in pair");
118	}
119
120	$log->logdie("lookup must be WebPAC::Lookup object") if ($self->{'lookup'} && ! $self->{'lookup'}->isa('WebPAC::Lookup'));
121
122	$log->warn("no prefix defined. please check that!") unless ($self->{'prefix'});
123
124	$log->debug("using lookup regex: ", $self->{lookup_regex}) if ($r && $l);
125
126	if (! $self->{filter} \|\| ! $self->{filter}->{regex}) {
127	$log->debug("adding built-in filter regex");
128	$self->{filter}->{regex} = sub {
129	my ($val, $regex) = @_;
130	eval "\$val =~ $regex";
131	return $val;
132	};
133	}
134
135	$self ? return $self : return undef;
136	}
137
138
139	=head2 data_structure
140
141	Create in-memory data structure which represents normalized layout from
142	C<conf/normalize/*.xml>.
143
144	This structures are used to produce output.
145
146	my $ds = $webpac->data_structure($rec);
147
148	=cut
149
150	sub data_structure {
151	my $self = shift;
152
153	my $log = $self->_get_logger();
154
155	my $rec = shift;
156	$log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
157
158	$log->debug("data_structure rec = ", sub { Dumper($rec) });
159
160	$log->logdie("need unique ID (mfn) in field 000 of record " . Dumper($rec) ) unless (defined($rec->{'000'}));
161
162	my $id = $rec->{'000'}->[0] \|\| $log->logdie("field 000 isn't array!");
163
164	my $cache_file;
165
166	if ($self->{'db'}) {
167	my $ds = $self->{'db'}->load_ds( id => $id, prefix => $self->{prefix} );
168	$log->debug("load_ds( rec = ", sub { Dumper($rec) }, ") = ", sub { Dumper($ds) });
169	return $ds if ($ds);
170	$log->debug("cache miss, creating");
171	}
172
173	my @sorted_tags;
174	if ($self->{tags_by_order}) {
175	@sorted_tags = @{$self->{tags_by_order}};
176	} else {
177	@sorted_tags = sort { $self->_sort_by_order } keys %{$self->{'import_xml'}->{'indexer'}};
178	$self->{tags_by_order} = \@sorted_tags;
179	}
180
181	my $ds;
182
183	$log->debug("tags: ",sub { join(", ",@sorted_tags) });
184
185	foreach my $field (@sorted_tags) {
186
187	my $row;
188
189	#print "field $field [",$self->{'tag'},"] = ",Dumper($self->{'import_xml'}->{'indexer'}->{$field}->{$self->{'tag'}});
190
191	foreach my $tag (@{$self->{'import_xml'}->{'indexer'}->{$field}->{$self->{'tag'}}}) {
192	my $format;
193
194	$log->logdie("expected tag HASH and got $tag") unless (ref($tag) eq 'HASH');
195	$format = $tag->{'value'} \|\| $tag->{'content'};
196
197	$log->debug("format: $format");
198
199	my @v;
200	if ($self->{'lookup_regex'} && $format =~ $self->{'lookup_regex'}) {
201	@v = $self->fill_in_to_arr($rec,$format);
202	} else {
203	@v = $self->parse_to_arr($rec,$format);
204	}
205	next if (! @v);
206
207	if ($tag->{'sort'}) {
208	@v = $self->sort_arr(@v);
209	}
210
211	# use format?
212	if ($tag->{'format_name'}) {
213	@v = map { $self->apply_format($tag->{'format_name'},$tag->{'format_delimiter'},$_) } @v;
214	}
215
216	# delimiter will join repeatable fields
217	if ($tag->{'delimiter'}) {
218	@v = ( join($tag->{'delimiter'}, @v) );
219	}
220
221	# default types
222	my @types = qw(display search);
223	# override by type attribute
224	@types = ( $tag->{'type'} ) if ($tag->{'type'});
225
226	foreach my $type (@types) {
227	# append to previous line?
228	$log->debug("type: $type ",sub { join(" ",@v) }, " ", $row->{'append'} \|\| 'no append');
229	if ($tag->{'append'}) {
230
231	# I will delimit appended part with
232	# delimiter (or ,)
233	my $d = $tag->{'delimiter'};
234	# default delimiter
235	$d \|\|= " ";
236
237	my $last = pop @{$row->{$type}};
238	$d = "" if (! $last);
239	$last .= $d . join($d, @v);
240	push @{$row->{$type}}, $last;
241
242	} else {
243	push @{$row->{$type}}, @v;
244	}
245	}
246
247
248	}
249
250	if ($row) {
251	$row->{'tag'} = $field;
252
253	# TODO: name_sigular, name_plural
254	my $name = $self->{'import_xml'}->{'indexer'}->{$field}->{'name'};
255	my $row_name = $name ? $self->_x($name) : $field;
256
257	# post-sort all values in field
258	if ($self->{'import_xml'}->{'indexer'}->{$field}->{'sort'}) {
259	$log->warn("sort at field tag not implemented");
260	}
261
262	$ds->{$row_name} = $row;
263
264	$log->debug("row $field: ",sub { Dumper($row) });
265	}
266
267	}
268
269	$self->{'db'}->save_ds(
270	id => $id,
271	ds => $ds,
272	prefix => $self->{prefix},
273	) if ($self->{'db'});
274
275	$log->debug("ds: ", sub { Dumper($ds) });
276
277	$log->logconfess("data structure returned is not array any more!") if wantarray;
278
279	return $ds;
280
281	}
282
283	=head2 parse
284
285	Perform smart parsing of string, skipping delimiters for fields which aren't
286	defined. It can also eval code in format starting with C<eval{...}> and
287	return output or nothing depending on eval code.
288
289	my $text = $webpac->parse($rec,'eval{"v901^a" eq "Deskriptor"}descriptor: v250^a', $i);
290
291	Filters are implemented here. While simple form of filters looks like this:
292
293	filter{name_of_filter}
294
295	but, filters can also have variable number of parametars like this:
296
297	filter{name_of_filter(param,param,param)}
298
299	=cut
300
301	my $warn_once;
302
303	sub parse {
304	my $self = shift;
305
306	my ($rec, $format_utf8, $i) = @_;
307
308	return if (! $format_utf8);
309
310	my $log = $self->_get_logger();
311
312	$log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
313
314	$i = 0 if (! $i);
315
316	my $format = $self->_x($format_utf8) \|\| $log->logconfess("can't convert '$format_utf8' from UTF-8 to ",$self->{'code_page'});
317
318	my @out;
319
320	$log->debug("format: $format [$i]");
321
322	my $eval_code;
323	# remove eval{...} from beginning
324	$eval_code = $1 if ($format =~ s/^eval{([^}]+)}//s);
325
326	my $filter_name;
327	# remove filter{...} from beginning
328	$filter_name = $1 if ($format =~ s/^filter{([^}]+)}//s);
329
330	# did we found any (att all) field from format in row?
331	my $found_any;
332	# prefix before first field which we preserve it $found_any
333	my $prefix;
334
335	my $f_step = 1;
336
337	while ($format =~ s/^(.*?)(v\|s)(\d+)(?:\^(\w))?//s) {
338
339	my $del = $1 \|\| '';
340	$prefix = $del if ($f_step == 1);
341
342	my $fld_type = lc($2);
343
344	# repeatable index
345	my $r = $i;
346	if ($fld_type eq 's') {
347	if ($found_any->{'v'}) {
348	$r = 0;
349	} else {
350	return;
351	}
352	}
353
354	my $found = 0;
355	my $tmp = $self->get_data(\$rec,$3,$4,$r,\$found);
356
357	if ($found) {
358	$found_any->{$fld_type} += $found;
359
360	# we will skip delimiter before first occurence of field!
361	push @out, $del unless($found_any->{$fld_type} == 1);
362	push @out, $tmp;
363	}
364	$f_step++;
365	}
366
367	# test if any fields found?
368	return if (! $found_any->{'v'} && ! $found_any->{'s'});
369
370	my $out = join('',@out);
371
372	if ($out) {
373	# add rest of format (suffix)
374	$out .= $format;
375
376	# add prefix if not there
377	$out = $prefix . $out if ($out !~ m/^\Q$prefix\E/);
378
379	$log->debug("result: $out");
380	}
381
382	if ($eval_code) {
383	my $eval = $self->fill_in($rec,$eval_code,$i) \|\| return;
384	$log->debug("about to eval{$eval} format: $out");
385	return if (! $self->_eval($eval));
386	}
387
388	if ($filter_name) {
389	my @filter_args;
390	if ($filter_name =~ s/(\w+)\((.*)\)/$1/) {
391	@filter_args = split(/,/, $2);
392	}
393	if ($self->{'filter'}->{$filter_name}) {
394	$log->debug("about to filter{$filter_name} format: $out with arguments: ", join(",", @filter_args));
395	unshift @filter_args, $out;
396	$out = $self->{'filter'}->{$filter_name}->(@filter_args);
397	return unless(defined($out));
398	$log->debug("filter result: $out");
399	} elsif (! $warn_once->{$filter_name}) {
400	$log->warn("trying to use undefined filter $filter_name");
401	$warn_once->{$filter_name}++;
402	}
403	}
404
405	return $out;
406	}
407
408	=head2 parse_to_arr
409
410	Similar to C<parse>, but returns array of all repeatable fields
411
412	my @arr = $webpac->parse_to_arr($rec,'v250^a');
413
414	=cut
415
416	sub parse_to_arr {
417	my $self = shift;
418
419	my ($rec, $format_utf8) = @_;
420
421	my $log = $self->_get_logger();
422
423	$log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
424	return if (! $format_utf8);
425
426	my $i = 0;
427	my @arr;
428
429	while (my $v = $self->parse($rec,$format_utf8,$i++)) {
430	push @arr, $v;
431	}
432
433	$log->debug("format '$format_utf8' returned ",--$i," elements: ", sub { join(" \| ",@arr) }) if (@arr);
434
435	return @arr;
436	}
437
438
439	=head2 fill_in
440
441	Workhourse of all: takes record from in-memory structure of database and
442	strings with placeholders and returns string or array of with substituted
443	values from record.
444
445	my $text = $webpac->fill_in($rec,'v250^a');
446
447	Optional argument is ordinal number for repeatable fields. By default,
448	it's assume to be first repeatable field (fields are perl array, so first
449	element is 0).
450	Following example will read second value from repeatable field.
451
452	my $text = $webpac->fill_in($rec,'Title: v250^a',1);
453
454	This function B<does not> perform parsing of format to inteligenty skip
455	delimiters before fields which aren't used.
456
457	This method will automatically decode UTF-8 string to local code page
458	if needed.
459
460	=cut
461
462	sub fill_in {
463	my $self = shift;
464
465	my $log = $self->_get_logger();
466
467	my $rec = shift \|\| $log->logconfess("need data record");
468	my $format = shift \|\| $log->logconfess("need format to parse");
469	# iteration (for repeatable fields)
470	my $i = shift \|\| 0;
471
472	$log->logdie("infitite loop in format $format") if ($i > ($self->{'max_mfn'} \|\| 9999));
473
474	# FIXME remove for speedup?
475	$log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
476
477	if (utf8::is_utf8($format)) {
478	$format = $self->_x($format);
479	}
480
481	my $found = 0;
482	my $just_single = 1;
483
484	my $eval_code;
485	# remove eval{...} from beginning
486	$eval_code = $1 if ($format =~ s/^eval{([^}]+)}//s);
487
488	my $filter_name;
489	# remove filter{...} from beginning
490	$filter_name = $1 if ($format =~ s/^filter{([^}]+)}//s);
491
492	# do actual replacement of placeholders
493	# repeatable fields
494	if ($format =~ s/v(\d+)(?:\^(\w))?/$self->get_data(\$rec,$1,$2,$i,\$found)/ges) {
495	$just_single = 0;
496	}
497
498	# non-repeatable fields
499	if ($format =~ s/s(\d+)(?:\^(\w))?/$self->get_data(\$rec,$1,$2,0,\$found)/ges) {
500	return if ($i > 0 && $just_single);
501	}
502
503	if ($found) {
504	$log->debug("format: $format");
505	if ($eval_code) {
506	my $eval = $self->fill_in($rec,$eval_code,$i);
507	return if (! $self->_eval($eval));
508	}
509	if ($filter_name && $self->{'filter'}->{$filter_name}) {
510	$log->debug("filter '$filter_name' for $format");
511	$format = $self->{'filter'}->{$filter_name}->($format);
512	return unless(defined($format));
513	$log->debug("filter result: $format");
514	}
515	# do we have lookups?
516	if ($self->{'lookup'}) {
517	if ($self->{'lookup'}->can('lookup')) {
518	my @lookup = $self->{lookup}->lookup($format);
519	$log->debug("lookup $format", join(", ", @lookup));
520	return @lookup;
521	} else {
522	$log->warn("Have lookup object but can't invoke lookup method");
523	}
524	} else {
525	return $format;
526	}
527	} else {
528	return;
529	}
530	}
531
532
533	=head2 fill_in_to_arr
534
535	Similar to C<fill_in>, but returns array of all repeatable fields. Usable
536	for fields which have lookups, so they shouldn't be parsed but rather
537	C<fill_id>ed.
538
539	my @arr = $webpac->fill_in_to_arr($rec,'[v900];;[v250^a]');
540
541	=cut
542
543	sub fill_in_to_arr {
544	my $self = shift;
545
546	my ($rec, $format_utf8) = @_;
547
548	my $log = $self->_get_logger();
549
550	$log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
551	return if (! $format_utf8);
552
553	my $i = 0;
554	my @arr;
555
556	while (my @v = $self->fill_in($rec,$format_utf8,$i++)) {
557	push @arr, @v;
558	}
559
560	$log->debug("format '$format_utf8' returned ",--$i," elements: ", sub { join(" \| ",@arr) }) if (@arr);
561
562	return @arr;
563	}
564
565
566	=head2 get_data
567
568	Returns value from record.
569
570	my $text = $self->get_data(\$rec,$f,$sf,$i,\$found);
571
572	Arguments are:
573	record reference C<$rec>,
574	field C<$f>,
575	optional subfiled C<$sf>,
576	index for repeatable values C<$i>.
577
578	Optinal variable C<$found> will be incremeted if there
579	is field.
580
581	Returns value or empty string.
582
583	=cut
584
585	sub get_data {
586	my $self = shift;
587
588	my ($rec,$f,$sf,$i,$found) = @_;
589
590	if ($$rec->{$f}) {
591	return '' if (! $$rec->{$f}->[$i]);
592	no strict 'refs';
593	if ($sf && $$rec->{$f}->[$i]->{$sf}) {
594	$$found++ if (defined($$found));
595	return $$rec->{$f}->[$i]->{$sf};
596	} elsif (! $sf && $$rec->{$f}->[$i]) {
597	$$found++ if (defined($$found));
598	# it still might have subfield, just
599	# not specified, so we'll dump all
600	if ($$rec->{$f}->[$i] =~ /HASH/o) {
601	my $out;
602	foreach my $k (keys %{$$rec->{$f}->[$i]}) {
603	$out .= $$rec->{$f}->[$i]->{$k}." ";
604	}
605	return $out;
606	} else {
607	return $$rec->{$f}->[$i];
608	}
609	} else {
610	return '';
611	}
612	} else {
613	return '';
614	}
615	}
616
617
618	=head2 apply_format
619
620	Apply format specified in tag with C<format_name="name"> and
621	C<format_delimiter=";;">.
622
623	my $text = $webpac->apply_format($format_name,$format_delimiter,$data);
624
625	Formats can contain C<lookup{...}> if you need them.
626
627	=cut
628
629	sub apply_format {
630	my $self = shift;
631
632	my ($name,$delimiter,$data) = @_;
633
634	my $log = $self->_get_logger();
635
636	if (! $self->{'import_xml'}->{'format'}->{$name}) {
637	$log->warn("<format name=\"$name\"> is not defined in ",$self->{'import_xml_file'});
638	return $data;
639	}
640
641	$log->warn("no delimiter for format $name") if (! $delimiter);
642
643	my $format = $self->_x($self->{'import_xml'}->{'format'}->{$name}->{'content'}) \|\| $log->logdie("can't find format '$name'");
644
645	my @data = split(/\Q$delimiter\E/, $data);
646
647	my $out = sprintf($format, @data);
648	$log->debug("using format $name [$format] on $data to produce: $out");
649
650	if ($self->{'lookup_regex'} && $out =~ $self->{'lookup_regex'}) {
651	return $self->{'lookup'}->lookup($out);
652	} else {
653	return $out;
654	}
655
656	}
657
658	=head2 sort_arr
659
660	Sort array ignoring case and html in data
661
662	my @sorted = $webpac->sort_arr(@unsorted);
663
664	=cut
665
666	sub sort_arr {
667	my $self = shift;
668
669	my $log = $self->_get_logger();
670
671	# FIXME add Schwartzian Transformation?
672
673	my @sorted = sort {
674	$a =~ s#<[^>]+/*>##;
675	$b =~ s#<[^>]+/*>##;
676	lc($b) cmp lc($a)
677	} @_;
678	$log->debug("sorted values: ",sub { join(", ",@sorted) });
679
680	return @sorted;
681	}
682
683
684	=head1 INTERNAL METHODS
685
686	=head2 _sort_by_order
687
688	Sort xml tags data structure accoding to C<order=""> attribute.
689
690	=cut
691
692	sub _sort_by_order {
693	my $self = shift;
694
695	my $va = $self->{'import_xml'}->{'indexer'}->{$a}->{'order'} \|\|
696	$self->{'import_xml'}->{'indexer'}->{$a};
697	my $vb = $self->{'import_xml'}->{'indexer'}->{$b}->{'order'} \|\|
698	$self->{'import_xml'}->{'indexer'}->{$b};
699
700	return $va <=> $vb;
701	}
702
703	=head2 _x
704
705	Convert strings from C<conf/normalize/*.xml> encoding into application
706	specific encoding (optinally specified using C<code_page> to C<new>
707	constructor).
708
709	my $text = $n->_x('normalize text string');
710
711	This is a stub so that other modules doesn't have to implement it.
712
713	=cut
714
715	sub _x {
716	my $self = shift;
717	return shift;
718	}
719
720
721	=head1 AUTHOR
722
723	Dobrica Pavlinusic, C<< <dpavlin@rot13.org> >>
724
725	=head1 COPYRIGHT & LICENSE
726
727	Copyright 2005 Dobrica Pavlinusic, All Rights Reserved.
728
729	This program is free software; you can redistribute it and/or modify it
730	under the same terms as Perl itself.
731
732	=cut
733
734	1; # End of WebPAC::Normalize