lib/WebPAC/Normalize.pm

package WebPAC::Normalize;

use warnings;
use strict;
use base 'WebPAC::Common';
use Data::Dumper;

=head1 NAME

WebPAC::Normalize - data mungling for normalisation

=head1 VERSION

Version 0.04

=cut

our $VERSION = '0.04';

=head1 SYNOPSIS

This package contains code that mungle data to produce normalized format.

It contains several assumptions:

=over

=item *

format of fields is defined using C<v123^a> notation for repeatable fields
or C<s123^a> for single (or first) value, where C<123> is field number and
C<a> is subfield.

=item *

source data records (C<$rec>) have unique identifiers in field C<000>

=item *

optional C<eval{length('v123^a') == 3}> tag at B<beginning of format> will be
perl code that is evaluated before producing output (value of field will be
interpolated before that)

=item *

optional C<filter{filter_name}> at B<begining of format> will apply perl
code defined as code ref on format after field substitution to producing
output

=item *

optional C<lookup{...}> will be then performed. See C<WebPAC::Lookups>.

=item *

at end, optional C<format>s rules are resolved. Format rules are similar to
C<sprintf> and can also contain C<lookup{...}> which is performed after
values are inserted in format.

=back

This also describes order in which transformations are applied (eval,
filter, lookup, format) which is important to undestand when deciding how to
solve your data mungling and normalisation process.


=head1 FUNCTIONS

=head2 new

Create new normalisation object

  my $n = new WebPAC::Normalize::Something(
        filter => {
                'filter_name_1' => sub {
                        # filter code
                        return length($_);
                }, ...
        },
        db => $db_obj,
        lookup_regex => $lookup->regex,
        lookup => $lookup_obj,
        prefix => 'foobar',
  );

Parametar C<filter> defines user supplied snippets of perl code which can
be use with C<filter{...}> notation.

C<prefix> is used to form filename for database record (to support multiple
source files which are joined in one database).

Recommended parametar C<lookup_regex> is used to enable parsing of lookups
in structures. If you pass this parametar, you must also pass C<lookup>
which is C<WebPAC::Lookup> object.

=cut

sub new {
        my $class = shift;
        my $self = {@_};
        bless($self, $class);

        my $r = $self->{'lookup_regex'} ? 1 : 0;
        my $l = $self->{'lookup'} ? 1 : 0;

        my $log = $self->_get_logger();

        # those two must be in pair
        if ( ($r & $l) != ($r || $l) ) {
                my $log = $self->_get_logger();
                $log->logdie("lookup_regex and lookup must be in pair");
        }

        $log->logdie("lookup must be WebPAC::Lookup object") if ($self->{'lookup'} && ! $self->{'lookup'}->isa('WebPAC::Lookup'));

        $log->warn("no prefix defined. please check that!") unless ($self->{'prefix'});

        $self ? return $self : return undef;
}


=head2 data_structure

Create in-memory data structure which represents normalized layout from
C<conf/normalize/*.xml>.

This structures are used to produce output.

 my $ds = $webpac->data_structure($rec);

=cut

sub data_structure {
        my $self = shift;

        my $log = $self->_get_logger();

        my $rec = shift;
        $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);

        $log->debug("data_structure rec = ", sub { Dumper($rec) });

        $log->logdie("need unique ID (mfn) in field 000 of record ", sub { Dumper($rec) } ) unless (defined($rec->{'000'}));

        my $id = $rec->{'000'}->[0] || $log->logdie("field 000 isn't array!");

        my $cache_file;

        if ($self->{'db'}) {
                my $ds = $self->{'db'}->load_ds( id => $id, prefix => $self->{prefix} );
                $log->debug("load_ds( rec = ", sub { Dumper($rec) }, ") = ", sub { Dumper($ds) });
                return $ds if ($ds);
                $log->debug("cache miss, creating");
        }

        undef $self->{'currnet_filename'};
        undef $self->{'headline'};

        my @sorted_tags;
        if ($self->{tags_by_order}) {
                @sorted_tags = @{$self->{tags_by_order}};
        } else {
                @sorted_tags = sort { $self->_sort_by_order } keys %{$self->{'import_xml'}->{'indexer'}};
                $self->{tags_by_order} = \@sorted_tags;
        }

        my $ds;

        $log->debug("tags: ",sub { join(", ",@sorted_tags) });

        foreach my $field (@sorted_tags) {

                my $row;

#print "field $field [",$self->{'tag'},"] = ",Dumper($self->{'import_xml'}->{'indexer'}->{$field}->{$self->{'tag'}});

                foreach my $tag (@{$self->{'import_xml'}->{'indexer'}->{$field}->{$self->{'tag'}}}) {
                        my $format;

                        $log->logdie("expected tag HASH and got $tag") unless (ref($tag) eq 'HASH');
                        $format = $tag->{'value'} || $tag->{'content'};

                        $log->debug("format: $format");

                        my @v;
                        if ($self->{'lookup_regex'} && $format =~ $self->{'lookup_regex'}) {
                                @v = $self->fill_in_to_arr($rec,$format);
                        } else {
                                @v = $self->parse_to_arr($rec,$format);
                        }
                        next if (! @v);

                        if ($tag->{'sort'}) {
                                @v = $self->sort_arr(@v);
                        }

                        # use format?
                        if ($tag->{'format_name'}) {
                                @v = map { $self->apply_format($tag->{'format_name'},$tag->{'format_delimiter'},$_) } @v;
                        }

                        # delimiter will join repeatable fields
                        if ($tag->{'delimiter'}) {
                                @v = ( join($tag->{'delimiter'}, @v) );
                        }

                        # default types 
                        my @types = qw(display search);
                        # override by type attribute
                        @types = ( $tag->{'type'} ) if ($tag->{'type'});

                        foreach my $type (@types) {
                                # append to previous line?
                                $log->debug("type: $type ",sub { join(" ",@v) }, $row->{'append'} || 'no append');
                                if ($tag->{'append'}) {

                                        # I will delimit appended part with
                                        # delimiter (or ,)
                                        my $d = $tag->{'delimiter'};
                                        # default delimiter
                                        $d ||= " ";

                                        my $last = pop @{$row->{$type}};
                                        $d = "" if (! $last);
                                        $last .= $d . join($d, @v);
                                        push @{$row->{$type}}, $last;

                                } else {
                                        push @{$row->{$type}}, @v;
                                }
                        }


                }

                if ($row) {
                        $row->{'tag'} = $field;

                        # TODO: name_sigular, name_plural
                        my $name = $self->{'import_xml'}->{'indexer'}->{$field}->{'name'};
                        my $row_name = $name ? $self->_x($name) : $field;

                        # post-sort all values in field
                        if ($self->{'import_xml'}->{'indexer'}->{$field}->{'sort'}) {
                                $log->warn("sort at field tag not implemented");
                        }

                        $ds->{$row_name} = $row;

                        $log->debug("row $field: ",sub { Dumper($row) });
                }

        }

        $self->{'db'}->save_ds(
                id => $id,
                ds => $ds,
                prefix => $self->{prefix},
        ) if ($self->{'db'});

        $log->debug("ds: ", sub { Dumper($ds) });

        $log->logconfess("data structure returned is not array any more!") if wantarray;

        return $ds;

}

=head2 parse

Perform smart parsing of string, skipping delimiters for fields which aren't
defined. It can also eval code in format starting with C<eval{...}> and
return output or nothing depending on eval code.

 my $text = $webpac->parse($rec,'eval{"v901^a" eq "Deskriptor"}descriptor: v250^a', $i);

=cut

sub parse {
        my $self = shift;

        my ($rec, $format_utf8, $i) = @_;

        return if (! $format_utf8);

        my $log = $self->_get_logger();

        $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);

        $i = 0 if (! $i);

        my $format = $self->_x($format_utf8) || $log->logconfess("can't convert '$format_utf8' from UTF-8 to ",$self->{'code_page'});

        my @out;

        $log->debug("format: $format");

        my $eval_code;
        # remove eval{...} from beginning
        $eval_code = $1 if ($format =~ s/^eval{([^}]+)}//s);

        my $filter_name;
        # remove filter{...} from beginning
        $filter_name = $1 if ($format =~ s/^filter{([^}]+)}//s);

        my $prefix;
        my $all_found=0;

        while ($format =~ s/^(.*?)(v|s)(\d+)(?:\^(\w))?//s) {

                my $del = $1 || '';
                $prefix ||= $del if ($all_found == 0);

                # repeatable index
                my $r = $i;
                $r = 0 if (lc("$2") eq 's');

                my $found = 0;
                my $tmp = $self->get_data(\$rec,$3,$4,$r,\$found);

                if ($found) {
                        push @out, $del;
                        push @out, $tmp;
                        $all_found += $found;
                }
        }

        return if (! $all_found);

        my $out = join('',@out);

        if ($out) {
                # add rest of format (suffix)
                $out .= $format;

                # add prefix if not there
                $out = $prefix . $out if ($out !~ m/^\Q$prefix\E/);

                $log->debug("result: $out");
        }

        if ($eval_code) {
                my $eval = $self->fill_in($rec,$eval_code,$i) || return;
                $log->debug("about to eval{$eval} format: $out");
                return if (! $self->_eval($eval));
        }
        
        if ($filter_name && $self->{'filter'}->{$filter_name}) {
                $log->debug("about to filter{$filter_name} format: $out");
                $out = $self->{'filter'}->{$filter_name}->($out);
                return unless(defined($out));
                $log->debug("filter result: $out");
        }

        return $out;
}

=head2 parse_to_arr

Similar to C<parse>, but returns array of all repeatable fields

 my @arr = $webpac->parse_to_arr($rec,'v250^a');

=cut

sub parse_to_arr {
        my $self = shift;

        my ($rec, $format_utf8) = @_;

        my $log = $self->_get_logger();

        $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
        return if (! $format_utf8);

        my $i = 0;
        my @arr;

        while (my $v = $self->parse($rec,$format_utf8,$i++)) {
                push @arr, $v;
        }

        $log->debug("format '$format_utf8' returned ",--$i," elements: ", sub { join(" | ",@arr) }) if (@arr);

        return @arr;
}


=head2 fill_in

Workhourse of all: takes record from in-memory structure of database and
strings with placeholders and returns string or array of with substituted
values from record.

 my $text = $webpac->fill_in($rec,'v250^a');

Optional argument is ordinal number for repeatable fields. By default,
it's assume to be first repeatable field (fields are perl array, so first
element is 0).
Following example will read second value from repeatable field.

 my $text = $webpac->fill_in($rec,'Title: v250^a',1);

This function B<does not> perform parsing of format to inteligenty skip
delimiters before fields which aren't used.

This method will automatically decode UTF-8 string to local code page
if needed.

=cut

sub fill_in {
        my $self = shift;

        my $log = $self->_get_logger();

        my $rec = shift || $log->logconfess("need data record");
        my $format = shift || $log->logconfess("need format to parse");
        # iteration (for repeatable fields)
        my $i = shift || 0;

        $log->logdie("infitite loop in format $format") if ($i > ($self->{'max_mfn'} || 9999));

        # FIXME remove for speedup?
        $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);

        if (utf8::is_utf8($format)) {
                $format = $self->_x($format);
        }

        my $found = 0;

        my $eval_code;
        # remove eval{...} from beginning
        $eval_code = $1 if ($format =~ s/^eval{([^}]+)}//s);

        my $filter_name;
        # remove filter{...} from beginning
        $filter_name = $1 if ($format =~ s/^filter{([^}]+)}//s);

        # do actual replacement of placeholders
        # repeatable fields
        $format =~ s/v(\d+)(?:\^(\w))?/$self->get_data(\$rec,$1,$2,$i,\$found)/ges;
        # non-repeatable fields
        $format =~ s/s(\d+)(?:\^(\w))?/$self->get_data(\$rec,$1,$2,0,\$found)/ges;

        if ($found) {
                $log->debug("format: $format");
                if ($eval_code) {
                        my $eval = $self->fill_in($rec,$eval_code,$i);
                        return if (! $self->_eval($eval));
                }
                if ($filter_name && $self->{'filter'}->{$filter_name}) {
                        $log->debug("filter '$filter_name' for $format");
                        $format = $self->{'filter'}->{$filter_name}->($format);
                        return unless(defined($format));
                        $log->debug("filter result: $format");
                }
                # do we have lookups?
                if ($self->{'lookup'}) {
                        if ($self->{'lookup'}->can('lookup')) {
                                return $self->{'lookup'}->lookup($format);
                        } else {
                                $log->warn("Have lookup object but can't invoke lookup method");
                        }
                } else {
                        return $format;
                }
        } else {
                return;
        }
}


=head2 fill_in_to_arr

Similar to C<fill_in>, but returns array of all repeatable fields. Usable
for fields which have lookups, so they shouldn't be parsed but rather
C<fill_id>ed.

 my @arr = $webpac->fill_in_to_arr($rec,'[v900];;[v250^a]');

=cut

sub fill_in_to_arr {
        my $self = shift;

        my ($rec, $format_utf8) = @_;

        my $log = $self->_get_logger();

        $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
        return if (! $format_utf8);

        my $i = 0;
        my @arr;

        while (my @v = $self->fill_in($rec,$format_utf8,$i++)) {
                push @arr, @v;
        }

        $log->debug("format '$format_utf8' returned ",--$i," elements: ", sub { join(" | ",@arr) }) if (@arr);

        return @arr;
}


=head2 get_data

Returns value from record.

 my $text = $self->get_data(\$rec,$f,$sf,$i,\$found);

Arguments are:
record reference C<$rec>,
field C<$f>,
optional subfiled C<$sf>,
index for repeatable values C<$i>.

Optinal variable C<$found> will be incremeted if there
is field.

Returns value or empty string.

=cut

sub get_data {
        my $self = shift;

        my ($rec,$f,$sf,$i,$found) = @_;

        if ($$rec->{$f}) {
                return '' if (! $$rec->{$f}->[$i]);
                no strict 'refs';
                if ($sf && $$rec->{$f}->[$i]->{$sf}) {
                        $$found++ if (defined($$found));
                        return $$rec->{$f}->[$i]->{$sf};
                } elsif (! $sf && $$rec->{$f}->[$i]) {
                        $$found++ if (defined($$found));
                        # it still might have subfield, just
                        # not specified, so we'll dump all
                        if ($$rec->{$f}->[$i] =~ /HASH/o) {
                                my $out;
                                foreach my $k (keys %{$$rec->{$f}->[$i]}) {
                                        $out .= $$rec->{$f}->[$i]->{$k}." ";
                                }
                                return $out;
                        } else {
                                return $$rec->{$f}->[$i];
                        }
                } else {
                        return '';
                }
        } else {
                return '';
        }
}


=head2 apply_format

Apply format specified in tag with C<format_name="name"> and
C<format_delimiter=";;">.

 my $text = $webpac->apply_format($format_name,$format_delimiter,$data);

Formats can contain C<lookup{...}> if you need them.

=cut

sub apply_format {
        my $self = shift;

        my ($name,$delimiter,$data) = @_;

        my $log = $self->_get_logger();

        if (! $self->{'import_xml'}->{'format'}->{$name}) {
                $log->warn("<format name=\"$name\"> is not defined in ",$self->{'import_xml_file'});
                return $data;
        }

        $log->warn("no delimiter for format $name") if (! $delimiter);

        my $format = $self->_x($self->{'import_xml'}->{'format'}->{$name}->{'content'}) || $log->logdie("can't find format '$name'");

        my @data = split(/\Q$delimiter\E/, $data);

        my $out = sprintf($format, @data);
        $log->debug("using format $name [$format] on $data to produce: $out");

        if ($self->{'lookup_regex'} && $out =~ $self->{'lookup_regex'}) {
                return $self->{'lookup'}->lookup($out);
        } else {
                return $out;
        }

}

=head2 sort_arr

Sort array ignoring case and html in data

 my @sorted = $webpac->sort_arr(@unsorted);

=cut

sub sort_arr {
        my $self = shift;

        my $log = $self->_get_logger();

        # FIXME add Schwartzian Transformation?

        my @sorted = sort {
                $a =~ s#<[^>]+/*>##;
                $b =~ s#<[^>]+/*>##;
                lc($b) cmp lc($a)
        } @_;
        $log->debug("sorted values: ",sub { join(", ",@sorted) });

        return @sorted;
}


=head1 INTERNAL METHODS

=head2 _sort_by_order

Sort xml tags data structure accoding to C<order=""> attribute.

=cut

sub _sort_by_order {
        my $self = shift;

        my $va = $self->{'import_xml'}->{'indexer'}->{$a}->{'order'} ||
                $self->{'import_xml'}->{'indexer'}->{$a};
        my $vb = $self->{'import_xml'}->{'indexer'}->{$b}->{'order'} ||
                $self->{'import_xml'}->{'indexer'}->{$b};

        return $va <=> $vb;
}

=head2 _x

Convert strings from C<conf/normalize/*.xml> encoding into application
specific encoding (optinally specified using C<code_page> to C<new>
constructor).

 my $text = $n->_x('normalize text string');

This is a stub so that other modules doesn't have to implement it.

=cut

sub _x {
        my $self = shift;
        return shift;
}


=head1 AUTHOR

Dobrica Pavlinusic, C<< <dpavlin@rot13.org> >>

=head1 COPYRIGHT & LICENSE

Copyright 2005 Dobrica Pavlinusic, All Rights Reserved.

This program is free software; you can redistribute it and/or modify it
under the same terms as Perl itself.

=cut

1; # End of WebPAC::Normalize
1	package WebPAC::Normalize;
2
3	use warnings;
4	use strict;
5	use base 'WebPAC::Common';
6	use Data::Dumper;
7
8	=head1 NAME
9
10	WebPAC::Normalize - data mungling for normalisation
11
12	=head1 VERSION
13
14	Version 0.04
15
16	=cut
17
18	our $VERSION = '0.04';
19
20	=head1 SYNOPSIS
21
22	This package contains code that mungle data to produce normalized format.
23
24	It contains several assumptions:
25
26	=over
27
28	=item *
29
30	format of fields is defined using C<v123^a> notation for repeatable fields
31	or C<s123^a> for single (or first) value, where C<123> is field number and
32	C<a> is subfield.
33
34	=item *
35
36	source data records (C<$rec>) have unique identifiers in field C<000>
37
38	=item *
39
40	optional C<eval{length('v123^a') == 3}> tag at B<beginning of format> will be
41	perl code that is evaluated before producing output (value of field will be
42	interpolated before that)
43
44	=item *
45
46	optional C<filter{filter_name}> at B<begining of format> will apply perl
47	code defined as code ref on format after field substitution to producing
48	output
49
50	=item *
51
52	optional C<lookup{...}> will be then performed. See C<WebPAC::Lookups>.
53
54	=item *
55
56	at end, optional C<format>s rules are resolved. Format rules are similar to
57	C<sprintf> and can also contain C<lookup{...}> which is performed after
58	values are inserted in format.
59
60	=back
61
62	This also describes order in which transformations are applied (eval,
63	filter, lookup, format) which is important to undestand when deciding how to
64	solve your data mungling and normalisation process.
65
66
67
68
69	=head1 FUNCTIONS
70
71	=head2 new
72
73	Create new normalisation object
74
75	my $n = new WebPAC::Normalize::Something(
76	filter => {
77	'filter_name_1' => sub {
78	# filter code
79	return length($_);
80	}, ...
81	},
82	db => $db_obj,
83	lookup_regex => $lookup->regex,
84	lookup => $lookup_obj,
85	prefix => 'foobar',
86	);
87
88	Parametar C<filter> defines user supplied snippets of perl code which can
89	be use with C<filter{...}> notation.
90
91	C<prefix> is used to form filename for database record (to support multiple
92	source files which are joined in one database).
93
94	Recommended parametar C<lookup_regex> is used to enable parsing of lookups
95	in structures. If you pass this parametar, you must also pass C<lookup>
96	which is C<WebPAC::Lookup> object.
97
98	=cut
99
100	sub new {
101	my $class = shift;
102	my $self = {@_};
103	bless($self, $class);
104
105	my $r = $self->{'lookup_regex'} ? 1 : 0;
106	my $l = $self->{'lookup'} ? 1 : 0;
107
108	my $log = $self->_get_logger();
109
110	# those two must be in pair
111	if ( ($r & $l) != ($r \|\| $l) ) {
112	my $log = $self->_get_logger();
113	$log->logdie("lookup_regex and lookup must be in pair");
114	}
115
116	$log->logdie("lookup must be WebPAC::Lookup object") if ($self->{'lookup'} && ! $self->{'lookup'}->isa('WebPAC::Lookup'));
117
118	$log->warn("no prefix defined. please check that!") unless ($self->{'prefix'});
119
120	$self ? return $self : return undef;
121	}
122
123
124	=head2 data_structure
125
126	Create in-memory data structure which represents normalized layout from
127	C<conf/normalize/*.xml>.
128
129	This structures are used to produce output.
130
131	my $ds = $webpac->data_structure($rec);
132
133	=cut
134
135	sub data_structure {
136	my $self = shift;
137
138	my $log = $self->_get_logger();
139
140	my $rec = shift;
141	$log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
142
143	$log->debug("data_structure rec = ", sub { Dumper($rec) });
144
145	$log->logdie("need unique ID (mfn) in field 000 of record ", sub { Dumper($rec) } ) unless (defined($rec->{'000'}));
146
147	my $id = $rec->{'000'}->[0] \|\| $log->logdie("field 000 isn't array!");
148
149	my $cache_file;
150
151	if ($self->{'db'}) {
152	my $ds = $self->{'db'}->load_ds( id => $id, prefix => $self->{prefix} );
153	$log->debug("load_ds( rec = ", sub { Dumper($rec) }, ") = ", sub { Dumper($ds) });
154	return $ds if ($ds);
155	$log->debug("cache miss, creating");
156	}
157
158	undef $self->{'currnet_filename'};
159	undef $self->{'headline'};
160
161	my @sorted_tags;
162	if ($self->{tags_by_order}) {
163	@sorted_tags = @{$self->{tags_by_order}};
164	} else {
165	@sorted_tags = sort { $self->_sort_by_order } keys %{$self->{'import_xml'}->{'indexer'}};
166	$self->{tags_by_order} = \@sorted_tags;
167	}
168
169	my $ds;
170
171	$log->debug("tags: ",sub { join(", ",@sorted_tags) });
172
173	foreach my $field (@sorted_tags) {
174
175	my $row;
176
177	#print "field $field [",$self->{'tag'},"] = ",Dumper($self->{'import_xml'}->{'indexer'}->{$field}->{$self->{'tag'}});
178
179	foreach my $tag (@{$self->{'import_xml'}->{'indexer'}->{$field}->{$self->{'tag'}}}) {
180	my $format;
181
182	$log->logdie("expected tag HASH and got $tag") unless (ref($tag) eq 'HASH');
183	$format = $tag->{'value'} \|\| $tag->{'content'};
184
185	$log->debug("format: $format");
186
187	my @v;
188	if ($self->{'lookup_regex'} && $format =~ $self->{'lookup_regex'}) {
189	@v = $self->fill_in_to_arr($rec,$format);
190	} else {
191	@v = $self->parse_to_arr($rec,$format);
192	}
193	next if (! @v);
194
195	if ($tag->{'sort'}) {
196	@v = $self->sort_arr(@v);
197	}
198
199	# use format?
200	if ($tag->{'format_name'}) {
201	@v = map { $self->apply_format($tag->{'format_name'},$tag->{'format_delimiter'},$_) } @v;
202	}
203
204	# delimiter will join repeatable fields
205	if ($tag->{'delimiter'}) {
206	@v = ( join($tag->{'delimiter'}, @v) );
207	}
208
209	# default types
210	my @types = qw(display search);
211	# override by type attribute
212	@types = ( $tag->{'type'} ) if ($tag->{'type'});
213
214	foreach my $type (@types) {
215	# append to previous line?
216	$log->debug("type: $type ",sub { join(" ",@v) }, $row->{'append'} \|\| 'no append');
217	if ($tag->{'append'}) {
218
219	# I will delimit appended part with
220	# delimiter (or ,)
221	my $d = $tag->{'delimiter'};
222	# default delimiter
223	$d \|\|= " ";
224
225	my $last = pop @{$row->{$type}};
226	$d = "" if (! $last);
227	$last .= $d . join($d, @v);
228	push @{$row->{$type}}, $last;
229
230	} else {
231	push @{$row->{$type}}, @v;
232	}
233	}
234
235
236	}
237
238	if ($row) {
239	$row->{'tag'} = $field;
240
241	# TODO: name_sigular, name_plural
242	my $name = $self->{'import_xml'}->{'indexer'}->{$field}->{'name'};
243	my $row_name = $name ? $self->_x($name) : $field;
244
245	# post-sort all values in field
246	if ($self->{'import_xml'}->{'indexer'}->{$field}->{'sort'}) {
247	$log->warn("sort at field tag not implemented");
248	}
249
250	$ds->{$row_name} = $row;
251
252	$log->debug("row $field: ",sub { Dumper($row) });
253	}
254
255	}
256
257	$self->{'db'}->save_ds(
258	id => $id,
259	ds => $ds,
260	prefix => $self->{prefix},
261	) if ($self->{'db'});
262
263	$log->debug("ds: ", sub { Dumper($ds) });
264
265	$log->logconfess("data structure returned is not array any more!") if wantarray;
266
267	return $ds;
268
269	}
270
271	=head2 parse
272
273	Perform smart parsing of string, skipping delimiters for fields which aren't
274	defined. It can also eval code in format starting with C<eval{...}> and
275	return output or nothing depending on eval code.
276
277	my $text = $webpac->parse($rec,'eval{"v901^a" eq "Deskriptor"}descriptor: v250^a', $i);
278
279	=cut
280
281	sub parse {
282	my $self = shift;
283
284	my ($rec, $format_utf8, $i) = @_;
285
286	return if (! $format_utf8);
287
288	my $log = $self->_get_logger();
289
290	$log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
291
292	$i = 0 if (! $i);
293
294	my $format = $self->_x($format_utf8) \|\| $log->logconfess("can't convert '$format_utf8' from UTF-8 to ",$self->{'code_page'});
295
296	my @out;
297
298	$log->debug("format: $format");
299
300	my $eval_code;
301	# remove eval{...} from beginning
302	$eval_code = $1 if ($format =~ s/^eval{([^}]+)}//s);
303
304	my $filter_name;
305	# remove filter{...} from beginning
306	$filter_name = $1 if ($format =~ s/^filter{([^}]+)}//s);
307
308	my $prefix;
309	my $all_found=0;
310
311	while ($format =~ s/^(.*?)(v\|s)(\d+)(?:\^(\w))?//s) {
312
313	my $del = $1 \|\| '';
314	$prefix \|\|= $del if ($all_found == 0);
315
316	# repeatable index
317	my $r = $i;
318	$r = 0 if (lc("$2") eq 's');
319
320	my $found = 0;
321	my $tmp = $self->get_data(\$rec,$3,$4,$r,\$found);
322
323	if ($found) {
324	push @out, $del;
325	push @out, $tmp;
326	$all_found += $found;
327	}
328	}
329
330	return if (! $all_found);
331
332	my $out = join('',@out);
333
334	if ($out) {
335	# add rest of format (suffix)
336	$out .= $format;
337
338	# add prefix if not there
339	$out = $prefix . $out if ($out !~ m/^\Q$prefix\E/);
340
341	$log->debug("result: $out");
342	}
343
344	if ($eval_code) {
345	my $eval = $self->fill_in($rec,$eval_code,$i) \|\| return;
346	$log->debug("about to eval{$eval} format: $out");
347	return if (! $self->_eval($eval));
348	}
349
350	if ($filter_name && $self->{'filter'}->{$filter_name}) {
351	$log->debug("about to filter{$filter_name} format: $out");
352	$out = $self->{'filter'}->{$filter_name}->($out);
353	return unless(defined($out));
354	$log->debug("filter result: $out");
355	}
356
357	return $out;
358	}
359
360	=head2 parse_to_arr
361
362	Similar to C<parse>, but returns array of all repeatable fields
363
364	my @arr = $webpac->parse_to_arr($rec,'v250^a');
365
366	=cut
367
368	sub parse_to_arr {
369	my $self = shift;
370
371	my ($rec, $format_utf8) = @_;
372
373	my $log = $self->_get_logger();
374
375	$log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
376	return if (! $format_utf8);
377
378	my $i = 0;
379	my @arr;
380
381	while (my $v = $self->parse($rec,$format_utf8,$i++)) {
382	push @arr, $v;
383	}
384
385	$log->debug("format '$format_utf8' returned ",--$i," elements: ", sub { join(" \| ",@arr) }) if (@arr);
386
387	return @arr;
388	}
389
390
391	=head2 fill_in
392
393	Workhourse of all: takes record from in-memory structure of database and
394	strings with placeholders and returns string or array of with substituted
395	values from record.
396
397	my $text = $webpac->fill_in($rec,'v250^a');
398
399	Optional argument is ordinal number for repeatable fields. By default,
400	it's assume to be first repeatable field (fields are perl array, so first
401	element is 0).
402	Following example will read second value from repeatable field.
403
404	my $text = $webpac->fill_in($rec,'Title: v250^a',1);
405
406	This function B<does not> perform parsing of format to inteligenty skip
407	delimiters before fields which aren't used.
408
409	This method will automatically decode UTF-8 string to local code page
410	if needed.
411
412	=cut
413
414	sub fill_in {
415	my $self = shift;
416
417	my $log = $self->_get_logger();
418
419	my $rec = shift \|\| $log->logconfess("need data record");
420	my $format = shift \|\| $log->logconfess("need format to parse");
421	# iteration (for repeatable fields)
422	my $i = shift \|\| 0;
423
424	$log->logdie("infitite loop in format $format") if ($i > ($self->{'max_mfn'} \|\| 9999));
425
426	# FIXME remove for speedup?
427	$log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
428
429	if (utf8::is_utf8($format)) {
430	$format = $self->_x($format);
431	}
432
433	my $found = 0;
434
435	my $eval_code;
436	# remove eval{...} from beginning
437	$eval_code = $1 if ($format =~ s/^eval{([^}]+)}//s);
438
439	my $filter_name;
440	# remove filter{...} from beginning
441	$filter_name = $1 if ($format =~ s/^filter{([^}]+)}//s);
442
443	# do actual replacement of placeholders
444	# repeatable fields
445	$format =~ s/v(\d+)(?:\^(\w))?/$self->get_data(\$rec,$1,$2,$i,\$found)/ges;
446	# non-repeatable fields
447	$format =~ s/s(\d+)(?:\^(\w))?/$self->get_data(\$rec,$1,$2,0,\$found)/ges;
448
449	if ($found) {
450	$log->debug("format: $format");
451	if ($eval_code) {
452	my $eval = $self->fill_in($rec,$eval_code,$i);
453	return if (! $self->_eval($eval));
454	}
455	if ($filter_name && $self->{'filter'}->{$filter_name}) {
456	$log->debug("filter '$filter_name' for $format");
457	$format = $self->{'filter'}->{$filter_name}->($format);
458	return unless(defined($format));
459	$log->debug("filter result: $format");
460	}
461	# do we have lookups?
462	if ($self->{'lookup'}) {
463	if ($self->{'lookup'}->can('lookup')) {
464	return $self->{'lookup'}->lookup($format);
465	} else {
466	$log->warn("Have lookup object but can't invoke lookup method");
467	}
468	} else {
469	return $format;
470	}
471	} else {
472	return;
473	}
474	}
475
476
477	=head2 fill_in_to_arr
478
479	Similar to C<fill_in>, but returns array of all repeatable fields. Usable
480	for fields which have lookups, so they shouldn't be parsed but rather
481	C<fill_id>ed.
482
483	my @arr = $webpac->fill_in_to_arr($rec,'[v900];;[v250^a]');
484
485	=cut
486
487	sub fill_in_to_arr {
488	my $self = shift;
489
490	my ($rec, $format_utf8) = @_;
491
492	my $log = $self->_get_logger();
493
494	$log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
495	return if (! $format_utf8);
496
497	my $i = 0;
498	my @arr;
499
500	while (my @v = $self->fill_in($rec,$format_utf8,$i++)) {
501	push @arr, @v;
502	}
503
504	$log->debug("format '$format_utf8' returned ",--$i," elements: ", sub { join(" \| ",@arr) }) if (@arr);
505
506	return @arr;
507	}
508
509
510	=head2 get_data
511
512	Returns value from record.
513
514	my $text = $self->get_data(\$rec,$f,$sf,$i,\$found);
515
516	Arguments are:
517	record reference C<$rec>,
518	field C<$f>,
519	optional subfiled C<$sf>,
520	index for repeatable values C<$i>.
521
522	Optinal variable C<$found> will be incremeted if there
523	is field.
524
525	Returns value or empty string.
526
527	=cut
528
529	sub get_data {
530	my $self = shift;
531
532	my ($rec,$f,$sf,$i,$found) = @_;
533
534	if ($$rec->{$f}) {
535	return '' if (! $$rec->{$f}->[$i]);
536	no strict 'refs';
537	if ($sf && $$rec->{$f}->[$i]->{$sf}) {
538	$$found++ if (defined($$found));
539	return $$rec->{$f}->[$i]->{$sf};
540	} elsif (! $sf && $$rec->{$f}->[$i]) {
541	$$found++ if (defined($$found));
542	# it still might have subfield, just
543	# not specified, so we'll dump all
544	if ($$rec->{$f}->[$i] =~ /HASH/o) {
545	my $out;
546	foreach my $k (keys %{$$rec->{$f}->[$i]}) {
547	$out .= $$rec->{$f}->[$i]->{$k}." ";
548	}
549	return $out;
550	} else {
551	return $$rec->{$f}->[$i];
552	}
553	} else {
554	return '';
555	}
556	} else {
557	return '';
558	}
559	}
560
561
562	=head2 apply_format
563
564	Apply format specified in tag with C<format_name="name"> and
565	C<format_delimiter=";;">.
566
567	my $text = $webpac->apply_format($format_name,$format_delimiter,$data);
568
569	Formats can contain C<lookup{...}> if you need them.
570
571	=cut
572
573	sub apply_format {
574	my $self = shift;
575
576	my ($name,$delimiter,$data) = @_;
577
578	my $log = $self->_get_logger();
579
580	if (! $self->{'import_xml'}->{'format'}->{$name}) {
581	$log->warn("<format name=\"$name\"> is not defined in ",$self->{'import_xml_file'});
582	return $data;
583	}
584
585	$log->warn("no delimiter for format $name") if (! $delimiter);
586
587	my $format = $self->_x($self->{'import_xml'}->{'format'}->{$name}->{'content'}) \|\| $log->logdie("can't find format '$name'");
588
589	my @data = split(/\Q$delimiter\E/, $data);
590
591	my $out = sprintf($format, @data);
592	$log->debug("using format $name [$format] on $data to produce: $out");
593
594	if ($self->{'lookup_regex'} && $out =~ $self->{'lookup_regex'}) {
595	return $self->{'lookup'}->lookup($out);
596	} else {
597	return $out;
598	}
599
600	}
601
602	=head2 sort_arr
603
604	Sort array ignoring case and html in data
605
606	my @sorted = $webpac->sort_arr(@unsorted);
607
608	=cut
609
610	sub sort_arr {
611	my $self = shift;
612
613	my $log = $self->_get_logger();
614
615	# FIXME add Schwartzian Transformation?
616
617	my @sorted = sort {
618	$a =~ s#<[^>]+/*>##;
619	$b =~ s#<[^>]+/*>##;
620	lc($b) cmp lc($a)
621	} @_;
622	$log->debug("sorted values: ",sub { join(", ",@sorted) });
623
624	return @sorted;
625	}
626
627
628	=head1 INTERNAL METHODS
629
630	=head2 _sort_by_order
631
632	Sort xml tags data structure accoding to C<order=""> attribute.
633
634	=cut
635
636	sub _sort_by_order {
637	my $self = shift;
638
639	my $va = $self->{'import_xml'}->{'indexer'}->{$a}->{'order'} \|\|
640	$self->{'import_xml'}->{'indexer'}->{$a};
641	my $vb = $self->{'import_xml'}->{'indexer'}->{$b}->{'order'} \|\|
642	$self->{'import_xml'}->{'indexer'}->{$b};
643
644	return $va <=> $vb;
645	}
646
647	=head2 _x
648
649	Convert strings from C<conf/normalize/*.xml> encoding into application
650	specific encoding (optinally specified using C<code_page> to C<new>
651	constructor).
652
653	my $text = $n->_x('normalize text string');
654
655	This is a stub so that other modules doesn't have to implement it.
656
657	=cut
658
659	sub _x {
660	my $self = shift;
661	return shift;
662	}
663
664
665	=head1 AUTHOR
666
667	Dobrica Pavlinusic, C<< <dpavlin@rot13.org> >>
668
669	=head1 COPYRIGHT & LICENSE
670
671	Copyright 2005 Dobrica Pavlinusic, All Rights Reserved.
672
673	This program is free software; you can redistribute it and/or modify it
674	under the same terms as Perl itself.
675
676	=cut
677
678	1; # End of WebPAC::Normalize