lib/WebPAC/Normalize.pm

package WebPAC::Normalize;

use warnings;
use strict;
use base 'WebPAC::Common';
use Data::Dumper;

=head1 NAME

WebPAC::Normalize - data mungling for normalisation

=head1 VERSION

Version 0.01

=cut

our $VERSION = '0.01';

=head1 SYNOPSIS

This package contains code that mungle data to produce normalized format.

It contains several assumptions:

=over

=item *

format of fields is defined using C<v123^a> notation for repeatable fields
or C<s123^a> for single (or first) value, where C<123> is field number and
C<a> is subfield.

=item *

source data records (C<$rec>) have unique identifiers in field C<000>

=item *

optional C<eval{length('v123^a') == 3}> tag at B<beginning of format> will be
perl code that is evaluated before producing output (value of field will be
interpolated before that)

=item *

optional C<filter{filter_name}> at B<begining of format> will apply perl
code defined as code ref on format after field substitution to producing
output

=item *

optional C<lookup{...}> will be then performed. See C<WebPAC::Lookups>.

=item *

at end, optional C<format>s rules are resolved. Format rules are similar to
C<sprintf> and can also contain C<lookup{...}> which is performed after
values are inserted in format.

=back

This also describes order in which transformations are applied (eval,
filter, lookup, format) which is important to undestand when deciding how to
solve your data mungling and normalisation process.


=head1 FUNCTIONS

=head2 new

Create new normalisation object

  my $n = new WebPAC::Normalize::Something(
        filter => {
                'filter_name_1' => sub {
                        # filter code
                        return length($_);
                }, ...
        },
        db => $db_obj,
        lookup_regex => $lookup->regex,
        lookup => $lookup_obj,
  );

Parametar C<filter> defines user supplied snippets of perl code which can
be use with C<filter{...}> notation.

Recommended parametar C<lookup_regex> is used to enable parsing of lookups
in structures. If you pass this parametar, you must also pass C<lookup>
which is C<WebPAC::Lookup> object.

=cut

sub new {
        my $class = shift;
        my $self = {@_};
        bless($self, $class);

        my $r = $self->{'lookup_regex'} ? 1 : 0;
        my $l = $self->{'lookup'} ? 1 : 0;

        my $log = $self->_get_logger();

        # those two must be in pair
        if ( ($r & $l) != ($r || $l) ) {
                my $log = $self->_get_logger();
                $log->logdie("lookup_regex and lookup must be in pair");
        }

        $log->logdie("lookup must be WebPAC::Lookup object") if ($self->{'lookup'} && ! $self->{'lookup'}->isa('WebPAC::Lookup'));

        $self ? return $self : return undef;
}


=head2 data_structure

Create in-memory data structure which represents normalized layout from
C<conf/normalize/*.xml>.

This structures are used to produce output.

 my @ds = $webpac->data_structure($rec);

B<Note: historical oddity follows>

This method will also set C<< $webpac->{'currnet_filename'} >> if there is
C<< <filename> >> tag and C<< $webpac->{'headline'} >> if there is
C<< <headline> >> tag.

=cut

sub data_structure {
        my $self = shift;

        my $log = $self->_get_logger();

        my $rec = shift;
        $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);

        my $cache_file;

        if ($self->{'db'}) {
                my @ds = $self->{'db'}->load_ds($rec);
                $log->debug("load_ds( rec = ", sub { Dumper($rec) }, ") = ", sub { Dumper(@ds) });
                return @ds if ($#ds > 0);
                $log->debug("cache miss, creating");
        }

        undef $self->{'currnet_filename'};
        undef $self->{'headline'};

        my @sorted_tags;
        if ($self->{tags_by_order}) {
                @sorted_tags = @{$self->{tags_by_order}};
        } else {
                @sorted_tags = sort { $self->_sort_by_order } keys %{$self->{'import_xml'}->{'indexer'}};
                $self->{tags_by_order} = \@sorted_tags;
        }

        my @ds;

        $log->debug("tags: ",sub { join(", ",@sorted_tags) });

        foreach my $field (@sorted_tags) {

                my $row;

#print "field $field [",$self->{'tag'},"] = ",Dumper($self->{'import_xml'}->{'indexer'}->{$field}->{$self->{'tag'}});

                foreach my $tag (@{$self->{'import_xml'}->{'indexer'}->{$field}->{$self->{'tag'}}}) {
                        my $format = $tag->{'value'} || $tag->{'content'};

                        $log->debug("format: $format");

                        my @v;
                        if ($self->{'lookup_regex'} && $format =~ $self->{'lookup_regex'}) {
                                @v = $self->fill_in_to_arr($rec,$format);
                        } else {
                                @v = $self->parse_to_arr($rec,$format);
                        }
                        next if (! @v);

                        if ($tag->{'sort'}) {
                                @v = $self->sort_arr(@v);
                        }

                        # use format?
                        if ($tag->{'format_name'}) {
                                @v = map { $self->apply_format($tag->{'format_name'},$tag->{'format_delimiter'},$_) } @v;
                        }

                        if ($field eq 'filename') {
                                $self->{'current_filename'} = join('',@v);
                                $log->debug("filename: ",$self->{'current_filename'});
                        } elsif ($field eq 'headline') {
                                $self->{'headline'} .= join('',@v);
                                $log->debug("headline: ",$self->{'headline'});
                                next; # don't return headline in data_structure!
                        }

                        # delimiter will join repeatable fields
                        if ($tag->{'delimiter'}) {
                                @v = ( join($tag->{'delimiter'}, @v) );
                        }

                        # default types 
                        my @types = qw(display swish);
                        # override by type attribute
                        @types = ( $tag->{'type'} ) if ($tag->{'type'});

                        foreach my $type (@types) {
                                # append to previous line?
                                $log->debug("type: $type ",sub { join(" ",@v) }, $row->{'append'} || 'no append');
                                if ($tag->{'append'}) {

                                        # I will delimit appended part with
                                        # delimiter (or ,)
                                        my $d = $tag->{'delimiter'};
                                        # default delimiter
                                        $d ||= " ";

                                        my $last = pop @{$row->{$type}};
                                        $d = "" if (! $last);
                                        $last .= $d . join($d, @v);
                                        push @{$row->{$type}}, $last;

                                } else {
                                        push @{$row->{$type}}, @v;
                                }
                        }


                }

                if ($row) {
                        $row->{'tag'} = $field;

                        # TODO: name_sigular, name_plural
                        my $name = $self->{'import_xml'}->{'indexer'}->{$field}->{'name'};
                        $row->{'name'} = $name ? $self->_x($name) : $field;

                        # post-sort all values in field
                        if ($self->{'import_xml'}->{'indexer'}->{$field}->{'sort'}) {
                                $log->warn("sort at field tag not implemented");
                        }

                        push @ds, $row;

                        $log->debug("row $field: ",sub { Dumper($row) });
                }

        }

        $self->{'db'}->save_ds(
                ds => \@ds,
                current_filename => $self->{'current_filename'},
                headline => $self->{'headline'},
        ) if ($self->{'db'});

        $log->debug("ds: ", sub { Dumper(@ds) });

        return @ds;

}

=head2 parse

Perform smart parsing of string, skipping delimiters for fields which aren't
defined. It can also eval code in format starting with C<eval{...}> and
return output or nothing depending on eval code.

 my $text = $webpac->parse($rec,'eval{"v901^a" eq "Deskriptor"}descriptor: v250^a', $i);

=cut

sub parse {
        my $self = shift;

        my ($rec, $format_utf8, $i) = @_;

        return if (! $format_utf8);

        my $log = $self->_get_logger();

        $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);

        $i = 0 if (! $i);

        my $format = $self->_x($format_utf8) || $log->logconfess("can't convert '$format_utf8' from UTF-8 to ",$self->{'code_page'});

        my @out;

        $log->debug("format: $format");

        my $eval_code;
        # remove eval{...} from beginning
        $eval_code = $1 if ($format =~ s/^eval{([^}]+)}//s);

        my $filter_name;
        # remove filter{...} from beginning
        $filter_name = $1 if ($format =~ s/^filter{([^}]+)}//s);

        my $prefix;
        my $all_found=0;

        while ($format =~ s/^(.*?)(v|s)(\d+)(?:\^(\w))?//s) {

                my $del = $1 || '';
                $prefix ||= $del if ($all_found == 0);

                # repeatable index
                my $r = $i;
                $r = 0 if (lc("$2") eq 's');

                my $found = 0;
                my $tmp = $self->get_data(\$rec,$3,$4,$r,\$found);

                if ($found) {
                        push @out, $del;
                        push @out, $tmp;
                        $all_found += $found;
                }
        }

        return if (! $all_found);

        my $out = join('',@out);

        if ($out) {
                # add rest of format (suffix)
                $out .= $format;

                # add prefix if not there
                $out = $prefix . $out if ($out !~ m/^\Q$prefix\E/);

                $log->debug("result: $out");
        }

        if ($eval_code) {
                my $eval = $self->fill_in($rec,$eval_code,$i) || return;
                $log->debug("about to eval{$eval} format: $out");
                return if (! $self->_eval($eval));
        }
        
        if ($filter_name && $self->{'filter'}->{$filter_name}) {
                $log->debug("about to filter{$filter_name} format: $out");
                $out = $self->{'filter'}->{$filter_name}->($out);
                return unless(defined($out));
                $log->debug("filter result: $out");
        }

        return $out;
}

=head2 parse_to_arr

Similar to C<parse>, but returns array of all repeatable fields

 my @arr = $webpac->parse_to_arr($rec,'v250^a');

=cut

sub parse_to_arr {
        my $self = shift;

        my ($rec, $format_utf8) = @_;

        my $log = $self->_get_logger();

        $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
        return if (! $format_utf8);

        my $i = 0;
        my @arr;

        while (my $v = $self->parse($rec,$format_utf8,$i++)) {
                push @arr, $v;
        }

        $log->debug("format '$format_utf8' returned ",--$i," elements: ", sub { join(" | ",@arr) }) if (@arr);

        return @arr;
}


=head2 fill_in

Workhourse of all: takes record from in-memory structure of database and
strings with placeholders and returns string or array of with substituted
values from record.

 my $text = $webpac->fill_in($rec,'v250^a');

Optional argument is ordinal number for repeatable fields. By default,
it's assume to be first repeatable field (fields are perl array, so first
element is 0).
Following example will read second value from repeatable field.

 my $text = $webpac->fill_in($rec,'Title: v250^a',1);

This function B<does not> perform parsing of format to inteligenty skip
delimiters before fields which aren't used.

This method will automatically decode UTF-8 string to local code page
if needed.

=cut

sub fill_in {
        my $self = shift;

        my $log = $self->_get_logger();

        my $rec = shift || $log->logconfess("need data record");
        my $format = shift || $log->logconfess("need format to parse");
        # iteration (for repeatable fields)
        my $i = shift || 0;

        $log->logdie("infitite loop in format $format") if ($i > ($self->{'max_mfn'} || 9999));

        # FIXME remove for speedup?
        $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);

        if (utf8::is_utf8($format)) {
                $format = $self->_x($format);
        }

        my $found = 0;

        my $eval_code;
        # remove eval{...} from beginning
        $eval_code = $1 if ($format =~ s/^eval{([^}]+)}//s);

        my $filter_name;
        # remove filter{...} from beginning
        $filter_name = $1 if ($format =~ s/^filter{([^}]+)}//s);

        # do actual replacement of placeholders
        # repeatable fields
        $format =~ s/v(\d+)(?:\^(\w))?/$self->get_data(\$rec,$1,$2,$i,\$found)/ges;
        # non-repeatable fields
        $format =~ s/s(\d+)(?:\^(\w))?/$self->get_data(\$rec,$1,$2,0,\$found)/ges;

        if ($found) {
                $log->debug("format: $format");
                if ($eval_code) {
                        my $eval = $self->fill_in($rec,$eval_code,$i);
                        return if (! $self->_eval($eval));
                }
                if ($filter_name && $self->{'filter'}->{$filter_name}) {
                        $log->debug("filter '$filter_name' for $format");
                        $format = $self->{'filter'}->{$filter_name}->($format);
                        return unless(defined($format));
                        $log->debug("filter result: $format");
                }
                # do we have lookups?
                if ($self->{'lookup'}) {
                        if ($self->{'lookup'}->can('lookup')) {
                                return $self->{'lookup'}->lookup($format);
                        } else {
                                $log->warn("Have lookup object but can't invoke lookup method");
                        }
                } else {
                        return $format;
                }
        } else {
                return;
        }
}


=head2 fill_in_to_arr

Similar to C<fill_in>, but returns array of all repeatable fields. Usable
for fields which have lookups, so they shouldn't be parsed but rather
C<fill_id>ed.

 my @arr = $webpac->fill_in_to_arr($rec,'[v900];;[v250^a]');

=cut

sub fill_in_to_arr {
        my $self = shift;

        my ($rec, $format_utf8) = @_;

        my $log = $self->_get_logger();

        $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
        return if (! $format_utf8);

        my $i = 0;
        my @arr;

        while (my @v = $self->fill_in($rec,$format_utf8,$i++)) {
                push @arr, @v;
        }

        $log->debug("format '$format_utf8' returned ",--$i," elements: ", sub { join(" | ",@arr) }) if (@arr);

        return @arr;
}


=head2 get_data

Returns value from record.

 my $text = $self->get_data(\$rec,$f,$sf,$i,\$found);

Arguments are:
record reference C<$rec>,
field C<$f>,
optional subfiled C<$sf>,
index for repeatable values C<$i>.

Optinal variable C<$found> will be incremeted if there
is field.

Returns value or empty string.

=cut

sub get_data {
        my $self = shift;

        my ($rec,$f,$sf,$i,$found) = @_;

        if ($$rec->{$f}) {
                return '' if (! $$rec->{$f}->[$i]);
                no strict 'refs';
                if ($sf && $$rec->{$f}->[$i]->{$sf}) {
                        $$found++ if (defined($$found));
                        return $$rec->{$f}->[$i]->{$sf};
                } elsif ($$rec->{$f}->[$i]) {
                        $$found++ if (defined($$found));
                        # it still might have subfield, just
                        # not specified, so we'll dump all
                        if ($$rec->{$f}->[$i] =~ /HASH/o) {
                                my $out;
                                foreach my $k (keys %{$$rec->{$f}->[$i]}) {
                                        $out .= $$rec->{$f}->[$i]->{$k}." ";
                                }
                                return $out;
                        } else {
                                return $$rec->{$f}->[$i];
                        }
                }
        } else {
                return '';
        }
}


=head2 apply_format

Apply format specified in tag with C<format_name="name"> and
C<format_delimiter=";;">.

 my $text = $webpac->apply_format($format_name,$format_delimiter,$data);

Formats can contain C<lookup{...}> if you need them.

=cut

sub apply_format {
        my $self = shift;

        my ($name,$delimiter,$data) = @_;

        my $log = $self->_get_logger();

        if (! $self->{'import_xml'}->{'format'}->{$name}) {
                $log->warn("<format name=\"$name\"> is not defined in ",$self->{'import_xml_file'});
                return $data;
        }

        $log->warn("no delimiter for format $name") if (! $delimiter);

        my $format = $self->_x($self->{'import_xml'}->{'format'}->{$name}->{'content'}) || $log->logdie("can't find format '$name'");

        my @data = split(/\Q$delimiter\E/, $data);

        my $out = sprintf($format, @data);
        $log->debug("using format $name [$format] on $data to produce: $out");

        if ($self->{'lookup_regex'} && $out =~ $self->{'lookup_regex'}) {
                return $self->{'lookup'}->lookup($out);
        } else {
                return $out;
        }

}

=head2 sort_arr

Sort array ignoring case and html in data

 my @sorted = $webpac->sort_arr(@unsorted);

=cut

sub sort_arr {
        my $self = shift;

        my $log = $self->_get_logger();

        # FIXME add Schwartzian Transformation?

        my @sorted = sort {
                $a =~ s#<[^>]+/*>##;
                $b =~ s#<[^>]+/*>##;
                lc($b) cmp lc($a)
        } @_;
        $log->debug("sorted values: ",sub { join(", ",@sorted) });

        return @sorted;
}


=head1 INTERNAL METHODS

=head2 _sort_by_order

Sort xml tags data structure accoding to C<order=""> attribute.

=cut

sub _sort_by_order {
        my $self = shift;

        my $va = $self->{'import_xml'}->{'indexer'}->{$a}->{'order'} ||
                $self->{'import_xml'}->{'indexer'}->{$a};
        my $vb = $self->{'import_xml'}->{'indexer'}->{$b}->{'order'} ||
                $self->{'import_xml'}->{'indexer'}->{$b};

        return $va <=> $vb;
}

=head2 _x

Convert strings from C<conf/normalize/*.xml> encoding into application
specific encoding (optinally specified using C<code_page> to C<new>
constructor).

 my $text = $n->_x('normalize text string');

This is a stub so that other modules doesn't have to implement it.

=cut

sub _x {
        my $self = shift;
        return shift;
}


=head1 AUTHOR

Dobrica Pavlinusic, C<< <dpavlin@rot13.org> >>

=head1 COPYRIGHT & LICENSE

Copyright 2005 Dobrica Pavlinusic, All Rights Reserved.

This program is free software; you can redistribute it and/or modify it
under the same terms as Perl itself.

=cut

1; # End of WebPAC::DB
1	package WebPAC::Normalize;
2
3	use warnings;
4	use strict;
5	use base 'WebPAC::Common';
6	use Data::Dumper;
7
8	=head1 NAME
9
10	WebPAC::Normalize - data mungling for normalisation
11
12	=head1 VERSION
13
14	Version 0.01
15
16	=cut
17
18	our $VERSION = '0.01';
19
20	=head1 SYNOPSIS
21
22	This package contains code that mungle data to produce normalized format.
23
24	It contains several assumptions:
25
26	=over
27
28	=item *
29
30	format of fields is defined using C<v123^a> notation for repeatable fields
31	or C<s123^a> for single (or first) value, where C<123> is field number and
32	C<a> is subfield.
33
34	=item *
35
36	source data records (C<$rec>) have unique identifiers in field C<000>
37
38	=item *
39
40	optional C<eval{length('v123^a') == 3}> tag at B<beginning of format> will be
41	perl code that is evaluated before producing output (value of field will be
42	interpolated before that)
43
44	=item *
45
46	optional C<filter{filter_name}> at B<begining of format> will apply perl
47	code defined as code ref on format after field substitution to producing
48	output
49
50	=item *
51
52	optional C<lookup{...}> will be then performed. See C<WebPAC::Lookups>.
53
54	=item *
55
56	at end, optional C<format>s rules are resolved. Format rules are similar to
57	C<sprintf> and can also contain C<lookup{...}> which is performed after
58	values are inserted in format.
59
60	=back
61
62	This also describes order in which transformations are applied (eval,
63	filter, lookup, format) which is important to undestand when deciding how to
64	solve your data mungling and normalisation process.
65
66
67
68
69	=head1 FUNCTIONS
70
71	=head2 new
72
73	Create new normalisation object
74
75	my $n = new WebPAC::Normalize::Something(
76	filter => {
77	'filter_name_1' => sub {
78	# filter code
79	return length($_);
80	}, ...
81	},
82	db => $db_obj,
83	lookup_regex => $lookup->regex,
84	lookup => $lookup_obj,
85	);
86
87	Parametar C<filter> defines user supplied snippets of perl code which can
88	be use with C<filter{...}> notation.
89
90	Recommended parametar C<lookup_regex> is used to enable parsing of lookups
91	in structures. If you pass this parametar, you must also pass C<lookup>
92	which is C<WebPAC::Lookup> object.
93
94	=cut
95
96	sub new {
97	my $class = shift;
98	my $self = {@_};
99	bless($self, $class);
100
101	my $r = $self->{'lookup_regex'} ? 1 : 0;
102	my $l = $self->{'lookup'} ? 1 : 0;
103
104	my $log = $self->_get_logger();
105
106	# those two must be in pair
107	if ( ($r & $l) != ($r \|\| $l) ) {
108	my $log = $self->_get_logger();
109	$log->logdie("lookup_regex and lookup must be in pair");
110	}
111
112	$log->logdie("lookup must be WebPAC::Lookup object") if ($self->{'lookup'} && ! $self->{'lookup'}->isa('WebPAC::Lookup'));
113
114	$self ? return $self : return undef;
115	}
116
117
118	=head2 data_structure
119
120	Create in-memory data structure which represents normalized layout from
121	C<conf/normalize/*.xml>.
122
123	This structures are used to produce output.
124
125	my @ds = $webpac->data_structure($rec);
126
127	B<Note: historical oddity follows>
128
129	This method will also set C<< $webpac->{'currnet_filename'} >> if there is
130	C<< <filename> >> tag and C<< $webpac->{'headline'} >> if there is
131	C<< <headline> >> tag.
132
133	=cut
134
135	sub data_structure {
136	my $self = shift;
137
138	my $log = $self->_get_logger();
139
140	my $rec = shift;
141	$log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
142
143	my $cache_file;
144
145	if ($self->{'db'}) {
146	my @ds = $self->{'db'}->load_ds($rec);
147	$log->debug("load_ds( rec = ", sub { Dumper($rec) }, ") = ", sub { Dumper(@ds) });
148	return @ds if ($#ds > 0);
149	$log->debug("cache miss, creating");
150	}
151
152	undef $self->{'currnet_filename'};
153	undef $self->{'headline'};
154
155	my @sorted_tags;
156	if ($self->{tags_by_order}) {
157	@sorted_tags = @{$self->{tags_by_order}};
158	} else {
159	@sorted_tags = sort { $self->_sort_by_order } keys %{$self->{'import_xml'}->{'indexer'}};
160	$self->{tags_by_order} = \@sorted_tags;
161	}
162
163	my @ds;
164
165	$log->debug("tags: ",sub { join(", ",@sorted_tags) });
166
167	foreach my $field (@sorted_tags) {
168
169	my $row;
170
171	#print "field $field [",$self->{'tag'},"] = ",Dumper($self->{'import_xml'}->{'indexer'}->{$field}->{$self->{'tag'}});
172
173	foreach my $tag (@{$self->{'import_xml'}->{'indexer'}->{$field}->{$self->{'tag'}}}) {
174	my $format = $tag->{'value'} \|\| $tag->{'content'};
175
176	$log->debug("format: $format");
177
178	my @v;
179	if ($self->{'lookup_regex'} && $format =~ $self->{'lookup_regex'}) {
180	@v = $self->fill_in_to_arr($rec,$format);
181	} else {
182	@v = $self->parse_to_arr($rec,$format);
183	}
184	next if (! @v);
185
186	if ($tag->{'sort'}) {
187	@v = $self->sort_arr(@v);
188	}
189
190	# use format?
191	if ($tag->{'format_name'}) {
192	@v = map { $self->apply_format($tag->{'format_name'},$tag->{'format_delimiter'},$_) } @v;
193	}
194
195	if ($field eq 'filename') {
196	$self->{'current_filename'} = join('',@v);
197	$log->debug("filename: ",$self->{'current_filename'});
198	} elsif ($field eq 'headline') {
199	$self->{'headline'} .= join('',@v);
200	$log->debug("headline: ",$self->{'headline'});
201	next; # don't return headline in data_structure!
202	}
203
204	# delimiter will join repeatable fields
205	if ($tag->{'delimiter'}) {
206	@v = ( join($tag->{'delimiter'}, @v) );
207	}
208
209	# default types
210	my @types = qw(display swish);
211	# override by type attribute
212	@types = ( $tag->{'type'} ) if ($tag->{'type'});
213
214	foreach my $type (@types) {
215	# append to previous line?
216	$log->debug("type: $type ",sub { join(" ",@v) }, $row->{'append'} \|\| 'no append');
217	if ($tag->{'append'}) {
218
219	# I will delimit appended part with
220	# delimiter (or ,)
221	my $d = $tag->{'delimiter'};
222	# default delimiter
223	$d \|\|= " ";
224
225	my $last = pop @{$row->{$type}};
226	$d = "" if (! $last);
227	$last .= $d . join($d, @v);
228	push @{$row->{$type}}, $last;
229
230	} else {
231	push @{$row->{$type}}, @v;
232	}
233	}
234
235
236	}
237
238	if ($row) {
239	$row->{'tag'} = $field;
240
241	# TODO: name_sigular, name_plural
242	my $name = $self->{'import_xml'}->{'indexer'}->{$field}->{'name'};
243	$row->{'name'} = $name ? $self->_x($name) : $field;
244
245	# post-sort all values in field
246	if ($self->{'import_xml'}->{'indexer'}->{$field}->{'sort'}) {
247	$log->warn("sort at field tag not implemented");
248	}
249
250	push @ds, $row;
251
252	$log->debug("row $field: ",sub { Dumper($row) });
253	}
254
255	}
256
257	$self->{'db'}->save_ds(
258	ds => \@ds,
259	current_filename => $self->{'current_filename'},
260	headline => $self->{'headline'},
261	) if ($self->{'db'});
262
263	$log->debug("ds: ", sub { Dumper(@ds) });
264
265	return @ds;
266
267	}
268
269	=head2 parse
270
271	Perform smart parsing of string, skipping delimiters for fields which aren't
272	defined. It can also eval code in format starting with C<eval{...}> and
273	return output or nothing depending on eval code.
274
275	my $text = $webpac->parse($rec,'eval{"v901^a" eq "Deskriptor"}descriptor: v250^a', $i);
276
277	=cut
278
279	sub parse {
280	my $self = shift;
281
282	my ($rec, $format_utf8, $i) = @_;
283
284	return if (! $format_utf8);
285
286	my $log = $self->_get_logger();
287
288	$log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
289
290	$i = 0 if (! $i);
291
292	my $format = $self->_x($format_utf8) \|\| $log->logconfess("can't convert '$format_utf8' from UTF-8 to ",$self->{'code_page'});
293
294	my @out;
295
296	$log->debug("format: $format");
297
298	my $eval_code;
299	# remove eval{...} from beginning
300	$eval_code = $1 if ($format =~ s/^eval{([^}]+)}//s);
301
302	my $filter_name;
303	# remove filter{...} from beginning
304	$filter_name = $1 if ($format =~ s/^filter{([^}]+)}//s);
305
306	my $prefix;
307	my $all_found=0;
308
309	while ($format =~ s/^(.*?)(v\|s)(\d+)(?:\^(\w))?//s) {
310
311	my $del = $1 \|\| '';
312	$prefix \|\|= $del if ($all_found == 0);
313
314	# repeatable index
315	my $r = $i;
316	$r = 0 if (lc("$2") eq 's');
317
318	my $found = 0;
319	my $tmp = $self->get_data(\$rec,$3,$4,$r,\$found);
320
321	if ($found) {
322	push @out, $del;
323	push @out, $tmp;
324	$all_found += $found;
325	}
326	}
327
328	return if (! $all_found);
329
330	my $out = join('',@out);
331
332	if ($out) {
333	# add rest of format (suffix)
334	$out .= $format;
335
336	# add prefix if not there
337	$out = $prefix . $out if ($out !~ m/^\Q$prefix\E/);
338
339	$log->debug("result: $out");
340	}
341
342	if ($eval_code) {
343	my $eval = $self->fill_in($rec,$eval_code,$i) \|\| return;
344	$log->debug("about to eval{$eval} format: $out");
345	return if (! $self->_eval($eval));
346	}
347
348	if ($filter_name && $self->{'filter'}->{$filter_name}) {
349	$log->debug("about to filter{$filter_name} format: $out");
350	$out = $self->{'filter'}->{$filter_name}->($out);
351	return unless(defined($out));
352	$log->debug("filter result: $out");
353	}
354
355	return $out;
356	}
357
358	=head2 parse_to_arr
359
360	Similar to C<parse>, but returns array of all repeatable fields
361
362	my @arr = $webpac->parse_to_arr($rec,'v250^a');
363
364	=cut
365
366	sub parse_to_arr {
367	my $self = shift;
368
369	my ($rec, $format_utf8) = @_;
370
371	my $log = $self->_get_logger();
372
373	$log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
374	return if (! $format_utf8);
375
376	my $i = 0;
377	my @arr;
378
379	while (my $v = $self->parse($rec,$format_utf8,$i++)) {
380	push @arr, $v;
381	}
382
383	$log->debug("format '$format_utf8' returned ",--$i," elements: ", sub { join(" \| ",@arr) }) if (@arr);
384
385	return @arr;
386	}
387
388
389	=head2 fill_in
390
391	Workhourse of all: takes record from in-memory structure of database and
392	strings with placeholders and returns string or array of with substituted
393	values from record.
394
395	my $text = $webpac->fill_in($rec,'v250^a');
396
397	Optional argument is ordinal number for repeatable fields. By default,
398	it's assume to be first repeatable field (fields are perl array, so first
399	element is 0).
400	Following example will read second value from repeatable field.
401
402	my $text = $webpac->fill_in($rec,'Title: v250^a',1);
403
404	This function B<does not> perform parsing of format to inteligenty skip
405	delimiters before fields which aren't used.
406
407	This method will automatically decode UTF-8 string to local code page
408	if needed.
409
410	=cut
411
412	sub fill_in {
413	my $self = shift;
414
415	my $log = $self->_get_logger();
416
417	my $rec = shift \|\| $log->logconfess("need data record");
418	my $format = shift \|\| $log->logconfess("need format to parse");
419	# iteration (for repeatable fields)
420	my $i = shift \|\| 0;
421
422	$log->logdie("infitite loop in format $format") if ($i > ($self->{'max_mfn'} \|\| 9999));
423
424	# FIXME remove for speedup?
425	$log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
426
427	if (utf8::is_utf8($format)) {
428	$format = $self->_x($format);
429	}
430
431	my $found = 0;
432
433	my $eval_code;
434	# remove eval{...} from beginning
435	$eval_code = $1 if ($format =~ s/^eval{([^}]+)}//s);
436
437	my $filter_name;
438	# remove filter{...} from beginning
439	$filter_name = $1 if ($format =~ s/^filter{([^}]+)}//s);
440
441	# do actual replacement of placeholders
442	# repeatable fields
443	$format =~ s/v(\d+)(?:\^(\w))?/$self->get_data(\$rec,$1,$2,$i,\$found)/ges;
444	# non-repeatable fields
445	$format =~ s/s(\d+)(?:\^(\w))?/$self->get_data(\$rec,$1,$2,0,\$found)/ges;
446
447	if ($found) {
448	$log->debug("format: $format");
449	if ($eval_code) {
450	my $eval = $self->fill_in($rec,$eval_code,$i);
451	return if (! $self->_eval($eval));
452	}
453	if ($filter_name && $self->{'filter'}->{$filter_name}) {
454	$log->debug("filter '$filter_name' for $format");
455	$format = $self->{'filter'}->{$filter_name}->($format);
456	return unless(defined($format));
457	$log->debug("filter result: $format");
458	}
459	# do we have lookups?
460	if ($self->{'lookup'}) {
461	if ($self->{'lookup'}->can('lookup')) {
462	return $self->{'lookup'}->lookup($format);
463	} else {
464	$log->warn("Have lookup object but can't invoke lookup method");
465	}
466	} else {
467	return $format;
468	}
469	} else {
470	return;
471	}
472	}
473
474
475	=head2 fill_in_to_arr
476
477	Similar to C<fill_in>, but returns array of all repeatable fields. Usable
478	for fields which have lookups, so they shouldn't be parsed but rather
479	C<fill_id>ed.
480
481	my @arr = $webpac->fill_in_to_arr($rec,'[v900];;[v250^a]');
482
483	=cut
484
485	sub fill_in_to_arr {
486	my $self = shift;
487
488	my ($rec, $format_utf8) = @_;
489
490	my $log = $self->_get_logger();
491
492	$log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
493	return if (! $format_utf8);
494
495	my $i = 0;
496	my @arr;
497
498	while (my @v = $self->fill_in($rec,$format_utf8,$i++)) {
499	push @arr, @v;
500	}
501
502	$log->debug("format '$format_utf8' returned ",--$i," elements: ", sub { join(" \| ",@arr) }) if (@arr);
503
504	return @arr;
505	}
506
507
508	=head2 get_data
509
510	Returns value from record.
511
512	my $text = $self->get_data(\$rec,$f,$sf,$i,\$found);
513
514	Arguments are:
515	record reference C<$rec>,
516	field C<$f>,
517	optional subfiled C<$sf>,
518	index for repeatable values C<$i>.
519
520	Optinal variable C<$found> will be incremeted if there
521	is field.
522
523	Returns value or empty string.
524
525	=cut
526
527	sub get_data {
528	my $self = shift;
529
530	my ($rec,$f,$sf,$i,$found) = @_;
531
532	if ($$rec->{$f}) {
533	return '' if (! $$rec->{$f}->[$i]);
534	no strict 'refs';
535	if ($sf && $$rec->{$f}->[$i]->{$sf}) {
536	$$found++ if (defined($$found));
537	return $$rec->{$f}->[$i]->{$sf};
538	} elsif ($$rec->{$f}->[$i]) {
539	$$found++ if (defined($$found));
540	# it still might have subfield, just
541	# not specified, so we'll dump all
542	if ($$rec->{$f}->[$i] =~ /HASH/o) {
543	my $out;
544	foreach my $k (keys %{$$rec->{$f}->[$i]}) {
545	$out .= $$rec->{$f}->[$i]->{$k}." ";
546	}
547	return $out;
548	} else {
549	return $$rec->{$f}->[$i];
550	}
551	}
552	} else {
553	return '';
554	}
555	}
556
557
558	=head2 apply_format
559
560	Apply format specified in tag with C<format_name="name"> and
561	C<format_delimiter=";;">.
562
563	my $text = $webpac->apply_format($format_name,$format_delimiter,$data);
564
565	Formats can contain C<lookup{...}> if you need them.
566
567	=cut
568
569	sub apply_format {
570	my $self = shift;
571
572	my ($name,$delimiter,$data) = @_;
573
574	my $log = $self->_get_logger();
575
576	if (! $self->{'import_xml'}->{'format'}->{$name}) {
577	$log->warn("<format name=\"$name\"> is not defined in ",$self->{'import_xml_file'});
578	return $data;
579	}
580
581	$log->warn("no delimiter for format $name") if (! $delimiter);
582
583	my $format = $self->_x($self->{'import_xml'}->{'format'}->{$name}->{'content'}) \|\| $log->logdie("can't find format '$name'");
584
585	my @data = split(/\Q$delimiter\E/, $data);
586
587	my $out = sprintf($format, @data);
588	$log->debug("using format $name [$format] on $data to produce: $out");
589
590	if ($self->{'lookup_regex'} && $out =~ $self->{'lookup_regex'}) {
591	return $self->{'lookup'}->lookup($out);
592	} else {
593	return $out;
594	}
595
596	}
597
598	=head2 sort_arr
599
600	Sort array ignoring case and html in data
601
602	my @sorted = $webpac->sort_arr(@unsorted);
603
604	=cut
605
606	sub sort_arr {
607	my $self = shift;
608
609	my $log = $self->_get_logger();
610
611	# FIXME add Schwartzian Transformation?
612
613	my @sorted = sort {
614	$a =~ s#<[^>]+/*>##;
615	$b =~ s#<[^>]+/*>##;
616	lc($b) cmp lc($a)
617	} @_;
618	$log->debug("sorted values: ",sub { join(", ",@sorted) });
619
620	return @sorted;
621	}
622
623
624	=head1 INTERNAL METHODS
625
626	=head2 _sort_by_order
627
628	Sort xml tags data structure accoding to C<order=""> attribute.
629
630	=cut
631
632	sub _sort_by_order {
633	my $self = shift;
634
635	my $va = $self->{'import_xml'}->{'indexer'}->{$a}->{'order'} \|\|
636	$self->{'import_xml'}->{'indexer'}->{$a};
637	my $vb = $self->{'import_xml'}->{'indexer'}->{$b}->{'order'} \|\|
638	$self->{'import_xml'}->{'indexer'}->{$b};
639
640	return $va <=> $vb;
641	}
642
643	=head2 _x
644
645	Convert strings from C<conf/normalize/*.xml> encoding into application
646	specific encoding (optinally specified using C<code_page> to C<new>
647	constructor).
648
649	my $text = $n->_x('normalize text string');
650
651	This is a stub so that other modules doesn't have to implement it.
652
653	=cut
654
655	sub _x {
656	my $self = shift;
657	return shift;
658	}
659
660
661	=head1 AUTHOR
662
663	Dobrica Pavlinusic, C<< <dpavlin@rot13.org> >>
664
665	=head1 COPYRIGHT & LICENSE
666
667	Copyright 2005 Dobrica Pavlinusic, All Rights Reserved.
668
669	This program is free software; you can redistribute it and/or modify it
670	under the same terms as Perl itself.
671
672	=cut
673
674	1; # End of WebPAC::DB