lib/WebPAC/Normalize.pm

package WebPAC::Normalize;

use warnings;
use strict;
use base 'WebPAC::Common';
use Data::Dumper;

=head1 NAME

WebPAC::Normalize - data mungling for normalisation

=head1 VERSION

Version 0.01

=cut

our $VERSION = '0.01';

=head1 SYNOPSIS

This package contains code that mungle data to produce normalized format.

It contains several assumptions:

=over

=item *

format of fields is defined using C<v123^a> notation for repeatable fields
or C<s123^a> for single (or first) value, where C<123> is field number and
C<a> is subfield.

=item *

source data records (C<$rec>) have unique identifiers in field C<000>

=item *

optional C<eval{length('v123^a') == 3}> tag at B<beginning of format> will be
perl code that is evaluated before producing output (value of field will be
interpolated before that)

=item *

optional C<filter{filter_name}> at B<begining of format> will apply perl
code defined as code ref on format after field substitution to producing
output

=item *

optional C<lookup{...}> will be then performed. See C<WebPAC::Lookups>.

=item *

at end, optional C<format>s rules are resolved. Format rules are similar to
C<sprintf> and can also contain C<lookup{...}> which is performed after
values are inserted in format.

=back

This also describes order in which transformations are applied (eval,
filter, lookup, format) which is important to undestand when deciding how to
solve your data mungling and normalisation process.


=head1 FUNCTIONS

=head2 new

Create new normalisation object

  my $n = new WebPAC::Normalize::Something(
        filter => {
                'filter_name_1' => sub {
                        # filter code
                        return length($_);
                }, ...
        },
        db => $db_obj,
        lookup_regex => $lookup->regex,
  );

Parametar C<filter> defines user supplied snippets of perl code which can
be use with C<filter{...}> notation.

Recommended parametar C<lookup_regex> is used to enable parsing of lookups
in structures.

=cut

sub new {
        my $class = shift;
        my $self = {@_};
        bless($self, $class);

        $self ? return $self : return undef;
}


=head2 data_structure

Create in-memory data structure which represents normalized layout from
C<conf/normalize/*.xml>.

This structures are used to produce output.

 my @ds = $webpac->data_structure($rec);

B<Note: historical oddity follows>

This method will also set C<< $webpac->{'currnet_filename'} >> if there is
C<< <filename> >> tag and C<< $webpac->{'headline'} >> if there is
C<< <headline> >> tag.

=cut

sub data_structure {
        my $self = shift;

        my $log = $self->_get_logger();

        my $rec = shift;
        $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);

        my $cache_file;

        if ($self->{'db'}) {
                my @ds = $self->{'db'}->load_ds($rec);
                $log->debug("load_ds( rec = ", sub { Dumper($rec) }, ") = ", sub { Dumper(@ds) });
                return @ds if ($#ds > 0);
                $log->debug("cache miss, creating");
        }

        undef $self->{'currnet_filename'};
        undef $self->{'headline'};

        my @sorted_tags;
        if ($self->{tags_by_order}) {
                @sorted_tags = @{$self->{tags_by_order}};
        } else {
                @sorted_tags = sort { $self->_sort_by_order } keys %{$self->{'import_xml'}->{'indexer'}};
                $self->{tags_by_order} = \@sorted_tags;
        }

        my @ds;

        $log->debug("tags: ",sub { join(", ",@sorted_tags) });

        foreach my $field (@sorted_tags) {

                my $row;

#print "field $field [",$self->{'tag'},"] = ",Dumper($self->{'import_xml'}->{'indexer'}->{$field}->{$self->{'tag'}});

                foreach my $tag (@{$self->{'import_xml'}->{'indexer'}->{$field}->{$self->{'tag'}}}) {
                        my $format = $tag->{'value'} || $tag->{'content'};

                        $log->debug("format: $format");

                        my @v;
                        if ($self->{'lookup_regex'} && $format =~ $self->{'lookup_regex'}) {
                                @v = $self->fill_in_to_arr($rec,$format);
                        } else {
                                @v = $self->parse_to_arr($rec,$format);
                        }
                        next if (! @v);

                        if ($tag->{'sort'}) {
                                @v = $self->sort_arr(@v);
                        }

                        # use format?
                        if ($tag->{'format_name'}) {
                                @v = map { $self->apply_format($tag->{'format_name'},$tag->{'format_delimiter'},$_) } @v;
                        }

                        if ($field eq 'filename') {
                                $self->{'current_filename'} = join('',@v);
                                $log->debug("filename: ",$self->{'current_filename'});
                        } elsif ($field eq 'headline') {
                                $self->{'headline'} .= join('',@v);
                                $log->debug("headline: ",$self->{'headline'});
                                next; # don't return headline in data_structure!
                        }

                        # delimiter will join repeatable fields
                        if ($tag->{'delimiter'}) {
                                @v = ( join($tag->{'delimiter'}, @v) );
                        }

                        # default types 
                        my @types = qw(display swish);
                        # override by type attribute
                        @types = ( $tag->{'type'} ) if ($tag->{'type'});

                        foreach my $type (@types) {
                                # append to previous line?
                                $log->debug("type: $type ",sub { join(" ",@v) }, $row->{'append'} || 'no append');
                                if ($tag->{'append'}) {

                                        # I will delimit appended part with
                                        # delimiter (or ,)
                                        my $d = $tag->{'delimiter'};
                                        # default delimiter
                                        $d ||= " ";

                                        my $last = pop @{$row->{$type}};
                                        $d = "" if (! $last);
                                        $last .= $d . join($d, @v);
                                        push @{$row->{$type}}, $last;

                                } else {
                                        push @{$row->{$type}}, @v;
                                }
                        }


                }

                if ($row) {
                        $row->{'tag'} = $field;

                        # TODO: name_sigular, name_plural
                        my $name = $self->{'import_xml'}->{'indexer'}->{$field}->{'name'};
                        $row->{'name'} = $name ? $self->_x($name) : $field;

                        # post-sort all values in field
                        if ($self->{'import_xml'}->{'indexer'}->{$field}->{'sort'}) {
                                $log->warn("sort at field tag not implemented");
                        }

                        push @ds, $row;

                        $log->debug("row $field: ",sub { Dumper($row) });
                }

        }

        $self->{'db'}->save_ds(
                ds => \@ds,
                current_filename => $self->{'current_filename'},
                headline => $self->{'headline'},
        ) if ($self->{'db'});

        $log->debug("ds: ", sub { Dumper(@ds) });

        return @ds;

}

=head2 parse

Perform smart parsing of string, skipping delimiters for fields which aren't
defined. It can also eval code in format starting with C<eval{...}> and
return output or nothing depending on eval code.

 my $text = $webpac->parse($rec,'eval{"v901^a" eq "Deskriptor"}descriptor: v250^a', $i);

=cut

sub parse {
        my $self = shift;

        my ($rec, $format_utf8, $i) = @_;

        return if (! $format_utf8);

        my $log = $self->_get_logger();

        $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);

        $i = 0 if (! $i);

        my $format = $self->_x($format_utf8) || $log->logconfess("can't convert '$format_utf8' from UTF-8 to ",$self->{'code_page'});

        my @out;

        $log->debug("format: $format");

        my $eval_code;
        # remove eval{...} from beginning
        $eval_code = $1 if ($format =~ s/^eval{([^}]+)}//s);

        my $filter_name;
        # remove filter{...} from beginning
        $filter_name = $1 if ($format =~ s/^filter{([^}]+)}//s);

        my $prefix;
        my $all_found=0;

        while ($format =~ s/^(.*?)(v|s)(\d+)(?:\^(\w))?//s) {

                my $del = $1 || '';
                $prefix ||= $del if ($all_found == 0);

                # repeatable index
                my $r = $i;
                $r = 0 if (lc("$2") eq 's');

                my $found = 0;
                my $tmp = $self->get_data(\$rec,$3,$4,$r,\$found);

                if ($found) {
                        push @out, $del;
                        push @out, $tmp;
                        $all_found += $found;
                }
        }

        return if (! $all_found);

        my $out = join('',@out);

        if ($out) {
                # add rest of format (suffix)
                $out .= $format;

                # add prefix if not there
                $out = $prefix . $out if ($out !~ m/^\Q$prefix\E/);

                $log->debug("result: $out");
        }

        if ($eval_code) {
                my $eval = $self->fill_in($rec,$eval_code,$i) || return;
                $log->debug("about to eval{$eval} format: $out");
                return if (! $self->_eval($eval));
        }
        
        if ($filter_name && $self->{'filter'}->{$filter_name}) {
                $log->debug("about to filter{$filter_name} format: $out");
                $out = $self->{'filter'}->{$filter_name}->($out);
                return unless(defined($out));
                $log->debug("filter result: $out");
        }

        return $out;
}

=head2 parse_to_arr

Similar to C<parse>, but returns array of all repeatable fields

 my @arr = $webpac->parse_to_arr($rec,'v250^a');

=cut

sub parse_to_arr {
        my $self = shift;

        my ($rec, $format_utf8) = @_;

        my $log = $self->_get_logger();

        $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
        return if (! $format_utf8);

        my $i = 0;
        my @arr;

        while (my $v = $self->parse($rec,$format_utf8,$i++)) {
                push @arr, $v;
        }

        $log->debug("format '$format_utf8' returned ",--$i," elements: ", sub { join(" | ",@arr) }) if (@arr);

        return @arr;
}


=head2 fill_in

Workhourse of all: takes record from in-memory structure of database and
strings with placeholders and returns string or array of with substituted
values from record.

 my $text = $webpac->fill_in($rec,'v250^a');

Optional argument is ordinal number for repeatable fields. By default,
it's assume to be first repeatable field (fields are perl array, so first
element is 0).
Following example will read second value from repeatable field.

 my $text = $webpac->fill_in($rec,'Title: v250^a',1);

This function B<does not> perform parsing of format to inteligenty skip
delimiters before fields which aren't used.

This method will automatically decode UTF-8 string to local code page
if needed.

=cut

sub fill_in {
        my $self = shift;

        my $log = $self->_get_logger();

        my $rec = shift || $log->logconfess("need data record");
        my $format = shift || $log->logconfess("need format to parse");
        # iteration (for repeatable fields)
        my $i = shift || 0;

        $log->logdie("infitite loop in format $format") if ($i > ($self->{'max_mfn'} || 9999));

        # FIXME remove for speedup?
        $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);

        if (utf8::is_utf8($format)) {
                $format = $self->_x($format);
        }

        my $found = 0;

        my $eval_code;
        # remove eval{...} from beginning
        $eval_code = $1 if ($format =~ s/^eval{([^}]+)}//s);

        my $filter_name;
        # remove filter{...} from beginning
        $filter_name = $1 if ($format =~ s/^filter{([^}]+)}//s);

        # do actual replacement of placeholders
        # repeatable fields
        $format =~ s/v(\d+)(?:\^(\w))?/$self->get_data(\$rec,$1,$2,$i,\$found)/ges;
        # non-repeatable fields
        $format =~ s/s(\d+)(?:\^(\w))?/$self->get_data(\$rec,$1,$2,0,\$found)/ges;

        if ($found) {
                $log->debug("format: $format");
                if ($eval_code) {
                        my $eval = $self->fill_in($rec,$eval_code,$i);
                        return if (! $self->_eval($eval));
                }
                if ($filter_name && $self->{'filter'}->{$filter_name}) {
                        $log->debug("filter '$filter_name' for $format");
                        $format = $self->{'filter'}->{$filter_name}->($format);
                        return unless(defined($format));
                        $log->debug("filter result: $format");
                }
                # do we have lookups?
                if ($self->{'lookup'}) {
                        return $self->lookup($format);
                } else {
                        return $format;
                }
        } else {
                return;
        }
}


=head2 fill_in_to_arr

Similar to C<fill_in>, but returns array of all repeatable fields. Usable
for fields which have lookups, so they shouldn't be parsed but rather
C<fill_id>ed.

 my @arr = $webpac->fill_in_to_arr($rec,'[v900];;[v250^a]');

=cut

sub fill_in_to_arr {
        my $self = shift;

        my ($rec, $format_utf8) = @_;

        my $log = $self->_get_logger();

        $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
        return if (! $format_utf8);

        my $i = 0;
        my @arr;

        while (my @v = $self->fill_in($rec,$format_utf8,$i++)) {
                push @arr, @v;
        }

        $log->debug("format '$format_utf8' returned ",--$i," elements: ", sub { join(" | ",@arr) }) if (@arr);

        return @arr;
}


=head2 get_data

Returns value from record.

 my $text = $self->get_data(\$rec,$f,$sf,$i,\$found);

Arguments are:
record reference C<$rec>,
field C<$f>,
optional subfiled C<$sf>,
index for repeatable values C<$i>.

Optinal variable C<$found> will be incremeted if there
is field.

Returns value or empty string.

=cut

sub get_data {
        my $self = shift;

        my ($rec,$f,$sf,$i,$found) = @_;

        if ($$rec->{$f}) {
                return '' if (! $$rec->{$f}->[$i]);
                no strict 'refs';
                if ($sf && $$rec->{$f}->[$i]->{$sf}) {
                        $$found++ if (defined($$found));
                        return $$rec->{$f}->[$i]->{$sf};
                } elsif ($$rec->{$f}->[$i]) {
                        $$found++ if (defined($$found));
                        # it still might have subfield, just
                        # not specified, so we'll dump all
                        if ($$rec->{$f}->[$i] =~ /HASH/o) {
                                my $out;
                                foreach my $k (keys %{$$rec->{$f}->[$i]}) {
                                        $out .= $$rec->{$f}->[$i]->{$k}." ";
                                }
                                return $out;
                        } else {
                                return $$rec->{$f}->[$i];
                        }
                }
        } else {
                return '';
        }
}


=head2 apply_format

Apply format specified in tag with C<format_name="name"> and
C<format_delimiter=";;">.

 my $text = $webpac->apply_format($format_name,$format_delimiter,$data);

Formats can contain C<lookup{...}> if you need them.

=cut

sub apply_format {
        my $self = shift;

        my ($name,$delimiter,$data) = @_;

        my $log = $self->_get_logger();

        if (! $self->{'import_xml'}->{'format'}->{$name}) {
                $log->warn("<format name=\"$name\"> is not defined in ",$self->{'import_xml_file'});
                return $data;
        }

        $log->warn("no delimiter for format $name") if (! $delimiter);

        my $format = $self->_x($self->{'import_xml'}->{'format'}->{$name}->{'content'}) || $log->logdie("can't find format '$name'");

        my @data = split(/\Q$delimiter\E/, $data);

        my $out = sprintf($format, @data);
        $log->debug("using format $name [$format] on $data to produce: $out");

        if ($self->{'lookup_regex'} && $out =~ $self->{'lookup_regex'}) {
                return $self->lookup($out);
        } else {
                return $out;
        }

}

=head2 sort_arr

Sort array ignoring case and html in data

 my @sorted = $webpac->sort_arr(@unsorted);

=cut

sub sort_arr {
        my $self = shift;

        my $log = $self->_get_logger();

        # FIXME add Schwartzian Transformation?

        my @sorted = sort {
                $a =~ s#<[^>]+/*>##;
                $b =~ s#<[^>]+/*>##;
                lc($b) cmp lc($a)
        } @_;
        $log->debug("sorted values: ",sub { join(", ",@sorted) });

        return @sorted;
}


=head1 INTERNAL METHODS

=head2 _sort_by_order

Sort xml tags data structure accoding to C<order=""> attribute.

=cut

sub _sort_by_order {
        my $self = shift;

        my $va = $self->{'import_xml'}->{'indexer'}->{$a}->{'order'} ||
                $self->{'import_xml'}->{'indexer'}->{$a};
        my $vb = $self->{'import_xml'}->{'indexer'}->{$b}->{'order'} ||
                $self->{'import_xml'}->{'indexer'}->{$b};

        return $va <=> $vb;
}

=head2 _x

Convert strings from C<conf/normalize/*.xml> encoding into application
specific encoding (optinally specified using C<code_page> to C<new>
constructor).

 my $text = $n->_x('normalize text string');

This is a stub so that other modules doesn't have to implement it.

=cut

sub _x {
        my $self = shift;
        return shift;
}


=head1 AUTHOR

Dobrica Pavlinusic, C<< <dpavlin@rot13.org> >>

=head1 COPYRIGHT & LICENSE

Copyright 2005 Dobrica Pavlinusic, All Rights Reserved.

This program is free software; you can redistribute it and/or modify it
under the same terms as Perl itself.

=cut

1; # End of WebPAC::DB
1	package WebPAC::Normalize;
2
3	use warnings;
4	use strict;
5	use base 'WebPAC::Common';
6	use Data::Dumper;
7
8	=head1 NAME
9
10	WebPAC::Normalize - data mungling for normalisation
11
12	=head1 VERSION
13
14	Version 0.01
15
16	=cut
17
18	our $VERSION = '0.01';
19
20	=head1 SYNOPSIS
21
22	This package contains code that mungle data to produce normalized format.
23
24	It contains several assumptions:
25
26	=over
27
28	=item *
29
30	format of fields is defined using C<v123^a> notation for repeatable fields
31	or C<s123^a> for single (or first) value, where C<123> is field number and
32	C<a> is subfield.
33
34	=item *
35
36	source data records (C<$rec>) have unique identifiers in field C<000>
37
38	=item *
39
40	optional C<eval{length('v123^a') == 3}> tag at B<beginning of format> will be
41	perl code that is evaluated before producing output (value of field will be
42	interpolated before that)
43
44	=item *
45
46	optional C<filter{filter_name}> at B<begining of format> will apply perl
47	code defined as code ref on format after field substitution to producing
48	output
49
50	=item *
51
52	optional C<lookup{...}> will be then performed. See C<WebPAC::Lookups>.
53
54	=item *
55
56	at end, optional C<format>s rules are resolved. Format rules are similar to
57	C<sprintf> and can also contain C<lookup{...}> which is performed after
58	values are inserted in format.
59
60	=back
61
62	This also describes order in which transformations are applied (eval,
63	filter, lookup, format) which is important to undestand when deciding how to
64	solve your data mungling and normalisation process.
65
66
67
68
69	=head1 FUNCTIONS
70
71	=head2 new
72
73	Create new normalisation object
74
75	my $n = new WebPAC::Normalize::Something(
76	filter => {
77	'filter_name_1' => sub {
78	# filter code
79	return length($_);
80	}, ...
81	},
82	db => $db_obj,
83	lookup_regex => $lookup->regex,
84	);
85
86	Parametar C<filter> defines user supplied snippets of perl code which can
87	be use with C<filter{...}> notation.
88
89	Recommended parametar C<lookup_regex> is used to enable parsing of lookups
90	in structures.
91
92	=cut
93
94	sub new {
95	my $class = shift;
96	my $self = {@_};
97	bless($self, $class);
98
99	$self ? return $self : return undef;
100	}
101
102
103	=head2 data_structure
104
105	Create in-memory data structure which represents normalized layout from
106	C<conf/normalize/*.xml>.
107
108	This structures are used to produce output.
109
110	my @ds = $webpac->data_structure($rec);
111
112	B<Note: historical oddity follows>
113
114	This method will also set C<< $webpac->{'currnet_filename'} >> if there is
115	C<< <filename> >> tag and C<< $webpac->{'headline'} >> if there is
116	C<< <headline> >> tag.
117
118	=cut
119
120	sub data_structure {
121	my $self = shift;
122
123	my $log = $self->_get_logger();
124
125	my $rec = shift;
126	$log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
127
128	my $cache_file;
129
130	if ($self->{'db'}) {
131	my @ds = $self->{'db'}->load_ds($rec);
132	$log->debug("load_ds( rec = ", sub { Dumper($rec) }, ") = ", sub { Dumper(@ds) });
133	return @ds if ($#ds > 0);
134	$log->debug("cache miss, creating");
135	}
136
137	undef $self->{'currnet_filename'};
138	undef $self->{'headline'};
139
140	my @sorted_tags;
141	if ($self->{tags_by_order}) {
142	@sorted_tags = @{$self->{tags_by_order}};
143	} else {
144	@sorted_tags = sort { $self->_sort_by_order } keys %{$self->{'import_xml'}->{'indexer'}};
145	$self->{tags_by_order} = \@sorted_tags;
146	}
147
148	my @ds;
149
150	$log->debug("tags: ",sub { join(", ",@sorted_tags) });
151
152	foreach my $field (@sorted_tags) {
153
154	my $row;
155
156	#print "field $field [",$self->{'tag'},"] = ",Dumper($self->{'import_xml'}->{'indexer'}->{$field}->{$self->{'tag'}});
157
158	foreach my $tag (@{$self->{'import_xml'}->{'indexer'}->{$field}->{$self->{'tag'}}}) {
159	my $format = $tag->{'value'} \|\| $tag->{'content'};
160
161	$log->debug("format: $format");
162
163	my @v;
164	if ($self->{'lookup_regex'} && $format =~ $self->{'lookup_regex'}) {
165	@v = $self->fill_in_to_arr($rec,$format);
166	} else {
167	@v = $self->parse_to_arr($rec,$format);
168	}
169	next if (! @v);
170
171	if ($tag->{'sort'}) {
172	@v = $self->sort_arr(@v);
173	}
174
175	# use format?
176	if ($tag->{'format_name'}) {
177	@v = map { $self->apply_format($tag->{'format_name'},$tag->{'format_delimiter'},$_) } @v;
178	}
179
180	if ($field eq 'filename') {
181	$self->{'current_filename'} = join('',@v);
182	$log->debug("filename: ",$self->{'current_filename'});
183	} elsif ($field eq 'headline') {
184	$self->{'headline'} .= join('',@v);
185	$log->debug("headline: ",$self->{'headline'});
186	next; # don't return headline in data_structure!
187	}
188
189	# delimiter will join repeatable fields
190	if ($tag->{'delimiter'}) {
191	@v = ( join($tag->{'delimiter'}, @v) );
192	}
193
194	# default types
195	my @types = qw(display swish);
196	# override by type attribute
197	@types = ( $tag->{'type'} ) if ($tag->{'type'});
198
199	foreach my $type (@types) {
200	# append to previous line?
201	$log->debug("type: $type ",sub { join(" ",@v) }, $row->{'append'} \|\| 'no append');
202	if ($tag->{'append'}) {
203
204	# I will delimit appended part with
205	# delimiter (or ,)
206	my $d = $tag->{'delimiter'};
207	# default delimiter
208	$d \|\|= " ";
209
210	my $last = pop @{$row->{$type}};
211	$d = "" if (! $last);
212	$last .= $d . join($d, @v);
213	push @{$row->{$type}}, $last;
214
215	} else {
216	push @{$row->{$type}}, @v;
217	}
218	}
219
220
221	}
222
223	if ($row) {
224	$row->{'tag'} = $field;
225
226	# TODO: name_sigular, name_plural
227	my $name = $self->{'import_xml'}->{'indexer'}->{$field}->{'name'};
228	$row->{'name'} = $name ? $self->_x($name) : $field;
229
230	# post-sort all values in field
231	if ($self->{'import_xml'}->{'indexer'}->{$field}->{'sort'}) {
232	$log->warn("sort at field tag not implemented");
233	}
234
235	push @ds, $row;
236
237	$log->debug("row $field: ",sub { Dumper($row) });
238	}
239
240	}
241
242	$self->{'db'}->save_ds(
243	ds => \@ds,
244	current_filename => $self->{'current_filename'},
245	headline => $self->{'headline'},
246	) if ($self->{'db'});
247
248	$log->debug("ds: ", sub { Dumper(@ds) });
249
250	return @ds;
251
252	}
253
254	=head2 parse
255
256	Perform smart parsing of string, skipping delimiters for fields which aren't
257	defined. It can also eval code in format starting with C<eval{...}> and
258	return output or nothing depending on eval code.
259
260	my $text = $webpac->parse($rec,'eval{"v901^a" eq "Deskriptor"}descriptor: v250^a', $i);
261
262	=cut
263
264	sub parse {
265	my $self = shift;
266
267	my ($rec, $format_utf8, $i) = @_;
268
269	return if (! $format_utf8);
270
271	my $log = $self->_get_logger();
272
273	$log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
274
275	$i = 0 if (! $i);
276
277	my $format = $self->_x($format_utf8) \|\| $log->logconfess("can't convert '$format_utf8' from UTF-8 to ",$self->{'code_page'});
278
279	my @out;
280
281	$log->debug("format: $format");
282
283	my $eval_code;
284	# remove eval{...} from beginning
285	$eval_code = $1 if ($format =~ s/^eval{([^}]+)}//s);
286
287	my $filter_name;
288	# remove filter{...} from beginning
289	$filter_name = $1 if ($format =~ s/^filter{([^}]+)}//s);
290
291	my $prefix;
292	my $all_found=0;
293
294	while ($format =~ s/^(.*?)(v\|s)(\d+)(?:\^(\w))?//s) {
295
296	my $del = $1 \|\| '';
297	$prefix \|\|= $del if ($all_found == 0);
298
299	# repeatable index
300	my $r = $i;
301	$r = 0 if (lc("$2") eq 's');
302
303	my $found = 0;
304	my $tmp = $self->get_data(\$rec,$3,$4,$r,\$found);
305
306	if ($found) {
307	push @out, $del;
308	push @out, $tmp;
309	$all_found += $found;
310	}
311	}
312
313	return if (! $all_found);
314
315	my $out = join('',@out);
316
317	if ($out) {
318	# add rest of format (suffix)
319	$out .= $format;
320
321	# add prefix if not there
322	$out = $prefix . $out if ($out !~ m/^\Q$prefix\E/);
323
324	$log->debug("result: $out");
325	}
326
327	if ($eval_code) {
328	my $eval = $self->fill_in($rec,$eval_code,$i) \|\| return;
329	$log->debug("about to eval{$eval} format: $out");
330	return if (! $self->_eval($eval));
331	}
332
333	if ($filter_name && $self->{'filter'}->{$filter_name}) {
334	$log->debug("about to filter{$filter_name} format: $out");
335	$out = $self->{'filter'}->{$filter_name}->($out);
336	return unless(defined($out));
337	$log->debug("filter result: $out");
338	}
339
340	return $out;
341	}
342
343	=head2 parse_to_arr
344
345	Similar to C<parse>, but returns array of all repeatable fields
346
347	my @arr = $webpac->parse_to_arr($rec,'v250^a');
348
349	=cut
350
351	sub parse_to_arr {
352	my $self = shift;
353
354	my ($rec, $format_utf8) = @_;
355
356	my $log = $self->_get_logger();
357
358	$log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
359	return if (! $format_utf8);
360
361	my $i = 0;
362	my @arr;
363
364	while (my $v = $self->parse($rec,$format_utf8,$i++)) {
365	push @arr, $v;
366	}
367
368	$log->debug("format '$format_utf8' returned ",--$i," elements: ", sub { join(" \| ",@arr) }) if (@arr);
369
370	return @arr;
371	}
372
373
374	=head2 fill_in
375
376	Workhourse of all: takes record from in-memory structure of database and
377	strings with placeholders and returns string or array of with substituted
378	values from record.
379
380	my $text = $webpac->fill_in($rec,'v250^a');
381
382	Optional argument is ordinal number for repeatable fields. By default,
383	it's assume to be first repeatable field (fields are perl array, so first
384	element is 0).
385	Following example will read second value from repeatable field.
386
387	my $text = $webpac->fill_in($rec,'Title: v250^a',1);
388
389	This function B<does not> perform parsing of format to inteligenty skip
390	delimiters before fields which aren't used.
391
392	This method will automatically decode UTF-8 string to local code page
393	if needed.
394
395	=cut
396
397	sub fill_in {
398	my $self = shift;
399
400	my $log = $self->_get_logger();
401
402	my $rec = shift \|\| $log->logconfess("need data record");
403	my $format = shift \|\| $log->logconfess("need format to parse");
404	# iteration (for repeatable fields)
405	my $i = shift \|\| 0;
406
407	$log->logdie("infitite loop in format $format") if ($i > ($self->{'max_mfn'} \|\| 9999));
408
409	# FIXME remove for speedup?
410	$log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
411
412	if (utf8::is_utf8($format)) {
413	$format = $self->_x($format);
414	}
415
416	my $found = 0;
417
418	my $eval_code;
419	# remove eval{...} from beginning
420	$eval_code = $1 if ($format =~ s/^eval{([^}]+)}//s);
421
422	my $filter_name;
423	# remove filter{...} from beginning
424	$filter_name = $1 if ($format =~ s/^filter{([^}]+)}//s);
425
426	# do actual replacement of placeholders
427	# repeatable fields
428	$format =~ s/v(\d+)(?:\^(\w))?/$self->get_data(\$rec,$1,$2,$i,\$found)/ges;
429	# non-repeatable fields
430	$format =~ s/s(\d+)(?:\^(\w))?/$self->get_data(\$rec,$1,$2,0,\$found)/ges;
431
432	if ($found) {
433	$log->debug("format: $format");
434	if ($eval_code) {
435	my $eval = $self->fill_in($rec,$eval_code,$i);
436	return if (! $self->_eval($eval));
437	}
438	if ($filter_name && $self->{'filter'}->{$filter_name}) {
439	$log->debug("filter '$filter_name' for $format");
440	$format = $self->{'filter'}->{$filter_name}->($format);
441	return unless(defined($format));
442	$log->debug("filter result: $format");
443	}
444	# do we have lookups?
445	if ($self->{'lookup'}) {
446	return $self->lookup($format);
447	} else {
448	return $format;
449	}
450	} else {
451	return;
452	}
453	}
454
455
456	=head2 fill_in_to_arr
457
458	Similar to C<fill_in>, but returns array of all repeatable fields. Usable
459	for fields which have lookups, so they shouldn't be parsed but rather
460	C<fill_id>ed.
461
462	my @arr = $webpac->fill_in_to_arr($rec,'[v900];;[v250^a]');
463
464	=cut
465
466	sub fill_in_to_arr {
467	my $self = shift;
468
469	my ($rec, $format_utf8) = @_;
470
471	my $log = $self->_get_logger();
472
473	$log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
474	return if (! $format_utf8);
475
476	my $i = 0;
477	my @arr;
478
479	while (my @v = $self->fill_in($rec,$format_utf8,$i++)) {
480	push @arr, @v;
481	}
482
483	$log->debug("format '$format_utf8' returned ",--$i," elements: ", sub { join(" \| ",@arr) }) if (@arr);
484
485	return @arr;
486	}
487
488
489	=head2 get_data
490
491	Returns value from record.
492
493	my $text = $self->get_data(\$rec,$f,$sf,$i,\$found);
494
495	Arguments are:
496	record reference C<$rec>,
497	field C<$f>,
498	optional subfiled C<$sf>,
499	index for repeatable values C<$i>.
500
501	Optinal variable C<$found> will be incremeted if there
502	is field.
503
504	Returns value or empty string.
505
506	=cut
507
508	sub get_data {
509	my $self = shift;
510
511	my ($rec,$f,$sf,$i,$found) = @_;
512
513	if ($$rec->{$f}) {
514	return '' if (! $$rec->{$f}->[$i]);
515	no strict 'refs';
516	if ($sf && $$rec->{$f}->[$i]->{$sf}) {
517	$$found++ if (defined($$found));
518	return $$rec->{$f}->[$i]->{$sf};
519	} elsif ($$rec->{$f}->[$i]) {
520	$$found++ if (defined($$found));
521	# it still might have subfield, just
522	# not specified, so we'll dump all
523	if ($$rec->{$f}->[$i] =~ /HASH/o) {
524	my $out;
525	foreach my $k (keys %{$$rec->{$f}->[$i]}) {
526	$out .= $$rec->{$f}->[$i]->{$k}." ";
527	}
528	return $out;
529	} else {
530	return $$rec->{$f}->[$i];
531	}
532	}
533	} else {
534	return '';
535	}
536	}
537
538
539	=head2 apply_format
540
541	Apply format specified in tag with C<format_name="name"> and
542	C<format_delimiter=";;">.
543
544	my $text = $webpac->apply_format($format_name,$format_delimiter,$data);
545
546	Formats can contain C<lookup{...}> if you need them.
547
548	=cut
549
550	sub apply_format {
551	my $self = shift;
552
553	my ($name,$delimiter,$data) = @_;
554
555	my $log = $self->_get_logger();
556
557	if (! $self->{'import_xml'}->{'format'}->{$name}) {
558	$log->warn("<format name=\"$name\"> is not defined in ",$self->{'import_xml_file'});
559	return $data;
560	}
561
562	$log->warn("no delimiter for format $name") if (! $delimiter);
563
564	my $format = $self->_x($self->{'import_xml'}->{'format'}->{$name}->{'content'}) \|\| $log->logdie("can't find format '$name'");
565
566	my @data = split(/\Q$delimiter\E/, $data);
567
568	my $out = sprintf($format, @data);
569	$log->debug("using format $name [$format] on $data to produce: $out");
570
571	if ($self->{'lookup_regex'} && $out =~ $self->{'lookup_regex'}) {
572	return $self->lookup($out);
573	} else {
574	return $out;
575	}
576
577	}
578
579	=head2 sort_arr
580
581	Sort array ignoring case and html in data
582
583	my @sorted = $webpac->sort_arr(@unsorted);
584
585	=cut
586
587	sub sort_arr {
588	my $self = shift;
589
590	my $log = $self->_get_logger();
591
592	# FIXME add Schwartzian Transformation?
593
594	my @sorted = sort {
595	$a =~ s#<[^>]+/*>##;
596	$b =~ s#<[^>]+/*>##;
597	lc($b) cmp lc($a)
598	} @_;
599	$log->debug("sorted values: ",sub { join(", ",@sorted) });
600
601	return @sorted;
602	}
603
604
605	=head1 INTERNAL METHODS
606
607	=head2 _sort_by_order
608
609	Sort xml tags data structure accoding to C<order=""> attribute.
610
611	=cut
612
613	sub _sort_by_order {
614	my $self = shift;
615
616	my $va = $self->{'import_xml'}->{'indexer'}->{$a}->{'order'} \|\|
617	$self->{'import_xml'}->{'indexer'}->{$a};
618	my $vb = $self->{'import_xml'}->{'indexer'}->{$b}->{'order'} \|\|
619	$self->{'import_xml'}->{'indexer'}->{$b};
620
621	return $va <=> $vb;
622	}
623
624	=head2 _x
625
626	Convert strings from C<conf/normalize/*.xml> encoding into application
627	specific encoding (optinally specified using C<code_page> to C<new>
628	constructor).
629
630	my $text = $n->_x('normalize text string');
631
632	This is a stub so that other modules doesn't have to implement it.
633
634	=cut
635
636	sub _x {
637	my $self = shift;
638	return shift;
639	}
640
641
642	=head1 AUTHOR
643
644	Dobrica Pavlinusic, C<< <dpavlin@rot13.org> >>
645
646	=head1 COPYRIGHT & LICENSE
647
648	Copyright 2005 Dobrica Pavlinusic, All Rights Reserved.
649
650	This program is free software; you can redistribute it and/or modify it
651	under the same terms as Perl itself.
652
653	=cut
654
655	1; # End of WebPAC::DB