# lib/WebPAC/Normalize.pm

package WebPAC::Normalize;

use warnings;
use strict;
use Data::Dumper;

=head1 NAME

WebPAC::Normalize - data munging for normalisation

=head1 VERSION

Version 0.01

=cut

our $VERSION = '0.01';

=head1 SYNOPSIS

This package contains code that munges data to produce a normalized format.

It contains several assumptions:

=over

=item *

format of fields is defined using C<v123^a> notation for repeatable fields
or C<s123^a> for single (or first) value, where C<123> is field number and
C<a> is subfield.

=item *

source data records (C<$rec>) have unique identifiers in field C<000>

=item *

optional C<eval{length('v123^a') == 3}> tag at B<beginning of format> will be
perl code that is evaluated before producing output (value of field will be
interpolated before that)

=item *

optional C<filter{filter_name}> at B<beginning of format> will apply perl
code defined as code ref to format after field substitution, to produce
output

=item *

optional C<lookup{...}> will be then performed. See C<WebPAC::Lookups>.

=item *

at end, optional C<format>s rules are resolved. Format rules are similar to
C<sprintf> and can also contain C<lookup{...}> which is performed after
values are inserted in format.

=back

This also describes order in which transformations are applied (eval,
filter, lookup, format) which is important to understand when deciding how to
solve your data munging and normalisation process.


=head1 FUNCTIONS

=head2 new

Create new normalisation object

  my $n = WebPAC::Normalize::Something->new(
        filter => {
                'filter_name_1' => sub {
                        # filter code, value to filter is first argument
                        return length($_[0]);
                }, ...
        },
        db => $webpac_db_obj,
        lookup_regex => $lookup->regex,
  );

Parameter C<filter> defines user supplied snippets of perl code which can
be used with C<filter{...}> notation. Each snippet is called with the value
which is being filtered as its only argument.

Optional parameter C<db> is an object which C<data_structure> asks for
already normalised record (C<get_ds>) before doing any work.

Recommended parameter C<lookup_regex> is used to enable parsing of lookups
in structures.

All named parameters are stored in the object as-is.

Returns blessed object.

=cut

sub new {
        my $class = shift;

        # every named argument becomes attribute of the object
        my $self = {@_};
        bless($self, $class);

        return $self;
}


=head2 data_structure

Create in-memory data structure which represents normalized layout from
C<conf/normalize/*.xml>.

These structures are used to produce output.

 my @ds = $webpac->data_structure($rec);

Argument is record (hash reference). Return value is list of hash
references, one for each field which produced some output, with keys
C<tag> (field name), C<name> (name from configuration or field name) and
one array of values for each output type (C<display> and C<swish> unless
tag overrides it using C<type> attribute).

If C<db> object was given to constructor, it is asked for cached structure
first (C<get_ds>) and, if it returns anything, that is returned without any
further processing.

B<Note: historical oddity follows>

This method will also set C<< $webpac->{'current_filename'} >> if there is
C<< <filename> >> tag and C<< $webpac->{'headline'} >> if there is
C<< <headline> >> tag. Both are reset for each record which is not served
from C<db>. Headline is not returned as part of data structure.

=cut

sub data_structure {
        my $self = shift;

        my $log = $self->_get_logger();

        my $rec = shift;
        $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);

        # structure stored in db wins over normalisation
        if ($self->{'db'}) {
                my @ds = $self->{'db'}->get_ds($rec);
                return @ds if (@ds);
        }

        # forget values left from previous record (same keys which are
        # filled in below, so stale filename can't leak into this record)
        undef $self->{'current_filename'};
        undef $self->{'headline'};

        # order of tags is calculated once and kept in object
        my @sorted_tags;
        if ($self->{tags_by_order}) {
                @sorted_tags = @{$self->{tags_by_order}};
        } else {
                @sorted_tags = sort { $self->_sort_by_order } keys %{$self->{'import_xml'}->{'indexer'}};
                $self->{tags_by_order} = \@sorted_tags;
        }

        my @ds;

        $log->debug("tags: ",sub { join(", ",@sorted_tags) });

        foreach my $field (@sorted_tags) {

                # stays undef if none of tags produce any value
                my $row;

#print "field $field [",$self->{'tag'},"] = ",Dumper($self->{'import_xml'}->{'indexer'}->{$field}->{$self->{'tag'}});

                foreach my $tag (@{$self->{'import_xml'}->{'indexer'}->{$field}->{$self->{'tag'}}}) {
                        my $format = $tag->{'value'} || $tag->{'content'};

                        # tag without format can't produce any value
                        next if (! $format);

                        $log->debug("format: $format");

                        # formats with lookups are filled in, others are parsed
                        my @v;
                        if ($self->{'lookup_regex'} && $format =~ $self->{'lookup_regex'}) {
                                @v = $self->fill_in_to_arr($rec,$format);
                        } else {
                                @v = $self->parse_to_arr($rec,$format);
                        }
                        next if (! @v);

                        if ($tag->{'sort'}) {
                                @v = $self->sort_arr(@v);
                        }

                        # use format?
                        if ($tag->{'format_name'}) {
                                @v = map { $self->apply_format($tag->{'format_name'},$tag->{'format_delimiter'},$_) } @v;
                        }

                        if ($field eq 'filename') {
                                $self->{'current_filename'} = join('',@v);
                                $log->debug("filename: ",$self->{'current_filename'});
                        } elsif ($field eq 'headline') {
                                $self->{'headline'} .= join('',@v);
                                $log->debug("headline: ",$self->{'headline'});
                                next; # don't return headline in data_structure!
                        }

                        # delimiter will join repeatable fields
                        if ($tag->{'delimiter'}) {
                                @v = ( join($tag->{'delimiter'}, @v) );
                        }

                        # default types 
                        my @types = qw(display swish);
                        # override by type attribute
                        @types = ( $tag->{'type'} ) if ($tag->{'type'});

                        foreach my $type (@types) {
                                # append to previous line?
                                $log->debug("type: $type ",sub { join(" ",@v) }, $tag->{'append'} || 'no append');
                                if ($tag->{'append'}) {

                                        # I will delimit appended part with
                                        # delimiter (or space)
                                        my $d = $tag->{'delimiter'};
                                        # default delimiter
                                        $d ||= " ";

                                        my $last = pop @{$row->{$type}};
                                        # nothing to append to, so no delimiter
                                        $d = "" if (! $last);
                                        $last .= $d . join($d, @v);
                                        push @{$row->{$type}}, $last;

                                } else {
                                        push @{$row->{$type}}, @v;
                                }
                        }


                }

                if ($row) {
                        $row->{'tag'} = $field;

                        # TODO: name_sigular, name_plural
                        my $name = $self->{'import_xml'}->{'indexer'}->{$field}->{'name'};
                        $row->{'name'} = $name ? $self->_x($name) : $field;

                        # post-sort all values in field
                        if ($self->{'import_xml'}->{'indexer'}->{$field}->{'sort'}) {
                                $log->warn("sort at field tag not implemented");
                        }

                        push @ds, $row;

                        $log->debug("row $field: ",sub { Dumper($row) });
                }

        }

        # NOTE(review): structure is read using get_ds() but stored using
        # put_gs() -- looks like a typo, confirm method name in db class
        $self->{'db'}->put_gs(
                ds => \@ds,
                current_filename => $self->{'current_filename'},
                headline => $self->{'headline'},
        ) if ($self->{'db'});

        return @ds;

}

=head2 parse

Perform smart parsing of string, skipping delimiters for fields which aren't
defined. It can also eval code in format starting with C<eval{...}> and
return output or nothing depending on eval code.

 my $text = $webpac->parse($rec,'eval{"v901^a" eq "Deskriptor"}descriptor: v250^a', $i);

Arguments are record (hash reference), format and optional index of
repeatable value (defaults to 0). Fields written as C<s123> always use
first value.

Returns nothing if format is empty, if none of fields in format exist, if
C<eval{...}> code is false or if filter returns undefined value.

Since formats which have only C<s123> fields produce output for every
index, this method will die if index is bigger than C<max_mfn> (or 9999),
just like C<fill_in> does.

=cut

sub parse {
        my $self = shift;

        my ($rec, $format_utf8, $i) = @_;

        return if (! $format_utf8);

        my $log = $self->_get_logger();

        $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);

        $i = 0 if (! $i);

        # s123 fields are found for every index, so callers which loop
        # until nothing is returned would never stop without this
        $log->logdie("infinite loop in format $format_utf8") if ($i > ($self->{'max_mfn'} || 9999));

        my $format = $self->_x($format_utf8) || $log->logconfess("can't convert '$format_utf8' from UTF-8 to ",$self->{'code_page'});

        my @out;

        $log->debug("format: $format");

        my $eval_code;
        # remove eval{...} from beginning
        $eval_code = $1 if ($format =~ s/^eval{([^}]+)}//s);

        my $filter_name;
        # remove filter{...} from beginning
        $filter_name = $1 if ($format =~ s/^filter{([^}]+)}//s);

        my $prefix;
        my $all_found=0;

        # eat format from beginning: delimiter followed by one field
        while ($format =~ s/^(.*?)(v|s)(\d+)(?:\^(\w))?//s) {

                my $del = $1 || '';
                # first non-empty delimiter seen before any field is found
                $prefix ||= $del if ($all_found == 0);

                # repeatable index
                my $r = $i;
                $r = 0 if (lc("$2") eq 's');

                my $found = 0;
                my $tmp = $self->get_data(\$rec,$3,$4,$r,\$found);

                # delimiter is kept only if field after it exists
                if ($found) {
                        push @out, $del;
                        push @out, $tmp;
                        $all_found += $found;
                }
        }

        return if (! $all_found);

        my $out = join('',@out);

        if ($out) {
                # add rest of format (suffix)
                $out .= $format;

                # add prefix if not there
                $out = $prefix . $out if ($out !~ m/^\Q$prefix\E/);

                $log->debug("result: $out");
        }

        if ($eval_code) {
                my $eval = $self->fill_in($rec,$eval_code,$i) || return;
                $log->debug("about to eval{$eval} format: $out");
                return if (! $self->_eval($eval));
        }
        
        # filters which are not defined are silently ignored
        if ($filter_name && $self->{'filter'}->{$filter_name}) {
                $log->debug("about to filter{$filter_name} format: $out");
                $out = $self->{'filter'}->{$filter_name}->($out);
                return unless(defined($out));
                $log->debug("filter result: $out");
        }

        return $out;
}

=head2 parse_to_arr

Like C<parse>, but collects values for all repeatable fields into array

 my @arr = $webpac->parse_to_arr($rec,'v250^a');

C<parse> is called with index 0, 1, 2... until it returns false value.
Returns nothing if format is empty.

=cut

sub parse_to_arr {
        my ($self, $rec, $format_utf8) = @_;

        my $log = $self->_get_logger();

        if ($rec !~ /HASH/o) {
                $log->logconfess("need HASH as first argument!");
        }
        return unless ($format_utf8);

        my @arr;
        my $i = 0;

        # one call for each repeatable index, first false value ends it
        while (1) {
                my $v = $self->parse($rec, $format_utf8, $i++);
                last unless ($v);
                push @arr, $v;
        }

        if (@arr) {
                # $i was incremented for failed call too, so take it back
                $log->debug("format '$format_utf8' returned ",--$i," elements: ", sub { join(" | ",@arr) });
        }

        return @arr;
}


=head2 fill_in

Workhorse of all: takes record from in-memory structure of database and
string with placeholders and returns string (or whatever C<lookup> returns
if object has C<lookup>) with substituted values from record.

 my $text = $webpac->fill_in($rec,'v250^a');

Optional argument is ordinal number for repeatable fields. By default,
it's assumed to be first repeatable field (fields are perl array, so first
element is 0).
Following example will read second value from repeatable field.

 my $text = $webpac->fill_in($rec,'Title: v250^a',1);

This function B<does not> perform parsing of format to intelligently skip
delimiters before fields which aren't used.

Format can begin with C<eval{...}> and/or C<filter{...}> which are applied
after substitution. Returns nothing if none of fields exist in record, if
eval code is false or if filter returns undefined value.

Values which are inserted into format are never examined for placeholders
again, so data which looks like C<s123> is left as it is.

This method will automatically decode UTF-8 string to local code page
if needed.

=cut

sub fill_in {
        my $self = shift;

        my $log = $self->_get_logger();

        my $rec = shift || $log->logconfess("need data record");
        my $format = shift || $log->logconfess("need format to parse");
        # iteration (for repeatable fields)
        my $i = shift || 0;

        # formats with only s123 fields are found for every iteration
        $log->logdie("infitite loop in format $format") if ($i > ($self->{'max_mfn'} || 9999));

        # FIXME remove for speedup?
        $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);

        if (utf8::is_utf8($format)) {
                $format = $self->_x($format);
        }

        my $found = 0;

        my $eval_code;
        # remove eval{...} from beginning
        $eval_code = $1 if ($format =~ s/^eval{([^}]+)}//s);

        my $filter_name;
        # remove filter{...} from beginning
        $filter_name = $1 if ($format =~ s/^filter{([^}]+)}//s);

        # do actual replacement of placeholders in single pass, so that
        # values which are already inserted can't be taken for placeholders
        $format =~ s{(v|s)(\d+)(?:\^(\w))?}{
                my ($type, $f, $sf) = ($1, $2, $3);
                # repeatable fields use iteration, non-repeatable first value
                $self->get_data(\$rec, $f, $sf, ($type eq 'v' ? $i : 0), \$found);
        }ges;

        if ($found) {
                $log->debug("format: $format");
                if ($eval_code) {
                        my $eval = $self->fill_in($rec,$eval_code,$i);
                        return if (! $self->_eval($eval));
                }
                # filters which are not defined are silently ignored
                if ($filter_name && $self->{'filter'}->{$filter_name}) {
                        $log->debug("filter '$filter_name' for $format");
                        $format = $self->{'filter'}->{$filter_name}->($format);
                        return unless(defined($format));
                        $log->debug("filter result: $format");
                }
                # do we have lookups?
                if ($self->{'lookup'}) {
                        return $self->lookup($format);
                } else {
                        return $format;
                }
        } else {
                return;
        }
}


=head2 fill_in_to_arr

Like C<fill_in>, but collects values for all repeatable fields into array.
Usable for fields which have lookups, so they shouldn't be parsed but
rather C<fill_in>ed.

 my @arr = $webpac->fill_in_to_arr($rec,'[v900];;[v250^a]');

C<fill_in> is called with index 0, 1, 2... until it returns nothing.
Returns nothing if format is empty.

=cut

sub fill_in_to_arr {
        my ($self, $rec, $format_utf8) = @_;

        my $log = $self->_get_logger();

        if ($rec !~ /HASH/o) {
                $log->logconfess("need HASH as first argument!");
        }
        return unless ($format_utf8);

        my @arr;
        my $i = 0;

        # one call for each repeatable index, empty list ends it
        while (1) {
                my @v = $self->fill_in($rec, $format_utf8, $i++);
                last if (! @v);
                push @arr, @v;
        }

        if (@arr) {
                # $i was incremented for failed call too, so take it back
                $log->debug("format '$format_utf8' returned ",--$i," elements: ", sub { join(" | ",@arr) });
        }

        return @arr;
}


=head2 get_data

Returns value from record.

 my $text = $self->get_data(\$rec,$f,$sf,$i,\$found);

Arguments are:
reference to record C<$rec> (which is hash reference itself),
field C<$f>,
optional subfield C<$sf>,
index for repeatable values C<$i>.

Optional variable C<$found> (passed by reference) will be incremented if
there is a value.

If value has subfields and C<$sf> is specified, just that subfield is
returned (empty string, without incrementing C<$found>, if it's missing
or false). If C<$sf> isn't specified all subfields are returned, sorted by
subfield name and each followed by space. Values without subfields are
returned whole, even if C<$sf> is specified.

Returns value or empty string.

=cut

sub get_data {
        my $self = shift;

        my ($rec,$f,$sf,$i,$found) = @_;

        # no such field or no such repeatable value
        return '' if (! $$rec->{$f});
        my $val = $$rec->{$f}->[$i];
        return '' if (! $val);

        # callers which don't care don't have to pass counter
        my $count_it = (ref($found) && defined($$found)) ? 1 : 0;

        # ref() check first, so that plain strings are never dereferenced
        if (ref($val) && $val =~ /HASH/o) {

                if ($sf) {
                        my $sub = $val->{$sf};
                        # missing subfield is not a hit
                        return '' if (! $sub);
                        $$found++ if ($count_it);
                        return $sub;
                }

                $$found++ if ($count_it);

                # it has subfields, just not specified, so we'll dump
                # all of them (sorted, so that output is always the same)
                my $out = '';
                foreach my $k (sort keys %{$val}) {
                        $out .= $val->{$k}." ";
                }
                return $out;
        }

        # value without subfields
        $$found++ if ($count_it);
        return $val;
}


=head2 apply_format

Apply format specified in tag with C<format_name="name"> and
C<format_delimiter=";;">.

 my $text = $webpac->apply_format($format_name,$format_delimiter,$data);

Data is split on delimiter and parts are inserted into format using
C<sprintf>. If there is no delimiter, warning is logged and whole data is
used as single value.

If format with that name isn't defined, warning is logged and data is
returned unchanged.

Formats can contain C<lookup{...}> if you need them.

=cut

sub apply_format {
        my $self = shift;

        my ($name,$delimiter,$data) = @_;

        my $log = $self->_get_logger();

        if (! $self->{'import_xml'}->{'format'}->{$name}) {
                $log->warn("<format name=\"$name\"> is not defined in ",$self->{'import_xml_file'});
                return $data;
        }

        $log->warn("no delimiter for format $name") if (! $delimiter);

        my $format = $self->_x($self->{'import_xml'}->{'format'}->{$name}->{'content'}) || $log->logdie("can't find format '$name'");

        # split with empty pattern would return single characters, so
        # without delimiter data is taken as one value
        my @data = (defined($delimiter) && length($delimiter))
                ? split(/\Q$delimiter\E/, $data)
                : ($data);

        my $out = sprintf($format, @data);
        $log->debug("using format $name [$format] on $data to produce: $out");

        # lookups are resolved after values are inserted in format
        if ($self->{'lookup_regex'} && $out =~ $self->{'lookup_regex'}) {
                return $self->lookup($out);
        } else {
                return $out;
        }

}

=head2 sort_arr

Sort array ignoring case and html in data

 my @sorted = $webpac->sort_arr(@unsorted);

Values are compared as lower case strings without html tags and returned
in descending order. Returned values are the same as the ones passed in
(html and case are ignored just for comparison) and arguments are not
modified.

=cut

sub sort_arr {
        my $self = shift;

        my $log = $self->_get_logger();

        # sort key is calculated once for each value, on a copy: $a and $b
        # in sort block are aliases, so changing them would change
        # caller's data
        my @sorted =
                map { $_->[1] }
                sort { $b->[0] cmp $a->[0] }
                map {
                        my $key = $_;
                        $key =~ s#<[^>]+/*>##g;
                        [ lc($key), $_ ];
                } @_;
        $log->debug("sorted values: ",sub { join(", ",@sorted) });

        return @sorted;
}


=head1 INTERNAL METHODS

=head2 _sort_by_order

Sort xml tags data structure according to C<order=""> attribute.

It's used as comparison in C<sort> block, so it compares tags which are
in C<$a> and C<$b>. Tags with C<order> come first (by numeric value),
followed by tags without it. Tag name is used when order doesn't decide.

=cut

sub _sort_by_order {
        my $self = shift;

        my $indexer = $self->{'import_xml'}->{'indexer'};

        my $oa = $indexer->{$a}->{'order'};
        my $ob = $indexer->{$b}->{'order'};

        # order="0" is valid order, just missing or empty one is not
        my $has_a = (defined($oa) && length($oa)) ? 1 : 0;
        my $has_b = (defined($ob) && length($ob)) ? 1 : 0;

        if ($has_a && $has_b) {
                return ($oa <=> $ob) || ($a cmp $b);
        }

        # tags without order go after all which have it
        return -1 if ($has_a);
        return 1 if ($has_b);

        return $a cmp $b;
}

=head2 _x

Convert strings from C<conf/normalize/*.xml> encoding into application
specific encoding (optionally specified using C<code_page> to C<new>
constructor).

 my $text = $n->_x('normalize text string');

This is a stub which returns its argument unchanged, so that other modules
don't have to implement it.

=cut

sub _x {
        my ($self, $text) = @_;

        # no conversion here
        return $text;
}


=head1 AUTHOR

Dobrica Pavlinusic, C<< <dpavlin@rot13.org> >>

=head1 COPYRIGHT & LICENSE

Copyright 2005 Dobrica Pavlinusic, All Rights Reserved.

This program is free software; you can redistribute it and/or modify it
under the same terms as Perl itself.

=cut

1; # End of WebPAC::Normalize
1	dpavlin	10	package WebPAC::Normalize;
2
3			use warnings;
4			use strict;
5	dpavlin	13	use Data::Dumper;
6	dpavlin	10
7			=head1 NAME
8
9	dpavlin	15	WebPAC::Normalize - data mungling for normalisation
10	dpavlin	10
11			=head1 VERSION
12
13			Version 0.01
14
15			=cut
16
17			our $VERSION = '0.01';
18
19			=head1 SYNOPSIS
20
21	dpavlin	15	This package contains code that mungle data to produce normalized format.
22	dpavlin	10
23	dpavlin	15	It contains several assumptions:
24
25			=over
26
27			=item *
28
29			format of fields is defined using C<v123^a> notation for repeatable fields
30			or C<s123^a> for single (or first) value, where C<123> is field number and
31			C<a> is subfield.
32
33			=item *
34
35			source data records (C<$rec>) have unique identifiers in field C<000>
36
37			=item *
38
39			optional C<eval{length('v123^a') == 3}> tag at B<beginning of format> will be
40			perl code that is evaluated before producing output (value of field will be
41			interpolated before that)
42
43			=item *
44
45			optional C<filter{filter_name}> at B<begining of format> will apply perl
46			code defined as code ref on format after field substitution to producing
47			output
48
49			=item *
50
51			optional C<lookup{...}> will be then performed. See C<WebPAC::Lookups>.
52
53			=item *
54
55			at end, optional C<format>s rules are resolved. Format rules are similar to
56			C<sprintf> and can also contain C<lookup{...}> which is performed after
57			values are inserted in format.
58
59			=back
60
61			This also describes order in which transformations are applied (eval,
62			filter, lookup, format) which is important to undestand when deciding how to
63			solve your data mungling and normalisation process.
64
65
66
67
68	dpavlin	10	=head1 FUNCTIONS
69
70	dpavlin	13	=head2 new
71	dpavlin	10
72	dpavlin	13	Create new normalisation object
73
74			my $n = new WebPAC::Normalize::Something(
75	dpavlin	15	filter => {
76			'filter_name_1' => sub {
77			# filter code
78			return length($_);
79			}, ...
80			},
81	dpavlin	18	db => $webpac_db_obj,
82	dpavlin	13	lookup_regex => $lookup->regex,
83			);
84
85	dpavlin	15	Parametar C<filter> defines user supplied snippets of perl code which can
86			be use with C<filter{...}> notation.
87
88	dpavlin	13	Recommended parametar C<lookup_regex> is used to enable parsing of lookups
89			in structures.
90
91	dpavlin	10	=cut
92
93	dpavlin	13	sub new {
94			my $class = shift;
95			my $self = {@_};
96			bless($self, $class);
97
98			$self ? return $self : return undef;
99	dpavlin	10	}
100
101	dpavlin	13
102			=head2 data_structure
103
104			Create in-memory data structure which represents normalized layout from
105			C<conf/normalize/*.xml>.
106
107			This structures are used to produce output.
108
109			my @ds = $webpac->data_structure($rec);
110
111			B<Note: historical oddity follows>
112
113			This method will also set C<< $webpac->{'currnet_filename'} >> if there is
114			C<< <filename> >> tag and C<< $webpac->{'headline'} >> if there is
115			C<< <headline> >> tag.
116
117			=cut
118
119			sub data_structure {
120			my $self = shift;
121
122			my $log = $self->_get_logger();
123
124			my $rec = shift;
125			$log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
126
127			my $cache_file;
128
129	dpavlin	18	if ($self->{'db'}) {
130			my @ds = $self->{'db'}->get_ds($rec);
131			return @ds if (@ds);
132	dpavlin	13	}
133
134			undef $self->{'currnet_filename'};
135			undef $self->{'headline'};
136
137			my @sorted_tags;
138			if ($self->{tags_by_order}) {
139			@sorted_tags = @{$self->{tags_by_order}};
140			} else {
141			@sorted_tags = sort { $self->_sort_by_order } keys %{$self->{'import_xml'}->{'indexer'}};
142			$self->{tags_by_order} = \@sorted_tags;
143			}
144
145			my @ds;
146
147			$log->debug("tags: ",sub { join(", ",@sorted_tags) });
148
149			foreach my $field (@sorted_tags) {
150
151			my $row;
152
153			#print "field $field [",$self->{'tag'},"] = ",Dumper($self->{'import_xml'}->{'indexer'}->{$field}->{$self->{'tag'}});
154
155			foreach my $tag (@{$self->{'import_xml'}->{'indexer'}->{$field}->{$self->{'tag'}}}) {
156			my $format = $tag->{'value'} \|\| $tag->{'content'};
157
158			$log->debug("format: $format");
159
160			my @v;
161			if ($self->{'lookup_regex'} && $format =~ $self->{'lookup_regex'}) {
162			@v = $self->fill_in_to_arr($rec,$format);
163			} else {
164			@v = $self->parse_to_arr($rec,$format);
165			}
166			next if (! @v);
167
168			if ($tag->{'sort'}) {
169			@v = $self->sort_arr(@v);
170			}
171
172			# use format?
173			if ($tag->{'format_name'}) {
174			@v = map { $self->apply_format($tag->{'format_name'},$tag->{'format_delimiter'},$_) } @v;
175			}
176
177			if ($field eq 'filename') {
178			$self->{'current_filename'} = join('',@v);
179			$log->debug("filename: ",$self->{'current_filename'});
180			} elsif ($field eq 'headline') {
181			$self->{'headline'} .= join('',@v);
182			$log->debug("headline: ",$self->{'headline'});
183			next; # don't return headline in data_structure!
184			}
185
186			# delimiter will join repeatable fields
187			if ($tag->{'delimiter'}) {
188			@v = ( join($tag->{'delimiter'}, @v) );
189			}
190
191			# default types
192			my @types = qw(display swish);
193			# override by type attribute
194			@types = ( $tag->{'type'} ) if ($tag->{'type'});
195
196			foreach my $type (@types) {
197			# append to previous line?
198			$log->debug("type: $type ",sub { join(" ",@v) }, $row->{'append'} \|\| 'no append');
199			if ($tag->{'append'}) {
200
201			# I will delimit appended part with
202			# delimiter (or ,)
203			my $d = $tag->{'delimiter'};
204			# default delimiter
205			$d \|\|= " ";
206
207			my $last = pop @{$row->{$type}};
208			$d = "" if (! $last);
209			$last .= $d . join($d, @v);
210			push @{$row->{$type}}, $last;
211
212			} else {
213			push @{$row->{$type}}, @v;
214			}
215			}
216
217
218			}
219
220			if ($row) {
221			$row->{'tag'} = $field;
222
223			# TODO: name_sigular, name_plural
224			my $name = $self->{'import_xml'}->{'indexer'}->{$field}->{'name'};
225			$row->{'name'} = $name ? $self->_x($name) : $field;
226
227			# post-sort all values in field
228			if ($self->{'import_xml'}->{'indexer'}->{$field}->{'sort'}) {
229			$log->warn("sort at field tag not implemented");
230			}
231
232			push @ds, $row;
233
234			$log->debug("row $field: ",sub { Dumper($row) });
235			}
236
237			}
238
239	dpavlin	18	$self->{'db'}->put_gs(
240			ds => \@ds,
241			current_filename => $self->{'current_filename'},
242			headline => $self->{'headline'},
243			) if ($self->{'db'});
244	dpavlin	13
245			return @ds;
246
247			}
248
249			=head2 parse
250
251			Perform smart parsing of string, skipping delimiters for fields which aren't
252			defined. It can also eval code in format starting with C<eval{...}> and
253			return output or nothing depending on eval code.
254
255			my $text = $webpac->parse($rec,'eval{"v901^a" eq "Deskriptor"}descriptor: v250^a', $i);
256
257			=cut
258
259			sub parse {
260			my $self = shift;
261
262			my ($rec, $format_utf8, $i) = @_;
263
264			return if (! $format_utf8);
265
266			my $log = $self->_get_logger();
267
268			$log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
269
270			$i = 0 if (! $i);
271
272			my $format = $self->_x($format_utf8) \|\| $log->logconfess("can't convert '$format_utf8' from UTF-8 to ",$self->{'code_page'});
273
274			my @out;
275
276			$log->debug("format: $format");
277
278			my $eval_code;
279			# remove eval{...} from beginning
280			$eval_code = $1 if ($format =~ s/^eval{([^}]+)}//s);
281
282			my $filter_name;
283			# remove filter{...} from beginning
284			$filter_name = $1 if ($format =~ s/^filter{([^}]+)}//s);
285
286			my $prefix;
287			my $all_found=0;
288
289			while ($format =~ s/^(.*?)(v\|s)(\d+)(?:\^(\w))?//s) {
290
291			my $del = $1 \|\| '';
292			$prefix \|\|= $del if ($all_found == 0);
293
294			# repeatable index
295			my $r = $i;
296			$r = 0 if (lc("$2") eq 's');
297
298			my $found = 0;
299			my $tmp = $self->get_data(\$rec,$3,$4,$r,\$found);
300
301			if ($found) {
302			push @out, $del;
303			push @out, $tmp;
304			$all_found += $found;
305			}
306			}
307
308			return if (! $all_found);
309
310			my $out = join('',@out);
311
312			if ($out) {
313			# add rest of format (suffix)
314			$out .= $format;
315
316			# add prefix if not there
317			$out = $prefix . $out if ($out !~ m/^\Q$prefix\E/);
318
319			$log->debug("result: $out");
320			}
321
322			if ($eval_code) {
323			my $eval = $self->fill_in($rec,$eval_code,$i) \|\| return;
324			$log->debug("about to eval{$eval} format: $out");
325			return if (! $self->_eval($eval));
326			}
327
328			if ($filter_name && $self->{'filter'}->{$filter_name}) {
329			$log->debug("about to filter{$filter_name} format: $out");
330			$out = $self->{'filter'}->{$filter_name}->($out);
331			return unless(defined($out));
332			$log->debug("filter result: $out");
333			}
334
335			return $out;
336			}
337
338			=head2 parse_to_arr
339
340			Similar to C<parse>, but returns array of all repeatable fields
341
342			my @arr = $webpac->parse_to_arr($rec,'v250^a');
343
344			=cut
345
346			sub parse_to_arr {
347			my $self = shift;
348
349			my ($rec, $format_utf8) = @_;
350
351			my $log = $self->_get_logger();
352
353			$log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
354			return if (! $format_utf8);
355
356			my $i = 0;
357			my @arr;
358
359			while (my $v = $self->parse($rec,$format_utf8,$i++)) {
360			push @arr, $v;
361			}
362
363			$log->debug("format '$format_utf8' returned ",--$i," elements: ", sub { join(" \| ",@arr) }) if (@arr);
364
365			return @arr;
366			}
367
368	dpavlin	15
369			=head2 fill_in
370
371			Workhourse of all: takes record from in-memory structure of database and
372			strings with placeholders and returns string or array of with substituted
373			values from record.
374
375			my $text = $webpac->fill_in($rec,'v250^a');
376
377			Optional argument is ordinal number for repeatable fields. By default,
378			it's assume to be first repeatable field (fields are perl array, so first
379			element is 0).
380			Following example will read second value from repeatable field.
381
382			my $text = $webpac->fill_in($rec,'Title: v250^a',1);
383
384			This function B<does not> perform parsing of format to inteligenty skip
385			delimiters before fields which aren't used.
386
387			This method will automatically decode UTF-8 string to local code page
388			if needed.
389
390			=cut
391
392			sub fill_in {
393			my $self = shift;
394
395			my $log = $self->_get_logger();
396
397			my $rec = shift \|\| $log->logconfess("need data record");
398			my $format = shift \|\| $log->logconfess("need format to parse");
399			# iteration (for repeatable fields)
400			my $i = shift \|\| 0;
401
402			$log->logdie("infitite loop in format $format") if ($i > ($self->{'max_mfn'} \|\| 9999));
403
404			# FIXME remove for speedup?
405			$log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
406
407			if (utf8::is_utf8($format)) {
408			$format = $self->_x($format);
409			}
410
411			my $found = 0;
412
413			my $eval_code;
414			# remove eval{...} from beginning
415			$eval_code = $1 if ($format =~ s/^eval{([^}]+)}//s);
416
417			my $filter_name;
418			# remove filter{...} from beginning
419			$filter_name = $1 if ($format =~ s/^filter{([^}]+)}//s);
420
421			# do actual replacement of placeholders
422			# repeatable fields
423			$format =~ s/v(\d+)(?:\^(\w))?/$self->get_data(\$rec,$1,$2,$i,\$found)/ges;
424			# non-repeatable fields
425			$format =~ s/s(\d+)(?:\^(\w))?/$self->get_data(\$rec,$1,$2,0,\$found)/ges;
426
427			if ($found) {
428			$log->debug("format: $format");
429			if ($eval_code) {
430			my $eval = $self->fill_in($rec,$eval_code,$i);
431			return if (! $self->_eval($eval));
432			}
433			if ($filter_name && $self->{'filter'}->{$filter_name}) {
434			$log->debug("filter '$filter_name' for $format");
435			$format = $self->{'filter'}->{$filter_name}->($format);
436			return unless(defined($format));
437			$log->debug("filter result: $format");
438			}
439			# do we have lookups?
440			if ($self->{'lookup'}) {
441			return $self->lookup($format);
442			} else {
443			return $format;
444			}
445			} else {
446			return;
447			}
448			}
449
450
451	dpavlin	13	=head2 fill_in_to_arr
452
453			Similar to C<fill_in>, but returns array of all repeatable fields. Usable
454			for fields which have lookups, so they shouldn't be parsed but rather
455			C<fill_id>ed.
456
457			my @arr = $webpac->fill_in_to_arr($rec,'[v900];;[v250^a]');
458
459			=cut
460
461			sub fill_in_to_arr {
462			my $self = shift;
463
464			my ($rec, $format_utf8) = @_;
465
466			my $log = $self->_get_logger();
467
468			$log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
469			return if (! $format_utf8);
470
471			my $i = 0;
472			my @arr;
473
474			while (my @v = $self->fill_in($rec,$format_utf8,$i++)) {
475			push @arr, @v;
476			}
477
478			$log->debug("format '$format_utf8' returned ",--$i," elements: ", sub { join(" \| ",@arr) }) if (@arr);
479
480			return @arr;
481			}
482
483	dpavlin	15
484			=head2 get_data
485
486			Returns value from record.
487
488			my $text = $self->get_data(\$rec,$f,$sf,$i,\$found);
489
490			Arguments are:
491			record reference C<$rec>,
492			field C<$f>,
493			optional subfiled C<$sf>,
494			index for repeatable values C<$i>.
495
496			Optinal variable C<$found> will be incremeted if there
497			is field.
498
499			Returns value or empty string.
500
501			=cut
502
503			sub get_data {
504			my $self = shift;
505
506			my ($rec,$f,$sf,$i,$found) = @_;
507
508			if ($$rec->{$f}) {
509			return '' if (! $$rec->{$f}->[$i]);
510			no strict 'refs';
511			if ($sf && $$rec->{$f}->[$i]->{$sf}) {
512			$$found++ if (defined($$found));
513			return $$rec->{$f}->[$i]->{$sf};
514			} elsif ($$rec->{$f}->[$i]) {
515			$$found++ if (defined($$found));
516			# it still might have subfield, just
517			# not specified, so we'll dump all
518			if ($$rec->{$f}->[$i] =~ /HASH/o) {
519			my $out;
520			foreach my $k (keys %{$$rec->{$f}->[$i]}) {
521			$out .= $$rec->{$f}->[$i]->{$k}." ";
522			}
523			return $out;
524			} else {
525			return $$rec->{$f}->[$i];
526			}
527			}
528			} else {
529			return '';
530			}
531			}
532
533
534			=head2 apply_format
535
536			Apply format specified in tag with C<format_name="name"> and
537			C<format_delimiter=";;">.
538
539			my $text = $webpac->apply_format($format_name,$format_delimiter,$data);
540
541			Formats can contain C<lookup{...}> if you need them.
542
543			=cut
544
545			sub apply_format {
546			my $self = shift;
547
548			my ($name,$delimiter,$data) = @_;
549
550			my $log = $self->_get_logger();
551
552			if (! $self->{'import_xml'}->{'format'}->{$name}) {
553			$log->warn("<format name=\"$name\"> is not defined in ",$self->{'import_xml_file'});
554			return $data;
555			}
556
557			$log->warn("no delimiter for format $name") if (! $delimiter);
558
559			my $format = $self->_x($self->{'import_xml'}->{'format'}->{$name}->{'content'}) \|\| $log->logdie("can't find format '$name'");
560
561			my @data = split(/\Q$delimiter\E/, $data);
562
563			my $out = sprintf($format, @data);
564			$log->debug("using format $name [$format] on $data to produce: $out");
565
566			if ($self->{'lookup_regex'} && $out =~ $self->{'lookup_regex'}) {
567			return $self->lookup($out);
568			} else {
569			return $out;
570			}
571
572			}
573
574	dpavlin	13	=head2 sort_arr
575
576			Sort array ignoring case and html in data
577
578			my @sorted = $webpac->sort_arr(@unsorted);
579
580			=cut
581
582			sub sort_arr {
583			my $self = shift;
584
585			my $log = $self->_get_logger();
586
587			# FIXME add Schwartzian Transformation?
588
589			my @sorted = sort {
590			$a =~ s#<[^>]+/*>##;
591			$b =~ s#<[^>]+/*>##;
592			lc($b) cmp lc($a)
593			} @_;
594			$log->debug("sorted values: ",sub { join(", ",@sorted) });
595
596			return @sorted;
597			}
598
599
600	dpavlin	15	=head1 INTERNAL METHODS
601
602	dpavlin	13	=head2 _sort_by_order
603
604			Sort xml tags data structure accoding to C<order=""> attribute.
605
606			=cut
607
608			sub _sort_by_order {
609			my $self = shift;
610
611			my $va = $self->{'import_xml'}->{'indexer'}->{$a}->{'order'} \|\|
612			$self->{'import_xml'}->{'indexer'}->{$a};
613			my $vb = $self->{'import_xml'}->{'indexer'}->{$b}->{'order'} \|\|
614			$self->{'import_xml'}->{'indexer'}->{$b};
615
616			return $va <=> $vb;
617			}
618
619			=head2 _x
620
621	dpavlin	15	Convert strings from C<conf/normalize/*.xml> encoding into application
622			specific encoding (optinally specified using C<code_page> to C<new>
623			constructor).
624	dpavlin	13
625			my $text = $n->_x('normalize text string');
626
627			This is a stub so that other modules doesn't have to implement it.
628
629			=cut
630
631			sub _x {
632			my $self = shift;
633			return shift;
634			}
635
636
637	dpavlin	10	=head1 AUTHOR
638
639			Dobrica Pavlinusic, C<< <dpavlin@rot13.org> >>
640
641			=head1 COPYRIGHT & LICENSE
642
643			Copyright 2005 Dobrica Pavlinusic, All Rights Reserved.
644
645			This program is free software; you can redistribute it and/or modify it
646			under the same terms as Perl itself.
647
648			=cut
649
650			1; # End of WebPAC::DB