# lib/WebPAC/Normalize.pm

package WebPAC::Normalize;

use warnings;
use strict;
use base 'WebPAC::Common';
use Data::Dumper;

=head1 NAME

WebPAC::Normalize - data munging for normalisation

=head1 VERSION

Version 0.01

=cut

our $VERSION = '0.01';

=head1 SYNOPSIS

This package contains code that munges data to produce a normalized format.

It contains several assumptions:

=over

=item *

format of fields is defined using C<v123^a> notation for repeatable fields
or C<s123^a> for single (or first) value, where C<123> is field number and
C<a> is subfield.

=item *

source data records (C<$rec>) have unique identifiers in field C<000>

=item *

optional C<eval{length('v123^a') == 3}> tag at B<beginning of format> will be
perl code that is evaluated before producing output (value of field will be
interpolated before that)

=item *

optional C<filter{filter_name}> at B<beginning of format> will apply perl
code defined as code ref on format after field substitution to produce
output

=item *

optional C<lookup{...}> will be then performed. See C<WebPAC::Lookups>.

=item *

at end, optional C<format>s rules are resolved. Format rules are similar to
C<sprintf> and can also contain C<lookup{...}> which is performed after
values are inserted in format.

=back

This also describes the order in which transformations are applied (eval,
filter, lookup, format), which is important to understand when deciding how to
solve your data munging and normalisation process.


=head1 FUNCTIONS

=head2 new

Create new normalisation object

  my $n = new WebPAC::Normalize::Something(
        filter => {
                'filter_name_1' => sub {
                        # filter code
                        return length($_);
                }, ...
        },
        db => $db_obj,
        lookup_regex => $lookup->regex,
  );

Parameter C<filter> defines user supplied snippets of perl code which can
be used with C<filter{...}> notation.

Recommended parameter C<lookup_regex> is used to enable parsing of lookups
in structures.

=cut

# Constructor: every key/value pair passed after the class name becomes an
# attribute of the new object, which is blessed into the invoking class.
sub new {
        my ($class, @attributes) = @_;

        # a blessed reference is always true, so it can be handed back directly
        return bless({ @attributes }, $class);
}


=head2 data_structure

Create in-memory data structure which represents normalized layout from
C<conf/normalize/*.xml>.

These structures are used to produce output.

 my @ds = $webpac->data_structure($rec);

B<Note: historical oddity follows>

This method will also set C<< $webpac->{'current_filename'} >> if there is
a C<< <filename> >> tag and C<< $webpac->{'headline'} >> if there is
a C<< <headline> >> tag.

=cut

# Build the list of normalized rows for one record.
#
# Arguments: $rec - hash reference holding the source record.
#
# Returns a list of row hashes. Each row carries 'tag' (field name), 'name'
# (display name) and one array of values per output type ('display' and
# 'swish' unless the tag has a 'type' attribute).
#
# Side effects: sets $self->{'current_filename'} and $self->{'headline'} from
# the 'filename' and 'headline' fields, caches the sorted tag list in
# $self->{'tags_by_order'} and, when a 'db' object was supplied, asks it to
# load/save the result.
sub data_structure {
        my $self = shift;

        my $log = $self->_get_logger();

        my $rec = shift;
        $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);

        if ($self->{'db'}) {
                my @ds = $self->{'db'}->load_ds($rec);
                $log->debug("load_ds( rec = ", sub { Dumper($rec) }, ") = ", sub { Dumper(@ds) });
                # A single cached row is a valid hit too. load_ds is defined
                # outside this file; presumably a miss comes back as an empty
                # list or a lone non-reference value -- TODO confirm.
                return @ds if (@ds > 1 || (@ds == 1 && ref($ds[0])));
                $log->debug("cache miss, creating");
        }

        # forget values left over from the previously processed record
        # (the key must match the one assigned further down)
        undef $self->{'current_filename'};
        undef $self->{'headline'};

        my @sorted_tags;
        if ($self->{tags_by_order}) {
                @sorted_tags = @{$self->{tags_by_order}};
        } else {
                @sorted_tags = sort { $self->_sort_by_order } keys %{$self->{'import_xml'}->{'indexer'}};
                $self->{tags_by_order} = \@sorted_tags;
        }

        my @ds;

        $log->debug("tags: ",sub { join(", ",@sorted_tags) });

        foreach my $field (@sorted_tags) {

                my $row;

#print "field $field [",$self->{'tag'},"] = ",Dumper($self->{'import_xml'}->{'indexer'}->{$field}->{$self->{'tag'}});

                foreach my $tag (@{$self->{'import_xml'}->{'indexer'}->{$field}->{$self->{'tag'}}}) {
                        my $format = $tag->{'value'} || $tag->{'content'};

                        $log->debug("format: $format");

                        # formats containing lookups are filled in verbatim,
                        # everything else goes through the smarter parser
                        my @v;
                        if ($self->{'lookup_regex'} && $format =~ $self->{'lookup_regex'}) {
                                @v = $self->fill_in_to_arr($rec,$format);
                        } else {
                                @v = $self->parse_to_arr($rec,$format);
                        }
                        next if (! @v);

                        if ($tag->{'sort'}) {
                                @v = $self->sort_arr(@v);
                        }

                        # use format?
                        if ($tag->{'format_name'}) {
                                @v = map { $self->apply_format($tag->{'format_name'},$tag->{'format_delimiter'},$_) } @v;
                        }

                        if ($field eq 'filename') {
                                $self->{'current_filename'} = join('',@v);
                                $log->debug("filename: ",$self->{'current_filename'});
                        } elsif ($field eq 'headline') {
                                $self->{'headline'} .= join('',@v);
                                $log->debug("headline: ",$self->{'headline'});
                                next; # don't return headline in data_structure!
                        }

                        # delimiter will join repeatable fields
                        if ($tag->{'delimiter'}) {
                                @v = ( join($tag->{'delimiter'}, @v) );
                        }

                        # default types
                        my @types = qw(display swish);
                        # override by type attribute
                        @types = ( $tag->{'type'} ) if ($tag->{'type'});

                        foreach my $type (@types) {
                                # append to previous line?
                                $log->debug("type: $type ",sub { join(" ",@v) }, $tag->{'append'} || 'no append');
                                if ($tag->{'append'}) {

                                        # I will delimit appended part with
                                        # delimiter (or ,)
                                        my $d = $tag->{'delimiter'};
                                        # default delimiter
                                        $d ||= " ";

                                        # glue the new values onto the last
                                        # value already stored for this type
                                        my $last = pop @{$row->{$type}};
                                        $d = "" if (! $last);
                                        $last .= $d . join($d, @v);
                                        push @{$row->{$type}}, $last;

                                } else {
                                        push @{$row->{$type}}, @v;
                                }
                        }


                }

                if ($row) {
                        $row->{'tag'} = $field;

                        # TODO: name_sigular, name_plural
                        my $name = $self->{'import_xml'}->{'indexer'}->{$field}->{'name'};
                        $row->{'name'} = $name ? $self->_x($name) : $field;

                        # post-sort all values in field
                        if ($self->{'import_xml'}->{'indexer'}->{$field}->{'sort'}) {
                                $log->warn("sort at field tag not implemented");
                        }

                        push @ds, $row;

                        $log->debug("row $field: ",sub { Dumper($row) });
                }

        }

        $self->{'db'}->save_ds(
                ds => \@ds,
                current_filename => $self->{'current_filename'},
                headline => $self->{'headline'},
        ) if ($self->{'db'});

        $log->debug("ds: ", sub { Dumper(@ds) });

        return @ds;

}

=head2 parse

Perform smart parsing of string, skipping delimiters for fields which aren't
defined. It can also eval code in format starting with C<eval{...}> and
return output or nothing depending on eval code.

 my $text = $webpac->parse($rec,'eval{"v901^a" eq "Deskriptor"}descriptor: v250^a', $i);

=cut

# Expand one format string against a record, dropping the delimiter text in
# front of every placeholder whose field is missing.
#
# Arguments:
#   $rec         - hash reference holding the record
#   $format_utf8 - format string, optionally starting with eval{...} and/or
#                  filter{...}
#   $i           - index of the repeatable occurrence to use (default 0);
#                  s-placeholders always read occurrence 0
#
# Returns the expanded string, or nothing when no placeholder produced data,
# the eval code is false or the filter returns undef.
sub parse {
        my $self = shift;

        my ($rec, $format_utf8, $i) = @_;

        return if (! $format_utf8);

        my $log = $self->_get_logger();

        $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);

        $i = 0 if (! $i);

        my $format = $self->_x($format_utf8) || $log->logconfess("can't convert '$format_utf8' from UTF-8 to ",$self->{'code_page'});

        my @out;

        $log->debug("format: $format");

        # Literal braces are escaped: an unescaped "{" after a literal
        # character is deprecated (and in places fatal) on newer perls.
        my $eval_code;
        # remove eval{...} from beginning
        $eval_code = $1 if ($format =~ s/^eval\{([^}]+)\}//s);

        my $filter_name;
        # remove filter{...} from beginning
        $filter_name = $1 if ($format =~ s/^filter\{([^}]+)\}//s);

        my $prefix;
        my $all_found=0;

        # consume the format one "delimiter + placeholder" chunk at a time
        while ($format =~ s/^(.*?)(v|s)(\d+)(?:\^(\w))?//s) {

                my $del = $1 || '';
                # remember the text in front of the first placeholder(s) so
                # it can be restored even when those fields are missing
                $prefix ||= $del if ($all_found == 0);

                # repeatable index
                my $r = $i;
                $r = 0 if (lc("$2") eq 's');

                my $found = 0;
                my $tmp = $self->get_data(\$rec,$3,$4,$r,\$found);

                if ($found) {
                        push @out, $del;
                        push @out, $tmp;
                        $all_found += $found;
                }
        }

        return if (! $all_found);

        my $out = join('',@out);

        if ($out) {
                # add rest of format (suffix)
                $out .= $format;

                # add prefix if not there
                $out = $prefix . $out if ($out !~ m/^\Q$prefix\E/);

                $log->debug("result: $out");
        }

        if ($eval_code) {
                # placeholders inside the eval code are substituted first
                my $eval = $self->fill_in($rec,$eval_code,$i) || return;
                $log->debug("about to eval{$eval} format: $out");
                return if (! $self->_eval($eval));
        }

        if ($filter_name && $self->{'filter'}->{$filter_name}) {
                $log->debug("about to filter{$filter_name} format: $out");
                $out = $self->{'filter'}->{$filter_name}->($out);
                return unless(defined($out));
                $log->debug("filter result: $out");
        }

        return $out;
}

=head2 parse_to_arr

Similar to C<parse>, but returns array of all repeatable fields

 my @arr = $webpac->parse_to_arr($rec,'v250^a');

=cut

# Call parse() for occurrence 0, 1, 2, ... of the repeatable fields and
# collect the results until parse() returns a false value.
#
# Arguments: $rec (hash reference with the record), $format_utf8 (format).
# Returns the list of expanded strings (empty list for an empty format).
# Dies through the logger when more than max_mfn (default 9999) occurrences
# are produced, which happens when parse() succeeds for every index.
sub parse_to_arr {
        my $self = shift;

        my ($rec, $format_utf8) = @_;

        my $log = $self->_get_logger();

        $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
        return if (! $format_utf8);

        # same ceiling fill_in() applies to its occurrence index
        my $max = $self->{'max_mfn'} || 9999;

        my $i = 0;
        my @arr;

        while (my $v = $self->parse($rec,$format_utf8,$i++)) {
                push @arr, $v;
                # without this a format that never runs out of data would
                # loop (and grow @arr) forever
                $log->logdie("infitite loop in format $format_utf8") if ($i > $max);
        }

        # $i was incremented once more by the failing call
        $log->debug("format '$format_utf8' returned ",--$i," elements: ", sub { join(" | ",@arr) }) if (@arr);

        return @arr;
}


=head2 fill_in

Workhorse of all: takes a record from the in-memory structure of the database
and a string with placeholders and returns a string (or array) with the
values from the record substituted.

 my $text = $webpac->fill_in($rec,'v250^a');

Optional argument is the ordinal number for repeatable fields. By default,
it is assumed to be the first repeatable field (fields are perl arrays, so
the first element is 0).
Following example will read second value from repeatable field.

 my $text = $webpac->fill_in($rec,'Title: v250^a',1);

This function B<does not> parse the format to intelligently skip
delimiters before fields which aren't used.

This method will automatically decode UTF-8 string to local code page
if needed.

=cut

# Substitute every v123^a / s123^a placeholder in a format string with data
# from the record, without any delimiter clean-up.
#
# Arguments:
#   $rec    - hash reference holding the record (required)
#   $format - format string (required), optionally starting with eval{...}
#             and/or filter{...}
#   $i      - occurrence index used for v-placeholders (default 0);
#             s-placeholders always read occurrence 0
#
# Returns the substituted string (passed through $self->lookup when
# $self->{'lookup'} is set), or nothing when no placeholder had data, the
# eval code is false or the filter returns undef. Dies through the logger
# when $i exceeds max_mfn (default 9999).
sub fill_in {
        my $self = shift;

        my $log = $self->_get_logger();

        my $rec = shift || $log->logconfess("need data record");
        my $format = shift || $log->logconfess("need format to parse");
        # iteration (for repeatable fields)
        my $i = shift || 0;

        $log->logdie("infitite loop in format $format") if ($i > ($self->{'max_mfn'} || 9999));

        # FIXME remove for speedup?
        $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);

        if (utf8::is_utf8($format)) {
                $format = $self->_x($format);
        }

        my $found = 0;

        # Literal braces are escaped: an unescaped "{" after a literal
        # character is deprecated (and in places fatal) on newer perls.
        my $eval_code;
        # remove eval{...} from beginning
        $eval_code = $1 if ($format =~ s/^eval\{([^}]+)\}//s);

        my $filter_name;
        # remove filter{...} from beginning
        $filter_name = $1 if ($format =~ s/^filter\{([^}]+)\}//s);

        # Replace repeatable (v) and non-repeatable (s) placeholders in a
        # single pass, so that text inserted from the record is never scanned
        # again and mistaken for another placeholder.
        $format =~ s/([vs])(\d+)(?:\^(\w))?/$self->get_data(\$rec,$2,$3,($1 eq 'v' ? $i : 0),\$found)/ges;

        if ($found) {
                $log->debug("format: $format");
                if ($eval_code) {
                        # nothing to evaluate means the condition fails,
                        # exactly as parse() treats it
                        my $eval = $self->fill_in($rec,$eval_code,$i) || return;
                        return if (! $self->_eval($eval));
                }
                if ($filter_name && $self->{'filter'}->{$filter_name}) {
                        $log->debug("filter '$filter_name' for $format");
                        $format = $self->{'filter'}->{$filter_name}->($format);
                        return unless(defined($format));
                        $log->debug("filter result: $format");
                }
                # do we have lookups?
                if ($self->{'lookup'}) {
                        return $self->lookup($format);
                } else {
                        return $format;
                }
        } else {
                return;
        }
}


=head2 fill_in_to_arr

Similar to C<fill_in>, but returns an array of all repeatable fields. Usable
for fields which have lookups, so they shouldn't be parsed but rather
C<fill_in>ed.

 my @arr = $webpac->fill_in_to_arr($rec,'[v900];;[v250^a]');

=cut

# Run fill_in() for occurrence 0, 1, 2, ... and gather everything it returns
# until a call comes back with an empty list.
#
# Arguments: $rec (hash reference with the record), $format_utf8 (format).
# Returns the collected values (empty list for an empty format).
sub fill_in_to_arr {
        my ($self, $rec, $format_utf8) = @_;

        my $log = $self->_get_logger();

        $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
        return unless ($format_utf8);

        my @collected;
        my $rounds = 0;        # number of calls that produced something

        for (my $occurrence = 0; ; $occurrence++) {
                my @values = $self->fill_in($rec, $format_utf8, $occurrence);
                last unless (@values);

                push @collected, @values;
                $rounds++;
        }

        if (@collected) {
                $log->debug("format '$format_utf8' returned ",$rounds," elements: ", sub { join(" | ",@collected) });
        }

        return @collected;
}


=head2 get_data

Returns value from record.

 my $text = $self->get_data(\$rec,$f,$sf,$i,\$found);

Arguments are:
record reference C<$rec>,
field C<$f>,
optional subfield C<$sf>,
index for repeatable values C<$i>.

Optional variable C<$found> will be incremented if the
field exists.

Returns value or empty string.

=cut

# Fetch one value from a record.
#
# Arguments:
#   $rec   - reference to the record hash reference
#   $f     - field number
#   $sf    - optional subfield
#   $i     - occurrence index within the field
#   $found - optional reference to a counter; incremented when a value is
#            returned
#
# Returns the subfield value when $sf is given and present, the whole
# occurrence when $sf is omitted (all subfield values, each followed by a
# space, in sorted key order when the occurrence is a hash), or an empty
# string when the field, occurrence or requested subfield is missing.
sub get_data {
        my $self = shift;

        my ($rec,$f,$sf,$i,$found) = @_;

        return '' if (! $$rec->{$f});

        my $occurrence = $$rec->{$f}->[$i];
        return '' if (! $occurrence);

        my $has_subfields = (ref($occurrence) eq 'HASH');

        # "0" is a legitimate subfield name, so test for presence, not truth
        if (defined($sf) && $sf ne '') {
                # a requested subfield that is missing must not fall back to
                # the whole occurrence, otherwise callers count it as found
                return '' unless ($has_subfields && $occurrence->{$sf});

                $$found++ if (ref($found) && defined($$found));
                return $occurrence->{$sf};
        }

        $$found++ if (ref($found) && defined($$found));

        return $occurrence unless ($has_subfields);

        # it still might have subfield, just
        # not specified, so we'll dump all
        my $out = '';
        foreach my $k (sort keys %{$occurrence}) {
                $out .= $occurrence->{$k}." ";
        }
        return $out;
}


=head2 apply_format

Apply format specified in tag with C<format_name="name"> and
C<format_delimiter=";;">.

 my $text = $webpac->apply_format($format_name,$format_delimiter,$data);

Formats can contain C<lookup{...}> if you need them.

=cut

# Push one value through a named sprintf-style format.
#
# Arguments:
#   $name      - name of the format under $self->{'import_xml'}->{'format'}
#   $delimiter - string on which $data is split into sprintf arguments
#   $data      - value to format
#
# Returns $data unchanged (after a warning) when the format is not defined;
# otherwise the formatted string, passed through $self->lookup when it
# matches $self->{'lookup_regex'}. Dies through the logger when the format
# has no content.
sub apply_format {
        my $self = shift;

        my ($name,$delimiter,$data) = @_;

        my $log = $self->_get_logger();

        if (! $self->{'import_xml'}->{'format'}->{$name}) {
                $log->warn("<format name=\"$name\"> is not defined in ",$self->{'import_xml_file'});
                return $data;
        }

        my $have_delimiter = (defined($delimiter) && $delimiter ne '');
        $log->warn("no delimiter for format $name") if (! $have_delimiter);

        my $format = $self->_x($self->{'import_xml'}->{'format'}->{$name}->{'content'}) || $log->logdie("can't find format '$name'");

        # splitting on an empty pattern would break $data into single
        # characters, so without a delimiter the value is one argument
        my @data = $have_delimiter ? split(/\Q$delimiter\E/, $data) : ($data);

        my $out = sprintf($format, @data);
        $log->debug("using format $name [$format] on $data to produce: $out");

        if ($self->{'lookup_regex'} && $out =~ $self->{'lookup_regex'}) {
                return $self->lookup($out);
        } else {
                return $out;
        }

}

=head2 sort_arr

Sort array ignoring case and html in data

 my @sorted = $webpac->sort_arr(@unsorted);

=cut

# Sort values case-insensitively while ignoring HTML tags.
#
# Arguments: the list of values to sort.
# Returns the same values in descending order of their lower-cased,
# tag-stripped text. The values themselves are returned untouched.
sub sort_arr {
        my $self = shift;

        my $log = $self->_get_logger();

        # Compute each sort key once, on a copy: inside sort() $a and $b are
        # aliases of the list elements, so editing them there would strip
        # markup from the caller's data.
        my @sorted =
                map  { $_->[1] }
                sort { $b->[0] cmp $a->[0] }
                map  {
                        my $key = $_;
                        $key =~ s#<[^>]+/*>##g;
                        [ lc($key), $_ ];
                } @_;
        $log->debug("sorted values: ",sub { join(", ",@sorted) });

        return @sorted;
}


=head1 INTERNAL METHODS

=head2 _sort_by_order

Sort xml tags data structure according to C<order=""> attribute.

=cut

# Comparator for sort(): orders the two indexer tags held in the package
# variables $a and $b numerically by their 'order' attribute. A tag whose
# 'order' is false is compared by the numeric value of its indexer entry.
sub _sort_by_order {
        my ($self) = @_;

        # weight of a single tag: its 'order', else the indexer entry itself
        my $weight_of = sub {
                my ($tag) = @_;
                return $self->{'import_xml'}->{'indexer'}->{$tag}->{'order'}
                        || $self->{'import_xml'}->{'indexer'}->{$tag};
        };

        my $left = $weight_of->($a);
        my $right = $weight_of->($b);

        return $left <=> $right;
}

=head2 _x

Convert strings from C<conf/normalize/*.xml> encoding into application
specific encoding (optionally specified using C<code_page> to C<new>
constructor).

 my $text = $n->_x('normalize text string');

This is a stub so that other modules don't have to implement it.

=cut

# Identity stub: hands the given string back unchanged.
sub _x {
        my ($self, $text) = @_;

        return $text;
}


=head1 AUTHOR

Dobrica Pavlinusic, C<< <dpavlin@rot13.org> >>

=head1 COPYRIGHT & LICENSE

Copyright 2005 Dobrica Pavlinusic, All Rights Reserved.

This program is free software; you can redistribute it and/or modify it
under the same terms as Perl itself.

=cut

1; # End of WebPAC::Normalize
1	dpavlin	10	package WebPAC::Normalize;
2
3			use warnings;
4			use strict;
5	dpavlin	29	use base 'WebPAC::Common';
6	dpavlin	13	use Data::Dumper;
7	dpavlin	10
8			=head1 NAME
9
10	dpavlin	15	WebPAC::Normalize - data mungling for normalisation
11	dpavlin	10
12			=head1 VERSION
13
14			Version 0.01
15
16			=cut
17
18			our $VERSION = '0.01';
19
20			=head1 SYNOPSIS
21
22	dpavlin	15	This package contains code that mungle data to produce normalized format.
23	dpavlin	10
24	dpavlin	15	It contains several assumptions:
25
26			=over
27
28			=item *
29
30			format of fields is defined using C<v123^a> notation for repeatable fields
31			or C<s123^a> for single (or first) value, where C<123> is field number and
32			C<a> is subfield.
33
34			=item *
35
36			source data records (C<$rec>) have unique identifiers in field C<000>
37
38			=item *
39
40			optional C<eval{length('v123^a') == 3}> tag at B<beginning of format> will be
41			perl code that is evaluated before producing output (value of field will be
42			interpolated before that)
43
44			=item *
45
46			optional C<filter{filter_name}> at B<begining of format> will apply perl
47			code defined as code ref on format after field substitution to producing
48			output
49
50			=item *
51
52			optional C<lookup{...}> will be then performed. See C<WebPAC::Lookups>.
53
54			=item *
55
56			at end, optional C<format>s rules are resolved. Format rules are similar to
57			C<sprintf> and can also contain C<lookup{...}> which is performed after
58			values are inserted in format.
59
60			=back
61
62			This also describes order in which transformations are applied (eval,
63			filter, lookup, format) which is important to undestand when deciding how to
64			solve your data mungling and normalisation process.
65
66
67
68
69	dpavlin	10	=head1 FUNCTIONS
70
71	dpavlin	13	=head2 new
72	dpavlin	10
73	dpavlin	13	Create new normalisation object
74
75			my $n = new WebPAC::Normalize::Something(
76	dpavlin	15	filter => {
77			'filter_name_1' => sub {
78			# filter code
79			return length($_);
80			}, ...
81			},
82	dpavlin	29	db => $db_obj,
83	dpavlin	13	lookup_regex => $lookup->regex,
84			);
85
86	dpavlin	15	Parametar C<filter> defines user supplied snippets of perl code which can
87			be use with C<filter{...}> notation.
88
89	dpavlin	13	Recommended parametar C<lookup_regex> is used to enable parsing of lookups
90			in structures.
91
92	dpavlin	10	=cut
93
94	dpavlin	13	sub new {
95			my $class = shift;
96			my $self = {@_};
97			bless($self, $class);
98
99			$self ? return $self : return undef;
100	dpavlin	10	}
101
102	dpavlin	13
103			=head2 data_structure
104
105			Create in-memory data structure which represents normalized layout from
106			C<conf/normalize/*.xml>.
107
108			This structures are used to produce output.
109
110			my @ds = $webpac->data_structure($rec);
111
112			B<Note: historical oddity follows>
113
114			This method will also set C<< $webpac->{'currnet_filename'} >> if there is
115			C<< <filename> >> tag and C<< $webpac->{'headline'} >> if there is
116			C<< <headline> >> tag.
117
118			=cut
119
120			sub data_structure {
121			my $self = shift;
122
123			my $log = $self->_get_logger();
124
125			my $rec = shift;
126			$log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
127
128			my $cache_file;
129
130	dpavlin	18	if ($self->{'db'}) {
131	dpavlin	22	my @ds = $self->{'db'}->load_ds($rec);
132	dpavlin	29	$log->debug("load_ds( rec = ", sub { Dumper($rec) }, ") = ", sub { Dumper(@ds) });
133			return @ds if ($#ds > 0);
134			$log->debug("cache miss, creating");
135	dpavlin	13	}
136
137			undef $self->{'currnet_filename'};
138			undef $self->{'headline'};
139
140			my @sorted_tags;
141			if ($self->{tags_by_order}) {
142			@sorted_tags = @{$self->{tags_by_order}};
143			} else {
144			@sorted_tags = sort { $self->_sort_by_order } keys %{$self->{'import_xml'}->{'indexer'}};
145			$self->{tags_by_order} = \@sorted_tags;
146			}
147
148			my @ds;
149
150			$log->debug("tags: ",sub { join(", ",@sorted_tags) });
151
152			foreach my $field (@sorted_tags) {
153
154			my $row;
155
156			#print "field $field [",$self->{'tag'},"] = ",Dumper($self->{'import_xml'}->{'indexer'}->{$field}->{$self->{'tag'}});
157
158			foreach my $tag (@{$self->{'import_xml'}->{'indexer'}->{$field}->{$self->{'tag'}}}) {
159			my $format = $tag->{'value'} \|\| $tag->{'content'};
160
161			$log->debug("format: $format");
162
163			my @v;
164			if ($self->{'lookup_regex'} && $format =~ $self->{'lookup_regex'}) {
165			@v = $self->fill_in_to_arr($rec,$format);
166			} else {
167			@v = $self->parse_to_arr($rec,$format);
168			}
169			next if (! @v);
170
171			if ($tag->{'sort'}) {
172			@v = $self->sort_arr(@v);
173			}
174
175			# use format?
176			if ($tag->{'format_name'}) {
177			@v = map { $self->apply_format($tag->{'format_name'},$tag->{'format_delimiter'},$_) } @v;
178			}
179
180			if ($field eq 'filename') {
181			$self->{'current_filename'} = join('',@v);
182			$log->debug("filename: ",$self->{'current_filename'});
183			} elsif ($field eq 'headline') {
184			$self->{'headline'} .= join('',@v);
185			$log->debug("headline: ",$self->{'headline'});
186			next; # don't return headline in data_structure!
187			}
188
189			# delimiter will join repeatable fields
190			if ($tag->{'delimiter'}) {
191			@v = ( join($tag->{'delimiter'}, @v) );
192			}
193
194			# default types
195			my @types = qw(display swish);
196			# override by type attribute
197			@types = ( $tag->{'type'} ) if ($tag->{'type'});
198
199			foreach my $type (@types) {
200			# append to previous line?
201			$log->debug("type: $type ",sub { join(" ",@v) }, $row->{'append'} \|\| 'no append');
202			if ($tag->{'append'}) {
203
204			# I will delimit appended part with
205			# delimiter (or ,)
206			my $d = $tag->{'delimiter'};
207			# default delimiter
208			$d \|\|= " ";
209
210			my $last = pop @{$row->{$type}};
211			$d = "" if (! $last);
212			$last .= $d . join($d, @v);
213			push @{$row->{$type}}, $last;
214
215			} else {
216			push @{$row->{$type}}, @v;
217			}
218			}
219
220
221			}
222
223			if ($row) {
224			$row->{'tag'} = $field;
225
226			# TODO: name_sigular, name_plural
227			my $name = $self->{'import_xml'}->{'indexer'}->{$field}->{'name'};
228			$row->{'name'} = $name ? $self->_x($name) : $field;
229
230			# post-sort all values in field
231			if ($self->{'import_xml'}->{'indexer'}->{$field}->{'sort'}) {
232			$log->warn("sort at field tag not implemented");
233			}
234
235			push @ds, $row;
236
237			$log->debug("row $field: ",sub { Dumper($row) });
238			}
239
240			}
241
242	dpavlin	22	$self->{'db'}->save_ds(
243	dpavlin	18	ds => \@ds,
244			current_filename => $self->{'current_filename'},
245			headline => $self->{'headline'},
246			) if ($self->{'db'});
247	dpavlin	13
248	dpavlin	29	$log->debug("ds: ", sub { Dumper(@ds) });
249
250	dpavlin	13	return @ds;
251
252			}
253
254			=head2 parse
255
256			Perform smart parsing of string, skipping delimiters for fields which aren't
257			defined. It can also eval code in format starting with C<eval{...}> and
258			return output or nothing depending on eval code.
259
260			my $text = $webpac->parse($rec,'eval{"v901^a" eq "Deskriptor"}descriptor: v250^a', $i);
261
262			=cut
263
264			sub parse {
265			my $self = shift;
266
267			my ($rec, $format_utf8, $i) = @_;
268
269			return if (! $format_utf8);
270
271			my $log = $self->_get_logger();
272
273			$log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
274
275			$i = 0 if (! $i);
276
277			my $format = $self->_x($format_utf8) \|\| $log->logconfess("can't convert '$format_utf8' from UTF-8 to ",$self->{'code_page'});
278
279			my @out;
280
281			$log->debug("format: $format");
282
283			my $eval_code;
284			# remove eval{...} from beginning
285			$eval_code = $1 if ($format =~ s/^eval{([^}]+)}//s);
286
287			my $filter_name;
288			# remove filter{...} from beginning
289			$filter_name = $1 if ($format =~ s/^filter{([^}]+)}//s);
290
291			my $prefix;
292			my $all_found=0;
293
294			while ($format =~ s/^(.*?)(v\|s)(\d+)(?:\^(\w))?//s) {
295
296			my $del = $1 \|\| '';
297			$prefix \|\|= $del if ($all_found == 0);
298
299			# repeatable index
300			my $r = $i;
301			$r = 0 if (lc("$2") eq 's');
302
303			my $found = 0;
304			my $tmp = $self->get_data(\$rec,$3,$4,$r,\$found);
305
306			if ($found) {
307			push @out, $del;
308			push @out, $tmp;
309			$all_found += $found;
310			}
311			}
312
313			return if (! $all_found);
314
315			my $out = join('',@out);
316
317			if ($out) {
318			# add rest of format (suffix)
319			$out .= $format;
320
321			# add prefix if not there
322			$out = $prefix . $out if ($out !~ m/^\Q$prefix\E/);
323
324			$log->debug("result: $out");
325			}
326
327			if ($eval_code) {
328			my $eval = $self->fill_in($rec,$eval_code,$i) \|\| return;
329			$log->debug("about to eval{$eval} format: $out");
330			return if (! $self->_eval($eval));
331			}
332
333			if ($filter_name && $self->{'filter'}->{$filter_name}) {
334			$log->debug("about to filter{$filter_name} format: $out");
335			$out = $self->{'filter'}->{$filter_name}->($out);
336			return unless(defined($out));
337			$log->debug("filter result: $out");
338			}
339
340			return $out;
341			}
342
343			=head2 parse_to_arr
344
345			Similar to C<parse>, but returns array of all repeatable fields
346
347			my @arr = $webpac->parse_to_arr($rec,'v250^a');
348
349			=cut
350
351			sub parse_to_arr {
352			my $self = shift;
353
354			my ($rec, $format_utf8) = @_;
355
356			my $log = $self->_get_logger();
357
358			$log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
359			return if (! $format_utf8);
360
361			my $i = 0;
362			my @arr;
363
364			while (my $v = $self->parse($rec,$format_utf8,$i++)) {
365			push @arr, $v;
366			}
367
368			$log->debug("format '$format_utf8' returned ",--$i," elements: ", sub { join(" \| ",@arr) }) if (@arr);
369
370			return @arr;
371			}
372
373	dpavlin	15
374			=head2 fill_in
375
376			Workhourse of all: takes record from in-memory structure of database and
377			strings with placeholders and returns string or array of with substituted
378			values from record.
379
380			my $text = $webpac->fill_in($rec,'v250^a');
381
382			Optional argument is ordinal number for repeatable fields. By default,
383			it's assume to be first repeatable field (fields are perl array, so first
384			element is 0).
385			Following example will read second value from repeatable field.
386
387			my $text = $webpac->fill_in($rec,'Title: v250^a',1);
388
389			This function B<does not> perform parsing of format to inteligenty skip
390			delimiters before fields which aren't used.
391
392			This method will automatically decode UTF-8 string to local code page
393			if needed.
394
395			=cut
396
397			sub fill_in {
398			my $self = shift;
399
400			my $log = $self->_get_logger();
401
402			my $rec = shift \|\| $log->logconfess("need data record");
403			my $format = shift \|\| $log->logconfess("need format to parse");
404			# iteration (for repeatable fields)
405			my $i = shift \|\| 0;
406
407			$log->logdie("infitite loop in format $format") if ($i > ($self->{'max_mfn'} \|\| 9999));
408
409			# FIXME remove for speedup?
410			$log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
411
412			if (utf8::is_utf8($format)) {
413			$format = $self->_x($format);
414			}
415
416			my $found = 0;
417
418			my $eval_code;
419			# remove eval{...} from beginning
420			$eval_code = $1 if ($format =~ s/^eval{([^}]+)}//s);
421
422			my $filter_name;
423			# remove filter{...} from beginning
424			$filter_name = $1 if ($format =~ s/^filter{([^}]+)}//s);
425
426			# do actual replacement of placeholders
427			# repeatable fields
428			$format =~ s/v(\d+)(?:\^(\w))?/$self->get_data(\$rec,$1,$2,$i,\$found)/ges;
429			# non-repeatable fields
430			$format =~ s/s(\d+)(?:\^(\w))?/$self->get_data(\$rec,$1,$2,0,\$found)/ges;
431
432			if ($found) {
433			$log->debug("format: $format");
434			if ($eval_code) {
435			my $eval = $self->fill_in($rec,$eval_code,$i);
436			return if (! $self->_eval($eval));
437			}
438			if ($filter_name && $self->{'filter'}->{$filter_name}) {
439			$log->debug("filter '$filter_name' for $format");
440			$format = $self->{'filter'}->{$filter_name}->($format);
441			return unless(defined($format));
442			$log->debug("filter result: $format");
443			}
444			# do we have lookups?
445			if ($self->{'lookup'}) {
446			return $self->lookup($format);
447			} else {
448			return $format;
449			}
450			} else {
451			return;
452			}
453			}
454
455
456	dpavlin	13	=head2 fill_in_to_arr
457
458			Similar to C<fill_in>, but returns array of all repeatable fields. Usable
459			for fields which have lookups, so they shouldn't be parsed but rather
460			C<fill_id>ed.
461
462			my @arr = $webpac->fill_in_to_arr($rec,'[v900];;[v250^a]');
463
464			=cut
465
# Collect the results of fill_in() for every occurrence of the repeatable
# fields referenced by a format.
#
# Arguments:
#   $rec         - record; must stringify as a HASH reference
#   $format_utf8 - format string, handed to fill_in() unchanged
#
# Returns the list of all values produced; an empty list when the format
# is false or fill_in() produced nothing.  A non-hash $rec is reported
# through the logger's logconfess().
sub fill_in_to_arr {
	my $self = shift;

	my ($rec, $format_utf8) = @_;

	my $log = $self->_get_logger();

	$log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/);
	return if (! $format_utf8);

	# Only v123-style placeholders are looked up by occurrence index;
	# s123 placeholders always read occurrence 0.  A format without any
	# v123 placeholder therefore yields the same result for every index
	# and asking again would never terminate the loop.
	my $repeatable = ($format_utf8 =~ /v\d+/);

	my $i = 0;
	my @arr;

	while (my @v = $self->fill_in($rec, $format_utf8, $i++)) {
		push @arr, @v;
		last if (! $repeatable);
	}

	# Report the number of collected values rather than loop passes: a
	# single fill_in() call may hand back more than one value.
	$log->debug("format '$format_utf8' returned ", scalar(@arr), " elements: ", sub { join(" | ", @arr) }) if (@arr);

	return @arr;
}
487
488	dpavlin	15
489			=head2 get_data
490
491			Returns value from record.
492
493			my $text = $self->get_data(\$rec,$f,$sf,$i,\$found);
494
495			Arguments are:
496			record reference C<$rec>,
497			field C<$f>,
498			optional subfield C<$sf>,
499			index for repeatable values C<$i>.
500
501			Optional variable C<$found> will be incremented if there
502			is such a field.
503
504			Returns value or empty string.
505
506			=cut
507
# Fetch the value of one occurrence of a field, optionally narrowed to a
# single subfield.
#
# Arguments:
#   $rec   - reference to the record hash reference (note the extra level)
#   $f     - field number, used as key into the record
#   $sf    - optional subfield name
#   $i     - index of the occurrence inside the field
#   $found - optional reference to a counter; it is incremented when a
#            value is returned and the counter is defined
#
# Returns the value, or '' when there is nothing to return.  Without $sf a
# field that has subfields is returned as all subfield values (sorted by
# subfield name), each followed by a single space.
sub get_data {
	my $self = shift;

	my ($rec, $f, $sf, $i, $found) = @_;

	# Plain truthiness would treat the legitimate subfield name or value
	# "0" as missing, so presence means "defined and not empty".
	my $has_value = sub { return (defined($_[0]) && $_[0] ne '') };

	my $field = $$rec->{$f};
	return '' if (! $field);

	my $value = $field->[$i];
	return '' if (! $has_value->($value));

	# ref() guard: a plain string must never be dereferenced, not even
	# when its text happens to contain "HASH".
	my $is_hash = (ref($value) && $value =~ /HASH/);

	my $text;
	if ($has_value->($sf)) {
		# A specific subfield was requested: if it isn't there the answer
		# is "nothing", not a dump of whatever else the field contains.
		return '' if (! $is_hash || ! $has_value->($value->{$sf}));
		$text = $value->{$sf};
	} elsif ($is_hash) {
		# no subfield specified, so dump them all; sorted to keep the
		# output independent of hash ordering
		$text = join('', map { $value->{$_} . ' ' } sort keys %$value);
	} else {
		$text = $value;
	}

	$$found++ if (ref($found) && defined($$found));

	return $text;
}
537
538
539			=head2 apply_format
540
541			Apply format specified in tag with C<format_name="name"> and
542			C<format_delimiter=";;">.
543
544			my $text = $webpac->apply_format($format_name,$format_delimiter,$data);
545
546			Formats can contain C<lookup{...}> if you need them.
547
548			=cut
549
# Run delimiter-separated data through a named sprintf-style format.
#
# Arguments:
#   $name      - key of the format under $self->{import_xml}->{format}
#   $delimiter - literal string separating the values inside $data
#   $data      - string holding the values
#
# Returns $data unchanged (after a warning) when the format is not
# defined.  Otherwise returns the formatted string, passed through
# $self->lookup() when $self->{lookup_regex} is set and matches it.
sub apply_format {
	my $self = shift;

	my ($name, $delimiter, $data) = @_;

	my $log = $self->_get_logger();

	if (! $self->{'import_xml'}->{'format'}->{$name}) {
		$log->warn("<format name=\"$name\"> is not defined in ",$self->{'import_xml_file'});
		return $data;
	}

	my $have_delimiter = (defined($delimiter) && $delimiter ne '');
	$log->warn("no delimiter for format $name") if (! $have_delimiter);

	my $format = $self->_x($self->{'import_xml'}->{'format'}->{$name}->{'content'}) || $log->logdie("can't find format '$name'");

	# An empty split pattern cuts the string between every character, so
	# without a usable delimiter the data is kept as one single value.
	my @data = $have_delimiter
		? split(/\Q$delimiter\E/, $data)
		: ($data);

	my $out = sprintf($format, @data);
	$log->debug("using format $name [$format] on $data to produce: $out");

	if ($self->{'lookup_regex'} && $out =~ $self->{'lookup_regex'}) {
		return $self->lookup($out);
	}

	return $out;
}
578
579	dpavlin	13	=head2 sort_arr
580
581			Sort array ignoring case and html in data
582
583			my @sorted = $webpac->sort_arr(@unsorted);
584
585			=cut
586
# Sort values while ignoring letter case and HTML tags.
#
# Arguments: the list of values to sort.
# Returns the same values, unmodified, in descending order of their
# comparison key (the value with all tags removed, lower-cased).
sub sort_arr {
	my $self = shift;

	my $log = $self->_get_logger();

	# Schwartzian transform: the key is computed on a copy, once per
	# value.  Editing $a/$b inside the sort block would write through to
	# the caller's variables, because they alias the elements of @_.
	my @sorted =
		map  { $_->[1] }
		sort { $b->[0] cmp $a->[0] }
		map  {
			my $key = $_;
			$key =~ s#<[^>]+/*>##g;
			[ lc($key), $_ ];
		} @_;

	$log->debug("sorted values: ",sub { join(", ",@sorted) });

	return @sorted;
}
603
604
605	dpavlin	15	=head1 INTERNAL METHODS
606
607	dpavlin	13	=head2 _sort_by_order
608
609			Sort xml tags data structure according to C<order=""> attribute.
610
611			=cut
612
# Comparator for indexer tags.  It reads the two tag names from the
# package variables $a and $b instead of taking them as arguments.
#
# Each tag is ranked by the 'order' value of its entry under
# $self->{import_xml}->{indexer}; when that value is false the entry
# itself is used as the rank.  Returns the result of the numeric
# comparison of both ranks.
sub _sort_by_order {
	my $self = shift;

	my $rank_of = sub {
		my ($tag) = @_;
		return $self->{'import_xml'}->{'indexer'}->{$tag}->{'order'}
			|| $self->{'import_xml'}->{'indexer'}->{$tag};
	};

	my $left  = $rank_of->($a);
	my $right = $rank_of->($b);

	return $left <=> $right;
}
623
624			=head2 _x
625
626	dpavlin	15	Convert strings from C<conf/normalize/*.xml> encoding into application
627			specific encoding (optionally specified using C<code_page> to C<new>
628			constructor).
629	dpavlin	13
630			my $text = $n->_x('normalize text string');
631
632			This is a stub so that other modules don't have to implement it.
633
634			=cut
635
# Encoding hook: this stub hands the given string back untouched.
sub _x {
	my ($self, $text) = @_;

	return $text;
}
640
641
642	dpavlin	10	=head1 AUTHOR
643
644			Dobrica Pavlinusic, C<< <dpavlin@rot13.org> >>
645
646			=head1 COPYRIGHT & LICENSE
647
648			Copyright 2005 Dobrica Pavlinusic, All Rights Reserved.
649
650			This program is free software; you can redistribute it and/or modify it
651			under the same terms as Perl itself.
652
653			=cut
654
655			1; # End of WebPAC::Normalize