lib/WebPAC/Normalize.pm

package WebPAC::Normalize;

use warnings;
use strict;
use base 'WebPAC::Common';
use Data::Dumper;

=head1 NAME

WebPAC::Normalize - data mungling for normalisation

=head1 VERSION

Version 0.02

=cut

our $VERSION = '0.02';

=head1 SYNOPSIS

This package contains code that mungle data to produce normalized format.

It contains several assumptions:

=over

=item *

format of fields is defined using C<v123^a> notation for repeatable fields
or C<s123^a> for single (or first) value, where C<123> is field number and
C<a> is subfield.

=item *

source data records (C<$rec>) have unique identifiers in field C<000>

=item *

optional C<eval{length('v123^a') == 3}> tag at B<beginning of format> will be
perl code that is evaluated before producing output (value of field will be
interpolated before that)

=item *

optional C<filter{filter_name}> at B<begining of format> will apply perl
code defined as code ref on format after field substitution to producing
output

=item *

optional C<lookup{...}> will be then performed. See C<WebPAC::Lookups>.

=item *

at end, optional C<format>s rules are resolved. Format rules are similar to
C<sprintf> and can also contain C<lookup{...}> which is performed after
values are inserted in format.

=back

This also describes order in which transformations are applied (eval,
filter, lookup, format) which is important to undestand when deciding how to
solve your data mungling and normalisation process.


=head1 FUNCTIONS

=head2 new

Create new normalisation object

  my $n = new WebPAC::Normalize::Something(
        filter => {
                'filter_name_1' => sub {
                        # filter code
                        return length($_);
                }, ...
        },
        db => $db_obj,
        lookup_regex => $lookup->regex,
        lookup => $lookup_obj,
  );

Parametar C<filter> defines user supplied snippets of perl code which can
be use with C<filter{...}> notation.

Recommended parametar C<lookup_regex> is used to enable parsing of lookups
in structures. If you pass this parametar, you must also pass C<lookup>
which is C<WebPAC::Lookup> object.

=cut

sub new {
        my $class = shift;
        my $self = {@_};
        bless($self, $class);

        my $r = $self->{'lookup_regex'} ? 1 : 0;
        my $l = $self->{'lookup'} ? 1 : 0;

        my $log = $self->_get_logger();

        # those two must be in pair
        if ( ($r & $l) != ($r || $l) ) {
                my $log = $self->_get_logger();
                $log->logdie("lookup_regex and lookup must be in pair");
        }

        $log->logdie("lookup must be WebPAC::Lookup object") if ($self->{'lookup'} && ! $self->{'lookup'}->isa('WebPAC::Lookup'));

        $self ? return $self : return undef;
}


=head2 data_structure

Create in-memory data structure which represents normalized layout from
C<conf/normalize/*.xml>.

This structures are used to produce output.

 my $ds = $webpac->data_structure($rec);

=cut

sub data_structure {
        my $self = shift;

        my $log = $self->_get_logger();

        my $rec = shift;
        $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);

        $log->debug("data_structure rec = ", sub { Dumper($rec) });

        $log->logdie("need unique ID (mfn) in field 000 of record ", sub { Dumper($rec) } ) unless (defined($rec->{'000'}));

        my $mfn = $rec->{'000'}->[0] || $log->logdie("field 000 isn't array!");

        my $cache_file;

        if ($self->{'db'}) {
                my $ds = $self->{'db'}->load_ds( $mfn );
                $log->debug("load_ds( rec = ", sub { Dumper($rec) }, ") = ", sub { Dumper($ds) });
                return $ds if ($ds);
                $log->debug("cache miss, creating");
        }

        undef $self->{'currnet_filename'};
        undef $self->{'headline'};

        my @sorted_tags;
        if ($self->{tags_by_order}) {
                @sorted_tags = @{$self->{tags_by_order}};
        } else {
                @sorted_tags = sort { $self->_sort_by_order } keys %{$self->{'import_xml'}->{'indexer'}};
                $self->{tags_by_order} = \@sorted_tags;
        }

        my $ds;

        $log->debug("tags: ",sub { join(", ",@sorted_tags) });

        foreach my $field (@sorted_tags) {

                my $row;

#print "field $field [",$self->{'tag'},"] = ",Dumper($self->{'import_xml'}->{'indexer'}->{$field}->{$self->{'tag'}});

                foreach my $tag (@{$self->{'import_xml'}->{'indexer'}->{$field}->{$self->{'tag'}}}) {
                        my $format;

                        $log->logdie("expected tag HASH and got $tag") unless (ref($tag) eq 'HASH');
                        $format = $tag->{'value'} || $tag->{'content'};

                        $log->debug("format: $format");

                        my @v;
                        if ($self->{'lookup_regex'} && $format =~ $self->{'lookup_regex'}) {
                                @v = $self->fill_in_to_arr($rec,$format);
                        } else {
                                @v = $self->parse_to_arr($rec,$format);
                        }
                        next if (! @v);

                        if ($tag->{'sort'}) {
                                @v = $self->sort_arr(@v);
                        }

                        # use format?
                        if ($tag->{'format_name'}) {
                                @v = map { $self->apply_format($tag->{'format_name'},$tag->{'format_delimiter'},$_) } @v;
                        }

                        # delimiter will join repeatable fields
                        if ($tag->{'delimiter'}) {
                                @v = ( join($tag->{'delimiter'}, @v) );
                        }

                        # default types 
                        my @types = qw(display search);
                        # override by type attribute
                        @types = ( $tag->{'type'} ) if ($tag->{'type'});

                        foreach my $type (@types) {
                                # append to previous line?
                                $log->debug("type: $type ",sub { join(" ",@v) }, $row->{'append'} || 'no append');
                                if ($tag->{'append'}) {

                                        # I will delimit appended part with
                                        # delimiter (or ,)
                                        my $d = $tag->{'delimiter'};
                                        # default delimiter
                                        $d ||= " ";

                                        my $last = pop @{$row->{$type}};
                                        $d = "" if (! $last);
                                        $last .= $d . join($d, @v);
                                        push @{$row->{$type}}, $last;

                                } else {
                                        push @{$row->{$type}}, @v;
                                }
                        }


                }

                if ($row) {
                        $row->{'tag'} = $field;

                        # TODO: name_sigular, name_plural
                        my $name = $self->{'import_xml'}->{'indexer'}->{$field}->{'name'};
                        my $row_name = $name ? $self->_x($name) : $field;

                        # post-sort all values in field
                        if ($self->{'import_xml'}->{'indexer'}->{$field}->{'sort'}) {
                                $log->warn("sort at field tag not implemented");
                        }

                        $ds->{$row_name} = $row;

                        $log->debug("row $field: ",sub { Dumper($row) });
                }

        }

        $self->{'db'}->save_ds(
                id => $mfn,
                ds => $ds,
        ) if ($self->{'db'});

        $log->debug("ds: ", sub { Dumper($ds) });

        $log->logconfess("data structure returned is not array any more!") if wantarray;

        return $ds;

}

=head2 parse

Perform smart parsing of string, skipping delimiters for fields which aren't
defined. It can also eval code in format starting with C<eval{...}> and
return output or nothing depending on eval code.

 my $text = $webpac->parse($rec,'eval{"v901^a" eq "Deskriptor"}descriptor: v250^a', $i);

=cut

sub parse {
        my $self = shift;

        my ($rec, $format_utf8, $i) = @_;

        return if (! $format_utf8);

        my $log = $self->_get_logger();

        $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);

        $i = 0 if (! $i);

        my $format = $self->_x($format_utf8) || $log->logconfess("can't convert '$format_utf8' from UTF-8 to ",$self->{'code_page'});

        my @out;

        $log->debug("format: $format");

        my $eval_code;
        # remove eval{...} from beginning
        $eval_code = $1 if ($format =~ s/^eval{([^}]+)}//s);

        my $filter_name;
        # remove filter{...} from beginning
        $filter_name = $1 if ($format =~ s/^filter{([^}]+)}//s);

        my $prefix;
        my $all_found=0;

        while ($format =~ s/^(.*?)(v|s)(\d+)(?:\^(\w))?//s) {

                my $del = $1 || '';
                $prefix ||= $del if ($all_found == 0);

                # repeatable index
                my $r = $i;
                $r = 0 if (lc("$2") eq 's');

                my $found = 0;
                my $tmp = $self->get_data(\$rec,$3,$4,$r,\$found);

                if ($found) {
                        push @out, $del;
                        push @out, $tmp;
                        $all_found += $found;
                }
        }

        return if (! $all_found);

        my $out = join('',@out);

        if ($out) {
                # add rest of format (suffix)
                $out .= $format;

                # add prefix if not there
                $out = $prefix . $out if ($out !~ m/^\Q$prefix\E/);

                $log->debug("result: $out");
        }

        if ($eval_code) {
                my $eval = $self->fill_in($rec,$eval_code,$i) || return;
                $log->debug("about to eval{$eval} format: $out");
                return if (! $self->_eval($eval));
        }
        
        if ($filter_name && $self->{'filter'}->{$filter_name}) {
                $log->debug("about to filter{$filter_name} format: $out");
                $out = $self->{'filter'}->{$filter_name}->($out);
                return unless(defined($out));
                $log->debug("filter result: $out");
        }

        return $out;
}

=head2 parse_to_arr

Similar to C<parse>, but returns array of all repeatable fields

 my @arr = $webpac->parse_to_arr($rec,'v250^a');

=cut

sub parse_to_arr {
        my $self = shift;

        my ($rec, $format_utf8) = @_;

        my $log = $self->_get_logger();

        $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
        return if (! $format_utf8);

        my $i = 0;
        my @arr;

        while (my $v = $self->parse($rec,$format_utf8,$i++)) {
                push @arr, $v;
        }

        $log->debug("format '$format_utf8' returned ",--$i," elements: ", sub { join(" | ",@arr) }) if (@arr);

        return @arr;
}


=head2 fill_in

Workhourse of all: takes record from in-memory structure of database and
strings with placeholders and returns string or array of with substituted
values from record.

 my $text = $webpac->fill_in($rec,'v250^a');

Optional argument is ordinal number for repeatable fields. By default,
it's assume to be first repeatable field (fields are perl array, so first
element is 0).
Following example will read second value from repeatable field.

 my $text = $webpac->fill_in($rec,'Title: v250^a',1);

This function B<does not> perform parsing of format to inteligenty skip
delimiters before fields which aren't used.

This method will automatically decode UTF-8 string to local code page
if needed.

=cut

sub fill_in {
        my $self = shift;

        my $log = $self->_get_logger();

        my $rec = shift || $log->logconfess("need data record");
        my $format = shift || $log->logconfess("need format to parse");
        # iteration (for repeatable fields)
        my $i = shift || 0;

        $log->logdie("infitite loop in format $format") if ($i > ($self->{'max_mfn'} || 9999));

        # FIXME remove for speedup?
        $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);

        if (utf8::is_utf8($format)) {
                $format = $self->_x($format);
        }

        my $found = 0;

        my $eval_code;
        # remove eval{...} from beginning
        $eval_code = $1 if ($format =~ s/^eval{([^}]+)}//s);

        my $filter_name;
        # remove filter{...} from beginning
        $filter_name = $1 if ($format =~ s/^filter{([^}]+)}//s);

        # do actual replacement of placeholders
        # repeatable fields
        $format =~ s/v(\d+)(?:\^(\w))?/$self->get_data(\$rec,$1,$2,$i,\$found)/ges;
        # non-repeatable fields
        $format =~ s/s(\d+)(?:\^(\w))?/$self->get_data(\$rec,$1,$2,0,\$found)/ges;

        if ($found) {
                $log->debug("format: $format");
                if ($eval_code) {
                        my $eval = $self->fill_in($rec,$eval_code,$i);
                        return if (! $self->_eval($eval));
                }
                if ($filter_name && $self->{'filter'}->{$filter_name}) {
                        $log->debug("filter '$filter_name' for $format");
                        $format = $self->{'filter'}->{$filter_name}->($format);
                        return unless(defined($format));
                        $log->debug("filter result: $format");
                }
                # do we have lookups?
                if ($self->{'lookup'}) {
                        if ($self->{'lookup'}->can('lookup')) {
                                return $self->{'lookup'}->lookup($format);
                        } else {
                                $log->warn("Have lookup object but can't invoke lookup method");
                        }
                } else {
                        return $format;
                }
        } else {
                return;
        }
}


=head2 fill_in_to_arr

Similar to C<fill_in>, but returns array of all repeatable fields. Usable
for fields which have lookups, so they shouldn't be parsed but rather
C<fill_id>ed.

 my @arr = $webpac->fill_in_to_arr($rec,'[v900];;[v250^a]');

=cut

sub fill_in_to_arr {
        my $self = shift;

        my ($rec, $format_utf8) = @_;

        my $log = $self->_get_logger();

        $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
        return if (! $format_utf8);

        my $i = 0;
        my @arr;

        while (my @v = $self->fill_in($rec,$format_utf8,$i++)) {
                push @arr, @v;
        }

        $log->debug("format '$format_utf8' returned ",--$i," elements: ", sub { join(" | ",@arr) }) if (@arr);

        return @arr;
}


=head2 get_data

Returns value from record.

 my $text = $self->get_data(\$rec,$f,$sf,$i,\$found);

Arguments are:
record reference C<$rec>,
field C<$f>,
optional subfiled C<$sf>,
index for repeatable values C<$i>.

Optinal variable C<$found> will be incremeted if there
is field.

Returns value or empty string.

=cut

sub get_data {
        my $self = shift;

        my ($rec,$f,$sf,$i,$found) = @_;

        if ($$rec->{$f}) {
                return '' if (! $$rec->{$f}->[$i]);
                no strict 'refs';
                if ($sf && $$rec->{$f}->[$i]->{$sf}) {
                        $$found++ if (defined($$found));
                        return $$rec->{$f}->[$i]->{$sf};
                } elsif (! $sf && $$rec->{$f}->[$i]) {
                        $$found++ if (defined($$found));
                        # it still might have subfield, just
                        # not specified, so we'll dump all
                        if ($$rec->{$f}->[$i] =~ /HASH/o) {
                                my $out;
                                foreach my $k (keys %{$$rec->{$f}->[$i]}) {
                                        $out .= $$rec->{$f}->[$i]->{$k}." ";
                                }
                                return $out;
                        } else {
                                return $$rec->{$f}->[$i];
                        }
                } else {
                        return '';
                }
        } else {
                return '';
        }
}


=head2 apply_format

Apply format specified in tag with C<format_name="name"> and
C<format_delimiter=";;">.

 my $text = $webpac->apply_format($format_name,$format_delimiter,$data);

Formats can contain C<lookup{...}> if you need them.

=cut

sub apply_format {
        my $self = shift;

        my ($name,$delimiter,$data) = @_;

        my $log = $self->_get_logger();

        if (! $self->{'import_xml'}->{'format'}->{$name}) {
                $log->warn("<format name=\"$name\"> is not defined in ",$self->{'import_xml_file'});
                return $data;
        }

        $log->warn("no delimiter for format $name") if (! $delimiter);

        my $format = $self->_x($self->{'import_xml'}->{'format'}->{$name}->{'content'}) || $log->logdie("can't find format '$name'");

        my @data = split(/\Q$delimiter\E/, $data);

        my $out = sprintf($format, @data);
        $log->debug("using format $name [$format] on $data to produce: $out");

        if ($self->{'lookup_regex'} && $out =~ $self->{'lookup_regex'}) {
                return $self->{'lookup'}->lookup($out);
        } else {
                return $out;
        }

}

=head2 sort_arr

Sort array ignoring case and html in data

 my @sorted = $webpac->sort_arr(@unsorted);

=cut

sub sort_arr {
        my $self = shift;

        my $log = $self->_get_logger();

        # FIXME add Schwartzian Transformation?

        my @sorted = sort {
                $a =~ s#<[^>]+/*>##;
                $b =~ s#<[^>]+/*>##;
                lc($b) cmp lc($a)
        } @_;
        $log->debug("sorted values: ",sub { join(", ",@sorted) });

        return @sorted;
}


=head1 INTERNAL METHODS

=head2 _sort_by_order

Sort xml tags data structure accoding to C<order=""> attribute.

=cut

sub _sort_by_order {
        my $self = shift;

        my $va = $self->{'import_xml'}->{'indexer'}->{$a}->{'order'} ||
                $self->{'import_xml'}->{'indexer'}->{$a};
        my $vb = $self->{'import_xml'}->{'indexer'}->{$b}->{'order'} ||
                $self->{'import_xml'}->{'indexer'}->{$b};

        return $va <=> $vb;
}

=head2 _x

Convert strings from C<conf/normalize/*.xml> encoding into application
specific encoding (optinally specified using C<code_page> to C<new>
constructor).

 my $text = $n->_x('normalize text string');

This is a stub so that other modules doesn't have to implement it.

=cut

sub _x {
        my $self = shift;
        return shift;
}


=head1 AUTHOR

Dobrica Pavlinusic, C<< <dpavlin@rot13.org> >>

=head1 COPYRIGHT & LICENSE

Copyright 2005 Dobrica Pavlinusic, All Rights Reserved.

This program is free software; you can redistribute it and/or modify it
under the same terms as Perl itself.

=cut

1; # End of WebPAC::Normalize
1	package WebPAC::Normalize;
2
3	use warnings;
4	use strict;
5	use base 'WebPAC::Common';
6	use Data::Dumper;
7
8	=head1 NAME
9
10	WebPAC::Normalize - data mungling for normalisation
11
12	=head1 VERSION
13
14	Version 0.02
15
16	=cut
17
18	our $VERSION = '0.02';
19
20	=head1 SYNOPSIS
21
22	This package contains code that mungle data to produce normalized format.
23
24	It contains several assumptions:
25
26	=over
27
28	=item *
29
30	format of fields is defined using C<v123^a> notation for repeatable fields
31	or C<s123^a> for single (or first) value, where C<123> is field number and
32	C<a> is subfield.
33
34	=item *
35
36	source data records (C<$rec>) have unique identifiers in field C<000>
37
38	=item *
39
40	optional C<eval{length('v123^a') == 3}> tag at B<beginning of format> will be
41	perl code that is evaluated before producing output (value of field will be
42	interpolated before that)
43
44	=item *
45
46	optional C<filter{filter_name}> at B<begining of format> will apply perl
47	code defined as code ref on format after field substitution to producing
48	output
49
50	=item *
51
52	optional C<lookup{...}> will be then performed. See C<WebPAC::Lookups>.
53
54	=item *
55
56	at end, optional C<format>s rules are resolved. Format rules are similar to
57	C<sprintf> and can also contain C<lookup{...}> which is performed after
58	values are inserted in format.
59
60	=back
61
62	This also describes order in which transformations are applied (eval,
63	filter, lookup, format) which is important to undestand when deciding how to
64	solve your data mungling and normalisation process.
65
66
67
68
69	=head1 FUNCTIONS
70
71	=head2 new
72
73	Create new normalisation object
74
75	my $n = new WebPAC::Normalize::Something(
76	filter => {
77	'filter_name_1' => sub {
78	# filter code
79	return length($_);
80	}, ...
81	},
82	db => $db_obj,
83	lookup_regex => $lookup->regex,
84	lookup => $lookup_obj,
85	);
86
87	Parametar C<filter> defines user supplied snippets of perl code which can
88	be use with C<filter{...}> notation.
89
90	Recommended parametar C<lookup_regex> is used to enable parsing of lookups
91	in structures. If you pass this parametar, you must also pass C<lookup>
92	which is C<WebPAC::Lookup> object.
93
94	=cut
95
96	sub new {
97	my $class = shift;
98	my $self = {@_};
99	bless($self, $class);
100
101	my $r = $self->{'lookup_regex'} ? 1 : 0;
102	my $l = $self->{'lookup'} ? 1 : 0;
103
104	my $log = $self->_get_logger();
105
106	# those two must be in pair
107	if ( ($r & $l) != ($r \|\| $l) ) {
108	my $log = $self->_get_logger();
109	$log->logdie("lookup_regex and lookup must be in pair");
110	}
111
112	$log->logdie("lookup must be WebPAC::Lookup object") if ($self->{'lookup'} && ! $self->{'lookup'}->isa('WebPAC::Lookup'));
113
114	$self ? return $self : return undef;
115	}
116
117
118	=head2 data_structure
119
120	Create in-memory data structure which represents normalized layout from
121	C<conf/normalize/*.xml>.
122
123	This structures are used to produce output.
124
125	my $ds = $webpac->data_structure($rec);
126
127	=cut
128
129	sub data_structure {
130	my $self = shift;
131
132	my $log = $self->_get_logger();
133
134	my $rec = shift;
135	$log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
136
137	$log->debug("data_structure rec = ", sub { Dumper($rec) });
138
139	$log->logdie("need unique ID (mfn) in field 000 of record ", sub { Dumper($rec) } ) unless (defined($rec->{'000'}));
140
141	my $mfn = $rec->{'000'}->[0] \|\| $log->logdie("field 000 isn't array!");
142
143	my $cache_file;
144
145	if ($self->{'db'}) {
146	my $ds = $self->{'db'}->load_ds( $mfn );
147	$log->debug("load_ds( rec = ", sub { Dumper($rec) }, ") = ", sub { Dumper($ds) });
148	return $ds if ($ds);
149	$log->debug("cache miss, creating");
150	}
151
152	undef $self->{'currnet_filename'};
153	undef $self->{'headline'};
154
155	my @sorted_tags;
156	if ($self->{tags_by_order}) {
157	@sorted_tags = @{$self->{tags_by_order}};
158	} else {
159	@sorted_tags = sort { $self->_sort_by_order } keys %{$self->{'import_xml'}->{'indexer'}};
160	$self->{tags_by_order} = \@sorted_tags;
161	}
162
163	my $ds;
164
165	$log->debug("tags: ",sub { join(", ",@sorted_tags) });
166
167	foreach my $field (@sorted_tags) {
168
169	my $row;
170
171	#print "field $field [",$self->{'tag'},"] = ",Dumper($self->{'import_xml'}->{'indexer'}->{$field}->{$self->{'tag'}});
172
173	foreach my $tag (@{$self->{'import_xml'}->{'indexer'}->{$field}->{$self->{'tag'}}}) {
174	my $format;
175
176	$log->logdie("expected tag HASH and got $tag") unless (ref($tag) eq 'HASH');
177	$format = $tag->{'value'} \|\| $tag->{'content'};
178
179	$log->debug("format: $format");
180
181	my @v;
182	if ($self->{'lookup_regex'} && $format =~ $self->{'lookup_regex'}) {
183	@v = $self->fill_in_to_arr($rec,$format);
184	} else {
185	@v = $self->parse_to_arr($rec,$format);
186	}
187	next if (! @v);
188
189	if ($tag->{'sort'}) {
190	@v = $self->sort_arr(@v);
191	}
192
193	# use format?
194	if ($tag->{'format_name'}) {
195	@v = map { $self->apply_format($tag->{'format_name'},$tag->{'format_delimiter'},$_) } @v;
196	}
197
198	# delimiter will join repeatable fields
199	if ($tag->{'delimiter'}) {
200	@v = ( join($tag->{'delimiter'}, @v) );
201	}
202
203	# default types
204	my @types = qw(display search);
205	# override by type attribute
206	@types = ( $tag->{'type'} ) if ($tag->{'type'});
207
208	foreach my $type (@types) {
209	# append to previous line?
210	$log->debug("type: $type ",sub { join(" ",@v) }, $row->{'append'} \|\| 'no append');
211	if ($tag->{'append'}) {
212
213	# I will delimit appended part with
214	# delimiter (or ,)
215	my $d = $tag->{'delimiter'};
216	# default delimiter
217	$d \|\|= " ";
218
219	my $last = pop @{$row->{$type}};
220	$d = "" if (! $last);
221	$last .= $d . join($d, @v);
222	push @{$row->{$type}}, $last;
223
224	} else {
225	push @{$row->{$type}}, @v;
226	}
227	}
228
229
230	}
231
232	if ($row) {
233	$row->{'tag'} = $field;
234
235	# TODO: name_sigular, name_plural
236	my $name = $self->{'import_xml'}->{'indexer'}->{$field}->{'name'};
237	my $row_name = $name ? $self->_x($name) : $field;
238
239	# post-sort all values in field
240	if ($self->{'import_xml'}->{'indexer'}->{$field}->{'sort'}) {
241	$log->warn("sort at field tag not implemented");
242	}
243
244	$ds->{$row_name} = $row;
245
246	$log->debug("row $field: ",sub { Dumper($row) });
247	}
248
249	}
250
251	$self->{'db'}->save_ds(
252	id => $mfn,
253	ds => $ds,
254	) if ($self->{'db'});
255
256	$log->debug("ds: ", sub { Dumper($ds) });
257
258	$log->logconfess("data structure returned is not array any more!") if wantarray;
259
260	return $ds;
261
262	}
263
264	=head2 parse
265
266	Perform smart parsing of string, skipping delimiters for fields which aren't
267	defined. It can also eval code in format starting with C<eval{...}> and
268	return output or nothing depending on eval code.
269
270	my $text = $webpac->parse($rec,'eval{"v901^a" eq "Deskriptor"}descriptor: v250^a', $i);
271
272	=cut
273
274	sub parse {
275	my $self = shift;
276
277	my ($rec, $format_utf8, $i) = @_;
278
279	return if (! $format_utf8);
280
281	my $log = $self->_get_logger();
282
283	$log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
284
285	$i = 0 if (! $i);
286
287	my $format = $self->_x($format_utf8) \|\| $log->logconfess("can't convert '$format_utf8' from UTF-8 to ",$self->{'code_page'});
288
289	my @out;
290
291	$log->debug("format: $format");
292
293	my $eval_code;
294	# remove eval{...} from beginning
295	$eval_code = $1 if ($format =~ s/^eval{([^}]+)}//s);
296
297	my $filter_name;
298	# remove filter{...} from beginning
299	$filter_name = $1 if ($format =~ s/^filter{([^}]+)}//s);
300
301	my $prefix;
302	my $all_found=0;
303
304	while ($format =~ s/^(.*?)(v\|s)(\d+)(?:\^(\w))?//s) {
305
306	my $del = $1 \|\| '';
307	$prefix \|\|= $del if ($all_found == 0);
308
309	# repeatable index
310	my $r = $i;
311	$r = 0 if (lc("$2") eq 's');
312
313	my $found = 0;
314	my $tmp = $self->get_data(\$rec,$3,$4,$r,\$found);
315
316	if ($found) {
317	push @out, $del;
318	push @out, $tmp;
319	$all_found += $found;
320	}
321	}
322
323	return if (! $all_found);
324
325	my $out = join('',@out);
326
327	if ($out) {
328	# add rest of format (suffix)
329	$out .= $format;
330
331	# add prefix if not there
332	$out = $prefix . $out if ($out !~ m/^\Q$prefix\E/);
333
334	$log->debug("result: $out");
335	}
336
337	if ($eval_code) {
338	my $eval = $self->fill_in($rec,$eval_code,$i) \|\| return;
339	$log->debug("about to eval{$eval} format: $out");
340	return if (! $self->_eval($eval));
341	}
342
343	if ($filter_name && $self->{'filter'}->{$filter_name}) {
344	$log->debug("about to filter{$filter_name} format: $out");
345	$out = $self->{'filter'}->{$filter_name}->($out);
346	return unless(defined($out));
347	$log->debug("filter result: $out");
348	}
349
350	return $out;
351	}
352
353	=head2 parse_to_arr
354
355	Similar to C<parse>, but returns array of all repeatable fields
356
357	my @arr = $webpac->parse_to_arr($rec,'v250^a');
358
359	=cut
360
361	sub parse_to_arr {
362	my $self = shift;
363
364	my ($rec, $format_utf8) = @_;
365
366	my $log = $self->_get_logger();
367
368	$log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
369	return if (! $format_utf8);
370
371	my $i = 0;
372	my @arr;
373
374	while (my $v = $self->parse($rec,$format_utf8,$i++)) {
375	push @arr, $v;
376	}
377
378	$log->debug("format '$format_utf8' returned ",--$i," elements: ", sub { join(" \| ",@arr) }) if (@arr);
379
380	return @arr;
381	}
382
383
384	=head2 fill_in
385
386	Workhourse of all: takes record from in-memory structure of database and
387	strings with placeholders and returns string or array of with substituted
388	values from record.
389
390	my $text = $webpac->fill_in($rec,'v250^a');
391
392	Optional argument is ordinal number for repeatable fields. By default,
393	it's assume to be first repeatable field (fields are perl array, so first
394	element is 0).
395	Following example will read second value from repeatable field.
396
397	my $text = $webpac->fill_in($rec,'Title: v250^a',1);
398
399	This function B<does not> perform parsing of format to inteligenty skip
400	delimiters before fields which aren't used.
401
402	This method will automatically decode UTF-8 string to local code page
403	if needed.
404
405	=cut
406
407	sub fill_in {
408	my $self = shift;
409
410	my $log = $self->_get_logger();
411
412	my $rec = shift \|\| $log->logconfess("need data record");
413	my $format = shift \|\| $log->logconfess("need format to parse");
414	# iteration (for repeatable fields)
415	my $i = shift \|\| 0;
416
417	$log->logdie("infitite loop in format $format") if ($i > ($self->{'max_mfn'} \|\| 9999));
418
419	# FIXME remove for speedup?
420	$log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
421
422	if (utf8::is_utf8($format)) {
423	$format = $self->_x($format);
424	}
425
426	my $found = 0;
427
428	my $eval_code;
429	# remove eval{...} from beginning
430	$eval_code = $1 if ($format =~ s/^eval{([^}]+)}//s);
431
432	my $filter_name;
433	# remove filter{...} from beginning
434	$filter_name = $1 if ($format =~ s/^filter{([^}]+)}//s);
435
436	# do actual replacement of placeholders
437	# repeatable fields
438	$format =~ s/v(\d+)(?:\^(\w))?/$self->get_data(\$rec,$1,$2,$i,\$found)/ges;
439	# non-repeatable fields
440	$format =~ s/s(\d+)(?:\^(\w))?/$self->get_data(\$rec,$1,$2,0,\$found)/ges;
441
442	if ($found) {
443	$log->debug("format: $format");
444	if ($eval_code) {
445	my $eval = $self->fill_in($rec,$eval_code,$i);
446	return if (! $self->_eval($eval));
447	}
448	if ($filter_name && $self->{'filter'}->{$filter_name}) {
449	$log->debug("filter '$filter_name' for $format");
450	$format = $self->{'filter'}->{$filter_name}->($format);
451	return unless(defined($format));
452	$log->debug("filter result: $format");
453	}
454	# do we have lookups?
455	if ($self->{'lookup'}) {
456	if ($self->{'lookup'}->can('lookup')) {
457	return $self->{'lookup'}->lookup($format);
458	} else {
459	$log->warn("Have lookup object but can't invoke lookup method");
460	}
461	} else {
462	return $format;
463	}
464	} else {
465	return;
466	}
467	}
468
469
470	=head2 fill_in_to_arr
471
472	Similar to C<fill_in>, but returns array of all repeatable fields. Usable
473	for fields which have lookups, so they shouldn't be parsed but rather
474	C<fill_id>ed.
475
476	my @arr = $webpac->fill_in_to_arr($rec,'[v900];;[v250^a]');
477
478	=cut
479
480	sub fill_in_to_arr {
481	my $self = shift;
482
483	my ($rec, $format_utf8) = @_;
484
485	my $log = $self->_get_logger();
486
487	$log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
488	return if (! $format_utf8);
489
490	my $i = 0;
491	my @arr;
492
493	while (my @v = $self->fill_in($rec,$format_utf8,$i++)) {
494	push @arr, @v;
495	}
496
497	$log->debug("format '$format_utf8' returned ",--$i," elements: ", sub { join(" \| ",@arr) }) if (@arr);
498
499	return @arr;
500	}
501
502
503	=head2 get_data
504
505	Returns value from record.
506
507	my $text = $self->get_data(\$rec,$f,$sf,$i,\$found);
508
509	Arguments are:
510	record reference C<$rec>,
511	field C<$f>,
512	optional subfiled C<$sf>,
513	index for repeatable values C<$i>.
514
515	Optinal variable C<$found> will be incremeted if there
516	is field.
517
518	Returns value or empty string.
519
520	=cut
521
522	sub get_data {
523	my $self = shift;
524
525	my ($rec,$f,$sf,$i,$found) = @_;
526
527	if ($$rec->{$f}) {
528	return '' if (! $$rec->{$f}->[$i]);
529	no strict 'refs';
530	if ($sf && $$rec->{$f}->[$i]->{$sf}) {
531	$$found++ if (defined($$found));
532	return $$rec->{$f}->[$i]->{$sf};
533	} elsif (! $sf && $$rec->{$f}->[$i]) {
534	$$found++ if (defined($$found));
535	# it still might have subfield, just
536	# not specified, so we'll dump all
537	if ($$rec->{$f}->[$i] =~ /HASH/o) {
538	my $out;
539	foreach my $k (keys %{$$rec->{$f}->[$i]}) {
540	$out .= $$rec->{$f}->[$i]->{$k}." ";
541	}
542	return $out;
543	} else {
544	return $$rec->{$f}->[$i];
545	}
546	} else {
547	return '';
548	}
549	} else {
550	return '';
551	}
552	}
553
554
555	=head2 apply_format
556
557	Apply format specified in tag with C<format_name="name"> and
558	C<format_delimiter=";;">.
559
560	my $text = $webpac->apply_format($format_name,$format_delimiter,$data);
561
562	Formats can contain C<lookup{...}> if you need them.
563
564	=cut
565
566	sub apply_format {
567	my $self = shift;
568
569	my ($name,$delimiter,$data) = @_;
570
571	my $log = $self->_get_logger();
572
573	if (! $self->{'import_xml'}->{'format'}->{$name}) {
574	$log->warn("<format name=\"$name\"> is not defined in ",$self->{'import_xml_file'});
575	return $data;
576	}
577
578	$log->warn("no delimiter for format $name") if (! $delimiter);
579
580	my $format = $self->_x($self->{'import_xml'}->{'format'}->{$name}->{'content'}) \|\| $log->logdie("can't find format '$name'");
581
582	my @data = split(/\Q$delimiter\E/, $data);
583
584	my $out = sprintf($format, @data);
585	$log->debug("using format $name [$format] on $data to produce: $out");
586
587	if ($self->{'lookup_regex'} && $out =~ $self->{'lookup_regex'}) {
588	return $self->{'lookup'}->lookup($out);
589	} else {
590	return $out;
591	}
592
593	}
594
595	=head2 sort_arr
596
597	Sort array ignoring case and html in data
598
599	my @sorted = $webpac->sort_arr(@unsorted);
600
601	=cut
602
603	sub sort_arr {
604	my $self = shift;
605
606	my $log = $self->_get_logger();
607
608	# FIXME add Schwartzian Transformation?
609
610	my @sorted = sort {
611	$a =~ s#<[^>]+/*>##;
612	$b =~ s#<[^>]+/*>##;
613	lc($b) cmp lc($a)
614	} @_;
615	$log->debug("sorted values: ",sub { join(", ",@sorted) });
616
617	return @sorted;
618	}
619
620
621	=head1 INTERNAL METHODS
622
623	=head2 _sort_by_order
624
625	Sort xml tags data structure accoding to C<order=""> attribute.
626
627	=cut
628
629	sub _sort_by_order {
630	my $self = shift;
631
632	my $va = $self->{'import_xml'}->{'indexer'}->{$a}->{'order'} \|\|
633	$self->{'import_xml'}->{'indexer'}->{$a};
634	my $vb = $self->{'import_xml'}->{'indexer'}->{$b}->{'order'} \|\|
635	$self->{'import_xml'}->{'indexer'}->{$b};
636
637	return $va <=> $vb;
638	}
639
640	=head2 _x
641
642	Convert strings from C<conf/normalize/*.xml> encoding into application
643	specific encoding (optinally specified using C<code_page> to C<new>
644	constructor).
645
646	my $text = $n->_x('normalize text string');
647
648	This is a stub so that other modules doesn't have to implement it.
649
650	=cut
651
652	sub _x {
653	my $self = shift;
654	return shift;
655	}
656
657
658	=head1 AUTHOR
659
660	Dobrica Pavlinusic, C<< <dpavlin@rot13.org> >>
661
662	=head1 COPYRIGHT & LICENSE
663
664	Copyright 2005 Dobrica Pavlinusic, All Rights Reserved.
665
666	This program is free software; you can redistribute it and/or modify it
667	under the same terms as Perl itself.
668
669	=cut
670
671	1; # End of WebPAC::Normalize