lib/WebPAC/Normalize.pm

package WebPAC::Normalize;

use warnings;
use strict;
use base 'WebPAC::Common';
use Data::Dumper;

=head1 NAME

WebPAC::Normalize - data munging for normalisation

=head1 VERSION

Version 0.02

=cut

our $VERSION = '0.02';

=head1 SYNOPSIS

This package contains code that munges data to produce a normalized format.

It contains several assumptions:

=over

=item *

format of fields is defined using C<v123^a> notation for repeatable fields
or C<s123^a> for single (or first) value, where C<123> is field number and
C<a> is subfield.

=item *

source data records (C<$rec>) have unique identifiers in field C<000>

=item *

optional C<eval{length('v123^a') == 3}> tag at B<beginning of format> will be
perl code that is evaluated before producing output (value of field will be
interpolated before that)

=item *

optional C<filter{filter_name}> at B<beginning of format> will apply perl
code defined as code ref on format after field substitution to produce
output

=item *

optional C<lookup{...}> will then be performed. See C<WebPAC::Lookup>.

=item *

at end, optional C<format>s rules are resolved. Format rules are similar to
C<sprintf> and can also contain C<lookup{...}> which is performed after
values are inserted in format.

=back

This also describes the order in which transformations are applied (eval,
filter, lookup, format), which is important to understand when deciding how
to solve your data munging and normalisation process.


=head1 FUNCTIONS

=head2 new

Create new normalisation object

  my $n = new WebPAC::Normalize::Something(
        filter => {
                'filter_name_1' => sub {
                        # filter code
                        return length($_);
                }, ...
        },
        db => $db_obj,
        lookup_regex => $lookup->regex,
        lookup => $lookup_obj,
  );

Parameter C<filter> defines user supplied snippets of perl code which can
be used with C<filter{...}> notation.

Recommended parameter C<lookup_regex> is used to enable parsing of lookups
in structures. If you pass this parameter, you must also pass C<lookup>
which is C<WebPAC::Lookup> object.

=cut

sub new {
        my $class = shift;
        my $self = {@_};
        bless($self, $class);

        my $log = $self->_get_logger();

        # lookup_regex and lookup are only useful together, so either both
        # or none of them must be supplied
        my $have_regex  = $self->{'lookup_regex'} ? 1 : 0;
        my $have_lookup = $self->{'lookup'} ? 1 : 0;
        $log->logdie("lookup_regex and lookup must be in pair") if ($have_regex != $have_lookup);

        if ($self->{'lookup'}) {
                # eval protects against lookup which isn't an object at all
                # (e.g. plain hash ref): calling ->isa on it would die with
                # unrelated perl error instead of our message
                my $is_lookup = do {
                        local $@;
                        eval { $self->{'lookup'}->isa('WebPAC::Lookup') };
                };
                $log->logdie("lookup must be WebPAC::Lookup object") if (! $is_lookup);
        }

        return $self;
}


=head2 data_structure

Create in-memory data structure which represents normalized layout from
C<conf/normalize/*.xml>.

These structures are used to produce output.

 my $ds = $webpac->data_structure($rec);

B<Note: historical oddity follows>

This method will also set C<< $webpac->{'current_filename'} >> if there is
C<< <filename> >> tag and C<< $webpac->{'headline'} >> if there is
C<< <headline> >> tag.

=cut

sub data_structure {
        my $self = shift;

        my $log = $self->_get_logger();

        my $rec = shift;
        $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);

        # configured db object is used as cache of already built structures
        if ($self->{'db'}) {
                my $ds = $self->{'db'}->load_ds($rec);
                $log->debug("load_ds( rec = ", sub { Dumper($rec) }, ") = ", sub { Dumper($ds) });
                return $ds if ($ds);
                $log->debug("cache miss, creating");
        }

        # forget state left from previous record. Without this, a record
        # which doesn't produce filename would silently reuse filename of
        # previous record and check at the end could never fail.
        undef $self->{'current_filename'};
        undef $self->{'headline'};

        # order of tags depends only on configuration, so sort them once
        # and keep result in object
        my @sorted_tags;
        if ($self->{tags_by_order}) {
                @sorted_tags = @{$self->{tags_by_order}};
        } else {
                @sorted_tags = sort { $self->_sort_by_order } keys %{$self->{'import_xml'}->{'indexer'}};
                $self->{tags_by_order} = \@sorted_tags;
        }

        my $ds;

        $log->debug("tags: ",sub { join(", ",@sorted_tags) });

        foreach my $field (@sorted_tags) {

                # values collected for this field, keyed by type
                my $row;

#print "field $field [",$self->{'tag'},"] = ",Dumper($self->{'import_xml'}->{'indexer'}->{$field}->{$self->{'tag'}});

                foreach my $tag (@{$self->{'import_xml'}->{'indexer'}->{$field}->{$self->{'tag'}}}) {
                        my $format;

                        $log->logdie("expected tag HASH and got $tag") unless (ref($tag) eq 'HASH');
                        $format = $tag->{'value'} || $tag->{'content'};

                        $log->debug("format: $format");

                        # formats with lookups must not be parsed, just filled in
                        my @v;
                        if ($self->{'lookup_regex'} && $format =~ $self->{'lookup_regex'}) {
                                @v = $self->fill_in_to_arr($rec,$format);
                        } else {
                                @v = $self->parse_to_arr($rec,$format);
                        }
                        next if (! @v);

                        if ($tag->{'sort'}) {
                                @v = $self->sort_arr(@v);
                        }

                        # use format?
                        if ($tag->{'format_name'}) {
                                @v = map { $self->apply_format($tag->{'format_name'},$tag->{'format_delimiter'},$_) } @v;
                        }

                        if ($field eq 'filename') {
                                $self->{'current_filename'} = join('',@v);
                                $log->debug("filename: ",$self->{'current_filename'});
                        } elsif ($field eq 'headline') {
                                $self->{'headline'} .= join('',@v);
                                $log->debug("headline: ",$self->{'headline'});
                                next; # don't return headline in data_structure!
                        }

                        # delimiter will join repeatable fields
                        if ($tag->{'delimiter'}) {
                                @v = ( join($tag->{'delimiter'}, @v) );
                        }

                        # default types
                        my @types = qw(display search);
                        # override by type attribute
                        @types = ( $tag->{'type'} ) if ($tag->{'type'});

                        foreach my $type (@types) {
                                # append to previous line?
                                $log->debug("type: $type ",sub { join(" ",@v) }, $tag->{'append'} ? 'append' : 'no append');
                                if ($tag->{'append'}) {

                                        # I will delimit appended part with
                                        # delimiter (or single space)
                                        my $d = $tag->{'delimiter'};
                                        # default delimiter
                                        $d ||= " ";

                                        # nothing to append to: don't start
                                        # value with delimiter
                                        my $last = pop @{$row->{$type}};
                                        $d = "" if (! $last);
                                        $last .= $d . join($d, @v);
                                        push @{$row->{$type}}, $last;

                                } else {
                                        push @{$row->{$type}}, @v;
                                }
                        }


                }

                if ($row) {
                        $row->{'tag'} = $field;

                        # TODO: name_sigular, name_plural
                        my $name = $self->{'import_xml'}->{'indexer'}->{$field}->{'name'};
                        my $row_name = $name ? $self->_x($name) : $field;

                        # post-sort all values in field
                        if ($self->{'import_xml'}->{'indexer'}->{$field}->{'sort'}) {
                                $log->warn("sort at field tag not implemented");
                        }

                        $ds->{$row_name} = $row;

                        $log->debug("row $field: ",sub { Dumper($row) });
                }

        }

        $log->logdie("there is no current_filename defined! Do you have filename tag in conf/normalize/?.xml") unless ($self->{'current_filename'});

        $self->{'db'}->save_ds(
                ds => $ds,
                current_filename => $self->{'current_filename'},
                headline => $self->{'headline'},
        ) if ($self->{'db'});

        $log->debug("ds: ", sub { Dumper($ds) });

        # result is hash ref, catch callers which still expect list
        $log->logconfess("data structure returned is not array any more!") if wantarray;

        return $ds;

}

=head2 parse

Perform smart parsing of string, skipping delimiters for fields which aren't
defined. It can also eval code in format starting with C<eval{...}> and
return output or nothing depending on eval code.

 my $text = $webpac->parse($rec,'eval{"v901^a" eq "Deskriptor"}descriptor: v250^a', $i);

=cut

sub parse {
        my $self = shift;

        my ($rec, $format_utf8, $i) = @_;

        return if (! $format_utf8);

        my $log = $self->_get_logger();

        $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);

        $i = 0 if (! $i);

        # format which contains non-repeatable (s123) field produces output
        # for every index, so callers which iterate until nothing is
        # returned would never stop. Use same upper bound as fill_in.
        $log->logdie("infinite loop in format $format_utf8") if ($i > ($self->{'max_mfn'} || 9999));

        my $format = $self->_x($format_utf8) || $log->logconfess("can't convert '$format_utf8' from UTF-8 to ",$self->{'code_page'});

        my @out;

        $log->debug("format: $format");

        # literal braces are escaped: unescaped left brace in regex is
        # deprecated or fatal on newer perls
        my $eval_code;
        # remove eval{...} from beginning
        $eval_code = $1 if ($format =~ s/^eval\{([^}]+)\}//s);

        my $filter_name;
        # remove filter{...} from beginning
        $filter_name = $1 if ($format =~ s/^filter\{([^}]+)\}//s);

        # text before first field in format
        my $prefix;
        # number of fields which have value in this record
        my $all_found=0;

        # consume format from the left: each step takes delimiter text and
        # field placeholder which follows it
        while ($format =~ s/^(.*?)(v|s)(\d+)(?:\^(\w))?//s) {

                my $del = $1 || '';
                $prefix ||= $del if ($all_found == 0);

                # repeatable index
                my $r = $i;
                $r = 0 if (lc("$2") eq 's');

                my $found = 0;
                my $tmp = $self->get_data(\$rec,$3,$4,$r,\$found);

                # delimiter is emitted only together with field which exists
                if ($found) {
                        push @out, $del;
                        push @out, $tmp;
                        $all_found += $found;
                }
        }

        return if (! $all_found);

        my $out = join('',@out);

        if ($out) {
                # add rest of format (suffix)
                $out .= $format;

                # add prefix if not there
                $out = $prefix . $out if ($out !~ m/^\Q$prefix\E/);

                $log->debug("result: $out");
        }

        # eval code gets values of fields interpolated first, and record is
        # skipped if it doesn't evaluate to true
        if ($eval_code) {
                my $eval = $self->fill_in($rec,$eval_code,$i) || return;
                $log->debug("about to eval{$eval} format: $out");
                return if (! $self->_eval($eval));
        }
        
        if ($filter_name && $self->{'filter'}->{$filter_name}) {
                $log->debug("about to filter{$filter_name} format: $out");
                $out = $self->{'filter'}->{$filter_name}->($out);
                return unless(defined($out));
                $log->debug("filter result: $out");
        }

        return $out;
}

=head2 parse_to_arr

Similar to C<parse>, but returns array of all repeatable fields

 my @arr = $webpac->parse_to_arr($rec,'v250^a');

=cut

sub parse_to_arr {
        my ($self, $rec, $format_utf8) = @_;

        my $log = $self->_get_logger();

        if ($rec !~ /HASH/o) {
                $log->logconfess("need HASH as first argument!");
        }
        return unless ($format_utf8);

        # ask parse for occurrence 0, 1, 2... until it returns
        # something false
        my @values;
        for (my $idx = 0; ; $idx++) {
                my $value = $self->parse($rec, $format_utf8, $idx);
                last unless ($value);
                push @values, $value;
        }

        if (@values) {
                $log->debug("format '$format_utf8' returned ", scalar(@values), " elements: ", sub { join(" | ",@values) });
        }

        return @values;
}


=head2 fill_in

Workhorse of all: takes a record from the in-memory structure of the
database and a string with placeholders, and returns a string (or list)
with values from the record substituted.

 my $text = $webpac->fill_in($rec,'v250^a');

Optional argument is ordinal number for repeatable fields. By default,
it's assumed to be first repeatable field (fields are perl array, so first
element is 0).
Following example will read second value from repeatable field.

 my $text = $webpac->fill_in($rec,'Title: v250^a',1);

This function B<does not> perform parsing of format to intelligently skip
delimiters before fields which aren't used.

This method will automatically decode UTF-8 string to local code page
if needed.

=cut

sub fill_in {
        my $self = shift;

        my $log = $self->_get_logger();

        my $rec = shift || $log->logconfess("need data record");
        my $format = shift || $log->logconfess("need format to parse");
        # iteration (for repeatable fields)
        my $i = shift || 0;

        # formats with non-repeatable fields always find something, so
        # callers which iterate over index need upper bound
        $log->logdie("infitite loop in format $format") if ($i > ($self->{'max_mfn'} || 9999));

        # FIXME remove for speedup?
        $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);

        if (utf8::is_utf8($format)) {
                $format = $self->_x($format);
        }

        # incremented by get_data for each placeholder which has value
        my $found = 0;

        # literal braces are escaped: unescaped left brace in regex is
        # deprecated or fatal on newer perls
        my $eval_code;
        # remove eval{...} from beginning
        $eval_code = $1 if ($format =~ s/^eval\{([^}]+)\}//s);

        my $filter_name;
        # remove filter{...} from beginning
        $filter_name = $1 if ($format =~ s/^filter\{([^}]+)\}//s);

        # do actual replacement of placeholders: repeatable fields (v) use
        # index $i, non-repeatable (s) always use first value. This is done
        # in single pass so that data which was just inserted is never
        # scanned for placeholders again (value like "Windows95" would
        # otherwise lose "s95").
        $format =~ s/([vs])(\d+)(?:\^(\w))?/$self->get_data(\$rec,$2,$3,($1 eq 'v' ? $i : 0),\$found)/ges;

        # nothing in this record matched format
        return if (! $found);

        $log->debug("format: $format");

        if ($eval_code) {
                my $eval = $self->fill_in($rec,$eval_code,$i);
                return if (! $self->_eval($eval));
        }

        if ($filter_name && $self->{'filter'}->{$filter_name}) {
                $log->debug("filter '$filter_name' for $format");
                $format = $self->{'filter'}->{$filter_name}->($format);
                return unless(defined($format));
                $log->debug("filter result: $format");
        }

        # do we have lookups?
        if ($self->{'lookup'}) {
                return $self->{'lookup'}->lookup($format) if ($self->{'lookup'}->can('lookup'));
                $log->warn("Have lookup object but can't invoke lookup method");
        }

        # no (usable) lookup: return filled in format as is instead of
        # whatever last statement happened to return
        return $format;
}


=head2 fill_in_to_arr

Similar to C<fill_in>, but returns array of all repeatable fields. Usable
for fields which have lookups, so they shouldn't be parsed but rather
C<fill_in>ed.

 my @arr = $webpac->fill_in_to_arr($rec,'[v900];;[v250^a]');

=cut

sub fill_in_to_arr {
        my ($self, $rec, $format_utf8) = @_;

        my $log = $self->_get_logger();

        if ($rec !~ /HASH/o) {
                $log->logconfess("need HASH as first argument!");
        }
        return unless ($format_utf8);

        # call fill_in for occurrence 0, 1, 2... and collect everything
        # it returns until it comes back empty
        my @collected;
        my $rounds = 0;
        while (1) {
                my @chunk = $self->fill_in($rec, $format_utf8, $rounds);
                last unless (@chunk);
                push @collected, @chunk;
                $rounds++;
        }

        if (@collected) {
                $log->debug("format '$format_utf8' returned ",$rounds," elements: ", sub { join(" | ",@collected) });
        }

        return @collected;
}


=head2 get_data

Returns value from record.

 my $text = $self->get_data(\$rec,$f,$sf,$i,\$found);

Arguments are:
record reference C<$rec>,
field C<$f>,
optional subfield C<$sf>,
index for repeatable values C<$i>.

Optional variable C<$found> will be incremented if there
is field.

Returns value or empty string.

=cut

sub get_data {
        my $self = shift;

        my ($rec,$f,$sf,$i,$found) = @_;

        # no such field in record
        return '' if (! $$rec->{$f});

        # no such occurrence of field
        my $value = $$rec->{$f}->[$i];
        return '' if (! $value);

        # occurrence is either hash of subfields or plain value. Check for
        # real reference: plain string must never be dereferenced, because
        # that would read perl package variable named by data.
        my $has_subfields = (ref($value) eq 'HASH');

        my $out;
        if ($sf) {
                return '' if (! $has_subfields || ! $value->{$sf});
                $out = $value->{$sf};
        } elsif ($has_subfields) {
                # subfield isn't specified, so dump all of them (in sorted
                # order, so that result is same on each run), each followed
                # by space
                $out = join('', map { $value->{$_} . " " } sort keys %$value);
        } else {
                $out = $value;
        }

        # $found is optional, and counts only if caller initialized it
        $$found++ if (ref($found) && defined($$found));

        return $out;
}


=head2 apply_format

Apply format specified in tag with C<format_name="name"> and
C<format_delimiter=";;">.

 my $text = $webpac->apply_format($format_name,$format_delimiter,$data);

Formats can contain C<lookup{...}> if you need them.

=cut

sub apply_format {
        my $self = shift;

        my ($name,$delimiter,$data) = @_;

        my $log = $self->_get_logger();

        # unknown format: leave data as it is
        if (! $self->{'import_xml'}->{'format'}->{$name}) {
                $log->warn("<format name=\"$name\"> is not defined in ",$self->{'import_xml_file'});
                return $data;
        }

        $log->warn("no delimiter for format $name") if (! $delimiter);

        my $format = $self->_x($self->{'import_xml'}->{'format'}->{$name}->{'content'}) || $log->logdie("can't find format '$name'");

        # split on empty pattern would break data into single characters,
        # so without delimiter whole data is the only argument of format
        my @data = (defined($delimiter) && length($delimiter))
                ? split(/\Q$delimiter\E/, $data)
                : ($data);

        my $out = sprintf($format, @data);
        $log->debug("using format $name [$format] on $data to produce: $out");

        # formats can contain lookups, which are resolved after values are
        # inserted
        if ($self->{'lookup_regex'} && $out =~ $self->{'lookup_regex'}) {
                return $self->{'lookup'}->lookup($out);
        } else {
                return $out;
        }

}

=head2 sort_arr

Sort array ignoring case and html in data

 my @sorted = $webpac->sort_arr(@unsorted);

=cut

sub sort_arr {
        my $self = shift;

        my $log = $self->_get_logger();

        # Compute sort key (value without html tags, lower case) once for
        # each value. Values themselves are never modified: tags are
        # ignored only for comparison, and caller's data stays intact.
        my @keyed;
        foreach my $value (@_) {
                my $key = $value;
                $key =~ s#<[^>]+/*>##g;
                push @keyed, [ lc($key), $value ];
        }

        # order is descending, as it always was
        my @sorted = map { $_->[1] } sort { $b->[0] cmp $a->[0] } @keyed;
        $log->debug("sorted values: ",sub { join(", ",@sorted) });

        return @sorted;
}


=head1 INTERNAL METHODS

=head2 _sort_by_order

Sort xml tags data structure according to C<order=""> attribute.

=cut

sub _sort_by_order {
        my ($self) = @_;

        # called from within sort block, so tag names to compare are in
        # package variables $a and $b
        my $indexer = $self->{'import_xml'}->{'indexer'};

        # tag without (true) order attribute is compared using its
        # definition itself
        my $left = $indexer->{$a}->{'order'};
        $left = $indexer->{$a} unless ($left);

        my $right = $indexer->{$b}->{'order'};
        $right = $indexer->{$b} unless ($right);

        return $left <=> $right;
}

=head2 _x

Convert strings from C<conf/normalize/*.xml> encoding into application
specific encoding (optionally specified using C<code_page> to C<new>
constructor).

 my $text = $n->_x('normalize text string');

This is a stub so that other modules don't have to implement it.

=cut

sub _x {
        my ($self, $text) = @_;

        # stub: no conversion is done here, string is returned as it is
        return $text;
}


=head1 AUTHOR

Dobrica Pavlinusic, C<< <dpavlin@rot13.org> >>

=head1 COPYRIGHT & LICENSE

Copyright 2005 Dobrica Pavlinusic, All Rights Reserved.

This program is free software; you can redistribute it and/or modify it
under the same terms as Perl itself.

=cut

1; # End of WebPAC::DB
1	package WebPAC::Normalize;
2
3	use warnings;
4	use strict;
5	use base 'WebPAC::Common';
6	use Data::Dumper;
7
8	=head1 NAME
9
10	WebPAC::Normalize - data mungling for normalisation
11
12	=head1 VERSION
13
14	Version 0.02
15
16	=cut
17
18	our $VERSION = '0.02';
19
20	=head1 SYNOPSIS
21
22	This package contains code that mungle data to produce normalized format.
23
24	It contains several assumptions:
25
26	=over
27
28	=item *
29
30	format of fields is defined using C<v123^a> notation for repeatable fields
31	or C<s123^a> for single (or first) value, where C<123> is field number and
32	C<a> is subfield.
33
34	=item *
35
36	source data records (C<$rec>) have unique identifiers in field C<000>
37
38	=item *
39
40	optional C<eval{length('v123^a') == 3}> tag at B<beginning of format> will be
41	perl code that is evaluated before producing output (value of field will be
42	interpolated before that)
43
44	=item *
45
46	optional C<filter{filter_name}> at B<begining of format> will apply perl
47	code defined as code ref on format after field substitution to producing
48	output
49
50	=item *
51
52	optional C<lookup{...}> will be then performed. See C<WebPAC::Lookups>.
53
54	=item *
55
56	at end, optional C<format>s rules are resolved. Format rules are similar to
57	C<sprintf> and can also contain C<lookup{...}> which is performed after
58	values are inserted in format.
59
60	=back
61
62	This also describes order in which transformations are applied (eval,
63	filter, lookup, format) which is important to undestand when deciding how to
64	solve your data mungling and normalisation process.
65
66
67
68
69	=head1 FUNCTIONS
70
71	=head2 new
72
73	Create new normalisation object
74
75	my $n = new WebPAC::Normalize::Something(
76	filter => {
77	'filter_name_1' => sub {
78	# filter code
79	return length($_);
80	}, ...
81	},
82	db => $db_obj,
83	lookup_regex => $lookup->regex,
84	lookup => $lookup_obj,
85	);
86
87	Parametar C<filter> defines user supplied snippets of perl code which can
88	be use with C<filter{...}> notation.
89
90	Recommended parametar C<lookup_regex> is used to enable parsing of lookups
91	in structures. If you pass this parametar, you must also pass C<lookup>
92	which is C<WebPAC::Lookup> object.
93
94	=cut
95
96	sub new {
97	my $class = shift;
98	my $self = {@_};
99	bless($self, $class);
100
101	my $r = $self->{'lookup_regex'} ? 1 : 0;
102	my $l = $self->{'lookup'} ? 1 : 0;
103
104	my $log = $self->_get_logger();
105
106	# those two must be in pair
107	if ( ($r & $l) != ($r \|\| $l) ) {
108	my $log = $self->_get_logger();
109	$log->logdie("lookup_regex and lookup must be in pair");
110	}
111
112	$log->logdie("lookup must be WebPAC::Lookup object") if ($self->{'lookup'} && ! $self->{'lookup'}->isa('WebPAC::Lookup'));
113
114	$self ? return $self : return undef;
115	}
116
117
118	=head2 data_structure
119
120	Create in-memory data structure which represents normalized layout from
121	C<conf/normalize/*.xml>.
122
123	This structures are used to produce output.
124
125	my $ds = $webpac->data_structure($rec);
126
127	B<Note: historical oddity follows>
128
129	This method will also set C<< $webpac->{'currnet_filename'} >> if there is
130	C<< <filename> >> tag and C<< $webpac->{'headline'} >> if there is
131	C<< <headline> >> tag.
132
133	=cut
134
135	sub data_structure {
136	my $self = shift;
137
138	my $log = $self->_get_logger();
139
140	my $rec = shift;
141	$log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
142
143	my $cache_file;
144
145	if ($self->{'db'}) {
146	my $ds = $self->{'db'}->load_ds($rec);
147	$log->debug("load_ds( rec = ", sub { Dumper($rec) }, ") = ", sub { Dumper($ds) });
148	return $ds if ($ds);
149	$log->debug("cache miss, creating");
150	}
151
152	undef $self->{'currnet_filename'};
153	undef $self->{'headline'};
154
155	my @sorted_tags;
156	if ($self->{tags_by_order}) {
157	@sorted_tags = @{$self->{tags_by_order}};
158	} else {
159	@sorted_tags = sort { $self->_sort_by_order } keys %{$self->{'import_xml'}->{'indexer'}};
160	$self->{tags_by_order} = \@sorted_tags;
161	}
162
163	my $ds;
164
165	$log->debug("tags: ",sub { join(", ",@sorted_tags) });
166
167	foreach my $field (@sorted_tags) {
168
169	my $row;
170
171	#print "field $field [",$self->{'tag'},"] = ",Dumper($self->{'import_xml'}->{'indexer'}->{$field}->{$self->{'tag'}});
172
173	foreach my $tag (@{$self->{'import_xml'}->{'indexer'}->{$field}->{$self->{'tag'}}}) {
174	my $format;
175
176	$log->logdie("expected tag HASH and got $tag") unless (ref($tag) eq 'HASH');
177	$format = $tag->{'value'} \|\| $tag->{'content'};
178
179	$log->debug("format: $format");
180
181	my @v;
182	if ($self->{'lookup_regex'} && $format =~ $self->{'lookup_regex'}) {
183	@v = $self->fill_in_to_arr($rec,$format);
184	} else {
185	@v = $self->parse_to_arr($rec,$format);
186	}
187	next if (! @v);
188
189	if ($tag->{'sort'}) {
190	@v = $self->sort_arr(@v);
191	}
192
193	# use format?
194	if ($tag->{'format_name'}) {
195	@v = map { $self->apply_format($tag->{'format_name'},$tag->{'format_delimiter'},$_) } @v;
196	}
197
198	if ($field eq 'filename') {
199	$self->{'current_filename'} = join('',@v);
200	$log->debug("filename: ",$self->{'current_filename'});
201	} elsif ($field eq 'headline') {
202	$self->{'headline'} .= join('',@v);
203	$log->debug("headline: ",$self->{'headline'});
204	next; # don't return headline in data_structure!
205	}
206
207	# delimiter will join repeatable fields
208	if ($tag->{'delimiter'}) {
209	@v = ( join($tag->{'delimiter'}, @v) );
210	}
211
212	# default types
213	my @types = qw(display search);
214	# override by type attribute
215	@types = ( $tag->{'type'} ) if ($tag->{'type'});
216
217	foreach my $type (@types) {
218	# append to previous line?
219	$log->debug("type: $type ",sub { join(" ",@v) }, $row->{'append'} \|\| 'no append');
220	if ($tag->{'append'}) {
221
222	# I will delimit appended part with
223	# delimiter (or ,)
224	my $d = $tag->{'delimiter'};
225	# default delimiter
226	$d \|\|= " ";
227
228	my $last = pop @{$row->{$type}};
229	$d = "" if (! $last);
230	$last .= $d . join($d, @v);
231	push @{$row->{$type}}, $last;
232
233	} else {
234	push @{$row->{$type}}, @v;
235	}
236	}
237
238
239	}
240
241	if ($row) {
242	$row->{'tag'} = $field;
243
244	# TODO: name_sigular, name_plural
245	my $name = $self->{'import_xml'}->{'indexer'}->{$field}->{'name'};
246	my $row_name = $name ? $self->_x($name) : $field;
247
248	# post-sort all values in field
249	if ($self->{'import_xml'}->{'indexer'}->{$field}->{'sort'}) {
250	$log->warn("sort at field tag not implemented");
251	}
252
253	$ds->{$row_name} = $row;
254
255	$log->debug("row $field: ",sub { Dumper($row) });
256	}
257
258	}
259
260	$log->logdie("there is no current_filename defined! Do you have filename tag in conf/normalize/?.xml") unless ($self->{'current_filename'});
261
262	$self->{'db'}->save_ds(
263	ds => $ds,
264	current_filename => $self->{'current_filename'},
265	headline => $self->{'headline'},
266	) if ($self->{'db'});
267
268	$log->debug("ds: ", sub { Dumper($ds) });
269
270	$log->logconfess("data structure returned is not array any more!") if wantarray;
271
272	return $ds;
273
274	}
275
276	=head2 parse
277
278	Perform smart parsing of string, skipping delimiters for fields which aren't
279	defined. It can also eval code in format starting with C<eval{...}> and
280	return output or nothing depending on eval code.
281
282	my $text = $webpac->parse($rec,'eval{"v901^a" eq "Deskriptor"}descriptor: v250^a', $i);
283
284	=cut
285
286	sub parse {
287	my $self = shift;
288
289	my ($rec, $format_utf8, $i) = @_;
290
291	return if (! $format_utf8);
292
293	my $log = $self->_get_logger();
294
295	$log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
296
297	$i = 0 if (! $i);
298
299	my $format = $self->_x($format_utf8) \|\| $log->logconfess("can't convert '$format_utf8' from UTF-8 to ",$self->{'code_page'});
300
301	my @out;
302
303	$log->debug("format: $format");
304
305	my $eval_code;
306	# remove eval{...} from beginning
307	$eval_code = $1 if ($format =~ s/^eval{([^}]+)}//s);
308
309	my $filter_name;
310	# remove filter{...} from beginning
311	$filter_name = $1 if ($format =~ s/^filter{([^}]+)}//s);
312
313	my $prefix;
314	my $all_found=0;
315
316	while ($format =~ s/^(.*?)(v\|s)(\d+)(?:\^(\w))?//s) {
317
318	my $del = $1 \|\| '';
319	$prefix \|\|= $del if ($all_found == 0);
320
321	# repeatable index
322	my $r = $i;
323	$r = 0 if (lc("$2") eq 's');
324
325	my $found = 0;
326	my $tmp = $self->get_data(\$rec,$3,$4,$r,\$found);
327
328	if ($found) {
329	push @out, $del;
330	push @out, $tmp;
331	$all_found += $found;
332	}
333	}
334
335	return if (! $all_found);
336
337	my $out = join('',@out);
338
339	if ($out) {
340	# add rest of format (suffix)
341	$out .= $format;
342
343	# add prefix if not there
344	$out = $prefix . $out if ($out !~ m/^\Q$prefix\E/);
345
346	$log->debug("result: $out");
347	}
348
349	if ($eval_code) {
350	my $eval = $self->fill_in($rec,$eval_code,$i) \|\| return;
351	$log->debug("about to eval{$eval} format: $out");
352	return if (! $self->_eval($eval));
353	}
354
355	if ($filter_name && $self->{'filter'}->{$filter_name}) {
356	$log->debug("about to filter{$filter_name} format: $out");
357	$out = $self->{'filter'}->{$filter_name}->($out);
358	return unless(defined($out));
359	$log->debug("filter result: $out");
360	}
361
362	return $out;
363	}
364
365	=head2 parse_to_arr
366
367	Similar to C<parse>, but returns array of all repeatable fields
368
369	my @arr = $webpac->parse_to_arr($rec,'v250^a');
370
371	=cut
372
373	sub parse_to_arr {
374	my $self = shift;
375
376	my ($rec, $format_utf8) = @_;
377
378	my $log = $self->_get_logger();
379
380	$log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
381	return if (! $format_utf8);
382
383	my $i = 0;
384	my @arr;
385
386	while (my $v = $self->parse($rec,$format_utf8,$i++)) {
387	push @arr, $v;
388	}
389
390	$log->debug("format '$format_utf8' returned ",--$i," elements: ", sub { join(" \| ",@arr) }) if (@arr);
391
392	return @arr;
393	}
394
395
396	=head2 fill_in
397
398	Workhourse of all: takes record from in-memory structure of database and
399	strings with placeholders and returns string or array of with substituted
400	values from record.
401
402	my $text = $webpac->fill_in($rec,'v250^a');
403
404	Optional argument is ordinal number for repeatable fields. By default,
405	it's assume to be first repeatable field (fields are perl array, so first
406	element is 0).
407	Following example will read second value from repeatable field.
408
409	my $text = $webpac->fill_in($rec,'Title: v250^a',1);
410
411	This function B<does not> perform parsing of format to inteligenty skip
412	delimiters before fields which aren't used.
413
414	This method will automatically decode UTF-8 string to local code page
415	if needed.
416
417	=cut
418
# Substitute field placeholders in a format string with values from a record.
#
# Arguments:
#   $rec    - record (hash reference); required
#   $format - format string with vNNN^x (repeatable) and sNNN^x (single)
#             placeholders, optionally prefixed by eval{...} and/or
#             filter{...}; required
#   $i      - ordinal of the repeatable value to use (defaults to 0)
#
# Returns nothing (empty list / undef) when no placeholder produced a value,
# when the eval{...} expression evaluates to false, or when the filter
# returns undef. Otherwise returns the substituted text, passed through
# the lookup object when one is configured.
#
# Dies (through the logger) on missing arguments, on a non-HASH record and
# when $i exceeds max_mfn (default 9999), which is used as a guard against
# endless iteration.
sub fill_in {
	my $self = shift;

	my $log = $self->_get_logger();

	my $rec = shift || $log->logconfess("need data record");
	my $format = shift || $log->logconfess("need format to parse");
	# iteration (for repeatable fields)
	my $i = shift || 0;

	$log->logdie("infitite loop in format $format") if ($i > ($self->{'max_mfn'} || 9999));

	# FIXME remove for speedup?
	$log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);

	if (utf8::is_utf8($format)) {
		$format = $self->_x($format);
	}

	my $found = 0;

	# remove eval{...} from beginning
	# (literal braces are escaped: unescaped "{" in a pattern is deprecated)
	my $eval_code;
	$eval_code = $1 if ($format =~ s/^eval\{([^}]+)\}//s);

	# remove filter{...} from beginning
	my $filter_name;
	$filter_name = $1 if ($format =~ s/^filter\{([^}]+)\}//s);

	# do actual replacement of placeholders; get_data increments $found
	# for every placeholder which has a value
	# repeatable fields
	$format =~ s/v(\d+)(?:\^(\w))?/$self->get_data(\$rec,$1,$2,$i,\$found)/ges;
	# non-repeatable fields
	$format =~ s/s(\d+)(?:\^(\w))?/$self->get_data(\$rec,$1,$2,0,\$found)/ges;

	# nothing was substituted, so there is nothing to return
	return unless ($found);

	$log->debug("format: $format");

	if ($eval_code) {
		# eval code can contain placeholders itself, fill them first
		my $eval = $self->fill_in($rec,$eval_code,$i);
		return if (! $self->_eval($eval));
	}

	if ($filter_name && $self->{'filter'}->{$filter_name}) {
		$log->debug("filter '$filter_name' for $format");
		$format = $self->{'filter'}->{$filter_name}->($format);
		return unless(defined($format));
		$log->debug("filter result: $format");
	}

	# do we have lookups?
	my $lookup = $self->{'lookup'};
	return $format unless ($lookup);

	if ($lookup->can('lookup')) {
		return $lookup->lookup($format);
	}

	# Unusable lookup object: warn, but still hand back the substituted
	# text instead of leaking the logger's return value to the caller.
	$log->warn("Have lookup object but can't invoke lookup method");
	return $format;
}
480
481
482	=head2 fill_in_to_arr
483
Similar to C<fill_in>, but returns array of all repeatable fields. Usable
for fields which have lookups, so they shouldn't be parsed but rather
C<fill_in>ed.
487
488	my @arr = $webpac->fill_in_to_arr($rec,'[v900];;[v250^a]');
489
490	=cut
491
# Collect the results of fill_in for every repetition of the fields used
# in the format: calls fill_in with ordinals 0, 1, 2, ... and stops at the
# first ordinal for which fill_in returns nothing.
#
# Returns list of all collected values (empty when format is false).
sub fill_in_to_arr {
	my ($self, $rec, $format_utf8) = @_;

	my $log = $self->_get_logger();

	if ($rec !~ /HASH/o) {
		$log->logconfess("need HASH as first argument!");
	}
	return unless ($format_utf8);

	my @arr;
	my $i = 0;

	while (1) {
		my @v = $self->fill_in($rec, $format_utf8, $i);
		$i++;
		last unless (@v);
		push @arr, @v;
	}

	if (@arr) {
		# last call to fill_in returned nothing, so don't count it
		my $calls_with_data = $i - 1;
		$log->debug("format '$format_utf8' returned ", $calls_with_data, " elements: ", sub { join(" | ",@arr) });
	}

	return @arr;
}
513
514
515	=head2 get_data
516
517	Returns value from record.
518
519	my $text = $self->get_data(\$rec,$f,$sf,$i,\$found);
520
521	Arguments are:
522	record reference C<$rec>,
523	field C<$f>,
optional subfield C<$sf>,
525	index for repeatable values C<$i>.
526
Optional variable C<$found> (passed as a scalar reference) will be
incremented if the field is found.
529
530	Returns value or empty string.
531
532	=cut
533
# Fetch a single value from the record.
#
# Arguments:
#   $rec   - reference to the record (reference to a hash reference)
#   $f     - field number
#   $sf    - optional subfield; when omitted and the value has subfields,
#            all of them are returned, each followed by a space
#   $i     - index of the repetition within the field
#   $found - optional reference to a counter which is incremented when
#            a value is returned
#
# Returns the value, or empty string if there is none.
sub get_data {
	my $self = shift;

	my ($rec,$f,$sf,$i,$found) = @_;

	# no such field in record
	my $occurrences = $$rec->{$f} or return '';

	# no such repetition; test definedness and length instead of
	# truthiness so that a value of "0" is not mistaken for a missing one
	my $value = $occurrences->[$i];
	return '' unless (defined($value) && length($value));

	# subfield "0" is a valid subfield, so don't test $sf for truth
	my $want_subfield = (defined($sf) && length($sf));

	# ask perl for the type instead of matching the stringified value
	# against /HASH/, which also matches ordinary text containing "HASH"
	my $has_subfields = (ref($value) eq 'HASH');

	my $out;
	if ($want_subfield) {
		# plain value has no subfields to pick from
		return '' unless ($has_subfields);
		$out = $value->{$sf};
		return '' unless (defined($out) && length($out));
	} elsif ($has_subfields) {
		# it still might have subfield, just not specified, so we'll
		# dump all (sorted by subfield, to get repeatable output)
		$out = '';
		foreach my $k (sort keys %$value) {
			my $part = $value->{$k};
			$out .= (defined($part) ? $part : '') . " ";
		}
	} else {
		$out = $value;
	}

	# $found is optional, count only if caller gave us a defined counter
	$$found++ if (ref($found) && defined($$found));

	return $out;
}
565
566
567	=head2 apply_format
568
569	Apply format specified in tag with C<format_name="name"> and
570	C<format_delimiter=";;">.
571
572	my $text = $webpac->apply_format($format_name,$format_delimiter,$data);
573
574	Formats can contain C<lookup{...}> if you need them.
575
576	=cut
577
# Apply named sprintf-like format (from import_xml configuration) to data.
#
# Arguments:
#   $name      - name of format
#   $delimiter - string which separates values in $data
#   $data      - delimited values to insert into format
#
# Returns $data unchanged (with a warning) if format isn't defined.
# Otherwise returns formatted text, passed through the lookup object when
# the result matches lookup_regex and a usable lookup object exists.
# Dies (through the logger) if format is defined but has no content.
sub apply_format {
	my $self = shift;

	my ($name,$delimiter,$data) = @_;

	my $log = $self->_get_logger();

	if (! $self->{'import_xml'}->{'format'}->{$name}) {
		$log->warn("<format name=\"$name\"> is not defined in ",$self->{'import_xml_file'});
		return $data;
	}

	my $have_delimiter = (defined($delimiter) && length($delimiter));
	$log->warn("no delimiter for format $name") if (! $have_delimiter);

	my $format = $self->_x($self->{'import_xml'}->{'format'}->{$name}->{'content'}) || $log->logdie("can't find format '$name'");

	# Without delimiter split would get an empty pattern and chop data
	# into single characters, so use data as one value in that case.
	my @data = $have_delimiter ? split(/\Q$delimiter\E/, $data) : ($data);

	my $out = sprintf($format, @data);
	$log->debug("using format $name [$format] on $data to produce: $out");

	if ($self->{'lookup_regex'} && $out =~ $self->{'lookup_regex'}) {
		my $lookup = $self->{'lookup'};
		if ($lookup && $lookup->can('lookup')) {
			return $lookup->lookup($out);
		}
		# same fallback as fill_in: complain, but return what we have
		$log->warn("Have lookup_regex but can't invoke lookup method");
	}

	return $out;
}
606
607	=head2 sort_arr
608
609	Sort array ignoring case and html in data
610
611	my @sorted = $webpac->sort_arr(@unsorted);
612
613	=cut
614
# Sort values ignoring case and html tags.
#
# Arguments are values to sort. Returns the same values, unmodified, in
# descending order of their lower-cased text with html tags removed.
sub sort_arr {
	my $self = shift;

	my $log = $self->_get_logger();

	# Schwartzian transform: sort key is computed once for each value,
	# on a copy. Elements of @_ (and $a and $b within sort) are aliases
	# to caller's data, so substituting on them directly would modify
	# caller's values and die on read-only ones.
	my @sorted =
		map { $_->[1] }
		sort { $b->[0] cmp $a->[0] }
		map {
			my $key = $_;
			$key =~ s#<[^>]+/*>##g;
			[ lc($key), $_ ]
		} @_;
	$log->debug("sorted values: ",sub { join(", ",@sorted) });

	return @sorted;
}
631
632
633	=head1 INTERNAL METHODS
634
635	=head2 _sort_by_order
636
Sort xml tags data structure according to C<order=""> attribute.
638
639	=cut
640
# Comparison function for sort: compares the two indexer tags named by
# sort's $a and $b. Each tag is ranked by its 'order' value from the
# import_xml indexer configuration, falling back to the indexer entry
# itself when 'order' is false. Returns result of numeric comparison.
sub _sort_by_order {
	my $self = shift;

	my $indexer = ($self->{'import_xml'}->{'indexer'} ||= {});

	my @rank;
	foreach my $tag ($a, $b) {
		push @rank, ($indexer->{$tag}->{'order'} || $indexer->{$tag});
	}

	return $rank[0] <=> $rank[1];
}
651
652	=head2 _x
653
Convert strings from C<conf/normalize/*.xml> encoding into application
specific encoding (optionally specified using C<code_page> to C<new>
constructor).
657
658	my $text = $n->_x('normalize text string');
659
This is a stub so that other modules don't have to implement it.
661
662	=cut
663
# Encoding conversion stub: returns its argument unchanged.
sub _x {
	my ($self, $text) = @_;
	return $text;
}
668
669
670	=head1 AUTHOR
671
672	Dobrica Pavlinusic, C<< <dpavlin@rot13.org> >>
673
674	=head1 COPYRIGHT & LICENSE
675
676	Copyright 2005 Dobrica Pavlinusic, All Rights Reserved.
677
678	This program is free software; you can redistribute it and/or modify it
679	under the same terms as Perl itself.
680
681	=cut
682
1; # End of WebPAC::Normalize