lib/WebPAC/Normalize.pm

package WebPAC::Normalize;

use warnings;
use strict;
use base 'WebPAC::Common';
use Data::Dumper;

=head1 NAME

WebPAC::Normalize - data mungling for normalisation

=head1 VERSION

Version 0.05

=cut

our $VERSION = '0.05';

=head1 SYNOPSIS

This package contains code that mungle data to produce normalized format.

It contains several assumptions:

=over

=item *

format of fields is defined using C<v123^a> notation for repeatable fields
or C<s123^a> for single (or first) value, where C<123> is field number and
C<a> is subfield.

=item *

source data records (C<$rec>) have unique identifiers in field C<000>

=item *

optional C<eval{length('v123^a') == 3}> tag at B<beginning of format> will be
perl code that is evaluated before producing output (value of field will be
interpolated before that)

=item *

optional C<filter{filter_name}> at B<begining of format> will apply perl
code defined as code ref on format after field substitution to producing
output

=item *

optional C<lookup{...}> will be then performed. See C<WebPAC::Lookups>.

=item *

at end, optional C<format>s rules are resolved. Format rules are similar to
C<sprintf> and can also contain C<lookup{...}> which is performed after
values are inserted in format.

=back

This also describes order in which transformations are applied (eval,
filter, lookup, format) which is important to undestand when deciding how to
solve your data mungling and normalisation process.


=head1 FUNCTIONS

=head2 new

Create new normalisation object

  my $n = new WebPAC::Normalize::Something(
        filter => {
                'filter_name_1' => sub {
                        # filter code
                        return length($_);
                }, ...
        },
        db => $db_obj,
        lookup_regex => $lookup->regex,
        lookup => $lookup_obj,
        prefix => 'foobar',
  );

Parametar C<filter> defines user supplied snippets of perl code which can
be use with C<filter{...}> notation.

C<prefix> is used to form filename for database record (to support multiple
source files which are joined in one database).

Recommended parametar C<lookup_regex> is used to enable parsing of lookups
in structures. If you pass this parametar, you must also pass C<lookup>
which is C<WebPAC::Lookup> object.

=cut

sub new {
        my $class = shift;
        my $self = {@_};
        bless($self, $class);

        my $r = $self->{'lookup_regex'} ? 1 : 0;
        my $l = $self->{'lookup'} ? 1 : 0;

        my $log = $self->_get_logger();

        # those two must be in pair
        if ( ($r & $l) != ($r || $l) ) {
                my $log = $self->_get_logger();
                $log->logdie("lookup_regex and lookup must be in pair");
        }

        $log->logdie("lookup must be WebPAC::Lookup object") if ($self->{'lookup'} && ! $self->{'lookup'}->isa('WebPAC::Lookup'));

        $log->warn("no prefix defined. please check that!") unless ($self->{'prefix'});

        $log->debug("using lookup regex: ", $self->{lookup_regex}) if ($r && $l);

        $self ? return $self : return undef;
}


=head2 data_structure

Create in-memory data structure which represents normalized layout from
C<conf/normalize/*.xml>.

This structures are used to produce output.

 my $ds = $webpac->data_structure($rec);

=cut

sub data_structure {
        my $self = shift;

        my $log = $self->_get_logger();

        my $rec = shift;
        $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);

        $log->debug("data_structure rec = ", sub { Dumper($rec) });

        $log->logdie("need unique ID (mfn) in field 000 of record ", sub { Dumper($rec) } ) unless (defined($rec->{'000'}));

        my $id = $rec->{'000'}->[0] || $log->logdie("field 000 isn't array!");

        my $cache_file;

        if ($self->{'db'}) {
                my $ds = $self->{'db'}->load_ds( id => $id, prefix => $self->{prefix} );
                $log->debug("load_ds( rec = ", sub { Dumper($rec) }, ") = ", sub { Dumper($ds) });
                return $ds if ($ds);
                $log->debug("cache miss, creating");
        }

        undef $self->{'currnet_filename'};
        undef $self->{'headline'};

        my @sorted_tags;
        if ($self->{tags_by_order}) {
                @sorted_tags = @{$self->{tags_by_order}};
        } else {
                @sorted_tags = sort { $self->_sort_by_order } keys %{$self->{'import_xml'}->{'indexer'}};
                $self->{tags_by_order} = \@sorted_tags;
        }

        my $ds;

        $log->debug("tags: ",sub { join(", ",@sorted_tags) });

        foreach my $field (@sorted_tags) {

                my $row;

#print "field $field [",$self->{'tag'},"] = ",Dumper($self->{'import_xml'}->{'indexer'}->{$field}->{$self->{'tag'}});

                foreach my $tag (@{$self->{'import_xml'}->{'indexer'}->{$field}->{$self->{'tag'}}}) {
                        my $format;

                        $log->logdie("expected tag HASH and got $tag") unless (ref($tag) eq 'HASH');
                        $format = $tag->{'value'} || $tag->{'content'};

                        $log->debug("format: $format");

                        my @v;
                        if ($self->{'lookup_regex'} && $format =~ $self->{'lookup_regex'}) {
                                @v = $self->fill_in_to_arr($rec,$format);
                        } else {
                                @v = $self->parse_to_arr($rec,$format);
                        }
                        next if (! @v);

                        if ($tag->{'sort'}) {
                                @v = $self->sort_arr(@v);
                        }

                        # use format?
                        if ($tag->{'format_name'}) {
                                @v = map { $self->apply_format($tag->{'format_name'},$tag->{'format_delimiter'},$_) } @v;
                        }

                        # delimiter will join repeatable fields
                        if ($tag->{'delimiter'}) {
                                @v = ( join($tag->{'delimiter'}, @v) );
                        }

                        # default types 
                        my @types = qw(display search);
                        # override by type attribute
                        @types = ( $tag->{'type'} ) if ($tag->{'type'});

                        foreach my $type (@types) {
                                # append to previous line?
                                $log->debug("type: $type ",sub { join(" ",@v) }, " ", $row->{'append'} || 'no append');
                                if ($tag->{'append'}) {

                                        # I will delimit appended part with
                                        # delimiter (or ,)
                                        my $d = $tag->{'delimiter'};
                                        # default delimiter
                                        $d ||= " ";

                                        my $last = pop @{$row->{$type}};
                                        $d = "" if (! $last);
                                        $last .= $d . join($d, @v);
                                        push @{$row->{$type}}, $last;

                                } else {
                                        push @{$row->{$type}}, @v;
                                }
                        }


                }

                if ($row) {
                        $row->{'tag'} = $field;

                        # TODO: name_sigular, name_plural
                        my $name = $self->{'import_xml'}->{'indexer'}->{$field}->{'name'};
                        my $row_name = $name ? $self->_x($name) : $field;

                        # post-sort all values in field
                        if ($self->{'import_xml'}->{'indexer'}->{$field}->{'sort'}) {
                                $log->warn("sort at field tag not implemented");
                        }

                        $ds->{$row_name} = $row;

                        $log->debug("row $field: ",sub { Dumper($row) });
                }

        }

        $self->{'db'}->save_ds(
                id => $id,
                ds => $ds,
                prefix => $self->{prefix},
        ) if ($self->{'db'});

        $log->debug("ds: ", sub { Dumper($ds) });

        $log->logconfess("data structure returned is not array any more!") if wantarray;

        return $ds;

}

=head2 parse

Perform smart parsing of string, skipping delimiters for fields which aren't
defined. It can also eval code in format starting with C<eval{...}> and
return output or nothing depending on eval code.

 my $text = $webpac->parse($rec,'eval{"v901^a" eq "Deskriptor"}descriptor: v250^a', $i);

=cut

sub parse {
        my $self = shift;

        my ($rec, $format_utf8, $i) = @_;

        return if (! $format_utf8);

        my $log = $self->_get_logger();

        $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);

        $i = 0 if (! $i);

        my $format = $self->_x($format_utf8) || $log->logconfess("can't convert '$format_utf8' from UTF-8 to ",$self->{'code_page'});

        my @out;

        $log->debug("format: $format");

        my $eval_code;
        # remove eval{...} from beginning
        $eval_code = $1 if ($format =~ s/^eval{([^}]+)}//s);

        my $filter_name;
        # remove filter{...} from beginning
        $filter_name = $1 if ($format =~ s/^filter{([^}]+)}//s);

        my $prefix;
        my $all_found=0;

        while ($format =~ s/^(.*?)(v|s)(\d+)(?:\^(\w))?//s) {

                my $del = $1 || '';
                $prefix ||= $del if ($all_found == 0);

                # repeatable index
                my $r = $i;
                $r = 0 if (lc("$2") eq 's');

                my $found = 0;
                my $tmp = $self->get_data(\$rec,$3,$4,$r,\$found);

                if ($found) {
                        push @out, $del;
                        push @out, $tmp;
                        $all_found += $found;
                }
        }

        return if (! $all_found);

        my $out = join('',@out);

        if ($out) {
                # add rest of format (suffix)
                $out .= $format;

                # add prefix if not there
                $out = $prefix . $out if ($out !~ m/^\Q$prefix\E/);

                $log->debug("result: $out");
        }

        if ($eval_code) {
                my $eval = $self->fill_in($rec,$eval_code,$i) || return;
                $log->debug("about to eval{$eval} format: $out");
                return if (! $self->_eval($eval));
        }
        
        if ($filter_name && $self->{'filter'}->{$filter_name}) {
                $log->debug("about to filter{$filter_name} format: $out");
                $out = $self->{'filter'}->{$filter_name}->($out);
                return unless(defined($out));
                $log->debug("filter result: $out");
        }

        return $out;
}

=head2 parse_to_arr

Similar to C<parse>, but returns array of all repeatable fields

 my @arr = $webpac->parse_to_arr($rec,'v250^a');

=cut

sub parse_to_arr {
        my $self = shift;

        my ($rec, $format_utf8) = @_;

        my $log = $self->_get_logger();

        $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
        return if (! $format_utf8);

        my $i = 0;
        my @arr;

        while (my $v = $self->parse($rec,$format_utf8,$i++)) {
                push @arr, $v;
        }

        $log->debug("format '$format_utf8' returned ",--$i," elements: ", sub { join(" | ",@arr) }) if (@arr);

        return @arr;
}


=head2 fill_in

Workhourse of all: takes record from in-memory structure of database and
strings with placeholders and returns string or array of with substituted
values from record.

 my $text = $webpac->fill_in($rec,'v250^a');

Optional argument is ordinal number for repeatable fields. By default,
it's assume to be first repeatable field (fields are perl array, so first
element is 0).
Following example will read second value from repeatable field.

 my $text = $webpac->fill_in($rec,'Title: v250^a',1);

This function B<does not> perform parsing of format to inteligenty skip
delimiters before fields which aren't used.

This method will automatically decode UTF-8 string to local code page
if needed.

=cut

sub fill_in {
        my $self = shift;

        my $log = $self->_get_logger();

        my $rec = shift || $log->logconfess("need data record");
        my $format = shift || $log->logconfess("need format to parse");
        # iteration (for repeatable fields)
        my $i = shift || 0;

        $log->logdie("infitite loop in format $format") if ($i > ($self->{'max_mfn'} || 9999));

        # FIXME remove for speedup?
        $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);

        if (utf8::is_utf8($format)) {
                $format = $self->_x($format);
        }

        my $found = 0;

        my $eval_code;
        # remove eval{...} from beginning
        $eval_code = $1 if ($format =~ s/^eval{([^}]+)}//s);

        my $filter_name;
        # remove filter{...} from beginning
        $filter_name = $1 if ($format =~ s/^filter{([^}]+)}//s);

        # do actual replacement of placeholders
        # repeatable fields
        $format =~ s/v(\d+)(?:\^(\w))?/$self->get_data(\$rec,$1,$2,$i,\$found)/ges;
        # non-repeatable fields
        $format =~ s/s(\d+)(?:\^(\w))?/$self->get_data(\$rec,$1,$2,0,\$found)/ges;

        if ($found) {
                $log->debug("format: $format");
                if ($eval_code) {
                        my $eval = $self->fill_in($rec,$eval_code,$i);
                        return if (! $self->_eval($eval));
                }
                if ($filter_name && $self->{'filter'}->{$filter_name}) {
                        $log->debug("filter '$filter_name' for $format");
                        $format = $self->{'filter'}->{$filter_name}->($format);
                        return unless(defined($format));
                        $log->debug("filter result: $format");
                }
                # do we have lookups?
                if ($self->{'lookup'}) {
                        if ($self->{'lookup'}->can('lookup')) {
                                my @lookup = $self->{lookup}->lookup($format);
                                $log->debug("lookup $format", join(", ", @lookup));
                                return @lookup;
                        } else {
                                $log->warn("Have lookup object but can't invoke lookup method");
                        }
                } else {
                        return $format;
                }
        } else {
                return;
        }
}


=head2 fill_in_to_arr

Similar to C<fill_in>, but returns array of all repeatable fields. Usable
for fields which have lookups, so they shouldn't be parsed but rather
C<fill_id>ed.

 my @arr = $webpac->fill_in_to_arr($rec,'[v900];;[v250^a]');

=cut

sub fill_in_to_arr {
        my $self = shift;

        my ($rec, $format_utf8) = @_;

        my $log = $self->_get_logger();

        $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
        return if (! $format_utf8);

        my $i = 0;
        my @arr;

        while (my @v = $self->fill_in($rec,$format_utf8,$i++)) {
                push @arr, @v;
        }

        $log->debug("format '$format_utf8' returned ",--$i," elements: ", sub { join(" | ",@arr) }) if (@arr);

        return @arr;
}


=head2 get_data

Returns value from record.

 my $text = $self->get_data(\$rec,$f,$sf,$i,\$found);

Arguments are:
record reference C<$rec>,
field C<$f>,
optional subfiled C<$sf>,
index for repeatable values C<$i>.

Optinal variable C<$found> will be incremeted if there
is field.

Returns value or empty string.

=cut

sub get_data {
        my $self = shift;

        my ($rec,$f,$sf,$i,$found) = @_;

        if ($$rec->{$f}) {
                return '' if (! $$rec->{$f}->[$i]);
                no strict 'refs';
                if ($sf && $$rec->{$f}->[$i]->{$sf}) {
                        $$found++ if (defined($$found));
                        return $$rec->{$f}->[$i]->{$sf};
                } elsif (! $sf && $$rec->{$f}->[$i]) {
                        $$found++ if (defined($$found));
                        # it still might have subfield, just
                        # not specified, so we'll dump all
                        if ($$rec->{$f}->[$i] =~ /HASH/o) {
                                my $out;
                                foreach my $k (keys %{$$rec->{$f}->[$i]}) {
                                        $out .= $$rec->{$f}->[$i]->{$k}." ";
                                }
                                return $out;
                        } else {
                                return $$rec->{$f}->[$i];
                        }
                } else {
                        return '';
                }
        } else {
                return '';
        }
}


=head2 apply_format

Apply format specified in tag with C<format_name="name"> and
C<format_delimiter=";;">.

 my $text = $webpac->apply_format($format_name,$format_delimiter,$data);

Formats can contain C<lookup{...}> if you need them.

=cut

sub apply_format {
        my $self = shift;

        my ($name,$delimiter,$data) = @_;

        my $log = $self->_get_logger();

        if (! $self->{'import_xml'}->{'format'}->{$name}) {
                $log->warn("<format name=\"$name\"> is not defined in ",$self->{'import_xml_file'});
                return $data;
        }

        $log->warn("no delimiter for format $name") if (! $delimiter);

        my $format = $self->_x($self->{'import_xml'}->{'format'}->{$name}->{'content'}) || $log->logdie("can't find format '$name'");

        my @data = split(/\Q$delimiter\E/, $data);

        my $out = sprintf($format, @data);
        $log->debug("using format $name [$format] on $data to produce: $out");

        if ($self->{'lookup_regex'} && $out =~ $self->{'lookup_regex'}) {
                return $self->{'lookup'}->lookup($out);
        } else {
                return $out;
        }

}

=head2 sort_arr

Sort array ignoring case and html in data

 my @sorted = $webpac->sort_arr(@unsorted);

=cut

sub sort_arr {
        my $self = shift;

        my $log = $self->_get_logger();

        # FIXME add Schwartzian Transformation?

        my @sorted = sort {
                $a =~ s#<[^>]+/*>##;
                $b =~ s#<[^>]+/*>##;
                lc($b) cmp lc($a)
        } @_;
        $log->debug("sorted values: ",sub { join(", ",@sorted) });

        return @sorted;
}


=head1 INTERNAL METHODS

=head2 _sort_by_order

Sort xml tags data structure accoding to C<order=""> attribute.

=cut

sub _sort_by_order {
        my $self = shift;

        my $va = $self->{'import_xml'}->{'indexer'}->{$a}->{'order'} ||
                $self->{'import_xml'}->{'indexer'}->{$a};
        my $vb = $self->{'import_xml'}->{'indexer'}->{$b}->{'order'} ||
                $self->{'import_xml'}->{'indexer'}->{$b};

        return $va <=> $vb;
}

=head2 _x

Convert strings from C<conf/normalize/*.xml> encoding into application
specific encoding (optinally specified using C<code_page> to C<new>
constructor).

 my $text = $n->_x('normalize text string');

This is a stub so that other modules doesn't have to implement it.

=cut

sub _x {
        my $self = shift;
        return shift;
}


=head1 AUTHOR

Dobrica Pavlinusic, C<< <dpavlin@rot13.org> >>

=head1 COPYRIGHT & LICENSE

Copyright 2005 Dobrica Pavlinusic, All Rights Reserved.

This program is free software; you can redistribute it and/or modify it
under the same terms as Perl itself.

=cut

1; # End of WebPAC::Normalize
1	package WebPAC::Normalize;
2
3	use warnings;
4	use strict;
5	use base 'WebPAC::Common';
6	use Data::Dumper;
7
8	=head1 NAME
9
10	WebPAC::Normalize - data mungling for normalisation
11
12	=head1 VERSION
13
14	Version 0.05
15
16	=cut
17
18	our $VERSION = '0.05';
19
20	=head1 SYNOPSIS
21
22	This package contains code that mungle data to produce normalized format.
23
24	It contains several assumptions:
25
26	=over
27
28	=item *
29
30	format of fields is defined using C<v123^a> notation for repeatable fields
31	or C<s123^a> for single (or first) value, where C<123> is field number and
32	C<a> is subfield.
33
34	=item *
35
36	source data records (C<$rec>) have unique identifiers in field C<000>
37
38	=item *
39
40	optional C<eval{length('v123^a') == 3}> tag at B<beginning of format> will be
41	perl code that is evaluated before producing output (value of field will be
42	interpolated before that)
43
44	=item *
45
46	optional C<filter{filter_name}> at B<begining of format> will apply perl
47	code defined as code ref on format after field substitution to producing
48	output
49
50	=item *
51
52	optional C<lookup{...}> will be then performed. See C<WebPAC::Lookups>.
53
54	=item *
55
56	at end, optional C<format>s rules are resolved. Format rules are similar to
57	C<sprintf> and can also contain C<lookup{...}> which is performed after
58	values are inserted in format.
59
60	=back
61
62	This also describes order in which transformations are applied (eval,
63	filter, lookup, format) which is important to undestand when deciding how to
64	solve your data mungling and normalisation process.
65
66
67
68
69	=head1 FUNCTIONS
70
71	=head2 new
72
73	Create new normalisation object
74
75	my $n = new WebPAC::Normalize::Something(
76	filter => {
77	'filter_name_1' => sub {
78	# filter code
79	return length($_);
80	}, ...
81	},
82	db => $db_obj,
83	lookup_regex => $lookup->regex,
84	lookup => $lookup_obj,
85	prefix => 'foobar',
86	);
87
88	Parametar C<filter> defines user supplied snippets of perl code which can
89	be use with C<filter{...}> notation.
90
91	C<prefix> is used to form filename for database record (to support multiple
92	source files which are joined in one database).
93
94	Recommended parametar C<lookup_regex> is used to enable parsing of lookups
95	in structures. If you pass this parametar, you must also pass C<lookup>
96	which is C<WebPAC::Lookup> object.
97
98	=cut
99
100	sub new {
101	my $class = shift;
102	my $self = {@_};
103	bless($self, $class);
104
105	my $r = $self->{'lookup_regex'} ? 1 : 0;
106	my $l = $self->{'lookup'} ? 1 : 0;
107
108	my $log = $self->_get_logger();
109
110	# those two must be in pair
111	if ( ($r & $l) != ($r \|\| $l) ) {
112	my $log = $self->_get_logger();
113	$log->logdie("lookup_regex and lookup must be in pair");
114	}
115
116	$log->logdie("lookup must be WebPAC::Lookup object") if ($self->{'lookup'} && ! $self->{'lookup'}->isa('WebPAC::Lookup'));
117
118	$log->warn("no prefix defined. please check that!") unless ($self->{'prefix'});
119
120	$log->debug("using lookup regex: ", $self->{lookup_regex}) if ($r && $l);
121
122	$self ? return $self : return undef;
123	}
124
125
126	=head2 data_structure
127
128	Create in-memory data structure which represents normalized layout from
129	C<conf/normalize/*.xml>.
130
131	This structures are used to produce output.
132
133	my $ds = $webpac->data_structure($rec);
134
135	=cut
136
137	sub data_structure {
138	my $self = shift;
139
140	my $log = $self->_get_logger();
141
142	my $rec = shift;
143	$log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
144
145	$log->debug("data_structure rec = ", sub { Dumper($rec) });
146
147	$log->logdie("need unique ID (mfn) in field 000 of record ", sub { Dumper($rec) } ) unless (defined($rec->{'000'}));
148
149	my $id = $rec->{'000'}->[0] \|\| $log->logdie("field 000 isn't array!");
150
151	my $cache_file;
152
153	if ($self->{'db'}) {
154	my $ds = $self->{'db'}->load_ds( id => $id, prefix => $self->{prefix} );
155	$log->debug("load_ds( rec = ", sub { Dumper($rec) }, ") = ", sub { Dumper($ds) });
156	return $ds if ($ds);
157	$log->debug("cache miss, creating");
158	}
159
160	undef $self->{'currnet_filename'};
161	undef $self->{'headline'};
162
163	my @sorted_tags;
164	if ($self->{tags_by_order}) {
165	@sorted_tags = @{$self->{tags_by_order}};
166	} else {
167	@sorted_tags = sort { $self->_sort_by_order } keys %{$self->{'import_xml'}->{'indexer'}};
168	$self->{tags_by_order} = \@sorted_tags;
169	}
170
171	my $ds;
172
173	$log->debug("tags: ",sub { join(", ",@sorted_tags) });
174
175	foreach my $field (@sorted_tags) {
176
177	my $row;
178
179	#print "field $field [",$self->{'tag'},"] = ",Dumper($self->{'import_xml'}->{'indexer'}->{$field}->{$self->{'tag'}});
180
181	foreach my $tag (@{$self->{'import_xml'}->{'indexer'}->{$field}->{$self->{'tag'}}}) {
182	my $format;
183
184	$log->logdie("expected tag HASH and got $tag") unless (ref($tag) eq 'HASH');
185	$format = $tag->{'value'} \|\| $tag->{'content'};
186
187	$log->debug("format: $format");
188
189	my @v;
190	if ($self->{'lookup_regex'} && $format =~ $self->{'lookup_regex'}) {
191	@v = $self->fill_in_to_arr($rec,$format);
192	} else {
193	@v = $self->parse_to_arr($rec,$format);
194	}
195	next if (! @v);
196
197	if ($tag->{'sort'}) {
198	@v = $self->sort_arr(@v);
199	}
200
201	# use format?
202	if ($tag->{'format_name'}) {
203	@v = map { $self->apply_format($tag->{'format_name'},$tag->{'format_delimiter'},$_) } @v;
204	}
205
206	# delimiter will join repeatable fields
207	if ($tag->{'delimiter'}) {
208	@v = ( join($tag->{'delimiter'}, @v) );
209	}
210
211	# default types
212	my @types = qw(display search);
213	# override by type attribute
214	@types = ( $tag->{'type'} ) if ($tag->{'type'});
215
216	foreach my $type (@types) {
217	# append to previous line?
218	$log->debug("type: $type ",sub { join(" ",@v) }, " ", $row->{'append'} \|\| 'no append');
219	if ($tag->{'append'}) {
220
221	# I will delimit appended part with
222	# delimiter (or ,)
223	my $d = $tag->{'delimiter'};
224	# default delimiter
225	$d \|\|= " ";
226
227	my $last = pop @{$row->{$type}};
228	$d = "" if (! $last);
229	$last .= $d . join($d, @v);
230	push @{$row->{$type}}, $last;
231
232	} else {
233	push @{$row->{$type}}, @v;
234	}
235	}
236
237
238	}
239
240	if ($row) {
241	$row->{'tag'} = $field;
242
243	# TODO: name_sigular, name_plural
244	my $name = $self->{'import_xml'}->{'indexer'}->{$field}->{'name'};
245	my $row_name = $name ? $self->_x($name) : $field;
246
247	# post-sort all values in field
248	if ($self->{'import_xml'}->{'indexer'}->{$field}->{'sort'}) {
249	$log->warn("sort at field tag not implemented");
250	}
251
252	$ds->{$row_name} = $row;
253
254	$log->debug("row $field: ",sub { Dumper($row) });
255	}
256
257	}
258
259	$self->{'db'}->save_ds(
260	id => $id,
261	ds => $ds,
262	prefix => $self->{prefix},
263	) if ($self->{'db'});
264
265	$log->debug("ds: ", sub { Dumper($ds) });
266
267	$log->logconfess("data structure returned is not array any more!") if wantarray;
268
269	return $ds;
270
271	}
272
273	=head2 parse
274
275	Perform smart parsing of string, skipping delimiters for fields which aren't
276	defined. It can also eval code in format starting with C<eval{...}> and
277	return output or nothing depending on eval code.
278
279	my $text = $webpac->parse($rec,'eval{"v901^a" eq "Deskriptor"}descriptor: v250^a', $i);
280
281	=cut
282
283	sub parse {
284	my $self = shift;
285
286	my ($rec, $format_utf8, $i) = @_;
287
288	return if (! $format_utf8);
289
290	my $log = $self->_get_logger();
291
292	$log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
293
294	$i = 0 if (! $i);
295
296	my $format = $self->_x($format_utf8) \|\| $log->logconfess("can't convert '$format_utf8' from UTF-8 to ",$self->{'code_page'});
297
298	my @out;
299
300	$log->debug("format: $format");
301
302	my $eval_code;
303	# remove eval{...} from beginning
304	$eval_code = $1 if ($format =~ s/^eval{([^}]+)}//s);
305
306	my $filter_name;
307	# remove filter{...} from beginning
308	$filter_name = $1 if ($format =~ s/^filter{([^}]+)}//s);
309
310	my $prefix;
311	my $all_found=0;
312
313	while ($format =~ s/^(.*?)(v\|s)(\d+)(?:\^(\w))?//s) {
314
315	my $del = $1 \|\| '';
316	$prefix \|\|= $del if ($all_found == 0);
317
318	# repeatable index
319	my $r = $i;
320	$r = 0 if (lc("$2") eq 's');
321
322	my $found = 0;
323	my $tmp = $self->get_data(\$rec,$3,$4,$r,\$found);
324
325	if ($found) {
326	push @out, $del;
327	push @out, $tmp;
328	$all_found += $found;
329	}
330	}
331
332	return if (! $all_found);
333
334	my $out = join('',@out);
335
336	if ($out) {
337	# add rest of format (suffix)
338	$out .= $format;
339
340	# add prefix if not there
341	$out = $prefix . $out if ($out !~ m/^\Q$prefix\E/);
342
343	$log->debug("result: $out");
344	}
345
346	if ($eval_code) {
347	my $eval = $self->fill_in($rec,$eval_code,$i) \|\| return;
348	$log->debug("about to eval{$eval} format: $out");
349	return if (! $self->_eval($eval));
350	}
351
352	if ($filter_name && $self->{'filter'}->{$filter_name}) {
353	$log->debug("about to filter{$filter_name} format: $out");
354	$out = $self->{'filter'}->{$filter_name}->($out);
355	return unless(defined($out));
356	$log->debug("filter result: $out");
357	}
358
359	return $out;
360	}
361
362	=head2 parse_to_arr
363
364	Similar to C<parse>, but returns array of all repeatable fields
365
366	my @arr = $webpac->parse_to_arr($rec,'v250^a');
367
368	=cut
369
370	sub parse_to_arr {
371	my $self = shift;
372
373	my ($rec, $format_utf8) = @_;
374
375	my $log = $self->_get_logger();
376
377	$log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
378	return if (! $format_utf8);
379
380	my $i = 0;
381	my @arr;
382
383	while (my $v = $self->parse($rec,$format_utf8,$i++)) {
384	push @arr, $v;
385	}
386
387	$log->debug("format '$format_utf8' returned ",--$i," elements: ", sub { join(" \| ",@arr) }) if (@arr);
388
389	return @arr;
390	}
391
392
393	=head2 fill_in
394
395	Workhourse of all: takes record from in-memory structure of database and
396	strings with placeholders and returns string or array of with substituted
397	values from record.
398
399	my $text = $webpac->fill_in($rec,'v250^a');
400
401	Optional argument is ordinal number for repeatable fields. By default,
402	it's assume to be first repeatable field (fields are perl array, so first
403	element is 0).
404	Following example will read second value from repeatable field.
405
406	my $text = $webpac->fill_in($rec,'Title: v250^a',1);
407
408	This function B<does not> perform parsing of format to inteligenty skip
409	delimiters before fields which aren't used.
410
411	This method will automatically decode UTF-8 string to local code page
412	if needed.
413
414	=cut
415
416	sub fill_in {
417	my $self = shift;
418
419	my $log = $self->_get_logger();
420
421	my $rec = shift \|\| $log->logconfess("need data record");
422	my $format = shift \|\| $log->logconfess("need format to parse");
423	# iteration (for repeatable fields)
424	my $i = shift \|\| 0;
425
426	$log->logdie("infitite loop in format $format") if ($i > ($self->{'max_mfn'} \|\| 9999));
427
428	# FIXME remove for speedup?
429	$log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
430
431	if (utf8::is_utf8($format)) {
432	$format = $self->_x($format);
433	}
434
435	my $found = 0;
436
437	my $eval_code;
438	# remove eval{...} from beginning
439	$eval_code = $1 if ($format =~ s/^eval{([^}]+)}//s);
440
441	my $filter_name;
442	# remove filter{...} from beginning
443	$filter_name = $1 if ($format =~ s/^filter{([^}]+)}//s);
444
445	# do actual replacement of placeholders
446	# repeatable fields
447	$format =~ s/v(\d+)(?:\^(\w))?/$self->get_data(\$rec,$1,$2,$i,\$found)/ges;
448	# non-repeatable fields
449	$format =~ s/s(\d+)(?:\^(\w))?/$self->get_data(\$rec,$1,$2,0,\$found)/ges;
450
451	if ($found) {
452	$log->debug("format: $format");
453	if ($eval_code) {
454	my $eval = $self->fill_in($rec,$eval_code,$i);
455	return if (! $self->_eval($eval));
456	}
457	if ($filter_name && $self->{'filter'}->{$filter_name}) {
458	$log->debug("filter '$filter_name' for $format");
459	$format = $self->{'filter'}->{$filter_name}->($format);
460	return unless(defined($format));
461	$log->debug("filter result: $format");
462	}
463	# do we have lookups?
464	if ($self->{'lookup'}) {
465	if ($self->{'lookup'}->can('lookup')) {
466	my @lookup = $self->{lookup}->lookup($format);
467	$log->debug("lookup $format", join(", ", @lookup));
468	return @lookup;
469	} else {
470	$log->warn("Have lookup object but can't invoke lookup method");
471	}
472	} else {
473	return $format;
474	}
475	} else {
476	return;
477	}
478	}
479
480
481	=head2 fill_in_to_arr
482
483	Similar to C<fill_in>, but returns array of all repeatable fields. Usable
484	for fields which have lookups, so they shouldn't be parsed but rather
485	C<fill_id>ed.
486
487	my @arr = $webpac->fill_in_to_arr($rec,'[v900];;[v250^a]');
488
489	=cut
490
491	sub fill_in_to_arr {
492	my $self = shift;
493
494	my ($rec, $format_utf8) = @_;
495
496	my $log = $self->_get_logger();
497
498	$log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
499	return if (! $format_utf8);
500
501	my $i = 0;
502	my @arr;
503
504	while (my @v = $self->fill_in($rec,$format_utf8,$i++)) {
505	push @arr, @v;
506	}
507
508	$log->debug("format '$format_utf8' returned ",--$i," elements: ", sub { join(" \| ",@arr) }) if (@arr);
509
510	return @arr;
511	}
512
513
514	=head2 get_data
515
516	Returns value from record.
517
518	my $text = $self->get_data(\$rec,$f,$sf,$i,\$found);
519
520	Arguments are:
521	record reference C<$rec>,
522	field C<$f>,
523	optional subfiled C<$sf>,
524	index for repeatable values C<$i>.
525
526	Optinal variable C<$found> will be incremeted if there
527	is field.
528
529	Returns value or empty string.
530
531	=cut
532
533	sub get_data {
534	my $self = shift;
535
536	my ($rec,$f,$sf,$i,$found) = @_;
537
538	if ($$rec->{$f}) {
539	return '' if (! $$rec->{$f}->[$i]);
540	no strict 'refs';
541	if ($sf && $$rec->{$f}->[$i]->{$sf}) {
542	$$found++ if (defined($$found));
543	return $$rec->{$f}->[$i]->{$sf};
544	} elsif (! $sf && $$rec->{$f}->[$i]) {
545	$$found++ if (defined($$found));
546	# it still might have subfield, just
547	# not specified, so we'll dump all
548	if ($$rec->{$f}->[$i] =~ /HASH/o) {
549	my $out;
550	foreach my $k (keys %{$$rec->{$f}->[$i]}) {
551	$out .= $$rec->{$f}->[$i]->{$k}." ";
552	}
553	return $out;
554	} else {
555	return $$rec->{$f}->[$i];
556	}
557	} else {
558	return '';
559	}
560	} else {
561	return '';
562	}
563	}
564
565
566	=head2 apply_format
567
568	Apply format specified in tag with C<format_name="name"> and
569	C<format_delimiter=";;">.
570
571	my $text = $webpac->apply_format($format_name,$format_delimiter,$data);
572
573	Formats can contain C<lookup{...}> if you need them.
574
575	=cut
576
577	sub apply_format {
578	my $self = shift;
579
580	my ($name,$delimiter,$data) = @_;
581
582	my $log = $self->_get_logger();
583
584	if (! $self->{'import_xml'}->{'format'}->{$name}) {
585	$log->warn("<format name=\"$name\"> is not defined in ",$self->{'import_xml_file'});
586	return $data;
587	}
588
589	$log->warn("no delimiter for format $name") if (! $delimiter);
590
591	my $format = $self->_x($self->{'import_xml'}->{'format'}->{$name}->{'content'}) \|\| $log->logdie("can't find format '$name'");
592
593	my @data = split(/\Q$delimiter\E/, $data);
594
595	my $out = sprintf($format, @data);
596	$log->debug("using format $name [$format] on $data to produce: $out");
597
598	if ($self->{'lookup_regex'} && $out =~ $self->{'lookup_regex'}) {
599	return $self->{'lookup'}->lookup($out);
600	} else {
601	return $out;
602	}
603
604	}
605
606	=head2 sort_arr
607
608	Sort array ignoring case and html in data
609
610	my @sorted = $webpac->sort_arr(@unsorted);
611
612	=cut
613
614	sub sort_arr {
615	my $self = shift;
616
617	my $log = $self->_get_logger();
618
619	# FIXME add Schwartzian Transformation?
620
621	my @sorted = sort {
622	$a =~ s#<[^>]+/*>##;
623	$b =~ s#<[^>]+/*>##;
624	lc($b) cmp lc($a)
625	} @_;
626	$log->debug("sorted values: ",sub { join(", ",@sorted) });
627
628	return @sorted;
629	}
630
631
632	=head1 INTERNAL METHODS
633
634	=head2 _sort_by_order
635
636	Sort xml tags data structure accoding to C<order=""> attribute.
637
638	=cut
639
640	sub _sort_by_order {
641	my $self = shift;
642
643	my $va = $self->{'import_xml'}->{'indexer'}->{$a}->{'order'} \|\|
644	$self->{'import_xml'}->{'indexer'}->{$a};
645	my $vb = $self->{'import_xml'}->{'indexer'}->{$b}->{'order'} \|\|
646	$self->{'import_xml'}->{'indexer'}->{$b};
647
648	return $va <=> $vb;
649	}
650
651	=head2 _x
652
653	Convert strings from C<conf/normalize/*.xml> encoding into application
654	specific encoding (optinally specified using C<code_page> to C<new>
655	constructor).
656
657	my $text = $n->_x('normalize text string');
658
659	This is a stub so that other modules doesn't have to implement it.
660
661	=cut
662
663	sub _x {
664	my $self = shift;
665	return shift;
666	}
667
668
669	=head1 AUTHOR
670
671	Dobrica Pavlinusic, C<< <dpavlin@rot13.org> >>
672
673	=head1 COPYRIGHT & LICENSE
674
675	Copyright 2005 Dobrica Pavlinusic, All Rights Reserved.
676
677	This program is free software; you can redistribute it and/or modify it
678	under the same terms as Perl itself.
679
680	=cut
681
682	1; # End of WebPAC::Normalize