lib/WebPAC/Normalize.pm

package WebPAC::Normalize;

use warnings;
use strict;
use base 'WebPAC::Common';
use Data::Dumper;

=head1 NAME

WebPAC::Normalize - data mungling for normalisation

=head1 VERSION

Version 0.01

=cut

our $VERSION = '0.01';

=head1 SYNOPSIS

This package contains code that mungle data to produce normalized format.

It contains several assumptions:

=over

=item *

format of fields is defined using C<v123^a> notation for repeatable fields
or C<s123^a> for single (or first) value, where C<123> is field number and
C<a> is subfield.

=item *

source data records (C<$rec>) have unique identifiers in field C<000>

=item *

optional C<eval{length('v123^a') == 3}> tag at B<beginning of format> will be
perl code that is evaluated before producing output (value of field will be
interpolated before that)

=item *

optional C<filter{filter_name}> at B<begining of format> will apply perl
code defined as code ref on format after field substitution to producing
output

=item *

optional C<lookup{...}> will be then performed. See C<WebPAC::Lookups>.

=item *

at end, optional C<format>s rules are resolved. Format rules are similar to
C<sprintf> and can also contain C<lookup{...}> which is performed after
values are inserted in format.

=back

This also describes order in which transformations are applied (eval,
filter, lookup, format) which is important to undestand when deciding how to
solve your data mungling and normalisation process.


=head1 FUNCTIONS

=head2 new

Create new normalisation object

  my $n = new WebPAC::Normalize::Something(
        filter => {
                'filter_name_1' => sub {
                        # filter code
                        return length($_);
                }, ...
        },
        db => $db_obj,
        lookup_regex => $lookup->regex,
        lookup => $lookup_obj,
  );

Parametar C<filter> defines user supplied snippets of perl code which can
be use with C<filter{...}> notation.

Recommended parametar C<lookup_regex> is used to enable parsing of lookups
in structures. If you pass this parametar, you must also pass C<lookup>
which is C<WebPAC::Lookup> object.

=cut

sub new {
        my $class = shift;
        my $self = {@_};
        bless($self, $class);

        my $r = $self->{'lookup_regex'} ? 1 : 0;
        my $l = $self->{'lookup'} ? 1 : 0;

        my $log = $self->_get_logger();

        # those two must be in pair
        if ( ($r & $l) != ($r || $l) ) {
                my $log = $self->_get_logger();
                $log->logdie("lookup_regex and lookup must be in pair");
        }

        $log->logdie("lookup must be WebPAC::Lookup object") if ($self->{'lookup'} && ! $self->{'lookup'}->isa('WebPAC::Lookup'));

        $self ? return $self : return undef;
}


=head2 data_structure

Create in-memory data structure which represents normalized layout from
C<conf/normalize/*.xml>.

This structures are used to produce output.

 my @ds = $webpac->data_structure($rec);

B<Note: historical oddity follows>

This method will also set C<< $webpac->{'currnet_filename'} >> if there is
C<< <filename> >> tag and C<< $webpac->{'headline'} >> if there is
C<< <headline> >> tag.

=cut

sub data_structure {
        my $self = shift;

        my $log = $self->_get_logger();

        my $rec = shift;
        $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);

        my $cache_file;

        if ($self->{'db'}) {
                my @ds = $self->{'db'}->load_ds($rec);
                $log->debug("load_ds( rec = ", sub { Dumper($rec) }, ") = ", sub { Dumper(@ds) });
                return @ds if ($#ds > 0);
                $log->debug("cache miss, creating");
        }

        undef $self->{'currnet_filename'};
        undef $self->{'headline'};

        my @sorted_tags;
        if ($self->{tags_by_order}) {
                @sorted_tags = @{$self->{tags_by_order}};
        } else {
                @sorted_tags = sort { $self->_sort_by_order } keys %{$self->{'import_xml'}->{'indexer'}};
                $self->{tags_by_order} = \@sorted_tags;
        }

        my @ds;

        $log->debug("tags: ",sub { join(", ",@sorted_tags) });

        foreach my $field (@sorted_tags) {

                my $row;

#print "field $field [",$self->{'tag'},"] = ",Dumper($self->{'import_xml'}->{'indexer'}->{$field}->{$self->{'tag'}});

                foreach my $tag (@{$self->{'import_xml'}->{'indexer'}->{$field}->{$self->{'tag'}}}) {
                        my $format = $tag->{'value'} || $tag->{'content'};

                        $log->debug("format: $format");

                        my @v;
                        if ($self->{'lookup_regex'} && $format =~ $self->{'lookup_regex'}) {
                                @v = $self->fill_in_to_arr($rec,$format);
                        } else {
                                @v = $self->parse_to_arr($rec,$format);
                        }
                        next if (! @v);

                        if ($tag->{'sort'}) {
                                @v = $self->sort_arr(@v);
                        }

                        # use format?
                        if ($tag->{'format_name'}) {
                                @v = map { $self->apply_format($tag->{'format_name'},$tag->{'format_delimiter'},$_) } @v;
                        }

                        if ($field eq 'filename') {
                                $self->{'current_filename'} = join('',@v);
                                $log->debug("filename: ",$self->{'current_filename'});
                        } elsif ($field eq 'headline') {
                                $self->{'headline'} .= join('',@v);
                                $log->debug("headline: ",$self->{'headline'});
                                next; # don't return headline in data_structure!
                        }

                        # delimiter will join repeatable fields
                        if ($tag->{'delimiter'}) {
                                @v = ( join($tag->{'delimiter'}, @v) );
                        }

                        # default types 
                        my @types = qw(display swish);
                        # override by type attribute
                        @types = ( $tag->{'type'} ) if ($tag->{'type'});

                        foreach my $type (@types) {
                                # append to previous line?
                                $log->debug("type: $type ",sub { join(" ",@v) }, $row->{'append'} || 'no append');
                                if ($tag->{'append'}) {

                                        # I will delimit appended part with
                                        # delimiter (or ,)
                                        my $d = $tag->{'delimiter'};
                                        # default delimiter
                                        $d ||= " ";

                                        my $last = pop @{$row->{$type}};
                                        $d = "" if (! $last);
                                        $last .= $d . join($d, @v);
                                        push @{$row->{$type}}, $last;

                                } else {
                                        push @{$row->{$type}}, @v;
                                }
                        }


                }

                if ($row) {
                        $row->{'tag'} = $field;

                        # TODO: name_sigular, name_plural
                        my $name = $self->{'import_xml'}->{'indexer'}->{$field}->{'name'};
                        $row->{'name'} = $name ? $self->_x($name) : $field;

                        # post-sort all values in field
                        if ($self->{'import_xml'}->{'indexer'}->{$field}->{'sort'}) {
                                $log->warn("sort at field tag not implemented");
                        }

                        push @ds, $row;

                        $log->debug("row $field: ",sub { Dumper($row) });
                }

        }

        $self->{'db'}->save_ds(
                ds => \@ds,
                current_filename => $self->{'current_filename'},
                headline => $self->{'headline'},
        ) if ($self->{'db'});

        $log->debug("ds: ", sub { Dumper(@ds) });

        return @ds;

}

=head2 parse

Perform smart parsing of string, skipping delimiters for fields which aren't
defined. It can also eval code in format starting with C<eval{...}> and
return output or nothing depending on eval code.

 my $text = $webpac->parse($rec,'eval{"v901^a" eq "Deskriptor"}descriptor: v250^a', $i);

=cut

sub parse {
        my $self = shift;

        my ($rec, $format_utf8, $i) = @_;

        return if (! $format_utf8);

        my $log = $self->_get_logger();

        $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);

        $i = 0 if (! $i);

        my $format = $self->_x($format_utf8) || $log->logconfess("can't convert '$format_utf8' from UTF-8 to ",$self->{'code_page'});

        my @out;

        $log->debug("format: $format");

        my $eval_code;
        # remove eval{...} from beginning
        $eval_code = $1 if ($format =~ s/^eval{([^}]+)}//s);

        my $filter_name;
        # remove filter{...} from beginning
        $filter_name = $1 if ($format =~ s/^filter{([^}]+)}//s);

        my $prefix;
        my $all_found=0;

        while ($format =~ s/^(.*?)(v|s)(\d+)(?:\^(\w))?//s) {

                my $del = $1 || '';
                $prefix ||= $del if ($all_found == 0);

                # repeatable index
                my $r = $i;
                $r = 0 if (lc("$2") eq 's');

                my $found = 0;
                my $tmp = $self->get_data(\$rec,$3,$4,$r,\$found);

                if ($found) {
                        push @out, $del;
                        push @out, $tmp;
                        $all_found += $found;
                }
        }

        return if (! $all_found);

        my $out = join('',@out);

        if ($out) {
                # add rest of format (suffix)
                $out .= $format;

                # add prefix if not there
                $out = $prefix . $out if ($out !~ m/^\Q$prefix\E/);

                $log->debug("result: $out");
        }

        if ($eval_code) {
                my $eval = $self->fill_in($rec,$eval_code,$i) || return;
                $log->debug("about to eval{$eval} format: $out");
                return if (! $self->_eval($eval));
        }
        
        if ($filter_name && $self->{'filter'}->{$filter_name}) {
                $log->debug("about to filter{$filter_name} format: $out");
                $out = $self->{'filter'}->{$filter_name}->($out);
                return unless(defined($out));
                $log->debug("filter result: $out");
        }

        return $out;
}

=head2 parse_to_arr

Similar to C<parse>, but returns array of all repeatable fields

 my @arr = $webpac->parse_to_arr($rec,'v250^a');

=cut

sub parse_to_arr {
        my $self = shift;

        my ($rec, $format_utf8) = @_;

        my $log = $self->_get_logger();

        $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
        return if (! $format_utf8);

        my $i = 0;
        my @arr;

        while (my $v = $self->parse($rec,$format_utf8,$i++)) {
                push @arr, $v;
        }

        $log->debug("format '$format_utf8' returned ",--$i," elements: ", sub { join(" | ",@arr) }) if (@arr);

        return @arr;
}


=head2 fill_in

Workhourse of all: takes record from in-memory structure of database and
strings with placeholders and returns string or array of with substituted
values from record.

 my $text = $webpac->fill_in($rec,'v250^a');

Optional argument is ordinal number for repeatable fields. By default,
it's assume to be first repeatable field (fields are perl array, so first
element is 0).
Following example will read second value from repeatable field.

 my $text = $webpac->fill_in($rec,'Title: v250^a',1);

This function B<does not> perform parsing of format to inteligenty skip
delimiters before fields which aren't used.

This method will automatically decode UTF-8 string to local code page
if needed.

=cut

sub fill_in {
        my $self = shift;

        my $log = $self->_get_logger();

        my $rec = shift || $log->logconfess("need data record");
        my $format = shift || $log->logconfess("need format to parse");
        # iteration (for repeatable fields)
        my $i = shift || 0;

        $log->logdie("infitite loop in format $format") if ($i > ($self->{'max_mfn'} || 9999));

        # FIXME remove for speedup?
        $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);

        if (utf8::is_utf8($format)) {
                $format = $self->_x($format);
        }

        my $found = 0;

        my $eval_code;
        # remove eval{...} from beginning
        $eval_code = $1 if ($format =~ s/^eval{([^}]+)}//s);

        my $filter_name;
        # remove filter{...} from beginning
        $filter_name = $1 if ($format =~ s/^filter{([^}]+)}//s);

        # do actual replacement of placeholders
        # repeatable fields
        $format =~ s/v(\d+)(?:\^(\w))?/$self->get_data(\$rec,$1,$2,$i,\$found)/ges;
        # non-repeatable fields
        $format =~ s/s(\d+)(?:\^(\w))?/$self->get_data(\$rec,$1,$2,0,\$found)/ges;

        if ($found) {
                $log->debug("format: $format");
                if ($eval_code) {
                        my $eval = $self->fill_in($rec,$eval_code,$i);
                        return if (! $self->_eval($eval));
                }
                if ($filter_name && $self->{'filter'}->{$filter_name}) {
                        $log->debug("filter '$filter_name' for $format");
                        $format = $self->{'filter'}->{$filter_name}->($format);
                        return unless(defined($format));
                        $log->debug("filter result: $format");
                }
                # do we have lookups?
                if ($self->{'lookup'}) {
                        if ($self->{'lookup'}->can('lookup')) {
                                return $self->{'lookup'}->lookup($format);
                        } else {
                                $log->warn("Have lookup object but can't invoke lookup method");
                        }
                } else {
                        return $format;
                }
        } else {
                return;
        }
}


=head2 fill_in_to_arr

Similar to C<fill_in>, but returns array of all repeatable fields. Usable
for fields which have lookups, so they shouldn't be parsed but rather
C<fill_id>ed.

 my @arr = $webpac->fill_in_to_arr($rec,'[v900];;[v250^a]');

=cut

sub fill_in_to_arr {
        my $self = shift;

        my ($rec, $format_utf8) = @_;

        my $log = $self->_get_logger();

        $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
        return if (! $format_utf8);

        my $i = 0;
        my @arr;

        while (my @v = $self->fill_in($rec,$format_utf8,$i++)) {
                push @arr, @v;
        }

        $log->debug("format '$format_utf8' returned ",--$i," elements: ", sub { join(" | ",@arr) }) if (@arr);

        return @arr;
}


=head2 get_data

Returns value from record.

 my $text = $self->get_data(\$rec,$f,$sf,$i,\$found);

Arguments are:
record reference C<$rec>,
field C<$f>,
optional subfiled C<$sf>,
index for repeatable values C<$i>.

Optinal variable C<$found> will be incremeted if there
is field.

Returns value or empty string.

=cut

sub get_data {
        my $self = shift;

        my ($rec,$f,$sf,$i,$found) = @_;

        if ($$rec->{$f}) {
                return '' if (! $$rec->{$f}->[$i]);
                no strict 'refs';
                if ($sf && $$rec->{$f}->[$i]->{$sf}) {
                        $$found++ if (defined($$found));
                        return $$rec->{$f}->[$i]->{$sf};
                } elsif ($$rec->{$f}->[$i]) {
                        $$found++ if (defined($$found));
                        # it still might have subfield, just
                        # not specified, so we'll dump all
                        if ($$rec->{$f}->[$i] =~ /HASH/o) {
                                my $out;
                                foreach my $k (keys %{$$rec->{$f}->[$i]}) {
                                        $out .= $$rec->{$f}->[$i]->{$k}." ";
                                }
                                return $out;
                        } else {
                                return $$rec->{$f}->[$i];
                        }
                }
        } else {
                return '';
        }
}


=head2 apply_format

Apply format specified in tag with C<format_name="name"> and
C<format_delimiter=";;">.

 my $text = $webpac->apply_format($format_name,$format_delimiter,$data);

Formats can contain C<lookup{...}> if you need them.

=cut

sub apply_format {
        my $self = shift;

        my ($name,$delimiter,$data) = @_;

        my $log = $self->_get_logger();

        if (! $self->{'import_xml'}->{'format'}->{$name}) {
                $log->warn("<format name=\"$name\"> is not defined in ",$self->{'import_xml_file'});
                return $data;
        }

        $log->warn("no delimiter for format $name") if (! $delimiter);

        my $format = $self->_x($self->{'import_xml'}->{'format'}->{$name}->{'content'}) || $log->logdie("can't find format '$name'");

        my @data = split(/\Q$delimiter\E/, $data);

        my $out = sprintf($format, @data);
        $log->debug("using format $name [$format] on $data to produce: $out");

        if ($self->{'lookup_regex'} && $out =~ $self->{'lookup_regex'}) {
                return $self->{'lookup'}->lookup($out);
        } else {
                return $out;
        }

}

=head2 sort_arr

Sort array ignoring case and html in data

 my @sorted = $webpac->sort_arr(@unsorted);

=cut

sub sort_arr {
        my $self = shift;

        my $log = $self->_get_logger();

        # FIXME add Schwartzian Transformation?

        my @sorted = sort {
                $a =~ s#<[^>]+/*>##;
                $b =~ s#<[^>]+/*>##;
                lc($b) cmp lc($a)
        } @_;
        $log->debug("sorted values: ",sub { join(", ",@sorted) });

        return @sorted;
}


=head1 INTERNAL METHODS

=head2 _sort_by_order

Sort xml tags data structure accoding to C<order=""> attribute.

=cut

sub _sort_by_order {
        my $self = shift;

        my $va = $self->{'import_xml'}->{'indexer'}->{$a}->{'order'} ||
                $self->{'import_xml'}->{'indexer'}->{$a};
        my $vb = $self->{'import_xml'}->{'indexer'}->{$b}->{'order'} ||
                $self->{'import_xml'}->{'indexer'}->{$b};

        return $va <=> $vb;
}

=head2 _x

Convert strings from C<conf/normalize/*.xml> encoding into application
specific encoding (optinally specified using C<code_page> to C<new>
constructor).

 my $text = $n->_x('normalize text string');

This is a stub so that other modules doesn't have to implement it.

=cut

sub _x {
        my $self = shift;
        return shift;
}


=head1 AUTHOR

Dobrica Pavlinusic, C<< <dpavlin@rot13.org> >>

=head1 COPYRIGHT & LICENSE

Copyright 2005 Dobrica Pavlinusic, All Rights Reserved.

This program is free software; you can redistribute it and/or modify it
under the same terms as Perl itself.

=cut

1; # End of WebPAC::DB
1	dpavlin	10	package WebPAC::Normalize;
2
3			use warnings;
4			use strict;
5	dpavlin	29	use base 'WebPAC::Common';
6	dpavlin	13	use Data::Dumper;
7	dpavlin	10
8			=head1 NAME
9
10	dpavlin	15	WebPAC::Normalize - data mungling for normalisation
11	dpavlin	10
12			=head1 VERSION
13
14			Version 0.01
15
16			=cut
17
18			our $VERSION = '0.01';
19
20			=head1 SYNOPSIS
21
22	dpavlin	15	This package contains code that mungle data to produce normalized format.
23	dpavlin	10
24	dpavlin	15	It contains several assumptions:
25
26			=over
27
28			=item *
29
30			format of fields is defined using C<v123^a> notation for repeatable fields
31			or C<s123^a> for single (or first) value, where C<123> is field number and
32			C<a> is subfield.
33
34			=item *
35
36			source data records (C<$rec>) have unique identifiers in field C<000>
37
38			=item *
39
40			optional C<eval{length('v123^a') == 3}> tag at B<beginning of format> will be
41			perl code that is evaluated before producing output (value of field will be
42			interpolated before that)
43
44			=item *
45
46			optional C<filter{filter_name}> at B<begining of format> will apply perl
47			code defined as code ref on format after field substitution to producing
48			output
49
50			=item *
51
52			optional C<lookup{...}> will be then performed. See C<WebPAC::Lookups>.
53
54			=item *
55
56			at end, optional C<format>s rules are resolved. Format rules are similar to
57			C<sprintf> and can also contain C<lookup{...}> which is performed after
58			values are inserted in format.
59
60			=back
61
62			This also describes order in which transformations are applied (eval,
63			filter, lookup, format) which is important to undestand when deciding how to
64			solve your data mungling and normalisation process.
65
66
67
68
69	dpavlin	10	=head1 FUNCTIONS
70
71	dpavlin	13	=head2 new
72	dpavlin	10
73	dpavlin	13	Create new normalisation object
74
75			my $n = new WebPAC::Normalize::Something(
76	dpavlin	15	filter => {
77			'filter_name_1' => sub {
78			# filter code
79			return length($_);
80			}, ...
81			},
82	dpavlin	29	db => $db_obj,
83	dpavlin	13	lookup_regex => $lookup->regex,
84	dpavlin	31	lookup => $lookup_obj,
85	dpavlin	13	);
86
87	dpavlin	15	Parametar C<filter> defines user supplied snippets of perl code which can
88			be use with C<filter{...}> notation.
89
90	dpavlin	13	Recommended parametar C<lookup_regex> is used to enable parsing of lookups
91	dpavlin	31	in structures. If you pass this parametar, you must also pass C<lookup>
92			which is C<WebPAC::Lookup> object.
93	dpavlin	13
94	dpavlin	10	=cut
95
96	dpavlin	13	sub new {
97			my $class = shift;
98			my $self = {@_};
99			bless($self, $class);
100
101	dpavlin	31	my $r = $self->{'lookup_regex'} ? 1 : 0;
102			my $l = $self->{'lookup'} ? 1 : 0;
103
104			my $log = $self->_get_logger();
105
106			# those two must be in pair
107			if ( ($r & $l) != ($r \|\| $l) ) {
108			my $log = $self->_get_logger();
109			$log->logdie("lookup_regex and lookup must be in pair");
110			}
111
112			$log->logdie("lookup must be WebPAC::Lookup object") if ($self->{'lookup'} && ! $self->{'lookup'}->isa('WebPAC::Lookup'));
113
114	dpavlin	13	$self ? return $self : return undef;
115	dpavlin	10	}
116
117	dpavlin	13
118			=head2 data_structure
119
120			Create in-memory data structure which represents normalized layout from
121			C<conf/normalize/*.xml>.
122
123			This structures are used to produce output.
124
125			my @ds = $webpac->data_structure($rec);
126
127			B<Note: historical oddity follows>
128
129			This method will also set C<< $webpac->{'currnet_filename'} >> if there is
130			C<< <filename> >> tag and C<< $webpac->{'headline'} >> if there is
131			C<< <headline> >> tag.
132
133			=cut
134
135			sub data_structure {
136			my $self = shift;
137
138			my $log = $self->_get_logger();
139
140			my $rec = shift;
141			$log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
142
143			my $cache_file;
144
145	dpavlin	18	if ($self->{'db'}) {
146	dpavlin	22	my @ds = $self->{'db'}->load_ds($rec);
147	dpavlin	29	$log->debug("load_ds( rec = ", sub { Dumper($rec) }, ") = ", sub { Dumper(@ds) });
148			return @ds if ($#ds > 0);
149			$log->debug("cache miss, creating");
150	dpavlin	13	}
151
152			undef $self->{'currnet_filename'};
153			undef $self->{'headline'};
154
155			my @sorted_tags;
156			if ($self->{tags_by_order}) {
157			@sorted_tags = @{$self->{tags_by_order}};
158			} else {
159			@sorted_tags = sort { $self->_sort_by_order } keys %{$self->{'import_xml'}->{'indexer'}};
160			$self->{tags_by_order} = \@sorted_tags;
161			}
162
163			my @ds;
164
165			$log->debug("tags: ",sub { join(", ",@sorted_tags) });
166
167			foreach my $field (@sorted_tags) {
168
169			my $row;
170
171			#print "field $field [",$self->{'tag'},"] = ",Dumper($self->{'import_xml'}->{'indexer'}->{$field}->{$self->{'tag'}});
172
173			foreach my $tag (@{$self->{'import_xml'}->{'indexer'}->{$field}->{$self->{'tag'}}}) {
174			my $format = $tag->{'value'} \|\| $tag->{'content'};
175
176			$log->debug("format: $format");
177
178			my @v;
179			if ($self->{'lookup_regex'} && $format =~ $self->{'lookup_regex'}) {
180			@v = $self->fill_in_to_arr($rec,$format);
181			} else {
182			@v = $self->parse_to_arr($rec,$format);
183			}
184			next if (! @v);
185
186			if ($tag->{'sort'}) {
187			@v = $self->sort_arr(@v);
188			}
189
190			# use format?
191			if ($tag->{'format_name'}) {
192			@v = map { $self->apply_format($tag->{'format_name'},$tag->{'format_delimiter'},$_) } @v;
193			}
194
195			if ($field eq 'filename') {
196			$self->{'current_filename'} = join('',@v);
197			$log->debug("filename: ",$self->{'current_filename'});
198			} elsif ($field eq 'headline') {
199			$self->{'headline'} .= join('',@v);
200			$log->debug("headline: ",$self->{'headline'});
201			next; # don't return headline in data_structure!
202			}
203
204			# delimiter will join repeatable fields
205			if ($tag->{'delimiter'}) {
206			@v = ( join($tag->{'delimiter'}, @v) );
207			}
208
209			# default types
210			my @types = qw(display swish);
211			# override by type attribute
212			@types = ( $tag->{'type'} ) if ($tag->{'type'});
213
214			foreach my $type (@types) {
215			# append to previous line?
216			$log->debug("type: $type ",sub { join(" ",@v) }, $row->{'append'} \|\| 'no append');
217			if ($tag->{'append'}) {
218
219			# I will delimit appended part with
220			# delimiter (or ,)
221			my $d = $tag->{'delimiter'};
222			# default delimiter
223			$d \|\|= " ";
224
225			my $last = pop @{$row->{$type}};
226			$d = "" if (! $last);
227			$last .= $d . join($d, @v);
228			push @{$row->{$type}}, $last;
229
230			} else {
231			push @{$row->{$type}}, @v;
232			}
233			}
234
235
236			}
237
238			if ($row) {
239			$row->{'tag'} = $field;
240
241			# TODO: name_sigular, name_plural
242			my $name = $self->{'import_xml'}->{'indexer'}->{$field}->{'name'};
243			$row->{'name'} = $name ? $self->_x($name) : $field;
244
245			# post-sort all values in field
246			if ($self->{'import_xml'}->{'indexer'}->{$field}->{'sort'}) {
247			$log->warn("sort at field tag not implemented");
248			}
249
250			push @ds, $row;
251
252			$log->debug("row $field: ",sub { Dumper($row) });
253			}
254
255			}
256
257	dpavlin	22	$self->{'db'}->save_ds(
258	dpavlin	18	ds => \@ds,
259			current_filename => $self->{'current_filename'},
260			headline => $self->{'headline'},
261			) if ($self->{'db'});
262	dpavlin	13
263	dpavlin	29	$log->debug("ds: ", sub { Dumper(@ds) });
264
265	dpavlin	13	return @ds;
266
267			}
268
269			=head2 parse
270
271			Perform smart parsing of string, skipping delimiters for fields which aren't
272			defined. It can also eval code in format starting with C<eval{...}> and
273			return output or nothing depending on eval code.
274
275			my $text = $webpac->parse($rec,'eval{"v901^a" eq "Deskriptor"}descriptor: v250^a', $i);
276
277			=cut
278
279			sub parse {
280			my $self = shift;
281
282			my ($rec, $format_utf8, $i) = @_;
283
284			return if (! $format_utf8);
285
286			my $log = $self->_get_logger();
287
288			$log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
289
290			$i = 0 if (! $i);
291
292			my $format = $self->_x($format_utf8) \|\| $log->logconfess("can't convert '$format_utf8' from UTF-8 to ",$self->{'code_page'});
293
294			my @out;
295
296			$log->debug("format: $format");
297
298			my $eval_code;
299			# remove eval{...} from beginning
300			$eval_code = $1 if ($format =~ s/^eval{([^}]+)}//s);
301
302			my $filter_name;
303			# remove filter{...} from beginning
304			$filter_name = $1 if ($format =~ s/^filter{([^}]+)}//s);
305
306			my $prefix;
307			my $all_found=0;
308
309			while ($format =~ s/^(.*?)(v\|s)(\d+)(?:\^(\w))?//s) {
310
311			my $del = $1 \|\| '';
312			$prefix \|\|= $del if ($all_found == 0);
313
314			# repeatable index
315			my $r = $i;
316			$r = 0 if (lc("$2") eq 's');
317
318			my $found = 0;
319			my $tmp = $self->get_data(\$rec,$3,$4,$r,\$found);
320
321			if ($found) {
322			push @out, $del;
323			push @out, $tmp;
324			$all_found += $found;
325			}
326			}
327
328			return if (! $all_found);
329
330			my $out = join('',@out);
331
332			if ($out) {
333			# add rest of format (suffix)
334			$out .= $format;
335
336			# add prefix if not there
337			$out = $prefix . $out if ($out !~ m/^\Q$prefix\E/);
338
339			$log->debug("result: $out");
340			}
341
342			if ($eval_code) {
343			my $eval = $self->fill_in($rec,$eval_code,$i) \|\| return;
344			$log->debug("about to eval{$eval} format: $out");
345			return if (! $self->_eval($eval));
346			}
347
348			if ($filter_name && $self->{'filter'}->{$filter_name}) {
349			$log->debug("about to filter{$filter_name} format: $out");
350			$out = $self->{'filter'}->{$filter_name}->($out);
351			return unless(defined($out));
352			$log->debug("filter result: $out");
353			}
354
355			return $out;
356			}
357
358			=head2 parse_to_arr
359
360			Similar to C<parse>, but returns array of all repeatable fields
361
362			my @arr = $webpac->parse_to_arr($rec,'v250^a');
363
364			=cut
365
366			sub parse_to_arr {
367			my $self = shift;
368
369			my ($rec, $format_utf8) = @_;
370
371			my $log = $self->_get_logger();
372
373			$log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
374			return if (! $format_utf8);
375
376			my $i = 0;
377			my @arr;
378
379			while (my $v = $self->parse($rec,$format_utf8,$i++)) {
380			push @arr, $v;
381			}
382
383			$log->debug("format '$format_utf8' returned ",--$i," elements: ", sub { join(" \| ",@arr) }) if (@arr);
384
385			return @arr;
386			}
387
388	dpavlin	15
389			=head2 fill_in
390
391			Workhourse of all: takes record from in-memory structure of database and
392			strings with placeholders and returns string or array of with substituted
393			values from record.
394
395			my $text = $webpac->fill_in($rec,'v250^a');
396
397			Optional argument is ordinal number for repeatable fields. By default,
398			it's assume to be first repeatable field (fields are perl array, so first
399			element is 0).
400			Following example will read second value from repeatable field.
401
402			my $text = $webpac->fill_in($rec,'Title: v250^a',1);
403
404			This function B<does not> perform parsing of format to inteligenty skip
405			delimiters before fields which aren't used.
406
407			This method will automatically decode UTF-8 string to local code page
408			if needed.
409
410			=cut
411
412			sub fill_in {
413			my $self = shift;
414
415			my $log = $self->_get_logger();
416
417			my $rec = shift \|\| $log->logconfess("need data record");
418			my $format = shift \|\| $log->logconfess("need format to parse");
419			# iteration (for repeatable fields)
420			my $i = shift \|\| 0;
421
422			$log->logdie("infitite loop in format $format") if ($i > ($self->{'max_mfn'} \|\| 9999));
423
424			# FIXME remove for speedup?
425			$log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
426
427			if (utf8::is_utf8($format)) {
428			$format = $self->_x($format);
429			}
430
431			my $found = 0;
432
433			my $eval_code;
434			# remove eval{...} from beginning
435			$eval_code = $1 if ($format =~ s/^eval{([^}]+)}//s);
436
437			my $filter_name;
438			# remove filter{...} from beginning
439			$filter_name = $1 if ($format =~ s/^filter{([^}]+)}//s);
440
441			# do actual replacement of placeholders
442			# repeatable fields
443			$format =~ s/v(\d+)(?:\^(\w))?/$self->get_data(\$rec,$1,$2,$i,\$found)/ges;
444			# non-repeatable fields
445			$format =~ s/s(\d+)(?:\^(\w))?/$self->get_data(\$rec,$1,$2,0,\$found)/ges;
446
447			if ($found) {
448			$log->debug("format: $format");
449			if ($eval_code) {
450			my $eval = $self->fill_in($rec,$eval_code,$i);
451			return if (! $self->_eval($eval));
452			}
453			if ($filter_name && $self->{'filter'}->{$filter_name}) {
454			$log->debug("filter '$filter_name' for $format");
455			$format = $self->{'filter'}->{$filter_name}->($format);
456			return unless(defined($format));
457			$log->debug("filter result: $format");
458			}
459			# do we have lookups?
460			if ($self->{'lookup'}) {
461	dpavlin	31	if ($self->{'lookup'}->can('lookup')) {
462			return $self->{'lookup'}->lookup($format);
463			} else {
464			$log->warn("Have lookup object but can't invoke lookup method");
465			}
466	dpavlin	15	} else {
467			return $format;
468			}
469			} else {
470			return;
471			}
472			}
473
474
475	dpavlin	13	=head2 fill_in_to_arr
476
477			Similar to C<fill_in>, but returns array of all repeatable fields. Usable
478			for fields which have lookups, so they shouldn't be parsed but rather
479			C<fill_id>ed.
480
481			my @arr = $webpac->fill_in_to_arr($rec,'[v900];;[v250^a]');
482
483			=cut
484
485			sub fill_in_to_arr {
486			my $self = shift;
487
488			my ($rec, $format_utf8) = @_;
489
490			my $log = $self->_get_logger();
491
492			$log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
493			return if (! $format_utf8);
494
495			my $i = 0;
496			my @arr;
497
498			while (my @v = $self->fill_in($rec,$format_utf8,$i++)) {
499			push @arr, @v;
500			}
501
502			$log->debug("format '$format_utf8' returned ",--$i," elements: ", sub { join(" \| ",@arr) }) if (@arr);
503
504			return @arr;
505			}
506
507	dpavlin	15
508			=head2 get_data
509
510			Returns value from record.
511
512			my $text = $self->get_data(\$rec,$f,$sf,$i,\$found);
513
514			Arguments are:
515			record reference C<$rec>,
516			field C<$f>,
517			optional subfiled C<$sf>,
518			index for repeatable values C<$i>.
519
520			Optinal variable C<$found> will be incremeted if there
521			is field.
522
523			Returns value or empty string.
524
525			=cut
526
527			sub get_data {
528			my $self = shift;
529
530			my ($rec,$f,$sf,$i,$found) = @_;
531
532			if ($$rec->{$f}) {
533			return '' if (! $$rec->{$f}->[$i]);
534			no strict 'refs';
535			if ($sf && $$rec->{$f}->[$i]->{$sf}) {
536			$$found++ if (defined($$found));
537			return $$rec->{$f}->[$i]->{$sf};
538			} elsif ($$rec->{$f}->[$i]) {
539			$$found++ if (defined($$found));
540			# it still might have subfield, just
541			# not specified, so we'll dump all
542			if ($$rec->{$f}->[$i] =~ /HASH/o) {
543			my $out;
544			foreach my $k (keys %{$$rec->{$f}->[$i]}) {
545			$out .= $$rec->{$f}->[$i]->{$k}." ";
546			}
547			return $out;
548			} else {
549			return $$rec->{$f}->[$i];
550			}
551			}
552			} else {
553			return '';
554			}
555			}
556
557
558			=head2 apply_format
559
560			Apply format specified in tag with C<format_name="name"> and
561			C<format_delimiter=";;">.
562
563			my $text = $webpac->apply_format($format_name,$format_delimiter,$data);
564
565			Formats can contain C<lookup{...}> if you need them.
566
567			=cut
568
569			sub apply_format {
570			my $self = shift;
571
572			my ($name,$delimiter,$data) = @_;
573
574			my $log = $self->_get_logger();
575
576			if (! $self->{'import_xml'}->{'format'}->{$name}) {
577			$log->warn("<format name=\"$name\"> is not defined in ",$self->{'import_xml_file'});
578			return $data;
579			}
580
581			$log->warn("no delimiter for format $name") if (! $delimiter);
582
583			my $format = $self->_x($self->{'import_xml'}->{'format'}->{$name}->{'content'}) \|\| $log->logdie("can't find format '$name'");
584
585			my @data = split(/\Q$delimiter\E/, $data);
586
587			my $out = sprintf($format, @data);
588			$log->debug("using format $name [$format] on $data to produce: $out");
589
590			if ($self->{'lookup_regex'} && $out =~ $self->{'lookup_regex'}) {
591	dpavlin	31	return $self->{'lookup'}->lookup($out);
592	dpavlin	15	} else {
593			return $out;
594			}
595
596			}
597
598	dpavlin	13	=head2 sort_arr
599
600			Sort array ignoring case and html in data
601
602			my @sorted = $webpac->sort_arr(@unsorted);
603
604			=cut
605
606			sub sort_arr {
607			my $self = shift;
608
609			my $log = $self->_get_logger();
610
611			# FIXME add Schwartzian Transformation?
612
613			my @sorted = sort {
614			$a =~ s#<[^>]+/*>##;
615			$b =~ s#<[^>]+/*>##;
616			lc($b) cmp lc($a)
617			} @_;
618			$log->debug("sorted values: ",sub { join(", ",@sorted) });
619
620			return @sorted;
621			}
622
623
624	dpavlin	15	=head1 INTERNAL METHODS
625
626	dpavlin	13	=head2 _sort_by_order
627
628			Sort xml tags data structure accoding to C<order=""> attribute.
629
630			=cut
631
632			sub _sort_by_order {
633			my $self = shift;
634
635			my $va = $self->{'import_xml'}->{'indexer'}->{$a}->{'order'} \|\|
636			$self->{'import_xml'}->{'indexer'}->{$a};
637			my $vb = $self->{'import_xml'}->{'indexer'}->{$b}->{'order'} \|\|
638			$self->{'import_xml'}->{'indexer'}->{$b};
639
640			return $va <=> $vb;
641			}
642
643			=head2 _x
644
645	dpavlin	15	Convert strings from C<conf/normalize/*.xml> encoding into application
646			specific encoding (optinally specified using C<code_page> to C<new>
647			constructor).
648	dpavlin	13
649			my $text = $n->_x('normalize text string');
650
651			This is a stub so that other modules doesn't have to implement it.
652
653			=cut
654
655			sub _x {
656			my $self = shift;
657			return shift;
658			}
659
660
661	dpavlin	10	=head1 AUTHOR
662
663			Dobrica Pavlinusic, C<< <dpavlin@rot13.org> >>
664
665			=head1 COPYRIGHT & LICENSE
666
667			Copyright 2005 Dobrica Pavlinusic, All Rights Reserved.
668
669			This program is free software; you can redistribute it and/or modify it
670			under the same terms as Perl itself.
671
672			=cut
673
674			1; # End of WebPAC::DB