# lib/WebPAC/Normalize.pm

package WebPAC::Normalize;

use warnings;
use strict;
use base 'WebPAC::Common';
use Data::Dumper;

=head1 NAME

WebPAC::Normalize - data munging for normalisation

=head1 VERSION

Version 0.01

=cut

our $VERSION = '0.01';

=head1 SYNOPSIS

This package contains code that munges data to produce a normalized format.

It contains several assumptions:

=over

=item *

format of fields is defined using C<v123^a> notation for repeatable fields
or C<s123^a> for single (or first) value, where C<123> is field number and
C<a> is subfield.

=item *

source data records (C<$rec>) have unique identifiers in field C<000>

=item *

optional C<eval{length('v123^a') == 3}> tag at the B<beginning of format> is
perl code which is evaluated before producing output (field values are
interpolated into it before that)

=item *

optional C<filter{filter_name}> at the B<beginning of format> will apply perl
code, defined as a code ref, to the format after field substitution to
produce output

=item *

optional C<lookup{...}> will then be performed. See C<WebPAC::Lookup>.

=item *

at the end, optional C<format> rules are resolved. Format rules are similar to
C<sprintf> and can also contain C<lookup{...}> which is performed after
values are inserted into the format.

=back

This also describes the order in which transformations are applied (eval,
filter, lookup, format), which is important to understand when deciding how to
solve your data munging and normalisation process.


=head1 FUNCTIONS

=head2 new

Create new normalisation object

  my $n = new WebPAC::Normalize::Something(
        filter => {
                'filter_name_1' => sub {
                        # filter code
                        return length($_);
                }, ...
        },
        db => $db_obj,
        lookup_regex => $lookup->regex,
        lookup => $lookup_obj,
  );

Parameter C<filter> defines user supplied snippets of perl code which can
be used with C<filter{...}> notation.

Recommended parameter C<lookup_regex> is used to enable parsing of lookups
in structures. If you pass this parameter, you must also pass C<lookup>
which is a C<WebPAC::Lookup> object.

=cut

sub new {
        my $class = shift;
        my $self = {@_};
        bless($self, $class);

        my $log = $self->_get_logger();

        # lookup_regex and lookup are only useful together: both have to be
        # supplied, or neither of them
        my $have_regex  = $self->{'lookup_regex'} ? 1 : 0;
        my $have_lookup = $self->{'lookup'} ? 1 : 0;

        $log->logdie("lookup_regex and lookup must be in pair") if ($have_regex != $have_lookup);

        if ($have_lookup) {
                # eval guards against unblessed values, on which calling isa
                # would die with an unrelated "unblessed reference" error
                # instead of the message below
                my $is_lookup = do {
                        local $@;
                        eval { $self->{'lookup'}->isa('WebPAC::Lookup') };
                };
                $log->logdie("lookup must be WebPAC::Lookup object") if (! $is_lookup);
        }

        # $self is a blessed reference at this point, so it is always true
        return $self;
}


=head2 data_structure

Create in-memory data structure which represents normalized layout from
C<conf/normalize/*.xml>.

These structures are used to produce output.

 my @ds = $webpac->data_structure($rec);

B<Note: historical oddity follows>

This method will also set C<< $webpac->{'current_filename'} >> if there is
C<< <filename> >> tag and C<< $webpac->{'headline'} >> if there is
C<< <headline> >> tag.

=cut

sub data_structure {
        my $self = shift;

        my $log = $self->_get_logger();

        my $rec = shift;
        $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);

        # if a db object was supplied, ask it for an already built structure
        if ($self->{'db'}) {
                my @ds = $self->{'db'}->load_ds($rec);
                $log->debug("load_ds( rec = ", sub { Dumper($rec) }, ") = ", sub { Dumper(@ds) });
                # NOTE(review): a loaded structure holding exactly one row is
                # treated as a miss here ($#ds > 0 needs at least two rows);
                # confirm against what load_ds returns when nothing is stored
                return @ds if ($#ds > 0);
                $log->debug("cache miss, creating");
        }

        # forget values left over from the previously processed record, so
        # that the current_filename check at the end applies to this record
        undef $self->{'current_filename'};
        undef $self->{'headline'};

        # tags sorted by order are computed once and reused for later records
        my @sorted_tags;
        if ($self->{tags_by_order}) {
                @sorted_tags = @{$self->{tags_by_order}};
        } else {
                @sorted_tags = sort { $self->_sort_by_order } keys %{$self->{'import_xml'}->{'indexer'}};
                $self->{tags_by_order} = \@sorted_tags;
        }

        my @ds;

        $log->debug("tags: ",sub { join(", ",@sorted_tags) });

        foreach my $field (@sorted_tags) {

                # collects values of this field, keyed by output type
                my $row;

#print "field $field [",$self->{'tag'},"] = ",Dumper($self->{'import_xml'}->{'indexer'}->{$field}->{$self->{'tag'}});

                foreach my $tag (@{$self->{'import_xml'}->{'indexer'}->{$field}->{$self->{'tag'}}}) {
                        my $format;

                        $log->logdie("expected tag HASH and got $tag") unless (ref($tag) eq 'HASH');
                        $format = $tag->{'value'} || $tag->{'content'};

                        $log->debug("format: $format");

                        # formats which contain lookups are filled in, all
                        # other formats are parsed
                        my @v;
                        if ($self->{'lookup_regex'} && $format =~ $self->{'lookup_regex'}) {
                                @v = $self->fill_in_to_arr($rec,$format);
                        } else {
                                @v = $self->parse_to_arr($rec,$format);
                        }
                        next if (! @v);

                        if ($tag->{'sort'}) {
                                @v = $self->sort_arr(@v);
                        }

                        # use format?
                        if ($tag->{'format_name'}) {
                                @v = map { $self->apply_format($tag->{'format_name'},$tag->{'format_delimiter'},$_) } @v;
                        }

                        if ($field eq 'filename') {
                                # filename is remembered, but its values are
                                # also stored in row below
                                $self->{'current_filename'} = join('',@v);
                                $log->debug("filename: ",$self->{'current_filename'});
                        } elsif ($field eq 'headline') {
                                $self->{'headline'} .= join('',@v);
                                $log->debug("headline: ",$self->{'headline'});
                                next; # don't return headline in data_structure!
                        }

                        # delimiter will join repeatable fields
                        if ($tag->{'delimiter'}) {
                                @v = ( join($tag->{'delimiter'}, @v) );
                        }

                        # default types
                        my @types = qw(display swish);
                        # override by type attribute
                        @types = ( $tag->{'type'} ) if ($tag->{'type'});

                        foreach my $type (@types) {
                                # append to previous line?
                                $log->debug("type: $type ",sub { join(" ",@v) }, $tag->{'append'} || 'no append');
                                if ($tag->{'append'}) {

                                        # appended part is separated using
                                        # delimiter of tag (single space by
                                        # default)
                                        my $d = $tag->{'delimiter'};
                                        # default delimiter
                                        $d ||= " ";

                                        # nothing to append to: no separator
                                        my $last = pop @{$row->{$type}};
                                        $d = "" if (! $last);
                                        $last .= $d . join($d, @v);
                                        push @{$row->{$type}}, $last;

                                } else {
                                        push @{$row->{$type}}, @v;
                                }
                        }


                }

                if ($row) {
                        $row->{'tag'} = $field;

                        # TODO: name_sigular, name_plural
                        my $name = $self->{'import_xml'}->{'indexer'}->{$field}->{'name'};
                        $row->{'name'} = $name ? $self->_x($name) : $field;

                        # post-sort all values in field
                        if ($self->{'import_xml'}->{'indexer'}->{$field}->{'sort'}) {
                                $log->warn("sort at field tag not implemented");
                        }

                        push @ds, $row;

                        $log->debug("row $field: ",sub { Dumper($row) });
                }

        }

        $log->logdie("there is no current_filename defined! Do you have filename tag in conf/normalize/?.xml") unless ($self->{'current_filename'});

        # hand created structure to db object (if there is one) for storage
        $self->{'db'}->save_ds(
                ds => \@ds,
                current_filename => $self->{'current_filename'},
                headline => $self->{'headline'},
        ) if ($self->{'db'});

        $log->debug("ds: ", sub { Dumper(@ds) });

        return @ds;

}

=head2 parse

Perform smart parsing of string, skipping delimiters for fields which aren't
defined. It can also eval code in format starting with C<eval{...}> and
return output or nothing depending on eval code.

 my $text = $webpac->parse($rec,'eval{"v901^a" eq "Deskriptor"}descriptor: v250^a', $i);

=cut

sub parse {
        my $self = shift;

        my ($rec, $format_utf8, $i) = @_;

        return if (! $format_utf8);

        my $log = $self->_get_logger();

        $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);

        # $i is index of occurrence for repeatable (v) fields
        $i = 0 if (! $i);

        my $format = $self->_x($format_utf8) || $log->logconfess("can't convert '$format_utf8' from UTF-8 to ",$self->{'code_page'});

        my @out;

        $log->debug("format: $format");

        # Literal braces in patterns below are escaped: an unescaped "{" in
        # a pattern is deprecated and newer perls warn about it or reject it.

        my $eval_code;
        # remove eval{...} from beginning
        $eval_code = $1 if ($format =~ s/^eval\{([^}]+)\}//s);

        my $filter_name;
        # remove filter{...} from beginning
        $filter_name = $1 if ($format =~ s/^filter\{([^}]+)\}//s);

        my $prefix;
        my $all_found=0;

        # consume format from the left: text in front of each field
        # placeholder is its delimiter, and is emitted only if field exists
        while ($format =~ s/^(.*?)(v|s)(\d+)(?:\^(\w))?//s) {

                my $del = $1 || '';
                # remember leading text until first field with data is found
                $prefix ||= $del if ($all_found == 0);

                # repeatable index
                my $r = $i;
                # s fields always use first occurrence
                $r = 0 if (lc("$2") eq 's');

                my $found = 0;
                my $tmp = $self->get_data(\$rec,$3,$4,$r,\$found);

                if ($found) {
                        push @out, $del;
                        push @out, $tmp;
                        $all_found += $found;
                }
        }

        return if (! $all_found);

        my $out = join('',@out);

        if ($out) {
                # add rest of format (suffix)
                $out .= $format;

                # add prefix if not there
                $out = $prefix . $out if ($out !~ m/^\Q$prefix\E/);

                $log->debug("result: $out");
        }

        # eval code gets field values filled in first; false result of eval
        # (or eval code without any data) suppresses output
        if ($eval_code) {
                my $eval = $self->fill_in($rec,$eval_code,$i) || return;
                $log->debug("about to eval{$eval} format: $out");
                return if (! $self->_eval($eval));
        }
        
        # filters which aren't defined are silently skipped
        if ($filter_name && $self->{'filter'}->{$filter_name}) {
                $log->debug("about to filter{$filter_name} format: $out");
                $out = $self->{'filter'}->{$filter_name}->($out);
                return unless(defined($out));
                $log->debug("filter result: $out");
        }

        return $out;
}

=head2 parse_to_arr

Similar to C<parse>, but returns array of all repeatable fields

 my @arr = $webpac->parse_to_arr($rec,'v250^a');

=cut

sub parse_to_arr {
        my $self = shift;

        my ($rec, $format_utf8) = @_;

        my $log = $self->_get_logger();

        $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
        return if (! $format_utf8);

        # Format which holds only non-repeatable (s) fields produces the same
        # value for every index, so loop below would never end; use the same
        # limit as fill_in does.
        my $max_iterations = $self->{'max_mfn'} || 9999;

        my @arr;

        for (my $i = 0; ; $i++) {
                $log->logdie("infinite loop in format $format_utf8") if ($i > $max_iterations);

                # first false value marks end of repeatable values
                my $v = $self->parse($rec,$format_utf8,$i);
                last if (! $v);

                push @arr, $v;
        }

        $log->debug("format '$format_utf8' returned ",scalar(@arr)," elements: ", sub { join(" | ",@arr) }) if (@arr);

        return @arr;
}


=head2 fill_in

Workhorse of all: takes a record from the in-memory structure of the database
and a string with placeholders and returns a string (or list) with
placeholders substituted by values from the record.

 my $text = $webpac->fill_in($rec,'v250^a');

Optional argument is the ordinal number for repeatable fields. By default,
it's assumed to be the first repeatable field (fields are perl arrays, so the
first element is 0).
The following example will read the second value from a repeatable field.

 my $text = $webpac->fill_in($rec,'Title: v250^a',1);

This function B<does not> perform parsing of format to intelligently skip
delimiters before fields which aren't used.

This method will automatically decode UTF-8 string to local code page
if needed.

=cut

sub fill_in {
        my $self = shift;

        my $log = $self->_get_logger();

        my $rec = shift || $log->logconfess("need data record");
        my $format = shift || $log->logconfess("need format to parse");
        # iteration (for repeatable fields)
        my $i = shift || 0;

        # callers like fill_in_to_arr increase $i until nothing is returned;
        # this stops formats which never run out of values
        $log->logdie("infitite loop in format $format") if ($i > ($self->{'max_mfn'} || 9999));

        # FIXME remove for speedup?
        $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);

        if (utf8::is_utf8($format)) {
                $format = $self->_x($format);
        }

        my $found = 0;

        # Literal braces in patterns below are escaped: an unescaped "{" in
        # a pattern is deprecated and newer perls warn about it or reject it.

        my $eval_code;
        # remove eval{...} from beginning
        $eval_code = $1 if ($format =~ s/^eval\{([^}]+)\}//s);

        my $filter_name;
        # remove filter{...} from beginning
        $filter_name = $1 if ($format =~ s/^filter\{([^}]+)\}//s);

        # Do actual replacement of placeholders in a single pass, so that
        # data which was just inserted is never scanned for placeholders
        # again (value like "class12" must not be taken for field s12).
        # Repeatable (v) fields use index $i, non-repeatable (s) fields
        # always use first occurrence.
        $format =~ s/([vs])(\d+)(?:\^(\w))?/$self->get_data(\$rec,$2,$3,($1 eq 'v' ? $i : 0),\$found)/ge;

        # nothing from record was used
        return if (! $found);

        $log->debug("format: $format");

        # eval code gets field values filled in first; false result of eval
        # suppresses output
        if ($eval_code) {
                my $eval = $self->fill_in($rec,$eval_code,$i);
                return if (! $self->_eval($eval));
        }

        # filters which aren't defined are silently skipped
        if ($filter_name && $self->{'filter'}->{$filter_name}) {
                $log->debug("filter '$filter_name' for $format");
                $format = $self->{'filter'}->{$filter_name}->($format);
                return unless(defined($format));
                $log->debug("filter result: $format");
        }

        # do we have lookups?
        if ($self->{'lookup'}) {
                return $self->{'lookup'}->lookup($format) if ($self->{'lookup'}->can('lookup'));

                # return filled in format instead of whatever logger returns
                $log->warn("Have lookup object but can't invoke lookup method");
        }

        return $format;
}


=head2 fill_in_to_arr

Similar to C<fill_in>, but returns array of all repeatable fields. Usable
for fields which have lookups, so they shouldn't be parsed but rather
C<fill_in>ed.

 my @arr = $webpac->fill_in_to_arr($rec,'[v900];;[v250^a]');

=cut

sub fill_in_to_arr {
        my ($self, $rec, $format_utf8) = @_;

        my $log = $self->_get_logger();

        $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
        return if (! $format_utf8);

        my @arr;
        my $calls = 0;

        # call fill_in with increasing index until it returns empty list
        for (;;) {
                my @v = $self->fill_in($rec,$format_utf8,$calls++);
                last if (! @v);
                push @arr, @v;
        }

        # last call returned nothing, so it isn't counted
        $log->debug("format '$format_utf8' returned ",$calls - 1," elements: ", sub { join(" | ",@arr) }) if (@arr);

        return @arr;
}


=head2 get_data

Returns value from record.

 my $text = $self->get_data(\$rec,$f,$sf,$i,\$found);

Arguments are:
record reference C<$rec>,
field C<$f>,
optional subfield C<$sf>,
index for repeatable values C<$i>.

Optional variable C<$found> will be incremented if the
field is found.

Returns value or empty string.

=cut

sub get_data {
        my $self = shift;

        my ($rec,$f,$sf,$i,$found) = @_;

        # field doesn't exist in record
        return '' if (! $$rec->{$f});

        # no (or empty) occurrence with this index
        my $value = $$rec->{$f}->[$i];
        return '' if (! $value);

        my $has_subfields = (ref($value) eq 'HASH');

        # $found is optional: count only if caller passed reference to
        # defined counter
        my $count_it = (ref($found) && defined($$found));

        # subfield was requested (0 is valid subfield name)
        if (defined($sf) && $sf ne '' && $has_subfields) {
                # requested subfield which is missing (or empty) is not
                # found: other subfields must not be returned in its place
                my $sub_value = $value->{$sf};
                return '' if (! $sub_value);

                $$found++ if ($count_it);
                return $sub_value;
        }

        # whole field: either no subfield was requested, or field has no
        # subfields at all
        $$found++ if ($count_it);

        if ($has_subfields) {
                # dump all subfields, each followed by space, in sorted
                # order of subfield names so that result is repeatable
                return join('', map { $value->{$_} . " " } sort keys %{$value});
        }

        return $value;
}


=head2 apply_format

Apply format specified in tag with C<format_name="name"> and
C<format_delimiter=";;">.

 my $text = $webpac->apply_format($format_name,$format_delimiter,$data);

Formats can contain C<lookup{...}> if you need them.

=cut

sub apply_format {
        my $self = shift;

        my ($name,$delimiter,$data) = @_;

        my $log = $self->_get_logger();

        # unknown format: data is returned unchanged
        if (! $self->{'import_xml'}->{'format'}->{$name}) {
                $log->warn("<format name=\"$name\"> is not defined in ",$self->{'import_xml_file'});
                return $data;
        }

        $log->warn("no delimiter for format $name") if (! $delimiter);

        my $format = $self->_x($self->{'import_xml'}->{'format'}->{$name}->{'content'}) || $log->logdie("can't find format '$name'");

        # Without delimiter there is nothing to split on: pass data as single
        # value. Empty pattern would split data into single characters (and
        # undefined delimiter would produce a warning on top of that).
        my @data = (defined($delimiter) && $delimiter ne '')
                ? split(/\Q$delimiter\E/, $data)
                : ($data);

        my $out = sprintf($format, @data);
        $log->debug("using format $name [$format] on $data to produce: $out");

        # formatted value can contain lookups which are resolved last
        if ($self->{'lookup_regex'} && $out =~ $self->{'lookup_regex'}) {
                return $self->{'lookup'}->lookup($out);
        } else {
                return $out;
        }

}

=head2 sort_arr

Sort array ignoring case and html in data

 my @sorted = $webpac->sort_arr(@unsorted);

=cut

sub sort_arr {
        my $self = shift;

        my $log = $self->_get_logger();

        # Sort keys (lower case text with html tags removed) are computed
        # once for each value and values themselves are left untouched.
        # $a and $b inside sort block are aliases of sorted data, so
        # removing tags from them would modify the data of caller.
        #
        # NOTE(review): values are returned in descending order ($b is
        # compared to $a); kept as it was -- confirm that it's intended
        my @sorted =
                map  { $_->[1] }
                sort { $b->[0] cmp $a->[0] }
                map  {
                        my $key = $_;
                        $key =~ s#<[^>]+/*>##g;
                        [ lc($key), $_ ];
                } @_;

        $log->debug("sorted values: ",sub { join(", ",@sorted) });

        return @sorted;
}


=head1 INTERNAL METHODS

=head2 _sort_by_order

Sort xml tags data structure according to C<order=""> attribute.

=cut

sub _sort_by_order {
        my $self = shift;

        # comparator for sort: $a and $b are names of tags to compare
        my $indexer = $self->{'import_xml'}->{'indexer'};

        my $order_a = $indexer->{$a}->{'order'};
        my $order_b = $indexer->{$b}->{'order'};

        # Tags without order (or with order 0) come after all ordered tags.
        # They are ordered by name, so that result doesn't depend on memory
        # addresses of their definitions.
        return $a cmp $b if (! $order_a && ! $order_b);
        return  1 if (! $order_a);
        return -1 if (! $order_b);

        # tags with same order are ordered by name
        return ($order_a <=> $order_b) || ($a cmp $b);
}

=head2 _x

Convert strings from C<conf/normalize/*.xml> encoding into application
specific encoding (optionally specified using C<code_page> to C<new>
constructor).

 my $text = $n->_x('normalize text string');

This is a stub so that other modules don't have to implement it.

=cut

sub _x {
        # stub: text is handed back exactly as it was received
        my ($self, $text) = @_;
        return $text;
}


=head1 AUTHOR

Dobrica Pavlinusic, C<< <dpavlin@rot13.org> >>

=head1 COPYRIGHT & LICENSE

Copyright 2005 Dobrica Pavlinusic, All Rights Reserved.

This program is free software; you can redistribute it and/or modify it
under the same terms as Perl itself.

=cut

1; # End of WebPAC::Normalize
1	dpavlin	10	package WebPAC::Normalize;
2
3			use warnings;
4			use strict;
5	dpavlin	29	use base 'WebPAC::Common';
6	dpavlin	13	use Data::Dumper;
7	dpavlin	10
8			=head1 NAME
9
10	dpavlin	15	WebPAC::Normalize - data mungling for normalisation
11	dpavlin	10
12			=head1 VERSION
13
14			Version 0.01
15
16			=cut
17
18			our $VERSION = '0.01';
19
20			=head1 SYNOPSIS
21
22	dpavlin	15	This package contains code that mungle data to produce normalized format.
23	dpavlin	10
24	dpavlin	15	It contains several assumptions:
25
26			=over
27
28			=item *
29
30			format of fields is defined using C<v123^a> notation for repeatable fields
31			or C<s123^a> for single (or first) value, where C<123> is field number and
32			C<a> is subfield.
33
34			=item *
35
36			source data records (C<$rec>) have unique identifiers in field C<000>
37
38			=item *
39
40			optional C<eval{length('v123^a') == 3}> tag at B<beginning of format> will be
41			perl code that is evaluated before producing output (value of field will be
42			interpolated before that)
43
44			=item *
45
46			optional C<filter{filter_name}> at B<begining of format> will apply perl
47			code defined as code ref on format after field substitution to producing
48			output
49
50			=item *
51
52			optional C<lookup{...}> will be then performed. See C<WebPAC::Lookups>.
53
54			=item *
55
56			at end, optional C<format>s rules are resolved. Format rules are similar to
57			C<sprintf> and can also contain C<lookup{...}> which is performed after
58			values are inserted in format.
59
60			=back
61
62			This also describes order in which transformations are applied (eval,
63			filter, lookup, format) which is important to undestand when deciding how to
64			solve your data mungling and normalisation process.
65
66
67
68
69	dpavlin	10	=head1 FUNCTIONS
70
71	dpavlin	13	=head2 new
72	dpavlin	10
73	dpavlin	13	Create new normalisation object
74
75			my $n = new WebPAC::Normalize::Something(
76	dpavlin	15	filter => {
77			'filter_name_1' => sub {
78			# filter code
79			return length($_);
80			}, ...
81			},
82	dpavlin	29	db => $db_obj,
83	dpavlin	13	lookup_regex => $lookup->regex,
84	dpavlin	31	lookup => $lookup_obj,
85	dpavlin	13	);
86
87	dpavlin	15	Parametar C<filter> defines user supplied snippets of perl code which can
88			be use with C<filter{...}> notation.
89
90	dpavlin	13	Recommended parametar C<lookup_regex> is used to enable parsing of lookups
91	dpavlin	31	in structures. If you pass this parametar, you must also pass C<lookup>
92			which is C<WebPAC::Lookup> object.
93	dpavlin	13
94	dpavlin	10	=cut
95
96	dpavlin	13	sub new {
97			my $class = shift;
98			my $self = {@_};
99			bless($self, $class);
100
101	dpavlin	31	my $r = $self->{'lookup_regex'} ? 1 : 0;
102			my $l = $self->{'lookup'} ? 1 : 0;
103
104			my $log = $self->_get_logger();
105
106			# those two must be in pair
107			if ( ($r & $l) != ($r \|\| $l) ) {
108			my $log = $self->_get_logger();
109			$log->logdie("lookup_regex and lookup must be in pair");
110			}
111
112			$log->logdie("lookup must be WebPAC::Lookup object") if ($self->{'lookup'} && ! $self->{'lookup'}->isa('WebPAC::Lookup'));
113
114	dpavlin	13	$self ? return $self : return undef;
115	dpavlin	10	}
116
117	dpavlin	13
118			=head2 data_structure
119
120			Create in-memory data structure which represents normalized layout from
121			C<conf/normalize/*.xml>.
122
123			This structures are used to produce output.
124
125			my @ds = $webpac->data_structure($rec);
126
127			B<Note: historical oddity follows>
128
129			This method will also set C<< $webpac->{'currnet_filename'} >> if there is
130			C<< <filename> >> tag and C<< $webpac->{'headline'} >> if there is
131			C<< <headline> >> tag.
132
133			=cut
134
135			sub data_structure {
136			my $self = shift;
137
138			my $log = $self->_get_logger();
139
140			my $rec = shift;
141			$log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
142
143			my $cache_file;
144
145	dpavlin	18	if ($self->{'db'}) {
146	dpavlin	22	my @ds = $self->{'db'}->load_ds($rec);
147	dpavlin	29	$log->debug("load_ds( rec = ", sub { Dumper($rec) }, ") = ", sub { Dumper(@ds) });
148			return @ds if ($#ds > 0);
149			$log->debug("cache miss, creating");
150	dpavlin	13	}
151
152			undef $self->{'currnet_filename'};
153			undef $self->{'headline'};
154
155			my @sorted_tags;
156			if ($self->{tags_by_order}) {
157			@sorted_tags = @{$self->{tags_by_order}};
158			} else {
159			@sorted_tags = sort { $self->_sort_by_order } keys %{$self->{'import_xml'}->{'indexer'}};
160			$self->{tags_by_order} = \@sorted_tags;
161			}
162
163			my @ds;
164
165			$log->debug("tags: ",sub { join(", ",@sorted_tags) });
166
167			foreach my $field (@sorted_tags) {
168
169			my $row;
170
171			#print "field $field [",$self->{'tag'},"] = ",Dumper($self->{'import_xml'}->{'indexer'}->{$field}->{$self->{'tag'}});
172
173			foreach my $tag (@{$self->{'import_xml'}->{'indexer'}->{$field}->{$self->{'tag'}}}) {
174	dpavlin	38	my $format;
175	dpavlin	13
176	dpavlin	38	$log->logdie("expected tag HASH and got $tag") unless (ref($tag) eq 'HASH');
177			$format = $tag->{'value'} \|\| $tag->{'content'};
178
179	dpavlin	13	$log->debug("format: $format");
180
181			my @v;
182			if ($self->{'lookup_regex'} && $format =~ $self->{'lookup_regex'}) {
183			@v = $self->fill_in_to_arr($rec,$format);
184			} else {
185			@v = $self->parse_to_arr($rec,$format);
186			}
187			next if (! @v);
188
189			if ($tag->{'sort'}) {
190			@v = $self->sort_arr(@v);
191			}
192
193			# use format?
194			if ($tag->{'format_name'}) {
195			@v = map { $self->apply_format($tag->{'format_name'},$tag->{'format_delimiter'},$_) } @v;
196			}
197
198			if ($field eq 'filename') {
199			$self->{'current_filename'} = join('',@v);
200			$log->debug("filename: ",$self->{'current_filename'});
201			} elsif ($field eq 'headline') {
202			$self->{'headline'} .= join('',@v);
203			$log->debug("headline: ",$self->{'headline'});
204			next; # don't return headline in data_structure!
205			}
206
207			# delimiter will join repeatable fields
208			if ($tag->{'delimiter'}) {
209			@v = ( join($tag->{'delimiter'}, @v) );
210			}
211
212			# default types
213			my @types = qw(display swish);
214			# override by type attribute
215			@types = ( $tag->{'type'} ) if ($tag->{'type'});
216
217			foreach my $type (@types) {
218			# append to previous line?
219			$log->debug("type: $type ",sub { join(" ",@v) }, $row->{'append'} \|\| 'no append');
220			if ($tag->{'append'}) {
221
222			# I will delimit appended part with
223			# delimiter (or ,)
224			my $d = $tag->{'delimiter'};
225			# default delimiter
226			$d \|\|= " ";
227
228			my $last = pop @{$row->{$type}};
229			$d = "" if (! $last);
230			$last .= $d . join($d, @v);
231			push @{$row->{$type}}, $last;
232
233			} else {
234			push @{$row->{$type}}, @v;
235			}
236			}
237
238
239			}
240
241			if ($row) {
242			$row->{'tag'} = $field;
243
244			# TODO: name_sigular, name_plural
245			my $name = $self->{'import_xml'}->{'indexer'}->{$field}->{'name'};
246			$row->{'name'} = $name ? $self->_x($name) : $field;
247
248			# post-sort all values in field
249			if ($self->{'import_xml'}->{'indexer'}->{$field}->{'sort'}) {
250			$log->warn("sort at field tag not implemented");
251			}
252
253			push @ds, $row;
254
255			$log->debug("row $field: ",sub { Dumper($row) });
256			}
257
258			}
259
260	dpavlin	39	$log->logdie("there is no current_filename defined! Do you have filename tag in conf/normalize/?.xml") unless ($self->{'current_filename'});
261
262	dpavlin	22	$self->{'db'}->save_ds(
263	dpavlin	18	ds => \@ds,
264			current_filename => $self->{'current_filename'},
265			headline => $self->{'headline'},
266			) if ($self->{'db'});
267	dpavlin	13
268	dpavlin	29	$log->debug("ds: ", sub { Dumper(@ds) });
269
270	dpavlin	13	return @ds;
271
272			}
273
274			=head2 parse
275
276			Perform smart parsing of string, skipping delimiters for fields which aren't
277			defined. It can also eval code in format starting with C<eval{...}> and
278			return output or nothing depending on eval code.
279
280			my $text = $webpac->parse($rec,'eval{"v901^a" eq "Deskriptor"}descriptor: v250^a', $i);
281
282			=cut
283
284			sub parse {
285			my $self = shift;
286
287			my ($rec, $format_utf8, $i) = @_;
288
289			return if (! $format_utf8);
290
291			my $log = $self->_get_logger();
292
293			$log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
294
295			$i = 0 if (! $i);
296
297			my $format = $self->_x($format_utf8) \|\| $log->logconfess("can't convert '$format_utf8' from UTF-8 to ",$self->{'code_page'});
298
299			my @out;
300
301			$log->debug("format: $format");
302
303			my $eval_code;
304			# remove eval{...} from beginning
305			$eval_code = $1 if ($format =~ s/^eval{([^}]+)}//s);
306
307			my $filter_name;
308			# remove filter{...} from beginning
309			$filter_name = $1 if ($format =~ s/^filter{([^}]+)}//s);
310
311			my $prefix;
312			my $all_found=0;
313
314			while ($format =~ s/^(.*?)(v\|s)(\d+)(?:\^(\w))?//s) {
315
316			my $del = $1 \|\| '';
317			$prefix \|\|= $del if ($all_found == 0);
318
319			# repeatable index
320			my $r = $i;
321			$r = 0 if (lc("$2") eq 's');
322
323			my $found = 0;
324			my $tmp = $self->get_data(\$rec,$3,$4,$r,\$found);
325
326			if ($found) {
327			push @out, $del;
328			push @out, $tmp;
329			$all_found += $found;
330			}
331			}
332
333			return if (! $all_found);
334
335			my $out = join('',@out);
336
337			if ($out) {
338			# add rest of format (suffix)
339			$out .= $format;
340
341			# add prefix if not there
342			$out = $prefix . $out if ($out !~ m/^\Q$prefix\E/);
343
344			$log->debug("result: $out");
345			}
346
347			if ($eval_code) {
348			my $eval = $self->fill_in($rec,$eval_code,$i) \|\| return;
349			$log->debug("about to eval{$eval} format: $out");
350			return if (! $self->_eval($eval));
351			}
352
353			if ($filter_name && $self->{'filter'}->{$filter_name}) {
354			$log->debug("about to filter{$filter_name} format: $out");
355			$out = $self->{'filter'}->{$filter_name}->($out);
356			return unless(defined($out));
357			$log->debug("filter result: $out");
358			}
359
360			return $out;
361			}
362
363			=head2 parse_to_arr
364
365			Similar to C<parse>, but returns array of all repeatable fields
366
367			my @arr = $webpac->parse_to_arr($rec,'v250^a');
368
369			=cut
370
371			sub parse_to_arr {
372			my $self = shift;
373
374			my ($rec, $format_utf8) = @_;
375
376			my $log = $self->_get_logger();
377
378			$log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
379			return if (! $format_utf8);
380
381			my $i = 0;
382			my @arr;
383
384			while (my $v = $self->parse($rec,$format_utf8,$i++)) {
385			push @arr, $v;
386			}
387
388			$log->debug("format '$format_utf8' returned ",--$i," elements: ", sub { join(" \| ",@arr) }) if (@arr);
389
390			return @arr;
391			}
392
393	dpavlin	15
394			=head2 fill_in
395
396			Workhourse of all: takes record from in-memory structure of database and
397			strings with placeholders and returns string or array of with substituted
398			values from record.
399
400			my $text = $webpac->fill_in($rec,'v250^a');
401
402			Optional argument is ordinal number for repeatable fields. By default,
403			it's assume to be first repeatable field (fields are perl array, so first
404			element is 0).
405			Following example will read second value from repeatable field.
406
407			my $text = $webpac->fill_in($rec,'Title: v250^a',1);
408
409			This function B<does not> perform parsing of format to inteligenty skip
410			delimiters before fields which aren't used.
411
412			This method will automatically decode UTF-8 string to local code page
413			if needed.
414
415			=cut
416
# Substitute field placeholders in a format string with data from a record.
#
# Arguments:
#   $rec    - record (hash reference), required
#   $format - format string, required; may begin with an eval{...} tag
#             followed by a filter{...} tag
#   $i      - optional ordinal of the repeatable value (defaults to 0)
#
# Returns nothing when no placeholder produced data, when the eval{...}
# code evaluates to false, or when the filter returns undef. Otherwise
# returns the result of the lookup object's lookup() method if a usable
# lookup object is configured, or the filled-in format itself.
sub fill_in {
	my $self = shift;

	my $log = $self->_get_logger();

	my $rec = shift || $log->logconfess("need data record");
	my $format = shift || $log->logconfess("need format to parse");
	# iteration (for repeatable fields)
	my $i = shift || 0;

	# guard against callers which keep incrementing $i forever
	$log->logdie("infitite loop in format $format") if ($i > ($self->{'max_mfn'} || 9999));

	# FIXME remove for speedup?
	$log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);

	if (utf8::is_utf8($format)) {
		$format = $self->_x($format);
	}

	# incremented by get_data for every placeholder which has data
	my $found = 0;

	# remove eval{...} from beginning
	# (literal braces are escaped; newer perls warn about or reject
	# unescaped '{' in patterns)
	my $eval_code;
	$eval_code = $1 if ($format =~ s/^eval\{([^}]+)\}//s);

	# remove filter{...} from beginning
	my $filter_name;
	$filter_name = $1 if ($format =~ s/^filter\{([^}]+)\}//s);

	# do actual replacement of placeholders in a single pass, so that data
	# inserted for one placeholder is never re-scanned as another one.
	# v123^a = repeatable field (uses $i), s123^a = first value only (index 0)
	$format =~ s/([vs])(\d+)(?:\^(\w))?/$self->get_data(\$rec, $2, $3, ($1 eq 'v' ? $i : 0), \$found)/ges;

	# nothing was substituted
	return unless ($found);

	$log->debug("format: $format");

	if ($eval_code) {
		my $eval = $self->fill_in($rec,$eval_code,$i);
		return if (! $self->_eval($eval));
	}

	if ($filter_name && $self->{'filter'}->{$filter_name}) {
		$log->debug("filter '$filter_name' for $format");
		$format = $self->{'filter'}->{$filter_name}->($format);
		return unless(defined($format));
		$log->debug("filter result: $format");
	}

	# do we have lookups?
	if ($self->{'lookup'}) {
		if ($self->{'lookup'}->can('lookup')) {
			return $self->{'lookup'}->lookup($format);
		}
		# unusable lookup object: warn and fall back to the plain format
		$log->warn("Have lookup object but can't invoke lookup method");
	}

	return $format;
}
478
479
480	dpavlin	13	=head2 fill_in_to_arr
481
482			Similar to C<fill_in>, but returns array of all repeatable fields. Usable
483			for fields which have lookups, so they shouldn't be parsed but rather
484			C<fill_in>ed.
485
486			my @arr = $webpac->fill_in_to_arr($rec,'[v900];;[v250^a]');
487
488			=cut
489
# Collect the results of fill_in() for ordinals 0, 1, 2, ... until a call
# returns an empty list.
#
# Arguments: record (must stringify as a HASH reference) and format string.
# Returns the flattened list of everything fill_in() returned, or nothing
# when no format is given.
sub fill_in_to_arr {
	my ($self, $rec, $format_utf8) = @_;

	my $log = $self->_get_logger();

	unless ($rec =~ /HASH/o) {
		$log->logconfess("need HASH as first argument!");
	}
	return unless ($format_utf8);

	my @collected;
	my $calls = 0;

	while (1) {
		my @chunk = $self->fill_in($rec, $format_utf8, $calls++);
		last unless (@chunk);
		push @collected, @chunk;
	}

	if (@collected) {
		# $calls also counted the final call which returned nothing
		$log->debug(
			"format '$format_utf8' returned ", $calls - 1, " elements: ",
			sub { join(" | ", @collected) }
		);
	}

	return @collected;
}
511
512	dpavlin	15
513			=head2 get_data
514
515			Returns value from record.
516
517			my $text = $self->get_data(\$rec,$f,$sf,$i,\$found);
518
519			Arguments are:
520			record reference C<$rec>,
521			field C<$f>,
522			optional subfield C<$sf>,
523			index for repeatable values C<$i>.
524
525			Optional variable C<$found> will be incremented if the
526			field exists.
527
528			Returns value or empty string.
529
530			=cut
531
# Fetch one value from a record.
#
# Arguments:
#   $rec   - reference to the record (reference to a hash reference)
#   $f     - field
#   $sf    - optional subfield
#   $i     - index of the repeatable value
#   $found - optional reference to a counter; it is incremented when the
#            field has a value at index $i
#
# Returns the requested subfield if it exists with a true value. Otherwise,
# for values with subfields, returns all subfield values (in sorted subfield
# order, each followed by a space); for plain values returns the value itself.
# Returns empty string when there is no such field or repetition.
sub get_data {
	my $self = shift;

	my ($rec,$f,$sf,$i,$found) = @_;

	return '' unless ($$rec->{$f});

	my $value = $$rec->{$f}->[$i];
	return '' unless ($value);

	# decide by real reference type: a plain string which merely contains
	# "HASH" must not be treated as a hash of subfields
	require Scalar::Util;
	my $type = Scalar::Util::reftype($value);
	my $has_subfields = (defined($type) && $type eq 'HASH');

	$$found++ if (ref($found) && defined($$found));

	if ($has_subfields) {
		return $value->{$sf} if ($sf && $value->{$sf});

		# subfield not specified (or not present), so we'll dump all;
		# keys are sorted to make the output deterministic and join()
		# yields '' (not undef) when there are no subfields at all
		return join('', map { $value->{$_} . " " } sort keys %{$value});
	}

	return $value;
}
561
562
563			=head2 apply_format
564
565			Apply format specified in tag with C<format_name="name"> and
566			C<format_delimiter=";;">.
567
568			my $text = $webpac->apply_format($format_name,$format_delimiter,$data);
569
570			Formats can contain C<lookup{...}> if you need them.
571
572			=cut
573
# Apply a named sprintf-style format to delimiter-separated data.
#
# Arguments:
#   $name      - name of format defined in import_xml
#   $delimiter - string which separates values in $data
#   $data      - values to insert into format
#
# Returns $data unchanged (with a warning) if the format isn't defined.
# Otherwise returns the formatted string, passed through the lookup object
# when the result matches lookup_regex and a usable lookup object exists.
sub apply_format {
	my $self = shift;

	my ($name,$delimiter,$data) = @_;

	my $log = $self->_get_logger();

	if (! $self->{'import_xml'}->{'format'}->{$name}) {
		$log->warn("<format name=\"$name\"> is not defined in ",$self->{'import_xml_file'});
		return $data;
	}

	$log->warn("no delimiter for format $name") if (! $delimiter);

	my $format = $self->_x($self->{'import_xml'}->{'format'}->{$name}->{'content'}) || $log->logdie("can't find format '$name'");

	# split on an empty pattern would break data into single characters,
	# so without delimiter whole data is used as the only value
	my @data = (defined($delimiter) && length($delimiter))
		? split(/\Q$delimiter\E/, $data)
		: ($data);

	my $out = sprintf($format, @data);
	$log->debug("using format $name [$format] on $data to produce: $out");

	if ($self->{'lookup_regex'} && $out =~ $self->{'lookup_regex'}) {
		if ($self->{'lookup'} && $self->{'lookup'}->can('lookup')) {
			return $self->{'lookup'}->lookup($out);
		}
		# same fallback as fill_in: warn and return value without lookup
		$log->warn("format $name needs lookup, but there is no usable lookup object");
	}

	return $out;
}
602
603	dpavlin	13	=head2 sort_arr
604
605			Sort array ignoring case and html in data
606
607			my @sorted = $webpac->sort_arr(@unsorted);
608
609			=cut
610
# Sort values ignoring case and html markup.
#
# Arguments: list of values to sort.
# Returns the same values, unmodified, in descending order of their sort key
# (the lowercased value with html tags removed).
sub sort_arr {
	my $self = shift;

	my $log = $self->_get_logger();

	# Schwartzian transform: the sort key is computed once, on a copy.
	# $a and $b in a sort block are aliases to the caller's values, so
	# stripping tags from them directly would modify caller's data (and
	# die on read-only values).
	my @sorted =
		map  { $_->[1] }
		sort { $b->[0] cmp $a->[0] }
		map  {
			my $key = $_;
			$key =~ s#<[^>]+/*>##g;
			[ lc($key), $_ ]
		} @_;

	$log->debug("sorted values: ",sub { join(", ",@sorted) });

	return @sorted;
}
627
628
629	dpavlin	15	=head1 INTERNAL METHODS
630
631	dpavlin	13	=head2 _sort_by_order
632
633			Sort xml tags data structure according to C<order=""> attribute.
634
635			=cut
636
# Numeric comparison of the two tags held in the sort variables $a and $b.
# Each tag is ranked by the 'order' value of its indexer entry, or by the
# entry itself when 'order' is false.
sub _sort_by_order {
	my $self = shift;

	my @rank;
	foreach my $tag ($a, $b) {
		push @rank,
			$self->{'import_xml'}->{'indexer'}->{$tag}->{'order'} ||
			$self->{'import_xml'}->{'indexer'}->{$tag};
	}

	return $rank[0] <=> $rank[1];
}
647
648			=head2 _x
649
650	dpavlin	15	Convert strings from C<conf/normalize/*.xml> encoding into application
651			specific encoding (optionally specified using C<code_page> to C<new>
652			constructor).
653	dpavlin	13
654			my $text = $n->_x('normalize text string');
655
656			This is a stub so that other modules don't have to implement it.
657
658			=cut
659
# Encoding conversion stub: hands back its argument untouched.
sub _x {
	my ($self, $text) = @_;
	return $text;
}
664
665
666	dpavlin	10	=head1 AUTHOR
667
668			Dobrica Pavlinusic, C<< <dpavlin@rot13.org> >>
669
670			=head1 COPYRIGHT & LICENSE
671
672			Copyright 2005 Dobrica Pavlinusic, All Rights Reserved.
673
674			This program is free software; you can redistribute it and/or modify it
675			under the same terms as Perl itself.
676
677			=cut
678
679			1; # End of WebPAC::Normalize