lib/WebPAC/Normalize.pm

package WebPAC::Normalize;

use warnings;
use strict;
use base 'WebPAC::Common';
use Data::Dumper;
use Scalar::Util ();

=head1 NAME

WebPAC::Normalize - data munging for normalisation

=head1 VERSION

Version 0.04

=cut

our $VERSION = '0.04';

=head1 SYNOPSIS

This package contains code that munges data to produce a normalized format.

It contains several assumptions:

=over

=item *

format of fields is defined using C<v123^a> notation for repeatable fields
or C<s123^a> for single (or first) value, where C<123> is field number and
C<a> is subfield.

=item *

source data records (C<$rec>) have unique identifiers in field C<000>

=item *

optional C<eval{length('v123^a') == 3}> tag at B<beginning of format> will be
perl code that is evaluated before producing output (value of field will be
interpolated before that)

=item *

optional C<filter{filter_name}> at B<beginning of format> will apply perl
code defined as code ref on format after field substitution to produce
output

=item *

optional C<lookup{...}> will then be performed. See C<WebPAC::Lookup>.

=item *

at end, optional C<format>s rules are resolved. Format rules are similar to
C<sprintf> and can also contain C<lookup{...}> which is performed after
values are inserted in format.

=back

This also describes order in which transformations are applied (eval,
filter, lookup, format) which is important to understand when deciding how to
solve your data munging and normalisation process.


=head1 FUNCTIONS

=head2 new

Create new normalisation object

  my $n = new WebPAC::Normalize::Something(
        filter => {
                'filter_name_1' => sub {
                        # filter code
                        return length($_);
                }, ...
        },
        db => $db_obj,
        lookup_regex => $lookup->regex,
        lookup => $lookup_obj,
        prefix => 'foobar',
  );

Parameter C<filter> defines user supplied snippets of perl code which can
be used with C<filter{...}> notation.

C<prefix> is used to form filename for database record (to support multiple
source files which are joined in one database).

Recommended parameter C<lookup_regex> is used to enable parsing of lookups
in structures. If you pass this parameter, you must also pass C<lookup>
which is C<WebPAC::Lookup> object.

=cut

sub new {
        # Constructor: blesses the named arguments into $class and validates
        # the lookup configuration. Dies (through the logger) when only one of
        # lookup_regex / lookup is given or when lookup is not a
        # WebPAC::Lookup object; warns when no prefix is set.
        my $class = shift;
        my $self = {@_};
        bless($self, $class);

        my $log = $self->_get_logger();

        # those two must be in pair: either both are given or neither
        my $has_regex  = $self->{'lookup_regex'} ? 1 : 0;
        my $has_lookup = $self->{'lookup'} ? 1 : 0;
        $log->logdie("lookup_regex and lookup must be in pair") if ($has_regex != $has_lookup);

        # blessed() guards the isa() call: invoking a method on an unblessed
        # reference would die with perl's own error instead of our message
        if ($has_lookup) {
                my $lookup = $self->{'lookup'};
                $log->logdie("lookup must be WebPAC::Lookup object")
                        unless (Scalar::Util::blessed($lookup) && $lookup->isa('WebPAC::Lookup'));
        }

        $log->warn("no prefix defined. please check that!") unless ($self->{'prefix'});

        return $self;
}


=head2 data_structure

Create in-memory data structure which represents normalized layout from
C<conf/normalize/*.xml>.

These structures are used to produce output.

 my $ds = $webpac->data_structure($rec);

=cut

sub data_structure {
        # Build the normalized data structure for one record.
        #
        # Takes the record hash ($rec, which must carry its id in field 000)
        # and returns a hash reference keyed by row name; every row holds one
        # array of values per output type plus the originating 'tag'.
        # When a db object is configured, a stored structure is returned
        # instead of being rebuilt, and a freshly built one is saved.
        # Must be called in scalar context (dies through logconfess otherwise).
        my $self = shift;

        my $log = $self->_get_logger();

        my $rec = shift;
        $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);

        $log->debug("data_structure rec = ", sub { Dumper($rec) });

        $log->logdie("need unique ID (mfn) in field 000 of record ", sub { Dumper($rec) } ) unless (defined($rec->{'000'}));

        my $id = $rec->{'000'}->[0] || $log->logdie("field 000 isn't array!");

        # a previously stored structure short-circuits all the work below
        if ($self->{'db'}) {
                my $ds = $self->{'db'}->load_ds( id => $id, prefix => $self->{prefix} );
                $log->debug("load_ds( rec = ", sub { Dumper($rec) }, ") = ", sub { Dumper($ds) });
                return $ds if ($ds);
                $log->debug("cache miss, creating");
        }

        # NOTE(review): 'currnet_filename' looks like a misspelling of
        # 'current_filename'; the key is kept as-is because code outside this
        # sub may read it -- confirm before renaming
        undef $self->{'currnet_filename'};
        undef $self->{'headline'};

        # tag order is computed once per object and reused for later records
        my @sorted_tags;
        if ($self->{tags_by_order}) {
                @sorted_tags = @{$self->{tags_by_order}};
        } else {
                @sorted_tags = sort { $self->_sort_by_order } keys %{$self->{'import_xml'}->{'indexer'}};
                $self->{tags_by_order} = \@sorted_tags;
        }

        my $ds;

        $log->debug("tags: ",sub { join(", ",@sorted_tags) });

        foreach my $field (@sorted_tags) {

                my $row;

                foreach my $tag (@{$self->{'import_xml'}->{'indexer'}->{$field}->{$self->{'tag'}}}) {

                        $log->logdie("expected tag HASH and got $tag") unless (ref($tag) eq 'HASH');
                        my $format = $tag->{'value'} || $tag->{'content'};

                        # parse_to_arr and fill_in_to_arr both return nothing
                        # for an empty format, so skip it before it can
                        # trigger "uninitialized" warnings below
                        next if (! $format);

                        $log->debug("format: $format");

                        # formats containing lookups must be filled in, not parsed
                        my @v;
                        if ($self->{'lookup_regex'} && $format =~ $self->{'lookup_regex'}) {
                                @v = $self->fill_in_to_arr($rec,$format);
                        } else {
                                @v = $self->parse_to_arr($rec,$format);
                        }
                        next if (! @v);

                        if ($tag->{'sort'}) {
                                @v = $self->sort_arr(@v);
                        }

                        # use format?
                        if ($tag->{'format_name'}) {
                                @v = map { $self->apply_format($tag->{'format_name'},$tag->{'format_delimiter'},$_) } @v;
                        }

                        # delimiter will join repeatable fields
                        if ($tag->{'delimiter'}) {
                                @v = ( join($tag->{'delimiter'}, @v) );
                        }

                        # default types
                        my @types = qw(display search);
                        # override by type attribute
                        @types = ( $tag->{'type'} ) if ($tag->{'type'});

                        foreach my $type (@types) {
                                # append to previous line?
                                # (the flag lives on the tag, not on the row being built)
                                $log->debug("type: $type ",sub { join(" ",@v) }, $tag->{'append'} || 'no append');
                                if ($tag->{'append'}) {

                                        # I will delimit appended part with
                                        # delimiter (or space by default)
                                        my $d = $tag->{'delimiter'};
                                        $d ||= " ";

                                        # glue new values onto the last value
                                        # already collected for this type
                                        my $last = pop @{$row->{$type}};
                                        $d = "" if (! $last);
                                        $last .= $d . join($d, @v);
                                        push @{$row->{$type}}, $last;

                                } else {
                                        push @{$row->{$type}}, @v;
                                }
                        }

                }

                if ($row) {
                        $row->{'tag'} = $field;

                        # TODO: name_sigular, name_plural
                        my $name = $self->{'import_xml'}->{'indexer'}->{$field}->{'name'};
                        my $row_name = $name ? $self->_x($name) : $field;

                        # post-sort all values in field
                        if ($self->{'import_xml'}->{'indexer'}->{$field}->{'sort'}) {
                                $log->warn("sort at field tag not implemented");
                        }

                        $ds->{$row_name} = $row;

                        $log->debug("row $field: ",sub { Dumper($row) });
                }

        }

        $self->{'db'}->save_ds(
                id => $id,
                ds => $ds,
                prefix => $self->{prefix},
        ) if ($self->{'db'});

        $log->debug("ds: ", sub { Dumper($ds) });

        $log->logconfess("data structure returned is not array any more!") if wantarray;

        return $ds;

}

=head2 parse

Perform smart parsing of string, skipping delimiters for fields which aren't
defined. It can also eval code in format starting with C<eval{...}> and
return output or nothing depending on eval code.

 my $text = $webpac->parse($rec,'eval{"v901^a" eq "Deskriptor"}descriptor: v250^a', $i);

=cut

sub parse {
        # Expand one format string against $rec for repetition index $i.
        #
        # Placeholders whose field is missing are dropped together with the
        # literal text that precedes them. Returns the resulting string, or
        # nothing when the format is empty, when no placeholder produced a
        # value, when the eval{...} guard fails or when the filter returns
        # undef.
        my $self = shift;

        my ($rec, $format_utf8, $i) = @_;

        return if (! $format_utf8);

        my $log = $self->_get_logger();

        $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);

        $i = 0 if (! $i);

        my $format = $self->_x($format_utf8) || $log->logconfess("can't convert '$format_utf8' from UTF-8 to ",$self->{'code_page'});

        my @out;

        $log->debug("format: $format");

        # literal braces are escaped: an unescaped "{" in a pattern is
        # deprecated by newer perls
        my $eval_code;
        # remove eval{...} from beginning
        $eval_code = $1 if ($format =~ s/^eval\{([^}]+)\}//s);

        my $filter_name;
        # remove filter{...} from beginning
        $filter_name = $1 if ($format =~ s/^filter\{([^}]+)\}//s);

        my $prefix;
        my $all_found=0;

        # consume the format one placeholder at a time; what is left over
        # after the loop is the suffix
        while ($format =~ s/^(.*?)([vs])(\d+)(?:\^(\w))?//s) {

                # copy captures right away so that nothing can clobber them;
                # $del is kept verbatim (a literal "0" is a valid delimiter)
                my ($del, $kind, $field, $subfield) = ($1, $2, $3, $4);
                $prefix ||= $del if ($all_found == 0);

                # repeatable index; s123 placeholders always read first value
                my $r = ($kind eq 's') ? 0 : $i;

                my $found = 0;
                my $tmp = $self->get_data(\$rec,$field,$subfield,$r,\$found);

                if ($found) {
                        push @out, $del, $tmp;
                        $all_found += $found;
                }
        }

        return if (! $all_found);

        my $out = join('',@out);

        if ($out) {
                # add rest of format (suffix)
                $out .= $format;

                # add prefix if not there
                $out = $prefix . $out if ($out !~ m/^\Q$prefix\E/);

                $log->debug("result: $out");
        }

        # the eval guard is itself a format: fill in field values first
        if ($eval_code) {
                my $eval = $self->fill_in($rec,$eval_code,$i) || return;
                $log->debug("about to eval{$eval} format: $out");
                return if (! $self->_eval($eval));
        }

        if ($filter_name && $self->{'filter'}->{$filter_name}) {
                $log->debug("about to filter{$filter_name} format: $out");
                $out = $self->{'filter'}->{$filter_name}->($out);
                return unless(defined($out));
                $log->debug("filter result: $out");
        }

        return $out;
}

=head2 parse_to_arr

Similar to C<parse>, but returns array of all repeatable fields

 my @arr = $webpac->parse_to_arr($rec,'v250^a');

=cut

sub parse_to_arr {
        # Call parse() with increasing repetition index until it stops
        # producing a value and return everything collected so far.
        #
        # A format made only of s123 placeholders is found at every index, so
        # the loop is capped the same way fill_in() caps its index
        # (max_mfn, 9999 by default) instead of running forever.
        my $self = shift;

        my ($rec, $format_utf8) = @_;

        my $log = $self->_get_logger();

        $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
        return if (! $format_utf8);

        my $limit = $self->{'max_mfn'} || 9999;

        my $i = 0;
        my @arr;

        while (my $v = $self->parse($rec,$format_utf8,$i++)) {
                push @arr, $v;
                $log->logdie("infinite loop in format $format_utf8") if ($i > $limit);
        }

        $log->debug("format '$format_utf8' returned ",scalar(@arr)," elements: ", sub { join(" | ",@arr) }) if (@arr);

        return @arr;
}


=head2 fill_in

Workhorse of all: takes record from in-memory structure of database and
strings with placeholders and returns string or array with substituted
values from record.

 my $text = $webpac->fill_in($rec,'v250^a');

Optional argument is ordinal number for repeatable fields. By default,
it's assumed to be first repeatable field (fields are perl array, so first
element is 0).
Following example will read second value from repeatable field.

 my $text = $webpac->fill_in($rec,'Title: v250^a',1);

This function B<does not> perform parsing of format to intelligently skip
delimiters before fields which aren't used.

This method will automatically decode UTF-8 string to local code page
if needed.

=cut

sub fill_in {
        # Substitute every v123^a / s123^a placeholder in $format with data
        # from $rec (repetition $i for "v", always the first one for "s").
        #
        # Returns nothing when no placeholder produced a value, when the
        # eval{...} guard fails or when the filter returns undef. Otherwise
        # returns the filled-in string, passed through the lookup object when
        # one is configured.
        my $self = shift;

        my $log = $self->_get_logger();

        my $rec = shift || $log->logconfess("need data record");
        my $format = shift || $log->logconfess("need format to parse");
        # iteration (for repeatable fields)
        my $i = shift || 0;

        $log->logdie("infitite loop in format $format") if ($i > ($self->{'max_mfn'} || 9999));

        # FIXME remove for speedup?
        $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);

        if (utf8::is_utf8($format)) {
                $format = $self->_x($format);
        }

        my $found = 0;

        # literal braces are escaped: an unescaped "{" in a pattern is
        # deprecated by newer perls
        my $eval_code;
        # remove eval{...} from beginning
        $eval_code = $1 if ($format =~ s/^eval\{([^}]+)\}//s);

        my $filter_name;
        # remove filter{...} from beginning
        $filter_name = $1 if ($format =~ s/^filter\{([^}]+)\}//s);

        # do actual replacement of placeholders in a single pass, so that
        # text coming from the record is never scanned for placeholders again
        # (a value like "s200" must stay as it is)
        $format =~ s/([vs])(\d+)(?:\^(\w))?/
                my ($kind, $field, $subfield) = ($1, $2, $3);
                $self->get_data(\$rec, $field, $subfield, ($kind eq 's' ? 0 : $i), \$found)
        /ges;

        return if (! $found);

        $log->debug("format: $format");

        if ($eval_code) {
                my $eval = $self->fill_in($rec,$eval_code,$i);
                return if (! $self->_eval($eval));
        }

        if ($filter_name && $self->{'filter'}->{$filter_name}) {
                $log->debug("filter '$filter_name' for $format");
                $format = $self->{'filter'}->{$filter_name}->($format);
                return unless(defined($format));
                $log->debug("filter result: $format");
        }

        # do we have lookups?
        return $format if (! $self->{'lookup'});

        if ($self->{'lookup'}->can('lookup')) {
                return $self->{'lookup'}->lookup($format);
        }

        # unusable lookup object: hand back the text without lookups instead
        # of leaking the return value of the logger call
        $log->warn("Have lookup object but can't invoke lookup method");
        return $format;
}


=head2 fill_in_to_arr

Similar to C<fill_in>, but returns array of all repeatable fields. Usable
for fields which have lookups, so they shouldn't be parsed but rather
C<fill_in>ed.

 my @arr = $webpac->fill_in_to_arr($rec,'[v900];;[v250^a]');

=cut

sub fill_in_to_arr {
        # Collect the results of fill_in() for repetition 0, 1, 2, ... and
        # stop at the first repetition for which it returns an empty list.
        my ($self, $rec, $format_utf8) = @_;

        my $log = $self->_get_logger();

        $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
        return if (! $format_utf8);

        my @arr;
        my $i = 0;

        while (1) {
                my @v = $self->fill_in($rec,$format_utf8,$i++);
                last unless (@v);
                push @arr, @v;
        }

        # $i counts the calls made, the last of which returned nothing
        if (@arr) {
                $log->debug("format '$format_utf8' returned ",$i - 1," elements: ", sub { join(" | ",@arr) });
        }

        return @arr;
}


=head2 get_data

Returns value from record.

 my $text = $self->get_data(\$rec,$f,$sf,$i,\$found);

Arguments are:
record reference C<$rec>,
field C<$f>,
optional subfield C<$sf>,
index for repeatable values C<$i>.

Optional variable C<$found> will be incremented if there
is field.

Returns value or empty string.

=cut

sub get_data {
        # Fetch one value from the record.
        #
        #   $rec   - reference to the record hash reference
        #   $f     - field name
        #   $sf    - optional subfield name ("0" is a valid name)
        #   $i     - index of the repetition to read
        #   $found - optional reference to a counter which is incremented
        #            when a value is returned
        #
        # Returns the value, or an empty string when there is none. Without
        # $sf a field made of subfields is returned as all subfield values
        # (in key order), each followed by a space.
        # Values which are false in perl ("", "0") are treated as missing.
        my ($self, $rec, $f, $sf, $i, $found) = @_;

        my $field = $$rec->{$f};
        return '' if (! $field);

        my $value = $field->[$i];
        return '' if (! $value);

        # real reference checks instead of matching the stringified value, so
        # plain text which happens to contain "HASH" is not taken for a hash
        my $has_subfields = ref($value) && Scalar::Util::reftype($value) eq 'HASH';

        # the counter is optional and only touched when it is a usable reference
        my $count_it = ref($found) && defined($$found);

        if (defined($sf) && length($sf)) {
                return '' unless ($has_subfields && $value->{$sf});
                $$found++ if ($count_it);
                return $value->{$sf};
        }

        $$found++ if ($count_it);

        return $value if (! $has_subfields);

        # it still might have subfield, just not specified, so we'll dump
        # all; keys are sorted because hash order differs from run to run
        return join('', map { $value->{$_} . " " } sort keys %{$value});
}


=head2 apply_format

Apply format specified in tag with C<format_name="name"> and
C<format_delimiter=";;">.

 my $text = $webpac->apply_format($format_name,$format_delimiter,$data);

Formats can contain C<lookup{...}> if you need them.

=cut

sub apply_format {
        # Run $data through the named sprintf-like format.
        #
        # $data is split on $delimiter and the pieces become the sprintf
        # arguments. Returns $data unchanged (with a warning) when the format
        # is not defined; otherwise the formatted string, passed through the
        # lookup object when it matches lookup_regex.
        my $self = shift;

        my ($name,$delimiter,$data) = @_;

        my $log = $self->_get_logger();

        if (! $self->{'import_xml'}->{'format'}->{$name}) {
                $log->warn("<format name=\"$name\"> is not defined in ",$self->{'import_xml_file'});
                return $data;
        }

        $log->warn("no delimiter for format $name") if (! $delimiter);

        my $format = $self->_x($self->{'import_xml'}->{'format'}->{$name}->{'content'}) || $log->logdie("can't find format '$name'");

        # without a delimiter the data is a single argument; splitting on an
        # empty pattern would chop it into single characters
        my @data = (defined($delimiter) && length($delimiter))
                ? split(/\Q$delimiter\E/, $data)
                : ($data);

        my $out = sprintf($format, @data);
        $log->debug("using format $name [$format] on $data to produce: $out");

        if ($self->{'lookup_regex'} && $out =~ $self->{'lookup_regex'}) {
                return $self->{'lookup'}->lookup($out);
        }

        return $out;
}

=head2 sort_arr

Sort array ignoring case and html in data

 my @sorted = $webpac->sort_arr(@unsorted);

=cut

sub sort_arr {
        # Sort values ignoring case and html tags and return them unchanged.
        #
        # Sort keys are computed once per value on a copy: the comparator
        # must not modify $a / $b, which are aliases to the caller's data.
        # NOTE(review): values come out in descending order, exactly as the
        # previous comparator (lc($b) cmp lc($a)) produced -- confirm that
        # this is the intended direction.
        my $self = shift;

        my $log = $self->_get_logger();

        my @sorted =
                map  { $_->[1] }
                sort { $b->[0] cmp $a->[0] }
                map  {
                        my $key = $_;
                        $key =~ s#<[^>]+/*>##g;
                        [ lc($key), $_ ]
                } @_;

        $log->debug("sorted values: ",sub { join(", ",@sorted) });

        return @sorted;
}


=head1 INTERNAL METHODS

=head2 _sort_by_order

Sort xml tags data structure according to C<order=""> attribute.

=cut

sub _sort_by_order {
        # Comparator for sort(): reads the two tag names from the package
        # variables $a and $b and compares their 'order' attribute
        # numerically. A tag without (or with a false) 'order' is compared
        # through its indexer entry itself.
        my ($self) = @_;

        my @rank;
        foreach my $tag ($a, $b) {
                push @rank, $self->{'import_xml'}->{'indexer'}->{$tag}->{'order'}
                        || $self->{'import_xml'}->{'indexer'}->{$tag};
        }

        return $rank[0] <=> $rank[1];
}

=head2 _x

Convert strings from C<conf/normalize/*.xml> encoding into application
specific encoding (optionally specified using C<code_page> to C<new>
constructor).

 my $text = $n->_x('normalize text string');

This is a stub so that other modules don't have to implement it.

=cut

sub _x {
        # Stub: hands the given text back untouched. Callers go through this
        # method for every configuration string they use.
        my ($self, $text) = @_;
        return $text;
}


=head1 AUTHOR

Dobrica Pavlinusic, C<< <dpavlin@rot13.org> >>

=head1 COPYRIGHT & LICENSE

Copyright 2005 Dobrica Pavlinusic, All Rights Reserved.

This program is free software; you can redistribute it and/or modify it
under the same terms as Perl itself.

=cut

1; # End of WebPAC::Normalize
1	dpavlin	10	package WebPAC::Normalize;
2
3			use warnings;
4			use strict;
5	dpavlin	29	use base 'WebPAC::Common';
6	dpavlin	13	use Data::Dumper;
7	dpavlin	10
8			=head1 NAME
9
10	dpavlin	15	WebPAC::Normalize - data mungling for normalisation
11	dpavlin	10
12			=head1 VERSION
13
14	dpavlin	219	Version 0.04
15	dpavlin	10
16			=cut
17
18	dpavlin	219	our $VERSION = '0.04';
19	dpavlin	10
20			=head1 SYNOPSIS
21
22	dpavlin	15	This package contains code that mungle data to produce normalized format.
23	dpavlin	10
24	dpavlin	15	It contains several assumptions:
25
26			=over
27
28			=item *
29
30			format of fields is defined using C<v123^a> notation for repeatable fields
31			or C<s123^a> for single (or first) value, where C<123> is field number and
32			C<a> is subfield.
33
34			=item *
35
36			source data records (C<$rec>) have unique identifiers in field C<000>
37
38			=item *
39
40			optional C<eval{length('v123^a') == 3}> tag at B<beginning of format> will be
41			perl code that is evaluated before producing output (value of field will be
42			interpolated before that)
43
44			=item *
45
46			optional C<filter{filter_name}> at B<begining of format> will apply perl
47			code defined as code ref on format after field substitution to producing
48			output
49
50			=item *
51
52			optional C<lookup{...}> will be then performed. See C<WebPAC::Lookups>.
53
54			=item *
55
56			at end, optional C<format>s rules are resolved. Format rules are similar to
57			C<sprintf> and can also contain C<lookup{...}> which is performed after
58			values are inserted in format.
59
60			=back
61
62			This also describes order in which transformations are applied (eval,
63			filter, lookup, format) which is important to undestand when deciding how to
64			solve your data mungling and normalisation process.
65
66
67
68
69	dpavlin	10	=head1 FUNCTIONS
70
71	dpavlin	13	=head2 new
72	dpavlin	10
73	dpavlin	13	Create new normalisation object
74
75			my $n = new WebPAC::Normalize::Something(
76	dpavlin	15	filter => {
77			'filter_name_1' => sub {
78			# filter code
79			return length($_);
80			}, ...
81			},
82	dpavlin	29	db => $db_obj,
83	dpavlin	13	lookup_regex => $lookup->regex,
84	dpavlin	31	lookup => $lookup_obj,
85	dpavlin	219	prefix => 'foobar',
86	dpavlin	13	);
87
88	dpavlin	15	Parametar C<filter> defines user supplied snippets of perl code which can
89			be use with C<filter{...}> notation.
90
91	dpavlin	219	C<prefix> is used to form filename for database record (to support multiple
92			source files which are joined in one database).
93
94	dpavlin	13	Recommended parametar C<lookup_regex> is used to enable parsing of lookups
95	dpavlin	31	in structures. If you pass this parametar, you must also pass C<lookup>
96			which is C<WebPAC::Lookup> object.
97	dpavlin	13
98	dpavlin	10	=cut
99
100	dpavlin	13	sub new {
101			my $class = shift;
102			my $self = {@_};
103			bless($self, $class);
104
105	dpavlin	31	my $r = $self->{'lookup_regex'} ? 1 : 0;
106			my $l = $self->{'lookup'} ? 1 : 0;
107
108			my $log = $self->_get_logger();
109
110			# those two must be in pair
111			if ( ($r & $l) != ($r \|\| $l) ) {
112			my $log = $self->_get_logger();
113			$log->logdie("lookup_regex and lookup must be in pair");
114			}
115
116			$log->logdie("lookup must be WebPAC::Lookup object") if ($self->{'lookup'} && ! $self->{'lookup'}->isa('WebPAC::Lookup'));
117
118	dpavlin	219	$log->warn("no prefix defined. please check that!") unless ($self->{'prefix'});
119
120	dpavlin	13	$self ? return $self : return undef;
121	dpavlin	10	}
122
123	dpavlin	13
124			=head2 data_structure
125
126			Create in-memory data structure which represents normalized layout from
127			C<conf/normalize/*.xml>.
128
129			This structures are used to produce output.
130
131	dpavlin	70	my $ds = $webpac->data_structure($rec);
132	dpavlin	13
133			=cut
134
135			sub data_structure {
136			my $self = shift;
137
138			my $log = $self->_get_logger();
139
140			my $rec = shift;
141			$log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
142
143	dpavlin	125	$log->debug("data_structure rec = ", sub { Dumper($rec) });
144
145			$log->logdie("need unique ID (mfn) in field 000 of record ", sub { Dumper($rec) } ) unless (defined($rec->{'000'}));
146
147	dpavlin	219	my $id = $rec->{'000'}->[0] \|\| $log->logdie("field 000 isn't array!");
148	dpavlin	125
149	dpavlin	13	my $cache_file;
150
151	dpavlin	18	if ($self->{'db'}) {
152	dpavlin	219	my $ds = $self->{'db'}->load_ds( id => $id, prefix => $self->{prefix} );
153	dpavlin	70	$log->debug("load_ds( rec = ", sub { Dumper($rec) }, ") = ", sub { Dumper($ds) });
154			return $ds if ($ds);
155	dpavlin	29	$log->debug("cache miss, creating");
156	dpavlin	13	}
157
158			undef $self->{'currnet_filename'};
159			undef $self->{'headline'};
160
161			my @sorted_tags;
162			if ($self->{tags_by_order}) {
163			@sorted_tags = @{$self->{tags_by_order}};
164			} else {
165			@sorted_tags = sort { $self->_sort_by_order } keys %{$self->{'import_xml'}->{'indexer'}};
166			$self->{tags_by_order} = \@sorted_tags;
167			}
168
169	dpavlin	70	my $ds;
170	dpavlin	13
171			$log->debug("tags: ",sub { join(", ",@sorted_tags) });
172
173			foreach my $field (@sorted_tags) {
174
175			my $row;
176
177			#print "field $field [",$self->{'tag'},"] = ",Dumper($self->{'import_xml'}->{'indexer'}->{$field}->{$self->{'tag'}});
178
179			foreach my $tag (@{$self->{'import_xml'}->{'indexer'}->{$field}->{$self->{'tag'}}}) {
180	dpavlin	38	my $format;
181	dpavlin	13
182	dpavlin	38	$log->logdie("expected tag HASH and got $tag") unless (ref($tag) eq 'HASH');
183			$format = $tag->{'value'} \|\| $tag->{'content'};
184
185	dpavlin	13	$log->debug("format: $format");
186
187			my @v;
188			if ($self->{'lookup_regex'} && $format =~ $self->{'lookup_regex'}) {
189			@v = $self->fill_in_to_arr($rec,$format);
190			} else {
191			@v = $self->parse_to_arr($rec,$format);
192			}
193			next if (! @v);
194
195			if ($tag->{'sort'}) {
196			@v = $self->sort_arr(@v);
197			}
198
199			# use format?
200			if ($tag->{'format_name'}) {
201			@v = map { $self->apply_format($tag->{'format_name'},$tag->{'format_delimiter'},$_) } @v;
202			}
203
204			# delimiter will join repeatable fields
205			if ($tag->{'delimiter'}) {
206			@v = ( join($tag->{'delimiter'}, @v) );
207			}
208
209			# default types
210	dpavlin	74	my @types = qw(display search);
211	dpavlin	13	# override by type attribute
212			@types = ( $tag->{'type'} ) if ($tag->{'type'});
213
214			foreach my $type (@types) {
215			# append to previous line?
216			$log->debug("type: $type ",sub { join(" ",@v) }, $row->{'append'} \|\| 'no append');
217			if ($tag->{'append'}) {
218
219			# I will delimit appended part with
220			# delimiter (or ,)
221			my $d = $tag->{'delimiter'};
222			# default delimiter
223			$d \|\|= " ";
224
225			my $last = pop @{$row->{$type}};
226			$d = "" if (! $last);
227			$last .= $d . join($d, @v);
228			push @{$row->{$type}}, $last;
229
230			} else {
231			push @{$row->{$type}}, @v;
232			}
233			}
234
235
236			}
237
238			if ($row) {
239			$row->{'tag'} = $field;
240
241			# TODO: name_sigular, name_plural
242			my $name = $self->{'import_xml'}->{'indexer'}->{$field}->{'name'};
243	dpavlin	70	my $row_name = $name ? $self->_x($name) : $field;
244	dpavlin	13
245			# post-sort all values in field
246			if ($self->{'import_xml'}->{'indexer'}->{$field}->{'sort'}) {
247			$log->warn("sort at field tag not implemented");
248			}
249
250	dpavlin	70	$ds->{$row_name} = $row;
251	dpavlin	13
252			$log->debug("row $field: ",sub { Dumper($row) });
253			}
254
255			}
256
257	dpavlin	22	$self->{'db'}->save_ds(
258	dpavlin	219	id => $id,
259	dpavlin	70	ds => $ds,
260	dpavlin	219	prefix => $self->{prefix},
261	dpavlin	18	) if ($self->{'db'});
262	dpavlin	13
263	dpavlin	70	$log->debug("ds: ", sub { Dumper($ds) });
264	dpavlin	29
265	dpavlin	70	$log->logconfess("data structure returned is not array any more!") if wantarray;
266	dpavlin	13
267	dpavlin	70	return $ds;
268
269	dpavlin	13	}
270
271			=head2 parse
272
273			Perform smart parsing of string, skipping delimiters for fields which aren't
274			defined. It can also eval code in format starting with C<eval{...}> and
275			return output or nothing depending on eval code.
276
277			my $text = $webpac->parse($rec,'eval{"v901^a" eq "Deskriptor"}descriptor: v250^a', $i);
278
279			=cut
280
281			sub parse {
282			my $self = shift;
283
284			my ($rec, $format_utf8, $i) = @_;
285
286			return if (! $format_utf8);
287
288			my $log = $self->_get_logger();
289
290			$log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
291
292			$i = 0 if (! $i);
293
294			my $format = $self->_x($format_utf8) \|\| $log->logconfess("can't convert '$format_utf8' from UTF-8 to ",$self->{'code_page'});
295
296			my @out;
297
298			$log->debug("format: $format");
299
300			my $eval_code;
301			# remove eval{...} from beginning
302			$eval_code = $1 if ($format =~ s/^eval{([^}]+)}//s);
303
304			my $filter_name;
305			# remove filter{...} from beginning
306			$filter_name = $1 if ($format =~ s/^filter{([^}]+)}//s);
307
308			my $prefix;
309			my $all_found=0;
310
311			while ($format =~ s/^(.*?)(v\|s)(\d+)(?:\^(\w))?//s) {
312
313			my $del = $1 \|\| '';
314			$prefix \|\|= $del if ($all_found == 0);
315
316			# repeatable index
317			my $r = $i;
318			$r = 0 if (lc("$2") eq 's');
319
320			my $found = 0;
321			my $tmp = $self->get_data(\$rec,$3,$4,$r,\$found);
322
323			if ($found) {
324			push @out, $del;
325			push @out, $tmp;
326			$all_found += $found;
327			}
328			}
329
330			return if (! $all_found);
331
332			my $out = join('',@out);
333
334			if ($out) {
335			# add rest of format (suffix)
336			$out .= $format;
337
338			# add prefix if not there
339			$out = $prefix . $out if ($out !~ m/^\Q$prefix\E/);
340
341			$log->debug("result: $out");
342			}
343
344			if ($eval_code) {
345			my $eval = $self->fill_in($rec,$eval_code,$i) \|\| return;
346			$log->debug("about to eval{$eval} format: $out");
347			return if (! $self->_eval($eval));
348			}
349
350			if ($filter_name && $self->{'filter'}->{$filter_name}) {
351			$log->debug("about to filter{$filter_name} format: $out");
352			$out = $self->{'filter'}->{$filter_name}->($out);
353			return unless(defined($out));
354			$log->debug("filter result: $out");
355			}
356
357			return $out;
358			}
359
360			=head2 parse_to_arr
361
362			Similar to C<parse>, but returns array of all repeatable fields
363
364			my @arr = $webpac->parse_to_arr($rec,'v250^a');
365
366			=cut
367
368			sub parse_to_arr {
369			my $self = shift;
370
371			my ($rec, $format_utf8) = @_;
372
373			my $log = $self->_get_logger();
374
375			$log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
376			return if (! $format_utf8);
377
378			my $i = 0;
379			my @arr;
380
381			while (my $v = $self->parse($rec,$format_utf8,$i++)) {
382			push @arr, $v;
383			}
384
385			$log->debug("format '$format_utf8' returned ",--$i," elements: ", sub { join(" \| ",@arr) }) if (@arr);
386
387			return @arr;
388			}
389
390	dpavlin	15
=head2 fill_in

Workhorse of all: takes record from in-memory structure of database and
string with placeholders and returns string with values from record
substituted for placeholders.

 my $text = $webpac->fill_in($rec,'v250^a');

Optional argument is ordinal number for repeatable fields. By default,
it's assumed to be first repeatable field (fields are perl array, so first
element is 0).
Following example will read second value from repeatable field.

 my $text = $webpac->fill_in($rec,'Title: v250^a',1);

This function B<does not> perform parsing of format to intelligently skip
delimiters before fields which aren't used.

Format which is flagged as UTF-8 is passed through C<_x> before use.

Placeholders C<v123^a> (repeatable, uses ordinal number) and C<s123^a>
(always first value) are replaced in a single pass, so text which comes
from the record is never scanned for placeholders again.

Returns nothing (empty list) if none of the placeholders produced a value,
if C<eval{...}> code evaluates to false or if filter returns undef. If
lookup object is defined, result of its C<lookup> method is returned,
otherwise the filled-in format.

=cut

sub fill_in {
	my $self = shift;

	my $log = $self->_get_logger();

	my $rec = shift || $log->logconfess("need data record");
	my $format = shift || $log->logconfess("need format to parse");
	# iteration (for repeatable fields)
	my $i = shift || 0;

	$log->logdie("infitite loop in format $format") if ($i > ($self->{'max_mfn'} || 9999));

	# FIXME remove for speedup?
	$log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);

	if (utf8::is_utf8($format)) {
		$format = $self->_x($format);
	}

	my $found = 0;

	# remove eval{...} from beginning
	# (literal braces are escaped; unescaped "{" in regex is deprecated
	# or rejected by newer perls)
	my $eval_code;
	$eval_code = $1 if ($format =~ s/^eval\{([^}]+)\}//s);

	# remove filter{...} from beginning
	my $filter_name;
	$filter_name = $1 if ($format =~ s/^filter\{([^}]+)\}//s);

	# do actual replacement of placeholders in one pass: v = repeatable
	# field (index $i), s = non-repeatable field (always index 0).
	# Doing it in two passes would re-scan values inserted by the first
	# pass and mangle data which happens to look like a placeholder.
	$format =~ s{([vs])(\d+)(?:\^(\w))?}{
		$self->get_data(\$rec, $2, $3, ($1 eq 'v' ? $i : 0), \$found)
	}ges;

	# no placeholder produced a value
	return unless ($found);

	$log->debug("format: $format");

	if ($eval_code) {
		my $eval = $self->fill_in($rec,$eval_code,$i);
		return if (! $self->_eval($eval));
	}

	if ($filter_name && $self->{'filter'}->{$filter_name}) {
		$log->debug("filter '$filter_name' for $format");
		$format = $self->{'filter'}->{$filter_name}->($format);
		return unless(defined($format));
		$log->debug("filter result: $format");
	}

	# do we have lookups?
	if ($self->{'lookup'}) {
		if ($self->{'lookup'}->can('lookup')) {
			return $self->{'lookup'}->lookup($format);
		}
		# fall back to plain result instead of leaking return value
		# of the logger call to the caller
		$log->warn("Have lookup object but can't invoke lookup method");
	}

	return $format;
}
475
476
=head2 fill_in_to_arr

Array flavour of C<fill_in>: calls it for every repetition of repeatable
fields and returns all results as array. Usable for fields which have
lookups, so they shouldn't be parsed but rather C<fill_in>ed.

 my @arr = $webpac->fill_in_to_arr($rec,'[v900];;[v250^a]');

Returns nothing if format is empty.

=cut

sub fill_in_to_arr {
	my ($self, $rec, $format_utf8) = @_;

	my $log = $self->_get_logger();

	# record has to stringify as a hash reference
	if ($rec !~ /HASH/o) {
		$log->logconfess("need HASH as first argument!");
	}

	# without a format there is nothing to fill in
	return unless ($format_utf8);

	my @arr;
	my $i = 0;

	# ask fill_in for repetition 0, 1, 2, ... until it returns empty list
	for (;;) {
		my @v = $self->fill_in($rec, $format_utf8, $i++);
		last unless (@v);
		push @arr, @v;
	}

	if (@arr) {
		# $i was advanced once more by the call which ended the loop
		$log->debug("format '$format_utf8' returned ", $i - 1, " elements: ", sub { join(" | ",@arr) });
	}

	return @arr;
}
508
509	dpavlin	15
=head2 get_data

Returns value from record.

 my $text = $self->get_data(\$rec,$f,$sf,$i,\$found);

Arguments are:
reference to record C<$rec> (hash of fields, each field is array of
repetitions),
field C<$f>,
optional subfield C<$sf>,
index for repeatable values C<$i>.

Optional variable C<$found> will be incremented if there is a value
(only if it's reference to defined scalar).

If subfield is not specified and repetition has subfields, values of all
subfields are returned (in order of sorted subfield names), each followed
by a single space.

Returns value or empty string.

=cut

sub get_data {
	my $self = shift;

	my ($rec,$f,$sf,$i,$found) = @_;

	# field has to exist and be array of repetitions
	my $field = $$rec->{$f};
	return '' unless (ref($field) eq 'ARRAY');

	# test for length, not truth, so that value "0" is still data
	my $value = $field->[$i || 0];
	return '' unless (defined($value) && length($value));

	my $out;

	if (defined($sf) && length($sf)) {
		# subfield name can be "0", so test for length and not truth.
		# Repetition without subfields can't have requested one
		# (dereferencing it would be symbolic reference).
		return '' unless (ref($value) eq 'HASH');
		$out = $value->{$sf};
		return '' unless (defined($out) && length($out));
	} elsif (ref($value) eq 'HASH') {
		# it has subfields, just not specified, so we'll dump all.
		# Keys are sorted to make output repeatable between runs.
		$out = join('', map {
			(defined($value->{$_}) ? $value->{$_} : '') . " "
		} sort keys %$value);
	} else {
		$out = $value;
	}

	# $found is optional, don't dereference it if it isn't reference
	$$found++ if (ref($found) && defined($$found));

	return $out;
}
560
561
=head2 apply_format

Apply format specified in tag with C<format_name="name"> and
C<format_delimiter=";;">.

 my $text = $webpac->apply_format($format_name,$format_delimiter,$data);

Data is split on delimiter and resulting values are inserted into format
using C<sprintf>. If delimiter is missing or empty, warning is logged and
whole data is used as single value.

If format with that name isn't defined, warning is logged and data is
returned unchanged.

Formats can contain C<lookup{...}> if you need them: if result matches
C<lookup_regex>, it's passed to C<lookup> method of lookup object.

=cut

sub apply_format {
	my $self = shift;

	my ($name,$delimiter,$data) = @_;

	my $log = $self->_get_logger();

	if (! $self->{'import_xml'}->{'format'}->{$name}) {
		$log->warn("<format name=\"$name\"> is not defined in ",$self->{'import_xml_file'});
		return $data;
	}

	# delimiter "0" is valid, so test for length and not truth
	my $have_delimiter = (defined($delimiter) && length($delimiter));

	$log->warn("no delimiter for format $name") if (! $have_delimiter);

	my $format = $self->_x($self->{'import_xml'}->{'format'}->{$name}->{'content'}) || $log->logdie("can't find format '$name'");

	# split with empty pattern would chop data into single characters,
	# so without delimiter whole data is the only value
	my @data = $have_delimiter ? split(/\Q$delimiter\E/, $data) : ($data);

	my $out = sprintf($format, @data);
	$log->debug("using format $name [$format] on $data to produce: $out");

	if ($self->{'lookup_regex'} && $out =~ $self->{'lookup_regex'}) {
		return $self->{'lookup'}->lookup($out);
	}

	return $out;
}
601
=head2 sort_arr

Sort array ignoring case and html in data

 my @sorted = $webpac->sort_arr(@unsorted);

Values are compared lower-cased and with all html tags removed, and
returned in descending order. Html tags are ignored only for comparison:
returned values (and values in caller's array) are not modified.

=cut

sub sort_arr {
	my $self = shift;

	my $log = $self->_get_logger();

	# Schwartzian transformation: comparison key is computed once for
	# each value, on a copy. Stripping tags from $a and $b inside sort
	# block would modify caller's data (they are aliases) and result
	# would depend on number of comparisons made.
	my @sorted =
		map  { $_->[1] }
		sort { $b->[0] cmp $a->[0] }
		map  {
			my $key = $_;
			$key =~ s#<[^>]+/*>##g;
			[ lc($key), $_ ]
		} @_;

	$log->debug("sorted values: ",sub { join(", ",@sorted) });

	return @sorted;
}
626
627
628	dpavlin	15	=head1 INTERNAL METHODS
629
=head2 _sort_by_order

Comparison function which orders xml tags data structure according to
C<order=""> attribute. It compares tags named by package variables C<$a>
and C<$b>.

For each tag value of C<order> from C<< $self->{import_xml}->{indexer} >>
is used, or entry for that tag itself if C<order> is false. Values are
compared numerically.

=cut

sub _sort_by_order {
	my $self = shift;

	# indexer definitions (created on the fly if they don't exist)
	my $indexer = $self->{'import_xml'}->{'indexer'} ||= {};

	# order="" attribute of each tag, falling back to tag entry itself
	my ($left, $right) = map {
		$indexer->{$_}->{'order'} || $indexer->{$_}
	} ($a, $b);

	return $left <=> $right;
}
646
=head2 _x

Convert strings from C<conf/normalize/*.xml> encoding into application
specific encoding (optionally specified using C<code_page> to C<new>
constructor).

 my $text = $n->_x('normalize text string');

This is a stub which returns its argument unchanged, so that other
modules don't have to implement it.

=cut

sub _x {
	my ($self, $text) = @_;

	# no conversion here
	return $text;
}
663
664
665	dpavlin	10	=head1 AUTHOR
666
667			Dobrica Pavlinusic, C<< <dpavlin@rot13.org> >>
668
669			=head1 COPYRIGHT & LICENSE
670
671			Copyright 2005 Dobrica Pavlinusic, All Rights Reserved.
672
673			This program is free software; you can redistribute it and/or modify it
674			under the same terms as Perl itself.
675
676			=cut
677
678	dpavlin	209	1; # End of WebPAC::Normalize