lib/WebPAC/Normalize.pm

package WebPAC::Normalize;

use warnings;
use strict;
use base 'WebPAC::Common';
use Data::Dumper;

=head1 NAME

WebPAC::Normalize - data munging for normalisation

=head1 VERSION

Version 0.05

=cut

our $VERSION = '0.05';

=head1 SYNOPSIS

This package contains code that munges data to produce a normalized format.

It contains several assumptions:

=over

=item *

format of fields is defined using C<v123^a> notation for repeatable fields
or C<s123^a> for single (or first) value, where C<123> is field number and
C<a> is subfield.

=item *

source data records (C<$rec>) have unique identifiers in field C<000>

=item *

optional C<eval{length('v123^a') == 3}> tag at B<beginning of format> will be
perl code that is evaluated before producing output (value of field will be
interpolated before that)

=item *

optional C<filter{filter_name}> at B<beginning of format> will apply perl
code defined as a code ref to the format after field substitution, to produce
the output

=item *

optional C<lookup{...}> will then be performed. See C<WebPAC::Lookup>.

=item *

at end, optional C<format>s rules are resolved. Format rules are similar to
C<sprintf> and can also contain C<lookup{...}> which is performed after
values are inserted in format.

=back

This also describes the order in which transformations are applied (eval,
filter, lookup, format), which is important to understand when deciding how to
solve your data munging and normalisation process.


=head1 FUNCTIONS

=head2 new

Create new normalisation object

  my $n = new WebPAC::Normalize::Something(
        filter => {
                'filter_name_1' => sub {
                        # filter code
                        return length($_);
                }, ...
        },
        db => $db_obj,
        lookup_regex => $lookup->regex,
        lookup => $lookup_obj,
        prefix => 'foobar',
  );

Parameter C<filter> defines user supplied snippets of perl code which can
be used with C<filter{...}> notation.

C<prefix> is used to form filename for database record (to support multiple
source files which are joined in one database).

Recommended parameter C<lookup_regex> is used to enable parsing of lookups
in structures. If you pass this parameter, you must also pass C<lookup>
which is a C<WebPAC::Lookup> object.

=cut

# Constructor: blesses the named arguments into an object and validates
# the lookup related options.
#
# Dies (through the logger) when only one of lookup_regex / lookup is
# given, or when lookup is not a WebPAC::Lookup object. Warns when no
# prefix is defined. Returns the new object.
sub new {
        my $class = shift;
        my $self = {@_};
        bless($self, $class);

        my $log = $self->_get_logger();

        # normalise both options to 0/1 so they can be compared directly
        my $has_regex  = $self->{'lookup_regex'} ? 1 : 0;
        my $has_lookup = $self->{'lookup'}       ? 1 : 0;

        # those two must be in pair
        $log->logdie("lookup_regex and lookup must be in pair") if ($has_regex != $has_lookup);

        if ($has_lookup) {
                # calling ->isa on an unblessed reference is a fatal perl
                # error, so trap it and report the intended message instead
                my $is_lookup = do {
                        local $@;
                        eval { $self->{'lookup'}->isa('WebPAC::Lookup') };
                };
                $log->logdie("lookup must be WebPAC::Lookup object") unless ($is_lookup);
        }

        $log->warn("no prefix defined. please check that!") unless ($self->{'prefix'});

        $log->debug("using lookup regex: ", $self->{lookup_regex}) if ($has_regex && $has_lookup);

        return $self;
}


=head2 data_structure

Create in-memory data structure which represents normalized layout from
C<conf/normalize/*.xml>.

These structures are used to produce output.

 my $ds = $webpac->data_structure($rec);

=cut

# Build the normalized data structure for one record.
#
# Argument: $rec - hash ref of the source record; field 000 must hold the
# unique id as the first element of an array.
#
# Returns a hash ref keyed by row name (the translated 'name' of the
# indexer entry, or the tag itself). Each row holds one array per output
# type (by default 'display' and 'search') plus the originating 'tag'.
# When a db object is configured the result is loaded from / saved to it.
# Must be called in scalar context.
sub data_structure {
        my $self = shift;

        my $log = $self->_get_logger();

        my $rec = shift;
        $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);

        $log->debug("data_structure rec = ", sub { Dumper($rec) });

        $log->logdie("need unique ID (mfn) in field 000 of record ", sub { Dumper($rec) } ) unless (defined($rec->{'000'}));

        my $id = $rec->{'000'}->[0] || $log->logdie("field 000 isn't array!");

        # a copy already stored in the database wins over re-parsing
        if ($self->{'db'}) {
                my $ds = $self->{'db'}->load_ds( id => $id, prefix => $self->{prefix} );
                $log->debug("load_ds( rec = ", sub { Dumper($rec) }, ") = ", sub { Dumper($ds) });
                return $ds if ($ds);
                $log->debug("cache miss, creating");
        }

        # NOTE(review): 'currnet_filename' looks like a typo of
        # 'current_filename'; key kept unchanged because its readers are
        # not visible from this file - confirm before renaming
        undef $self->{'currnet_filename'};
        undef $self->{'headline'};

        # tag order is computed once and reused for every following record
        my @sorted_tags;
        if ($self->{tags_by_order}) {
                @sorted_tags = @{$self->{tags_by_order}};
        } else {
                @sorted_tags = sort { $self->_sort_by_order } keys %{$self->{'import_xml'}->{'indexer'}};
                $self->{tags_by_order} = \@sorted_tags;
        }

        my $ds;

        $log->debug("tags: ",sub { join(", ",@sorted_tags) });

        foreach my $field (@sorted_tags) {

                my $row;

                foreach my $tag (@{$self->{'import_xml'}->{'indexer'}->{$field}->{$self->{'tag'}}}) {

                        $log->logdie("expected tag HASH and got $tag") unless (ref($tag) eq 'HASH');
                        my $format = $tag->{'value'} || $tag->{'content'};

                        $log->debug("format: $format");

                        # formats with lookups must be filled in, others are parsed
                        my @v;
                        if ($self->{'lookup_regex'} && $format =~ $self->{'lookup_regex'}) {
                                @v = $self->fill_in_to_arr($rec,$format);
                        } else {
                                @v = $self->parse_to_arr($rec,$format);
                        }
                        next if (! @v);

                        if ($tag->{'sort'}) {
                                @v = $self->sort_arr(@v);
                        }

                        # use format?
                        if ($tag->{'format_name'}) {
                                @v = map { $self->apply_format($tag->{'format_name'},$tag->{'format_delimiter'},$_) } @v;
                        }

                        # delimiter will join repeatable fields
                        if ($tag->{'delimiter'}) {
                                @v = ( join($tag->{'delimiter'}, @v) );
                        }

                        # default types
                        my @types = qw(display search);
                        # override by type attribute
                        @types = ( $tag->{'type'} ) if ($tag->{'type'});

                        foreach my $type (@types) {
                                # append to previous line?
                                $log->debug("type: $type ",sub { join(" ",@v) }, " ", $tag->{'append'} || 'no append');
                                if ($tag->{'append'}) {

                                        # appended part is separated by the
                                        # tag delimiter (or a single space)
                                        my $d = $tag->{'delimiter'} || " ";
                                        my $addition = join($d, @v);

                                        # only the separator in front of the
                                        # addition is dropped when there is no
                                        # previous line; the values themselves
                                        # stay delimited
                                        my $last = pop @{$row->{$type}};
                                        $last = (defined($last) && length($last))
                                                ? $last . $d . $addition
                                                : $addition;
                                        push @{$row->{$type}}, $last;

                                } else {
                                        push @{$row->{$type}}, @v;
                                }
                        }

                }

                if ($row) {
                        $row->{'tag'} = $field;

                        # TODO: name_sigular, name_plural
                        my $name = $self->{'import_xml'}->{'indexer'}->{$field}->{'name'};
                        my $row_name = $name ? $self->_x($name) : $field;

                        # post-sort all values in field
                        if ($self->{'import_xml'}->{'indexer'}->{$field}->{'sort'}) {
                                $log->warn("sort at field tag not implemented");
                        }

                        $ds->{$row_name} = $row;

                        $log->debug("row $field: ",sub { Dumper($row) });
                }

        }

        $self->{'db'}->save_ds(
                id => $id,
                ds => $ds,
                prefix => $self->{prefix},
        ) if ($self->{'db'});

        $log->debug("ds: ", sub { Dumper($ds) });

        $log->logconfess("data structure returned is not array any more!") if wantarray;

        return $ds;

}

=head2 parse

Perform smart parsing of string, skipping delimiters for fields which aren't
defined. It can also eval code in format starting with C<eval{...}> and
return output or nothing depending on eval code.

 my $text = $webpac->parse($rec,'eval{"v901^a" eq "Deskriptor"}descriptor: v250^a', $i);

=cut

# Expand one occurrence ($i) of a format against a record, leaving out
# the delimiters in front of fields which have no value.
#
# Arguments: $rec (hash ref), $format_utf8 (format string), $i (index of
# the repeatable occurrence, defaults to 0).
#
# Returns the resulting string, or nothing when the format is empty, no
# field was found, the eval{...} code evaluates to false or the
# filter{...} returns undef.
sub parse {
        my $self = shift;

        my ($rec, $format_utf8, $i) = @_;

        return if (! $format_utf8);

        my $log = $self->_get_logger();

        $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);

        $i = 0 if (! $i);

        my $format = $self->_x($format_utf8) || $log->logconfess("can't convert '$format_utf8' from UTF-8 to ",$self->{'code_page'});

        my @out;

        $log->debug("format: $format");

        # literal braces are escaped: an unescaped "{" after a literal
        # character is deprecated in perl regular expressions

        my $eval_code;
        # remove eval{...} from beginning
        $eval_code = $1 if ($format =~ s/^eval\{([^}]+)\}//s);

        my $filter_name;
        # remove filter{...} from beginning
        $filter_name = $1 if ($format =~ s/^filter\{([^}]+)\}//s);

        my $prefix;
        my $all_found=0;

        while ($format =~ s/^(.*?)(v|s)(\d+)(?:\^(\w))?//s) {

                # copy captures at once; $del is kept even when it is "0"
                my ($del, $kind, $field, $subfield) = ($1, $2, $3, $4);

                $prefix ||= $del if ($all_found == 0);

                # repeatable index; s-fields always use first occurrence
                my $r = ($kind eq 's') ? 0 : $i;

                my $found = 0;
                my $tmp = $self->get_data(\$rec,$field,$subfield,$r,\$found);

                if ($found) {
                        push @out, $del;
                        push @out, $tmp;
                        $all_found += $found;
                }
        }

        return if (! $all_found);

        my $out = join('',@out);

        if ($out) {
                # add rest of format (suffix)
                $out .= $format;

                # add prefix if not there
                $out = $prefix . $out if ($out !~ m/^\Q$prefix\E/);

                $log->debug("result: $out");
        }

        if ($eval_code) {
                # eval code has its placeholders filled in before it runs
                my $eval = $self->fill_in($rec,$eval_code,$i) || return;
                $log->debug("about to eval{$eval} format: $out");
                return if (! $self->_eval($eval));
        }

        if ($filter_name && $self->{'filter'}->{$filter_name}) {
                $log->debug("about to filter{$filter_name} format: $out");
                $out = $self->{'filter'}->{$filter_name}->($out);
                return unless(defined($out));
                $log->debug("filter result: $out");
        }

        return $out;
}

=head2 parse_to_arr

Similar to C<parse>, but returns array of all repeatable fields

 my @arr = $webpac->parse_to_arr($rec,'v250^a');

=cut

# Collect the results of parse() for every occurrence of a repeatable
# field: occurrences 0, 1, 2, ... are parsed until one gives a false value.
#
# Arguments: $rec (hash ref), $format_utf8 (format string).
# Returns list of parsed values (nothing if format is empty).
sub parse_to_arr {
        my ($self, $rec, $format_utf8) = @_;

        my $log = $self->_get_logger();

        $log->logconfess("need HASH as first argument!") unless ($rec =~ /HASH/o);
        return unless ($format_utf8);

        my @arr;
        my $i = 0;

        while (1) {
                my $v = $self->parse($rec, $format_utf8, $i++);
                last unless ($v);
                push @arr, $v;
        }

        # $i was advanced once more by the call which ended the loop
        $log->debug("format '$format_utf8' returned ", $i - 1, " elements: ", sub { join(" | ",@arr) }) if (@arr);

        return @arr;
}


=head2 fill_in

Workhorse of all: takes a record from the in-memory structure of the database
and a string with placeholders, and returns a string or an array with
substituted values from the record.

 my $text = $webpac->fill_in($rec,'v250^a');

Optional argument is the ordinal number for repeatable fields. By default,
it's assumed to be the first repeatable field (fields are perl arrays, so the
first element is 0).
The following example will read the second value from a repeatable field.

 my $text = $webpac->fill_in($rec,'Title: v250^a',1);

This function B<does not> perform parsing of the format to intelligently skip
delimiters before fields which aren't used.

This method will automatically decode UTF-8 string to local code page
if needed.

=cut

# Substitute vNNN^x / sNNN^x placeholders in a format with values from a
# record; delimiters are never skipped (see parse for that).
#
# Arguments: $rec (hash ref), $format (string with placeholders), $i
# (index of the repeatable occurrence, defaults to 0).
#
# Returns nothing when no placeholder produced a value, when eval{...}
# evaluates to false or when filter{...} returns undef. With a lookup
# object the lookup results are returned (all of them in list context,
# the first one in scalar context); otherwise the filled in format.
sub fill_in {
        my $self = shift;

        my $log = $self->_get_logger();

        my $rec = shift || $log->logconfess("need data record");
        my $format = shift || $log->logconfess("need format to parse");
        # iteration (for repeatable fields)
        my $i = shift || 0;

        $log->logdie("infitite loop in format $format") if ($i > ($self->{'max_mfn'} || 9999));

        # FIXME remove for speedup?
        $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);

        if (utf8::is_utf8($format)) {
                $format = $self->_x($format);
        }

        my $found = 0;

        # literal braces are escaped: an unescaped "{" after a literal
        # character is deprecated in perl regular expressions

        my $eval_code;
        # remove eval{...} from beginning
        $eval_code = $1 if ($format =~ s/^eval\{([^}]+)\}//s);

        my $filter_name;
        # remove filter{...} from beginning
        $filter_name = $1 if ($format =~ s/^filter\{([^}]+)\}//s);

        # do actual replacement of placeholders in a single pass, so that
        # text which came from the record is never treated as a placeholder;
        # v-fields use occurrence $i, s-fields always the first one
        $format =~ s/(v|s)(\d+)(?:\^(\w))?/$self->get_data(\$rec,$2,$3,($1 eq 's' ? 0 : $i),\$found)/ges;

        return unless ($found);

        $log->debug("format: $format");

        if ($eval_code) {
                # NOTE(review): eval code comes from the format string and is
                # handed to _eval (defined outside of this file) - formats
                # have to come from a trusted source
                my $eval = $self->fill_in($rec,$eval_code,$i);
                return if (! $self->_eval($eval));
        }

        if ($filter_name && $self->{'filter'}->{$filter_name}) {
                $log->debug("filter '$filter_name' for $format");
                $format = $self->{'filter'}->{$filter_name}->($format);
                return unless(defined($format));
                $log->debug("filter result: $format");
        }

        # do we have lookups?
        if ($self->{'lookup'}) {
                if ($self->{'lookup'}->can('lookup')) {
                        my @lookup = $self->{lookup}->lookup($format);
                        $log->debug("lookup $format", join(", ", @lookup));
                        # scalar callers want a value, not number of results
                        return wantarray ? @lookup : $lookup[0];
                }
                $log->warn("Have lookup object but can't invoke lookup method");
        }

        return $format;
}


=head2 fill_in_to_arr

Similar to C<fill_in>, but returns an array of all repeatable fields. Usable
for fields which have lookups, so they shouldn't be parsed but rather
C<fill_in>ed.

 my @arr = $webpac->fill_in_to_arr($rec,'[v900];;[v250^a]');

=cut

# Collect the results of fill_in() for every occurrence of a repeatable
# field: occurrences 0, 1, 2, ... are filled in until one gives an empty
# list.
#
# Arguments: $rec (hash ref), $format_utf8 (format string).
# Returns list of all values (nothing if format is empty).
sub fill_in_to_arr {
        my ($self, $rec, $format_utf8) = @_;

        my $log = $self->_get_logger();

        $log->logconfess("need HASH as first argument!") unless ($rec =~ /HASH/o);
        return unless ($format_utf8);

        my @arr;
        my $i = 0;

        while (1) {
                my @v = $self->fill_in($rec, $format_utf8, $i++);
                last unless (@v);
                push @arr, @v;
        }

        # $i was advanced once more by the call which ended the loop
        $log->debug("format '$format_utf8' returned ", $i - 1, " elements: ", sub { join(" | ",@arr) }) if (@arr);

        return @arr;
}


=head2 get_data

Returns value from record.

 my $text = $self->get_data(\$rec,$f,$sf,$i,\$found);

Arguments are:
record reference C<$rec>,
field C<$f>,
optional subfield C<$sf>,
index for repeatable values C<$i>.

Optional variable C<$found> will be incremented if there
is a field.

Returns value or empty string.

=cut

# Fetch one value from a record.
#
# Arguments: $rec (reference to the record hash ref), $f (field), $sf
# (optional subfield), $i (index of repeatable occurrence), $found
# (optional reference to a counter, incremented when a value is found).
#
# Returns the subfield value, or - without a subfield - the plain value
# or all subfields (ordered by subfield name, each followed by a space).
# Returns empty string when there is nothing to return; false values
# ("", 0) count as missing.
sub get_data {
        my $self = shift;

        my ($rec,$f,$sf,$i,$found) = @_;

        return '' unless ($$rec->{$f});

        my $val = $$rec->{$f}->[$i];
        return '' unless ($val);

        # real reference check instead of matching the stringified value,
        # so no symbolic references are needed for plain string values
        my $is_hash = ref($val) && UNIVERSAL::isa($val, 'HASH');

        # counter is optional and only touched when it is already defined
        my $count_hit = sub {
                $$found++ if (ref($found) && defined($$found));
        };

        # "0" is a valid subfield name, so test for presence and not truth
        if (defined($sf) && length($sf)) {
                return '' unless ($is_hash && $val->{$sf});
                $count_hit->();
                return $val->{$sf};
        }

        $count_hit->();

        return $val unless ($is_hash);

        # it still might have subfields, just not specified, so we'll dump
        # all of them; keys are sorted to make the output reproducible
        return join('', map { $val->{$_} . " " } sort keys %{$val});
}


=head2 apply_format

Apply format specified in tag with C<format_name="name"> and
C<format_delimiter=";;">.

 my $text = $webpac->apply_format($format_name,$format_delimiter,$data);

Formats can contain C<lookup{...}> if you need them.

=cut

# Apply a named sprintf-like format to a value.
#
# Arguments: $name (name of format in import_xml), $delimiter (string
# which separates the sprintf arguments inside $data), $data (value).
#
# Returns $data unchanged (with a warning) when the format is not
# defined. Otherwise returns the formatted string, or the result of the
# lookup when the formatted string matches lookup_regex.
sub apply_format {
        my $self = shift;

        my ($name,$delimiter,$data) = @_;

        my $log = $self->_get_logger();

        if (! $self->{'import_xml'}->{'format'}->{$name}) {
                $log->warn("<format name=\"$name\"> is not defined in ",$self->{'import_xml_file'});
                return $data;
        }

        $log->warn("no delimiter for format $name") if (! $delimiter);

        my $format = $self->_x($self->{'import_xml'}->{'format'}->{$name}->{'content'}) || $log->logdie("can't find format '$name'");

        # splitting on an empty pattern would break data into single
        # characters, so without a delimiter data is one argument
        my @data = (defined($delimiter) && length($delimiter))
                ? split(/\Q$delimiter\E/, $data)
                : ($data);

        my $out = sprintf($format, @data);
        $log->debug("using format $name [$format] on $data to produce: $out");

        if ($self->{'lookup_regex'} && $out =~ $self->{'lookup_regex'}) {
                return $self->{'lookup'}->lookup($out);
        }

        return $out;
}

=head2 sort_arr

Sort array ignoring case and html in data

 my @sorted = $webpac->sort_arr(@unsorted);

=cut

# Sort values ignoring case and html tags.
#
# Takes list of values and returns them ordered by their lower-cased,
# tag-free form. Values themselves are returned unmodified and the
# arguments are left untouched.
sub sort_arr {
        my $self = shift;

        my $log = $self->_get_logger();

        # sort keys are computed once per value; changing $a / $b inside
        # of the sort block would modify the caller's data
        my @keyed;
        foreach my $value (@_) {
                my $key = $value;
                $key =~ s#<[^>]+/*>##g;
                push @keyed, [ lc($key), $value ];
        }

        # NOTE(review): comparison is $b against $a, so the order is
        # descending; kept as it was - confirm this is intended
        my @sorted =
                map  { $_->[1] }
                sort { $b->[0] cmp $a->[0] }
                @keyed;
        $log->debug("sorted values: ",sub { join(", ",@sorted) });

        return @sorted;
}


=head1 INTERNAL METHODS

=head2 _sort_by_order

Sort xml tags data structure according to C<order=""> attribute.

=cut

# Comparator for sort: compares the indexer entries named by the
# package variables $a and $b numerically by their 'order' attribute.
# When 'order' is false the indexer entry itself is used as the value.
sub _sort_by_order {
        my ($self) = @_;

        my $order_of = sub {
                my $tag = shift;
                return $self->{'import_xml'}->{'indexer'}->{$tag}->{'order'}
                        || $self->{'import_xml'}->{'indexer'}->{$tag};
        };

        my $left = $order_of->($a);
        my $right = $order_of->($b);

        return $left <=> $right;
}

=head2 _x

Convert strings from C<conf/normalize/*.xml> encoding into application
specific encoding (optionally specified using C<code_page> to C<new>
constructor).

 my $text = $n->_x('normalize text string');

This is a stub so that other modules don't have to implement it.

=cut

# Stub encoding conversion: gives back its argument unchanged.
sub _x {
        my ($self, $text) = @_;
        return $text;
}


=head1 AUTHOR

Dobrica Pavlinusic, C<< <dpavlin@rot13.org> >>

=head1 COPYRIGHT & LICENSE

Copyright 2005 Dobrica Pavlinusic, All Rights Reserved.

This program is free software; you can redistribute it and/or modify it
under the same terms as Perl itself.

=cut

1; # End of WebPAC::Normalize
1	dpavlin	10	package WebPAC::Normalize;
2
3			use warnings;
4			use strict;
5	dpavlin	29	use base 'WebPAC::Common';
6	dpavlin	13	use Data::Dumper;
7	dpavlin	10
8			=head1 NAME
9
10	dpavlin	15	WebPAC::Normalize - data mungling for normalisation
11	dpavlin	10
12			=head1 VERSION
13
14	dpavlin	253	Version 0.05
15	dpavlin	10
16			=cut
17
18	dpavlin	253	our $VERSION = '0.05';
19	dpavlin	10
20			=head1 SYNOPSIS
21
22	dpavlin	15	This package contains code that mungle data to produce normalized format.
23	dpavlin	10
24	dpavlin	15	It contains several assumptions:
25
26			=over
27
28			=item *
29
30			format of fields is defined using C<v123^a> notation for repeatable fields
31			or C<s123^a> for single (or first) value, where C<123> is field number and
32			C<a> is subfield.
33
34			=item *
35
36			source data records (C<$rec>) have unique identifiers in field C<000>
37
38			=item *
39
40			optional C<eval{length('v123^a') == 3}> tag at B<beginning of format> will be
41			perl code that is evaluated before producing output (value of field will be
42			interpolated before that)
43
44			=item *
45
46			optional C<filter{filter_name}> at B<begining of format> will apply perl
47			code defined as code ref on format after field substitution to producing
48			output
49
50			=item *
51
52			optional C<lookup{...}> will be then performed. See C<WebPAC::Lookups>.
53
54			=item *
55
56			at end, optional C<format>s rules are resolved. Format rules are similar to
57			C<sprintf> and can also contain C<lookup{...}> which is performed after
58			values are inserted in format.
59
60			=back
61
62			This also describes order in which transformations are applied (eval,
63			filter, lookup, format) which is important to undestand when deciding how to
64			solve your data mungling and normalisation process.
65
66
67
68
69	dpavlin	10	=head1 FUNCTIONS
70
71	dpavlin	13	=head2 new
72	dpavlin	10
73	dpavlin	13	Create new normalisation object
74
75			my $n = new WebPAC::Normalize::Something(
76	dpavlin	15	filter => {
77			'filter_name_1' => sub {
78			# filter code
79			return length($_);
80			}, ...
81			},
82	dpavlin	29	db => $db_obj,
83	dpavlin	13	lookup_regex => $lookup->regex,
84	dpavlin	31	lookup => $lookup_obj,
85	dpavlin	219	prefix => 'foobar',
86	dpavlin	13	);
87
88	dpavlin	15	Parametar C<filter> defines user supplied snippets of perl code which can
89			be use with C<filter{...}> notation.
90
91	dpavlin	219	C<prefix> is used to form filename for database record (to support multiple
92			source files which are joined in one database).
93
94	dpavlin	13	Recommended parametar C<lookup_regex> is used to enable parsing of lookups
95	dpavlin	31	in structures. If you pass this parametar, you must also pass C<lookup>
96			which is C<WebPAC::Lookup> object.
97	dpavlin	13
98	dpavlin	10	=cut
99
100	dpavlin	13	sub new {
101			my $class = shift;
102			my $self = {@_};
103			bless($self, $class);
104
105	dpavlin	31	my $r = $self->{'lookup_regex'} ? 1 : 0;
106			my $l = $self->{'lookup'} ? 1 : 0;
107
108			my $log = $self->_get_logger();
109
110			# those two must be in pair
111			if ( ($r & $l) != ($r \|\| $l) ) {
112			my $log = $self->_get_logger();
113			$log->logdie("lookup_regex and lookup must be in pair");
114			}
115
116			$log->logdie("lookup must be WebPAC::Lookup object") if ($self->{'lookup'} && ! $self->{'lookup'}->isa('WebPAC::Lookup'));
117
118	dpavlin	219	$log->warn("no prefix defined. please check that!") unless ($self->{'prefix'});
119
120	dpavlin	252	$log->debug("using lookup regex: ", $self->{lookup_regex}) if ($r && $l);
121
122	dpavlin	13	$self ? return $self : return undef;
123	dpavlin	10	}
124
125	dpavlin	13
126			=head2 data_structure
127
128			Create in-memory data structure which represents normalized layout from
129			C<conf/normalize/*.xml>.
130
131			This structures are used to produce output.
132
133	dpavlin	70	my $ds = $webpac->data_structure($rec);
134	dpavlin	13
135			=cut
136
137			sub data_structure {
138			my $self = shift;
139
140			my $log = $self->_get_logger();
141
142			my $rec = shift;
143			$log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
144
145	dpavlin	125	$log->debug("data_structure rec = ", sub { Dumper($rec) });
146
147			$log->logdie("need unique ID (mfn) in field 000 of record ", sub { Dumper($rec) } ) unless (defined($rec->{'000'}));
148
149	dpavlin	219	my $id = $rec->{'000'}->[0] \|\| $log->logdie("field 000 isn't array!");
150	dpavlin	125
151	dpavlin	13	my $cache_file;
152
153	dpavlin	18	if ($self->{'db'}) {
154	dpavlin	219	my $ds = $self->{'db'}->load_ds( id => $id, prefix => $self->{prefix} );
155	dpavlin	70	$log->debug("load_ds( rec = ", sub { Dumper($rec) }, ") = ", sub { Dumper($ds) });
156			return $ds if ($ds);
157	dpavlin	29	$log->debug("cache miss, creating");
158	dpavlin	13	}
159
160			undef $self->{'currnet_filename'};
161			undef $self->{'headline'};
162
163			my @sorted_tags;
164			if ($self->{tags_by_order}) {
165			@sorted_tags = @{$self->{tags_by_order}};
166			} else {
167			@sorted_tags = sort { $self->_sort_by_order } keys %{$self->{'import_xml'}->{'indexer'}};
168			$self->{tags_by_order} = \@sorted_tags;
169			}
170
171	dpavlin	70	my $ds;
172	dpavlin	13
173			$log->debug("tags: ",sub { join(", ",@sorted_tags) });
174
175			foreach my $field (@sorted_tags) {
176
177			my $row;
178
179			#print "field $field [",$self->{'tag'},"] = ",Dumper($self->{'import_xml'}->{'indexer'}->{$field}->{$self->{'tag'}});
180
181			foreach my $tag (@{$self->{'import_xml'}->{'indexer'}->{$field}->{$self->{'tag'}}}) {
182	dpavlin	38	my $format;
183	dpavlin	13
184	dpavlin	38	$log->logdie("expected tag HASH and got $tag") unless (ref($tag) eq 'HASH');
185			$format = $tag->{'value'} \|\| $tag->{'content'};
186
187	dpavlin	13	$log->debug("format: $format");
188
189			my @v;
190			if ($self->{'lookup_regex'} && $format =~ $self->{'lookup_regex'}) {
191			@v = $self->fill_in_to_arr($rec,$format);
192			} else {
193			@v = $self->parse_to_arr($rec,$format);
194			}
195			next if (! @v);
196
197			if ($tag->{'sort'}) {
198			@v = $self->sort_arr(@v);
199			}
200
201			# use format?
202			if ($tag->{'format_name'}) {
203			@v = map { $self->apply_format($tag->{'format_name'},$tag->{'format_delimiter'},$_) } @v;
204			}
205
206			# delimiter will join repeatable fields
207			if ($tag->{'delimiter'}) {
208			@v = ( join($tag->{'delimiter'}, @v) );
209			}
210
211			# default types
212	dpavlin	74	my @types = qw(display search);
213	dpavlin	13	# override by type attribute
214			@types = ( $tag->{'type'} ) if ($tag->{'type'});
215
216			foreach my $type (@types) {
217			# append to previous line?
218	dpavlin	252	$log->debug("type: $type ",sub { join(" ",@v) }, " ", $row->{'append'} \|\| 'no append');
219	dpavlin	13	if ($tag->{'append'}) {
220
221			# I will delimit appended part with
222			# delimiter (or ,)
223			my $d = $tag->{'delimiter'};
224			# default delimiter
225			$d \|\|= " ";
226
227			my $last = pop @{$row->{$type}};
228			$d = "" if (! $last);
229			$last .= $d . join($d, @v);
230			push @{$row->{$type}}, $last;
231
232			} else {
233			push @{$row->{$type}}, @v;
234			}
235			}
236
237
238			}
239
240			if ($row) {
241			$row->{'tag'} = $field;
242
243			# TODO: name_sigular, name_plural
244			my $name = $self->{'import_xml'}->{'indexer'}->{$field}->{'name'};
245	dpavlin	70	my $row_name = $name ? $self->_x($name) : $field;
246	dpavlin	13
247			# post-sort all values in field
248			if ($self->{'import_xml'}->{'indexer'}->{$field}->{'sort'}) {
249			$log->warn("sort at field tag not implemented");
250			}
251
252	dpavlin	70	$ds->{$row_name} = $row;
253	dpavlin	13
254			$log->debug("row $field: ",sub { Dumper($row) });
255			}
256
257			}
258
259	dpavlin	22	$self->{'db'}->save_ds(
260	dpavlin	219	id => $id,
261	dpavlin	70	ds => $ds,
262	dpavlin	219	prefix => $self->{prefix},
263	dpavlin	18	) if ($self->{'db'});
264	dpavlin	13
265	dpavlin	70	$log->debug("ds: ", sub { Dumper($ds) });
266	dpavlin	29
267	dpavlin	70	$log->logconfess("data structure returned is not array any more!") if wantarray;
268	dpavlin	13
269	dpavlin	70	return $ds;
270
271	dpavlin	13	}
272
273			=head2 parse
274
275			Perform smart parsing of string, skipping delimiters for fields which aren't
276			defined. It can also eval code in format starting with C<eval{...}> and
277			return output or nothing depending on eval code.
278
279			my $text = $webpac->parse($rec,'eval{"v901^a" eq "Deskriptor"}descriptor: v250^a', $i);
280
281			=cut
282
283			sub parse {
284			my $self = shift;
285
286			my ($rec, $format_utf8, $i) = @_;
287
288			return if (! $format_utf8);
289
290			my $log = $self->_get_logger();
291
292			$log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
293
294			$i = 0 if (! $i);
295
296			my $format = $self->_x($format_utf8) \|\| $log->logconfess("can't convert '$format_utf8' from UTF-8 to ",$self->{'code_page'});
297
298			my @out;
299
300			$log->debug("format: $format");
301
302			my $eval_code;
303			# remove eval{...} from beginning
304			$eval_code = $1 if ($format =~ s/^eval{([^}]+)}//s);
305
306			my $filter_name;
307			# remove filter{...} from beginning
308			$filter_name = $1 if ($format =~ s/^filter{([^}]+)}//s);
309
310			my $prefix;
311			my $all_found=0;
312
313			while ($format =~ s/^(.*?)(v\|s)(\d+)(?:\^(\w))?//s) {
314
315			my $del = $1 \|\| '';
316			$prefix \|\|= $del if ($all_found == 0);
317
318			# repeatable index
319			my $r = $i;
320			$r = 0 if (lc("$2") eq 's');
321
322			my $found = 0;
323			my $tmp = $self->get_data(\$rec,$3,$4,$r,\$found);
324
325			if ($found) {
326			push @out, $del;
327			push @out, $tmp;
328			$all_found += $found;
329			}
330			}
331
332			return if (! $all_found);
333
334			my $out = join('',@out);
335
336			if ($out) {
337			# add rest of format (suffix)
338			$out .= $format;
339
340			# add prefix if not there
341			$out = $prefix . $out if ($out !~ m/^\Q$prefix\E/);
342
343			$log->debug("result: $out");
344			}
345
346			if ($eval_code) {
347			my $eval = $self->fill_in($rec,$eval_code,$i) \|\| return;
348			$log->debug("about to eval{$eval} format: $out");
349			return if (! $self->_eval($eval));
350			}
351
352			if ($filter_name && $self->{'filter'}->{$filter_name}) {
353			$log->debug("about to filter{$filter_name} format: $out");
354			$out = $self->{'filter'}->{$filter_name}->($out);
355			return unless(defined($out));
356			$log->debug("filter result: $out");
357			}
358
359			return $out;
360			}
361
362			=head2 parse_to_arr
363
364			Similar to C<parse>, but returns array of all repeatable fields
365
366			my @arr = $webpac->parse_to_arr($rec,'v250^a');
367
368			=cut
369
370			sub parse_to_arr {
371			my $self = shift;
372
373			my ($rec, $format_utf8) = @_;
374
375			my $log = $self->_get_logger();
376
377			$log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
378			return if (! $format_utf8);
379
380			my $i = 0;
381			my @arr;
382
383			while (my $v = $self->parse($rec,$format_utf8,$i++)) {
384			push @arr, $v;
385			}
386
387			$log->debug("format '$format_utf8' returned ",--$i," elements: ", sub { join(" \| ",@arr) }) if (@arr);
388
389			return @arr;
390			}
391
392	dpavlin	15
393			=head2 fill_in
394
395			Workhourse of all: takes record from in-memory structure of database and
396			strings with placeholders and returns string or array of with substituted
397			values from record.
398
399			my $text = $webpac->fill_in($rec,'v250^a');
400
401			Optional argument is ordinal number for repeatable fields. By default,
402			it's assume to be first repeatable field (fields are perl array, so first
403			element is 0).
404			Following example will read second value from repeatable field.
405
406			my $text = $webpac->fill_in($rec,'Title: v250^a',1);
407
408			This function B<does not> perform parsing of format to inteligenty skip
409			delimiters before fields which aren't used.
410
411			This method will automatically decode UTF-8 string to local code page
412			if needed.
413
414			=cut
415
# Substitute field placeholders in a format string with values from a record.
#
# Arguments:
#   $rec    - record (required); must stringify to something containing "HASH"
#   $format - format string (required); may start with eval{...} and/or
#             filter{...}, followed by text containing v123^a (repeatable)
#             and s123^a (first value only) placeholders
#   $i      - occurrence index used for v... placeholders (default 0)
#
# Returns:
#   - nothing if no placeholder produced data, if the eval{} code evaluates
#     to false, or if the filter returns undef
#   - the list returned by the lookup object's lookup() when a usable lookup
#     object is configured
#   - the filled-in string otherwise
#
# Dies (logdie) when $i exceeds max_mfn (default 9999), which guards callers
# that loop over occurrences.
sub fill_in {
    my $self = shift;

    my $log = $self->_get_logger();

    my $rec = shift || $log->logconfess("need data record");
    my $format = shift || $log->logconfess("need format to parse");
    # iteration (for repeatable fields)
    my $i = shift || 0;

    $log->logdie("infitite loop in format $format") if ($i > ($self->{'max_mfn'} || 9999));

    # FIXME remove for speedup?
    $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);

    if (utf8::is_utf8($format)) {
        $format = $self->_x($format);
    }

    my $found = 0;

    # remove eval{...} from beginning (braces escaped: an unescaped literal
    # "{" in a pattern is deprecated in newer perls)
    my $eval_code;
    $eval_code = $1 if ($format =~ s/^eval\{([^}]+)\}//s);

    # remove filter{...} from beginning
    my $filter_name;
    $filter_name = $1 if ($format =~ s/^filter\{([^}]+)\}//s);

    # Replace both kinds of placeholders in a single pass. Doing it in two
    # passes would re-scan values that were just inserted for v... fields and
    # treat data such as "class5" as an s5 placeholder.
    $format =~ s/([vs])(\d+)(?:\^(\w))?/$self->get_data(\$rec, $2, $3, ($1 eq 'v' ? $i : 0), \$found)/ges;

    # nothing in the record matched any placeholder
    return unless ($found);

    $log->debug("format: $format");

    if ($eval_code) {
        my $eval = $self->fill_in($rec,$eval_code,$i);
        return if (! $self->_eval($eval));
    }

    if ($filter_name && $self->{'filter'}->{$filter_name}) {
        $log->debug("filter '$filter_name' for $format");
        $format = $self->{'filter'}->{$filter_name}->($format);
        return unless(defined($format));
        $log->debug("filter result: $format");
    }

    # do we have lookups?
    return $format unless ($self->{'lookup'});

    if ($self->{'lookup'}->can('lookup')) {
        my @lookup = $self->{lookup}->lookup($format);
        $log->debug("lookup $format", join(", ", @lookup));
        return @lookup;
    }

    # Unusable lookup object: report it and fall back to the plain string
    # instead of implicitly returning whatever warn() returned.
    $log->warn("Have lookup object but can't invoke lookup method");
    return $format;
}
479
480
481	dpavlin	13	=head2 fill_in_to_arr
482
Similar to C<fill_in>, but returns array of all repeatable fields. Usable
for fields which have lookups, so they shouldn't be parsed but rather
C<fill_in>ed.
486
487			my @arr = $webpac->fill_in_to_arr($rec,'[v900];;[v250^a]');
488
489			=cut
490
# Collect the results of fill_in() for every occurrence of a repeatable field.
#
# Arguments:
#   $rec         - record; must stringify to something containing "HASH"
#   $format_utf8 - format string handed to fill_in() unchanged
#
# fill_in() is called in list context with occurrence index 0, 1, 2, ...
# until it returns an empty list. Returns all collected values, or nothing
# when no format was given.
sub fill_in_to_arr {
    my ($self, $rec, $format_utf8) = @_;

    my $log = $self->_get_logger();

    $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
    return unless $format_utf8;

    my @collected;
    my $occurrence = 0;

    while (1) {
        my @values = $self->fill_in($rec, $format_utf8, $occurrence++);
        last unless @values;
        push @collected, @values;
    }

    # $occurrence counts the calls made, including the final empty one,
    # hence the "- 1".
    if (@collected) {
        $log->debug(
            "format '$format_utf8' returned ", $occurrence - 1, " elements: ",
            sub { join(" | ", @collected) }
        );
    }

    return @collected;
}
512
513	dpavlin	15
514			=head2 get_data
515
516			Returns value from record.
517
518			my $text = $self->get_data(\$rec,$f,$sf,$i,\$found);
519
Arguments are:
record reference C<$rec>,
field C<$f>,
optional subfield C<$sf>,
index for repeatable values C<$i>.

Optional variable C<$found> will be incremented if the
field is present.
528
529			Returns value or empty string.
530
531			=cut
532
# Fetch one value from a record.
#
# Arguments:
#   $rec   - reference to the record hash ref (i.e. \$rec)
#   $f     - field number
#   $sf    - optional subfield name; undef or '' means "whole field"
#   $i     - occurrence index within the field's array
#   $found - optional reference to a counter; incremented when a value is
#            returned, provided the counter is defined
#
# Returns the subfield value, the plain field value, or (for a field with
# subfields when no subfield was requested) all subfield values, each
# followed by a space, in sorted subfield order. Returns '' when the field,
# occurrence or subfield is missing or false.
sub get_data {
    my $self = shift;

    my ($rec,$f,$sf,$i,$found) = @_;

    require Scalar::Util;

    my $occurrences = $$rec->{$f};
    return '' unless ($occurrences);

    my $value = $occurrences->[$i];
    return '' unless ($value);

    # Decide by the real reference type. Matching the stringified value
    # against /HASH/ also matches ordinary text that contains "HASH".
    my $has_subfields = ref($value) && Scalar::Util::reftype($value) eq 'HASH';

    # Subfield "0" is a valid name, so test for presence, not truth.
    my $want_subfield = defined($sf) && $sf ne '';

    if ($want_subfield) {
        return '' unless ($has_subfields && $value->{$sf});
        $$found++ if (ref($found) && defined($$found));
        return $value->{$sf};
    }

    $$found++ if (ref($found) && defined($$found));

    return $value unless ($has_subfields);

    # No subfield specified, so dump all of them. Keys are sorted so the
    # result is deterministic; an empty hash yields '' rather than undef.
    return join('', map { $value->{$_} . " " } sort keys %{$value});
}
564
565
566			=head2 apply_format
567
568			Apply format specified in tag with C<format_name="name"> and
569			C<format_delimiter=";;">.
570
571			my $text = $webpac->apply_format($format_name,$format_delimiter,$data);
572
573			Formats can contain C<lookup{...}> if you need them.
574
575			=cut
576
# Run $data through the named sprintf-style format from the import XML.
#
# Arguments:
#   $name      - name of the <format> definition to use
#   $delimiter - string on which $data is split into sprintf arguments
#   $data      - delimited input string
#
# Returns $data unchanged (after a warning) when the format is not defined.
# Otherwise returns the formatted string, or the result of the lookup
# object's lookup() when the formatted string matches lookup_regex.
sub apply_format {
    my ($self, $name, $delimiter, $data) = @_;

    my $log = $self->_get_logger();

    unless ($self->{'import_xml'}->{'format'}->{$name}) {
        $log->warn("<format name=\"$name\"> is not defined in ",$self->{'import_xml_file'});
        return $data;
    }

    unless ($delimiter) {
        $log->warn("no delimiter for format $name");
    }

    my $template = $self->_x($self->{'import_xml'}->{'format'}->{$name}->{'content'})
        || $log->logdie("can't find format '$name'");

    my @pieces = split(/\Q$delimiter\E/, $data);
    my $formatted = sprintf($template, @pieces);

    $log->debug("using format $name [$template] on $data to produce: $formatted");

    # only strings that look like lookups are passed to the lookup object
    my $needs_lookup = $self->{'lookup_regex'} && $formatted =~ $self->{'lookup_regex'};
    return $formatted unless ($needs_lookup);

    return $self->{'lookup'}->lookup($formatted);
}
605
606	dpavlin	13	=head2 sort_arr
607
608			Sort array ignoring case and html in data
609
610			my @sorted = $webpac->sort_arr(@unsorted);
611
612			=cut
613
# Sort a list of strings, ignoring case and HTML tags when comparing.
#
# Arguments: the values to sort.
# Returns: the same values, unmodified, in descending order of their
# lower-cased, tag-stripped form.
#
# NOTE(review): the order is descending, as in the previous implementation;
# confirm that this is what callers expect.
sub sort_arr {
    my $self = shift;

    my $log = $self->_get_logger();

    # Build each comparison key once on a copy. Running s/// on $a and $b
    # inside the comparator would modify the caller's data (they alias the
    # list elements) and strip a varying number of tags per element.
    my @sorted =
        map  { $_->[1] }
        sort { $b->[0] cmp $a->[0] }
        map  {
            (my $key = $_) =~ s#<[^>]+/*>##g;
            [ lc($key), $_ ]
        } @_;

    $log->debug("sorted values: ",sub { join(", ",@sorted) });

    return @sorted;
}
630
631
632	dpavlin	15	=head1 INTERNAL METHODS
633
634	dpavlin	13	=head2 _sort_by_order
635
Sort xml tags data structure according to C<order=""> attribute.
637
638			=cut
639
# Comparator over indexer tag names held in the package variables $a and $b.
#
# For each tag the sort value is the 'order' entry of its indexer definition,
# or the definition itself when 'order' is false. Returns the result of the
# numeric comparison ($a's value <=> $b's value).
sub _sort_by_order {
    my $self = shift;

    # value used for ordering a single tag
    my $order_of = sub {
        my $tag = shift;
        return $self->{'import_xml'}->{'indexer'}->{$tag}->{'order'}
            || $self->{'import_xml'}->{'indexer'}->{$tag};
    };

    my $left  = $order_of->($a);
    my $right = $order_of->($b);

    return $left <=> $right;
}
650
651			=head2 _x
652
Convert strings from C<conf/normalize/*.xml> encoding into application
specific encoding (optionally specified using C<code_page> to C<new>
constructor).
656	dpavlin	13
657			my $text = $n->_x('normalize text string');
658
This is a stub so that other modules don't have to implement it.
660
661			=cut
662
# Encoding-conversion stub: returns its argument unchanged.
#
# Arguments:
#   $text - string to convert
#
# Returns $text as given.
sub _x {
    my ($self, $text) = @_;
    return $text;
}
667
668
669	dpavlin	10	=head1 AUTHOR
670
671			Dobrica Pavlinusic, C<< <dpavlin@rot13.org> >>
672
673			=head1 COPYRIGHT & LICENSE
674
675			Copyright 2005 Dobrica Pavlinusic, All Rights Reserved.
676
677			This program is free software; you can redistribute it and/or modify it
678			under the same terms as Perl itself.
679
680			=cut
681
682	dpavlin	209	1; # End of WebPAC::Normalize