lib/WebPAC/Normalize.pm

package WebPAC::Normalize;

use warnings;
use strict;
use Data::Dumper;
use Scalar::Util qw(reftype);
use Storable;

=head1 NAME

WebPAC::Normalize - data munging for normalisation

=head1 VERSION

Version 0.01

=cut

our $VERSION = '0.01';

=head1 SYNOPSIS

This package contains code that munges data to produce a normalized format.

It contains several assumptions:

=over

=item *

format of fields is defined using C<v123^a> notation for repeatable fields
or C<s123^a> for single (or first) value, where C<123> is field number and
C<a> is subfield.

=item *

source data records (C<$rec>) have unique identifiers in field C<000>

=item *

optional C<eval{length('v123^a') == 3}> tag at B<beginning of format> will be
perl code that is evaluated before producing output (value of field will be
interpolated before that)

=item *

optional C<filter{filter_name}> at B<beginning of format> will apply perl
code defined as code ref to the format after field substitution, before
producing output

=item *

optional C<lookup{...}> will be then performed. See C<WebPAC::Lookups>.

=item *

at end, optional C<format>s rules are resolved. Format rules are similar to
C<sprintf> and can also contain C<lookup{...}> which is performed after
values are inserted in format.

=back

This also describes the order in which transformations are applied (eval,
filter, lookup, format), which is important to understand when deciding how
to solve your data munging and normalisation process.


=head1 FUNCTIONS

=head2 new

Create new normalisation object

  my $n = new WebPAC::Normalize::Something(
        filter => {
                'filter_name_1' => sub {
                        # filter code
                        return length($_);
                }, ...
        },
        cache_data_structure => './cache/ds/',
        lookup_regex => $lookup->regex,
  );

Parameter C<filter> defines user supplied snippets of perl code which can
be used with C<filter{...}> notation.

Optional parameter C<cache_data_structure> defines path to directory
in which cache file for C<data_structure> call will be created.

Recommended parameter C<lookup_regex> is used to enable parsing of lookups
in structures.

=cut

sub new {
        # Everything after the class name is taken verbatim as the object's
        # attribute hash (filter, cache_data_structure, lookup_regex, ...).
        my ($class, @options) = @_;

        my $self = bless { @options }, $class;

        # Let setup_cache_dir inspect the requested cache directory right
        # away; it clears cache_data_structure when the directory is unusable.
        $self->setup_cache_dir( $self->{'cache_data_structure'} );

        return $self;
}

=head2 setup_cache_dir

Check if the specified cache directory exists, and if not, disable caching.

 $n->setup_cache_dir('./cache/ds/');

If you pass a false or zero value to this function, it will disable
caching.

=cut

sub setup_cache_dir {
        my ($self, $dir) = @_;

        my $log = $self->_get_logger();

        if (! $dir) {
                # false or zero argument: caller explicitly asked for no cache
                $log->debug("disabling cache");
                undef $self->{'cache_data_structure'};
        } else {
                # first failing check wins; undef means the directory is usable
                my $problem = (! -e $dir) ? "doesn't exist"
                            : (! -d $dir) ? "is not directory"
                            : (! -w $dir) ? "not writable"
                            :               undef;

                if (defined($problem)) {
                        undef $self->{'cache_data_structure'};
                        $log->warn("cache_data_structure $dir $problem, disabling...");
                } else {
                        $log->debug("using cache dir $dir");
                }
        }
}


=head2 data_structure

Create in-memory data structure which represents normalized layout from
C<conf/normalize/*.xml>.

These structures are used to produce output.

 my @ds = $webpac->data_structure($rec);

B<Note: historical oddity follows>

This method will also set C<< $webpac->{'current_filename'} >> if there is
C<< <filename> >> tag and C<< $webpac->{'headline'} >> if there is
C<< <headline> >> tag.

=cut

sub data_structure {
        my $self = shift;

        my $log = $self->_get_logger();

        my $rec = shift;
        $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);

        # Set only when caching is enabled and the record has an identifier;
        # a true value also makes the end of this method write a cache file.
        my $cache_file;

        if (my $cache_path = $self->{'cache_data_structure'}) {
                my $id = $rec->{'000'};
                # field 000 may be stored as an array ref; ref() (instead of a
                # string match) keeps a missing field from raising a warning
                $id = $id->[0] if (ref($id) eq 'ARRAY');
                if (! defined($id)) {
                        $log->warn("Can't use cache_data_structure on records without unique identifier in field 000");
                        undef $self->{'cache_data_structure'};
                } else {
                        $cache_file = "$cache_path/$id";
                        if (-r $cache_file) {
                                my $ds_ref = retrieve($cache_file);
                                if ($ds_ref) {
                                        $log->debug("cache hit: $cache_file");
                                        # NOTE(review): a cached record without filename
                                        # or headline is reported as corrupt and turns
                                        # caching off for the rest of the run -- confirm
                                        # that this is intended.
                                        my $ok = 1;
                                        foreach my $f (qw(current_filename headline)) {
                                                if ($ds_ref->{$f}) {
                                                        $self->{$f} = $ds_ref->{$f};
                                                } else {
                                                        $ok = 0;
                                                }
                                        };
                                        if ($ok && $ds_ref->{'ds'}) {
                                                return @{ $ds_ref->{'ds'} };
                                        } else {
                                                $log->warn("cache_data_structure $cache_path corrupt. Use rm $cache_path/* to re-create it on next run!");
                                                undef $self->{'cache_data_structure'};
                                        }
                                }
                        }
                }
        }

        # Start every record from a clean slate.  (This used to reset the
        # misspelled key 'currnet_filename', so the filename of a previous
        # record leaked into records which don't produce one.)
        undef $self->{'current_filename'};
        undef $self->{'headline'};

        # tag order is computed once and then reused for all records
        my @sorted_tags;
        if ($self->{tags_by_order}) {
                @sorted_tags = @{$self->{tags_by_order}};
        } else {
                @sorted_tags = sort { $self->_sort_by_order } keys %{$self->{'import_xml'}->{'indexer'}};
                $self->{tags_by_order} = \@sorted_tags;
        }

        my @ds;

        $log->debug("tags: ",sub { join(", ",@sorted_tags) });

        foreach my $field (@sorted_tags) {

                # stays undef (and the field is skipped) if no rule yields a value
                my $row;

                foreach my $tag (@{$self->{'import_xml'}->{'indexer'}->{$field}->{$self->{'tag'}}}) {
                        my $format = $tag->{'value'} || $tag->{'content'};

                        $log->debug("format: $format");

                        # formats with lookups are filled in verbatim, everything
                        # else goes through the delimiter-aware parser
                        my @v;
                        if ($self->{'lookup_regex'} && $format =~ $self->{'lookup_regex'}) {
                                @v = $self->fill_in_to_arr($rec,$format);
                        } else {
                                @v = $self->parse_to_arr($rec,$format);
                        }
                        next if (! @v);

                        if ($tag->{'sort'}) {
                                @v = $self->sort_arr(@v);
                        }

                        # use format?
                        if ($tag->{'format_name'}) {
                                @v = map { $self->apply_format($tag->{'format_name'},$tag->{'format_delimiter'},$_) } @v;
                        }

                        if ($field eq 'filename') {
                                $self->{'current_filename'} = join('',@v);
                                $log->debug("filename: ",$self->{'current_filename'});
                        } elsif ($field eq 'headline') {
                                $self->{'headline'} .= join('',@v);
                                $log->debug("headline: ",$self->{'headline'});
                                next; # don't return headline in data_structure!
                        }

                        # delimiter will join repeatable fields
                        if ($tag->{'delimiter'}) {
                                @v = ( join($tag->{'delimiter'}, @v) );
                        }

                        # default types
                        my @types = qw(display swish);
                        # override by type attribute
                        @types = ( $tag->{'type'} ) if ($tag->{'type'});

                        foreach my $type (@types) {
                                # append to previous line?  (the attribute lives on
                                # the tag, so that is what gets logged)
                                $log->debug("type: $type ",sub { join(" ",@v) }, $tag->{'append'} || 'no append');
                                if ($tag->{'append'}) {

                                        # I will delimit appended part with
                                        # delimiter (or space)
                                        my $d = $tag->{'delimiter'};
                                        # default delimiter
                                        $d ||= " ";

                                        my $last = pop @{$row->{$type}};
                                        # nothing to append to: no leading delimiter
                                        $d = "" if (! $last);
                                        $last .= $d . join($d, @v);
                                        push @{$row->{$type}}, $last;

                                } else {
                                        push @{$row->{$type}}, @v;
                                }
                        }

                }

                if ($row) {
                        $row->{'tag'} = $field;

                        # TODO: name_sigular, name_plural
                        my $name = $self->{'import_xml'}->{'indexer'}->{$field}->{'name'};
                        $row->{'name'} = $name ? $self->_x($name) : $field;

                        # post-sort all values in field
                        if ($self->{'import_xml'}->{'indexer'}->{$field}->{'sort'}) {
                                $log->warn("sort at field tag not implemented");
                        }

                        push @ds, $row;

                        $log->debug("row $field: ",sub { Dumper($row) });
                }

        }

        if ($cache_file) {
                my %cached = (
                        ds => \@ds,
                        current_filename => $self->{'current_filename'},
                        headline => $self->{'headline'},
                );
                store(\%cached, $cache_file);
                $log->debug("created storable cache file $cache_file");
        }

        return @ds;

}

=head2 parse

Perform smart parsing of string, skipping delimiters for fields which aren't
defined. It can also eval code in format starting with C<eval{...}> and
return output or nothing depending on eval code.

 my $text = $webpac->parse($rec,'eval{"v901^a" eq "Deskriptor"}descriptor: v250^a', $i);

=cut

sub parse {
        my $self = shift;

        my ($rec, $format_utf8, $i) = @_;

        return if (! $format_utf8);

        my $log = $self->_get_logger();

        $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);

        # index of repeatable value, first one by default
        $i = 0 if (! $i);

        my $format = $self->_x($format_utf8) || $log->logconfess("can't convert '$format_utf8' from UTF-8 to ",$self->{'code_page'});

        my @out;

        $log->debug("format: $format");

        # Literal braces are escaped: an unescaped "{" after a literal
        # character is rejected by newer perls.

        my $eval_code;
        # remove eval{...} from beginning
        $eval_code = $1 if ($format =~ s/^eval\{([^}]+)\}//s);

        my $filter_name;
        # remove filter{...} from beginning
        $filter_name = $1 if ($format =~ s/^filter\{([^}]+)\}//s);

        my $prefix;
        my $all_found=0;

        # eat the format one placeholder at a time: text before placeholder
        # (delimiter), v or s, field number and optional ^subfield
        while ($format =~ s/^(.*?)(v|s)(\d+)(?:\^(\w))?//s) {

                # copy captures before anything else can clobber them; $1 is
                # always defined here, so a literal "0" delimiter is preserved
                my ($del, $kind, $field, $subfield) = ($1, $2, $3, $4);

                # remember first non-empty delimiter seen before any value
                $prefix = $del if ($all_found == 0 && ! (defined($prefix) && length($prefix)));

                # repeatable index (s123 always reads the first value)
                my $r = ($kind eq 's') ? 0 : $i;

                my $found = 0;
                my $tmp = $self->get_data(\$rec,$field,$subfield,$r,\$found);

                # delimiters in front of fields without data are skipped
                if ($found) {
                        push @out, $del;
                        push @out, (defined($tmp) ? $tmp : '');
                        $all_found += $found;
                }
        }

        return if (! $all_found);

        my $out = join('',@out);

        if (length($out)) {
                # add rest of format (suffix)
                $out .= $format;

                # add prefix if not there
                $out = $prefix . $out if (index($out, $prefix) != 0);

                $log->debug("result: $out");
        }

        if ($eval_code) {
                # placeholders in eval code are filled in before evaluation;
                # a false result suppresses the output
                my $eval = $self->fill_in($rec,$eval_code,$i) || return;
                $log->debug("about to eval{$eval} format: $out");
                return if (! $self->_eval($eval));
        }

        # unknown filter names are silently ignored
        if ($filter_name && $self->{'filter'}->{$filter_name}) {
                $log->debug("about to filter{$filter_name} format: $out");
                $out = $self->{'filter'}->{$filter_name}->($out);
                return unless(defined($out));
                $log->debug("filter result: $out");
        }

        return $out;
}

=head2 parse_to_arr

Similar to C<parse>, but returns array of all repeatable fields

 my @arr = $webpac->parse_to_arr($rec,'v250^a');

=cut

sub parse_to_arr {
        my ($self, $rec, $format_utf8) = @_;

        my $log = $self->_get_logger();

        $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
        return unless ($format_utf8);

        my @values;
        my $idx = 0;

        # Ask parse for the 1st, 2nd, ... repetition until it comes back
        # with a false value.
        while (1) {
                my $one = $self->parse($rec, $format_utf8, $idx++);
                last if (! $one);
                push @values, $one;
        }

        # $idx is one past the failed attempt, so --$idx is the element count
        $log->debug("format '$format_utf8' returned ",--$idx," elements: ", sub { join(" | ",@values) }) if (@values);

        return @values;
}


=head2 fill_in

Workhorse of all: takes record from in-memory structure of database and
string with placeholders and returns string (or array) with substituted
values from record.

 my $text = $webpac->fill_in($rec,'v250^a');

Optional argument is ordinal number for repeatable fields. By default,
it's assumed to be the first repeatable field (fields are perl arrays, so the
first element is 0).
Following example will read second value from repeatable field.

 my $text = $webpac->fill_in($rec,'Title: v250^a',1);

This function B<does not> perform parsing of format to intelligently skip
delimiters before fields which aren't used.

This method will automatically decode UTF-8 string to local code page
if needed.

=cut

sub fill_in {
        my $self = shift;

        my $log = $self->_get_logger();

        my $rec = shift || $log->logconfess("need data record");
        my $format = shift || $log->logconfess("need format to parse");
        # iteration (for repeatable fields)
        my $i = shift || 0;

        # guard for callers which keep asking for the next repetition
        $log->logdie("infitite loop in format $format") if ($i > ($self->{'max_mfn'} || 9999));

        # FIXME remove for speedup?
        $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);

        if (utf8::is_utf8($format)) {
                $format = $self->_x($format);
        }

        # incremented by get_data for every placeholder which has a value
        my $found = 0;

        # Literal braces are escaped: an unescaped "{" after a literal
        # character is rejected by newer perls.

        my $eval_code;
        # remove eval{...} from beginning
        $eval_code = $1 if ($format =~ s/^eval\{([^}]+)\}//s);

        my $filter_name;
        # remove filter{...} from beginning
        $filter_name = $1 if ($format =~ s/^filter\{([^}]+)\}//s);

        # Do actual replacement of placeholders in a single left-to-right
        # pass: v123^a reads repetition $i, s123^a always reads the first
        # value.  Two separate passes would re-scan values inserted by the
        # first pass, so record data which looks like a placeholder (or digits
        # following "s10") would be expanded again.
        $format =~ s{(v|s)(\d+)(?:\^(\w))?}{
                my ($kind, $field, $subfield) = ($1, $2, $3);
                my $value = $self->get_data(\$rec, $field, $subfield, ($kind eq 'v' ? $i : 0), \$found);
                defined($value) ? $value : '';
        }ges;

        # nothing in the record matched any placeholder
        return if (! $found);

        $log->debug("format: $format");

        if ($eval_code) {
                # eval code gets its own placeholders filled in first; a false
                # result suppresses the output
                my $eval = $self->fill_in($rec,$eval_code,$i);
                return if (! $self->_eval($eval));
        }

        # unknown filter names are silently ignored
        if ($filter_name && $self->{'filter'}->{$filter_name}) {
                $log->debug("filter '$filter_name' for $format");
                $format = $self->{'filter'}->{$filter_name}->($format);
                return unless(defined($format));
                $log->debug("filter result: $format");
        }

        # do we have lookups?
        if ($self->{'lookup'}) {
                return $self->lookup($format);
        }

        return $format;
}


=head2 fill_in_to_arr

Similar to C<fill_in>, but returns array of all repeatable fields. Usable
for fields which have lookups, so they shouldn't be parsed but rather
C<fill_in>ed.

 my @arr = $webpac->fill_in_to_arr($rec,'[v900];;[v250^a]');

=cut

sub fill_in_to_arr {
        my ($self, $rec, $format_utf8) = @_;

        my $log = $self->_get_logger();

        $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
        return unless ($format_utf8);

        my @collected;
        my $idx = 0;

        # Keep filling in the format for the 1st, 2nd, ... repetition until
        # fill_in returns an empty list.
        while (1) {
                my @chunk = $self->fill_in($rec, $format_utf8, $idx++);
                last if (! @chunk);
                push @collected, @chunk;
        }

        # $idx is one past the failed attempt, so --$idx is the number of
        # successful calls
        $log->debug("format '$format_utf8' returned ",--$idx," elements: ", sub { join(" | ",@collected) }) if (@collected);

        return @collected;
}


=head2 get_data

Returns value from record.

 my $text = $self->get_data(\$rec,$f,$sf,$i,\$found);

Arguments are:
record reference C<$rec>,
field C<$f>,
optional subfield C<$sf>,
index for repeatable values C<$i>.

Optional variable C<$found> will be incremented if there
is field.

Returns value or empty string.

=cut

sub get_data {
        my $self = shift;

        # $rec is a reference to the record hash ref, $found an optional
        # reference to a counter
        my ($rec,$f,$sf,$i,$found) = @_;

        # record doesn't have this field at all
        return '' if (! $$rec->{$f});

        # requested repetition is missing or empty; "0" is a real value
        my $value = $$rec->{$f}->[$i];
        return '' if (! defined($value) || ! length($value));

        # reftype sees through blessing and, unlike matching the stringified
        # value against /HASH/, is not fooled by plain text containing "HASH"
        my $has_subfields = ((reftype($value) || '') eq 'HASH');
        my $want_subfield = (defined($sf) && length($sf));

        if ($has_subfields && $want_subfield) {
                # A specific subfield was requested: return it or nothing.
                # Dumping all other subfields instead would report data for a
                # subfield the record doesn't have.
                my $sub_value = $value->{$sf};
                return '' if (! defined($sub_value) || ! length($sub_value));
                $$found++ if (ref($found) && defined($$found));
                return $sub_value;
        }

        $$found++ if (ref($found) && defined($$found));

        # field without subfields is returned as is (even if a subfield was
        # requested)
        return $value if (! $has_subfields);

        # no subfield specified, so dump all of them, each followed by a
        # space; keys are sorted to make the output stable between runs
        return join('',
                map  { "$_ " }
                grep { defined($_) && length($_) }
                map  { $value->{$_} }
                sort keys %{$value}
        );
}


=head2 apply_format

Apply format specified in tag with C<format_name="name"> and
C<format_delimiter=";;">.

 my $text = $webpac->apply_format($format_name,$format_delimiter,$data);

Formats can contain C<lookup{...}> if you need them.

=cut

sub apply_format {
        my $self = shift;

        my ($name,$delimiter,$data) = @_;

        my $log = $self->_get_logger();

        # unknown format: complain and hand the data back untouched
        if (! $self->{'import_xml'}->{'format'}->{$name}) {
                $log->warn("<format name=\"$name\"> is not defined in ",$self->{'import_xml_file'});
                return $data;
        }

        my $has_delimiter = (defined($delimiter) && length($delimiter));
        $log->warn("no delimiter for format $name") if (! $has_delimiter);

        my $format = $self->_x($self->{'import_xml'}->{'format'}->{$name}->{'content'}) || $log->logdie("can't find format '$name'");

        # Without a delimiter the data is passed on as a single value;
        # splitting on an empty pattern would cut it into single characters.
        my @data = $has_delimiter
                ? split(/\Q$delimiter\E/, $data)
                : ($data);

        my $out = sprintf($format, @data);
        $log->debug("using format $name [$format] on $data to produce: $out");

        # formats may contain lookups, resolved after values are inserted
        if ($self->{'lookup_regex'} && $out =~ $self->{'lookup_regex'}) {
                return $self->lookup($out);
        } else {
                return $out;
        }

}

=head2 sort_arr

Sort array ignoring case and html in data

 my @sorted = $webpac->sort_arr(@unsorted);

=cut

sub sort_arr {
        my $self = shift;

        my $log = $self->_get_logger();

        # Schwartzian transform: compute lower-cased sort key with html tags
        # removed once per value and sort on that.  Values themselves are
        # returned untouched -- stripping tags from $a and $b inside the
        # comparison would modify the caller's data, because sort aliases
        # them to the original elements.
        #
        # NOTE(review): order is descending, as it always was -- confirm that
        # this is what callers expect.
        my @sorted =
                map  { $_->[1] }
                sort { $b->[0] cmp $a->[0] }
                map  {
                        my $key = $_;
                        $key =~ s#<[^>]+/*>##g;
                        [ lc($key), $_ ]
                } @_;

        $log->debug("sorted values: ",sub { join(", ",@sorted) });

        return @sorted;
}


=head1 INTERNAL METHODS

=head2 _sort_by_order

Sort xml tags data structure according to C<order=""> attribute.

=cut

sub _sort_by_order {
        my ($self) = @_;

        my $indexer = $self->{'import_xml'}->{'indexer'};

        # Sort key for each of sort's package variables $a and $b: the tag's
        # order attribute or, when that is false, the tag's own entry.
        my ($left, $right) = map {
                $indexer->{$_}->{'order'} || $indexer->{$_}
        } ($a, $b);

        return $left <=> $right;
}

=head2 _x

Convert strings from C<conf/normalize/*.xml> encoding into application
specific encoding (optionally specified using C<code_page> to C<new>
constructor).

 my $text = $n->_x('normalize text string');

This is a stub so that other modules don't have to implement it.

=cut

sub _x {
        # Stub: hands the string back exactly as it was given.
        my ($self, $text) = @_;

        return $text;
}


=head1 AUTHOR

Dobrica Pavlinusic, C<< <dpavlin@rot13.org> >>

=head1 COPYRIGHT & LICENSE

Copyright 2005 Dobrica Pavlinusic, All Rights Reserved.

This program is free software; you can redistribute it and/or modify it
under the same terms as Perl itself.

=cut

1; # End of WebPAC::Normalize
1	package WebPAC::Normalize;
2
3	use warnings;
4	use strict;
5	use Data::Dumper;
6	use Storable;
7
8	=head1 NAME
9
10	WebPAC::Normalize - data mungling for normalisation
11
12	=head1 VERSION
13
14	Version 0.01
15
16	=cut
17
18	our $VERSION = '0.01';
19
20	=head1 SYNOPSIS
21
22	This package contains code that mungle data to produce normalized format.
23
24	It contains several assumptions:
25
26	=over
27
28	=item *
29
30	format of fields is defined using C<v123^a> notation for repeatable fields
31	or C<s123^a> for single (or first) value, where C<123> is field number and
32	C<a> is subfield.
33
34	=item *
35
36	source data records (C<$rec>) have unique identifiers in field C<000>
37
38	=item *
39
40	optional C<eval{length('v123^a') == 3}> tag at B<beginning of format> will be
41	perl code that is evaluated before producing output (value of field will be
42	interpolated before that)
43
44	=item *
45
46	optional C<filter{filter_name}> at B<begining of format> will apply perl
47	code defined as code ref on format after field substitution to producing
48	output
49
50	=item *
51
52	optional C<lookup{...}> will be then performed. See C<WebPAC::Lookups>.
53
54	=item *
55
56	at end, optional C<format>s rules are resolved. Format rules are similar to
57	C<sprintf> and can also contain C<lookup{...}> which is performed after
58	values are inserted in format.
59
60	=back
61
62	This also describes order in which transformations are applied (eval,
63	filter, lookup, format) which is important to undestand when deciding how to
64	solve your data mungling and normalisation process.
65
66
67
68
69	=head1 FUNCTIONS
70
71	=head2 new
72
73	Create new normalisation object
74
75	my $n = new WebPAC::Normalize::Something(
76	filter => {
77	'filter_name_1' => sub {
78	# filter code
79	return length($_);
80	}, ...
81	},
82	cache_data_structure => './cache/ds/',
83	lookup_regex => $lookup->regex,
84	);
85
86	Parametar C<filter> defines user supplied snippets of perl code which can
87	be use with C<filter{...}> notation.
88
89	Optional parameter C<cache_data_structure> defines path to directory
90	in which cache file for C<data_structure> call will be created.
91
92	Recommended parametar C<lookup_regex> is used to enable parsing of lookups
93	in structures.
94
95	=cut
96
97	sub new {
98	my $class = shift;
99	my $self = {@_};
100	bless($self, $class);
101
102	$self->setup_cache_dir( $self->{'cache_data_structure'} );
103
104	$self ? return $self : return undef;
105	}
106
107	=head2 setup_cache_dir
108
109	Check if specified cache directory exist, and if not, disable caching.
110
111	$setup_cache_dir('./cache/ds/');
112
113	If you pass false or zero value to this function, it will disable
114	cacheing.
115
116	=cut
117
118	sub setup_cache_dir {
119	my $self = shift;
120
121	my $dir = shift;
122
123	my $log = $self->_get_logger();
124
125	if ($dir) {
126	my $msg;
127	if (! -e $dir) {
128	$msg = "doesn't exist";
129	} elsif (! -d $dir) {
130	$msg = "is not directory";
131	} elsif (! -w $dir) {
132	$msg = "not writable";
133	}
134
135	if ($msg) {
136	undef $self->{'cache_data_structure'};
137	$log->warn("cache_data_structure $dir $msg, disabling...");
138	} else {
139	$log->debug("using cache dir $dir");
140	}
141	} else {
142	$log->debug("disabling cache");
143	undef $self->{'cache_data_structure'};
144	}
145	}
146
147
148	=head2 data_structure
149
150	Create in-memory data structure which represents normalized layout from
151	C<conf/normalize/*.xml>.
152
153	This structures are used to produce output.
154
155	my @ds = $webpac->data_structure($rec);
156
157	B<Note: historical oddity follows>
158
159	This method will also set C<< $webpac->{'currnet_filename'} >> if there is
160	C<< <filename> >> tag and C<< $webpac->{'headline'} >> if there is
161	C<< <headline> >> tag.
162
163	=cut
164
165	sub data_structure {
166	my $self = shift;
167
168	my $log = $self->_get_logger();
169
170	my $rec = shift;
171	$log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
172
173	my $cache_file;
174
175	if (my $cache_path = $self->{'cache_data_structure'}) {
176	my $id = $rec->{'000'};
177	$id = $rec->{'000'}->[0] if ($id =~ m/^ARRAY/o);
178	unless (defined($id)) {
179	$log->warn("Can't use cache_data_structure on records without unique identifier in field 000");
180	undef $self->{'cache_data_structure'};
181	} else {
182	$cache_file = "$cache_path/$id";
183	if (-r $cache_file) {
184	my $ds_ref = retrieve($cache_file);
185	if ($ds_ref) {
186	$log->debug("cache hit: $cache_file");
187	my $ok = 1;
188	foreach my $f (qw(current_filename headline)) {
189	if ($ds_ref->{$f}) {
190	$self->{$f} = $ds_ref->{$f};
191	} else {
192	$ok = 0;
193	}
194	};
195	if ($ok && $ds_ref->{'ds'}) {
196	return @{ $ds_ref->{'ds'} };
197	} else {
198	$log->warn("cache_data_structure $cache_path corrupt. Use rm $cache_path/* to re-create it on next run!");
199	undef $self->{'cache_data_structure'};
200	}
201	}
202	}
203	}
204	}
205
206	undef $self->{'currnet_filename'};
207	undef $self->{'headline'};
208
209	my @sorted_tags;
210	if ($self->{tags_by_order}) {
211	@sorted_tags = @{$self->{tags_by_order}};
212	} else {
213	@sorted_tags = sort { $self->_sort_by_order } keys %{$self->{'import_xml'}->{'indexer'}};
214	$self->{tags_by_order} = \@sorted_tags;
215	}
216
217	my @ds;
218
219	$log->debug("tags: ",sub { join(", ",@sorted_tags) });
220
221	foreach my $field (@sorted_tags) {
222
223	my $row;
224
225	#print "field $field [",$self->{'tag'},"] = ",Dumper($self->{'import_xml'}->{'indexer'}->{$field}->{$self->{'tag'}});
226
227	foreach my $tag (@{$self->{'import_xml'}->{'indexer'}->{$field}->{$self->{'tag'}}}) {
228	my $format = $tag->{'value'} \|\| $tag->{'content'};
229
230	$log->debug("format: $format");
231
232	my @v;
233	if ($self->{'lookup_regex'} && $format =~ $self->{'lookup_regex'}) {
234	@v = $self->fill_in_to_arr($rec,$format);
235	} else {
236	@v = $self->parse_to_arr($rec,$format);
237	}
238	next if (! @v);
239
240	if ($tag->{'sort'}) {
241	@v = $self->sort_arr(@v);
242	}
243
244	# use format?
245	if ($tag->{'format_name'}) {
246	@v = map { $self->apply_format($tag->{'format_name'},$tag->{'format_delimiter'},$_) } @v;
247	}
248
249	if ($field eq 'filename') {
250	$self->{'current_filename'} = join('',@v);
251	$log->debug("filename: ",$self->{'current_filename'});
252	} elsif ($field eq 'headline') {
253	$self->{'headline'} .= join('',@v);
254	$log->debug("headline: ",$self->{'headline'});
255	next; # don't return headline in data_structure!
256	}
257
258	# delimiter will join repeatable fields
259	if ($tag->{'delimiter'}) {
260	@v = ( join($tag->{'delimiter'}, @v) );
261	}
262
263	# default types
264	my @types = qw(display swish);
265	# override by type attribute
266	@types = ( $tag->{'type'} ) if ($tag->{'type'});
267
268	foreach my $type (@types) {
269	# append to previous line?
270	$log->debug("type: $type ",sub { join(" ",@v) }, $row->{'append'} \|\| 'no append');
271	if ($tag->{'append'}) {
272
273	# I will delimit appended part with
274	# delimiter (or ,)
275	my $d = $tag->{'delimiter'};
276	# default delimiter
277	$d \|\|= " ";
278
279	my $last = pop @{$row->{$type}};
280	$d = "" if (! $last);
281	$last .= $d . join($d, @v);
282	push @{$row->{$type}}, $last;
283
284	} else {
285	push @{$row->{$type}}, @v;
286	}
287	}
288
289
290	}
291
292	if ($row) {
293	$row->{'tag'} = $field;
294
295	# TODO: name_sigular, name_plural
296	my $name = $self->{'import_xml'}->{'indexer'}->{$field}->{'name'};
297	$row->{'name'} = $name ? $self->_x($name) : $field;
298
299	# post-sort all values in field
300	if ($self->{'import_xml'}->{'indexer'}->{$field}->{'sort'}) {
301	$log->warn("sort at field tag not implemented");
302	}
303
304	push @ds, $row;
305
306	$log->debug("row $field: ",sub { Dumper($row) });
307	}
308
309	}
310
311	if ($cache_file) {
312	store {
313	ds => \@ds,
314	current_filename => $self->{'current_filename'},
315	headline => $self->{'headline'},
316	}, $cache_file;
317	$log->debug("created storable cache file $cache_file");
318	}
319
320	return @ds;
321
322	}
323
324	=head2 parse
325
326	Perform smart parsing of string, skipping delimiters for fields which aren't
327	defined. It can also eval code in format starting with C<eval{...}> and
328	return output or nothing depending on eval code.
329
330	my $text = $webpac->parse($rec,'eval{"v901^a" eq "Deskriptor"}descriptor: v250^a', $i);
331
332	=cut
333
334	sub parse {
335	my $self = shift;
336
337	my ($rec, $format_utf8, $i) = @_;
338
339	return if (! $format_utf8);
340
341	my $log = $self->_get_logger();
342
343	$log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
344
345	$i = 0 if (! $i);
346
347	my $format = $self->_x($format_utf8) \|\| $log->logconfess("can't convert '$format_utf8' from UTF-8 to ",$self->{'code_page'});
348
349	my @out;
350
351	$log->debug("format: $format");
352
353	my $eval_code;
354	# remove eval{...} from beginning
355	$eval_code = $1 if ($format =~ s/^eval{([^}]+)}//s);
356
357	my $filter_name;
358	# remove filter{...} from beginning
359	$filter_name = $1 if ($format =~ s/^filter{([^}]+)}//s);
360
361	my $prefix;
362	my $all_found=0;
363
364	while ($format =~ s/^(.*?)(v\|s)(\d+)(?:\^(\w))?//s) {
365
366	my $del = $1 \|\| '';
367	$prefix \|\|= $del if ($all_found == 0);
368
369	# repeatable index
370	my $r = $i;
371	$r = 0 if (lc("$2") eq 's');
372
373	my $found = 0;
374	my $tmp = $self->get_data(\$rec,$3,$4,$r,\$found);
375
376	if ($found) {
377	push @out, $del;
378	push @out, $tmp;
379	$all_found += $found;
380	}
381	}
382
383	return if (! $all_found);
384
385	my $out = join('',@out);
386
387	if ($out) {
388	# add rest of format (suffix)
389	$out .= $format;
390
391	# add prefix if not there
392	$out = $prefix . $out if ($out !~ m/^\Q$prefix\E/);
393
394	$log->debug("result: $out");
395	}
396
397	if ($eval_code) {
398	my $eval = $self->fill_in($rec,$eval_code,$i) \|\| return;
399	$log->debug("about to eval{$eval} format: $out");
400	return if (! $self->_eval($eval));
401	}
402
403	if ($filter_name && $self->{'filter'}->{$filter_name}) {
404	$log->debug("about to filter{$filter_name} format: $out");
405	$out = $self->{'filter'}->{$filter_name}->($out);
406	return unless(defined($out));
407	$log->debug("filter result: $out");
408	}
409
410	return $out;
411	}
412
413	=head2 parse_to_arr
414
415	Similar to C<parse>, but returns array of all repeatable fields
416
417	my @arr = $webpac->parse_to_arr($rec,'v250^a');
418
419	=cut
420
# Collect the parse() output of every repetition of a format.
#
# Arguments: record (must stringify as a HASH reference) and the format
# string. Returns the list of values produced by parse() for repetition
# 0, 1, 2, ... up to (not including) the first repetition for which
# parse() returns a false value. Returns nothing if the format is empty.
sub parse_to_arr {
	my ($self, $rec, $format_utf8) = @_;

	my $log = $self->_get_logger();

	if ($rec !~ /HASH/o) {
		$log->logconfess("need HASH as first argument!");
	}
	return unless ($format_utf8);

	my @arr;
	my $i = 0;

	# keep asking for the next repetition until parse() comes back empty
	for (;;) {
		my $v = $self->parse($rec, $format_utf8, $i++);
		last unless ($v);
		push @arr, $v;
	}

	if (@arr) {
		# the call which ended the loop advanced the counter once too often
		$i--;
		$log->debug("format '$format_utf8' returned ", $i, " elements: ", sub { join(" \| ",@arr) });
	}

	return @arr;
}
442
443
444	=head2 fill_in
445
446	Workhorse of all: takes record from in-memory structure of database and
447	string with placeholders and returns string or array with substituted
448	values from record.
449
450	my $text = $webpac->fill_in($rec,'v250^a');
451
452	Optional argument is ordinal number for repeatable fields. By default,
453	it's assumed to be the first repeatable field (fields are perl arrays, so
454	the first element is 0).
455	Following example will read second value from repeatable field.
456
457	my $text = $webpac->fill_in($rec,'Title: v250^a',1);
458
459	This function B<does not> perform parsing of format to intelligently skip
460	delimiters before fields which aren't used.
461
462	This method will automatically decode UTF-8 string to local code page
463	if needed.
464
465	=cut
466
# Substitute field placeholders in a format string with record data.
#
# Arguments:
#   $rec    - record (must stringify as a HASH reference)
#   $format - format string; may start with eval{...} and/or filter{...}
#   $i      - optional repetition index used for v-placeholders (default 0)
#
# Placeholders are vNNN / vNNN^x (value at repetition $i) and
# sNNN / sNNN^x (always repetition 0). Returns the filled-in string
# (passed through $self->lookup when $self->{'lookup'} is set), or nothing
# when no placeholder produced data, when the eval{} code evaluates false,
# or when the filter returns undef.
#
# Dies through the logger when $i exceeds $self->{'max_mfn'} (or 9999).
sub fill_in {
	my $self = shift;

	my $log = $self->_get_logger();

	my $rec = shift || $log->logconfess("need data record");
	my $format = shift || $log->logconfess("need format to parse");
	# iteration (for repeatable fields)
	my $i = shift || 0;

	# guard against callers which keep incrementing $i forever
	$log->logdie("infitite loop in format $format") if ($i > ($self->{'max_mfn'} || 9999));

	# FIXME remove for speedup?
	$log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);

	if (utf8::is_utf8($format)) {
		$format = $self->_x($format);
	}

	my $found = 0;

	# remove eval{...} from beginning
	# (literal braces are escaped: an unescaped "{" after a literal
	# character is rejected by modern perl)
	my $eval_code;
	$eval_code = $1 if ($format =~ s/^eval\{([^}]+)\}//s);

	# remove filter{...} from beginning
	my $filter_name;
	$filter_name = $1 if ($format =~ s/^filter\{([^}]+)\}//s);

	# Do actual replacement of placeholders in a single pass, so that
	# data which was just inserted is never scanned for placeholders
	# again (a value like "class12" must not be read as field s12).
	# "v" uses the requested repetition, "s" always the first value.
	$format =~ s{([vs])(\d+)(?:\^(\w))?}{
		$self->get_data(\$rec, $2, $3, ($1 eq 's' ? 0 : $i), \$found)
	}ges;

	# nothing substituted: this repetition doesn't exist
	return unless ($found);

	$log->debug("format: $format");

	if ($eval_code) {
		# eval code may itself contain placeholders
		my $eval = $self->fill_in($rec,$eval_code,$i);
		return if (! $self->_eval($eval));
	}

	if ($filter_name && $self->{'filter'}->{$filter_name}) {
		$log->debug("filter '$filter_name' for $format");
		$format = $self->{'filter'}->{$filter_name}->($format);
		return unless(defined($format));
		$log->debug("filter result: $format");
	}

	# do we have lookups?
	return $self->lookup($format) if ($self->{'lookup'});

	return $format;
}
524
525
526	=head2 fill_in_to_arr
527
528	Similar to C<fill_in>, but returns array of all repeatable fields. Usable
529	for fields which have lookups, so they shouldn't be parsed but rather
530	C<fill_in>ed.
531
532	my @arr = $webpac->fill_in_to_arr($rec,'[v900];;[v250^a]');
533
534	=cut
535
# Collect the fill_in() output of every repetition of a format.
#
# Arguments: record (must stringify as a HASH reference) and the format
# string. fill_in() is called in list context for repetition 0, 1, 2, ...
# and everything it returns is appended to the result until it returns an
# empty list. Returns nothing if the format is empty.
sub fill_in_to_arr {
	my ($self, $rec, $format_utf8) = @_;

	my $log = $self->_get_logger();

	if ($rec !~ /HASH/o) {
		$log->logconfess("need HASH as first argument!");
	}
	return unless ($format_utf8);

	my @arr;
	my $i = 0;

	# keep asking for the next repetition until fill_in() returns nothing
	for (;;) {
		my @v = $self->fill_in($rec, $format_utf8, $i++);
		last unless (@v);
		push @arr, @v;
	}

	if (@arr) {
		# the call which ended the loop advanced the counter once too often
		$i--;
		$log->debug("format '$format_utf8' returned ", $i, " elements: ", sub { join(" \| ",@arr) });
	}

	return @arr;
}
557
558
559	=head2 get_data
560
561	Returns value from record.
562
563	my $text = $self->get_data(\$rec,$f,$sf,$i,\$found);
564
565	Arguments are:
566	record reference C<$rec>,
567	field C<$f>,
568	optional subfield C<$sf>,
569	index for repeatable values C<$i>.
570
571	Optional variable C<$found> will be incremented if the
572	field is found.
573
574	Returns value or empty string.
575
576	=cut
577
# Fetch one value from a record.
#
# Arguments:
#   $rec   - reference to the record (itself a hash reference keyed by
#            field number, each entry an array of repetitions)
#   $f     - field number
#   $sf    - optional subfield code (single character, "0" is valid)
#   $i     - index of repetition
#   $found - optional reference to a counter; it is incremented when a
#            value is returned (only if the counter is defined)
#
# Returns the subfield value when $sf is given and that subfield holds a
# true value. Otherwise returns the whole repetition: as-is for plain
# values, or all subfield values (sorted by subfield code, each followed
# by a space) when the repetition is a hash. Returns empty string when the
# field or the repetition is missing or false.
sub get_data {
	my $self = shift;

	my ($rec,$f,$sf,$i,$found) = @_;

	my $field = $$rec->{$f};
	return '' if (! $field);

	my $value = $field->[$i];
	return '' if (! $value);

	# Decide by reference type, not by string match: plain text which
	# happens to contain "HASH" must not be treated as subfields.
	my $has_subfields = (ref($value) && UNIVERSAL::isa($value, 'HASH')) ? 1 : 0;

	# $found is optional, so never dereference it blindly
	my $count_it = (ref($found) && defined($$found)) ? 1 : 0;

	# specific subfield requested and present ("0" is a valid code)
	if ($has_subfields && defined($sf) && $sf ne '' && $value->{$sf}) {
		$$found++ if ($count_it);
		return $value->{$sf};
	}

	$$found++ if ($count_it);

	# repetition without subfields
	return $value if (! $has_subfields);

	# it still might have subfields, just not the one specified (or none
	# was specified), so we'll dump all; sorted so output is repeatable
	return join('', map { $value->{$_} . " " } sort keys %{$value});
}
607
608
609	=head2 apply_format
610
611	Apply format specified in tag with C<format_name="name"> and
612	C<format_delimiter=";;">.
613
614	my $text = $webpac->apply_format($format_name,$format_delimiter,$data);
615
616	Formats can contain C<lookup{...}> if you need them.
617
618	=cut
619
# Apply a named sprintf-style format to delimited data.
#
# Arguments:
#   $name      - name of format, looked up in
#                $self->{'import_xml'}->{'format'}
#   $delimiter - string which separates values inside $data
#   $data      - values to insert into format
#
# Returns $data unchanged (after a warning) when the format isn't defined.
# Otherwise $data is split on $delimiter and fed to sprintf with the
# format's content; the result is passed through $self->lookup when it
# matches $self->{'lookup_regex'}. Without a delimiter a warning is logged
# and $data is used as one single value.
#
# Dies through the logger if the format has no content.
sub apply_format {
	my $self = shift;

	my ($name,$delimiter,$data) = @_;

	my $log = $self->_get_logger();

	my $format_def = $self->{'import_xml'}->{'format'}->{$name};

	if (! $format_def) {
		$log->warn("<format name=\"$name\"> is not defined in ",$self->{'import_xml_file'});
		return $data;
	}

	my $has_delimiter = (defined($delimiter) && length($delimiter)) ? 1 : 0;

	$log->warn("no delimiter for format $name") if (! $has_delimiter);

	my $format = $self->_x($format_def->{'content'}) || $log->logdie("can't find format '$name'");

	# split with an empty pattern would break data into single
	# characters, so without delimiter keep data in one piece
	my @data = $has_delimiter ? split(/\Q$delimiter\E/, $data) : ($data);

	my $out = sprintf($format, @data);
	$log->debug("using format $name [$format] on $data to produce: $out");

	if ($self->{'lookup_regex'} && $out =~ $self->{'lookup_regex'}) {
		return $self->lookup($out);
	}

	return $out;
}
648
649	=head2 sort_arr
650
651	Sort array ignoring case and html in data
652
653	my @sorted = $webpac->sort_arr(@unsorted);
654
655	=cut
656
# Sort values ignoring case and HTML tags.
#
# Takes list of values and returns them in descending order. Values are
# compared lower-cased and with everything that looks like a tag removed;
# the comparison works on copies, so neither the caller's variables nor
# the returned values are modified.
sub sort_arr {
	my $self = shift;

	my $log = $self->_get_logger();

	# Schwartzian transform: compute sort key once for each value
	# instead of on every comparison, and never touch $a / $b, which
	# are aliases to the caller's data inside sort.
	my @sorted =
		map  { $_->[1] }
		sort { $b->[0] cmp $a->[0] }
		map  {
			my $key = $_;
			$key =~ s#<[^>]+/*>##g;
			[ lc($key), $_ ];
		} @_;

	$log->debug("sorted values: ",sub { join(", ",@sorted) });

	return @sorted;
}
673
674
675	=head1 INTERNAL METHODS
676
677	=head2 _sort_by_order
678
679	Sort xml tags data structure according to C<order=""> attribute.
680
681	=cut
682
# Comparison function for sort: numerically compares two entries of
# $self->{'import_xml'}->{'indexer'}, named by sort's package variables
# $a and $b.
#
# An entry which is a hash is compared by its 'order' value, falling back
# to the entry itself when 'order' is missing or false; any other entry is
# compared by its own value. Entries are only read, never created.
#
# Returns -1, 0 or 1 as expected by sort.
sub _sort_by_order {
	my $self = shift;

	my $indexer = $self->{'import_xml'}->{'indexer'};

	my $order_of = sub {
		my $entry = $indexer->{ $_[0] };
		return $entry if (ref($entry) ne 'HASH');
		return $entry->{'order'} || $entry;
	};

	return $order_of->($a) <=> $order_of->($b);
}
693
694	=head2 _x
695
696	Convert strings from C<conf/normalize/*.xml> encoding into application
697	specific encoding (optionally specified using C<code_page> to C<new>
698	constructor).
699
700	my $text = $n->_x('normalize text string');
701
702	This is a stub so that other modules don't have to implement it.
703
704	=cut
705
# Encoding conversion stub: hands back its argument untouched.
sub _x {
	my ($self, $text) = @_;
	return $text;
}
710
711
712	=head1 AUTHOR
713
714	Dobrica Pavlinusic, C<< <dpavlin@rot13.org> >>
715
716	=head1 COPYRIGHT & LICENSE
717
718	Copyright 2005 Dobrica Pavlinusic, All Rights Reserved.
719
720	This program is free software; you can redistribute it and/or modify it
721	under the same terms as Perl itself.
722
723	=cut
724
725	1; # End of WebPAC::Normalize