lib/WebPAC/Normalize.pm

package WebPAC::Normalize;

use warnings;
use strict;
use Data::Dumper;
use Storable;

=head1 NAME

WebPAC::Normalize - data munging for normalisation

=head1 VERSION

Version 0.01

=cut

our $VERSION = '0.01';

=head1 SYNOPSIS

This package contains code that munges data to produce a normalized format.

It contains several assumptions:

=over

=item *

format of fields is defined using C<v123^a> notation for repeatable fields
or C<s123^a> for single (or first) value, where C<123> is field number and
C<a> is subfield.

=item *

source data records (C<$rec>) have unique identifiers in field C<000>

=item *

optional C<eval{length('v123^a') == 3}> tag at B<beginning of format> will be
perl code that is evaluated before producing output (value of field will be
interpolated before that)

=item *

optional C<filter{filter_name}> at B<beginning of format> will apply perl
code defined as code ref to the format after field substitution, before
producing output

=item *

optional C<lookup{...}> will be then performed. See C<WebPAC::Lookups>.

=item *

at end, optional C<format>s rules are resolved. Format rules are similar to
C<sprintf> and can also contain C<lookup{...}> which is performed after
values are inserted in format.

=back

This also describes the order in which transformations are applied (eval,
filter, lookup, format), which is important to understand when deciding how
to solve your data munging and normalisation process.


=head1 FUNCTIONS

=head2 new

Create new normalisation object

  my $n = new WebPAC::Normalize::Something(
        filter => {
                'filter_name_1' => sub {
                        # filter code
                        return length($_);
                }, ...
        },
        cache_data_structure => './cache/ds/',
        lookup_regex => $lookup->regex,
  );

Parameter C<filter> defines user supplied snippets of perl code which can
be used with C<filter{...}> notation.

Optional parameter C<cache_data_structure> defines path to directory
in which cache file for C<data_structure> call will be created.

Recommended parameter C<lookup_regex> is used to enable parsing of lookups
in structures.

=cut

# Constructor. Stores the given key/value pairs verbatim in a hash blessed
# into the invoking class, then lets setup_cache_dir() validate (and possibly
# disable) the 'cache_data_structure' option.
#
# Returns the new object.
sub new {
        my $class = shift;
        my $self = {@_};
        bless($self, $class);

        # clears $self->{'cache_data_structure'} when the directory is unusable
        $self->setup_cache_dir( $self->{'cache_data_structure'} );

        # a blessed reference is always true, so there is no failure branch
        return $self;
}

=head2 setup_cache_dir

Check if the specified cache directory exists, and if not, disable caching.

 $n->setup_cache_dir('./cache/ds/');

If you pass a false or zero value to this function, it will disable
caching.

=cut

# Validate the cache directory passed as argument.
#
# A false $dir, or a $dir that does not exist, is not a directory or is not
# writable, clears $self->{'cache_data_structure'}. A usable directory is only
# logged; the stored option itself is left untouched.
sub setup_cache_dir {
        my ($self, $dir) = @_;

        my $log = $self->_get_logger();

        if (! $dir) {
                # nothing (or a false value) requested: turn caching off
                $log->debug("disabling cache");
                undef $self->{'cache_data_structure'};
        } else {
                # the first failing check decides the message; undef means usable
                my $msg =
                          !(-e $dir) ? "doesn't exist"
                        : !(-d $dir) ? "is not directory"
                        : !(-w $dir) ? "not writable"
                        :              undef;

                if ($msg) {
                        undef $self->{'cache_data_structure'};
                        $log->warn("cache_data_structure $dir $msg, disabling...");
                } else {
                        $log->debug("using cache dir $dir");
                }
        }
}


=head2 data_structure

Create in-memory data structure which represents normalized layout from
C<conf/normalize/*.xml>.

These structures are used to produce output.

 my @ds = $webpac->data_structure($rec);

B<Note: historical oddity follows>

This method will also set C<< $webpac->{'current_filename'} >> if there is a
C<< <filename> >> tag and C<< $webpac->{'headline'} >> if there is a
C<< <headline> >> tag.

=cut

# Build the normalized in-memory representation of one record.
#
# Argument: $rec, hash ref with the source record.
#
# Returns a list of rows (hash refs). Each row carries 'tag', 'name' and one
# array of values per output type ('display' and 'swish' unless the tag has a
# 'type' attribute).
#
# Side effects: sets $self->{'current_filename'} and $self->{'headline'} from
# the 'filename' and 'headline' fields, caches the sorted tag list in
# $self->{tags_by_order}, and (when caching is enabled) reads/writes a
# Storable file named after field 000 in the cache directory.
sub data_structure {
        my $self = shift;

        my $log = $self->_get_logger();

        my $rec = shift;
        $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);

        my $cache_file;

        if (my $cache_path = $self->{'cache_data_structure'}) {
                my $id = $rec->{'000'};
                # field 000 may be repeatable (array ref); only inspect it
                # when it is defined so a missing id doesn't trigger warnings
                $id = $rec->{'000'}->[0] if (defined($id) && $id =~ m/^ARRAY/o);
                unless (defined($id)) {
                        $log->warn("Can't use cache_data_structure on records without unique identifier in field 000");
                        undef $self->{'cache_data_structure'};
                } else {
                        $cache_file = "$cache_path/$id";
                        if (-r $cache_file) {
                                my $ds_ref = retrieve($cache_file);
                                if ($ds_ref) {
                                        $log->debug("cache hit: $cache_file");
                                        # cache entry is accepted only if both
                                        # filename and headline were stored
                                        my $ok = 1;
                                        foreach my $f (qw(current_filename headline)) {
                                                if ($ds_ref->{$f}) {
                                                        $self->{$f} = $ds_ref->{$f};
                                                } else {
                                                        $ok = 0;
                                                }
                                        };
                                        if ($ok && $ds_ref->{'ds'}) {
                                                return @{ $ds_ref->{'ds'} };
                                        } else {
                                                $log->warn("cache_data_structure $cache_path corrupt. Use rm $cache_path/* to re-create it on next run!");
                                                undef $self->{'cache_data_structure'};
                                        }
                                }
                        }
                }
        }

        # forget values left over from the previous record
        undef $self->{'current_filename'};
        undef $self->{'headline'};

        my @sorted_tags;
        if ($self->{tags_by_order}) {
                @sorted_tags = @{$self->{tags_by_order}};
        } else {
                @sorted_tags = sort { $self->_sort_by_order } keys %{$self->{'import_xml'}->{'indexer'}};
                $self->{tags_by_order} = \@sorted_tags;
        }

        my @ds;

        $log->debug("tags: ",sub { join(", ",@sorted_tags) });

        foreach my $field (@sorted_tags) {

                my $row;

#print "field $field [",$self->{'tag'},"] = ",Dumper($self->{'import_xml'}->{'indexer'}->{$field}->{$self->{'tag'}});

                foreach my $tag (@{$self->{'import_xml'}->{'indexer'}->{$field}->{$self->{'tag'}}}) {
                        my $format = $tag->{'value'} || $tag->{'content'};

                        $log->debug("format: $format");

                        # formats containing lookups can't be parsed "smartly",
                        # they are only filled in
                        my @v;
                        if ($self->{'lookup_regex'} && $format =~ $self->{'lookup_regex'}) {
                                @v = $self->fill_in_to_arr($rec,$format);
                        } else {
                                @v = $self->parse_to_arr($rec,$format);
                        }
                        next if (! @v);

                        if ($tag->{'sort'}) {
                                @v = $self->sort_arr(@v);
                        }

                        # use format?
                        if ($tag->{'format_name'}) {
                                @v = map { $self->apply_format($tag->{'format_name'},$tag->{'format_delimiter'},$_) } @v;
                        }

                        if ($field eq 'filename') {
                                $self->{'current_filename'} = join('',@v);
                                $log->debug("filename: ",$self->{'current_filename'});
                        } elsif ($field eq 'headline') {
                                $self->{'headline'} .= join('',@v);
                                $log->debug("headline: ",$self->{'headline'});
                                next; # don't return headline in data_structure!
                        }

                        # delimiter will join repeatable fields
                        if ($tag->{'delimiter'}) {
                                @v = ( join($tag->{'delimiter'}, @v) );
                        }

                        # default types 
                        my @types = qw(display swish);
                        # override by type attribute
                        @types = ( $tag->{'type'} ) if ($tag->{'type'});

                        foreach my $type (@types) {
                                # append to previous line?
                                $log->debug("type: $type ",sub { join(" ",@v) }, $tag->{'append'} || 'no append');
                                if ($tag->{'append'}) {

                                        # I will delimit appended part with
                                        # delimiter (or ,)
                                        my $d = $tag->{'delimiter'};
                                        # default delimiter
                                        $d ||= " ";

                                        my $last = pop @{$row->{$type}};
                                        # nothing to append to: no leading delimiter
                                        $d = "" if (! $last);
                                        $last .= $d . join($d, @v);
                                        push @{$row->{$type}}, $last;

                                } else {
                                        push @{$row->{$type}}, @v;
                                }
                        }


                }

                if ($row) {
                        $row->{'tag'} = $field;

                        # TODO: name_sigular, name_plural
                        my $name = $self->{'import_xml'}->{'indexer'}->{$field}->{'name'};
                        $row->{'name'} = $name ? $self->_x($name) : $field;

                        # post-sort all values in field
                        if ($self->{'import_xml'}->{'indexer'}->{$field}->{'sort'}) {
                                $log->warn("sort at field tag not implemented");
                        }

                        push @ds, $row;

                        $log->debug("row $field: ",sub { Dumper($row) });
                }

        }

        if ($cache_file) {
                store {
                        ds => \@ds,
                        current_filename => $self->{'current_filename'},
                        headline => $self->{'headline'},
                }, $cache_file;
                $log->debug("created storable cache file $cache_file");
        }

        return @ds;

}

=head2 parse

Perform smart parsing of string, skipping delimiters for fields which aren't
defined. It can also eval code in format starting with C<eval{...}> and
return output or nothing depending on eval code.

 my $text = $webpac->parse($rec,'eval{"v901^a" eq "Deskriptor"}descriptor: v250^a', $i);

=cut

# Expand one repetition of a format string against a record, dropping the
# delimiter text that precedes placeholders which have no data.
#
# Arguments:
#   $rec         - record (hash ref)
#   $format_utf8 - format with v123^a / s123^a placeholders, optionally
#                  prefixed by eval{...} and/or filter{...}
#   $i           - index of the repetition to use for v-fields (default 0);
#                  s-fields always use index 0
#
# Returns the expanded string, or nothing when no placeholder had data, when
# the eval{} code can't be filled in or evaluates false, or when the filter
# returns undef.
#
# NOTE(review): a format made only of s-fields yields a value for every $i,
# so callers looping over $i need their own stop condition - confirm.
sub parse {
        my $self = shift;

        my ($rec, $format_utf8, $i) = @_;

        return if (! $format_utf8);

        my $log = $self->_get_logger();

        $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);

        $i = 0 if (! $i);

        my $format = $self->_x($format_utf8) || $log->logconfess("can't convert '$format_utf8' from UTF-8 to ",$self->{'code_page'});

        my @out;

        $log->debug("format: $format");

        # literal braces are escaped: an unescaped "{" in a pattern is
        # deprecated and rejected by some newer perl versions
        my $eval_code;
        # remove eval{...} from beginning
        $eval_code = $1 if ($format =~ s/^eval\{([^}]+)\}//s);

        my $filter_name;
        # remove filter{...} from beginning
        $filter_name = $1 if ($format =~ s/^filter\{([^}]+)\}//s);

        my $prefix;
        my $all_found=0;

        # consume the format from the left: $1 is the text in front of the
        # placeholder, $2 its kind (v or s), $3 the field, $4 the subfield
        while ($format =~ s/^(.*?)(v|s)(\d+)(?:\^(\w))?//s) {

                my $del = $1 || '';
                # remember leading text until the first placeholder with data
                $prefix ||= $del if ($all_found == 0);

                # repeatable index
                my $r = $i;
                $r = 0 if ($2 eq 's');

                my $found = 0;
                my $tmp = $self->get_data(\$rec,$3,$4,$r,\$found);

                if ($found) {
                        push @out, $del;
                        push @out, $tmp;
                        $all_found += $found;
                }
        }

        return if (! $all_found);

        my $out = join('',@out);

        if ($out) {
                # add rest of format (suffix)
                $out .= $format;

                # add prefix if not there
                $out = $prefix . $out if ($out !~ m/^\Q$prefix\E/);

                $log->debug("result: $out");
        }

        if ($eval_code) {
                my $eval = $self->fill_in($rec,$eval_code,$i) || return;
                $log->debug("about to eval{$eval} format: $out");
                return if (! $self->_eval($eval));
        }
        
        if ($filter_name && $self->{'filter'}->{$filter_name}) {
                $log->debug("about to filter{$filter_name} format: $out");
                $out = $self->{'filter'}->{$filter_name}->($out);
                return unless(defined($out));
                $log->debug("filter result: $out");
        }

        return $out;
}

=head2 parse_to_arr

Similar to C<parse>, but returns array of all repeatable fields

 my @arr = $webpac->parse_to_arr($rec,'v250^a');

=cut

# Call parse() for repetition 0, 1, 2, ... of the format and collect the
# results until parse() returns a false value.
#
# Arguments: $rec (hash ref), $format_utf8 (format string).
# Returns the list of parsed values (empty when the format is false).
sub parse_to_arr {
        my ($self, $rec, $format_utf8) = @_;

        my $log = $self->_get_logger();

        if ($rec !~ /HASH/o) {
                $log->logconfess("need HASH as first argument!");
        }
        return unless ($format_utf8);

        my @arr;
        my $i = 0;

        # one parse() call per repetition; the first false result ends the scan
        for (;;) {
                my $v = $self->parse($rec,$format_utf8,$i++);
                last unless ($v);
                push @arr, $v;
        }

        if (@arr) {
                $log->debug("format '$format_utf8' returned ",--$i," elements: ", sub { join(" | ",@arr) });
        }

        return @arr;
}


=head2 fill_in

Workhorse of all: takes a record from the in-memory structure of the database
and a string with placeholders, and returns a string (or an array of strings)
with values from the record substituted.

 my $text = $webpac->fill_in($rec,'v250^a');

Optional argument is the ordinal number for repeatable fields. By default,
it's assumed to be the first repeatable field (fields are perl arrays, so the
first element is 0).
The following example will read the second value from a repeatable field.

 my $text = $webpac->fill_in($rec,'Title: v250^a',1);

This function B<does not> perform parsing of the format to intelligently skip
delimiters before fields which aren't used.

This method will automatically decode UTF-8 string to local code page
if needed.

=cut

# Substitute every v123^a / s123^a placeholder in a format with data from
# the record, without any delimiter skipping.
#
# Arguments:
#   $rec    - record (hash ref), required
#   $format - format string, required; may start with eval{...} and/or
#             filter{...}
#   $i      - repetition used for v-fields (default 0); s-fields use 0
#
# Returns the filled-in string (passed through $self->lookup() when
# $self->{'lookup'} is set), or nothing when no placeholder had data, when
# the eval{} code evaluates false, or when the filter returns undef.
# Dies through the logger when $i exceeds $self->{'max_mfn'} (default 9999).
sub fill_in {
        my $self = shift;

        my $log = $self->_get_logger();

        my $rec = shift || $log->logconfess("need data record");
        my $format = shift || $log->logconfess("need format to parse");
        # iteration (for repeatable fields)
        my $i = shift || 0;

        $log->logdie("infitite loop in format $format") if ($i > ($self->{'max_mfn'} || 9999));

        # FIXME remove for speedup?
        $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);

        if (utf8::is_utf8($format)) {
                $format = $self->_x($format);
        }

        my $found = 0;

        # literal braces are escaped: an unescaped "{" in a pattern is
        # deprecated and rejected by some newer perl versions
        my $eval_code;
        # remove eval{...} from beginning
        $eval_code = $1 if ($format =~ s/^eval\{([^}]+)\}//s);

        my $filter_name;
        # remove filter{...} from beginning
        $filter_name = $1 if ($format =~ s/^filter\{([^}]+)\}//s);

        # do actual replacement of placeholders in a single left-to-right
        # pass, so that text inserted from the record is never re-scanned
        # as a placeholder; v = repeatable (index $i), s = single (index 0)
        $format =~ s/([vs])(\d+)(?:\^(\w))?/$self->get_data(\$rec,$2,$3,($1 eq 's' ? 0 : $i),\$found)/ges;

        if ($found) {
                $log->debug("format: $format");
                if ($eval_code) {
                        my $eval = $self->fill_in($rec,$eval_code,$i);
                        return if (! $self->_eval($eval));
                }
                if ($filter_name && $self->{'filter'}->{$filter_name}) {
                        $log->debug("filter '$filter_name' for $format");
                        $format = $self->{'filter'}->{$filter_name}->($format);
                        return unless(defined($format));
                        $log->debug("filter result: $format");
                }
                # do we have lookups?
                if ($self->{'lookup'}) {
                        return $self->lookup($format);
                } else {
                        return $format;
                }
        } else {
                return;
        }
}


=head2 fill_in_to_arr

Similar to C<fill_in>, but returns an array of all repeatable fields. Usable
for fields which have lookups, so they shouldn't be parsed but rather
C<fill_in>ed.

 my @arr = $webpac->fill_in_to_arr($rec,'[v900];;[v250^a]');

=cut

# Call fill_in() for repetition 0, 1, 2, ... of the format and collect
# everything it returns until it returns an empty list.
#
# Arguments: $rec (hash ref), $format_utf8 (format string).
# Returns the list of filled-in values (empty when the format is false).
sub fill_in_to_arr {
        my ($self, $rec, $format_utf8) = @_;

        my $log = $self->_get_logger();

        if ($rec !~ /HASH/o) {
                $log->logconfess("need HASH as first argument!");
        }
        return unless ($format_utf8);

        my @arr;
        my $i = 0;

        # fill_in() is called in list context; an empty list ends the scan
        for (;;) {
                my @v = $self->fill_in($rec,$format_utf8,$i++);
                last unless (@v);
                push @arr, @v;
        }

        if (@arr) {
                $log->debug("format '$format_utf8' returned ",--$i," elements: ", sub { join(" | ",@arr) });
        }

        return @arr;
}


=head2 get_data

Returns value from record.

 my $text = $self->get_data(\$rec,$f,$sf,$i,\$found);

Arguments are:
record reference C<$rec>,
field C<$f>,
optional subfield C<$sf>,
index for repeatable values C<$i>.

Optional variable C<$found> will be incremented if the
field is present.

Returns value or empty string.

=cut

# Fetch one value from a record.
#
# Arguments:
#   $rec   - reference to the record hash ref
#   $f     - field name
#   $sf    - optional subfield
#   $i     - index of the repetition
#   $found - optional reference to a counter, incremented when data is found
#
# Returns:
#   - the subfield value when $sf is given and the repetition is a hash that
#     has a true value for it;
#   - '' (without touching $found) when the field, the repetition or the
#     requested subfield is missing;
#   - all subfield values, each followed by a space and ordered by subfield
#     name, when the repetition is a hash and no subfield was requested;
#   - the repetition itself when it is not a hash.
sub get_data {
        my $self = shift;

        my ($rec,$f,$sf,$i,$found) = @_;

        return '' if (! $$rec->{$f});

        my $value = $$rec->{$f}->[$i];
        return '' if (! $value);

        # $found is optional, so only count when a defined counter was passed
        my $count = (ref($found) && defined($$found));

        if (UNIVERSAL::isa($value, 'HASH')) {
                # length() rather than truth, so that subfield "0" is usable
                if (defined($sf) && length($sf)) {
                        # a requested subfield must exist; other subfields
                        # are never substituted for it
                        return '' if (! $value->{$sf});
                        $$found++ if ($count);
                        return $value->{$sf};
                }

                $$found++ if ($count);
                # it still might have subfields, just not specified, so
                # we'll dump all (sorted, to keep the output stable)
                return join('', map { $value->{$_} . " " } sort keys %{$value});
        }

        # repetition without subfields is returned whole
        $$found++ if ($count);
        return $value;
}


=head2 apply_format

Apply format specified in tag with C<format_name="name"> and
C<format_delimiter=";;">.

 my $text = $webpac->apply_format($format_name,$format_delimiter,$data);

Formats can contain C<lookup{...}> if you need them.

=cut

# Apply the sprintf-style format named $name (taken from
# $self->{'import_xml'}->{'format'}) to $data.
#
# Arguments:
#   $name      - format name
#   $delimiter - string on which $data is split into sprintf arguments
#   $data      - value to format
#
# Returns $data unchanged (after a warning) when the format is not defined;
# otherwise the formatted string, passed through $self->lookup() when it
# matches $self->{'lookup_regex'}. Without a delimiter a warning is logged
# and $data is used as the single sprintf argument.
sub apply_format {
        my $self = shift;

        my ($name,$delimiter,$data) = @_;

        my $log = $self->_get_logger();

        if (! $self->{'import_xml'}->{'format'}->{$name}) {
                $log->warn("<format name=\"$name\"> is not defined in ",$self->{'import_xml_file'});
                return $data;
        }

        my $has_delimiter = (defined($delimiter) && length($delimiter));
        $log->warn("no delimiter for format $name") if (! $has_delimiter);

        my $format = $self->_x($self->{'import_xml'}->{'format'}->{$name}->{'content'}) || $log->logdie("can't find format '$name'");

        # splitting on an empty pattern would break $data into single
        # characters, so without a delimiter the value is kept whole
        my @data = $has_delimiter ? split(/\Q$delimiter\E/, $data) : ($data);

        my $out = sprintf($format, @data);
        $log->debug("using format $name [$format] on $data to produce: $out");

        if ($self->{'lookup_regex'} && $out =~ $self->{'lookup_regex'}) {
                return $self->lookup($out);
        } else {
                return $out;
        }

}

=head2 sort_arr

Sort array ignoring case and html in data

 my @sorted = $webpac->sort_arr(@unsorted);

=cut

# Sort values case-insensitively, ignoring HTML tags, in descending order
# (the comparison is "$b cmp $a", as it always was).
#
# The values themselves are returned untouched: tags are removed only from
# the private sort keys, never from the caller's data.
#
# NOTE(review): descending order is kept from the original implementation;
# confirm it is what callers expect.
sub sort_arr {
        my $self = shift;

        my $log = $self->_get_logger();

        # Schwartzian transform: compute each key once, sort on the keys,
        # then hand back the original values
        my @sorted =
                map  { $_->[1] }
                sort { $b->[0] cmp $a->[0] }
                map  {
                        my $key = $_;
                        $key =~ s#<[^>]+/*>##g;
                        [ lc($key), $_ ]
                } @_;

        $log->debug("sorted values: ",sub { join(", ",@sorted) });

        return @sorted;
}


=head1 INTERNAL METHODS

=head2 _sort_by_order

Sort xml tags data structure according to C<order=""> attribute.

=cut

# Comparison routine for sort(): compares the tags held in the package
# variables $a and $b by the 'order' value of their indexer entries.
# An entry without a true 'order' is compared by the entry itself.
#
# Returns the result of the numeric <=> comparison.
sub _sort_by_order {
        my ($self) = @_;

        my $indexer = ($self->{'import_xml'}->{'indexer'} ||= {});

        my ($va, $vb) = map {
                $indexer->{$_}->{'order'} || $indexer->{$_}
        } ($a, $b);

        return $va <=> $vb;
}

=head2 _x

Convert strings from C<conf/normalize/*.xml> encoding into application
specific encoding (optionally specified using C<code_page> to C<new>
constructor).

 my $text = $n->_x('normalize text string');

This is a stub so that other modules don't have to implement it.

=cut

# Encoding-conversion hook. This stub performs no conversion and hands its
# first argument back unchanged.
sub _x {
        my ($self, $text) = @_;
        return $text;
}


=head1 AUTHOR

Dobrica Pavlinusic, C<< <dpavlin@rot13.org> >>

=head1 COPYRIGHT & LICENSE

Copyright 2005 Dobrica Pavlinusic, All Rights Reserved.

This program is free software; you can redistribute it and/or modify it
under the same terms as Perl itself.

=cut

1; # End of WebPAC::Normalize
1	dpavlin	10	package WebPAC::Normalize;
2
3			use warnings;
4			use strict;
5	dpavlin	13	use Data::Dumper;
6	dpavlin	14	use Storable;
7	dpavlin	10
8			=head1 NAME
9
10	dpavlin	15	WebPAC::Normalize - data mungling for normalisation
11	dpavlin	10
12			=head1 VERSION
13
14			Version 0.01
15
16			=cut
17
18			our $VERSION = '0.01';
19
20			=head1 SYNOPSIS
21
22	dpavlin	15	This package contains code that mungle data to produce normalized format.
23	dpavlin	10
24	dpavlin	15	It contains several assumptions:
25
26			=over
27
28			=item *
29
30			format of fields is defined using C<v123^a> notation for repeatable fields
31			or C<s123^a> for single (or first) value, where C<123> is field number and
32			C<a> is subfield.
33
34			=item *
35
36			source data records (C<$rec>) have unique identifiers in field C<000>
37
38			=item *
39
40			optional C<eval{length('v123^a') == 3}> tag at B<beginning of format> will be
41			perl code that is evaluated before producing output (value of field will be
42			interpolated before that)
43
44			=item *
45
46			optional C<filter{filter_name}> at B<begining of format> will apply perl
47			code defined as code ref on format after field substitution to producing
48			output
49
50			=item *
51
52			optional C<lookup{...}> will be then performed. See C<WebPAC::Lookups>.
53
54			=item *
55
56			at end, optional C<format>s rules are resolved. Format rules are similar to
57			C<sprintf> and can also contain C<lookup{...}> which is performed after
58			values are inserted in format.
59
60			=back
61
62			This also describes order in which transformations are applied (eval,
63			filter, lookup, format) which is important to undestand when deciding how to
64			solve your data mungling and normalisation process.
65
66
67
68
69	dpavlin	10	=head1 FUNCTIONS
70
71	dpavlin	13	=head2 new
72	dpavlin	10
73	dpavlin	13	Create new normalisation object
74
75			my $n = new WebPAC::Normalize::Something(
76	dpavlin	15	filter => {
77			'filter_name_1' => sub {
78			# filter code
79			return length($_);
80			}, ...
81			},
82	dpavlin	13	cache_data_structure => './cache/ds/',
83			lookup_regex => $lookup->regex,
84			);
85
86	dpavlin	15	Parametar C<filter> defines user supplied snippets of perl code which can
87			be use with C<filter{...}> notation.
88
89	dpavlin	13	Optional parameter C<cache_data_structure> defines path to directory
90			in which cache file for C<data_structure> call will be created.
91
92			Recommended parametar C<lookup_regex> is used to enable parsing of lookups
93			in structures.
94
95	dpavlin	10	=cut
96
97	dpavlin	13	sub new {
98			my $class = shift;
99			my $self = {@_};
100			bless($self, $class);
101
102			$self->setup_cache_dir( $self->{'cache_data_structure'} );
103
104			$self ? return $self : return undef;
105	dpavlin	10	}
106
107	dpavlin	13	=head2 setup_cache_dir
108
109			Check if specified cache directory exist, and if not, disable caching.
110
111			$setup_cache_dir('./cache/ds/');
112
113			If you pass false or zero value to this function, it will disable
114			cacheing.
115
116			=cut
117
118			sub setup_cache_dir {
119			my $self = shift;
120
121			my $dir = shift;
122
123			my $log = $self->_get_logger();
124
125			if ($dir) {
126			my $msg;
127			if (! -e $dir) {
128			$msg = "doesn't exist";
129			} elsif (! -d $dir) {
130			$msg = "is not directory";
131			} elsif (! -w $dir) {
132			$msg = "not writable";
133			}
134
135			if ($msg) {
136			undef $self->{'cache_data_structure'};
137			$log->warn("cache_data_structure $dir $msg, disabling...");
138			} else {
139			$log->debug("using cache dir $dir");
140			}
141			} else {
142			$log->debug("disabling cache");
143			undef $self->{'cache_data_structure'};
144			}
145			}
146
147
148			=head2 data_structure
149
150			Create in-memory data structure which represents normalized layout from
151			C<conf/normalize/*.xml>.
152
153			This structures are used to produce output.
154
155			my @ds = $webpac->data_structure($rec);
156
157			B<Note: historical oddity follows>
158
159			This method will also set C<< $webpac->{'currnet_filename'} >> if there is
160			C<< <filename> >> tag and C<< $webpac->{'headline'} >> if there is
161			C<< <headline> >> tag.
162
163			=cut
164
165			sub data_structure {
166			my $self = shift;
167
168			my $log = $self->_get_logger();
169
170			my $rec = shift;
171			$log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
172
173			my $cache_file;
174
175			if (my $cache_path = $self->{'cache_data_structure'}) {
176			my $id = $rec->{'000'};
177			$id = $rec->{'000'}->[0] if ($id =~ m/^ARRAY/o);
178			unless (defined($id)) {
179			$log->warn("Can't use cache_data_structure on records without unique identifier in field 000");
180			undef $self->{'cache_data_structure'};
181			} else {
182			$cache_file = "$cache_path/$id";
183			if (-r $cache_file) {
184			my $ds_ref = retrieve($cache_file);
185			if ($ds_ref) {
186			$log->debug("cache hit: $cache_file");
187			my $ok = 1;
188			foreach my $f (qw(current_filename headline)) {
189			if ($ds_ref->{$f}) {
190			$self->{$f} = $ds_ref->{$f};
191			} else {
192			$ok = 0;
193			}
194			};
195			if ($ok && $ds_ref->{'ds'}) {
196			return @{ $ds_ref->{'ds'} };
197			} else {
198			$log->warn("cache_data_structure $cache_path corrupt. Use rm $cache_path/* to re-create it on next run!");
199			undef $self->{'cache_data_structure'};
200			}
201			}
202			}
203			}
204			}
205
206			undef $self->{'currnet_filename'};
207			undef $self->{'headline'};
208
209			my @sorted_tags;
210			if ($self->{tags_by_order}) {
211			@sorted_tags = @{$self->{tags_by_order}};
212			} else {
213			@sorted_tags = sort { $self->_sort_by_order } keys %{$self->{'import_xml'}->{'indexer'}};
214			$self->{tags_by_order} = \@sorted_tags;
215			}
216
217			my @ds;
218
219			$log->debug("tags: ",sub { join(", ",@sorted_tags) });
220
221			foreach my $field (@sorted_tags) {
222
223			my $row;
224
225			#print "field $field [",$self->{'tag'},"] = ",Dumper($self->{'import_xml'}->{'indexer'}->{$field}->{$self->{'tag'}});
226
227			foreach my $tag (@{$self->{'import_xml'}->{'indexer'}->{$field}->{$self->{'tag'}}}) {
228			my $format = $tag->{'value'} \|\| $tag->{'content'};
229
230			$log->debug("format: $format");
231
232			my @v;
233			if ($self->{'lookup_regex'} && $format =~ $self->{'lookup_regex'}) {
234			@v = $self->fill_in_to_arr($rec,$format);
235			} else {
236			@v = $self->parse_to_arr($rec,$format);
237			}
238			next if (! @v);
239
240			if ($tag->{'sort'}) {
241			@v = $self->sort_arr(@v);
242			}
243
244			# use format?
245			if ($tag->{'format_name'}) {
246			@v = map { $self->apply_format($tag->{'format_name'},$tag->{'format_delimiter'},$_) } @v;
247			}
248
249			if ($field eq 'filename') {
250			$self->{'current_filename'} = join('',@v);
251			$log->debug("filename: ",$self->{'current_filename'});
252			} elsif ($field eq 'headline') {
253			$self->{'headline'} .= join('',@v);
254			$log->debug("headline: ",$self->{'headline'});
255			next; # don't return headline in data_structure!
256			}
257
258			# delimiter will join repeatable fields
259			if ($tag->{'delimiter'}) {
260			@v = ( join($tag->{'delimiter'}, @v) );
261			}
262
263			# default types
264			my @types = qw(display swish);
265			# override by type attribute
266			@types = ( $tag->{'type'} ) if ($tag->{'type'});
267
268			foreach my $type (@types) {
269			# append to previous line?
270			$log->debug("type: $type ",sub { join(" ",@v) }, $row->{'append'} \|\| 'no append');
271			if ($tag->{'append'}) {
272
273			# I will delimit appended part with
274			# delimiter (or ,)
275			my $d = $tag->{'delimiter'};
276			# default delimiter
277			$d \|\|= " ";
278
279			my $last = pop @{$row->{$type}};
280			$d = "" if (! $last);
281			$last .= $d . join($d, @v);
282			push @{$row->{$type}}, $last;
283
284			} else {
285			push @{$row->{$type}}, @v;
286			}
287			}
288
289
290			}
291
292			if ($row) {
293			$row->{'tag'} = $field;
294
295			# TODO: name_sigular, name_plural
296			my $name = $self->{'import_xml'}->{'indexer'}->{$field}->{'name'};
297			$row->{'name'} = $name ? $self->_x($name) : $field;
298
299			# post-sort all values in field
300			if ($self->{'import_xml'}->{'indexer'}->{$field}->{'sort'}) {
301			$log->warn("sort at field tag not implemented");
302			}
303
304			push @ds, $row;
305
306			$log->debug("row $field: ",sub { Dumper($row) });
307			}
308
309			}
310
311			if ($cache_file) {
312			store {
313			ds => \@ds,
314			current_filename => $self->{'current_filename'},
315			headline => $self->{'headline'},
316			}, $cache_file;
317			$log->debug("created storable cache file $cache_file");
318			}
319
320			return @ds;
321
322			}
323
=head2 parse

Perform smart parsing of string, skipping delimiters for fields which aren't
defined. It can also eval code in format starting with C<eval{...}> and
return output or nothing depending on eval code.

  my $text = $webpac->parse($rec,'eval{"v901^a" eq "Deskriptor"}descriptor: v250^a', $i);

Arguments are record (hash reference) C<$rec>, format string and optional
index C<$i> of repeatable value (defaults to 0). Placeholders written as
C<s123^a> always use index 0.

Returns nothing if format is empty, if none of the fields in format are
found, if C<eval{...}> code evaluates to false or if filter returns
undefined value.

=cut

sub parse {
	my $self = shift;

	my ($rec, $format_utf8, $i) = @_;

	return if (! $format_utf8);

	my $log = $self->_get_logger();

	$log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/);

	$i = 0 if (! $i);

	my $format = $self->_x($format_utf8) || $log->logconfess("can't convert '$format_utf8' from UTF-8 to ",$self->{'code_page'});

	my @out;

	$log->debug("format: $format");

	# Literal braces are escaped in patterns below: unescaped "{" is
	# deprecated (and fatal in some positions) on newer perls.

	my $eval_code;
	# remove eval{...} from beginning
	$eval_code = $1 if ($format =~ s/^eval\{([^}]+)\}//s);

	my $filter_name;
	# remove filter{...} from beginning
	$filter_name = $1 if ($format =~ s/^filter\{([^}]+)\}//s);

	my $prefix;
	my $all_found = 0;

	# consume format from the left: delimiter text followed by placeholder
	while ($format =~ s/^(.*?)(v|s)(\d+)(?:\^(\w))?//s) {

		# copy captures into lexicals before calling anything else,
		# so that later pattern matches can't change them
		my ($del, $type, $field, $subfield) = ($1 || '', $2, $3, $4);

		# first non-empty delimiter seen before any value is the prefix
		$prefix ||= $del if ($all_found == 0);

		# repeatable index (s123 always takes first value)
		my $r = (lc($type) eq 's') ? 0 : $i;

		my $found = 0;
		my $tmp = $self->get_data(\$rec,$field,$subfield,$r,\$found);

		# delimiter is kept only if field which follows it has value
		if ($found) {
			push @out, $del, $tmp;
			$all_found += $found;
		}
	}

	return if (! $all_found);

	my $out = join('',@out);

	if ($out) {
		# add rest of format (suffix)
		$out .= $format;

		# add prefix if not there
		$out = $prefix . $out if ($out !~ m/^\Q$prefix\E/);

		$log->debug("result: $out");
	}

	if ($eval_code) {
		# placeholders in eval code are filled in before evaluation
		my $eval = $self->fill_in($rec,$eval_code,$i) || return;
		$log->debug("about to eval{$eval} format: $out");
		return if (! $self->_eval($eval));
	}

	if ($filter_name && $self->{'filter'}->{$filter_name}) {
		$log->debug("about to filter{$filter_name} format: $out");
		$out = $self->{'filter'}->{$filter_name}->($out);
		return unless(defined($out));
		$log->debug("filter result: $out");
	}

	return $out;
}
412
=head2 parse_to_arr

Similar to C<parse>, but returns array of all repeatable fields

  my @arr = $webpac->parse_to_arr($rec,'v250^a');

C<parse> is called with increasing index (starting from 0) until it
returns false value; all values returned before that are collected.

=cut

sub parse_to_arr {
	my ($self, $rec, $format_utf8) = @_;

	my $log = $self->_get_logger();

	$log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
	return unless ($format_utf8);

	my @arr;
	my $i = 0;

	while (1) {
		my $v = $self->parse($rec, $format_utf8, $i++);
		last unless ($v);
		push @arr, $v;
	}

	# index was incremented once more by call which came back empty
	my $count = $i - 1;

	$log->debug("format '$format_utf8' returned ",$count," elements: ", sub { join(" | ",@arr) }) if (@arr);

	return @arr;
}
442
443	dpavlin	15
=head2 fill_in

Workhorse of all: takes record from in-memory structure of database and
string with placeholders and returns string (or whatever C<lookup> returns
if lookups are defined) with substituted values from record.

  my $text = $webpac->fill_in($rec,'v250^a');

Optional argument is ordinal number for repeatable fields. By default,
it's assumed to be first repeatable field (fields are perl array, so first
element is 0).
Following example will read second value from repeatable field.

  my $text = $webpac->fill_in($rec,'Title: v250^a',1);

This function B<does not> perform parsing of format to intelligently skip
delimiters before fields which aren't used.

This method will automatically decode UTF-8 string to local code page
if needed.

Returns nothing if none of the placeholders has value, if C<eval{...}>
code evaluates to false or if filter returns undefined value.

=cut

sub fill_in {
	my $self = shift;

	my $log = $self->_get_logger();

	my $rec = shift || $log->logconfess("need data record");
	my $format = shift || $log->logconfess("need format to parse");
	# iteration (for repeatable fields)
	my $i = shift || 0;

	$log->logdie("infitite loop in format $format") if ($i > ($self->{'max_mfn'} || 9999));

	# FIXME remove for speedup?
	$log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/);

	if (utf8::is_utf8($format)) {
		$format = $self->_x($format);
	}

	my $found = 0;

	# Literal braces are escaped in patterns below: unescaped "{" is
	# deprecated (and fatal in some positions) on newer perls.

	my $eval_code;
	# remove eval{...} from beginning
	$eval_code = $1 if ($format =~ s/^eval\{([^}]+)\}//s);

	my $filter_name;
	# remove filter{...} from beginning
	$filter_name = $1 if ($format =~ s/^filter\{([^}]+)\}//s);

	# do actual replacement of placeholders in single pass, so that
	# values which were just inserted (e.g. "class123") are never
	# scanned for placeholders again
	$format =~ s{([vs])(\d+)(?:\^(\w))?}{
		my ($type, $field, $subfield) = ($1, $2, $3);
		# v123 is repeatable (index $i), s123 is always first value
		$self->get_data(\$rec,$field,$subfield,($type eq 'v' ? $i : 0),\$found);
	}ges;

	return if (! $found);

	$log->debug("format: $format");

	if ($eval_code) {
		# same as in parse: nothing to evaluate means no output
		my $eval = $self->fill_in($rec,$eval_code,$i) || return;
		return if (! $self->_eval($eval));
	}

	if ($filter_name && $self->{'filter'}->{$filter_name}) {
		$log->debug("filter '$filter_name' for $format");
		$format = $self->{'filter'}->{$filter_name}->($format);
		return unless(defined($format));
		$log->debug("filter result: $format");
	}

	# do we have lookups?
	if ($self->{'lookup'}) {
		return $self->lookup($format);
	}

	return $format;
}
524
525
=head2 fill_in_to_arr

Similar to C<fill_in>, but returns array of all repeatable fields. Usable
for fields which have lookups, so they shouldn't be parsed but rather
C<fill_in>ed.

  my @arr = $webpac->fill_in_to_arr($rec,'[v900];;[v250^a]');

C<fill_in> is called with increasing index (starting from 0) until it
returns empty list; everything returned before that is collected.

=cut

sub fill_in_to_arr {
	my ($self, $rec, $format_utf8) = @_;

	my $log = $self->_get_logger();

	$log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
	return unless ($format_utf8);

	my @arr;
	my $i = 0;

	while (1) {
		my @v = $self->fill_in($rec, $format_utf8, $i++);
		last unless (@v);
		push @arr, @v;
	}

	# index was incremented once more by call which came back empty
	my $count = $i - 1;

	$log->debug("format '$format_utf8' returned ",$count," elements: ", sub { join(" | ",@arr) }) if (@arr);

	return @arr;
}
557
558	dpavlin	15
=head2 get_data

Returns value from record.

  my $text = $self->get_data(\$rec,$f,$sf,$i,\$found);

Arguments are:
record reference C<$rec>,
field C<$f>,
optional subfield C<$sf>,
index for repeatable values C<$i>.

Optional variable C<$found> will be incremented if there
is field (and subfield, if one was requested) with true value.

If subfield is not specified and value has subfields, all of them are
returned, ordered by subfield name, each followed by single space.

Returns value or empty string.

=cut

sub get_data {
	my $self = shift;

	my ($rec,$f,$sf,$i,$found) = @_;

	# no such field in record
	return '' if (! $$rec->{$f});

	# no such (or empty) repetition of field
	my $val = $$rec->{$f}->[$i];
	return '' if (! $val);

	# real reference check: stringified match on /HASH/ would also
	# accept plain strings which just contain "HASH"
	my $is_hash = (ref($val) && UNIVERSAL::isa($val, 'HASH')) ? 1 : 0;

	if (defined($sf) && $sf ne '') {
		# specific subfield requested: missing one is "not found",
		# never fall back to dumping other subfields
		return '' if (! $is_hash || ! $val->{$sf});
		$$found++ if (ref($found) && defined($$found));
		return $val->{$sf};
	}

	$$found++ if (ref($found) && defined($$found));

	# field without subfields
	return $val if (! $is_hash);

	# it still might have subfields, just not specified, so we'll
	# dump all of them (sorted, so that output is repeatable)
	my $out = '';
	foreach my $k (sort keys %{$val}) {
		$out .= $val->{$k} . " ";
	}
	return $out;
}
607
608
=head2 apply_format

Apply format specified in tag with C<format_name="name"> and
C<format_delimiter=";;">.

  my $text = $webpac->apply_format($format_name,$format_delimiter,$data);

Data is split on delimiter and resulting values are used as arguments to
C<sprintf> with content of named format. If delimiter isn't specified,
warning is logged and whole data is used as single value.

If format with that name isn't defined, warning is logged and data is
returned unchanged.

Formats can contain C<lookup{...}> if you need them.

=cut

sub apply_format {
	my $self = shift;

	my ($name,$delimiter,$data) = @_;

	my $log = $self->_get_logger();

	if (! $self->{'import_xml'}->{'format'}->{$name}) {
		$log->warn("<format name=\"$name\"> is not defined in ",$self->{'import_xml_file'});
		return $data;
	}

	my $have_delimiter = (defined($delimiter) && $delimiter ne '') ? 1 : 0;

	$log->warn("no delimiter for format $name") if (! $have_delimiter);

	my $format = $self->_x($self->{'import_xml'}->{'format'}->{$name}->{'content'}) || $log->logdie("can't find format '$name'");

	# split with empty pattern would break data into single characters,
	# so without delimiter data is passed as one value
	my @data = $have_delimiter
		? split(/\Q$delimiter\E/, $data)
		: ($data);

	my $out = sprintf($format, @data);
	$log->debug("using format $name [$format] on $data to produce: $out");

	# resolve lookups only if result looks like it has them
	if ($self->{'lookup_regex'} && $out =~ $self->{'lookup_regex'}) {
		return $self->lookup($out);
	}

	return $out;
}
648
=head2 sort_arr

Sort array ignoring case and html in data

  my @sorted = $webpac->sort_arr(@unsorted);

Values are compared as lower-case strings with all html tags removed and
returned in descending order. Html and case are ignored only for
comparison: returned values (and values passed in) are left unchanged.

=cut

sub sort_arr {
	my $self = shift;

	my $log = $self->_get_logger();

	# Sort on computed keys: sort block must not modify $a and $b,
	# because they are aliases to caller's data (and modifying
	# read-only value is fatal error).
	my @sorted =
		map  { $_->[1] }
		sort { $b->[0] cmp $a->[0] }
		map  {
			my $key = $_;
			# remove all tags, not just first one
			$key =~ s#<[^>]+/*>##g;
			[ lc($key), $_ ]
		} @_;

	$log->debug("sorted values: ",sub { join(", ",@sorted) });

	return @sorted;
}
673
674
675	dpavlin	15	=head1 INTERNAL METHODS
676
=head2 _sort_by_order

Sort xml tags data structure according to C<order=""> attribute.

Comparison function which is meant to be called from within C<sort>
block: it compares tags named in C<$a> and C<$b>. Tag definition can be
hash with C<order> key or plain value which is then used as order
itself. Tags without order are compared as 0.

=cut

sub _sort_by_order {
	my $self = shift;

	# don't create missing parts of configuration while sorting
	my $import_xml = $self->{'import_xml'} || {};
	my $indexer = $import_xml->{'indexer'} || {};

	my ($va, $vb) = map {
		my $tag = $indexer->{$_};
		# dereference only real hashes: plain value is order itself
		my $order = (ref($tag) eq 'HASH') ? $tag->{'order'} : $tag;
		$order || 0;
	} ($a, $b);

	return $va <=> $vb;
}
693
=head2 _x

Convert strings from C<conf/normalize/*.xml> encoding into application
specific encoding (optionally specified using C<code_page> to C<new>
constructor).

  my $text = $n->_x('normalize text string');

This is a stub (it returns its argument unchanged) so that other modules
don't have to implement it.

=cut

sub _x {
	my ($self, $text) = @_;
	return $text;
}
710
711
712	dpavlin	10	=head1 AUTHOR
713
714			Dobrica Pavlinusic, C<< <dpavlin@rot13.org> >>
715
716			=head1 COPYRIGHT & LICENSE
717
718			Copyright 2005 Dobrica Pavlinusic, All Rights Reserved.
719
720			This program is free software; you can redistribute it and/or modify it
721			under the same terms as Perl itself.
722
723			=cut
724
1; # End of WebPAC::Normalize