# lib/WebPAC/Normalize.pm

package WebPAC::Normalize;

use warnings;
use strict;
use Data::Dumper;
use Storable;

=head1 NAME

WebPAC::Normalize - normalisation of source file

=head1 VERSION

Version 0.01

=cut

our $VERSION = '0.01';

=head1 SYNOPSIS

This package contains code that could be helpful in implementing different
normalisation front-ends.

=head1 FUNCTIONS

=head2 new

Create new normalisation object

  my $n = WebPAC::Normalize::Something->new(
        cache_data_structure => './cache/ds/',
        lookup_regex => $lookup->regex,
  );

Optional parameter C<cache_data_structure> defines path to directory
in which cache file for C<data_structure> call will be created.

Recommended parameter C<lookup_regex> is used to enable parsing of lookups
in structures.

=cut

sub new {
        my ($class, @args) = @_;

        # All constructor arguments are stored verbatim as object attributes.
        my $self = bless {@args}, $class;

        # Validate the cache directory (this may disable caching again).
        $self->setup_cache_dir( $self->{'cache_data_structure'} );

        return $self;
}

=head2 setup_cache_dir

Check if specified cache directory exists, and if not, disable caching.
Caching is also disabled when the path is not a directory or is not writable.

 $n->setup_cache_dir('./cache/ds/');

If you pass false or zero value to this function, it will disable
caching.

=cut

sub setup_cache_dir {
        my ($self, $dir) = @_;

        my $log = $self->_get_logger();

        if (! $dir) {
                # false, zero or missing value: caching is switched off
                $log->debug("disabling cache");
                undef $self->{'cache_data_structure'};
        } else {
                # first failing file test (in this order) names the problem
                my $problem =
                          (! -e $dir) ? "doesn't exist"
                        : (! -d $dir) ? "is not directory"
                        : (! -w $dir) ? "not writable"
                        :               undef;

                if (defined $problem) {
                        undef $self->{'cache_data_structure'};
                        $log->warn("cache_data_structure $dir $problem, disabling...");
                } else {
                        $log->debug("using cache dir $dir");
                }
        }
}


=head2 data_structure

Create in-memory data structure which represents normalized layout from
C<conf/normalize/*.xml>.

These structures are used to produce output.

 my @ds = $webpac->data_structure($rec);

B<Note: historical oddity follows>

This method will also set C<< $webpac->{'current_filename'} >> if there is
C<< <filename> >> tag and C<< $webpac->{'headline'} >> if there is
C<< <headline> >> tag.

=cut

sub data_structure {
        my $self = shift;

        my $log = $self->_get_logger();

        my $rec = shift;
        $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);

        my $cache_file;

        # Try to satisfy the request from the on-disk Storable cache first.
        if (my $cache_path = $self->{'cache_data_structure'}) {
                my $id = $rec->{'000'};
                # Field 000 may be stored as an array ref; use its first value.
                # ref() is safe on undef, a regex match on $id would warn.
                $id = $id->[0] if (ref($id) eq 'ARRAY');
                unless (defined($id)) {
                        $log->warn("Can't use cache_data_structure on records without unique identifier in field 000");
                        undef $self->{'cache_data_structure'};
                } else {
                        $cache_file = "$cache_path/$id";
                        if (-r $cache_file) {
                                my $ds_ref = retrieve($cache_file);
                                if ($ds_ref) {
                                        $log->debug("cache hit: $cache_file");
                                        my $ok = 1;
                                        # NOTE(review): a cached record is only accepted when BOTH
                                        # current_filename and headline are true; records that
                                        # legitimately lack one of them are reported as corrupt
                                        # -- confirm this is intended.
                                        foreach my $f (qw(current_filename headline)) {
                                                if ($ds_ref->{$f}) {
                                                        $self->{$f} = $ds_ref->{$f};
                                                } else {
                                                        $ok = 0;
                                                }
                                        };
                                        if ($ok && $ds_ref->{'ds'}) {
                                                return @{ $ds_ref->{'ds'} };
                                        } else {
                                                $log->warn("cache_data_structure $cache_path corrupt. Use rm $cache_path/* to re-create it on next run!");
                                                undef $self->{'cache_data_structure'};
                                        }
                                }
                        }
                }
        }

        # Reset per-record state so values from a previous record (or from a
        # rejected cache entry) cannot leak into this one.
        undef $self->{'current_filename'};
        undef $self->{'headline'};

        # Field order is computed once and memoized in tags_by_order.
        my @sorted_tags;
        if ($self->{tags_by_order}) {
                @sorted_tags = @{$self->{tags_by_order}};
        } else {
                @sorted_tags = sort { $self->_sort_by_order } keys %{$self->{'import_xml'}->{'indexer'}};
                $self->{tags_by_order} = \@sorted_tags;
        }

        my @ds;

        $log->debug("tags: ",sub { join(", ",@sorted_tags) });

        foreach my $field (@sorted_tags) {

                my $row;

#print "field $field [",$self->{'tag'},"] = ",Dumper($self->{'import_xml'}->{'indexer'}->{$field}->{$self->{'tag'}});

                foreach my $tag (@{$self->{'import_xml'}->{'indexer'}->{$field}->{$self->{'tag'}}}) {
                        my $format = $tag->{'value'} || $tag->{'content'};

                        $log->debug("format: $format");

                        # formats containing lookups are filled in, all others are parsed
                        my @v;
                        if ($self->{'lookup_regex'} && $format =~ $self->{'lookup_regex'}) {
                                @v = $self->fill_in_to_arr($rec,$format);
                        } else {
                                @v = $self->parse_to_arr($rec,$format);
                        }
                        next if (! @v);

                        if ($tag->{'sort'}) {
                                @v = $self->sort_arr(@v);
                        }

                        # use format?
                        if ($tag->{'format_name'}) {
                                @v = map { $self->apply_format($tag->{'format_name'},$tag->{'format_delimiter'},$_) } @v;
                        }

                        if ($field eq 'filename') {
                                $self->{'current_filename'} = join('',@v);
                                $log->debug("filename: ",$self->{'current_filename'});
                        } elsif ($field eq 'headline') {
                                $self->{'headline'} .= join('',@v);
                                $log->debug("headline: ",$self->{'headline'});
                                next; # don't return headline in data_structure!
                        }

                        # delimiter will join repeatable fields
                        if ($tag->{'delimiter'}) {
                                @v = ( join($tag->{'delimiter'}, @v) );
                        }

                        # default types 
                        my @types = qw(display swish);
                        # override by type attribute
                        @types = ( $tag->{'type'} ) if ($tag->{'type'});

                        foreach my $type (@types) {
                                # append to previous line?
                                $log->debug("type: $type ",sub { join(" ",@v) }, $tag->{'append'} || 'no append');
                                if ($tag->{'append'}) {

                                        # I will delimit appended part with
                                        # delimiter (or ,)
                                        my $d = $tag->{'delimiter'};
                                        # default delimiter
                                        $d ||= " ";

                                        my $last = pop @{$row->{$type}};
                                        $d = "" if (! $last);
                                        $last .= $d . join($d, @v);
                                        push @{$row->{$type}}, $last;

                                } else {
                                        push @{$row->{$type}}, @v;
                                }
                        }


                }

                if ($row) {
                        $row->{'tag'} = $field;

                        # TODO: name_sigular, name_plural
                        my $name = $self->{'import_xml'}->{'indexer'}->{$field}->{'name'};
                        $row->{'name'} = $name ? $self->_x($name) : $field;

                        # post-sort all values in field
                        if ($self->{'import_xml'}->{'indexer'}->{$field}->{'sort'}) {
                                $log->warn("sort at field tag not implemented");
                        }

                        push @ds, $row;

                        $log->debug("row $field: ",sub { Dumper($row) });
                }

        }

        # Persist the freshly built structure together with the per-record
        # side values so that a later cache hit can restore them as well.
        if ($cache_file) {
                store({
                        ds => \@ds,
                        current_filename => $self->{'current_filename'},
                        headline => $self->{'headline'},
                }, $cache_file);
                $log->debug("created storable cache file $cache_file");
        }

        return @ds;

}

=head2 apply_format

Apply format specified in tag with C<format_name="name"> and
C<format_delimiter=";;">.

 my $text = $webpac->apply_format($format_name,$format_delimiter,$data);

Formats can contain C<lookup{...}> if you need them.

=cut

sub apply_format {
        my $self = shift;

        my ($name,$delimiter,$data) = @_;

        my $log = $self->_get_logger();

        # Unknown format name: warn and hand the data back unchanged.
        if (! $self->{'import_xml'}->{'format'}->{$name}) {
                $log->warn("<format name=\"$name\"> is not defined in ",$self->{'import_xml_file'});
                return $data;
        }

        my $has_delimiter = (defined($delimiter) && length($delimiter)) ? 1 : 0;
        $log->warn("no delimiter for format $name") if (! $has_delimiter);

        my $format = $self->_x($self->{'import_xml'}->{'format'}->{$name}->{'content'}) || $log->logdie("can't find format '$name'");

        # Without a delimiter the whole value is a single field. Splitting on
        # an empty pattern would break it into individual characters.
        my @data = $has_delimiter
                ? split(/\Q$delimiter\E/, $data)
                : ($data);

        my $out = sprintf($format, @data);
        $log->debug("using format $name [$format] on $data to produce: $out");

        # The formatted result may itself contain lookups which need resolving.
        if ($self->{'lookup_regex'} && $out =~ $self->{'lookup_regex'}) {
                return $self->lookup($out);
        } else {
                return $out;
        }

}

=head2 parse

Perform smart parsing of string, skipping delimiters for fields which aren't
defined. It can also eval code in format starting with C<eval{...}> and
return output or nothing depending on eval code.

 my $text = $webpac->parse($rec,'eval{"v901^a" eq "Deskriptor"}descriptor: v250^a', $i);

=cut

sub parse {
        my $self = shift;

        my ($rec, $format_utf8, $i) = @_;

        return if (! $format_utf8);

        my $log = $self->_get_logger();

        $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);

        $i = 0 if (! $i);

        my $format = $self->_x($format_utf8) || $log->logconfess("can't convert '$format_utf8' from UTF-8 to ",$self->{'code_page'});

        my @out;

        $log->debug("format: $format");

        # Literal braces are escaped: an unescaped "{" in a pattern is
        # deprecated and rejected or warned about by newer perls.
        my $eval_code;
        # remove eval{...} from beginning
        $eval_code = $1 if ($format =~ s/^eval\{([^}]+)\}//s);

        my $filter_name;
        # remove filter{...} from beginning
        $filter_name = $1 if ($format =~ s/^filter\{([^}]+)\}//s);

        my $prefix;
        my $all_found=0;

        # Consume the format left to right: each match is some delimiter text
        # followed by a field reference (v = repeatable, s = single value)
        # with an optional ^subfield.
        while ($format =~ s/^(.*?)(v|s)(\d+)(?:\^(\w))?//s) {

                # copy captures right away so later code can't clobber them
                my ($del, $kind, $field, $subfield) = ($1 || '', $2, $3, $4);

                # remember leading text until the first field is found
                $prefix ||= $del if ($all_found == 0);

                # repeatable index
                my $r = $i;
                $r = 0 if (lc($kind) eq 's');

                my $found = 0;
                my $tmp = $self->get_data(\$rec,$field,$subfield,$r,\$found);

                if ($found) {
                        push @out, $del;
                        push @out, $tmp;
                        $all_found += $found;
                }
        }

        return if (! $all_found);

        my $out = join('',@out);

        if ($out) {
                # add rest of format (suffix)
                $out .= $format;

                # add prefix if not there ($prefix can be undef or empty, in
                # which case there is nothing to add)
                if (defined($prefix) && length($prefix) && $out !~ m/^\Q$prefix\E/) {
                        $out = $prefix . $out;
                }

                $log->debug("result: $out");
        }

        # optional eval{...} guard: drop the value unless the code is true
        if ($eval_code) {
                my $eval = $self->fill_in($rec,$eval_code,$i) || return;
                $log->debug("about to eval{$eval} format: $out");
                return if (! $self->_eval($eval));
        }
        
        # optional filter{name}: pass the value through a registered filter
        if ($filter_name && $self->{'filter'}->{$filter_name}) {
                $log->debug("about to filter{$filter_name} format: $out");
                $out = $self->{'filter'}->{$filter_name}->($out);
                return unless(defined($out));
                $log->debug("filter result: $out");
        }

        return $out;
}

=head2 parse_to_arr

Similar to C<parse>, but returns array of all repeatable fields

 my @arr = $webpac->parse_to_arr($rec,'v250^a');

=cut

sub parse_to_arr {
        my ($self, $rec, $format_utf8) = @_;

        my $log = $self->_get_logger();

        if ($rec !~ /HASH/o) {
                $log->logconfess("need HASH as first argument!");
        }
        return unless ($format_utf8);

        my @arr;
        my $i = 0;

        # Ask parse() for occurrence 0, 1, 2, ... and stop at the first false
        # result (undef, empty string or '0').
        for (;;) {
                my $value = $self->parse($rec, $format_utf8, $i++);
                last if (! $value);
                push @arr, $value;
        }

        # $i is one past the number of collected elements, hence the --$i
        if (@arr) {
                $log->debug("format '$format_utf8' returned ",--$i," elements: ", sub { join(" | ",@arr) });
        }

        return @arr;
}

=head2 fill_in_to_arr

Similar to C<fill_in>, but returns array of all repeatable fields. Usable
for fields which have lookups, so they shouldn't be parsed but rather
C<fill_in>ed.

 my @arr = $webpac->fill_in_to_arr($rec,'[v900];;[v250^a]');

=cut

sub fill_in_to_arr {
        my ($self, $rec, $format_utf8) = @_;

        my $log = $self->_get_logger();

        if ($rec !~ /HASH/o) {
                $log->logconfess("need HASH as first argument!");
        }
        return unless ($format_utf8);

        my @arr;
        my $i = 0;

        # Call fill_in() in list context for occurrence 0, 1, 2, ... until it
        # returns an empty list.
        while (1) {
                my @chunk = $self->fill_in($rec, $format_utf8, $i++);
                last if (! @chunk);
                push @arr, @chunk;
        }

        # $i counts the calls made, including the final empty one, hence --$i
        if (@arr) {
                $log->debug("format '$format_utf8' returned ",--$i," elements: ", sub { join(" | ",@arr) });
        }

        return @arr;
}

=head2 sort_arr

Sort array ignoring case and html in data

 my @sorted = $webpac->sort_arr(@unsorted);

=cut

sub sort_arr {
        my $self = shift;

        my $log = $self->_get_logger();

        # Schwartzian transform: build the comparison key (markup removed,
        # lower-cased) once per value and sort on it. The values themselves
        # are never modified. Running s/// on $a and $b inside the sort block
        # would edit the caller's data, because they alias the original
        # elements.
        # Order is descending on the key, as in the previous implementation.
        my @sorted =
                map  { $_->[1] }
                sort { $b->[0] cmp $a->[0] }
                map  {
                        (my $key = $_) =~ s#<[^>]+/*>##g;
                        [ lc($key), $_ ]
                } @_;

        $log->debug("sorted values: ",sub { join(", ",@sorted) });

        return @sorted;
}


=head2 _sort_by_order

Sort xml tags data structure according to C<order=""> attribute.

=cut

sub _sort_by_order {
        my ($self) = @_;

        # Comparison value for one indexer tag: its order attribute when
        # that is true, otherwise the indexer entry itself.
        my $rank = sub {
                my ($tag) = @_;
                return $self->{'import_xml'}{'indexer'}{$tag}{'order'}
                        || $self->{'import_xml'}{'indexer'}{$tag};
        };

        # $a and $b are the package variables set by the calling sort block
        my $left  = $rank->($a);
        my $right = $rank->($b);

        return $left <=> $right;
}

=head2 _x

Convert strings from C<conf/normalize> encoding into application specific
encoding (optionally specified using C<code_page> to C<new> constructor).

 my $text = $n->_x('normalize text string');

This is a stub so that other modules don't have to implement it.

=cut

sub _x {
        my ($self, $text) = @_;

        # stub: no conversion is performed, the string is returned as given
        return $text;
}


=head1 AUTHOR

Dobrica Pavlinusic, C<< <dpavlin@rot13.org> >>

=head1 COPYRIGHT & LICENSE

Copyright 2005 Dobrica Pavlinusic, All Rights Reserved.

This program is free software; you can redistribute it and/or modify it
under the same terms as Perl itself.

=cut

1; # End of WebPAC::Normalize
1	dpavlin	10	package WebPAC::Normalize;
2
3			use warnings;
4			use strict;
5	dpavlin	13	use Data::Dumper;
6	dpavlin	14	use Storable;
7	dpavlin	10
8			=head1 NAME
9
10			WebPAC::Normalize - normalisation of source file
11
12			=head1 VERSION
13
14			Version 0.01
15
16			=cut
17
18			our $VERSION = '0.01';
19
20			=head1 SYNOPSIS
21
22			This package contains code that could be helpful in implementing different
23			normalisation front-ends.
24
25			=head1 FUNCTIONS
26
27	dpavlin	13	=head2 new
28	dpavlin	10
29	dpavlin	13	Create new normalisation object
30
31			my $n = new WebPAC::Normalize::Something(
32			cache_data_structure => './cache/ds/',
33			lookup_regex => $lookup->regex,
34			);
35
36			Optional parameter C<cache_data_structure> defines path to directory
37			in which cache file for C<data_structure> call will be created.
38
39			Recommended parametar C<lookup_regex> is used to enable parsing of lookups
40			in structures.
41
42	dpavlin	10	=cut
43
44	dpavlin	13	sub new {
45			my $class = shift;
46			my $self = {@_};
47			bless($self, $class);
48
49			$self->setup_cache_dir( $self->{'cache_data_structure'} );
50
51			$self ? return $self : return undef;
52	dpavlin	10	}
53
54	dpavlin	13	=head2 setup_cache_dir
55
56			Check if specified cache directory exist, and if not, disable caching.
57
58			$setup_cache_dir('./cache/ds/');
59
60			If you pass false or zero value to this function, it will disable
61			cacheing.
62
63			=cut
64
65			sub setup_cache_dir {
66			my $self = shift;
67
68			my $dir = shift;
69
70			my $log = $self->_get_logger();
71
72			if ($dir) {
73			my $msg;
74			if (! -e $dir) {
75			$msg = "doesn't exist";
76			} elsif (! -d $dir) {
77			$msg = "is not directory";
78			} elsif (! -w $dir) {
79			$msg = "not writable";
80			}
81
82			if ($msg) {
83			undef $self->{'cache_data_structure'};
84			$log->warn("cache_data_structure $dir $msg, disabling...");
85			} else {
86			$log->debug("using cache dir $dir");
87			}
88			} else {
89			$log->debug("disabling cache");
90			undef $self->{'cache_data_structure'};
91			}
92			}
93
94
95			=head2 data_structure
96
97			Create in-memory data structure which represents normalized layout from
98			C<conf/normalize/*.xml>.
99
100			This structures are used to produce output.
101
102			my @ds = $webpac->data_structure($rec);
103
104			B<Note: historical oddity follows>
105
106			This method will also set C<< $webpac->{'currnet_filename'} >> if there is
107			C<< <filename> >> tag and C<< $webpac->{'headline'} >> if there is
108			C<< <headline> >> tag.
109
110			=cut
111
112			sub data_structure {
113			my $self = shift;
114
115			my $log = $self->_get_logger();
116
117			my $rec = shift;
118			$log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
119
120			my $cache_file;
121
122			if (my $cache_path = $self->{'cache_data_structure'}) {
123			my $id = $rec->{'000'};
124			$id = $rec->{'000'}->[0] if ($id =~ m/^ARRAY/o);
125			unless (defined($id)) {
126			$log->warn("Can't use cache_data_structure on records without unique identifier in field 000");
127			undef $self->{'cache_data_structure'};
128			} else {
129			$cache_file = "$cache_path/$id";
130			if (-r $cache_file) {
131			my $ds_ref = retrieve($cache_file);
132			if ($ds_ref) {
133			$log->debug("cache hit: $cache_file");
134			my $ok = 1;
135			foreach my $f (qw(current_filename headline)) {
136			if ($ds_ref->{$f}) {
137			$self->{$f} = $ds_ref->{$f};
138			} else {
139			$ok = 0;
140			}
141			};
142			if ($ok && $ds_ref->{'ds'}) {
143			return @{ $ds_ref->{'ds'} };
144			} else {
145			$log->warn("cache_data_structure $cache_path corrupt. Use rm $cache_path/* to re-create it on next run!");
146			undef $self->{'cache_data_structure'};
147			}
148			}
149			}
150			}
151			}
152
153			undef $self->{'currnet_filename'};
154			undef $self->{'headline'};
155
156			my @sorted_tags;
157			if ($self->{tags_by_order}) {
158			@sorted_tags = @{$self->{tags_by_order}};
159			} else {
160			@sorted_tags = sort { $self->_sort_by_order } keys %{$self->{'import_xml'}->{'indexer'}};
161			$self->{tags_by_order} = \@sorted_tags;
162			}
163
164			my @ds;
165
166			$log->debug("tags: ",sub { join(", ",@sorted_tags) });
167
168			foreach my $field (@sorted_tags) {
169
170			my $row;
171
172			#print "field $field [",$self->{'tag'},"] = ",Dumper($self->{'import_xml'}->{'indexer'}->{$field}->{$self->{'tag'}});
173
174			foreach my $tag (@{$self->{'import_xml'}->{'indexer'}->{$field}->{$self->{'tag'}}}) {
175			my $format = $tag->{'value'} \|\| $tag->{'content'};
176
177			$log->debug("format: $format");
178
179			my @v;
180			if ($self->{'lookup_regex'} && $format =~ $self->{'lookup_regex'}) {
181			@v = $self->fill_in_to_arr($rec,$format);
182			} else {
183			@v = $self->parse_to_arr($rec,$format);
184			}
185			next if (! @v);
186
187			if ($tag->{'sort'}) {
188			@v = $self->sort_arr(@v);
189			}
190
191			# use format?
192			if ($tag->{'format_name'}) {
193			@v = map { $self->apply_format($tag->{'format_name'},$tag->{'format_delimiter'},$_) } @v;
194			}
195
196			if ($field eq 'filename') {
197			$self->{'current_filename'} = join('',@v);
198			$log->debug("filename: ",$self->{'current_filename'});
199			} elsif ($field eq 'headline') {
200			$self->{'headline'} .= join('',@v);
201			$log->debug("headline: ",$self->{'headline'});
202			next; # don't return headline in data_structure!
203			}
204
205			# delimiter will join repeatable fields
206			if ($tag->{'delimiter'}) {
207			@v = ( join($tag->{'delimiter'}, @v) );
208			}
209
210			# default types
211			my @types = qw(display swish);
212			# override by type attribute
213			@types = ( $tag->{'type'} ) if ($tag->{'type'});
214
215			foreach my $type (@types) {
216			# append to previous line?
217			$log->debug("type: $type ",sub { join(" ",@v) }, $row->{'append'} \|\| 'no append');
218			if ($tag->{'append'}) {
219
220			# I will delimit appended part with
221			# delimiter (or ,)
222			my $d = $tag->{'delimiter'};
223			# default delimiter
224			$d \|\|= " ";
225
226			my $last = pop @{$row->{$type}};
227			$d = "" if (! $last);
228			$last .= $d . join($d, @v);
229			push @{$row->{$type}}, $last;
230
231			} else {
232			push @{$row->{$type}}, @v;
233			}
234			}
235
236
237			}
238
239			if ($row) {
240			$row->{'tag'} = $field;
241
242			# TODO: name_sigular, name_plural
243			my $name = $self->{'import_xml'}->{'indexer'}->{$field}->{'name'};
244			$row->{'name'} = $name ? $self->_x($name) : $field;
245
246			# post-sort all values in field
247			if ($self->{'import_xml'}->{'indexer'}->{$field}->{'sort'}) {
248			$log->warn("sort at field tag not implemented");
249			}
250
251			push @ds, $row;
252
253			$log->debug("row $field: ",sub { Dumper($row) });
254			}
255
256			}
257
258			if ($cache_file) {
259			store {
260			ds => \@ds,
261			current_filename => $self->{'current_filename'},
262			headline => $self->{'headline'},
263			}, $cache_file;
264			$log->debug("created storable cache file $cache_file");
265			}
266
267			return @ds;
268
269			}
270
271			=head2 apply_format
272
273			Apply format specified in tag with C<format_name="name"> and
274			C<format_delimiter=";;">.
275
276			my $text = $webpac->apply_format($format_name,$format_delimiter,$data);
277
278			Formats can contain C<lookup{...}> if you need them.
279
280			=cut
281
282			sub apply_format {
283			my $self = shift;
284
285			my ($name,$delimiter,$data) = @_;
286
287			my $log = $self->_get_logger();
288
289			if (! $self->{'import_xml'}->{'format'}->{$name}) {
290			$log->warn("<format name=\"$name\"> is not defined in ",$self->{'import_xml_file'});
291			return $data;
292			}
293
294			$log->warn("no delimiter for format $name") if (! $delimiter);
295
296			my $format = $self->_x($self->{'import_xml'}->{'format'}->{$name}->{'content'}) \|\| $log->logdie("can't find format '$name'");
297
298			my @data = split(/\Q$delimiter\E/, $data);
299
300			my $out = sprintf($format, @data);
301			$log->debug("using format $name [$format] on $data to produce: $out");
302
303			if ($self->{'lookup_regex'} && $out =~ $self->{'lookup_regex'}) {
304			return $self->lookup($out);
305			} else {
306			return $out;
307			}
308
309			}
310
311			=head2 parse
312
313			Perform smart parsing of string, skipping delimiters for fields which aren't
314			defined. It can also eval code in format starting with C<eval{...}> and
315			return output or nothing depending on eval code.
316
317			my $text = $webpac->parse($rec,'eval{"v901^a" eq "Deskriptor"}descriptor: v250^a', $i);
318
319			=cut
320
321			sub parse {
322			my $self = shift;
323
324			my ($rec, $format_utf8, $i) = @_;
325
326			return if (! $format_utf8);
327
328			my $log = $self->_get_logger();
329
330			$log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
331
332			$i = 0 if (! $i);
333
334			my $format = $self->_x($format_utf8) \|\| $log->logconfess("can't convert '$format_utf8' from UTF-8 to ",$self->{'code_page'});
335
336			my @out;
337
338			$log->debug("format: $format");
339
340			my $eval_code;
341			# remove eval{...} from beginning
342			$eval_code = $1 if ($format =~ s/^eval{([^}]+)}//s);
343
344			my $filter_name;
345			# remove filter{...} from beginning
346			$filter_name = $1 if ($format =~ s/^filter{([^}]+)}//s);
347
348			my $prefix;
349			my $all_found=0;
350
351			while ($format =~ s/^(.*?)(v\|s)(\d+)(?:\^(\w))?//s) {
352
353			my $del = $1 \|\| '';
354			$prefix \|\|= $del if ($all_found == 0);
355
356			# repeatable index
357			my $r = $i;
358			$r = 0 if (lc("$2") eq 's');
359
360			my $found = 0;
361			my $tmp = $self->get_data(\$rec,$3,$4,$r,\$found);
362
363			if ($found) {
364			push @out, $del;
365			push @out, $tmp;
366			$all_found += $found;
367			}
368			}
369
370			return if (! $all_found);
371
372			my $out = join('',@out);
373
374			if ($out) {
375			# add rest of format (suffix)
376			$out .= $format;
377
378			# add prefix if not there
379			$out = $prefix . $out if ($out !~ m/^\Q$prefix\E/);
380
381			$log->debug("result: $out");
382			}
383
384			if ($eval_code) {
385			my $eval = $self->fill_in($rec,$eval_code,$i) \|\| return;
386			$log->debug("about to eval{$eval} format: $out");
387			return if (! $self->_eval($eval));
388			}
389
390			if ($filter_name && $self->{'filter'}->{$filter_name}) {
391			$log->debug("about to filter{$filter_name} format: $out");
392			$out = $self->{'filter'}->{$filter_name}->($out);
393			return unless(defined($out));
394			$log->debug("filter result: $out");
395			}
396
397			return $out;
398			}
399
400			=head2 parse_to_arr
401
402			Similar to C<parse>, but returns array of all repeatable fields
403
404			my @arr = $webpac->parse_to_arr($rec,'v250^a');
405
406			=cut
407
408			sub parse_to_arr {
409			my $self = shift;
410
411			my ($rec, $format_utf8) = @_;
412
413			my $log = $self->_get_logger();
414
415			$log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
416			return if (! $format_utf8);
417
418			my $i = 0;
419			my @arr;
420
421			while (my $v = $self->parse($rec,$format_utf8,$i++)) {
422			push @arr, $v;
423			}
424
425			$log->debug("format '$format_utf8' returned ",--$i," elements: ", sub { join(" \| ",@arr) }) if (@arr);
426
427			return @arr;
428			}
429
430			=head2 fill_in_to_arr
431
432			Similar to C<fill_in>, but returns array of all repeatable fields. Usable
433			for fields which have lookups, so they shouldn't be parsed but rather
434			C<fill_id>ed.
435
436			my @arr = $webpac->fill_in_to_arr($rec,'[v900];;[v250^a]');
437
438			=cut
439
440			sub fill_in_to_arr {
441			my $self = shift;
442
443			my ($rec, $format_utf8) = @_;
444
445			my $log = $self->_get_logger();
446
447			$log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
448			return if (! $format_utf8);
449
450			my $i = 0;
451			my @arr;
452
453			while (my @v = $self->fill_in($rec,$format_utf8,$i++)) {
454			push @arr, @v;
455			}
456
457			$log->debug("format '$format_utf8' returned ",--$i," elements: ", sub { join(" \| ",@arr) }) if (@arr);
458
459			return @arr;
460			}
461
462			=head2 sort_arr
463
464			Sort array ignoring case and html in data
465
466			my @sorted = $webpac->sort_arr(@unsorted);
467
468			=cut
469
470			sub sort_arr {
471			my $self = shift;
472
473			my $log = $self->_get_logger();
474
475			# FIXME add Schwartzian Transformation?
476
477			my @sorted = sort {
478			$a =~ s#<[^>]+/*>##;
479			$b =~ s#<[^>]+/*>##;
480			lc($b) cmp lc($a)
481			} @_;
482			$log->debug("sorted values: ",sub { join(", ",@sorted) });
483
484			return @sorted;
485			}
486
487
488			=head2 _sort_by_order
489
490			Sort xml tags data structure accoding to C<order=""> attribute.
491
492			=cut
493
494			sub _sort_by_order {
495			my $self = shift;
496
497			my $va = $self->{'import_xml'}->{'indexer'}->{$a}->{'order'} \|\|
498			$self->{'import_xml'}->{'indexer'}->{$a};
499			my $vb = $self->{'import_xml'}->{'indexer'}->{$b}->{'order'} \|\|
500			$self->{'import_xml'}->{'indexer'}->{$b};
501
502			return $va <=> $vb;
503			}
504
505			=head2 _x
506
507			Convert strings from C<conf/normalize> encoding into application specific
508			(optinally specified using C<code_page> to C<new> constructor.
509
510			my $text = $n->_x('normalize text string');
511
512			This is a stub so that other modules doesn't have to implement it.
513
514			=cut
515
516			sub _x {
517			my $self = shift;
518			return shift;
519			}
520
521
522	dpavlin	10	=head1 AUTHOR
523
524			Dobrica Pavlinusic, C<< <dpavlin@rot13.org> >>
525
526			=head1 COPYRIGHT & LICENSE
527
528			Copyright 2005 Dobrica Pavlinusic, All Rights Reserved.
529
530			This program is free software; you can redistribute it and/or modify it
531			under the same terms as Perl itself.
532
533			=cut
534
535			1; # End of WebPAC::DB