lib/WebPAC/Normalize.pm

package WebPAC::Normalize;

use warnings;
use strict;
use Data::Dumper;
use Storable;

=head1 NAME

WebPAC::Normalize - normalisation of source file

=head1 VERSION

Version 0.01

=cut

our $VERSION = '0.01';

=head1 SYNOPSIS

This package contains code that could be helpful in implementing different
normalisation front-ends.

=head1 FUNCTIONS

=head2 new

Create new normalisation object

  my $n = new WebPAC::Normalize::Something(
        cache_data_structure => './cache/ds/',
        lookup_regex => $lookup->regex,
  );

Optional parameter C<cache_data_structure> defines path to directory
in which cache file for C<data_structure> call will be created.

Recommended parameter C<lookup_regex> is used to enable parsing of lookups
in structures.

=cut

sub new {
        my ($class, @params) = @_;

        # constructor arguments are stored verbatim as object attributes
        my $self = bless(+{ @params }, $class);

        # check the configured cache directory; setup_cache_dir clears
        # cache_data_structure when the directory can't be used
        $self->setup_cache_dir( $self->{'cache_data_structure'} );

        return $self;
}

=head2 setup_cache_dir

Check if specified cache directory exists, and if not, disable caching.

 $n->setup_cache_dir('./cache/ds/');

If you pass false or zero value to this function, it will disable
caching.

=cut

sub setup_cache_dir {
        my ($self, $dir) = @_;

        # _get_logger is not defined in this file; it is expected to be
        # supplied by the class this object is blessed into
        my $log = $self->_get_logger();

        if (! $dir) {
                # false value (undef, 0, '') means: no caching at all
                $log->debug("disabling cache");
                undef $self->{'cache_data_structure'};
        } else {
                # first failing check wins; undef means directory is usable
                my $msg = (! -e $dir) ? "doesn't exist"
                        : (! -d $dir) ? "is not directory"
                        : (! -w $dir) ? "not writable"
                        :               undef;

                if (! $msg) {
                        # note: $dir itself is not stored here, only validated
                        $log->debug("using cache dir $dir");
                } else {
                        undef $self->{'cache_data_structure'};
                        $log->warn("cache_data_structure $dir $msg, disabling...");
                }
        }
}


=head2 data_structure

Create in-memory data structure which represents normalized layout from
C<conf/normalize/*.xml>.

These structures are used to produce output.

 my @ds = $webpac->data_structure($rec);

B<Note: historical oddity follows>

This method will also set C<< $webpac->{'current_filename'} >> if there is
C<< <filename> >> tag and C<< $webpac->{'headline'} >> if there is
C<< <headline> >> tag.

=cut

# Build the normalized data structure for one record.
#
# Arguments: $rec - hash reference with the source record.
# Returns:   list of row hashes, one per indexer field that produced values.
#            Each row has 'tag', 'name' and one array per output type
#            ('display' and 'swish' unless the tag has a type attribute).
# Side effects: sets $self->{'current_filename'} and $self->{'headline'},
#            memoizes the sorted tag list in $self->{'tags_by_order'} and,
#            when cache_data_structure is set, reads/writes a Storable file
#            named after field 000 of the record.
#
# Helper methods used here that are not defined in this file (_get_logger,
# and fill_in via fill_in_to_arr) are expected to come from the class the
# object is blessed into.
sub data_structure {
        my $self = shift;

        my $log = $self->_get_logger();

        my $rec = shift;
        $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);

        # path of this record's cache file; stays undef when caching is off
        my $cache_file;

        if (my $cache_path = $self->{'cache_data_structure'}) {
                my $id = $rec->{'000'};
                # field 000 may hold an array of occurrences; the first is the id.
                # ref() is used instead of a regex so a missing field 000 doesn't
                # trigger an "uninitialized value" warning.
                $id = $id->[0] if (ref($id) eq 'ARRAY');
                if (! defined($id)) {
                        $log->warn("Can't use cache_data_structure on records without unique identifier in field 000");
                        undef $self->{'cache_data_structure'};
                } else {
                        $cache_file = "$cache_path/$id";
                        if (-r $cache_file) {
                                my $ds_ref = retrieve($cache_file);
                                if ($ds_ref) {
                                        $log->debug("cache hit: $cache_file");
                                        # a usable cache entry must carry both side-effect
                                        # values as well as the data structure itself
                                        my $ok = 1;
                                        foreach my $f (qw(current_filename headline)) {
                                                if ($ds_ref->{$f}) {
                                                        $self->{$f} = $ds_ref->{$f};
                                                } else {
                                                        $ok = 0;
                                                }
                                        }
                                        if ($ok && $ds_ref->{'ds'}) {
                                                return @{ $ds_ref->{'ds'} };
                                        } else {
                                                $log->warn("cache_data_structure $cache_path corrupt. Use rm $cache_path/* to re-create it on next run!");
                                                undef $self->{'cache_data_structure'};
                                        }
                                }
                        }
                }
        }

        # reset per-record state so values from the previous record don't leak
        # (the key was previously misspelled and therefore never reset)
        undef $self->{'current_filename'};
        undef $self->{'headline'};

        # tag order is computed once and reused for every following record
        my @sorted_tags;
        if ($self->{tags_by_order}) {
                @sorted_tags = @{$self->{tags_by_order}};
        } else {
                @sorted_tags = sort { $self->_sort_by_order } keys %{$self->{'import_xml'}->{'indexer'}};
                $self->{tags_by_order} = \@sorted_tags;
        }

        my @ds;

        $log->debug("tags: ",sub { join(", ",@sorted_tags) });

        foreach my $field (@sorted_tags) {

                # created lazily by the first push below; stays undef if no tag
                # of this field produced any value
                my $row;

                foreach my $tag (@{$self->{'import_xml'}->{'indexer'}->{$field}->{$self->{'tag'}}}) {
                        my $format = $tag->{'value'} || $tag->{'content'};

                        $log->debug("format: $format");

                        # formats containing lookups are filled in, others are parsed
                        my @v;
                        if ($self->{'lookup_regex'} && $format =~ $self->{'lookup_regex'}) {
                                @v = $self->fill_in_to_arr($rec,$format);
                        } else {
                                @v = $self->parse_to_arr($rec,$format);
                        }
                        next if (! @v);

                        if ($tag->{'sort'}) {
                                @v = $self->sort_arr(@v);
                        }

                        # use format?
                        if ($tag->{'format_name'}) {
                                @v = map { $self->apply_format($tag->{'format_name'},$tag->{'format_delimiter'},$_) } @v;
                        }

                        if ($field eq 'filename') {
                                $self->{'current_filename'} = join('',@v);
                                $log->debug("filename: ",$self->{'current_filename'});
                        } elsif ($field eq 'headline') {
                                $self->{'headline'} .= join('',@v);
                                $log->debug("headline: ",$self->{'headline'});
                                next; # don't return headline in data_structure!
                        }

                        # delimiter will join repeatable fields
                        if ($tag->{'delimiter'}) {
                                @v = ( join($tag->{'delimiter'}, @v) );
                        }

                        # default types
                        my @types = qw(display swish);
                        # override by type attribute
                        @types = ( $tag->{'type'} ) if ($tag->{'type'});

                        foreach my $type (@types) {
                                # append to previous line?
                                $log->debug("type: $type ",sub { join(" ",@v) }, $tag->{'append'} || 'no append');
                                if ($tag->{'append'}) {

                                        # I will delimit appended part with
                                        # delimiter (or space)
                                        my $d = $tag->{'delimiter'};
                                        # default delimiter
                                        $d ||= " ";

                                        # glue new values onto the last value already in this
                                        # row; no delimiter when there is nothing to append to
                                        my $last = pop @{$row->{$type}};
                                        $d = "" if (! $last);
                                        $last .= $d . join($d, @v);
                                        push @{$row->{$type}}, $last;

                                } else {
                                        push @{$row->{$type}}, @v;
                                }
                        }

                }

                if ($row) {
                        $row->{'tag'} = $field;

                        # TODO: name_sigular, name_plural
                        my $name = $self->{'import_xml'}->{'indexer'}->{$field}->{'name'};
                        $row->{'name'} = $name ? $self->_x($name) : $field;

                        # post-sort all values in field
                        if ($self->{'import_xml'}->{'indexer'}->{$field}->{'sort'}) {
                                $log->warn("sort at field tag not implemented");
                        }

                        push @ds, $row;

                        $log->debug("row $field: ",sub { Dumper($row) });
                }

        }

        if ($cache_file) {
                store({
                        ds => \@ds,
                        current_filename => $self->{'current_filename'},
                        headline => $self->{'headline'},
                }, $cache_file);
                $log->debug("created storable cache file $cache_file");
        }

        return @ds;

}

=head2 apply_format

Apply format specified in tag with C<format_name="name"> and
C<format_delimiter=";;">.

 my $text = $webpac->apply_format($format_name,$format_delimiter,$data);

Formats can contain C<lookup{...}> if you need them.

=cut

# Run $data through the sprintf-style format registered under $name.
#
# Arguments: $name      - key into $self->{'import_xml'}->{'format'}
#            $delimiter - literal string on which $data is split into
#                         sprintf arguments
#            $data      - value to format
# Returns:   formatted string (passed through $self->lookup when it matches
#            lookup_regex), or $data unchanged if the format isn't defined.
#
# _get_logger and lookup are not defined in this file; they are expected to
# be provided by the class the object is blessed into.
sub apply_format {
        my $self = shift;

        my ($name,$delimiter,$data) = @_;

        my $log = $self->_get_logger();

        if (! $self->{'import_xml'}->{'format'}->{$name}) {
                $log->warn("<format name=\"$name\"> is not defined in ",$self->{'import_xml_file'});
                return $data;
        }

        my $has_delimiter = (defined($delimiter) && length($delimiter)) ? 1 : 0;
        $log->warn("no delimiter for format $name") if (! $has_delimiter);

        my $format = $self->_x($self->{'import_xml'}->{'format'}->{$name}->{'content'}) || $log->logdie("can't find format '$name'");

        # Without a delimiter the whole value is the single sprintf argument.
        # Splitting on an empty pattern would break it into characters.
        my @data = $has_delimiter ? split(/\Q$delimiter\E/, $data) : ($data);

        my $out = sprintf($format, @data);
        $log->debug("using format $name [$format] on $data to produce: $out");

        if ($self->{'lookup_regex'} && $out =~ $self->{'lookup_regex'}) {
                return $self->lookup($out);
        } else {
                return $out;
        }

}

=head2 parse

Perform smart parsing of string, skipping delimiters for fields which aren't
defined. It can also eval code in format starting with C<eval{...}> and
return output or nothing depending on eval code.

 my $text = $webpac->parse($rec,'eval{"v901^a" eq "Deskriptor"}descriptor: v250^a', $i);

=cut

# Expand one occurrence of a format string against a record.
#
# Arguments: $rec         - hash reference with the source record
#            $format_utf8 - format: optional eval{...} and filter{...}
#                           prefixes followed by text containing field
#                           references (vNNN, sNNN, optionally ^x subfield)
#            $i           - repeatable-field index (defaults to 0); sNNN
#                           references always use index 0
# Returns:   the expanded string, or nothing (empty list / undef) when the
#            format is empty, no referenced field was found, the eval code
#            is false, or the filter returns undef.
#
# _get_logger, get_data, fill_in and _eval are not defined in this file;
# they are expected to be provided by the class the object is blessed into.
sub parse {
        my $self = shift;

        my ($rec, $format_utf8, $i) = @_;

        return if (! $format_utf8);

        my $log = $self->_get_logger();

        $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);

        $i = 0 if (! $i);

        my $format = $self->_x($format_utf8) || $log->logconfess("can't convert '$format_utf8' from UTF-8 to ",$self->{'code_page'});

        my @out;

        $log->debug("format: $format");

        # Literal braces are escaped: an unescaped "{" in a pattern draws a
        # deprecation warning on modern perls.

        my $eval_code;
        # remove eval{...} from beginning
        $eval_code = $1 if ($format =~ s/^eval\{([^}]+)\}//s);

        my $filter_name;
        # remove filter{...} from beginning
        $filter_name = $1 if ($format =~ s/^filter\{([^}]+)\}//s);

        my $prefix;
        my $all_found=0;

        # consume the format left to right, one field reference at a time;
        # whatever is left in $format afterwards is the suffix
        while ($format =~ s/^(.*?)(v|s)(\d+)(?:\^(\w))?//s) {

                # copy captures right away, before anything can clobber them
                my ($del, $kind, $field, $subfield) = ($1, $2, $3, $4);

                # $1 is always defined after a match (possibly empty); using it
                # directly keeps a literal "0" delimiter instead of dropping it
                $prefix ||= $del if ($all_found == 0);

                # repeatable index
                my $r = $i;
                $r = 0 if ($kind eq 's');

                my $found = 0;
                my $tmp = $self->get_data(\$rec,$field,$subfield,$r,\$found);

                # delimiters in front of missing fields are skipped
                if ($found) {
                        push @out, $del;
                        push @out, $tmp;
                        $all_found += $found;
                }
        }

        return if (! $all_found);

        my $out = join('',@out);

        if ($out) {
                # add rest of format (suffix)
                $out .= $format;

                # add prefix if not there
                $out = $prefix . $out if ($out !~ m/^\Q$prefix\E/);

                $log->debug("result: $out");
        }

        if ($eval_code) {
                my $eval = $self->fill_in($rec,$eval_code,$i) || return;
                $log->debug("about to eval{$eval} format: $out");
                return if (! $self->_eval($eval));
        }

        if ($filter_name && $self->{'filter'}->{$filter_name}) {
                $log->debug("about to filter{$filter_name} format: $out");
                $out = $self->{'filter'}->{$filter_name}->($out);
                return unless(defined($out));
                $log->debug("filter result: $out");
        }

        return $out;
}

=head2 parse_to_arr

Similar to C<parse>, but returns array of all repeatable fields

 my @arr = $webpac->parse_to_arr($rec,'v250^a');

=cut

sub parse_to_arr {
        my ($self, $rec, $format_utf8) = @_;

        my $log = $self->_get_logger();

        $log->logconfess("need HASH as first argument!") unless ($rec =~ /HASH/o);
        return unless ($format_utf8);

        # ask parse for occurrence 0, 1, 2, ... until it yields a false value
        my @values;
        my $occurrence = 0;
        while (1) {
                my $value = $self->parse($rec, $format_utf8, $occurrence);
                last unless ($value);
                push @values, $value;
                $occurrence++;
        }

        if (@values) {
                $log->debug("format '$format_utf8' returned ",scalar(@values)," elements: ", sub { join(" | ",@values) });
        }

        return @values;
}

=head2 fill_in_to_arr

Similar to C<fill_in>, but returns array of all repeatable fields. Usable
for fields which have lookups, so they shouldn't be parsed but rather
C<fill_in>ed.

 my @arr = $webpac->fill_in_to_arr($rec,'[v900];;[v250^a]');

=cut

sub fill_in_to_arr {
        my ($self, $rec, $format_utf8) = @_;

        my $log = $self->_get_logger();

        $log->logconfess("need HASH as first argument!") unless ($rec =~ /HASH/o);
        return unless ($format_utf8);

        # call fill_in (not defined in this file) for occurrence 0, 1, 2, ...
        # until it returns an empty list; every call may add several values
        my @collected;
        my $rounds = 0;
        while (1) {
                my @chunk = $self->fill_in($rec, $format_utf8, $rounds);
                last unless (@chunk);
                push @collected, @chunk;
                $rounds++;
        }

        if (@collected) {
                $log->debug("format '$format_utf8' returned ",$rounds," elements: ", sub { join(" | ",@collected) });
        }

        return @collected;
}

=head2 sort_arr

Sort array ignoring case and html in data

 my @sorted = $webpac->sort_arr(@unsorted);

=cut

# Sort values case-insensitively, ignoring HTML-like tags when comparing.
#
# Arguments: list of strings.
# Returns:   the same strings, unmodified, in sorted order.
#
# The values themselves are never changed: markup is stripped only from a
# private sort key. Doing the substitution on $a/$b inside the sort block
# would edit the caller's data through aliasing and die on read-only values.
#
# NOTE(review): the comparison is "$b cmp $a", i.e. descending order. This
# is kept as it was; confirm that descending is really intended.
sub sort_arr {
        my $self = shift;

        my $log = $self->_get_logger();

        # Schwartzian transform: compute each key once, sort, drop the keys
        my @sorted =
                map  { $_->[1] }
                sort { $b->[0] cmp $a->[0] }
                map  {
                        my $key = $_;
                        $key =~ s#<[^>]+/*>##g;
                        [ lc($key), $_ ]
                } @_;
        $log->debug("sorted values: ",sub { join(", ",@sorted) });

        return @sorted;
}


=head2 _sort_by_order

Sort xml tags data structure according to C<order=""> attribute.

=cut

sub _sort_by_order {
        my ($self) = @_;

        # Sort comparator: $a and $b are the package variables set by sort.
        # Each tag's key is its 'order' attribute, falling back to the
        # indexer entry itself when 'order' is false.
        my ($left, $right) = map {
                $self->{'import_xml'}->{'indexer'}->{$_}->{'order'}
                        || $self->{'import_xml'}->{'indexer'}->{$_}
        } ($a, $b);

        return $left <=> $right;
}

=head2 _x

Convert strings from C<conf/normalize> encoding into application specific
encoding (optionally specified using C<code_page> to C<new> constructor).

 my $text = $n->_x('normalize text string');

This is a stub so that other modules don't have to implement it.

=cut

sub _x {
        my ($self, $text) = @_;

        # stub: hands the string back untouched
        return $text;
}


=head1 AUTHOR

Dobrica Pavlinusic, C<< <dpavlin@rot13.org> >>

=head1 COPYRIGHT & LICENSE

Copyright 2005 Dobrica Pavlinusic, All Rights Reserved.

This program is free software; you can redistribute it and/or modify it
under the same terms as Perl itself.

=cut

1; # End of WebPAC::DB
1	package WebPAC::Normalize;
2
3	use warnings;
4	use strict;
5	use Data::Dumper;
6	use Storable;
7
8	=head1 NAME
9
10	WebPAC::Normalize - normalisation of source file
11
12	=head1 VERSION
13
14	Version 0.01
15
16	=cut
17
18	our $VERSION = '0.01';
19
20	=head1 SYNOPSIS
21
22	This package contains code that could be helpful in implementing different
23	normalisation front-ends.
24
25	=head1 FUNCTIONS
26
27	=head2 new
28
29	Create new normalisation object
30
31	my $n = new WebPAC::Normalize::Something(
32	cache_data_structure => './cache/ds/',
33	lookup_regex => $lookup->regex,
34	);
35
36	Optional parameter C<cache_data_structure> defines path to directory
37	in which cache file for C<data_structure> call will be created.
38
39	Recommended parametar C<lookup_regex> is used to enable parsing of lookups
40	in structures.
41
42	=cut
43
44	sub new {
45	my $class = shift;
46	my $self = {@_};
47	bless($self, $class);
48
49	$self->setup_cache_dir( $self->{'cache_data_structure'} );
50
51	$self ? return $self : return undef;
52	}
53
54	=head2 setup_cache_dir
55
56	Check if specified cache directory exist, and if not, disable caching.
57
58	$setup_cache_dir('./cache/ds/');
59
60	If you pass false or zero value to this function, it will disable
61	cacheing.
62
63	=cut
64
65	sub setup_cache_dir {
66	my $self = shift;
67
68	my $dir = shift;
69
70	my $log = $self->_get_logger();
71
72	if ($dir) {
73	my $msg;
74	if (! -e $dir) {
75	$msg = "doesn't exist";
76	} elsif (! -d $dir) {
77	$msg = "is not directory";
78	} elsif (! -w $dir) {
79	$msg = "not writable";
80	}
81
82	if ($msg) {
83	undef $self->{'cache_data_structure'};
84	$log->warn("cache_data_structure $dir $msg, disabling...");
85	} else {
86	$log->debug("using cache dir $dir");
87	}
88	} else {
89	$log->debug("disabling cache");
90	undef $self->{'cache_data_structure'};
91	}
92	}
93
94
95	=head2 data_structure
96
97	Create in-memory data structure which represents normalized layout from
98	C<conf/normalize/*.xml>.
99
100	This structures are used to produce output.
101
102	my @ds = $webpac->data_structure($rec);
103
104	B<Note: historical oddity follows>
105
106	This method will also set C<< $webpac->{'currnet_filename'} >> if there is
107	C<< <filename> >> tag and C<< $webpac->{'headline'} >> if there is
108	C<< <headline> >> tag.
109
110	=cut
111
112	sub data_structure {
113	my $self = shift;
114
115	my $log = $self->_get_logger();
116
117	my $rec = shift;
118	$log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
119
120	my $cache_file;
121
122	if (my $cache_path = $self->{'cache_data_structure'}) {
123	my $id = $rec->{'000'};
124	$id = $rec->{'000'}->[0] if ($id =~ m/^ARRAY/o);
125	unless (defined($id)) {
126	$log->warn("Can't use cache_data_structure on records without unique identifier in field 000");
127	undef $self->{'cache_data_structure'};
128	} else {
129	$cache_file = "$cache_path/$id";
130	if (-r $cache_file) {
131	my $ds_ref = retrieve($cache_file);
132	if ($ds_ref) {
133	$log->debug("cache hit: $cache_file");
134	my $ok = 1;
135	foreach my $f (qw(current_filename headline)) {
136	if ($ds_ref->{$f}) {
137	$self->{$f} = $ds_ref->{$f};
138	} else {
139	$ok = 0;
140	}
141	};
142	if ($ok && $ds_ref->{'ds'}) {
143	return @{ $ds_ref->{'ds'} };
144	} else {
145	$log->warn("cache_data_structure $cache_path corrupt. Use rm $cache_path/* to re-create it on next run!");
146	undef $self->{'cache_data_structure'};
147	}
148	}
149	}
150	}
151	}
152
153	undef $self->{'currnet_filename'};
154	undef $self->{'headline'};
155
156	my @sorted_tags;
157	if ($self->{tags_by_order}) {
158	@sorted_tags = @{$self->{tags_by_order}};
159	} else {
160	@sorted_tags = sort { $self->_sort_by_order } keys %{$self->{'import_xml'}->{'indexer'}};
161	$self->{tags_by_order} = \@sorted_tags;
162	}
163
164	my @ds;
165
166	$log->debug("tags: ",sub { join(", ",@sorted_tags) });
167
168	foreach my $field (@sorted_tags) {
169
170	my $row;
171
172	#print "field $field [",$self->{'tag'},"] = ",Dumper($self->{'import_xml'}->{'indexer'}->{$field}->{$self->{'tag'}});
173
174	foreach my $tag (@{$self->{'import_xml'}->{'indexer'}->{$field}->{$self->{'tag'}}}) {
175	my $format = $tag->{'value'} \|\| $tag->{'content'};
176
177	$log->debug("format: $format");
178
179	my @v;
180	if ($self->{'lookup_regex'} && $format =~ $self->{'lookup_regex'}) {
181	@v = $self->fill_in_to_arr($rec,$format);
182	} else {
183	@v = $self->parse_to_arr($rec,$format);
184	}
185	next if (! @v);
186
187	if ($tag->{'sort'}) {
188	@v = $self->sort_arr(@v);
189	}
190
191	# use format?
192	if ($tag->{'format_name'}) {
193	@v = map { $self->apply_format($tag->{'format_name'},$tag->{'format_delimiter'},$_) } @v;
194	}
195
196	if ($field eq 'filename') {
197	$self->{'current_filename'} = join('',@v);
198	$log->debug("filename: ",$self->{'current_filename'});
199	} elsif ($field eq 'headline') {
200	$self->{'headline'} .= join('',@v);
201	$log->debug("headline: ",$self->{'headline'});
202	next; # don't return headline in data_structure!
203	}
204
205	# delimiter will join repeatable fields
206	if ($tag->{'delimiter'}) {
207	@v = ( join($tag->{'delimiter'}, @v) );
208	}
209
210	# default types
211	my @types = qw(display swish);
212	# override by type attribute
213	@types = ( $tag->{'type'} ) if ($tag->{'type'});
214
215	foreach my $type (@types) {
216	# append to previous line?
217	$log->debug("type: $type ",sub { join(" ",@v) }, $row->{'append'} \|\| 'no append');
218	if ($tag->{'append'}) {
219
220	# I will delimit appended part with
221	# delimiter (or ,)
222	my $d = $tag->{'delimiter'};
223	# default delimiter
224	$d \|\|= " ";
225
226	my $last = pop @{$row->{$type}};
227	$d = "" if (! $last);
228	$last .= $d . join($d, @v);
229	push @{$row->{$type}}, $last;
230
231	} else {
232	push @{$row->{$type}}, @v;
233	}
234	}
235
236
237	}
238
239	if ($row) {
240	$row->{'tag'} = $field;
241
242	# TODO: name_sigular, name_plural
243	my $name = $self->{'import_xml'}->{'indexer'}->{$field}->{'name'};
244	$row->{'name'} = $name ? $self->_x($name) : $field;
245
246	# post-sort all values in field
247	if ($self->{'import_xml'}->{'indexer'}->{$field}->{'sort'}) {
248	$log->warn("sort at field tag not implemented");
249	}
250
251	push @ds, $row;
252
253	$log->debug("row $field: ",sub { Dumper($row) });
254	}
255
256	}
257
258	if ($cache_file) {
259	store {
260	ds => \@ds,
261	current_filename => $self->{'current_filename'},
262	headline => $self->{'headline'},
263	}, $cache_file;
264	$log->debug("created storable cache file $cache_file");
265	}
266
267	return @ds;
268
269	}
270
271	=head2 apply_format
272
273	Apply format specified in tag with C<format_name="name"> and
274	C<format_delimiter=";;">.
275
276	my $text = $webpac->apply_format($format_name,$format_delimiter,$data);
277
278	Formats can contain C<lookup{...}> if you need them.
279
280	=cut
281
282	sub apply_format {
283	my $self = shift;
284
285	my ($name,$delimiter,$data) = @_;
286
287	my $log = $self->_get_logger();
288
289	if (! $self->{'import_xml'}->{'format'}->{$name}) {
290	$log->warn("<format name=\"$name\"> is not defined in ",$self->{'import_xml_file'});
291	return $data;
292	}
293
294	$log->warn("no delimiter for format $name") if (! $delimiter);
295
296	my $format = $self->_x($self->{'import_xml'}->{'format'}->{$name}->{'content'}) \|\| $log->logdie("can't find format '$name'");
297
298	my @data = split(/\Q$delimiter\E/, $data);
299
300	my $out = sprintf($format, @data);
301	$log->debug("using format $name [$format] on $data to produce: $out");
302
303	if ($self->{'lookup_regex'} && $out =~ $self->{'lookup_regex'}) {
304	return $self->lookup($out);
305	} else {
306	return $out;
307	}
308
309	}
310
311	=head2 parse
312
313	Perform smart parsing of string, skipping delimiters for fields which aren't
314	defined. It can also eval code in format starting with C<eval{...}> and
315	return output or nothing depending on eval code.
316
317	my $text = $webpac->parse($rec,'eval{"v901^a" eq "Deskriptor"}descriptor: v250^a', $i);
318
319	=cut
320
321	sub parse {
322	my $self = shift;
323
324	my ($rec, $format_utf8, $i) = @_;
325
326	return if (! $format_utf8);
327
328	my $log = $self->_get_logger();
329
330	$log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
331
332	$i = 0 if (! $i);
333
334	my $format = $self->_x($format_utf8) \|\| $log->logconfess("can't convert '$format_utf8' from UTF-8 to ",$self->{'code_page'});
335
336	my @out;
337
338	$log->debug("format: $format");
339
340	my $eval_code;
341	# remove eval{...} from beginning
342	$eval_code = $1 if ($format =~ s/^eval{([^}]+)}//s);
343
344	my $filter_name;
345	# remove filter{...} from beginning
346	$filter_name = $1 if ($format =~ s/^filter{([^}]+)}//s);
347
348	my $prefix;
349	my $all_found=0;
350
351	while ($format =~ s/^(.*?)(v\|s)(\d+)(?:\^(\w))?//s) {
352
353	my $del = $1 \|\| '';
354	$prefix \|\|= $del if ($all_found == 0);
355
356	# repeatable index
357	my $r = $i;
358	$r = 0 if (lc("$2") eq 's');
359
360	my $found = 0;
361	my $tmp = $self->get_data(\$rec,$3,$4,$r,\$found);
362
363	if ($found) {
364	push @out, $del;
365	push @out, $tmp;
366	$all_found += $found;
367	}
368	}
369
370	return if (! $all_found);
371
372	my $out = join('',@out);
373
374	if ($out) {
375	# add rest of format (suffix)
376	$out .= $format;
377
378	# add prefix if not there
379	$out = $prefix . $out if ($out !~ m/^\Q$prefix\E/);
380
381	$log->debug("result: $out");
382	}
383
384	if ($eval_code) {
385	my $eval = $self->fill_in($rec,$eval_code,$i) \|\| return;
386	$log->debug("about to eval{$eval} format: $out");
387	return if (! $self->_eval($eval));
388	}
389
390	if ($filter_name && $self->{'filter'}->{$filter_name}) {
391	$log->debug("about to filter{$filter_name} format: $out");
392	$out = $self->{'filter'}->{$filter_name}->($out);
393	return unless(defined($out));
394	$log->debug("filter result: $out");
395	}
396
397	return $out;
398	}
399
400	=head2 parse_to_arr
401
402	Similar to C<parse>, but returns array of all repeatable fields
403
404	my @arr = $webpac->parse_to_arr($rec,'v250^a');
405
406	=cut
407
408	sub parse_to_arr {
409	my $self = shift;
410
411	my ($rec, $format_utf8) = @_;
412
413	my $log = $self->_get_logger();
414
415	$log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
416	return if (! $format_utf8);
417
418	my $i = 0;
419	my @arr;
420
421	while (my $v = $self->parse($rec,$format_utf8,$i++)) {
422	push @arr, $v;
423	}
424
425	$log->debug("format '$format_utf8' returned ",--$i," elements: ", sub { join(" \| ",@arr) }) if (@arr);
426
427	return @arr;
428	}
429
430	=head2 fill_in_to_arr
431
432	Similar to C<fill_in>, but returns array of all repeatable fields. Usable
433	for fields which have lookups, so they shouldn't be parsed but rather
434	C<fill_id>ed.
435
436	my @arr = $webpac->fill_in_to_arr($rec,'[v900];;[v250^a]');
437
438	=cut
439
440	sub fill_in_to_arr {
441	my $self = shift;
442
443	my ($rec, $format_utf8) = @_;
444
445	my $log = $self->_get_logger();
446
447	$log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
448	return if (! $format_utf8);
449
450	my $i = 0;
451	my @arr;
452
453	while (my @v = $self->fill_in($rec,$format_utf8,$i++)) {
454	push @arr, @v;
455	}
456
457	$log->debug("format '$format_utf8' returned ",--$i," elements: ", sub { join(" \| ",@arr) }) if (@arr);
458
459	return @arr;
460	}
461
462	=head2 sort_arr
463
464	Sort array ignoring case and html in data
465
466	my @sorted = $webpac->sort_arr(@unsorted);
467
468	=cut
469
470	sub sort_arr {
471	my $self = shift;
472
473	my $log = $self->_get_logger();
474
475	# FIXME add Schwartzian Transformation?
476
477	my @sorted = sort {
478	$a =~ s#<[^>]+/*>##;
479	$b =~ s#<[^>]+/*>##;
480	lc($b) cmp lc($a)
481	} @_;
482	$log->debug("sorted values: ",sub { join(", ",@sorted) });
483
484	return @sorted;
485	}
486
487
488	=head2 _sort_by_order
489
490	Sort xml tags data structure accoding to C<order=""> attribute.
491
492	=cut
493
494	sub _sort_by_order {
495	my $self = shift;
496
497	my $va = $self->{'import_xml'}->{'indexer'}->{$a}->{'order'} \|\|
498	$self->{'import_xml'}->{'indexer'}->{$a};
499	my $vb = $self->{'import_xml'}->{'indexer'}->{$b}->{'order'} \|\|
500	$self->{'import_xml'}->{'indexer'}->{$b};
501
502	return $va <=> $vb;
503	}
504
505	=head2 _x
506
507	Convert strings from C<conf/normalize> encoding into application specific
508	(optinally specified using C<code_page> to C<new> constructor.
509
510	my $text = $n->_x('normalize text string');
511
512	This is a stub so that other modules doesn't have to implement it.
513
514	=cut
515
516	sub _x {
517	my $self = shift;
518	return shift;
519	}
520
521
522	=head1 AUTHOR
523
524	Dobrica Pavlinusic, C<< <dpavlin@rot13.org> >>
525
526	=head1 COPYRIGHT & LICENSE
527
528	Copyright 2005 Dobrica Pavlinusic, All Rights Reserved.
529
530	This program is free software; you can redistribute it and/or modify it
531	under the same terms as Perl itself.
532
533	=cut
534
535	1; # End of WebPAC::DB