/[webpac2]/trunk/lib/WebPAC/Normalize.pm
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Annotation of /trunk/lib/WebPAC/Normalize.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 14 - (hide annotations)
Sun Jul 17 00:04:25 2005 UTC (18 years, 9 months ago) by dpavlin
File size: 11728 byte(s)
small fixes

1 dpavlin 10 package WebPAC::Normalize;
2    
3     use warnings;
4     use strict;
5 dpavlin 13 use Data::Dumper;
6 dpavlin 14 use Storable;
7 dpavlin 10
8     =head1 NAME
9    
10     WebPAC::Normalize - normalisation of source file
11    
12     =head1 VERSION
13    
14     Version 0.01
15    
16     =cut
17    
18     our $VERSION = '0.01';
19    
20     =head1 SYNOPSIS
21    
22     This package contains code that could be helpful in implementing different
23     normalisation front-ends.
24    
25     =head1 FUNCTIONS
26    
27 dpavlin 13 =head2 new
28 dpavlin 10
29 dpavlin 13 Create new normalisation object
30    
31     my $n = new WebPAC::Normalize::Something(
32     cache_data_structure => './cache/ds/',
33     lookup_regex => $lookup->regex,
34     );
35    
36     Optional parameter C<cache_data_structure> defines path to directory
37     in which cache file for C<data_structure> call will be created.
38    
39     Recommended parametar C<lookup_regex> is used to enable parsing of lookups
40     in structures.
41    
42 dpavlin 10 =cut
43    
44 dpavlin 13 sub new {
45     my $class = shift;
46     my $self = {@_};
47     bless($self, $class);
48    
49     $self->setup_cache_dir( $self->{'cache_data_structure'} );
50    
51     $self ? return $self : return undef;
52 dpavlin 10 }
53    
54 dpavlin 13 =head2 setup_cache_dir
55    
56     Check if specified cache directory exist, and if not, disable caching.
57    
58     $setup_cache_dir('./cache/ds/');
59    
60     If you pass false or zero value to this function, it will disable
61     cacheing.
62    
63     =cut
64    
65     sub setup_cache_dir {
66     my $self = shift;
67    
68     my $dir = shift;
69    
70     my $log = $self->_get_logger();
71    
72     if ($dir) {
73     my $msg;
74     if (! -e $dir) {
75     $msg = "doesn't exist";
76     } elsif (! -d $dir) {
77     $msg = "is not directory";
78     } elsif (! -w $dir) {
79     $msg = "not writable";
80     }
81    
82     if ($msg) {
83     undef $self->{'cache_data_structure'};
84     $log->warn("cache_data_structure $dir $msg, disabling...");
85     } else {
86     $log->debug("using cache dir $dir");
87     }
88     } else {
89     $log->debug("disabling cache");
90     undef $self->{'cache_data_structure'};
91     }
92     }
93    
94    
95     =head2 data_structure
96    
97     Create in-memory data structure which represents normalized layout from
98     C<conf/normalize/*.xml>.
99    
100     This structures are used to produce output.
101    
102     my @ds = $webpac->data_structure($rec);
103    
104     B<Note: historical oddity follows>
105    
106     This method will also set C<< $webpac->{'currnet_filename'} >> if there is
107     C<< <filename> >> tag and C<< $webpac->{'headline'} >> if there is
108     C<< <headline> >> tag.
109    
110     =cut
111    
112     sub data_structure {
113     my $self = shift;
114    
115     my $log = $self->_get_logger();
116    
117     my $rec = shift;
118     $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
119    
120     my $cache_file;
121    
122     if (my $cache_path = $self->{'cache_data_structure'}) {
123     my $id = $rec->{'000'};
124     $id = $rec->{'000'}->[0] if ($id =~ m/^ARRAY/o);
125     unless (defined($id)) {
126     $log->warn("Can't use cache_data_structure on records without unique identifier in field 000");
127     undef $self->{'cache_data_structure'};
128     } else {
129     $cache_file = "$cache_path/$id";
130     if (-r $cache_file) {
131     my $ds_ref = retrieve($cache_file);
132     if ($ds_ref) {
133     $log->debug("cache hit: $cache_file");
134     my $ok = 1;
135     foreach my $f (qw(current_filename headline)) {
136     if ($ds_ref->{$f}) {
137     $self->{$f} = $ds_ref->{$f};
138     } else {
139     $ok = 0;
140     }
141     };
142     if ($ok && $ds_ref->{'ds'}) {
143     return @{ $ds_ref->{'ds'} };
144     } else {
145     $log->warn("cache_data_structure $cache_path corrupt. Use rm $cache_path/* to re-create it on next run!");
146     undef $self->{'cache_data_structure'};
147     }
148     }
149     }
150     }
151     }
152    
153     undef $self->{'currnet_filename'};
154     undef $self->{'headline'};
155    
156     my @sorted_tags;
157     if ($self->{tags_by_order}) {
158     @sorted_tags = @{$self->{tags_by_order}};
159     } else {
160     @sorted_tags = sort { $self->_sort_by_order } keys %{$self->{'import_xml'}->{'indexer'}};
161     $self->{tags_by_order} = \@sorted_tags;
162     }
163    
164     my @ds;
165    
166     $log->debug("tags: ",sub { join(", ",@sorted_tags) });
167    
168     foreach my $field (@sorted_tags) {
169    
170     my $row;
171    
172     #print "field $field [",$self->{'tag'},"] = ",Dumper($self->{'import_xml'}->{'indexer'}->{$field}->{$self->{'tag'}});
173    
174     foreach my $tag (@{$self->{'import_xml'}->{'indexer'}->{$field}->{$self->{'tag'}}}) {
175     my $format = $tag->{'value'} || $tag->{'content'};
176    
177     $log->debug("format: $format");
178    
179     my @v;
180     if ($self->{'lookup_regex'} && $format =~ $self->{'lookup_regex'}) {
181     @v = $self->fill_in_to_arr($rec,$format);
182     } else {
183     @v = $self->parse_to_arr($rec,$format);
184     }
185     next if (! @v);
186    
187     if ($tag->{'sort'}) {
188     @v = $self->sort_arr(@v);
189     }
190    
191     # use format?
192     if ($tag->{'format_name'}) {
193     @v = map { $self->apply_format($tag->{'format_name'},$tag->{'format_delimiter'},$_) } @v;
194     }
195    
196     if ($field eq 'filename') {
197     $self->{'current_filename'} = join('',@v);
198     $log->debug("filename: ",$self->{'current_filename'});
199     } elsif ($field eq 'headline') {
200     $self->{'headline'} .= join('',@v);
201     $log->debug("headline: ",$self->{'headline'});
202     next; # don't return headline in data_structure!
203     }
204    
205     # delimiter will join repeatable fields
206     if ($tag->{'delimiter'}) {
207     @v = ( join($tag->{'delimiter'}, @v) );
208     }
209    
210     # default types
211     my @types = qw(display swish);
212     # override by type attribute
213     @types = ( $tag->{'type'} ) if ($tag->{'type'});
214    
215     foreach my $type (@types) {
216     # append to previous line?
217     $log->debug("type: $type ",sub { join(" ",@v) }, $row->{'append'} || 'no append');
218     if ($tag->{'append'}) {
219    
220     # I will delimit appended part with
221     # delimiter (or ,)
222     my $d = $tag->{'delimiter'};
223     # default delimiter
224     $d ||= " ";
225    
226     my $last = pop @{$row->{$type}};
227     $d = "" if (! $last);
228     $last .= $d . join($d, @v);
229     push @{$row->{$type}}, $last;
230    
231     } else {
232     push @{$row->{$type}}, @v;
233     }
234     }
235    
236    
237     }
238    
239     if ($row) {
240     $row->{'tag'} = $field;
241    
242     # TODO: name_sigular, name_plural
243     my $name = $self->{'import_xml'}->{'indexer'}->{$field}->{'name'};
244     $row->{'name'} = $name ? $self->_x($name) : $field;
245    
246     # post-sort all values in field
247     if ($self->{'import_xml'}->{'indexer'}->{$field}->{'sort'}) {
248     $log->warn("sort at field tag not implemented");
249     }
250    
251     push @ds, $row;
252    
253     $log->debug("row $field: ",sub { Dumper($row) });
254     }
255    
256     }
257    
258     if ($cache_file) {
259     store {
260     ds => \@ds,
261     current_filename => $self->{'current_filename'},
262     headline => $self->{'headline'},
263     }, $cache_file;
264     $log->debug("created storable cache file $cache_file");
265     }
266    
267     return @ds;
268    
269     }
270    
271     =head2 apply_format
272    
273     Apply format specified in tag with C<format_name="name"> and
274     C<format_delimiter=";;">.
275    
276     my $text = $webpac->apply_format($format_name,$format_delimiter,$data);
277    
278     Formats can contain C<lookup{...}> if you need them.
279    
280     =cut
281    
282     sub apply_format {
283     my $self = shift;
284    
285     my ($name,$delimiter,$data) = @_;
286    
287     my $log = $self->_get_logger();
288    
289     if (! $self->{'import_xml'}->{'format'}->{$name}) {
290     $log->warn("<format name=\"$name\"> is not defined in ",$self->{'import_xml_file'});
291     return $data;
292     }
293    
294     $log->warn("no delimiter for format $name") if (! $delimiter);
295    
296     my $format = $self->_x($self->{'import_xml'}->{'format'}->{$name}->{'content'}) || $log->logdie("can't find format '$name'");
297    
298     my @data = split(/\Q$delimiter\E/, $data);
299    
300     my $out = sprintf($format, @data);
301     $log->debug("using format $name [$format] on $data to produce: $out");
302    
303     if ($self->{'lookup_regex'} && $out =~ $self->{'lookup_regex'}) {
304     return $self->lookup($out);
305     } else {
306     return $out;
307     }
308    
309     }
310    
311     =head2 parse
312    
313     Perform smart parsing of string, skipping delimiters for fields which aren't
314     defined. It can also eval code in format starting with C<eval{...}> and
315     return output or nothing depending on eval code.
316    
317     my $text = $webpac->parse($rec,'eval{"v901^a" eq "Deskriptor"}descriptor: v250^a', $i);
318    
319     =cut
320    
321     sub parse {
322     my $self = shift;
323    
324     my ($rec, $format_utf8, $i) = @_;
325    
326     return if (! $format_utf8);
327    
328     my $log = $self->_get_logger();
329    
330     $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
331    
332     $i = 0 if (! $i);
333    
334     my $format = $self->_x($format_utf8) || $log->logconfess("can't convert '$format_utf8' from UTF-8 to ",$self->{'code_page'});
335    
336     my @out;
337    
338     $log->debug("format: $format");
339    
340     my $eval_code;
341     # remove eval{...} from beginning
342     $eval_code = $1 if ($format =~ s/^eval{([^}]+)}//s);
343    
344     my $filter_name;
345     # remove filter{...} from beginning
346     $filter_name = $1 if ($format =~ s/^filter{([^}]+)}//s);
347    
348     my $prefix;
349     my $all_found=0;
350    
351     while ($format =~ s/^(.*?)(v|s)(\d+)(?:\^(\w))?//s) {
352    
353     my $del = $1 || '';
354     $prefix ||= $del if ($all_found == 0);
355    
356     # repeatable index
357     my $r = $i;
358     $r = 0 if (lc("$2") eq 's');
359    
360     my $found = 0;
361     my $tmp = $self->get_data(\$rec,$3,$4,$r,\$found);
362    
363     if ($found) {
364     push @out, $del;
365     push @out, $tmp;
366     $all_found += $found;
367     }
368     }
369    
370     return if (! $all_found);
371    
372     my $out = join('',@out);
373    
374     if ($out) {
375     # add rest of format (suffix)
376     $out .= $format;
377    
378     # add prefix if not there
379     $out = $prefix . $out if ($out !~ m/^\Q$prefix\E/);
380    
381     $log->debug("result: $out");
382     }
383    
384     if ($eval_code) {
385     my $eval = $self->fill_in($rec,$eval_code,$i) || return;
386     $log->debug("about to eval{$eval} format: $out");
387     return if (! $self->_eval($eval));
388     }
389    
390     if ($filter_name && $self->{'filter'}->{$filter_name}) {
391     $log->debug("about to filter{$filter_name} format: $out");
392     $out = $self->{'filter'}->{$filter_name}->($out);
393     return unless(defined($out));
394     $log->debug("filter result: $out");
395     }
396    
397     return $out;
398     }
399    
400     =head2 parse_to_arr
401    
402     Similar to C<parse>, but returns array of all repeatable fields
403    
404     my @arr = $webpac->parse_to_arr($rec,'v250^a');
405    
406     =cut
407    
408     sub parse_to_arr {
409     my $self = shift;
410    
411     my ($rec, $format_utf8) = @_;
412    
413     my $log = $self->_get_logger();
414    
415     $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
416     return if (! $format_utf8);
417    
418     my $i = 0;
419     my @arr;
420    
421     while (my $v = $self->parse($rec,$format_utf8,$i++)) {
422     push @arr, $v;
423     }
424    
425     $log->debug("format '$format_utf8' returned ",--$i," elements: ", sub { join(" | ",@arr) }) if (@arr);
426    
427     return @arr;
428     }
429    
430     =head2 fill_in_to_arr
431    
432     Similar to C<fill_in>, but returns array of all repeatable fields. Usable
433     for fields which have lookups, so they shouldn't be parsed but rather
434     C<fill_id>ed.
435    
436     my @arr = $webpac->fill_in_to_arr($rec,'[v900];;[v250^a]');
437    
438     =cut
439    
440     sub fill_in_to_arr {
441     my $self = shift;
442    
443     my ($rec, $format_utf8) = @_;
444    
445     my $log = $self->_get_logger();
446    
447     $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
448     return if (! $format_utf8);
449    
450     my $i = 0;
451     my @arr;
452    
453     while (my @v = $self->fill_in($rec,$format_utf8,$i++)) {
454     push @arr, @v;
455     }
456    
457     $log->debug("format '$format_utf8' returned ",--$i," elements: ", sub { join(" | ",@arr) }) if (@arr);
458    
459     return @arr;
460     }
461    
462     =head2 sort_arr
463    
464     Sort array ignoring case and html in data
465    
466     my @sorted = $webpac->sort_arr(@unsorted);
467    
468     =cut
469    
470     sub sort_arr {
471     my $self = shift;
472    
473     my $log = $self->_get_logger();
474    
475     # FIXME add Schwartzian Transformation?
476    
477     my @sorted = sort {
478     $a =~ s#<[^>]+/*>##;
479     $b =~ s#<[^>]+/*>##;
480     lc($b) cmp lc($a)
481     } @_;
482     $log->debug("sorted values: ",sub { join(", ",@sorted) });
483    
484     return @sorted;
485     }
486    
487    
488     =head2 _sort_by_order
489    
490     Sort xml tags data structure accoding to C<order=""> attribute.
491    
492     =cut
493    
494     sub _sort_by_order {
495     my $self = shift;
496    
497     my $va = $self->{'import_xml'}->{'indexer'}->{$a}->{'order'} ||
498     $self->{'import_xml'}->{'indexer'}->{$a};
499     my $vb = $self->{'import_xml'}->{'indexer'}->{$b}->{'order'} ||
500     $self->{'import_xml'}->{'indexer'}->{$b};
501    
502     return $va <=> $vb;
503     }
504    
505     =head2 _x
506    
507     Convert strings from C<conf/normalize> encoding into application specific
508     (optinally specified using C<code_page> to C<new> constructor.
509    
510     my $text = $n->_x('normalize text string');
511    
512     This is a stub so that other modules doesn't have to implement it.
513    
514     =cut
515    
516     sub _x {
517     my $self = shift;
518     return shift;
519     }
520    
521    
522 dpavlin 10 =head1 AUTHOR
523    
524     Dobrica Pavlinusic, C<< <dpavlin@rot13.org> >>
525    
526     =head1 COPYRIGHT & LICENSE
527    
528     Copyright 2005 Dobrica Pavlinusic, All Rights Reserved.
529    
530     This program is free software; you can redistribute it and/or modify it
531     under the same terms as Perl itself.
532    
533     =cut
534    
535     1; # End of WebPAC::DB

  ViewVC Help
Powered by ViewVC 1.1.26