/[webpac2]/trunk/lib/WebPAC/Normalize.pm
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Annotation of /trunk/lib/WebPAC/Normalize.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 31 - (hide annotations)
Sun Jul 24 15:03:11 2005 UTC (18 years, 9 months ago) by dpavlin
File size: 15304 byte(s)
re-worked logging, added no_log option to disable logging

1 dpavlin 10 package WebPAC::Normalize;
2    
3     use warnings;
4     use strict;
5 dpavlin 29 use base 'WebPAC::Common';
6 dpavlin 13 use Data::Dumper;
7 dpavlin 10
8     =head1 NAME
9    
10 dpavlin 15 WebPAC::Normalize - data mungling for normalisation
11 dpavlin 10
12     =head1 VERSION
13    
14     Version 0.01
15    
16     =cut
17    
18     our $VERSION = '0.01';
19    
20     =head1 SYNOPSIS
21    
22 dpavlin 15 This package contains code that mungle data to produce normalized format.
23 dpavlin 10
24 dpavlin 15 It contains several assumptions:
25    
26     =over
27    
28     =item *
29    
30     format of fields is defined using C<v123^a> notation for repeatable fields
31     or C<s123^a> for single (or first) value, where C<123> is field number and
32     C<a> is subfield.
33    
34     =item *
35    
36     source data records (C<$rec>) have unique identifiers in field C<000>
37    
38     =item *
39    
40     optional C<eval{length('v123^a') == 3}> tag at B<beginning of format> will be
41     perl code that is evaluated before producing output (value of field will be
42     interpolated before that)
43    
44     =item *
45    
46     optional C<filter{filter_name}> at B<begining of format> will apply perl
47     code defined as code ref on format after field substitution to producing
48     output
49    
50     =item *
51    
52     optional C<lookup{...}> will be then performed. See C<WebPAC::Lookups>.
53    
54     =item *
55    
56     at end, optional C<format>s rules are resolved. Format rules are similar to
57     C<sprintf> and can also contain C<lookup{...}> which is performed after
58     values are inserted in format.
59    
60     =back
61    
62     This also describes order in which transformations are applied (eval,
63     filter, lookup, format) which is important to undestand when deciding how to
64     solve your data mungling and normalisation process.
65    
66    
67    
68    
69 dpavlin 10 =head1 FUNCTIONS
70    
71 dpavlin 13 =head2 new
72 dpavlin 10
73 dpavlin 13 Create new normalisation object
74    
75     my $n = new WebPAC::Normalize::Something(
76 dpavlin 15 filter => {
77     'filter_name_1' => sub {
78     # filter code
79     return length($_);
80     }, ...
81     },
82 dpavlin 29 db => $db_obj,
83 dpavlin 13 lookup_regex => $lookup->regex,
84 dpavlin 31 lookup => $lookup_obj,
85 dpavlin 13 );
86    
87 dpavlin 15 Parametar C<filter> defines user supplied snippets of perl code which can
88     be use with C<filter{...}> notation.
89    
90 dpavlin 13 Recommended parametar C<lookup_regex> is used to enable parsing of lookups
91 dpavlin 31 in structures. If you pass this parametar, you must also pass C<lookup>
92     which is C<WebPAC::Lookup> object.
93 dpavlin 13
94 dpavlin 10 =cut
95    
96 dpavlin 13 sub new {
97     my $class = shift;
98     my $self = {@_};
99     bless($self, $class);
100    
101 dpavlin 31 my $r = $self->{'lookup_regex'} ? 1 : 0;
102     my $l = $self->{'lookup'} ? 1 : 0;
103    
104     my $log = $self->_get_logger();
105    
106     # those two must be in pair
107     if ( ($r & $l) != ($r || $l) ) {
108     my $log = $self->_get_logger();
109     $log->logdie("lookup_regex and lookup must be in pair");
110     }
111    
112     $log->logdie("lookup must be WebPAC::Lookup object") if ($self->{'lookup'} && ! $self->{'lookup'}->isa('WebPAC::Lookup'));
113    
114 dpavlin 13 $self ? return $self : return undef;
115 dpavlin 10 }
116    
117 dpavlin 13
118     =head2 data_structure
119    
120     Create in-memory data structure which represents normalized layout from
121     C<conf/normalize/*.xml>.
122    
123     This structures are used to produce output.
124    
125     my @ds = $webpac->data_structure($rec);
126    
127     B<Note: historical oddity follows>
128    
129     This method will also set C<< $webpac->{'currnet_filename'} >> if there is
130     C<< <filename> >> tag and C<< $webpac->{'headline'} >> if there is
131     C<< <headline> >> tag.
132    
133     =cut
134    
135     sub data_structure {
136     my $self = shift;
137    
138     my $log = $self->_get_logger();
139    
140     my $rec = shift;
141     $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
142    
143     my $cache_file;
144    
145 dpavlin 18 if ($self->{'db'}) {
146 dpavlin 22 my @ds = $self->{'db'}->load_ds($rec);
147 dpavlin 29 $log->debug("load_ds( rec = ", sub { Dumper($rec) }, ") = ", sub { Dumper(@ds) });
148     return @ds if ($#ds > 0);
149     $log->debug("cache miss, creating");
150 dpavlin 13 }
151    
152     undef $self->{'currnet_filename'};
153     undef $self->{'headline'};
154    
155     my @sorted_tags;
156     if ($self->{tags_by_order}) {
157     @sorted_tags = @{$self->{tags_by_order}};
158     } else {
159     @sorted_tags = sort { $self->_sort_by_order } keys %{$self->{'import_xml'}->{'indexer'}};
160     $self->{tags_by_order} = \@sorted_tags;
161     }
162    
163     my @ds;
164    
165     $log->debug("tags: ",sub { join(", ",@sorted_tags) });
166    
167     foreach my $field (@sorted_tags) {
168    
169     my $row;
170    
171     #print "field $field [",$self->{'tag'},"] = ",Dumper($self->{'import_xml'}->{'indexer'}->{$field}->{$self->{'tag'}});
172    
173     foreach my $tag (@{$self->{'import_xml'}->{'indexer'}->{$field}->{$self->{'tag'}}}) {
174     my $format = $tag->{'value'} || $tag->{'content'};
175    
176     $log->debug("format: $format");
177    
178     my @v;
179     if ($self->{'lookup_regex'} && $format =~ $self->{'lookup_regex'}) {
180     @v = $self->fill_in_to_arr($rec,$format);
181     } else {
182     @v = $self->parse_to_arr($rec,$format);
183     }
184     next if (! @v);
185    
186     if ($tag->{'sort'}) {
187     @v = $self->sort_arr(@v);
188     }
189    
190     # use format?
191     if ($tag->{'format_name'}) {
192     @v = map { $self->apply_format($tag->{'format_name'},$tag->{'format_delimiter'},$_) } @v;
193     }
194    
195     if ($field eq 'filename') {
196     $self->{'current_filename'} = join('',@v);
197     $log->debug("filename: ",$self->{'current_filename'});
198     } elsif ($field eq 'headline') {
199     $self->{'headline'} .= join('',@v);
200     $log->debug("headline: ",$self->{'headline'});
201     next; # don't return headline in data_structure!
202     }
203    
204     # delimiter will join repeatable fields
205     if ($tag->{'delimiter'}) {
206     @v = ( join($tag->{'delimiter'}, @v) );
207     }
208    
209     # default types
210     my @types = qw(display swish);
211     # override by type attribute
212     @types = ( $tag->{'type'} ) if ($tag->{'type'});
213    
214     foreach my $type (@types) {
215     # append to previous line?
216     $log->debug("type: $type ",sub { join(" ",@v) }, $row->{'append'} || 'no append');
217     if ($tag->{'append'}) {
218    
219     # I will delimit appended part with
220     # delimiter (or ,)
221     my $d = $tag->{'delimiter'};
222     # default delimiter
223     $d ||= " ";
224    
225     my $last = pop @{$row->{$type}};
226     $d = "" if (! $last);
227     $last .= $d . join($d, @v);
228     push @{$row->{$type}}, $last;
229    
230     } else {
231     push @{$row->{$type}}, @v;
232     }
233     }
234    
235    
236     }
237    
238     if ($row) {
239     $row->{'tag'} = $field;
240    
241     # TODO: name_sigular, name_plural
242     my $name = $self->{'import_xml'}->{'indexer'}->{$field}->{'name'};
243     $row->{'name'} = $name ? $self->_x($name) : $field;
244    
245     # post-sort all values in field
246     if ($self->{'import_xml'}->{'indexer'}->{$field}->{'sort'}) {
247     $log->warn("sort at field tag not implemented");
248     }
249    
250     push @ds, $row;
251    
252     $log->debug("row $field: ",sub { Dumper($row) });
253     }
254    
255     }
256    
257 dpavlin 22 $self->{'db'}->save_ds(
258 dpavlin 18 ds => \@ds,
259     current_filename => $self->{'current_filename'},
260     headline => $self->{'headline'},
261     ) if ($self->{'db'});
262 dpavlin 13
263 dpavlin 29 $log->debug("ds: ", sub { Dumper(@ds) });
264    
265 dpavlin 13 return @ds;
266    
267     }
268    
269     =head2 parse
270    
271     Perform smart parsing of string, skipping delimiters for fields which aren't
272     defined. It can also eval code in format starting with C<eval{...}> and
273     return output or nothing depending on eval code.
274    
275     my $text = $webpac->parse($rec,'eval{"v901^a" eq "Deskriptor"}descriptor: v250^a', $i);
276    
277     =cut
278    
279     sub parse {
280     my $self = shift;
281    
282     my ($rec, $format_utf8, $i) = @_;
283    
284     return if (! $format_utf8);
285    
286     my $log = $self->_get_logger();
287    
288     $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
289    
290     $i = 0 if (! $i);
291    
292     my $format = $self->_x($format_utf8) || $log->logconfess("can't convert '$format_utf8' from UTF-8 to ",$self->{'code_page'});
293    
294     my @out;
295    
296     $log->debug("format: $format");
297    
298     my $eval_code;
299     # remove eval{...} from beginning
300     $eval_code = $1 if ($format =~ s/^eval{([^}]+)}//s);
301    
302     my $filter_name;
303     # remove filter{...} from beginning
304     $filter_name = $1 if ($format =~ s/^filter{([^}]+)}//s);
305    
306     my $prefix;
307     my $all_found=0;
308    
309     while ($format =~ s/^(.*?)(v|s)(\d+)(?:\^(\w))?//s) {
310    
311     my $del = $1 || '';
312     $prefix ||= $del if ($all_found == 0);
313    
314     # repeatable index
315     my $r = $i;
316     $r = 0 if (lc("$2") eq 's');
317    
318     my $found = 0;
319     my $tmp = $self->get_data(\$rec,$3,$4,$r,\$found);
320    
321     if ($found) {
322     push @out, $del;
323     push @out, $tmp;
324     $all_found += $found;
325     }
326     }
327    
328     return if (! $all_found);
329    
330     my $out = join('',@out);
331    
332     if ($out) {
333     # add rest of format (suffix)
334     $out .= $format;
335    
336     # add prefix if not there
337     $out = $prefix . $out if ($out !~ m/^\Q$prefix\E/);
338    
339     $log->debug("result: $out");
340     }
341    
342     if ($eval_code) {
343     my $eval = $self->fill_in($rec,$eval_code,$i) || return;
344     $log->debug("about to eval{$eval} format: $out");
345     return if (! $self->_eval($eval));
346     }
347    
348     if ($filter_name && $self->{'filter'}->{$filter_name}) {
349     $log->debug("about to filter{$filter_name} format: $out");
350     $out = $self->{'filter'}->{$filter_name}->($out);
351     return unless(defined($out));
352     $log->debug("filter result: $out");
353     }
354    
355     return $out;
356     }
357    
358     =head2 parse_to_arr
359    
360     Similar to C<parse>, but returns array of all repeatable fields
361    
362     my @arr = $webpac->parse_to_arr($rec,'v250^a');
363    
364     =cut
365    
366     sub parse_to_arr {
367     my $self = shift;
368    
369     my ($rec, $format_utf8) = @_;
370    
371     my $log = $self->_get_logger();
372    
373     $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
374     return if (! $format_utf8);
375    
376     my $i = 0;
377     my @arr;
378    
379     while (my $v = $self->parse($rec,$format_utf8,$i++)) {
380     push @arr, $v;
381     }
382    
383     $log->debug("format '$format_utf8' returned ",--$i," elements: ", sub { join(" | ",@arr) }) if (@arr);
384    
385     return @arr;
386     }
387    
388 dpavlin 15
389     =head2 fill_in
390    
391     Workhourse of all: takes record from in-memory structure of database and
392     strings with placeholders and returns string or array of with substituted
393     values from record.
394    
395     my $text = $webpac->fill_in($rec,'v250^a');
396    
397     Optional argument is ordinal number for repeatable fields. By default,
398     it's assume to be first repeatable field (fields are perl array, so first
399     element is 0).
400     Following example will read second value from repeatable field.
401    
402     my $text = $webpac->fill_in($rec,'Title: v250^a',1);
403    
404     This function B<does not> perform parsing of format to inteligenty skip
405     delimiters before fields which aren't used.
406    
407     This method will automatically decode UTF-8 string to local code page
408     if needed.
409    
410     =cut
411    
412     sub fill_in {
413     my $self = shift;
414    
415     my $log = $self->_get_logger();
416    
417     my $rec = shift || $log->logconfess("need data record");
418     my $format = shift || $log->logconfess("need format to parse");
419     # iteration (for repeatable fields)
420     my $i = shift || 0;
421    
422     $log->logdie("infitite loop in format $format") if ($i > ($self->{'max_mfn'} || 9999));
423    
424     # FIXME remove for speedup?
425     $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
426    
427     if (utf8::is_utf8($format)) {
428     $format = $self->_x($format);
429     }
430    
431     my $found = 0;
432    
433     my $eval_code;
434     # remove eval{...} from beginning
435     $eval_code = $1 if ($format =~ s/^eval{([^}]+)}//s);
436    
437     my $filter_name;
438     # remove filter{...} from beginning
439     $filter_name = $1 if ($format =~ s/^filter{([^}]+)}//s);
440    
441     # do actual replacement of placeholders
442     # repeatable fields
443     $format =~ s/v(\d+)(?:\^(\w))?/$self->get_data(\$rec,$1,$2,$i,\$found)/ges;
444     # non-repeatable fields
445     $format =~ s/s(\d+)(?:\^(\w))?/$self->get_data(\$rec,$1,$2,0,\$found)/ges;
446    
447     if ($found) {
448     $log->debug("format: $format");
449     if ($eval_code) {
450     my $eval = $self->fill_in($rec,$eval_code,$i);
451     return if (! $self->_eval($eval));
452     }
453     if ($filter_name && $self->{'filter'}->{$filter_name}) {
454     $log->debug("filter '$filter_name' for $format");
455     $format = $self->{'filter'}->{$filter_name}->($format);
456     return unless(defined($format));
457     $log->debug("filter result: $format");
458     }
459     # do we have lookups?
460     if ($self->{'lookup'}) {
461 dpavlin 31 if ($self->{'lookup'}->can('lookup')) {
462     return $self->{'lookup'}->lookup($format);
463     } else {
464     $log->warn("Have lookup object but can't invoke lookup method");
465     }
466 dpavlin 15 } else {
467     return $format;
468     }
469     } else {
470     return;
471     }
472     }
473    
474    
475 dpavlin 13 =head2 fill_in_to_arr
476    
477     Similar to C<fill_in>, but returns array of all repeatable fields. Usable
478     for fields which have lookups, so they shouldn't be parsed but rather
479     C<fill_id>ed.
480    
481     my @arr = $webpac->fill_in_to_arr($rec,'[v900];;[v250^a]');
482    
483     =cut
484    
485     sub fill_in_to_arr {
486     my $self = shift;
487    
488     my ($rec, $format_utf8) = @_;
489    
490     my $log = $self->_get_logger();
491    
492     $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
493     return if (! $format_utf8);
494    
495     my $i = 0;
496     my @arr;
497    
498     while (my @v = $self->fill_in($rec,$format_utf8,$i++)) {
499     push @arr, @v;
500     }
501    
502     $log->debug("format '$format_utf8' returned ",--$i," elements: ", sub { join(" | ",@arr) }) if (@arr);
503    
504     return @arr;
505     }
506    
507 dpavlin 15
508     =head2 get_data
509    
510     Returns value from record.
511    
512     my $text = $self->get_data(\$rec,$f,$sf,$i,\$found);
513    
514     Arguments are:
515     record reference C<$rec>,
516     field C<$f>,
517     optional subfiled C<$sf>,
518     index for repeatable values C<$i>.
519    
520     Optinal variable C<$found> will be incremeted if there
521     is field.
522    
523     Returns value or empty string.
524    
525     =cut
526    
527     sub get_data {
528     my $self = shift;
529    
530     my ($rec,$f,$sf,$i,$found) = @_;
531    
532     if ($$rec->{$f}) {
533     return '' if (! $$rec->{$f}->[$i]);
534     no strict 'refs';
535     if ($sf && $$rec->{$f}->[$i]->{$sf}) {
536     $$found++ if (defined($$found));
537     return $$rec->{$f}->[$i]->{$sf};
538     } elsif ($$rec->{$f}->[$i]) {
539     $$found++ if (defined($$found));
540     # it still might have subfield, just
541     # not specified, so we'll dump all
542     if ($$rec->{$f}->[$i] =~ /HASH/o) {
543     my $out;
544     foreach my $k (keys %{$$rec->{$f}->[$i]}) {
545     $out .= $$rec->{$f}->[$i]->{$k}." ";
546     }
547     return $out;
548     } else {
549     return $$rec->{$f}->[$i];
550     }
551     }
552     } else {
553     return '';
554     }
555     }
556    
557    
558     =head2 apply_format
559    
560     Apply format specified in tag with C<format_name="name"> and
561     C<format_delimiter=";;">.
562    
563     my $text = $webpac->apply_format($format_name,$format_delimiter,$data);
564    
565     Formats can contain C<lookup{...}> if you need them.
566    
567     =cut
568    
569     sub apply_format {
570     my $self = shift;
571    
572     my ($name,$delimiter,$data) = @_;
573    
574     my $log = $self->_get_logger();
575    
576     if (! $self->{'import_xml'}->{'format'}->{$name}) {
577     $log->warn("<format name=\"$name\"> is not defined in ",$self->{'import_xml_file'});
578     return $data;
579     }
580    
581     $log->warn("no delimiter for format $name") if (! $delimiter);
582    
583     my $format = $self->_x($self->{'import_xml'}->{'format'}->{$name}->{'content'}) || $log->logdie("can't find format '$name'");
584    
585     my @data = split(/\Q$delimiter\E/, $data);
586    
587     my $out = sprintf($format, @data);
588     $log->debug("using format $name [$format] on $data to produce: $out");
589    
590     if ($self->{'lookup_regex'} && $out =~ $self->{'lookup_regex'}) {
591 dpavlin 31 return $self->{'lookup'}->lookup($out);
592 dpavlin 15 } else {
593     return $out;
594     }
595    
596     }
597    
598 dpavlin 13 =head2 sort_arr
599    
600     Sort array ignoring case and html in data
601    
602     my @sorted = $webpac->sort_arr(@unsorted);
603    
604     =cut
605    
606     sub sort_arr {
607     my $self = shift;
608    
609     my $log = $self->_get_logger();
610    
611     # FIXME add Schwartzian Transformation?
612    
613     my @sorted = sort {
614     $a =~ s#<[^>]+/*>##;
615     $b =~ s#<[^>]+/*>##;
616     lc($b) cmp lc($a)
617     } @_;
618     $log->debug("sorted values: ",sub { join(", ",@sorted) });
619    
620     return @sorted;
621     }
622    
623    
624 dpavlin 15 =head1 INTERNAL METHODS
625    
626 dpavlin 13 =head2 _sort_by_order
627    
628     Sort xml tags data structure accoding to C<order=""> attribute.
629    
630     =cut
631    
632     sub _sort_by_order {
633     my $self = shift;
634    
635     my $va = $self->{'import_xml'}->{'indexer'}->{$a}->{'order'} ||
636     $self->{'import_xml'}->{'indexer'}->{$a};
637     my $vb = $self->{'import_xml'}->{'indexer'}->{$b}->{'order'} ||
638     $self->{'import_xml'}->{'indexer'}->{$b};
639    
640     return $va <=> $vb;
641     }
642    
643     =head2 _x
644    
645 dpavlin 15 Convert strings from C<conf/normalize/*.xml> encoding into application
646     specific encoding (optinally specified using C<code_page> to C<new>
647     constructor).
648 dpavlin 13
649     my $text = $n->_x('normalize text string');
650    
651     This is a stub so that other modules doesn't have to implement it.
652    
653     =cut
654    
655     sub _x {
656     my $self = shift;
657     return shift;
658     }
659    
660    
661 dpavlin 10 =head1 AUTHOR
662    
663     Dobrica Pavlinusic, C<< <dpavlin@rot13.org> >>
664    
665     =head1 COPYRIGHT & LICENSE
666    
667     Copyright 2005 Dobrica Pavlinusic, All Rights Reserved.
668    
669     This program is free software; you can redistribute it and/or modify it
670     under the same terms as Perl itself.
671    
672     =cut
673    
674     1; # End of WebPAC::DB

  ViewVC Help
Powered by ViewVC 1.1.26