/[webpac2]/trunk/lib/WebPAC/Normalize.pm
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Annotation of /trunk/lib/WebPAC/Normalize.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 38 - (hide annotations)
Sat Nov 12 21:21:50 2005 UTC (18 years, 5 months ago) by dpavlin
File size: 15397 byte(s)
added ForceContent so that tags without attributes work, added strict checking

1 dpavlin 10 package WebPAC::Normalize;
2    
3     use warnings;
4     use strict;
5 dpavlin 29 use base 'WebPAC::Common';
6 dpavlin 13 use Data::Dumper;
7 dpavlin 10
8     =head1 NAME
9    
10 dpavlin 15 WebPAC::Normalize - data mungling for normalisation
11 dpavlin 10
12     =head1 VERSION
13    
14     Version 0.01
15    
16     =cut
17    
18     our $VERSION = '0.01';
19    
20     =head1 SYNOPSIS
21    
22 dpavlin 15 This package contains code that mungle data to produce normalized format.
23 dpavlin 10
24 dpavlin 15 It contains several assumptions:
25    
26     =over
27    
28     =item *
29    
30     format of fields is defined using C<v123^a> notation for repeatable fields
31     or C<s123^a> for single (or first) value, where C<123> is field number and
32     C<a> is subfield.
33    
34     =item *
35    
36     source data records (C<$rec>) have unique identifiers in field C<000>
37    
38     =item *
39    
40     optional C<eval{length('v123^a') == 3}> tag at B<beginning of format> will be
41     perl code that is evaluated before producing output (value of field will be
42     interpolated before that)
43    
44     =item *
45    
46     optional C<filter{filter_name}> at B<begining of format> will apply perl
47     code defined as code ref on format after field substitution to producing
48     output
49    
50     =item *
51    
52     optional C<lookup{...}> will be then performed. See C<WebPAC::Lookups>.
53    
54     =item *
55    
56     at end, optional C<format>s rules are resolved. Format rules are similar to
57     C<sprintf> and can also contain C<lookup{...}> which is performed after
58     values are inserted in format.
59    
60     =back
61    
62     This also describes order in which transformations are applied (eval,
63     filter, lookup, format) which is important to undestand when deciding how to
64     solve your data mungling and normalisation process.
65    
66    
67    
68    
69 dpavlin 10 =head1 FUNCTIONS
70    
71 dpavlin 13 =head2 new
72 dpavlin 10
73 dpavlin 13 Create new normalisation object
74    
75     my $n = new WebPAC::Normalize::Something(
76 dpavlin 15 filter => {
77     'filter_name_1' => sub {
78     # filter code
79     return length($_);
80     }, ...
81     },
82 dpavlin 29 db => $db_obj,
83 dpavlin 13 lookup_regex => $lookup->regex,
84 dpavlin 31 lookup => $lookup_obj,
85 dpavlin 13 );
86    
87 dpavlin 15 Parametar C<filter> defines user supplied snippets of perl code which can
88     be use with C<filter{...}> notation.
89    
90 dpavlin 13 Recommended parametar C<lookup_regex> is used to enable parsing of lookups
91 dpavlin 31 in structures. If you pass this parametar, you must also pass C<lookup>
92     which is C<WebPAC::Lookup> object.
93 dpavlin 13
94 dpavlin 10 =cut
95    
96 dpavlin 13 sub new {
97     my $class = shift;
98     my $self = {@_};
99     bless($self, $class);
100    
101 dpavlin 31 my $r = $self->{'lookup_regex'} ? 1 : 0;
102     my $l = $self->{'lookup'} ? 1 : 0;
103    
104     my $log = $self->_get_logger();
105    
106     # those two must be in pair
107     if ( ($r & $l) != ($r || $l) ) {
108     my $log = $self->_get_logger();
109     $log->logdie("lookup_regex and lookup must be in pair");
110     }
111    
112     $log->logdie("lookup must be WebPAC::Lookup object") if ($self->{'lookup'} && ! $self->{'lookup'}->isa('WebPAC::Lookup'));
113    
114 dpavlin 13 $self ? return $self : return undef;
115 dpavlin 10 }
116    
117 dpavlin 13
118     =head2 data_structure
119    
120     Create in-memory data structure which represents normalized layout from
121     C<conf/normalize/*.xml>.
122    
123     This structures are used to produce output.
124    
125     my @ds = $webpac->data_structure($rec);
126    
127     B<Note: historical oddity follows>
128    
129     This method will also set C<< $webpac->{'currnet_filename'} >> if there is
130     C<< <filename> >> tag and C<< $webpac->{'headline'} >> if there is
131     C<< <headline> >> tag.
132    
133     =cut
134    
135     sub data_structure {
136     my $self = shift;
137    
138     my $log = $self->_get_logger();
139    
140     my $rec = shift;
141     $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
142    
143     my $cache_file;
144    
145 dpavlin 18 if ($self->{'db'}) {
146 dpavlin 22 my @ds = $self->{'db'}->load_ds($rec);
147 dpavlin 29 $log->debug("load_ds( rec = ", sub { Dumper($rec) }, ") = ", sub { Dumper(@ds) });
148     return @ds if ($#ds > 0);
149     $log->debug("cache miss, creating");
150 dpavlin 13 }
151    
152     undef $self->{'currnet_filename'};
153     undef $self->{'headline'};
154    
155     my @sorted_tags;
156     if ($self->{tags_by_order}) {
157     @sorted_tags = @{$self->{tags_by_order}};
158     } else {
159     @sorted_tags = sort { $self->_sort_by_order } keys %{$self->{'import_xml'}->{'indexer'}};
160     $self->{tags_by_order} = \@sorted_tags;
161     }
162    
163     my @ds;
164    
165     $log->debug("tags: ",sub { join(", ",@sorted_tags) });
166    
167     foreach my $field (@sorted_tags) {
168    
169     my $row;
170    
171     #print "field $field [",$self->{'tag'},"] = ",Dumper($self->{'import_xml'}->{'indexer'}->{$field}->{$self->{'tag'}});
172    
173     foreach my $tag (@{$self->{'import_xml'}->{'indexer'}->{$field}->{$self->{'tag'}}}) {
174 dpavlin 38 my $format;
175 dpavlin 13
176 dpavlin 38 $log->logdie("expected tag HASH and got $tag") unless (ref($tag) eq 'HASH');
177     $format = $tag->{'value'} || $tag->{'content'};
178    
179 dpavlin 13 $log->debug("format: $format");
180    
181     my @v;
182     if ($self->{'lookup_regex'} && $format =~ $self->{'lookup_regex'}) {
183     @v = $self->fill_in_to_arr($rec,$format);
184     } else {
185     @v = $self->parse_to_arr($rec,$format);
186     }
187     next if (! @v);
188    
189     if ($tag->{'sort'}) {
190     @v = $self->sort_arr(@v);
191     }
192    
193     # use format?
194     if ($tag->{'format_name'}) {
195     @v = map { $self->apply_format($tag->{'format_name'},$tag->{'format_delimiter'},$_) } @v;
196     }
197    
198     if ($field eq 'filename') {
199     $self->{'current_filename'} = join('',@v);
200     $log->debug("filename: ",$self->{'current_filename'});
201     } elsif ($field eq 'headline') {
202     $self->{'headline'} .= join('',@v);
203     $log->debug("headline: ",$self->{'headline'});
204     next; # don't return headline in data_structure!
205     }
206    
207     # delimiter will join repeatable fields
208     if ($tag->{'delimiter'}) {
209     @v = ( join($tag->{'delimiter'}, @v) );
210     }
211    
212     # default types
213     my @types = qw(display swish);
214     # override by type attribute
215     @types = ( $tag->{'type'} ) if ($tag->{'type'});
216    
217     foreach my $type (@types) {
218     # append to previous line?
219     $log->debug("type: $type ",sub { join(" ",@v) }, $row->{'append'} || 'no append');
220     if ($tag->{'append'}) {
221    
222     # I will delimit appended part with
223     # delimiter (or ,)
224     my $d = $tag->{'delimiter'};
225     # default delimiter
226     $d ||= " ";
227    
228     my $last = pop @{$row->{$type}};
229     $d = "" if (! $last);
230     $last .= $d . join($d, @v);
231     push @{$row->{$type}}, $last;
232    
233     } else {
234     push @{$row->{$type}}, @v;
235     }
236     }
237    
238    
239     }
240    
241     if ($row) {
242     $row->{'tag'} = $field;
243    
244     # TODO: name_sigular, name_plural
245     my $name = $self->{'import_xml'}->{'indexer'}->{$field}->{'name'};
246     $row->{'name'} = $name ? $self->_x($name) : $field;
247    
248     # post-sort all values in field
249     if ($self->{'import_xml'}->{'indexer'}->{$field}->{'sort'}) {
250     $log->warn("sort at field tag not implemented");
251     }
252    
253     push @ds, $row;
254    
255     $log->debug("row $field: ",sub { Dumper($row) });
256     }
257    
258     }
259    
260 dpavlin 22 $self->{'db'}->save_ds(
261 dpavlin 18 ds => \@ds,
262     current_filename => $self->{'current_filename'},
263     headline => $self->{'headline'},
264     ) if ($self->{'db'});
265 dpavlin 13
266 dpavlin 29 $log->debug("ds: ", sub { Dumper(@ds) });
267    
268 dpavlin 13 return @ds;
269    
270     }
271    
272     =head2 parse
273    
274     Perform smart parsing of string, skipping delimiters for fields which aren't
275     defined. It can also eval code in format starting with C<eval{...}> and
276     return output or nothing depending on eval code.
277    
278     my $text = $webpac->parse($rec,'eval{"v901^a" eq "Deskriptor"}descriptor: v250^a', $i);
279    
280     =cut
281    
282     sub parse {
283     my $self = shift;
284    
285     my ($rec, $format_utf8, $i) = @_;
286    
287     return if (! $format_utf8);
288    
289     my $log = $self->_get_logger();
290    
291     $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
292    
293     $i = 0 if (! $i);
294    
295     my $format = $self->_x($format_utf8) || $log->logconfess("can't convert '$format_utf8' from UTF-8 to ",$self->{'code_page'});
296    
297     my @out;
298    
299     $log->debug("format: $format");
300    
301     my $eval_code;
302     # remove eval{...} from beginning
303     $eval_code = $1 if ($format =~ s/^eval{([^}]+)}//s);
304    
305     my $filter_name;
306     # remove filter{...} from beginning
307     $filter_name = $1 if ($format =~ s/^filter{([^}]+)}//s);
308    
309     my $prefix;
310     my $all_found=0;
311    
312     while ($format =~ s/^(.*?)(v|s)(\d+)(?:\^(\w))?//s) {
313    
314     my $del = $1 || '';
315     $prefix ||= $del if ($all_found == 0);
316    
317     # repeatable index
318     my $r = $i;
319     $r = 0 if (lc("$2") eq 's');
320    
321     my $found = 0;
322     my $tmp = $self->get_data(\$rec,$3,$4,$r,\$found);
323    
324     if ($found) {
325     push @out, $del;
326     push @out, $tmp;
327     $all_found += $found;
328     }
329     }
330    
331     return if (! $all_found);
332    
333     my $out = join('',@out);
334    
335     if ($out) {
336     # add rest of format (suffix)
337     $out .= $format;
338    
339     # add prefix if not there
340     $out = $prefix . $out if ($out !~ m/^\Q$prefix\E/);
341    
342     $log->debug("result: $out");
343     }
344    
345     if ($eval_code) {
346     my $eval = $self->fill_in($rec,$eval_code,$i) || return;
347     $log->debug("about to eval{$eval} format: $out");
348     return if (! $self->_eval($eval));
349     }
350    
351     if ($filter_name && $self->{'filter'}->{$filter_name}) {
352     $log->debug("about to filter{$filter_name} format: $out");
353     $out = $self->{'filter'}->{$filter_name}->($out);
354     return unless(defined($out));
355     $log->debug("filter result: $out");
356     }
357    
358     return $out;
359     }
360    
361     =head2 parse_to_arr
362    
363     Similar to C<parse>, but returns array of all repeatable fields
364    
365     my @arr = $webpac->parse_to_arr($rec,'v250^a');
366    
367     =cut
368    
369     sub parse_to_arr {
370     my $self = shift;
371    
372     my ($rec, $format_utf8) = @_;
373    
374     my $log = $self->_get_logger();
375    
376     $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
377     return if (! $format_utf8);
378    
379     my $i = 0;
380     my @arr;
381    
382     while (my $v = $self->parse($rec,$format_utf8,$i++)) {
383     push @arr, $v;
384     }
385    
386     $log->debug("format '$format_utf8' returned ",--$i," elements: ", sub { join(" | ",@arr) }) if (@arr);
387    
388     return @arr;
389     }
390    
391 dpavlin 15
392     =head2 fill_in
393    
394     Workhourse of all: takes record from in-memory structure of database and
395     strings with placeholders and returns string or array of with substituted
396     values from record.
397    
398     my $text = $webpac->fill_in($rec,'v250^a');
399    
400     Optional argument is ordinal number for repeatable fields. By default,
401     it's assume to be first repeatable field (fields are perl array, so first
402     element is 0).
403     Following example will read second value from repeatable field.
404    
405     my $text = $webpac->fill_in($rec,'Title: v250^a',1);
406    
407     This function B<does not> perform parsing of format to inteligenty skip
408     delimiters before fields which aren't used.
409    
410     This method will automatically decode UTF-8 string to local code page
411     if needed.
412    
413     =cut
414    
415     sub fill_in {
416     my $self = shift;
417    
418     my $log = $self->_get_logger();
419    
420     my $rec = shift || $log->logconfess("need data record");
421     my $format = shift || $log->logconfess("need format to parse");
422     # iteration (for repeatable fields)
423     my $i = shift || 0;
424    
425     $log->logdie("infitite loop in format $format") if ($i > ($self->{'max_mfn'} || 9999));
426    
427     # FIXME remove for speedup?
428     $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
429    
430     if (utf8::is_utf8($format)) {
431     $format = $self->_x($format);
432     }
433    
434     my $found = 0;
435    
436     my $eval_code;
437     # remove eval{...} from beginning
438     $eval_code = $1 if ($format =~ s/^eval{([^}]+)}//s);
439    
440     my $filter_name;
441     # remove filter{...} from beginning
442     $filter_name = $1 if ($format =~ s/^filter{([^}]+)}//s);
443    
444     # do actual replacement of placeholders
445     # repeatable fields
446     $format =~ s/v(\d+)(?:\^(\w))?/$self->get_data(\$rec,$1,$2,$i,\$found)/ges;
447     # non-repeatable fields
448     $format =~ s/s(\d+)(?:\^(\w))?/$self->get_data(\$rec,$1,$2,0,\$found)/ges;
449    
450     if ($found) {
451     $log->debug("format: $format");
452     if ($eval_code) {
453     my $eval = $self->fill_in($rec,$eval_code,$i);
454     return if (! $self->_eval($eval));
455     }
456     if ($filter_name && $self->{'filter'}->{$filter_name}) {
457     $log->debug("filter '$filter_name' for $format");
458     $format = $self->{'filter'}->{$filter_name}->($format);
459     return unless(defined($format));
460     $log->debug("filter result: $format");
461     }
462     # do we have lookups?
463     if ($self->{'lookup'}) {
464 dpavlin 31 if ($self->{'lookup'}->can('lookup')) {
465     return $self->{'lookup'}->lookup($format);
466     } else {
467     $log->warn("Have lookup object but can't invoke lookup method");
468     }
469 dpavlin 15 } else {
470     return $format;
471     }
472     } else {
473     return;
474     }
475     }
476    
477    
478 dpavlin 13 =head2 fill_in_to_arr
479    
480     Similar to C<fill_in>, but returns array of all repeatable fields. Usable
481     for fields which have lookups, so they shouldn't be parsed but rather
482     C<fill_id>ed.
483    
484     my @arr = $webpac->fill_in_to_arr($rec,'[v900];;[v250^a]');
485    
486     =cut
487    
488     sub fill_in_to_arr {
489     my $self = shift;
490    
491     my ($rec, $format_utf8) = @_;
492    
493     my $log = $self->_get_logger();
494    
495     $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
496     return if (! $format_utf8);
497    
498     my $i = 0;
499     my @arr;
500    
501     while (my @v = $self->fill_in($rec,$format_utf8,$i++)) {
502     push @arr, @v;
503     }
504    
505     $log->debug("format '$format_utf8' returned ",--$i," elements: ", sub { join(" | ",@arr) }) if (@arr);
506    
507     return @arr;
508     }
509    
510 dpavlin 15
511     =head2 get_data
512    
513     Returns value from record.
514    
515     my $text = $self->get_data(\$rec,$f,$sf,$i,\$found);
516    
517     Arguments are:
518     record reference C<$rec>,
519     field C<$f>,
520     optional subfiled C<$sf>,
521     index for repeatable values C<$i>.
522    
523     Optinal variable C<$found> will be incremeted if there
524     is field.
525    
526     Returns value or empty string.
527    
528     =cut
529    
530     sub get_data {
531     my $self = shift;
532    
533     my ($rec,$f,$sf,$i,$found) = @_;
534    
535     if ($$rec->{$f}) {
536     return '' if (! $$rec->{$f}->[$i]);
537     no strict 'refs';
538     if ($sf && $$rec->{$f}->[$i]->{$sf}) {
539     $$found++ if (defined($$found));
540     return $$rec->{$f}->[$i]->{$sf};
541     } elsif ($$rec->{$f}->[$i]) {
542     $$found++ if (defined($$found));
543     # it still might have subfield, just
544     # not specified, so we'll dump all
545     if ($$rec->{$f}->[$i] =~ /HASH/o) {
546     my $out;
547     foreach my $k (keys %{$$rec->{$f}->[$i]}) {
548     $out .= $$rec->{$f}->[$i]->{$k}." ";
549     }
550     return $out;
551     } else {
552     return $$rec->{$f}->[$i];
553     }
554     }
555     } else {
556     return '';
557     }
558     }
559    
560    
561     =head2 apply_format
562    
563     Apply format specified in tag with C<format_name="name"> and
564     C<format_delimiter=";;">.
565    
566     my $text = $webpac->apply_format($format_name,$format_delimiter,$data);
567    
568     Formats can contain C<lookup{...}> if you need them.
569    
570     =cut
571    
572     sub apply_format {
573     my $self = shift;
574    
575     my ($name,$delimiter,$data) = @_;
576    
577     my $log = $self->_get_logger();
578    
579     if (! $self->{'import_xml'}->{'format'}->{$name}) {
580     $log->warn("<format name=\"$name\"> is not defined in ",$self->{'import_xml_file'});
581     return $data;
582     }
583    
584     $log->warn("no delimiter for format $name") if (! $delimiter);
585    
586     my $format = $self->_x($self->{'import_xml'}->{'format'}->{$name}->{'content'}) || $log->logdie("can't find format '$name'");
587    
588     my @data = split(/\Q$delimiter\E/, $data);
589    
590     my $out = sprintf($format, @data);
591     $log->debug("using format $name [$format] on $data to produce: $out");
592    
593     if ($self->{'lookup_regex'} && $out =~ $self->{'lookup_regex'}) {
594 dpavlin 31 return $self->{'lookup'}->lookup($out);
595 dpavlin 15 } else {
596     return $out;
597     }
598    
599     }
600    
601 dpavlin 13 =head2 sort_arr
602    
603     Sort array ignoring case and html in data
604    
605     my @sorted = $webpac->sort_arr(@unsorted);
606    
607     =cut
608    
609     sub sort_arr {
610     my $self = shift;
611    
612     my $log = $self->_get_logger();
613    
614     # FIXME add Schwartzian Transformation?
615    
616     my @sorted = sort {
617     $a =~ s#<[^>]+/*>##;
618     $b =~ s#<[^>]+/*>##;
619     lc($b) cmp lc($a)
620     } @_;
621     $log->debug("sorted values: ",sub { join(", ",@sorted) });
622    
623     return @sorted;
624     }
625    
626    
627 dpavlin 15 =head1 INTERNAL METHODS
628    
629 dpavlin 13 =head2 _sort_by_order
630    
631     Sort xml tags data structure accoding to C<order=""> attribute.
632    
633     =cut
634    
635     sub _sort_by_order {
636     my $self = shift;
637    
638     my $va = $self->{'import_xml'}->{'indexer'}->{$a}->{'order'} ||
639     $self->{'import_xml'}->{'indexer'}->{$a};
640     my $vb = $self->{'import_xml'}->{'indexer'}->{$b}->{'order'} ||
641     $self->{'import_xml'}->{'indexer'}->{$b};
642    
643     return $va <=> $vb;
644     }
645    
646     =head2 _x
647    
648 dpavlin 15 Convert strings from C<conf/normalize/*.xml> encoding into application
649     specific encoding (optinally specified using C<code_page> to C<new>
650     constructor).
651 dpavlin 13
652     my $text = $n->_x('normalize text string');
653    
654     This is a stub so that other modules doesn't have to implement it.
655    
656     =cut
657    
658     sub _x {
659     my $self = shift;
660     return shift;
661     }
662    
663    
664 dpavlin 10 =head1 AUTHOR
665    
666     Dobrica Pavlinusic, C<< <dpavlin@rot13.org> >>
667    
668     =head1 COPYRIGHT & LICENSE
669    
670     Copyright 2005 Dobrica Pavlinusic, All Rights Reserved.
671    
672     This program is free software; you can redistribute it and/or modify it
673     under the same terms as Perl itself.
674    
675     =cut
676    
677     1; # End of WebPAC::DB

  ViewVC Help
Powered by ViewVC 1.1.26