/[webpac2]/trunk/lib/WebPAC/Normalize.pm
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Annotation of /trunk/lib/WebPAC/Normalize.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 39 - (hide annotations)
Sat Nov 12 21:31:47 2005 UTC (17 years, 2 months ago) by dpavlin
File size: 15540 byte(s)
check for current_filename and die if need (needs more work)

1 dpavlin 10 package WebPAC::Normalize;
2    
3     use warnings;
4     use strict;
5 dpavlin 29 use base 'WebPAC::Common';
6 dpavlin 13 use Data::Dumper;
7 dpavlin 10
8     =head1 NAME
9    
10 dpavlin 15 WebPAC::Normalize - data mungling for normalisation
11 dpavlin 10
12     =head1 VERSION
13    
14     Version 0.01
15    
16     =cut
17    
18     our $VERSION = '0.01';
19    
20     =head1 SYNOPSIS
21    
22 dpavlin 15 This package contains code that mungle data to produce normalized format.
23 dpavlin 10
24 dpavlin 15 It contains several assumptions:
25    
26     =over
27    
28     =item *
29    
30     format of fields is defined using C<v123^a> notation for repeatable fields
31     or C<s123^a> for single (or first) value, where C<123> is field number and
32     C<a> is subfield.
33    
34     =item *
35    
36     source data records (C<$rec>) have unique identifiers in field C<000>
37    
38     =item *
39    
40     optional C<eval{length('v123^a') == 3}> tag at B<beginning of format> will be
41     perl code that is evaluated before producing output (value of field will be
42     interpolated before that)
43    
44     =item *
45    
46     optional C<filter{filter_name}> at B<begining of format> will apply perl
47     code defined as code ref on format after field substitution to producing
48     output
49    
50     =item *
51    
52     optional C<lookup{...}> will be then performed. See C<WebPAC::Lookups>.
53    
54     =item *
55    
56     at end, optional C<format>s rules are resolved. Format rules are similar to
57     C<sprintf> and can also contain C<lookup{...}> which is performed after
58     values are inserted in format.
59    
60     =back
61    
62     This also describes order in which transformations are applied (eval,
63     filter, lookup, format) which is important to undestand when deciding how to
64     solve your data mungling and normalisation process.
65    
66    
67    
68    
69 dpavlin 10 =head1 FUNCTIONS
70    
71 dpavlin 13 =head2 new
72 dpavlin 10
73 dpavlin 13 Create new normalisation object
74    
75     my $n = new WebPAC::Normalize::Something(
76 dpavlin 15 filter => {
77     'filter_name_1' => sub {
78     # filter code
79     return length($_);
80     }, ...
81     },
82 dpavlin 29 db => $db_obj,
83 dpavlin 13 lookup_regex => $lookup->regex,
84 dpavlin 31 lookup => $lookup_obj,
85 dpavlin 13 );
86    
87 dpavlin 15 Parametar C<filter> defines user supplied snippets of perl code which can
88     be use with C<filter{...}> notation.
89    
90 dpavlin 13 Recommended parametar C<lookup_regex> is used to enable parsing of lookups
91 dpavlin 31 in structures. If you pass this parametar, you must also pass C<lookup>
92     which is C<WebPAC::Lookup> object.
93 dpavlin 13
94 dpavlin 10 =cut
95    
96 dpavlin 13 sub new {
97     my $class = shift;
98     my $self = {@_};
99     bless($self, $class);
100    
101 dpavlin 31 my $r = $self->{'lookup_regex'} ? 1 : 0;
102     my $l = $self->{'lookup'} ? 1 : 0;
103    
104     my $log = $self->_get_logger();
105    
106     # those two must be in pair
107     if ( ($r & $l) != ($r || $l) ) {
108     my $log = $self->_get_logger();
109     $log->logdie("lookup_regex and lookup must be in pair");
110     }
111    
112     $log->logdie("lookup must be WebPAC::Lookup object") if ($self->{'lookup'} && ! $self->{'lookup'}->isa('WebPAC::Lookup'));
113    
114 dpavlin 13 $self ? return $self : return undef;
115 dpavlin 10 }
116    
117 dpavlin 13
118     =head2 data_structure
119    
120     Create in-memory data structure which represents normalized layout from
121     C<conf/normalize/*.xml>.
122    
123     This structures are used to produce output.
124    
125     my @ds = $webpac->data_structure($rec);
126    
127     B<Note: historical oddity follows>
128    
129     This method will also set C<< $webpac->{'currnet_filename'} >> if there is
130     C<< <filename> >> tag and C<< $webpac->{'headline'} >> if there is
131     C<< <headline> >> tag.
132    
133     =cut
134    
135     sub data_structure {
136     my $self = shift;
137    
138     my $log = $self->_get_logger();
139    
140     my $rec = shift;
141     $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
142    
143     my $cache_file;
144    
145 dpavlin 18 if ($self->{'db'}) {
146 dpavlin 22 my @ds = $self->{'db'}->load_ds($rec);
147 dpavlin 29 $log->debug("load_ds( rec = ", sub { Dumper($rec) }, ") = ", sub { Dumper(@ds) });
148     return @ds if ($#ds > 0);
149     $log->debug("cache miss, creating");
150 dpavlin 13 }
151    
152     undef $self->{'currnet_filename'};
153     undef $self->{'headline'};
154    
155     my @sorted_tags;
156     if ($self->{tags_by_order}) {
157     @sorted_tags = @{$self->{tags_by_order}};
158     } else {
159     @sorted_tags = sort { $self->_sort_by_order } keys %{$self->{'import_xml'}->{'indexer'}};
160     $self->{tags_by_order} = \@sorted_tags;
161     }
162    
163     my @ds;
164    
165     $log->debug("tags: ",sub { join(", ",@sorted_tags) });
166    
167     foreach my $field (@sorted_tags) {
168    
169     my $row;
170    
171     #print "field $field [",$self->{'tag'},"] = ",Dumper($self->{'import_xml'}->{'indexer'}->{$field}->{$self->{'tag'}});
172    
173     foreach my $tag (@{$self->{'import_xml'}->{'indexer'}->{$field}->{$self->{'tag'}}}) {
174 dpavlin 38 my $format;
175 dpavlin 13
176 dpavlin 38 $log->logdie("expected tag HASH and got $tag") unless (ref($tag) eq 'HASH');
177     $format = $tag->{'value'} || $tag->{'content'};
178    
179 dpavlin 13 $log->debug("format: $format");
180    
181     my @v;
182     if ($self->{'lookup_regex'} && $format =~ $self->{'lookup_regex'}) {
183     @v = $self->fill_in_to_arr($rec,$format);
184     } else {
185     @v = $self->parse_to_arr($rec,$format);
186     }
187     next if (! @v);
188    
189     if ($tag->{'sort'}) {
190     @v = $self->sort_arr(@v);
191     }
192    
193     # use format?
194     if ($tag->{'format_name'}) {
195     @v = map { $self->apply_format($tag->{'format_name'},$tag->{'format_delimiter'},$_) } @v;
196     }
197    
198     if ($field eq 'filename') {
199     $self->{'current_filename'} = join('',@v);
200     $log->debug("filename: ",$self->{'current_filename'});
201     } elsif ($field eq 'headline') {
202     $self->{'headline'} .= join('',@v);
203     $log->debug("headline: ",$self->{'headline'});
204     next; # don't return headline in data_structure!
205     }
206    
207     # delimiter will join repeatable fields
208     if ($tag->{'delimiter'}) {
209     @v = ( join($tag->{'delimiter'}, @v) );
210     }
211    
212     # default types
213     my @types = qw(display swish);
214     # override by type attribute
215     @types = ( $tag->{'type'} ) if ($tag->{'type'});
216    
217     foreach my $type (@types) {
218     # append to previous line?
219     $log->debug("type: $type ",sub { join(" ",@v) }, $row->{'append'} || 'no append');
220     if ($tag->{'append'}) {
221    
222     # I will delimit appended part with
223     # delimiter (or ,)
224     my $d = $tag->{'delimiter'};
225     # default delimiter
226     $d ||= " ";
227    
228     my $last = pop @{$row->{$type}};
229     $d = "" if (! $last);
230     $last .= $d . join($d, @v);
231     push @{$row->{$type}}, $last;
232    
233     } else {
234     push @{$row->{$type}}, @v;
235     }
236     }
237    
238    
239     }
240    
241     if ($row) {
242     $row->{'tag'} = $field;
243    
244     # TODO: name_sigular, name_plural
245     my $name = $self->{'import_xml'}->{'indexer'}->{$field}->{'name'};
246     $row->{'name'} = $name ? $self->_x($name) : $field;
247    
248     # post-sort all values in field
249     if ($self->{'import_xml'}->{'indexer'}->{$field}->{'sort'}) {
250     $log->warn("sort at field tag not implemented");
251     }
252    
253     push @ds, $row;
254    
255     $log->debug("row $field: ",sub { Dumper($row) });
256     }
257    
258     }
259    
260 dpavlin 39 $log->logdie("there is no current_filename defined! Do you have filename tag in conf/normalize/?.xml") unless ($self->{'current_filename'});
261    
262 dpavlin 22 $self->{'db'}->save_ds(
263 dpavlin 18 ds => \@ds,
264     current_filename => $self->{'current_filename'},
265     headline => $self->{'headline'},
266     ) if ($self->{'db'});
267 dpavlin 13
268 dpavlin 29 $log->debug("ds: ", sub { Dumper(@ds) });
269    
270 dpavlin 13 return @ds;
271    
272     }
273    
274     =head2 parse
275    
276     Perform smart parsing of string, skipping delimiters for fields which aren't
277     defined. It can also eval code in format starting with C<eval{...}> and
278     return output or nothing depending on eval code.
279    
280     my $text = $webpac->parse($rec,'eval{"v901^a" eq "Deskriptor"}descriptor: v250^a', $i);
281    
282     =cut
283    
284     sub parse {
285     my $self = shift;
286    
287     my ($rec, $format_utf8, $i) = @_;
288    
289     return if (! $format_utf8);
290    
291     my $log = $self->_get_logger();
292    
293     $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
294    
295     $i = 0 if (! $i);
296    
297     my $format = $self->_x($format_utf8) || $log->logconfess("can't convert '$format_utf8' from UTF-8 to ",$self->{'code_page'});
298    
299     my @out;
300    
301     $log->debug("format: $format");
302    
303     my $eval_code;
304     # remove eval{...} from beginning
305     $eval_code = $1 if ($format =~ s/^eval{([^}]+)}//s);
306    
307     my $filter_name;
308     # remove filter{...} from beginning
309     $filter_name = $1 if ($format =~ s/^filter{([^}]+)}//s);
310    
311     my $prefix;
312     my $all_found=0;
313    
314     while ($format =~ s/^(.*?)(v|s)(\d+)(?:\^(\w))?//s) {
315    
316     my $del = $1 || '';
317     $prefix ||= $del if ($all_found == 0);
318    
319     # repeatable index
320     my $r = $i;
321     $r = 0 if (lc("$2") eq 's');
322    
323     my $found = 0;
324     my $tmp = $self->get_data(\$rec,$3,$4,$r,\$found);
325    
326     if ($found) {
327     push @out, $del;
328     push @out, $tmp;
329     $all_found += $found;
330     }
331     }
332    
333     return if (! $all_found);
334    
335     my $out = join('',@out);
336    
337     if ($out) {
338     # add rest of format (suffix)
339     $out .= $format;
340    
341     # add prefix if not there
342     $out = $prefix . $out if ($out !~ m/^\Q$prefix\E/);
343    
344     $log->debug("result: $out");
345     }
346    
347     if ($eval_code) {
348     my $eval = $self->fill_in($rec,$eval_code,$i) || return;
349     $log->debug("about to eval{$eval} format: $out");
350     return if (! $self->_eval($eval));
351     }
352    
353     if ($filter_name && $self->{'filter'}->{$filter_name}) {
354     $log->debug("about to filter{$filter_name} format: $out");
355     $out = $self->{'filter'}->{$filter_name}->($out);
356     return unless(defined($out));
357     $log->debug("filter result: $out");
358     }
359    
360     return $out;
361     }
362    
363     =head2 parse_to_arr
364    
365     Similar to C<parse>, but returns array of all repeatable fields
366    
367     my @arr = $webpac->parse_to_arr($rec,'v250^a');
368    
369     =cut
370    
371     sub parse_to_arr {
372     my $self = shift;
373    
374     my ($rec, $format_utf8) = @_;
375    
376     my $log = $self->_get_logger();
377    
378     $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
379     return if (! $format_utf8);
380    
381     my $i = 0;
382     my @arr;
383    
384     while (my $v = $self->parse($rec,$format_utf8,$i++)) {
385     push @arr, $v;
386     }
387    
388     $log->debug("format '$format_utf8' returned ",--$i," elements: ", sub { join(" | ",@arr) }) if (@arr);
389    
390     return @arr;
391     }
392    
393 dpavlin 15
394     =head2 fill_in
395    
396     Workhourse of all: takes record from in-memory structure of database and
397     strings with placeholders and returns string or array of with substituted
398     values from record.
399    
400     my $text = $webpac->fill_in($rec,'v250^a');
401    
402     Optional argument is ordinal number for repeatable fields. By default,
403     it's assume to be first repeatable field (fields are perl array, so first
404     element is 0).
405     Following example will read second value from repeatable field.
406    
407     my $text = $webpac->fill_in($rec,'Title: v250^a',1);
408    
409     This function B<does not> perform parsing of format to inteligenty skip
410     delimiters before fields which aren't used.
411    
412     This method will automatically decode UTF-8 string to local code page
413     if needed.
414    
415     =cut
416    
417     sub fill_in {
418     my $self = shift;
419    
420     my $log = $self->_get_logger();
421    
422     my $rec = shift || $log->logconfess("need data record");
423     my $format = shift || $log->logconfess("need format to parse");
424     # iteration (for repeatable fields)
425     my $i = shift || 0;
426    
427     $log->logdie("infitite loop in format $format") if ($i > ($self->{'max_mfn'} || 9999));
428    
429     # FIXME remove for speedup?
430     $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
431    
432     if (utf8::is_utf8($format)) {
433     $format = $self->_x($format);
434     }
435    
436     my $found = 0;
437    
438     my $eval_code;
439     # remove eval{...} from beginning
440     $eval_code = $1 if ($format =~ s/^eval{([^}]+)}//s);
441    
442     my $filter_name;
443     # remove filter{...} from beginning
444     $filter_name = $1 if ($format =~ s/^filter{([^}]+)}//s);
445    
446     # do actual replacement of placeholders
447     # repeatable fields
448     $format =~ s/v(\d+)(?:\^(\w))?/$self->get_data(\$rec,$1,$2,$i,\$found)/ges;
449     # non-repeatable fields
450     $format =~ s/s(\d+)(?:\^(\w))?/$self->get_data(\$rec,$1,$2,0,\$found)/ges;
451    
452     if ($found) {
453     $log->debug("format: $format");
454     if ($eval_code) {
455     my $eval = $self->fill_in($rec,$eval_code,$i);
456     return if (! $self->_eval($eval));
457     }
458     if ($filter_name && $self->{'filter'}->{$filter_name}) {
459     $log->debug("filter '$filter_name' for $format");
460     $format = $self->{'filter'}->{$filter_name}->($format);
461     return unless(defined($format));
462     $log->debug("filter result: $format");
463     }
464     # do we have lookups?
465     if ($self->{'lookup'}) {
466 dpavlin 31 if ($self->{'lookup'}->can('lookup')) {
467     return $self->{'lookup'}->lookup($format);
468     } else {
469     $log->warn("Have lookup object but can't invoke lookup method");
470     }
471 dpavlin 15 } else {
472     return $format;
473     }
474     } else {
475     return;
476     }
477     }
478    
479    
480 dpavlin 13 =head2 fill_in_to_arr
481    
482     Similar to C<fill_in>, but returns array of all repeatable fields. Usable
483     for fields which have lookups, so they shouldn't be parsed but rather
484     C<fill_id>ed.
485    
486     my @arr = $webpac->fill_in_to_arr($rec,'[v900];;[v250^a]');
487    
488     =cut
489    
490     sub fill_in_to_arr {
491     my $self = shift;
492    
493     my ($rec, $format_utf8) = @_;
494    
495     my $log = $self->_get_logger();
496    
497     $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
498     return if (! $format_utf8);
499    
500     my $i = 0;
501     my @arr;
502    
503     while (my @v = $self->fill_in($rec,$format_utf8,$i++)) {
504     push @arr, @v;
505     }
506    
507     $log->debug("format '$format_utf8' returned ",--$i," elements: ", sub { join(" | ",@arr) }) if (@arr);
508    
509     return @arr;
510     }
511    
512 dpavlin 15
513     =head2 get_data
514    
515     Returns value from record.
516    
517     my $text = $self->get_data(\$rec,$f,$sf,$i,\$found);
518    
519     Arguments are:
520     record reference C<$rec>,
521     field C<$f>,
522     optional subfiled C<$sf>,
523     index for repeatable values C<$i>.
524    
525     Optinal variable C<$found> will be incremeted if there
526     is field.
527    
528     Returns value or empty string.
529    
530     =cut
531    
532     sub get_data {
533     my $self = shift;
534    
535     my ($rec,$f,$sf,$i,$found) = @_;
536    
537     if ($$rec->{$f}) {
538     return '' if (! $$rec->{$f}->[$i]);
539     no strict 'refs';
540     if ($sf && $$rec->{$f}->[$i]->{$sf}) {
541     $$found++ if (defined($$found));
542     return $$rec->{$f}->[$i]->{$sf};
543     } elsif ($$rec->{$f}->[$i]) {
544     $$found++ if (defined($$found));
545     # it still might have subfield, just
546     # not specified, so we'll dump all
547     if ($$rec->{$f}->[$i] =~ /HASH/o) {
548     my $out;
549     foreach my $k (keys %{$$rec->{$f}->[$i]}) {
550     $out .= $$rec->{$f}->[$i]->{$k}." ";
551     }
552     return $out;
553     } else {
554     return $$rec->{$f}->[$i];
555     }
556     }
557     } else {
558     return '';
559     }
560     }
561    
562    
563     =head2 apply_format
564    
565     Apply format specified in tag with C<format_name="name"> and
566     C<format_delimiter=";;">.
567    
568     my $text = $webpac->apply_format($format_name,$format_delimiter,$data);
569    
570     Formats can contain C<lookup{...}> if you need them.
571    
572     =cut
573    
574     sub apply_format {
575     my $self = shift;
576    
577     my ($name,$delimiter,$data) = @_;
578    
579     my $log = $self->_get_logger();
580    
581     if (! $self->{'import_xml'}->{'format'}->{$name}) {
582     $log->warn("<format name=\"$name\"> is not defined in ",$self->{'import_xml_file'});
583     return $data;
584     }
585    
586     $log->warn("no delimiter for format $name") if (! $delimiter);
587    
588     my $format = $self->_x($self->{'import_xml'}->{'format'}->{$name}->{'content'}) || $log->logdie("can't find format '$name'");
589    
590     my @data = split(/\Q$delimiter\E/, $data);
591    
592     my $out = sprintf($format, @data);
593     $log->debug("using format $name [$format] on $data to produce: $out");
594    
595     if ($self->{'lookup_regex'} && $out =~ $self->{'lookup_regex'}) {
596 dpavlin 31 return $self->{'lookup'}->lookup($out);
597 dpavlin 15 } else {
598     return $out;
599     }
600    
601     }
602    
603 dpavlin 13 =head2 sort_arr
604    
605     Sort array ignoring case and html in data
606    
607     my @sorted = $webpac->sort_arr(@unsorted);
608    
609     =cut
610    
611     sub sort_arr {
612     my $self = shift;
613    
614     my $log = $self->_get_logger();
615    
616     # FIXME add Schwartzian Transformation?
617    
618     my @sorted = sort {
619     $a =~ s#<[^>]+/*>##;
620     $b =~ s#<[^>]+/*>##;
621     lc($b) cmp lc($a)
622     } @_;
623     $log->debug("sorted values: ",sub { join(", ",@sorted) });
624    
625     return @sorted;
626     }
627    
628    
629 dpavlin 15 =head1 INTERNAL METHODS
630    
631 dpavlin 13 =head2 _sort_by_order
632    
633     Sort xml tags data structure accoding to C<order=""> attribute.
634    
635     =cut
636    
637     sub _sort_by_order {
638     my $self = shift;
639    
640     my $va = $self->{'import_xml'}->{'indexer'}->{$a}->{'order'} ||
641     $self->{'import_xml'}->{'indexer'}->{$a};
642     my $vb = $self->{'import_xml'}->{'indexer'}->{$b}->{'order'} ||
643     $self->{'import_xml'}->{'indexer'}->{$b};
644    
645     return $va <=> $vb;
646     }
647    
648     =head2 _x
649    
650 dpavlin 15 Convert strings from C<conf/normalize/*.xml> encoding into application
651     specific encoding (optinally specified using C<code_page> to C<new>
652     constructor).
653 dpavlin 13
654     my $text = $n->_x('normalize text string');
655    
656     This is a stub so that other modules doesn't have to implement it.
657    
658     =cut
659    
660     sub _x {
661     my $self = shift;
662     return shift;
663     }
664    
665    
666 dpavlin 10 =head1 AUTHOR
667    
668     Dobrica Pavlinusic, C<< <dpavlin@rot13.org> >>
669    
670     =head1 COPYRIGHT & LICENSE
671    
672     Copyright 2005 Dobrica Pavlinusic, All Rights Reserved.
673    
674     This program is free software; you can redistribute it and/or modify it
675     under the same terms as Perl itself.
676    
677     =cut
678    
679     1; # End of WebPAC::DB

  ViewVC Help
Powered by ViewVC 1.1.26