/[webpac2]/trunk/lib/WebPAC/Normalize.pm
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Annotation of /trunk/lib/WebPAC/Normalize.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 74 - (hide annotations)
Sun Nov 20 20:13:39 2005 UTC (18 years, 4 months ago) by dpavlin
File size: 15659 byte(s)
 r8988@llin:  dpavlin | 2005-11-20 20:46:12 +0100
 added real implementation for WebPAC::Output::Estraier along with run.pl
 script which run test indexing (which will in one point move to
 WebPAC::Simple or something like that)

1 dpavlin 10 package WebPAC::Normalize;
2    
3     use warnings;
4     use strict;
5 dpavlin 29 use base 'WebPAC::Common';
6 dpavlin 13 use Data::Dumper;
7 dpavlin 10
8     =head1 NAME
9    
10 dpavlin 15 WebPAC::Normalize - data mungling for normalisation
11 dpavlin 10
12     =head1 VERSION
13    
14 dpavlin 70 Version 0.02
15 dpavlin 10
16     =cut
17    
18 dpavlin 70 our $VERSION = '0.02';
19 dpavlin 10
20     =head1 SYNOPSIS
21    
22 dpavlin 15 This package contains code that mungle data to produce normalized format.
23 dpavlin 10
24 dpavlin 15 It contains several assumptions:
25    
26     =over
27    
28     =item *
29    
30     format of fields is defined using C<v123^a> notation for repeatable fields
31     or C<s123^a> for single (or first) value, where C<123> is field number and
32     C<a> is subfield.
33    
34     =item *
35    
36     source data records (C<$rec>) have unique identifiers in field C<000>
37    
38     =item *
39    
40     optional C<eval{length('v123^a') == 3}> tag at B<beginning of format> will be
41     perl code that is evaluated before producing output (value of field will be
42     interpolated before that)
43    
44     =item *
45    
46     optional C<filter{filter_name}> at B<begining of format> will apply perl
47     code defined as code ref on format after field substitution to producing
48     output
49    
50     =item *
51    
52     optional C<lookup{...}> will be then performed. See C<WebPAC::Lookups>.
53    
54     =item *
55    
56     at end, optional C<format>s rules are resolved. Format rules are similar to
57     C<sprintf> and can also contain C<lookup{...}> which is performed after
58     values are inserted in format.
59    
60     =back
61    
62     This also describes order in which transformations are applied (eval,
63     filter, lookup, format) which is important to undestand when deciding how to
64     solve your data mungling and normalisation process.
65    
66    
67    
68    
69 dpavlin 10 =head1 FUNCTIONS
70    
71 dpavlin 13 =head2 new
72 dpavlin 10
73 dpavlin 13 Create new normalisation object
74    
75     my $n = new WebPAC::Normalize::Something(
76 dpavlin 15 filter => {
77     'filter_name_1' => sub {
78     # filter code
79     return length($_);
80     }, ...
81     },
82 dpavlin 29 db => $db_obj,
83 dpavlin 13 lookup_regex => $lookup->regex,
84 dpavlin 31 lookup => $lookup_obj,
85 dpavlin 13 );
86    
87 dpavlin 15 Parametar C<filter> defines user supplied snippets of perl code which can
88     be use with C<filter{...}> notation.
89    
90 dpavlin 13 Recommended parametar C<lookup_regex> is used to enable parsing of lookups
91 dpavlin 31 in structures. If you pass this parametar, you must also pass C<lookup>
92     which is C<WebPAC::Lookup> object.
93 dpavlin 13
94 dpavlin 10 =cut
95    
96 dpavlin 13 sub new {
97     my $class = shift;
98     my $self = {@_};
99     bless($self, $class);
100    
101 dpavlin 31 my $r = $self->{'lookup_regex'} ? 1 : 0;
102     my $l = $self->{'lookup'} ? 1 : 0;
103    
104     my $log = $self->_get_logger();
105    
106     # those two must be in pair
107     if ( ($r & $l) != ($r || $l) ) {
108     my $log = $self->_get_logger();
109     $log->logdie("lookup_regex and lookup must be in pair");
110     }
111    
112     $log->logdie("lookup must be WebPAC::Lookup object") if ($self->{'lookup'} && ! $self->{'lookup'}->isa('WebPAC::Lookup'));
113    
114 dpavlin 13 $self ? return $self : return undef;
115 dpavlin 10 }
116    
117 dpavlin 13
118     =head2 data_structure
119    
120     Create in-memory data structure which represents normalized layout from
121     C<conf/normalize/*.xml>.
122    
123     This structures are used to produce output.
124    
125 dpavlin 70 my $ds = $webpac->data_structure($rec);
126 dpavlin 13
127     B<Note: historical oddity follows>
128    
129     This method will also set C<< $webpac->{'currnet_filename'} >> if there is
130     C<< <filename> >> tag and C<< $webpac->{'headline'} >> if there is
131     C<< <headline> >> tag.
132    
133     =cut
134    
135     sub data_structure {
136     my $self = shift;
137    
138     my $log = $self->_get_logger();
139    
140     my $rec = shift;
141     $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
142    
143     my $cache_file;
144    
145 dpavlin 18 if ($self->{'db'}) {
146 dpavlin 70 my $ds = $self->{'db'}->load_ds($rec);
147     $log->debug("load_ds( rec = ", sub { Dumper($rec) }, ") = ", sub { Dumper($ds) });
148     return $ds if ($ds);
149 dpavlin 29 $log->debug("cache miss, creating");
150 dpavlin 13 }
151    
152     undef $self->{'currnet_filename'};
153     undef $self->{'headline'};
154    
155     my @sorted_tags;
156     if ($self->{tags_by_order}) {
157     @sorted_tags = @{$self->{tags_by_order}};
158     } else {
159     @sorted_tags = sort { $self->_sort_by_order } keys %{$self->{'import_xml'}->{'indexer'}};
160     $self->{tags_by_order} = \@sorted_tags;
161     }
162    
163 dpavlin 70 my $ds;
164 dpavlin 13
165     $log->debug("tags: ",sub { join(", ",@sorted_tags) });
166    
167     foreach my $field (@sorted_tags) {
168    
169     my $row;
170    
171     #print "field $field [",$self->{'tag'},"] = ",Dumper($self->{'import_xml'}->{'indexer'}->{$field}->{$self->{'tag'}});
172    
173     foreach my $tag (@{$self->{'import_xml'}->{'indexer'}->{$field}->{$self->{'tag'}}}) {
174 dpavlin 38 my $format;
175 dpavlin 13
176 dpavlin 38 $log->logdie("expected tag HASH and got $tag") unless (ref($tag) eq 'HASH');
177     $format = $tag->{'value'} || $tag->{'content'};
178    
179 dpavlin 13 $log->debug("format: $format");
180    
181     my @v;
182     if ($self->{'lookup_regex'} && $format =~ $self->{'lookup_regex'}) {
183     @v = $self->fill_in_to_arr($rec,$format);
184     } else {
185     @v = $self->parse_to_arr($rec,$format);
186     }
187     next if (! @v);
188    
189     if ($tag->{'sort'}) {
190     @v = $self->sort_arr(@v);
191     }
192    
193     # use format?
194     if ($tag->{'format_name'}) {
195     @v = map { $self->apply_format($tag->{'format_name'},$tag->{'format_delimiter'},$_) } @v;
196     }
197    
198     if ($field eq 'filename') {
199     $self->{'current_filename'} = join('',@v);
200     $log->debug("filename: ",$self->{'current_filename'});
201     } elsif ($field eq 'headline') {
202     $self->{'headline'} .= join('',@v);
203     $log->debug("headline: ",$self->{'headline'});
204     next; # don't return headline in data_structure!
205     }
206    
207     # delimiter will join repeatable fields
208     if ($tag->{'delimiter'}) {
209     @v = ( join($tag->{'delimiter'}, @v) );
210     }
211    
212     # default types
213 dpavlin 74 my @types = qw(display search);
214 dpavlin 13 # override by type attribute
215     @types = ( $tag->{'type'} ) if ($tag->{'type'});
216    
217     foreach my $type (@types) {
218     # append to previous line?
219     $log->debug("type: $type ",sub { join(" ",@v) }, $row->{'append'} || 'no append');
220     if ($tag->{'append'}) {
221    
222     # I will delimit appended part with
223     # delimiter (or ,)
224     my $d = $tag->{'delimiter'};
225     # default delimiter
226     $d ||= " ";
227    
228     my $last = pop @{$row->{$type}};
229     $d = "" if (! $last);
230     $last .= $d . join($d, @v);
231     push @{$row->{$type}}, $last;
232    
233     } else {
234     push @{$row->{$type}}, @v;
235     }
236     }
237    
238    
239     }
240    
241     if ($row) {
242     $row->{'tag'} = $field;
243    
244     # TODO: name_sigular, name_plural
245     my $name = $self->{'import_xml'}->{'indexer'}->{$field}->{'name'};
246 dpavlin 70 my $row_name = $name ? $self->_x($name) : $field;
247 dpavlin 13
248     # post-sort all values in field
249     if ($self->{'import_xml'}->{'indexer'}->{$field}->{'sort'}) {
250     $log->warn("sort at field tag not implemented");
251     }
252    
253 dpavlin 70 $ds->{$row_name} = $row;
254 dpavlin 13
255     $log->debug("row $field: ",sub { Dumper($row) });
256     }
257    
258     }
259    
260 dpavlin 39 $log->logdie("there is no current_filename defined! Do you have filename tag in conf/normalize/?.xml") unless ($self->{'current_filename'});
261    
262 dpavlin 22 $self->{'db'}->save_ds(
263 dpavlin 70 ds => $ds,
264 dpavlin 18 current_filename => $self->{'current_filename'},
265     headline => $self->{'headline'},
266     ) if ($self->{'db'});
267 dpavlin 13
268 dpavlin 70 $log->debug("ds: ", sub { Dumper($ds) });
269 dpavlin 29
270 dpavlin 70 $log->logconfess("data structure returned is not array any more!") if wantarray;
271 dpavlin 13
272 dpavlin 70 return $ds;
273    
274 dpavlin 13 }
275    
276     =head2 parse
277    
278     Perform smart parsing of string, skipping delimiters for fields which aren't
279     defined. It can also eval code in format starting with C<eval{...}> and
280     return output or nothing depending on eval code.
281    
282     my $text = $webpac->parse($rec,'eval{"v901^a" eq "Deskriptor"}descriptor: v250^a', $i);
283    
284     =cut
285    
286     sub parse {
287     my $self = shift;
288    
289     my ($rec, $format_utf8, $i) = @_;
290    
291     return if (! $format_utf8);
292    
293     my $log = $self->_get_logger();
294    
295     $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
296    
297     $i = 0 if (! $i);
298    
299     my $format = $self->_x($format_utf8) || $log->logconfess("can't convert '$format_utf8' from UTF-8 to ",$self->{'code_page'});
300    
301     my @out;
302    
303     $log->debug("format: $format");
304    
305     my $eval_code;
306     # remove eval{...} from beginning
307     $eval_code = $1 if ($format =~ s/^eval{([^}]+)}//s);
308    
309     my $filter_name;
310     # remove filter{...} from beginning
311     $filter_name = $1 if ($format =~ s/^filter{([^}]+)}//s);
312    
313     my $prefix;
314     my $all_found=0;
315    
316     while ($format =~ s/^(.*?)(v|s)(\d+)(?:\^(\w))?//s) {
317    
318     my $del = $1 || '';
319     $prefix ||= $del if ($all_found == 0);
320    
321     # repeatable index
322     my $r = $i;
323     $r = 0 if (lc("$2") eq 's');
324    
325     my $found = 0;
326     my $tmp = $self->get_data(\$rec,$3,$4,$r,\$found);
327    
328     if ($found) {
329     push @out, $del;
330     push @out, $tmp;
331     $all_found += $found;
332     }
333     }
334    
335     return if (! $all_found);
336    
337     my $out = join('',@out);
338    
339     if ($out) {
340     # add rest of format (suffix)
341     $out .= $format;
342    
343     # add prefix if not there
344     $out = $prefix . $out if ($out !~ m/^\Q$prefix\E/);
345    
346     $log->debug("result: $out");
347     }
348    
349     if ($eval_code) {
350     my $eval = $self->fill_in($rec,$eval_code,$i) || return;
351     $log->debug("about to eval{$eval} format: $out");
352     return if (! $self->_eval($eval));
353     }
354    
355     if ($filter_name && $self->{'filter'}->{$filter_name}) {
356     $log->debug("about to filter{$filter_name} format: $out");
357     $out = $self->{'filter'}->{$filter_name}->($out);
358     return unless(defined($out));
359     $log->debug("filter result: $out");
360     }
361    
362     return $out;
363     }
364    
365     =head2 parse_to_arr
366    
367     Similar to C<parse>, but returns array of all repeatable fields
368    
369     my @arr = $webpac->parse_to_arr($rec,'v250^a');
370    
371     =cut
372    
373     sub parse_to_arr {
374     my $self = shift;
375    
376     my ($rec, $format_utf8) = @_;
377    
378     my $log = $self->_get_logger();
379    
380     $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
381     return if (! $format_utf8);
382    
383     my $i = 0;
384     my @arr;
385    
386     while (my $v = $self->parse($rec,$format_utf8,$i++)) {
387     push @arr, $v;
388     }
389    
390     $log->debug("format '$format_utf8' returned ",--$i," elements: ", sub { join(" | ",@arr) }) if (@arr);
391    
392     return @arr;
393     }
394    
395 dpavlin 15
396     =head2 fill_in
397    
398     Workhourse of all: takes record from in-memory structure of database and
399     strings with placeholders and returns string or array of with substituted
400     values from record.
401    
402     my $text = $webpac->fill_in($rec,'v250^a');
403    
404     Optional argument is ordinal number for repeatable fields. By default,
405     it's assume to be first repeatable field (fields are perl array, so first
406     element is 0).
407     Following example will read second value from repeatable field.
408    
409     my $text = $webpac->fill_in($rec,'Title: v250^a',1);
410    
411     This function B<does not> perform parsing of format to inteligenty skip
412     delimiters before fields which aren't used.
413    
414     This method will automatically decode UTF-8 string to local code page
415     if needed.
416    
417     =cut
418    
419     sub fill_in {
420     my $self = shift;
421    
422     my $log = $self->_get_logger();
423    
424     my $rec = shift || $log->logconfess("need data record");
425     my $format = shift || $log->logconfess("need format to parse");
426     # iteration (for repeatable fields)
427     my $i = shift || 0;
428    
429     $log->logdie("infitite loop in format $format") if ($i > ($self->{'max_mfn'} || 9999));
430    
431     # FIXME remove for speedup?
432     $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
433    
434     if (utf8::is_utf8($format)) {
435     $format = $self->_x($format);
436     }
437    
438     my $found = 0;
439    
440     my $eval_code;
441     # remove eval{...} from beginning
442     $eval_code = $1 if ($format =~ s/^eval{([^}]+)}//s);
443    
444     my $filter_name;
445     # remove filter{...} from beginning
446     $filter_name = $1 if ($format =~ s/^filter{([^}]+)}//s);
447    
448     # do actual replacement of placeholders
449     # repeatable fields
450     $format =~ s/v(\d+)(?:\^(\w))?/$self->get_data(\$rec,$1,$2,$i,\$found)/ges;
451     # non-repeatable fields
452     $format =~ s/s(\d+)(?:\^(\w))?/$self->get_data(\$rec,$1,$2,0,\$found)/ges;
453    
454     if ($found) {
455     $log->debug("format: $format");
456     if ($eval_code) {
457     my $eval = $self->fill_in($rec,$eval_code,$i);
458     return if (! $self->_eval($eval));
459     }
460     if ($filter_name && $self->{'filter'}->{$filter_name}) {
461     $log->debug("filter '$filter_name' for $format");
462     $format = $self->{'filter'}->{$filter_name}->($format);
463     return unless(defined($format));
464     $log->debug("filter result: $format");
465     }
466     # do we have lookups?
467     if ($self->{'lookup'}) {
468 dpavlin 31 if ($self->{'lookup'}->can('lookup')) {
469     return $self->{'lookup'}->lookup($format);
470     } else {
471     $log->warn("Have lookup object but can't invoke lookup method");
472     }
473 dpavlin 15 } else {
474     return $format;
475     }
476     } else {
477     return;
478     }
479     }
480    
481    
482 dpavlin 13 =head2 fill_in_to_arr
483    
484     Similar to C<fill_in>, but returns array of all repeatable fields. Usable
485     for fields which have lookups, so they shouldn't be parsed but rather
486     C<fill_id>ed.
487    
488     my @arr = $webpac->fill_in_to_arr($rec,'[v900];;[v250^a]');
489    
490     =cut
491    
492     sub fill_in_to_arr {
493     my $self = shift;
494    
495     my ($rec, $format_utf8) = @_;
496    
497     my $log = $self->_get_logger();
498    
499     $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
500     return if (! $format_utf8);
501    
502     my $i = 0;
503     my @arr;
504    
505     while (my @v = $self->fill_in($rec,$format_utf8,$i++)) {
506     push @arr, @v;
507     }
508    
509     $log->debug("format '$format_utf8' returned ",--$i," elements: ", sub { join(" | ",@arr) }) if (@arr);
510    
511     return @arr;
512     }
513    
514 dpavlin 15
515     =head2 get_data
516    
517     Returns value from record.
518    
519     my $text = $self->get_data(\$rec,$f,$sf,$i,\$found);
520    
521     Arguments are:
522     record reference C<$rec>,
523     field C<$f>,
524     optional subfiled C<$sf>,
525     index for repeatable values C<$i>.
526    
527     Optinal variable C<$found> will be incremeted if there
528     is field.
529    
530     Returns value or empty string.
531    
532     =cut
533    
534     sub get_data {
535     my $self = shift;
536    
537     my ($rec,$f,$sf,$i,$found) = @_;
538    
539     if ($$rec->{$f}) {
540     return '' if (! $$rec->{$f}->[$i]);
541     no strict 'refs';
542     if ($sf && $$rec->{$f}->[$i]->{$sf}) {
543     $$found++ if (defined($$found));
544     return $$rec->{$f}->[$i]->{$sf};
545 dpavlin 64 } elsif (! $sf && $$rec->{$f}->[$i]) {
546 dpavlin 15 $$found++ if (defined($$found));
547     # it still might have subfield, just
548     # not specified, so we'll dump all
549     if ($$rec->{$f}->[$i] =~ /HASH/o) {
550     my $out;
551     foreach my $k (keys %{$$rec->{$f}->[$i]}) {
552     $out .= $$rec->{$f}->[$i]->{$k}." ";
553     }
554     return $out;
555     } else {
556     return $$rec->{$f}->[$i];
557     }
558 dpavlin 64 } else {
559     return '';
560 dpavlin 15 }
561     } else {
562     return '';
563     }
564     }
565    
566    
567     =head2 apply_format
568    
569     Apply format specified in tag with C<format_name="name"> and
570     C<format_delimiter=";;">.
571    
572     my $text = $webpac->apply_format($format_name,$format_delimiter,$data);
573    
574     Formats can contain C<lookup{...}> if you need them.
575    
576     =cut
577    
578     sub apply_format {
579     my $self = shift;
580    
581     my ($name,$delimiter,$data) = @_;
582    
583     my $log = $self->_get_logger();
584    
585     if (! $self->{'import_xml'}->{'format'}->{$name}) {
586     $log->warn("<format name=\"$name\"> is not defined in ",$self->{'import_xml_file'});
587     return $data;
588     }
589    
590     $log->warn("no delimiter for format $name") if (! $delimiter);
591    
592     my $format = $self->_x($self->{'import_xml'}->{'format'}->{$name}->{'content'}) || $log->logdie("can't find format '$name'");
593    
594     my @data = split(/\Q$delimiter\E/, $data);
595    
596     my $out = sprintf($format, @data);
597     $log->debug("using format $name [$format] on $data to produce: $out");
598    
599     if ($self->{'lookup_regex'} && $out =~ $self->{'lookup_regex'}) {
600 dpavlin 31 return $self->{'lookup'}->lookup($out);
601 dpavlin 15 } else {
602     return $out;
603     }
604    
605     }
606    
607 dpavlin 13 =head2 sort_arr
608    
609     Sort array ignoring case and html in data
610    
611     my @sorted = $webpac->sort_arr(@unsorted);
612    
613     =cut
614    
615     sub sort_arr {
616     my $self = shift;
617    
618     my $log = $self->_get_logger();
619    
620     # FIXME add Schwartzian Transformation?
621    
622     my @sorted = sort {
623     $a =~ s#<[^>]+/*>##;
624     $b =~ s#<[^>]+/*>##;
625     lc($b) cmp lc($a)
626     } @_;
627     $log->debug("sorted values: ",sub { join(", ",@sorted) });
628    
629     return @sorted;
630     }
631    
632    
633 dpavlin 15 =head1 INTERNAL METHODS
634    
635 dpavlin 13 =head2 _sort_by_order
636    
637     Sort xml tags data structure accoding to C<order=""> attribute.
638    
639     =cut
640    
641     sub _sort_by_order {
642     my $self = shift;
643    
644     my $va = $self->{'import_xml'}->{'indexer'}->{$a}->{'order'} ||
645     $self->{'import_xml'}->{'indexer'}->{$a};
646     my $vb = $self->{'import_xml'}->{'indexer'}->{$b}->{'order'} ||
647     $self->{'import_xml'}->{'indexer'}->{$b};
648    
649     return $va <=> $vb;
650     }
651    
652     =head2 _x
653    
654 dpavlin 15 Convert strings from C<conf/normalize/*.xml> encoding into application
655     specific encoding (optinally specified using C<code_page> to C<new>
656     constructor).
657 dpavlin 13
658     my $text = $n->_x('normalize text string');
659    
660     This is a stub so that other modules doesn't have to implement it.
661    
662     =cut
663    
664     sub _x {
665     my $self = shift;
666     return shift;
667     }
668    
669    
670 dpavlin 10 =head1 AUTHOR
671    
672     Dobrica Pavlinusic, C<< <dpavlin@rot13.org> >>
673    
674     =head1 COPYRIGHT & LICENSE
675    
676     Copyright 2005 Dobrica Pavlinusic, All Rights Reserved.
677    
678     This program is free software; you can redistribute it and/or modify it
679     under the same terms as Perl itself.
680    
681     =cut
682    
683     1; # End of WebPAC::DB

  ViewVC Help
Powered by ViewVC 1.1.26