/[webpac2]/trunk/lib/WebPAC/Normalize.pm
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Annotation of /trunk/lib/WebPAC/Normalize.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 219 - (hide annotations)
Mon Dec 5 17:48:08 2005 UTC (18 years, 3 months ago) by dpavlin
File size: 15470 byte(s)
 r11541@llin:  dpavlin | 2005-12-05 16:47:44 +0100
 added prefix [0.04]

1 dpavlin 10 package WebPAC::Normalize;
2    
3     use warnings;
4     use strict;
5 dpavlin 29 use base 'WebPAC::Common';
6 dpavlin 13 use Data::Dumper;
7 dpavlin 10
8     =head1 NAME
9    
10 dpavlin 15 WebPAC::Normalize - data mungling for normalisation
11 dpavlin 10
12     =head1 VERSION
13    
14 dpavlin 219 Version 0.04
15 dpavlin 10
16     =cut
17    
18 dpavlin 219 our $VERSION = '0.04';
19 dpavlin 10
20     =head1 SYNOPSIS
21    
22 dpavlin 15 This package contains code that mungle data to produce normalized format.
23 dpavlin 10
24 dpavlin 15 It contains several assumptions:
25    
26     =over
27    
28     =item *
29    
30     format of fields is defined using C<v123^a> notation for repeatable fields
31     or C<s123^a> for single (or first) value, where C<123> is field number and
32     C<a> is subfield.
33    
34     =item *
35    
36     source data records (C<$rec>) have unique identifiers in field C<000>
37    
38     =item *
39    
40     optional C<eval{length('v123^a') == 3}> tag at B<beginning of format> will be
41     perl code that is evaluated before producing output (value of field will be
42     interpolated before that)
43    
44     =item *
45    
46     optional C<filter{filter_name}> at B<begining of format> will apply perl
47     code defined as code ref on format after field substitution to producing
48     output
49    
50     =item *
51    
52     optional C<lookup{...}> will be then performed. See C<WebPAC::Lookups>.
53    
54     =item *
55    
56     at end, optional C<format>s rules are resolved. Format rules are similar to
57     C<sprintf> and can also contain C<lookup{...}> which is performed after
58     values are inserted in format.
59    
60     =back
61    
62     This also describes order in which transformations are applied (eval,
63     filter, lookup, format) which is important to undestand when deciding how to
64     solve your data mungling and normalisation process.
65    
66    
67    
68    
69 dpavlin 10 =head1 FUNCTIONS
70    
71 dpavlin 13 =head2 new
72 dpavlin 10
73 dpavlin 13 Create new normalisation object
74    
75     my $n = new WebPAC::Normalize::Something(
76 dpavlin 15 filter => {
77     'filter_name_1' => sub {
78     # filter code
79     return length($_);
80     }, ...
81     },
82 dpavlin 29 db => $db_obj,
83 dpavlin 13 lookup_regex => $lookup->regex,
84 dpavlin 31 lookup => $lookup_obj,
85 dpavlin 219 prefix => 'foobar',
86 dpavlin 13 );
87    
88 dpavlin 15 Parametar C<filter> defines user supplied snippets of perl code which can
89     be use with C<filter{...}> notation.
90    
91 dpavlin 219 C<prefix> is used to form filename for database record (to support multiple
92     source files which are joined in one database).
93    
94 dpavlin 13 Recommended parametar C<lookup_regex> is used to enable parsing of lookups
95 dpavlin 31 in structures. If you pass this parametar, you must also pass C<lookup>
96     which is C<WebPAC::Lookup> object.
97 dpavlin 13
98 dpavlin 10 =cut
99    
100 dpavlin 13 sub new {
101     my $class = shift;
102     my $self = {@_};
103     bless($self, $class);
104    
105 dpavlin 31 my $r = $self->{'lookup_regex'} ? 1 : 0;
106     my $l = $self->{'lookup'} ? 1 : 0;
107    
108     my $log = $self->_get_logger();
109    
110     # those two must be in pair
111     if ( ($r & $l) != ($r || $l) ) {
112     my $log = $self->_get_logger();
113     $log->logdie("lookup_regex and lookup must be in pair");
114     }
115    
116     $log->logdie("lookup must be WebPAC::Lookup object") if ($self->{'lookup'} && ! $self->{'lookup'}->isa('WebPAC::Lookup'));
117    
118 dpavlin 219 $log->warn("no prefix defined. please check that!") unless ($self->{'prefix'});
119    
120 dpavlin 13 $self ? return $self : return undef;
121 dpavlin 10 }
122    
123 dpavlin 13
124     =head2 data_structure
125    
126     Create in-memory data structure which represents normalized layout from
127     C<conf/normalize/*.xml>.
128    
129     This structures are used to produce output.
130    
131 dpavlin 70 my $ds = $webpac->data_structure($rec);
132 dpavlin 13
133     =cut
134    
135     sub data_structure {
136     my $self = shift;
137    
138     my $log = $self->_get_logger();
139    
140     my $rec = shift;
141     $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
142    
143 dpavlin 125 $log->debug("data_structure rec = ", sub { Dumper($rec) });
144    
145     $log->logdie("need unique ID (mfn) in field 000 of record ", sub { Dumper($rec) } ) unless (defined($rec->{'000'}));
146    
147 dpavlin 219 my $id = $rec->{'000'}->[0] || $log->logdie("field 000 isn't array!");
148 dpavlin 125
149 dpavlin 13 my $cache_file;
150    
151 dpavlin 18 if ($self->{'db'}) {
152 dpavlin 219 my $ds = $self->{'db'}->load_ds( id => $id, prefix => $self->{prefix} );
153 dpavlin 70 $log->debug("load_ds( rec = ", sub { Dumper($rec) }, ") = ", sub { Dumper($ds) });
154     return $ds if ($ds);
155 dpavlin 29 $log->debug("cache miss, creating");
156 dpavlin 13 }
157    
158     undef $self->{'currnet_filename'};
159     undef $self->{'headline'};
160    
161     my @sorted_tags;
162     if ($self->{tags_by_order}) {
163     @sorted_tags = @{$self->{tags_by_order}};
164     } else {
165     @sorted_tags = sort { $self->_sort_by_order } keys %{$self->{'import_xml'}->{'indexer'}};
166     $self->{tags_by_order} = \@sorted_tags;
167     }
168    
169 dpavlin 70 my $ds;
170 dpavlin 13
171     $log->debug("tags: ",sub { join(", ",@sorted_tags) });
172    
173     foreach my $field (@sorted_tags) {
174    
175     my $row;
176    
177     #print "field $field [",$self->{'tag'},"] = ",Dumper($self->{'import_xml'}->{'indexer'}->{$field}->{$self->{'tag'}});
178    
179     foreach my $tag (@{$self->{'import_xml'}->{'indexer'}->{$field}->{$self->{'tag'}}}) {
180 dpavlin 38 my $format;
181 dpavlin 13
182 dpavlin 38 $log->logdie("expected tag HASH and got $tag") unless (ref($tag) eq 'HASH');
183     $format = $tag->{'value'} || $tag->{'content'};
184    
185 dpavlin 13 $log->debug("format: $format");
186    
187     my @v;
188     if ($self->{'lookup_regex'} && $format =~ $self->{'lookup_regex'}) {
189     @v = $self->fill_in_to_arr($rec,$format);
190     } else {
191     @v = $self->parse_to_arr($rec,$format);
192     }
193     next if (! @v);
194    
195     if ($tag->{'sort'}) {
196     @v = $self->sort_arr(@v);
197     }
198    
199     # use format?
200     if ($tag->{'format_name'}) {
201     @v = map { $self->apply_format($tag->{'format_name'},$tag->{'format_delimiter'},$_) } @v;
202     }
203    
204     # delimiter will join repeatable fields
205     if ($tag->{'delimiter'}) {
206     @v = ( join($tag->{'delimiter'}, @v) );
207     }
208    
209     # default types
210 dpavlin 74 my @types = qw(display search);
211 dpavlin 13 # override by type attribute
212     @types = ( $tag->{'type'} ) if ($tag->{'type'});
213    
214     foreach my $type (@types) {
215     # append to previous line?
216     $log->debug("type: $type ",sub { join(" ",@v) }, $row->{'append'} || 'no append');
217     if ($tag->{'append'}) {
218    
219     # I will delimit appended part with
220     # delimiter (or ,)
221     my $d = $tag->{'delimiter'};
222     # default delimiter
223     $d ||= " ";
224    
225     my $last = pop @{$row->{$type}};
226     $d = "" if (! $last);
227     $last .= $d . join($d, @v);
228     push @{$row->{$type}}, $last;
229    
230     } else {
231     push @{$row->{$type}}, @v;
232     }
233     }
234    
235    
236     }
237    
238     if ($row) {
239     $row->{'tag'} = $field;
240    
241     # TODO: name_sigular, name_plural
242     my $name = $self->{'import_xml'}->{'indexer'}->{$field}->{'name'};
243 dpavlin 70 my $row_name = $name ? $self->_x($name) : $field;
244 dpavlin 13
245     # post-sort all values in field
246     if ($self->{'import_xml'}->{'indexer'}->{$field}->{'sort'}) {
247     $log->warn("sort at field tag not implemented");
248     }
249    
250 dpavlin 70 $ds->{$row_name} = $row;
251 dpavlin 13
252     $log->debug("row $field: ",sub { Dumper($row) });
253     }
254    
255     }
256    
257 dpavlin 22 $self->{'db'}->save_ds(
258 dpavlin 219 id => $id,
259 dpavlin 70 ds => $ds,
260 dpavlin 219 prefix => $self->{prefix},
261 dpavlin 18 ) if ($self->{'db'});
262 dpavlin 13
263 dpavlin 70 $log->debug("ds: ", sub { Dumper($ds) });
264 dpavlin 29
265 dpavlin 70 $log->logconfess("data structure returned is not array any more!") if wantarray;
266 dpavlin 13
267 dpavlin 70 return $ds;
268    
269 dpavlin 13 }
270    
271     =head2 parse
272    
273     Perform smart parsing of string, skipping delimiters for fields which aren't
274     defined. It can also eval code in format starting with C<eval{...}> and
275     return output or nothing depending on eval code.
276    
277     my $text = $webpac->parse($rec,'eval{"v901^a" eq "Deskriptor"}descriptor: v250^a', $i);
278    
279     =cut
280    
281     sub parse {
282     my $self = shift;
283    
284     my ($rec, $format_utf8, $i) = @_;
285    
286     return if (! $format_utf8);
287    
288     my $log = $self->_get_logger();
289    
290     $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
291    
292     $i = 0 if (! $i);
293    
294     my $format = $self->_x($format_utf8) || $log->logconfess("can't convert '$format_utf8' from UTF-8 to ",$self->{'code_page'});
295    
296     my @out;
297    
298     $log->debug("format: $format");
299    
300     my $eval_code;
301     # remove eval{...} from beginning
302     $eval_code = $1 if ($format =~ s/^eval{([^}]+)}//s);
303    
304     my $filter_name;
305     # remove filter{...} from beginning
306     $filter_name = $1 if ($format =~ s/^filter{([^}]+)}//s);
307    
308     my $prefix;
309     my $all_found=0;
310    
311     while ($format =~ s/^(.*?)(v|s)(\d+)(?:\^(\w))?//s) {
312    
313     my $del = $1 || '';
314     $prefix ||= $del if ($all_found == 0);
315    
316     # repeatable index
317     my $r = $i;
318     $r = 0 if (lc("$2") eq 's');
319    
320     my $found = 0;
321     my $tmp = $self->get_data(\$rec,$3,$4,$r,\$found);
322    
323     if ($found) {
324     push @out, $del;
325     push @out, $tmp;
326     $all_found += $found;
327     }
328     }
329    
330     return if (! $all_found);
331    
332     my $out = join('',@out);
333    
334     if ($out) {
335     # add rest of format (suffix)
336     $out .= $format;
337    
338     # add prefix if not there
339     $out = $prefix . $out if ($out !~ m/^\Q$prefix\E/);
340    
341     $log->debug("result: $out");
342     }
343    
344     if ($eval_code) {
345     my $eval = $self->fill_in($rec,$eval_code,$i) || return;
346     $log->debug("about to eval{$eval} format: $out");
347     return if (! $self->_eval($eval));
348     }
349    
350     if ($filter_name && $self->{'filter'}->{$filter_name}) {
351     $log->debug("about to filter{$filter_name} format: $out");
352     $out = $self->{'filter'}->{$filter_name}->($out);
353     return unless(defined($out));
354     $log->debug("filter result: $out");
355     }
356    
357     return $out;
358     }
359    
360     =head2 parse_to_arr
361    
362     Similar to C<parse>, but returns array of all repeatable fields
363    
364     my @arr = $webpac->parse_to_arr($rec,'v250^a');
365    
366     =cut
367    
368     sub parse_to_arr {
369     my $self = shift;
370    
371     my ($rec, $format_utf8) = @_;
372    
373     my $log = $self->_get_logger();
374    
375     $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
376     return if (! $format_utf8);
377    
378     my $i = 0;
379     my @arr;
380    
381     while (my $v = $self->parse($rec,$format_utf8,$i++)) {
382     push @arr, $v;
383     }
384    
385     $log->debug("format '$format_utf8' returned ",--$i," elements: ", sub { join(" | ",@arr) }) if (@arr);
386    
387     return @arr;
388     }
389    
390 dpavlin 15
391     =head2 fill_in
392    
393     Workhourse of all: takes record from in-memory structure of database and
394     strings with placeholders and returns string or array of with substituted
395     values from record.
396    
397     my $text = $webpac->fill_in($rec,'v250^a');
398    
399     Optional argument is ordinal number for repeatable fields. By default,
400     it's assume to be first repeatable field (fields are perl array, so first
401     element is 0).
402     Following example will read second value from repeatable field.
403    
404     my $text = $webpac->fill_in($rec,'Title: v250^a',1);
405    
406     This function B<does not> perform parsing of format to inteligenty skip
407     delimiters before fields which aren't used.
408    
409     This method will automatically decode UTF-8 string to local code page
410     if needed.
411    
412     =cut
413    
414     sub fill_in {
415     my $self = shift;
416    
417     my $log = $self->_get_logger();
418    
419     my $rec = shift || $log->logconfess("need data record");
420     my $format = shift || $log->logconfess("need format to parse");
421     # iteration (for repeatable fields)
422     my $i = shift || 0;
423    
424     $log->logdie("infitite loop in format $format") if ($i > ($self->{'max_mfn'} || 9999));
425    
426     # FIXME remove for speedup?
427     $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
428    
429     if (utf8::is_utf8($format)) {
430     $format = $self->_x($format);
431     }
432    
433     my $found = 0;
434    
435     my $eval_code;
436     # remove eval{...} from beginning
437     $eval_code = $1 if ($format =~ s/^eval{([^}]+)}//s);
438    
439     my $filter_name;
440     # remove filter{...} from beginning
441     $filter_name = $1 if ($format =~ s/^filter{([^}]+)}//s);
442    
443     # do actual replacement of placeholders
444     # repeatable fields
445     $format =~ s/v(\d+)(?:\^(\w))?/$self->get_data(\$rec,$1,$2,$i,\$found)/ges;
446     # non-repeatable fields
447     $format =~ s/s(\d+)(?:\^(\w))?/$self->get_data(\$rec,$1,$2,0,\$found)/ges;
448    
449     if ($found) {
450     $log->debug("format: $format");
451     if ($eval_code) {
452     my $eval = $self->fill_in($rec,$eval_code,$i);
453     return if (! $self->_eval($eval));
454     }
455     if ($filter_name && $self->{'filter'}->{$filter_name}) {
456     $log->debug("filter '$filter_name' for $format");
457     $format = $self->{'filter'}->{$filter_name}->($format);
458     return unless(defined($format));
459     $log->debug("filter result: $format");
460     }
461     # do we have lookups?
462     if ($self->{'lookup'}) {
463 dpavlin 31 if ($self->{'lookup'}->can('lookup')) {
464     return $self->{'lookup'}->lookup($format);
465     } else {
466     $log->warn("Have lookup object but can't invoke lookup method");
467     }
468 dpavlin 15 } else {
469     return $format;
470     }
471     } else {
472     return;
473     }
474     }
475    
476    
477 dpavlin 13 =head2 fill_in_to_arr
478    
479     Similar to C<fill_in>, but returns array of all repeatable fields. Usable
480     for fields which have lookups, so they shouldn't be parsed but rather
481     C<fill_id>ed.
482    
483     my @arr = $webpac->fill_in_to_arr($rec,'[v900];;[v250^a]');
484    
485     =cut
486    
487     sub fill_in_to_arr {
488     my $self = shift;
489    
490     my ($rec, $format_utf8) = @_;
491    
492     my $log = $self->_get_logger();
493    
494     $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
495     return if (! $format_utf8);
496    
497     my $i = 0;
498     my @arr;
499    
500     while (my @v = $self->fill_in($rec,$format_utf8,$i++)) {
501     push @arr, @v;
502     }
503    
504     $log->debug("format '$format_utf8' returned ",--$i," elements: ", sub { join(" | ",@arr) }) if (@arr);
505    
506     return @arr;
507     }
508    
509 dpavlin 15
510     =head2 get_data
511    
512     Returns value from record.
513    
514     my $text = $self->get_data(\$rec,$f,$sf,$i,\$found);
515    
516     Arguments are:
517     record reference C<$rec>,
518     field C<$f>,
519     optional subfiled C<$sf>,
520     index for repeatable values C<$i>.
521    
522     Optinal variable C<$found> will be incremeted if there
523     is field.
524    
525     Returns value or empty string.
526    
527     =cut
528    
529     sub get_data {
530     my $self = shift;
531    
532     my ($rec,$f,$sf,$i,$found) = @_;
533    
534     if ($$rec->{$f}) {
535     return '' if (! $$rec->{$f}->[$i]);
536     no strict 'refs';
537     if ($sf && $$rec->{$f}->[$i]->{$sf}) {
538     $$found++ if (defined($$found));
539     return $$rec->{$f}->[$i]->{$sf};
540 dpavlin 64 } elsif (! $sf && $$rec->{$f}->[$i]) {
541 dpavlin 15 $$found++ if (defined($$found));
542     # it still might have subfield, just
543     # not specified, so we'll dump all
544     if ($$rec->{$f}->[$i] =~ /HASH/o) {
545     my $out;
546     foreach my $k (keys %{$$rec->{$f}->[$i]}) {
547     $out .= $$rec->{$f}->[$i]->{$k}." ";
548     }
549     return $out;
550     } else {
551     return $$rec->{$f}->[$i];
552     }
553 dpavlin 64 } else {
554     return '';
555 dpavlin 15 }
556     } else {
557     return '';
558     }
559     }
560    
561    
562     =head2 apply_format
563    
564     Apply format specified in tag with C<format_name="name"> and
565     C<format_delimiter=";;">.
566    
567     my $text = $webpac->apply_format($format_name,$format_delimiter,$data);
568    
569     Formats can contain C<lookup{...}> if you need them.
570    
571     =cut
572    
573     sub apply_format {
574     my $self = shift;
575    
576     my ($name,$delimiter,$data) = @_;
577    
578     my $log = $self->_get_logger();
579    
580     if (! $self->{'import_xml'}->{'format'}->{$name}) {
581     $log->warn("<format name=\"$name\"> is not defined in ",$self->{'import_xml_file'});
582     return $data;
583     }
584    
585     $log->warn("no delimiter for format $name") if (! $delimiter);
586    
587     my $format = $self->_x($self->{'import_xml'}->{'format'}->{$name}->{'content'}) || $log->logdie("can't find format '$name'");
588    
589     my @data = split(/\Q$delimiter\E/, $data);
590    
591     my $out = sprintf($format, @data);
592     $log->debug("using format $name [$format] on $data to produce: $out");
593    
594     if ($self->{'lookup_regex'} && $out =~ $self->{'lookup_regex'}) {
595 dpavlin 31 return $self->{'lookup'}->lookup($out);
596 dpavlin 15 } else {
597     return $out;
598     }
599    
600     }
601    
602 dpavlin 13 =head2 sort_arr
603    
604     Sort array ignoring case and html in data
605    
606     my @sorted = $webpac->sort_arr(@unsorted);
607    
608     =cut
609    
610     sub sort_arr {
611     my $self = shift;
612    
613     my $log = $self->_get_logger();
614    
615     # FIXME add Schwartzian Transformation?
616    
617     my @sorted = sort {
618     $a =~ s#<[^>]+/*>##;
619     $b =~ s#<[^>]+/*>##;
620     lc($b) cmp lc($a)
621     } @_;
622     $log->debug("sorted values: ",sub { join(", ",@sorted) });
623    
624     return @sorted;
625     }
626    
627    
628 dpavlin 15 =head1 INTERNAL METHODS
629    
630 dpavlin 13 =head2 _sort_by_order
631    
632     Sort xml tags data structure accoding to C<order=""> attribute.
633    
634     =cut
635    
636     sub _sort_by_order {
637     my $self = shift;
638    
639     my $va = $self->{'import_xml'}->{'indexer'}->{$a}->{'order'} ||
640     $self->{'import_xml'}->{'indexer'}->{$a};
641     my $vb = $self->{'import_xml'}->{'indexer'}->{$b}->{'order'} ||
642     $self->{'import_xml'}->{'indexer'}->{$b};
643    
644     return $va <=> $vb;
645     }
646    
647     =head2 _x
648    
649 dpavlin 15 Convert strings from C<conf/normalize/*.xml> encoding into application
650     specific encoding (optinally specified using C<code_page> to C<new>
651     constructor).
652 dpavlin 13
653     my $text = $n->_x('normalize text string');
654    
655     This is a stub so that other modules doesn't have to implement it.
656    
657     =cut
658    
659     sub _x {
660     my $self = shift;
661     return shift;
662     }
663    
664    
665 dpavlin 10 =head1 AUTHOR
666    
667     Dobrica Pavlinusic, C<< <dpavlin@rot13.org> >>
668    
669     =head1 COPYRIGHT & LICENSE
670    
671     Copyright 2005 Dobrica Pavlinusic, All Rights Reserved.
672    
673     This program is free software; you can redistribute it and/or modify it
674     under the same terms as Perl itself.
675    
676     =cut
677    
678 dpavlin 209 1; # End of WebPAC::Normalize

  ViewVC Help
Powered by ViewVC 1.1.26