/[webpac2]/trunk/lib/WebPAC/Normalize.pm
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Annotation of /trunk/lib/WebPAC/Normalize.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 261 - (hide annotations)
Fri Dec 16 16:00:18 2005 UTC (18 years, 4 months ago) by dpavlin
File size: 16517 byte(s)
 r11729@llin:  dpavlin | 2005-12-16 21:00:26 +0100
 warn about non-defined filters just once

1 dpavlin 10 package WebPAC::Normalize;
2    
3     use warnings;
4     use strict;
5 dpavlin 29 use base 'WebPAC::Common';
6 dpavlin 13 use Data::Dumper;
7 dpavlin 10
8     =head1 NAME
9    
10 dpavlin 15 WebPAC::Normalize - data mungling for normalisation
11 dpavlin 10
12     =head1 VERSION
13    
14 dpavlin 260 Version 0.06
15 dpavlin 10
16     =cut
17    
18 dpavlin 260 our $VERSION = '0.06';
19 dpavlin 10
20     =head1 SYNOPSIS
21    
22 dpavlin 15 This package contains code that mungle data to produce normalized format.
23 dpavlin 10
24 dpavlin 15 It contains several assumptions:
25    
26     =over
27    
28     =item *
29    
30     format of fields is defined using C<v123^a> notation for repeatable fields
31     or C<s123^a> for single (or first) value, where C<123> is field number and
32     C<a> is subfield.
33    
34     =item *
35    
36     source data records (C<$rec>) have unique identifiers in field C<000>
37    
38     =item *
39    
40     optional C<eval{length('v123^a') == 3}> tag at B<beginning of format> will be
41     perl code that is evaluated before producing output (value of field will be
42     interpolated before that)
43    
44     =item *
45    
46     optional C<filter{filter_name}> at B<begining of format> will apply perl
47     code defined as code ref on format after field substitution to producing
48     output
49    
50 dpavlin 260 There is one built-in filter called C<regex> which can be use like this:
51    
52     filter{regex(s/foo/bar/)}
53    
54 dpavlin 15 =item *
55    
56     optional C<lookup{...}> will be then performed. See C<WebPAC::Lookups>.
57    
58     =item *
59    
60     at end, optional C<format>s rules are resolved. Format rules are similar to
61     C<sprintf> and can also contain C<lookup{...}> which is performed after
62     values are inserted in format.
63    
64     =back
65    
66     This also describes order in which transformations are applied (eval,
67     filter, lookup, format) which is important to undestand when deciding how to
68     solve your data mungling and normalisation process.
69    
70    
71    
72    
73 dpavlin 10 =head1 FUNCTIONS
74    
75 dpavlin 13 =head2 new
76 dpavlin 10
77 dpavlin 13 Create new normalisation object
78    
79     my $n = new WebPAC::Normalize::Something(
80 dpavlin 15 filter => {
81     'filter_name_1' => sub {
82     # filter code
83     return length($_);
84     }, ...
85     },
86 dpavlin 29 db => $db_obj,
87 dpavlin 13 lookup_regex => $lookup->regex,
88 dpavlin 31 lookup => $lookup_obj,
89 dpavlin 219 prefix => 'foobar',
90 dpavlin 13 );
91    
92 dpavlin 15 Parametar C<filter> defines user supplied snippets of perl code which can
93     be use with C<filter{...}> notation.
94    
95 dpavlin 219 C<prefix> is used to form filename for database record (to support multiple
96     source files which are joined in one database).
97    
98 dpavlin 13 Recommended parametar C<lookup_regex> is used to enable parsing of lookups
99 dpavlin 31 in structures. If you pass this parametar, you must also pass C<lookup>
100     which is C<WebPAC::Lookup> object.
101 dpavlin 13
102 dpavlin 10 =cut
103    
104 dpavlin 13 sub new {
105     my $class = shift;
106     my $self = {@_};
107     bless($self, $class);
108    
109 dpavlin 31 my $r = $self->{'lookup_regex'} ? 1 : 0;
110     my $l = $self->{'lookup'} ? 1 : 0;
111    
112     my $log = $self->_get_logger();
113    
114     # those two must be in pair
115     if ( ($r & $l) != ($r || $l) ) {
116     my $log = $self->_get_logger();
117     $log->logdie("lookup_regex and lookup must be in pair");
118     }
119    
120     $log->logdie("lookup must be WebPAC::Lookup object") if ($self->{'lookup'} && ! $self->{'lookup'}->isa('WebPAC::Lookup'));
121    
122 dpavlin 219 $log->warn("no prefix defined. please check that!") unless ($self->{'prefix'});
123    
124 dpavlin 252 $log->debug("using lookup regex: ", $self->{lookup_regex}) if ($r && $l);
125    
126 dpavlin 260 if ($self->{filter} && ! $self->{filter}->{regex}) {
127     $log->debug("adding built-in filter regex");
128     $self->{filter}->{regex} = sub {
129     my ($val, $regex) = @_;
130     eval "\$val =~ $regex";
131     return $val;
132     };
133     }
134    
135 dpavlin 13 $self ? return $self : return undef;
136 dpavlin 10 }
137    
138 dpavlin 13
139     =head2 data_structure
140    
141     Create in-memory data structure which represents normalized layout from
142     C<conf/normalize/*.xml>.
143    
144     This structures are used to produce output.
145    
146 dpavlin 70 my $ds = $webpac->data_structure($rec);
147 dpavlin 13
148     =cut
149    
150     sub data_structure {
151     my $self = shift;
152    
153     my $log = $self->_get_logger();
154    
155     my $rec = shift;
156     $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
157    
158 dpavlin 125 $log->debug("data_structure rec = ", sub { Dumper($rec) });
159    
160     $log->logdie("need unique ID (mfn) in field 000 of record ", sub { Dumper($rec) } ) unless (defined($rec->{'000'}));
161    
162 dpavlin 219 my $id = $rec->{'000'}->[0] || $log->logdie("field 000 isn't array!");
163 dpavlin 125
164 dpavlin 13 my $cache_file;
165    
166 dpavlin 18 if ($self->{'db'}) {
167 dpavlin 219 my $ds = $self->{'db'}->load_ds( id => $id, prefix => $self->{prefix} );
168 dpavlin 70 $log->debug("load_ds( rec = ", sub { Dumper($rec) }, ") = ", sub { Dumper($ds) });
169     return $ds if ($ds);
170 dpavlin 29 $log->debug("cache miss, creating");
171 dpavlin 13 }
172    
173     undef $self->{'currnet_filename'};
174     undef $self->{'headline'};
175    
176     my @sorted_tags;
177     if ($self->{tags_by_order}) {
178     @sorted_tags = @{$self->{tags_by_order}};
179     } else {
180     @sorted_tags = sort { $self->_sort_by_order } keys %{$self->{'import_xml'}->{'indexer'}};
181     $self->{tags_by_order} = \@sorted_tags;
182     }
183    
184 dpavlin 70 my $ds;
185 dpavlin 13
186     $log->debug("tags: ",sub { join(", ",@sorted_tags) });
187    
188     foreach my $field (@sorted_tags) {
189    
190     my $row;
191    
192     #print "field $field [",$self->{'tag'},"] = ",Dumper($self->{'import_xml'}->{'indexer'}->{$field}->{$self->{'tag'}});
193    
194     foreach my $tag (@{$self->{'import_xml'}->{'indexer'}->{$field}->{$self->{'tag'}}}) {
195 dpavlin 38 my $format;
196 dpavlin 13
197 dpavlin 38 $log->logdie("expected tag HASH and got $tag") unless (ref($tag) eq 'HASH');
198     $format = $tag->{'value'} || $tag->{'content'};
199    
200 dpavlin 13 $log->debug("format: $format");
201    
202     my @v;
203     if ($self->{'lookup_regex'} && $format =~ $self->{'lookup_regex'}) {
204     @v = $self->fill_in_to_arr($rec,$format);
205     } else {
206     @v = $self->parse_to_arr($rec,$format);
207     }
208     next if (! @v);
209    
210     if ($tag->{'sort'}) {
211     @v = $self->sort_arr(@v);
212     }
213    
214     # use format?
215     if ($tag->{'format_name'}) {
216     @v = map { $self->apply_format($tag->{'format_name'},$tag->{'format_delimiter'},$_) } @v;
217     }
218    
219     # delimiter will join repeatable fields
220     if ($tag->{'delimiter'}) {
221     @v = ( join($tag->{'delimiter'}, @v) );
222     }
223    
224     # default types
225 dpavlin 74 my @types = qw(display search);
226 dpavlin 13 # override by type attribute
227     @types = ( $tag->{'type'} ) if ($tag->{'type'});
228    
229     foreach my $type (@types) {
230     # append to previous line?
231 dpavlin 252 $log->debug("type: $type ",sub { join(" ",@v) }, " ", $row->{'append'} || 'no append');
232 dpavlin 13 if ($tag->{'append'}) {
233    
234     # I will delimit appended part with
235     # delimiter (or ,)
236     my $d = $tag->{'delimiter'};
237     # default delimiter
238     $d ||= " ";
239    
240     my $last = pop @{$row->{$type}};
241     $d = "" if (! $last);
242     $last .= $d . join($d, @v);
243     push @{$row->{$type}}, $last;
244    
245     } else {
246     push @{$row->{$type}}, @v;
247     }
248     }
249    
250    
251     }
252    
253     if ($row) {
254     $row->{'tag'} = $field;
255    
256     # TODO: name_sigular, name_plural
257     my $name = $self->{'import_xml'}->{'indexer'}->{$field}->{'name'};
258 dpavlin 70 my $row_name = $name ? $self->_x($name) : $field;
259 dpavlin 13
260     # post-sort all values in field
261     if ($self->{'import_xml'}->{'indexer'}->{$field}->{'sort'}) {
262     $log->warn("sort at field tag not implemented");
263     }
264    
265 dpavlin 70 $ds->{$row_name} = $row;
266 dpavlin 13
267     $log->debug("row $field: ",sub { Dumper($row) });
268     }
269    
270     }
271    
272 dpavlin 22 $self->{'db'}->save_ds(
273 dpavlin 219 id => $id,
274 dpavlin 70 ds => $ds,
275 dpavlin 219 prefix => $self->{prefix},
276 dpavlin 18 ) if ($self->{'db'});
277 dpavlin 13
278 dpavlin 70 $log->debug("ds: ", sub { Dumper($ds) });
279 dpavlin 29
280 dpavlin 70 $log->logconfess("data structure returned is not array any more!") if wantarray;
281 dpavlin 13
282 dpavlin 70 return $ds;
283    
284 dpavlin 13 }
285    
286     =head2 parse
287    
288     Perform smart parsing of string, skipping delimiters for fields which aren't
289     defined. It can also eval code in format starting with C<eval{...}> and
290     return output or nothing depending on eval code.
291    
292     my $text = $webpac->parse($rec,'eval{"v901^a" eq "Deskriptor"}descriptor: v250^a', $i);
293    
294 dpavlin 260 Filters are implemented here. While simple form of filters looks like this:
295    
296     filter{name_of_filter}
297    
298     but, filters can also have variable number of parametars like this:
299    
300     filter{name_of_filter(param,param,param)}
301    
302 dpavlin 13 =cut
303    
304 dpavlin 261 my $warn_once;
305    
306 dpavlin 13 sub parse {
307     my $self = shift;
308    
309     my ($rec, $format_utf8, $i) = @_;
310    
311     return if (! $format_utf8);
312    
313     my $log = $self->_get_logger();
314    
315     $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
316    
317     $i = 0 if (! $i);
318    
319     my $format = $self->_x($format_utf8) || $log->logconfess("can't convert '$format_utf8' from UTF-8 to ",$self->{'code_page'});
320    
321     my @out;
322    
323     $log->debug("format: $format");
324    
325     my $eval_code;
326     # remove eval{...} from beginning
327     $eval_code = $1 if ($format =~ s/^eval{([^}]+)}//s);
328    
329     my $filter_name;
330     # remove filter{...} from beginning
331     $filter_name = $1 if ($format =~ s/^filter{([^}]+)}//s);
332    
333     my $prefix;
334     my $all_found=0;
335    
336     while ($format =~ s/^(.*?)(v|s)(\d+)(?:\^(\w))?//s) {
337    
338     my $del = $1 || '';
339     $prefix ||= $del if ($all_found == 0);
340    
341     # repeatable index
342     my $r = $i;
343     $r = 0 if (lc("$2") eq 's');
344    
345     my $found = 0;
346     my $tmp = $self->get_data(\$rec,$3,$4,$r,\$found);
347    
348     if ($found) {
349     push @out, $del;
350     push @out, $tmp;
351     $all_found += $found;
352     }
353     }
354    
355     return if (! $all_found);
356    
357     my $out = join('',@out);
358    
359     if ($out) {
360     # add rest of format (suffix)
361     $out .= $format;
362    
363     # add prefix if not there
364     $out = $prefix . $out if ($out !~ m/^\Q$prefix\E/);
365    
366     $log->debug("result: $out");
367     }
368    
369     if ($eval_code) {
370     my $eval = $self->fill_in($rec,$eval_code,$i) || return;
371     $log->debug("about to eval{$eval} format: $out");
372     return if (! $self->_eval($eval));
373     }
374    
375 dpavlin 260 if ($filter_name) {
376     my @filter_args;
377     if ($filter_name =~ s/(\w+)\((.*)\)/$1/) {
378     @filter_args = split(/,/, $2);
379     }
380     if ($self->{'filter'}->{$filter_name}) {
381     $log->debug("about to filter{$filter_name} format: $out with arguments: ", join(",", @filter_args));
382     unshift @filter_args, $out;
383     $out = $self->{'filter'}->{$filter_name}->(@filter_args);
384     return unless(defined($out));
385     $log->debug("filter result: $out");
386 dpavlin 261 } elsif (! $warn_once->{$filter_name}) {
387 dpavlin 260 $log->warn("trying to use undefined filter $filter_name");
388 dpavlin 261 $warn_once->{$filter_name}++;
389 dpavlin 260 }
390 dpavlin 13 }
391    
392     return $out;
393     }
394    
395     =head2 parse_to_arr
396    
397     Similar to C<parse>, but returns array of all repeatable fields
398    
399     my @arr = $webpac->parse_to_arr($rec,'v250^a');
400    
401     =cut
402    
403     sub parse_to_arr {
404     my $self = shift;
405    
406     my ($rec, $format_utf8) = @_;
407    
408     my $log = $self->_get_logger();
409    
410     $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
411     return if (! $format_utf8);
412    
413     my $i = 0;
414     my @arr;
415    
416     while (my $v = $self->parse($rec,$format_utf8,$i++)) {
417     push @arr, $v;
418     }
419    
420     $log->debug("format '$format_utf8' returned ",--$i," elements: ", sub { join(" | ",@arr) }) if (@arr);
421    
422     return @arr;
423     }
424    
425 dpavlin 15
426     =head2 fill_in
427    
428     Workhourse of all: takes record from in-memory structure of database and
429     strings with placeholders and returns string or array of with substituted
430     values from record.
431    
432     my $text = $webpac->fill_in($rec,'v250^a');
433    
434     Optional argument is ordinal number for repeatable fields. By default,
435     it's assume to be first repeatable field (fields are perl array, so first
436     element is 0).
437     Following example will read second value from repeatable field.
438    
439     my $text = $webpac->fill_in($rec,'Title: v250^a',1);
440    
441     This function B<does not> perform parsing of format to inteligenty skip
442     delimiters before fields which aren't used.
443    
444     This method will automatically decode UTF-8 string to local code page
445     if needed.
446    
447     =cut
448    
449     sub fill_in {
450     my $self = shift;
451    
452     my $log = $self->_get_logger();
453    
454     my $rec = shift || $log->logconfess("need data record");
455     my $format = shift || $log->logconfess("need format to parse");
456     # iteration (for repeatable fields)
457     my $i = shift || 0;
458    
459     $log->logdie("infitite loop in format $format") if ($i > ($self->{'max_mfn'} || 9999));
460    
461     # FIXME remove for speedup?
462     $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
463    
464     if (utf8::is_utf8($format)) {
465     $format = $self->_x($format);
466     }
467    
468     my $found = 0;
469    
470     my $eval_code;
471     # remove eval{...} from beginning
472     $eval_code = $1 if ($format =~ s/^eval{([^}]+)}//s);
473    
474     my $filter_name;
475     # remove filter{...} from beginning
476     $filter_name = $1 if ($format =~ s/^filter{([^}]+)}//s);
477    
478     # do actual replacement of placeholders
479     # repeatable fields
480     $format =~ s/v(\d+)(?:\^(\w))?/$self->get_data(\$rec,$1,$2,$i,\$found)/ges;
481     # non-repeatable fields
482     $format =~ s/s(\d+)(?:\^(\w))?/$self->get_data(\$rec,$1,$2,0,\$found)/ges;
483    
484     if ($found) {
485     $log->debug("format: $format");
486     if ($eval_code) {
487     my $eval = $self->fill_in($rec,$eval_code,$i);
488     return if (! $self->_eval($eval));
489     }
490     if ($filter_name && $self->{'filter'}->{$filter_name}) {
491     $log->debug("filter '$filter_name' for $format");
492     $format = $self->{'filter'}->{$filter_name}->($format);
493     return unless(defined($format));
494     $log->debug("filter result: $format");
495     }
496     # do we have lookups?
497     if ($self->{'lookup'}) {
498 dpavlin 31 if ($self->{'lookup'}->can('lookup')) {
499 dpavlin 252 my @lookup = $self->{lookup}->lookup($format);
500 dpavlin 253 $log->debug("lookup $format", join(", ", @lookup));
501 dpavlin 252 return @lookup;
502 dpavlin 31 } else {
503     $log->warn("Have lookup object but can't invoke lookup method");
504     }
505 dpavlin 15 } else {
506     return $format;
507     }
508     } else {
509     return;
510     }
511     }
512    
513    
514 dpavlin 13 =head2 fill_in_to_arr
515    
516     Similar to C<fill_in>, but returns array of all repeatable fields. Usable
517     for fields which have lookups, so they shouldn't be parsed but rather
518     C<fill_id>ed.
519    
520     my @arr = $webpac->fill_in_to_arr($rec,'[v900];;[v250^a]');
521    
522     =cut
523    
524     sub fill_in_to_arr {
525     my $self = shift;
526    
527     my ($rec, $format_utf8) = @_;
528    
529     my $log = $self->_get_logger();
530    
531     $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
532     return if (! $format_utf8);
533    
534     my $i = 0;
535     my @arr;
536    
537     while (my @v = $self->fill_in($rec,$format_utf8,$i++)) {
538     push @arr, @v;
539     }
540    
541     $log->debug("format '$format_utf8' returned ",--$i," elements: ", sub { join(" | ",@arr) }) if (@arr);
542    
543     return @arr;
544     }
545    
546 dpavlin 15
547     =head2 get_data
548    
549     Returns value from record.
550    
551     my $text = $self->get_data(\$rec,$f,$sf,$i,\$found);
552    
553     Arguments are:
554     record reference C<$rec>,
555     field C<$f>,
556     optional subfiled C<$sf>,
557     index for repeatable values C<$i>.
558    
559     Optinal variable C<$found> will be incremeted if there
560     is field.
561    
562     Returns value or empty string.
563    
564     =cut
565    
566     sub get_data {
567     my $self = shift;
568    
569     my ($rec,$f,$sf,$i,$found) = @_;
570    
571     if ($$rec->{$f}) {
572     return '' if (! $$rec->{$f}->[$i]);
573     no strict 'refs';
574     if ($sf && $$rec->{$f}->[$i]->{$sf}) {
575     $$found++ if (defined($$found));
576     return $$rec->{$f}->[$i]->{$sf};
577 dpavlin 64 } elsif (! $sf && $$rec->{$f}->[$i]) {
578 dpavlin 15 $$found++ if (defined($$found));
579     # it still might have subfield, just
580     # not specified, so we'll dump all
581     if ($$rec->{$f}->[$i] =~ /HASH/o) {
582     my $out;
583     foreach my $k (keys %{$$rec->{$f}->[$i]}) {
584     $out .= $$rec->{$f}->[$i]->{$k}." ";
585     }
586     return $out;
587     } else {
588     return $$rec->{$f}->[$i];
589     }
590 dpavlin 64 } else {
591     return '';
592 dpavlin 15 }
593     } else {
594     return '';
595     }
596     }
597    
598    
599     =head2 apply_format
600    
601     Apply format specified in tag with C<format_name="name"> and
602     C<format_delimiter=";;">.
603    
604     my $text = $webpac->apply_format($format_name,$format_delimiter,$data);
605    
606     Formats can contain C<lookup{...}> if you need them.
607    
608     =cut
609    
610     sub apply_format {
611     my $self = shift;
612    
613     my ($name,$delimiter,$data) = @_;
614    
615     my $log = $self->_get_logger();
616    
617     if (! $self->{'import_xml'}->{'format'}->{$name}) {
618     $log->warn("<format name=\"$name\"> is not defined in ",$self->{'import_xml_file'});
619     return $data;
620     }
621    
622     $log->warn("no delimiter for format $name") if (! $delimiter);
623    
624     my $format = $self->_x($self->{'import_xml'}->{'format'}->{$name}->{'content'}) || $log->logdie("can't find format '$name'");
625    
626     my @data = split(/\Q$delimiter\E/, $data);
627    
628     my $out = sprintf($format, @data);
629     $log->debug("using format $name [$format] on $data to produce: $out");
630    
631     if ($self->{'lookup_regex'} && $out =~ $self->{'lookup_regex'}) {
632 dpavlin 31 return $self->{'lookup'}->lookup($out);
633 dpavlin 15 } else {
634     return $out;
635     }
636    
637     }
638    
639 dpavlin 13 =head2 sort_arr
640    
641     Sort array ignoring case and html in data
642    
643     my @sorted = $webpac->sort_arr(@unsorted);
644    
645     =cut
646    
647     sub sort_arr {
648     my $self = shift;
649    
650     my $log = $self->_get_logger();
651    
652     # FIXME add Schwartzian Transformation?
653    
654     my @sorted = sort {
655     $a =~ s#<[^>]+/*>##;
656     $b =~ s#<[^>]+/*>##;
657     lc($b) cmp lc($a)
658     } @_;
659     $log->debug("sorted values: ",sub { join(", ",@sorted) });
660    
661     return @sorted;
662     }
663    
664    
665 dpavlin 15 =head1 INTERNAL METHODS
666    
667 dpavlin 13 =head2 _sort_by_order
668    
669     Sort xml tags data structure accoding to C<order=""> attribute.
670    
671     =cut
672    
673     sub _sort_by_order {
674     my $self = shift;
675    
676     my $va = $self->{'import_xml'}->{'indexer'}->{$a}->{'order'} ||
677     $self->{'import_xml'}->{'indexer'}->{$a};
678     my $vb = $self->{'import_xml'}->{'indexer'}->{$b}->{'order'} ||
679     $self->{'import_xml'}->{'indexer'}->{$b};
680    
681     return $va <=> $vb;
682     }
683    
684     =head2 _x
685    
686 dpavlin 15 Convert strings from C<conf/normalize/*.xml> encoding into application
687     specific encoding (optinally specified using C<code_page> to C<new>
688     constructor).
689 dpavlin 13
690     my $text = $n->_x('normalize text string');
691    
692     This is a stub so that other modules doesn't have to implement it.
693    
694     =cut
695    
696     sub _x {
697     my $self = shift;
698     return shift;
699     }
700    
701    
702 dpavlin 10 =head1 AUTHOR
703    
704     Dobrica Pavlinusic, C<< <dpavlin@rot13.org> >>
705    
706     =head1 COPYRIGHT & LICENSE
707    
708     Copyright 2005 Dobrica Pavlinusic, All Rights Reserved.
709    
710     This program is free software; you can redistribute it and/or modify it
711     under the same terms as Perl itself.
712    
713     =cut
714    
715 dpavlin 209 1; # End of WebPAC::Normalize

  ViewVC Help
Powered by ViewVC 1.1.26