/[webpac2]/trunk/lib/WebPAC/Normalize.pm
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Annotation of /trunk/lib/WebPAC/Normalize.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 29 - (hide annotations)
Sun Jul 24 11:17:44 2005 UTC (18 years, 8 months ago) by dpavlin
File size: 14636 byte(s)
some logging improvements and sample configuration file

1 dpavlin 10 package WebPAC::Normalize;
2    
3     use warnings;
4     use strict;
5 dpavlin 29 use base 'WebPAC::Common';
6 dpavlin 13 use Data::Dumper;
7 dpavlin 10
8     =head1 NAME
9    
10 dpavlin 15 WebPAC::Normalize - data mungling for normalisation
11 dpavlin 10
12     =head1 VERSION
13    
14     Version 0.01
15    
16     =cut
17    
18     our $VERSION = '0.01';
19    
20     =head1 SYNOPSIS
21    
22 dpavlin 15 This package contains code that mungle data to produce normalized format.
23 dpavlin 10
24 dpavlin 15 It contains several assumptions:
25    
26     =over
27    
28     =item *
29    
30     format of fields is defined using C<v123^a> notation for repeatable fields
31     or C<s123^a> for single (or first) value, where C<123> is field number and
32     C<a> is subfield.
33    
34     =item *
35    
36     source data records (C<$rec>) have unique identifiers in field C<000>
37    
38     =item *
39    
40     optional C<eval{length('v123^a') == 3}> tag at B<beginning of format> will be
41     perl code that is evaluated before producing output (value of field will be
42     interpolated before that)
43    
44     =item *
45    
46     optional C<filter{filter_name}> at B<begining of format> will apply perl
47     code defined as code ref on format after field substitution to producing
48     output
49    
50     =item *
51    
52     optional C<lookup{...}> will be then performed. See C<WebPAC::Lookups>.
53    
54     =item *
55    
56     at end, optional C<format>s rules are resolved. Format rules are similar to
57     C<sprintf> and can also contain C<lookup{...}> which is performed after
58     values are inserted in format.
59    
60     =back
61    
62     This also describes order in which transformations are applied (eval,
63     filter, lookup, format) which is important to undestand when deciding how to
64     solve your data mungling and normalisation process.
65    
66    
67    
68    
69 dpavlin 10 =head1 FUNCTIONS
70    
71 dpavlin 13 =head2 new
72 dpavlin 10
73 dpavlin 13 Create new normalisation object
74    
75     my $n = new WebPAC::Normalize::Something(
76 dpavlin 15 filter => {
77     'filter_name_1' => sub {
78     # filter code
79     return length($_);
80     }, ...
81     },
82 dpavlin 29 db => $db_obj,
83 dpavlin 13 lookup_regex => $lookup->regex,
84     );
85    
86 dpavlin 15 Parametar C<filter> defines user supplied snippets of perl code which can
87     be use with C<filter{...}> notation.
88    
89 dpavlin 13 Recommended parametar C<lookup_regex> is used to enable parsing of lookups
90     in structures.
91    
92 dpavlin 10 =cut
93    
94 dpavlin 13 sub new {
95     my $class = shift;
96     my $self = {@_};
97     bless($self, $class);
98    
99     $self ? return $self : return undef;
100 dpavlin 10 }
101    
102 dpavlin 13
103     =head2 data_structure
104    
105     Create in-memory data structure which represents normalized layout from
106     C<conf/normalize/*.xml>.
107    
108     This structures are used to produce output.
109    
110     my @ds = $webpac->data_structure($rec);
111    
112     B<Note: historical oddity follows>
113    
114     This method will also set C<< $webpac->{'currnet_filename'} >> if there is
115     C<< <filename> >> tag and C<< $webpac->{'headline'} >> if there is
116     C<< <headline> >> tag.
117    
118     =cut
119    
120     sub data_structure {
121     my $self = shift;
122    
123     my $log = $self->_get_logger();
124    
125     my $rec = shift;
126     $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
127    
128     my $cache_file;
129    
130 dpavlin 18 if ($self->{'db'}) {
131 dpavlin 22 my @ds = $self->{'db'}->load_ds($rec);
132 dpavlin 29 $log->debug("load_ds( rec = ", sub { Dumper($rec) }, ") = ", sub { Dumper(@ds) });
133     return @ds if ($#ds > 0);
134     $log->debug("cache miss, creating");
135 dpavlin 13 }
136    
137     undef $self->{'currnet_filename'};
138     undef $self->{'headline'};
139    
140     my @sorted_tags;
141     if ($self->{tags_by_order}) {
142     @sorted_tags = @{$self->{tags_by_order}};
143     } else {
144     @sorted_tags = sort { $self->_sort_by_order } keys %{$self->{'import_xml'}->{'indexer'}};
145     $self->{tags_by_order} = \@sorted_tags;
146     }
147    
148     my @ds;
149    
150     $log->debug("tags: ",sub { join(", ",@sorted_tags) });
151    
152     foreach my $field (@sorted_tags) {
153    
154     my $row;
155    
156     #print "field $field [",$self->{'tag'},"] = ",Dumper($self->{'import_xml'}->{'indexer'}->{$field}->{$self->{'tag'}});
157    
158     foreach my $tag (@{$self->{'import_xml'}->{'indexer'}->{$field}->{$self->{'tag'}}}) {
159     my $format = $tag->{'value'} || $tag->{'content'};
160    
161     $log->debug("format: $format");
162    
163     my @v;
164     if ($self->{'lookup_regex'} && $format =~ $self->{'lookup_regex'}) {
165     @v = $self->fill_in_to_arr($rec,$format);
166     } else {
167     @v = $self->parse_to_arr($rec,$format);
168     }
169     next if (! @v);
170    
171     if ($tag->{'sort'}) {
172     @v = $self->sort_arr(@v);
173     }
174    
175     # use format?
176     if ($tag->{'format_name'}) {
177     @v = map { $self->apply_format($tag->{'format_name'},$tag->{'format_delimiter'},$_) } @v;
178     }
179    
180     if ($field eq 'filename') {
181     $self->{'current_filename'} = join('',@v);
182     $log->debug("filename: ",$self->{'current_filename'});
183     } elsif ($field eq 'headline') {
184     $self->{'headline'} .= join('',@v);
185     $log->debug("headline: ",$self->{'headline'});
186     next; # don't return headline in data_structure!
187     }
188    
189     # delimiter will join repeatable fields
190     if ($tag->{'delimiter'}) {
191     @v = ( join($tag->{'delimiter'}, @v) );
192     }
193    
194     # default types
195     my @types = qw(display swish);
196     # override by type attribute
197     @types = ( $tag->{'type'} ) if ($tag->{'type'});
198    
199     foreach my $type (@types) {
200     # append to previous line?
201     $log->debug("type: $type ",sub { join(" ",@v) }, $row->{'append'} || 'no append');
202     if ($tag->{'append'}) {
203    
204     # I will delimit appended part with
205     # delimiter (or ,)
206     my $d = $tag->{'delimiter'};
207     # default delimiter
208     $d ||= " ";
209    
210     my $last = pop @{$row->{$type}};
211     $d = "" if (! $last);
212     $last .= $d . join($d, @v);
213     push @{$row->{$type}}, $last;
214    
215     } else {
216     push @{$row->{$type}}, @v;
217     }
218     }
219    
220    
221     }
222    
223     if ($row) {
224     $row->{'tag'} = $field;
225    
226     # TODO: name_sigular, name_plural
227     my $name = $self->{'import_xml'}->{'indexer'}->{$field}->{'name'};
228     $row->{'name'} = $name ? $self->_x($name) : $field;
229    
230     # post-sort all values in field
231     if ($self->{'import_xml'}->{'indexer'}->{$field}->{'sort'}) {
232     $log->warn("sort at field tag not implemented");
233     }
234    
235     push @ds, $row;
236    
237     $log->debug("row $field: ",sub { Dumper($row) });
238     }
239    
240     }
241    
242 dpavlin 22 $self->{'db'}->save_ds(
243 dpavlin 18 ds => \@ds,
244     current_filename => $self->{'current_filename'},
245     headline => $self->{'headline'},
246     ) if ($self->{'db'});
247 dpavlin 13
248 dpavlin 29 $log->debug("ds: ", sub { Dumper(@ds) });
249    
250 dpavlin 13 return @ds;
251    
252     }
253    
254     =head2 parse
255    
256     Perform smart parsing of string, skipping delimiters for fields which aren't
257     defined. It can also eval code in format starting with C<eval{...}> and
258     return output or nothing depending on eval code.
259    
260     my $text = $webpac->parse($rec,'eval{"v901^a" eq "Deskriptor"}descriptor: v250^a', $i);
261    
262     =cut
263    
264     sub parse {
265     my $self = shift;
266    
267     my ($rec, $format_utf8, $i) = @_;
268    
269     return if (! $format_utf8);
270    
271     my $log = $self->_get_logger();
272    
273     $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
274    
275     $i = 0 if (! $i);
276    
277     my $format = $self->_x($format_utf8) || $log->logconfess("can't convert '$format_utf8' from UTF-8 to ",$self->{'code_page'});
278    
279     my @out;
280    
281     $log->debug("format: $format");
282    
283     my $eval_code;
284     # remove eval{...} from beginning
285     $eval_code = $1 if ($format =~ s/^eval{([^}]+)}//s);
286    
287     my $filter_name;
288     # remove filter{...} from beginning
289     $filter_name = $1 if ($format =~ s/^filter{([^}]+)}//s);
290    
291     my $prefix;
292     my $all_found=0;
293    
294     while ($format =~ s/^(.*?)(v|s)(\d+)(?:\^(\w))?//s) {
295    
296     my $del = $1 || '';
297     $prefix ||= $del if ($all_found == 0);
298    
299     # repeatable index
300     my $r = $i;
301     $r = 0 if (lc("$2") eq 's');
302    
303     my $found = 0;
304     my $tmp = $self->get_data(\$rec,$3,$4,$r,\$found);
305    
306     if ($found) {
307     push @out, $del;
308     push @out, $tmp;
309     $all_found += $found;
310     }
311     }
312    
313     return if (! $all_found);
314    
315     my $out = join('',@out);
316    
317     if ($out) {
318     # add rest of format (suffix)
319     $out .= $format;
320    
321     # add prefix if not there
322     $out = $prefix . $out if ($out !~ m/^\Q$prefix\E/);
323    
324     $log->debug("result: $out");
325     }
326    
327     if ($eval_code) {
328     my $eval = $self->fill_in($rec,$eval_code,$i) || return;
329     $log->debug("about to eval{$eval} format: $out");
330     return if (! $self->_eval($eval));
331     }
332    
333     if ($filter_name && $self->{'filter'}->{$filter_name}) {
334     $log->debug("about to filter{$filter_name} format: $out");
335     $out = $self->{'filter'}->{$filter_name}->($out);
336     return unless(defined($out));
337     $log->debug("filter result: $out");
338     }
339    
340     return $out;
341     }
342    
343     =head2 parse_to_arr
344    
345     Similar to C<parse>, but returns array of all repeatable fields
346    
347     my @arr = $webpac->parse_to_arr($rec,'v250^a');
348    
349     =cut
350    
351     sub parse_to_arr {
352     my $self = shift;
353    
354     my ($rec, $format_utf8) = @_;
355    
356     my $log = $self->_get_logger();
357    
358     $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
359     return if (! $format_utf8);
360    
361     my $i = 0;
362     my @arr;
363    
364     while (my $v = $self->parse($rec,$format_utf8,$i++)) {
365     push @arr, $v;
366     }
367    
368     $log->debug("format '$format_utf8' returned ",--$i," elements: ", sub { join(" | ",@arr) }) if (@arr);
369    
370     return @arr;
371     }
372    
373 dpavlin 15
374     =head2 fill_in
375    
376     Workhourse of all: takes record from in-memory structure of database and
377     strings with placeholders and returns string or array of with substituted
378     values from record.
379    
380     my $text = $webpac->fill_in($rec,'v250^a');
381    
382     Optional argument is ordinal number for repeatable fields. By default,
383     it's assume to be first repeatable field (fields are perl array, so first
384     element is 0).
385     Following example will read second value from repeatable field.
386    
387     my $text = $webpac->fill_in($rec,'Title: v250^a',1);
388    
389     This function B<does not> perform parsing of format to inteligenty skip
390     delimiters before fields which aren't used.
391    
392     This method will automatically decode UTF-8 string to local code page
393     if needed.
394    
395     =cut
396    
397     sub fill_in {
398     my $self = shift;
399    
400     my $log = $self->_get_logger();
401    
402     my $rec = shift || $log->logconfess("need data record");
403     my $format = shift || $log->logconfess("need format to parse");
404     # iteration (for repeatable fields)
405     my $i = shift || 0;
406    
407     $log->logdie("infitite loop in format $format") if ($i > ($self->{'max_mfn'} || 9999));
408    
409     # FIXME remove for speedup?
410     $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
411    
412     if (utf8::is_utf8($format)) {
413     $format = $self->_x($format);
414     }
415    
416     my $found = 0;
417    
418     my $eval_code;
419     # remove eval{...} from beginning
420     $eval_code = $1 if ($format =~ s/^eval{([^}]+)}//s);
421    
422     my $filter_name;
423     # remove filter{...} from beginning
424     $filter_name = $1 if ($format =~ s/^filter{([^}]+)}//s);
425    
426     # do actual replacement of placeholders
427     # repeatable fields
428     $format =~ s/v(\d+)(?:\^(\w))?/$self->get_data(\$rec,$1,$2,$i,\$found)/ges;
429     # non-repeatable fields
430     $format =~ s/s(\d+)(?:\^(\w))?/$self->get_data(\$rec,$1,$2,0,\$found)/ges;
431    
432     if ($found) {
433     $log->debug("format: $format");
434     if ($eval_code) {
435     my $eval = $self->fill_in($rec,$eval_code,$i);
436     return if (! $self->_eval($eval));
437     }
438     if ($filter_name && $self->{'filter'}->{$filter_name}) {
439     $log->debug("filter '$filter_name' for $format");
440     $format = $self->{'filter'}->{$filter_name}->($format);
441     return unless(defined($format));
442     $log->debug("filter result: $format");
443     }
444     # do we have lookups?
445     if ($self->{'lookup'}) {
446     return $self->lookup($format);
447     } else {
448     return $format;
449     }
450     } else {
451     return;
452     }
453     }
454    
455    
456 dpavlin 13 =head2 fill_in_to_arr
457    
458     Similar to C<fill_in>, but returns array of all repeatable fields. Usable
459     for fields which have lookups, so they shouldn't be parsed but rather
460     C<fill_id>ed.
461    
462     my @arr = $webpac->fill_in_to_arr($rec,'[v900];;[v250^a]');
463    
464     =cut
465    
466     sub fill_in_to_arr {
467     my $self = shift;
468    
469     my ($rec, $format_utf8) = @_;
470    
471     my $log = $self->_get_logger();
472    
473     $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
474     return if (! $format_utf8);
475    
476     my $i = 0;
477     my @arr;
478    
479     while (my @v = $self->fill_in($rec,$format_utf8,$i++)) {
480     push @arr, @v;
481     }
482    
483     $log->debug("format '$format_utf8' returned ",--$i," elements: ", sub { join(" | ",@arr) }) if (@arr);
484    
485     return @arr;
486     }
487    
488 dpavlin 15
489     =head2 get_data
490    
491     Returns value from record.
492    
493     my $text = $self->get_data(\$rec,$f,$sf,$i,\$found);
494    
495     Arguments are:
496     record reference C<$rec>,
497     field C<$f>,
498     optional subfiled C<$sf>,
499     index for repeatable values C<$i>.
500    
501     Optinal variable C<$found> will be incremeted if there
502     is field.
503    
504     Returns value or empty string.
505    
506     =cut
507    
508     sub get_data {
509     my $self = shift;
510    
511     my ($rec,$f,$sf,$i,$found) = @_;
512    
513     if ($$rec->{$f}) {
514     return '' if (! $$rec->{$f}->[$i]);
515     no strict 'refs';
516     if ($sf && $$rec->{$f}->[$i]->{$sf}) {
517     $$found++ if (defined($$found));
518     return $$rec->{$f}->[$i]->{$sf};
519     } elsif ($$rec->{$f}->[$i]) {
520     $$found++ if (defined($$found));
521     # it still might have subfield, just
522     # not specified, so we'll dump all
523     if ($$rec->{$f}->[$i] =~ /HASH/o) {
524     my $out;
525     foreach my $k (keys %{$$rec->{$f}->[$i]}) {
526     $out .= $$rec->{$f}->[$i]->{$k}." ";
527     }
528     return $out;
529     } else {
530     return $$rec->{$f}->[$i];
531     }
532     }
533     } else {
534     return '';
535     }
536     }
537    
538    
539     =head2 apply_format
540    
541     Apply format specified in tag with C<format_name="name"> and
542     C<format_delimiter=";;">.
543    
544     my $text = $webpac->apply_format($format_name,$format_delimiter,$data);
545    
546     Formats can contain C<lookup{...}> if you need them.
547    
548     =cut
549    
550     sub apply_format {
551     my $self = shift;
552    
553     my ($name,$delimiter,$data) = @_;
554    
555     my $log = $self->_get_logger();
556    
557     if (! $self->{'import_xml'}->{'format'}->{$name}) {
558     $log->warn("<format name=\"$name\"> is not defined in ",$self->{'import_xml_file'});
559     return $data;
560     }
561    
562     $log->warn("no delimiter for format $name") if (! $delimiter);
563    
564     my $format = $self->_x($self->{'import_xml'}->{'format'}->{$name}->{'content'}) || $log->logdie("can't find format '$name'");
565    
566     my @data = split(/\Q$delimiter\E/, $data);
567    
568     my $out = sprintf($format, @data);
569     $log->debug("using format $name [$format] on $data to produce: $out");
570    
571     if ($self->{'lookup_regex'} && $out =~ $self->{'lookup_regex'}) {
572     return $self->lookup($out);
573     } else {
574     return $out;
575     }
576    
577     }
578    
579 dpavlin 13 =head2 sort_arr
580    
581     Sort array ignoring case and html in data
582    
583     my @sorted = $webpac->sort_arr(@unsorted);
584    
585     =cut
586    
587     sub sort_arr {
588     my $self = shift;
589    
590     my $log = $self->_get_logger();
591    
592     # FIXME add Schwartzian Transformation?
593    
594     my @sorted = sort {
595     $a =~ s#<[^>]+/*>##;
596     $b =~ s#<[^>]+/*>##;
597     lc($b) cmp lc($a)
598     } @_;
599     $log->debug("sorted values: ",sub { join(", ",@sorted) });
600    
601     return @sorted;
602     }
603    
604    
605 dpavlin 15 =head1 INTERNAL METHODS
606    
607 dpavlin 13 =head2 _sort_by_order
608    
609     Sort xml tags data structure accoding to C<order=""> attribute.
610    
611     =cut
612    
613     sub _sort_by_order {
614     my $self = shift;
615    
616     my $va = $self->{'import_xml'}->{'indexer'}->{$a}->{'order'} ||
617     $self->{'import_xml'}->{'indexer'}->{$a};
618     my $vb = $self->{'import_xml'}->{'indexer'}->{$b}->{'order'} ||
619     $self->{'import_xml'}->{'indexer'}->{$b};
620    
621     return $va <=> $vb;
622     }
623    
624     =head2 _x
625    
626 dpavlin 15 Convert strings from C<conf/normalize/*.xml> encoding into application
627     specific encoding (optinally specified using C<code_page> to C<new>
628     constructor).
629 dpavlin 13
630     my $text = $n->_x('normalize text string');
631    
632     This is a stub so that other modules doesn't have to implement it.
633    
634     =cut
635    
636     sub _x {
637     my $self = shift;
638     return shift;
639     }
640    
641    
642 dpavlin 10 =head1 AUTHOR
643    
644     Dobrica Pavlinusic, C<< <dpavlin@rot13.org> >>
645    
646     =head1 COPYRIGHT & LICENSE
647    
648     Copyright 2005 Dobrica Pavlinusic, All Rights Reserved.
649    
650     This program is free software; you can redistribute it and/or modify it
651     under the same terms as Perl itself.
652    
653     =cut
654    
655     1; # End of WebPAC::DB

  ViewVC Help
Powered by ViewVC 1.1.26