/[webpac2]/trunk/lib/WebPAC/Normalize.pm
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Annotation of /trunk/lib/WebPAC/Normalize.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 253 - (hide annotations)
Thu Dec 15 17:01:10 2005 UTC (17 years, 1 month ago) by dpavlin
File size: 15631 byte(s)
 r11712@llin:  dpavlin | 2005-12-15 21:01:03 +0100
 lookups now work [2.00_3]

1 dpavlin 10 package WebPAC::Normalize;
2    
3     use warnings;
4     use strict;
5 dpavlin 29 use base 'WebPAC::Common';
6 dpavlin 13 use Data::Dumper;
7 dpavlin 10
8     =head1 NAME
9    
10 dpavlin 15 WebPAC::Normalize - data mungling for normalisation
11 dpavlin 10
12     =head1 VERSION
13    
14 dpavlin 253 Version 0.05
15 dpavlin 10
16     =cut
17    
18 dpavlin 253 our $VERSION = '0.05';
19 dpavlin 10
20     =head1 SYNOPSIS
21    
22 dpavlin 15 This package contains code that mungle data to produce normalized format.
23 dpavlin 10
24 dpavlin 15 It contains several assumptions:
25    
26     =over
27    
28     =item *
29    
30     format of fields is defined using C<v123^a> notation for repeatable fields
31     or C<s123^a> for single (or first) value, where C<123> is field number and
32     C<a> is subfield.
33    
34     =item *
35    
36     source data records (C<$rec>) have unique identifiers in field C<000>
37    
38     =item *
39    
40     optional C<eval{length('v123^a') == 3}> tag at B<beginning of format> will be
41     perl code that is evaluated before producing output (value of field will be
42     interpolated before that)
43    
44     =item *
45    
46     optional C<filter{filter_name}> at B<begining of format> will apply perl
47     code defined as code ref on format after field substitution to producing
48     output
49    
50     =item *
51    
52     optional C<lookup{...}> will be then performed. See C<WebPAC::Lookups>.
53    
54     =item *
55    
56     at end, optional C<format>s rules are resolved. Format rules are similar to
57     C<sprintf> and can also contain C<lookup{...}> which is performed after
58     values are inserted in format.
59    
60     =back
61    
62     This also describes order in which transformations are applied (eval,
63     filter, lookup, format) which is important to undestand when deciding how to
64     solve your data mungling and normalisation process.
65    
66    
67    
68    
69 dpavlin 10 =head1 FUNCTIONS
70    
71 dpavlin 13 =head2 new
72 dpavlin 10
73 dpavlin 13 Create new normalisation object
74    
75     my $n = new WebPAC::Normalize::Something(
76 dpavlin 15 filter => {
77     'filter_name_1' => sub {
78     # filter code
79     return length($_);
80     }, ...
81     },
82 dpavlin 29 db => $db_obj,
83 dpavlin 13 lookup_regex => $lookup->regex,
84 dpavlin 31 lookup => $lookup_obj,
85 dpavlin 219 prefix => 'foobar',
86 dpavlin 13 );
87    
88 dpavlin 15 Parametar C<filter> defines user supplied snippets of perl code which can
89     be use with C<filter{...}> notation.
90    
91 dpavlin 219 C<prefix> is used to form filename for database record (to support multiple
92     source files which are joined in one database).
93    
94 dpavlin 13 Recommended parametar C<lookup_regex> is used to enable parsing of lookups
95 dpavlin 31 in structures. If you pass this parametar, you must also pass C<lookup>
96     which is C<WebPAC::Lookup> object.
97 dpavlin 13
98 dpavlin 10 =cut
99    
100 dpavlin 13 sub new {
101     my $class = shift;
102     my $self = {@_};
103     bless($self, $class);
104    
105 dpavlin 31 my $r = $self->{'lookup_regex'} ? 1 : 0;
106     my $l = $self->{'lookup'} ? 1 : 0;
107    
108     my $log = $self->_get_logger();
109    
110     # those two must be in pair
111     if ( ($r & $l) != ($r || $l) ) {
112     my $log = $self->_get_logger();
113     $log->logdie("lookup_regex and lookup must be in pair");
114     }
115    
116     $log->logdie("lookup must be WebPAC::Lookup object") if ($self->{'lookup'} && ! $self->{'lookup'}->isa('WebPAC::Lookup'));
117    
118 dpavlin 219 $log->warn("no prefix defined. please check that!") unless ($self->{'prefix'});
119    
120 dpavlin 252 $log->debug("using lookup regex: ", $self->{lookup_regex}) if ($r && $l);
121    
122 dpavlin 13 $self ? return $self : return undef;
123 dpavlin 10 }
124    
125 dpavlin 13
126     =head2 data_structure
127    
128     Create in-memory data structure which represents normalized layout from
129     C<conf/normalize/*.xml>.
130    
131     This structures are used to produce output.
132    
133 dpavlin 70 my $ds = $webpac->data_structure($rec);
134 dpavlin 13
135     =cut
136    
137     sub data_structure {
138     my $self = shift;
139    
140     my $log = $self->_get_logger();
141    
142     my $rec = shift;
143     $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
144    
145 dpavlin 125 $log->debug("data_structure rec = ", sub { Dumper($rec) });
146    
147     $log->logdie("need unique ID (mfn) in field 000 of record ", sub { Dumper($rec) } ) unless (defined($rec->{'000'}));
148    
149 dpavlin 219 my $id = $rec->{'000'}->[0] || $log->logdie("field 000 isn't array!");
150 dpavlin 125
151 dpavlin 13 my $cache_file;
152    
153 dpavlin 18 if ($self->{'db'}) {
154 dpavlin 219 my $ds = $self->{'db'}->load_ds( id => $id, prefix => $self->{prefix} );
155 dpavlin 70 $log->debug("load_ds( rec = ", sub { Dumper($rec) }, ") = ", sub { Dumper($ds) });
156     return $ds if ($ds);
157 dpavlin 29 $log->debug("cache miss, creating");
158 dpavlin 13 }
159    
160     undef $self->{'currnet_filename'};
161     undef $self->{'headline'};
162    
163     my @sorted_tags;
164     if ($self->{tags_by_order}) {
165     @sorted_tags = @{$self->{tags_by_order}};
166     } else {
167     @sorted_tags = sort { $self->_sort_by_order } keys %{$self->{'import_xml'}->{'indexer'}};
168     $self->{tags_by_order} = \@sorted_tags;
169     }
170    
171 dpavlin 70 my $ds;
172 dpavlin 13
173     $log->debug("tags: ",sub { join(", ",@sorted_tags) });
174    
175     foreach my $field (@sorted_tags) {
176    
177     my $row;
178    
179     #print "field $field [",$self->{'tag'},"] = ",Dumper($self->{'import_xml'}->{'indexer'}->{$field}->{$self->{'tag'}});
180    
181     foreach my $tag (@{$self->{'import_xml'}->{'indexer'}->{$field}->{$self->{'tag'}}}) {
182 dpavlin 38 my $format;
183 dpavlin 13
184 dpavlin 38 $log->logdie("expected tag HASH and got $tag") unless (ref($tag) eq 'HASH');
185     $format = $tag->{'value'} || $tag->{'content'};
186    
187 dpavlin 13 $log->debug("format: $format");
188    
189     my @v;
190     if ($self->{'lookup_regex'} && $format =~ $self->{'lookup_regex'}) {
191     @v = $self->fill_in_to_arr($rec,$format);
192     } else {
193     @v = $self->parse_to_arr($rec,$format);
194     }
195     next if (! @v);
196    
197     if ($tag->{'sort'}) {
198     @v = $self->sort_arr(@v);
199     }
200    
201     # use format?
202     if ($tag->{'format_name'}) {
203     @v = map { $self->apply_format($tag->{'format_name'},$tag->{'format_delimiter'},$_) } @v;
204     }
205    
206     # delimiter will join repeatable fields
207     if ($tag->{'delimiter'}) {
208     @v = ( join($tag->{'delimiter'}, @v) );
209     }
210    
211     # default types
212 dpavlin 74 my @types = qw(display search);
213 dpavlin 13 # override by type attribute
214     @types = ( $tag->{'type'} ) if ($tag->{'type'});
215    
216     foreach my $type (@types) {
217     # append to previous line?
218 dpavlin 252 $log->debug("type: $type ",sub { join(" ",@v) }, " ", $row->{'append'} || 'no append');
219 dpavlin 13 if ($tag->{'append'}) {
220    
221     # I will delimit appended part with
222     # delimiter (or ,)
223     my $d = $tag->{'delimiter'};
224     # default delimiter
225     $d ||= " ";
226    
227     my $last = pop @{$row->{$type}};
228     $d = "" if (! $last);
229     $last .= $d . join($d, @v);
230     push @{$row->{$type}}, $last;
231    
232     } else {
233     push @{$row->{$type}}, @v;
234     }
235     }
236    
237    
238     }
239    
240     if ($row) {
241     $row->{'tag'} = $field;
242    
243     # TODO: name_sigular, name_plural
244     my $name = $self->{'import_xml'}->{'indexer'}->{$field}->{'name'};
245 dpavlin 70 my $row_name = $name ? $self->_x($name) : $field;
246 dpavlin 13
247     # post-sort all values in field
248     if ($self->{'import_xml'}->{'indexer'}->{$field}->{'sort'}) {
249     $log->warn("sort at field tag not implemented");
250     }
251    
252 dpavlin 70 $ds->{$row_name} = $row;
253 dpavlin 13
254     $log->debug("row $field: ",sub { Dumper($row) });
255     }
256    
257     }
258    
259 dpavlin 22 $self->{'db'}->save_ds(
260 dpavlin 219 id => $id,
261 dpavlin 70 ds => $ds,
262 dpavlin 219 prefix => $self->{prefix},
263 dpavlin 18 ) if ($self->{'db'});
264 dpavlin 13
265 dpavlin 70 $log->debug("ds: ", sub { Dumper($ds) });
266 dpavlin 29
267 dpavlin 70 $log->logconfess("data structure returned is not array any more!") if wantarray;
268 dpavlin 13
269 dpavlin 70 return $ds;
270    
271 dpavlin 13 }
272    
273     =head2 parse
274    
275     Perform smart parsing of string, skipping delimiters for fields which aren't
276     defined. It can also eval code in format starting with C<eval{...}> and
277     return output or nothing depending on eval code.
278    
279     my $text = $webpac->parse($rec,'eval{"v901^a" eq "Deskriptor"}descriptor: v250^a', $i);
280    
281     =cut
282    
283     sub parse {
284     my $self = shift;
285    
286     my ($rec, $format_utf8, $i) = @_;
287    
288     return if (! $format_utf8);
289    
290     my $log = $self->_get_logger();
291    
292     $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
293    
294     $i = 0 if (! $i);
295    
296     my $format = $self->_x($format_utf8) || $log->logconfess("can't convert '$format_utf8' from UTF-8 to ",$self->{'code_page'});
297    
298     my @out;
299    
300     $log->debug("format: $format");
301    
302     my $eval_code;
303     # remove eval{...} from beginning
304     $eval_code = $1 if ($format =~ s/^eval{([^}]+)}//s);
305    
306     my $filter_name;
307     # remove filter{...} from beginning
308     $filter_name = $1 if ($format =~ s/^filter{([^}]+)}//s);
309    
310     my $prefix;
311     my $all_found=0;
312    
313     while ($format =~ s/^(.*?)(v|s)(\d+)(?:\^(\w))?//s) {
314    
315     my $del = $1 || '';
316     $prefix ||= $del if ($all_found == 0);
317    
318     # repeatable index
319     my $r = $i;
320     $r = 0 if (lc("$2") eq 's');
321    
322     my $found = 0;
323     my $tmp = $self->get_data(\$rec,$3,$4,$r,\$found);
324    
325     if ($found) {
326     push @out, $del;
327     push @out, $tmp;
328     $all_found += $found;
329     }
330     }
331    
332     return if (! $all_found);
333    
334     my $out = join('',@out);
335    
336     if ($out) {
337     # add rest of format (suffix)
338     $out .= $format;
339    
340     # add prefix if not there
341     $out = $prefix . $out if ($out !~ m/^\Q$prefix\E/);
342    
343     $log->debug("result: $out");
344     }
345    
346     if ($eval_code) {
347     my $eval = $self->fill_in($rec,$eval_code,$i) || return;
348     $log->debug("about to eval{$eval} format: $out");
349     return if (! $self->_eval($eval));
350     }
351    
352     if ($filter_name && $self->{'filter'}->{$filter_name}) {
353     $log->debug("about to filter{$filter_name} format: $out");
354     $out = $self->{'filter'}->{$filter_name}->($out);
355     return unless(defined($out));
356     $log->debug("filter result: $out");
357     }
358    
359     return $out;
360     }
361    
362     =head2 parse_to_arr
363    
364     Similar to C<parse>, but returns array of all repeatable fields
365    
366     my @arr = $webpac->parse_to_arr($rec,'v250^a');
367    
368     =cut
369    
370     sub parse_to_arr {
371     my $self = shift;
372    
373     my ($rec, $format_utf8) = @_;
374    
375     my $log = $self->_get_logger();
376    
377     $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
378     return if (! $format_utf8);
379    
380     my $i = 0;
381     my @arr;
382    
383     while (my $v = $self->parse($rec,$format_utf8,$i++)) {
384     push @arr, $v;
385     }
386    
387     $log->debug("format '$format_utf8' returned ",--$i," elements: ", sub { join(" | ",@arr) }) if (@arr);
388    
389     return @arr;
390     }
391    
392 dpavlin 15
393     =head2 fill_in
394    
395     Workhourse of all: takes record from in-memory structure of database and
396     strings with placeholders and returns string or array of with substituted
397     values from record.
398    
399     my $text = $webpac->fill_in($rec,'v250^a');
400    
401     Optional argument is ordinal number for repeatable fields. By default,
402     it's assume to be first repeatable field (fields are perl array, so first
403     element is 0).
404     Following example will read second value from repeatable field.
405    
406     my $text = $webpac->fill_in($rec,'Title: v250^a',1);
407    
408     This function B<does not> perform parsing of format to inteligenty skip
409     delimiters before fields which aren't used.
410    
411     This method will automatically decode UTF-8 string to local code page
412     if needed.
413    
414     =cut
415    
416     sub fill_in {
417     my $self = shift;
418    
419     my $log = $self->_get_logger();
420    
421     my $rec = shift || $log->logconfess("need data record");
422     my $format = shift || $log->logconfess("need format to parse");
423     # iteration (for repeatable fields)
424     my $i = shift || 0;
425    
426     $log->logdie("infitite loop in format $format") if ($i > ($self->{'max_mfn'} || 9999));
427    
428     # FIXME remove for speedup?
429     $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
430    
431     if (utf8::is_utf8($format)) {
432     $format = $self->_x($format);
433     }
434    
435     my $found = 0;
436    
437     my $eval_code;
438     # remove eval{...} from beginning
439     $eval_code = $1 if ($format =~ s/^eval{([^}]+)}//s);
440    
441     my $filter_name;
442     # remove filter{...} from beginning
443     $filter_name = $1 if ($format =~ s/^filter{([^}]+)}//s);
444    
445     # do actual replacement of placeholders
446     # repeatable fields
447     $format =~ s/v(\d+)(?:\^(\w))?/$self->get_data(\$rec,$1,$2,$i,\$found)/ges;
448     # non-repeatable fields
449     $format =~ s/s(\d+)(?:\^(\w))?/$self->get_data(\$rec,$1,$2,0,\$found)/ges;
450    
451     if ($found) {
452     $log->debug("format: $format");
453     if ($eval_code) {
454     my $eval = $self->fill_in($rec,$eval_code,$i);
455     return if (! $self->_eval($eval));
456     }
457     if ($filter_name && $self->{'filter'}->{$filter_name}) {
458     $log->debug("filter '$filter_name' for $format");
459     $format = $self->{'filter'}->{$filter_name}->($format);
460     return unless(defined($format));
461     $log->debug("filter result: $format");
462     }
463     # do we have lookups?
464     if ($self->{'lookup'}) {
465 dpavlin 31 if ($self->{'lookup'}->can('lookup')) {
466 dpavlin 252 my @lookup = $self->{lookup}->lookup($format);
467 dpavlin 253 $log->debug("lookup $format", join(", ", @lookup));
468 dpavlin 252 return @lookup;
469 dpavlin 31 } else {
470     $log->warn("Have lookup object but can't invoke lookup method");
471     }
472 dpavlin 15 } else {
473     return $format;
474     }
475     } else {
476     return;
477     }
478     }
479    
480    
481 dpavlin 13 =head2 fill_in_to_arr
482    
483     Similar to C<fill_in>, but returns array of all repeatable fields. Usable
484     for fields which have lookups, so they shouldn't be parsed but rather
485     C<fill_id>ed.
486    
487     my @arr = $webpac->fill_in_to_arr($rec,'[v900];;[v250^a]');
488    
489     =cut
490    
491     sub fill_in_to_arr {
492     my $self = shift;
493    
494     my ($rec, $format_utf8) = @_;
495    
496     my $log = $self->_get_logger();
497    
498     $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
499     return if (! $format_utf8);
500    
501     my $i = 0;
502     my @arr;
503    
504     while (my @v = $self->fill_in($rec,$format_utf8,$i++)) {
505     push @arr, @v;
506     }
507    
508     $log->debug("format '$format_utf8' returned ",--$i," elements: ", sub { join(" | ",@arr) }) if (@arr);
509    
510     return @arr;
511     }
512    
513 dpavlin 15
514     =head2 get_data
515    
516     Returns value from record.
517    
518     my $text = $self->get_data(\$rec,$f,$sf,$i,\$found);
519    
520     Arguments are:
521     record reference C<$rec>,
522     field C<$f>,
523     optional subfiled C<$sf>,
524     index for repeatable values C<$i>.
525    
526     Optinal variable C<$found> will be incremeted if there
527     is field.
528    
529     Returns value or empty string.
530    
531     =cut
532    
533     sub get_data {
534     my $self = shift;
535    
536     my ($rec,$f,$sf,$i,$found) = @_;
537    
538     if ($$rec->{$f}) {
539     return '' if (! $$rec->{$f}->[$i]);
540     no strict 'refs';
541     if ($sf && $$rec->{$f}->[$i]->{$sf}) {
542     $$found++ if (defined($$found));
543     return $$rec->{$f}->[$i]->{$sf};
544 dpavlin 64 } elsif (! $sf && $$rec->{$f}->[$i]) {
545 dpavlin 15 $$found++ if (defined($$found));
546     # it still might have subfield, just
547     # not specified, so we'll dump all
548     if ($$rec->{$f}->[$i] =~ /HASH/o) {
549     my $out;
550     foreach my $k (keys %{$$rec->{$f}->[$i]}) {
551     $out .= $$rec->{$f}->[$i]->{$k}." ";
552     }
553     return $out;
554     } else {
555     return $$rec->{$f}->[$i];
556     }
557 dpavlin 64 } else {
558     return '';
559 dpavlin 15 }
560     } else {
561     return '';
562     }
563     }
564    
565    
566     =head2 apply_format
567    
568     Apply format specified in tag with C<format_name="name"> and
569     C<format_delimiter=";;">.
570    
571     my $text = $webpac->apply_format($format_name,$format_delimiter,$data);
572    
573     Formats can contain C<lookup{...}> if you need them.
574    
575     =cut
576    
577     sub apply_format {
578     my $self = shift;
579    
580     my ($name,$delimiter,$data) = @_;
581    
582     my $log = $self->_get_logger();
583    
584     if (! $self->{'import_xml'}->{'format'}->{$name}) {
585     $log->warn("<format name=\"$name\"> is not defined in ",$self->{'import_xml_file'});
586     return $data;
587     }
588    
589     $log->warn("no delimiter for format $name") if (! $delimiter);
590    
591     my $format = $self->_x($self->{'import_xml'}->{'format'}->{$name}->{'content'}) || $log->logdie("can't find format '$name'");
592    
593     my @data = split(/\Q$delimiter\E/, $data);
594    
595     my $out = sprintf($format, @data);
596     $log->debug("using format $name [$format] on $data to produce: $out");
597    
598     if ($self->{'lookup_regex'} && $out =~ $self->{'lookup_regex'}) {
599 dpavlin 31 return $self->{'lookup'}->lookup($out);
600 dpavlin 15 } else {
601     return $out;
602     }
603    
604     }
605    
606 dpavlin 13 =head2 sort_arr
607    
608     Sort array ignoring case and html in data
609    
610     my @sorted = $webpac->sort_arr(@unsorted);
611    
612     =cut
613    
614     sub sort_arr {
615     my $self = shift;
616    
617     my $log = $self->_get_logger();
618    
619     # FIXME add Schwartzian Transformation?
620    
621     my @sorted = sort {
622     $a =~ s#<[^>]+/*>##;
623     $b =~ s#<[^>]+/*>##;
624     lc($b) cmp lc($a)
625     } @_;
626     $log->debug("sorted values: ",sub { join(", ",@sorted) });
627    
628     return @sorted;
629     }
630    
631    
632 dpavlin 15 =head1 INTERNAL METHODS
633    
634 dpavlin 13 =head2 _sort_by_order
635    
636     Sort xml tags data structure accoding to C<order=""> attribute.
637    
638     =cut
639    
640     sub _sort_by_order {
641     my $self = shift;
642    
643     my $va = $self->{'import_xml'}->{'indexer'}->{$a}->{'order'} ||
644     $self->{'import_xml'}->{'indexer'}->{$a};
645     my $vb = $self->{'import_xml'}->{'indexer'}->{$b}->{'order'} ||
646     $self->{'import_xml'}->{'indexer'}->{$b};
647    
648     return $va <=> $vb;
649     }
650    
651     =head2 _x
652    
653 dpavlin 15 Convert strings from C<conf/normalize/*.xml> encoding into application
654     specific encoding (optinally specified using C<code_page> to C<new>
655     constructor).
656 dpavlin 13
657     my $text = $n->_x('normalize text string');
658    
659     This is a stub so that other modules doesn't have to implement it.
660    
661     =cut
662    
663     sub _x {
664     my $self = shift;
665     return shift;
666     }
667    
668    
669 dpavlin 10 =head1 AUTHOR
670    
671     Dobrica Pavlinusic, C<< <dpavlin@rot13.org> >>
672    
673     =head1 COPYRIGHT & LICENSE
674    
675     Copyright 2005 Dobrica Pavlinusic, All Rights Reserved.
676    
677     This program is free software; you can redistribute it and/or modify it
678     under the same terms as Perl itself.
679    
680     =cut
681    
682 dpavlin 209 1; # End of WebPAC::Normalize

  ViewVC Help
Powered by ViewVC 1.1.26