/[webpac2]/trunk/lib/WebPAC/Normalize.pm
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Annotation of /trunk/lib/WebPAC/Normalize.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 209 - (hide annotations)
Mon Dec 5 17:46:57 2005 UTC (17 years, 1 month ago) by dpavlin
File size: 15183 byte(s)
 r11518@llin:  dpavlin | 2005-12-04 19:43:29 +0100
 renamed WebPAC::DB to WebPAC::Store

1 dpavlin 10 package WebPAC::Normalize;
2    
3     use warnings;
4     use strict;
5 dpavlin 29 use base 'WebPAC::Common';
6 dpavlin 13 use Data::Dumper;
7 dpavlin 10
8     =head1 NAME
9    
10 dpavlin 15 WebPAC::Normalize - data mungling for normalisation
11 dpavlin 10
12     =head1 VERSION
13    
14 dpavlin 70 Version 0.02
15 dpavlin 10
16     =cut
17    
18 dpavlin 70 our $VERSION = '0.02';
19 dpavlin 10
20     =head1 SYNOPSIS
21    
22 dpavlin 15 This package contains code that mungle data to produce normalized format.
23 dpavlin 10
24 dpavlin 15 It contains several assumptions:
25    
26     =over
27    
28     =item *
29    
30     format of fields is defined using C<v123^a> notation for repeatable fields
31     or C<s123^a> for single (or first) value, where C<123> is field number and
32     C<a> is subfield.
33    
34     =item *
35    
36     source data records (C<$rec>) have unique identifiers in field C<000>
37    
38     =item *
39    
40     optional C<eval{length('v123^a') == 3}> tag at B<beginning of format> will be
41     perl code that is evaluated before producing output (value of field will be
42     interpolated before that)
43    
44     =item *
45    
46     optional C<filter{filter_name}> at B<begining of format> will apply perl
47     code defined as code ref on format after field substitution to producing
48     output
49    
50     =item *
51    
52     optional C<lookup{...}> will be then performed. See C<WebPAC::Lookups>.
53    
54     =item *
55    
56     at end, optional C<format>s rules are resolved. Format rules are similar to
57     C<sprintf> and can also contain C<lookup{...}> which is performed after
58     values are inserted in format.
59    
60     =back
61    
62     This also describes order in which transformations are applied (eval,
63     filter, lookup, format) which is important to undestand when deciding how to
64     solve your data mungling and normalisation process.
65    
66    
67    
68    
69 dpavlin 10 =head1 FUNCTIONS
70    
71 dpavlin 13 =head2 new
72 dpavlin 10
73 dpavlin 13 Create new normalisation object
74    
75     my $n = new WebPAC::Normalize::Something(
76 dpavlin 15 filter => {
77     'filter_name_1' => sub {
78     # filter code
79     return length($_);
80     }, ...
81     },
82 dpavlin 29 db => $db_obj,
83 dpavlin 13 lookup_regex => $lookup->regex,
84 dpavlin 31 lookup => $lookup_obj,
85 dpavlin 13 );
86    
87 dpavlin 15 Parametar C<filter> defines user supplied snippets of perl code which can
88     be use with C<filter{...}> notation.
89    
90 dpavlin 13 Recommended parametar C<lookup_regex> is used to enable parsing of lookups
91 dpavlin 31 in structures. If you pass this parametar, you must also pass C<lookup>
92     which is C<WebPAC::Lookup> object.
93 dpavlin 13
94 dpavlin 10 =cut
95    
96 dpavlin 13 sub new {
97     my $class = shift;
98     my $self = {@_};
99     bless($self, $class);
100    
101 dpavlin 31 my $r = $self->{'lookup_regex'} ? 1 : 0;
102     my $l = $self->{'lookup'} ? 1 : 0;
103    
104     my $log = $self->_get_logger();
105    
106     # those two must be in pair
107     if ( ($r & $l) != ($r || $l) ) {
108     my $log = $self->_get_logger();
109     $log->logdie("lookup_regex and lookup must be in pair");
110     }
111    
112     $log->logdie("lookup must be WebPAC::Lookup object") if ($self->{'lookup'} && ! $self->{'lookup'}->isa('WebPAC::Lookup'));
113    
114 dpavlin 13 $self ? return $self : return undef;
115 dpavlin 10 }
116    
117 dpavlin 13
118     =head2 data_structure
119    
120     Create in-memory data structure which represents normalized layout from
121     C<conf/normalize/*.xml>.
122    
123     This structures are used to produce output.
124    
125 dpavlin 70 my $ds = $webpac->data_structure($rec);
126 dpavlin 13
127     =cut
128    
129     sub data_structure {
130     my $self = shift;
131    
132     my $log = $self->_get_logger();
133    
134     my $rec = shift;
135     $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
136    
137 dpavlin 125 $log->debug("data_structure rec = ", sub { Dumper($rec) });
138    
139     $log->logdie("need unique ID (mfn) in field 000 of record ", sub { Dumper($rec) } ) unless (defined($rec->{'000'}));
140    
141     my $mfn = $rec->{'000'}->[0] || $log->logdie("field 000 isn't array!");
142    
143 dpavlin 13 my $cache_file;
144    
145 dpavlin 18 if ($self->{'db'}) {
146 dpavlin 125 my $ds = $self->{'db'}->load_ds( $mfn );
147 dpavlin 70 $log->debug("load_ds( rec = ", sub { Dumper($rec) }, ") = ", sub { Dumper($ds) });
148     return $ds if ($ds);
149 dpavlin 29 $log->debug("cache miss, creating");
150 dpavlin 13 }
151    
152     undef $self->{'currnet_filename'};
153     undef $self->{'headline'};
154    
155     my @sorted_tags;
156     if ($self->{tags_by_order}) {
157     @sorted_tags = @{$self->{tags_by_order}};
158     } else {
159     @sorted_tags = sort { $self->_sort_by_order } keys %{$self->{'import_xml'}->{'indexer'}};
160     $self->{tags_by_order} = \@sorted_tags;
161     }
162    
163 dpavlin 70 my $ds;
164 dpavlin 13
165     $log->debug("tags: ",sub { join(", ",@sorted_tags) });
166    
167     foreach my $field (@sorted_tags) {
168    
169     my $row;
170    
171     #print "field $field [",$self->{'tag'},"] = ",Dumper($self->{'import_xml'}->{'indexer'}->{$field}->{$self->{'tag'}});
172    
173     foreach my $tag (@{$self->{'import_xml'}->{'indexer'}->{$field}->{$self->{'tag'}}}) {
174 dpavlin 38 my $format;
175 dpavlin 13
176 dpavlin 38 $log->logdie("expected tag HASH and got $tag") unless (ref($tag) eq 'HASH');
177     $format = $tag->{'value'} || $tag->{'content'};
178    
179 dpavlin 13 $log->debug("format: $format");
180    
181     my @v;
182     if ($self->{'lookup_regex'} && $format =~ $self->{'lookup_regex'}) {
183     @v = $self->fill_in_to_arr($rec,$format);
184     } else {
185     @v = $self->parse_to_arr($rec,$format);
186     }
187     next if (! @v);
188    
189     if ($tag->{'sort'}) {
190     @v = $self->sort_arr(@v);
191     }
192    
193     # use format?
194     if ($tag->{'format_name'}) {
195     @v = map { $self->apply_format($tag->{'format_name'},$tag->{'format_delimiter'},$_) } @v;
196     }
197    
198     # delimiter will join repeatable fields
199     if ($tag->{'delimiter'}) {
200     @v = ( join($tag->{'delimiter'}, @v) );
201     }
202    
203     # default types
204 dpavlin 74 my @types = qw(display search);
205 dpavlin 13 # override by type attribute
206     @types = ( $tag->{'type'} ) if ($tag->{'type'});
207    
208     foreach my $type (@types) {
209     # append to previous line?
210     $log->debug("type: $type ",sub { join(" ",@v) }, $row->{'append'} || 'no append');
211     if ($tag->{'append'}) {
212    
213     # I will delimit appended part with
214     # delimiter (or ,)
215     my $d = $tag->{'delimiter'};
216     # default delimiter
217     $d ||= " ";
218    
219     my $last = pop @{$row->{$type}};
220     $d = "" if (! $last);
221     $last .= $d . join($d, @v);
222     push @{$row->{$type}}, $last;
223    
224     } else {
225     push @{$row->{$type}}, @v;
226     }
227     }
228    
229    
230     }
231    
232     if ($row) {
233     $row->{'tag'} = $field;
234    
235     # TODO: name_sigular, name_plural
236     my $name = $self->{'import_xml'}->{'indexer'}->{$field}->{'name'};
237 dpavlin 70 my $row_name = $name ? $self->_x($name) : $field;
238 dpavlin 13
239     # post-sort all values in field
240     if ($self->{'import_xml'}->{'indexer'}->{$field}->{'sort'}) {
241     $log->warn("sort at field tag not implemented");
242     }
243    
244 dpavlin 70 $ds->{$row_name} = $row;
245 dpavlin 13
246     $log->debug("row $field: ",sub { Dumper($row) });
247     }
248    
249     }
250    
251 dpavlin 22 $self->{'db'}->save_ds(
252 dpavlin 125 id => $mfn,
253 dpavlin 70 ds => $ds,
254 dpavlin 18 ) if ($self->{'db'});
255 dpavlin 13
256 dpavlin 70 $log->debug("ds: ", sub { Dumper($ds) });
257 dpavlin 29
258 dpavlin 70 $log->logconfess("data structure returned is not array any more!") if wantarray;
259 dpavlin 13
260 dpavlin 70 return $ds;
261    
262 dpavlin 13 }
263    
264     =head2 parse
265    
266     Perform smart parsing of string, skipping delimiters for fields which aren't
267     defined. It can also eval code in format starting with C<eval{...}> and
268     return output or nothing depending on eval code.
269    
270     my $text = $webpac->parse($rec,'eval{"v901^a" eq "Deskriptor"}descriptor: v250^a', $i);
271    
272     =cut
273    
274     sub parse {
275     my $self = shift;
276    
277     my ($rec, $format_utf8, $i) = @_;
278    
279     return if (! $format_utf8);
280    
281     my $log = $self->_get_logger();
282    
283     $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
284    
285     $i = 0 if (! $i);
286    
287     my $format = $self->_x($format_utf8) || $log->logconfess("can't convert '$format_utf8' from UTF-8 to ",$self->{'code_page'});
288    
289     my @out;
290    
291     $log->debug("format: $format");
292    
293     my $eval_code;
294     # remove eval{...} from beginning
295     $eval_code = $1 if ($format =~ s/^eval{([^}]+)}//s);
296    
297     my $filter_name;
298     # remove filter{...} from beginning
299     $filter_name = $1 if ($format =~ s/^filter{([^}]+)}//s);
300    
301     my $prefix;
302     my $all_found=0;
303    
304     while ($format =~ s/^(.*?)(v|s)(\d+)(?:\^(\w))?//s) {
305    
306     my $del = $1 || '';
307     $prefix ||= $del if ($all_found == 0);
308    
309     # repeatable index
310     my $r = $i;
311     $r = 0 if (lc("$2") eq 's');
312    
313     my $found = 0;
314     my $tmp = $self->get_data(\$rec,$3,$4,$r,\$found);
315    
316     if ($found) {
317     push @out, $del;
318     push @out, $tmp;
319     $all_found += $found;
320     }
321     }
322    
323     return if (! $all_found);
324    
325     my $out = join('',@out);
326    
327     if ($out) {
328     # add rest of format (suffix)
329     $out .= $format;
330    
331     # add prefix if not there
332     $out = $prefix . $out if ($out !~ m/^\Q$prefix\E/);
333    
334     $log->debug("result: $out");
335     }
336    
337     if ($eval_code) {
338     my $eval = $self->fill_in($rec,$eval_code,$i) || return;
339     $log->debug("about to eval{$eval} format: $out");
340     return if (! $self->_eval($eval));
341     }
342    
343     if ($filter_name && $self->{'filter'}->{$filter_name}) {
344     $log->debug("about to filter{$filter_name} format: $out");
345     $out = $self->{'filter'}->{$filter_name}->($out);
346     return unless(defined($out));
347     $log->debug("filter result: $out");
348     }
349    
350     return $out;
351     }
352    
353     =head2 parse_to_arr
354    
355     Similar to C<parse>, but returns array of all repeatable fields
356    
357     my @arr = $webpac->parse_to_arr($rec,'v250^a');
358    
359     =cut
360    
361     sub parse_to_arr {
362     my $self = shift;
363    
364     my ($rec, $format_utf8) = @_;
365    
366     my $log = $self->_get_logger();
367    
368     $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
369     return if (! $format_utf8);
370    
371     my $i = 0;
372     my @arr;
373    
374     while (my $v = $self->parse($rec,$format_utf8,$i++)) {
375     push @arr, $v;
376     }
377    
378     $log->debug("format '$format_utf8' returned ",--$i," elements: ", sub { join(" | ",@arr) }) if (@arr);
379    
380     return @arr;
381     }
382    
383 dpavlin 15
384     =head2 fill_in
385    
386     Workhourse of all: takes record from in-memory structure of database and
387     strings with placeholders and returns string or array of with substituted
388     values from record.
389    
390     my $text = $webpac->fill_in($rec,'v250^a');
391    
392     Optional argument is ordinal number for repeatable fields. By default,
393     it's assume to be first repeatable field (fields are perl array, so first
394     element is 0).
395     Following example will read second value from repeatable field.
396    
397     my $text = $webpac->fill_in($rec,'Title: v250^a',1);
398    
399     This function B<does not> perform parsing of format to inteligenty skip
400     delimiters before fields which aren't used.
401    
402     This method will automatically decode UTF-8 string to local code page
403     if needed.
404    
405     =cut
406    
407     sub fill_in {
408     my $self = shift;
409    
410     my $log = $self->_get_logger();
411    
412     my $rec = shift || $log->logconfess("need data record");
413     my $format = shift || $log->logconfess("need format to parse");
414     # iteration (for repeatable fields)
415     my $i = shift || 0;
416    
417     $log->logdie("infitite loop in format $format") if ($i > ($self->{'max_mfn'} || 9999));
418    
419     # FIXME remove for speedup?
420     $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
421    
422     if (utf8::is_utf8($format)) {
423     $format = $self->_x($format);
424     }
425    
426     my $found = 0;
427    
428     my $eval_code;
429     # remove eval{...} from beginning
430     $eval_code = $1 if ($format =~ s/^eval{([^}]+)}//s);
431    
432     my $filter_name;
433     # remove filter{...} from beginning
434     $filter_name = $1 if ($format =~ s/^filter{([^}]+)}//s);
435    
436     # do actual replacement of placeholders
437     # repeatable fields
438     $format =~ s/v(\d+)(?:\^(\w))?/$self->get_data(\$rec,$1,$2,$i,\$found)/ges;
439     # non-repeatable fields
440     $format =~ s/s(\d+)(?:\^(\w))?/$self->get_data(\$rec,$1,$2,0,\$found)/ges;
441    
442     if ($found) {
443     $log->debug("format: $format");
444     if ($eval_code) {
445     my $eval = $self->fill_in($rec,$eval_code,$i);
446     return if (! $self->_eval($eval));
447     }
448     if ($filter_name && $self->{'filter'}->{$filter_name}) {
449     $log->debug("filter '$filter_name' for $format");
450     $format = $self->{'filter'}->{$filter_name}->($format);
451     return unless(defined($format));
452     $log->debug("filter result: $format");
453     }
454     # do we have lookups?
455     if ($self->{'lookup'}) {
456 dpavlin 31 if ($self->{'lookup'}->can('lookup')) {
457     return $self->{'lookup'}->lookup($format);
458     } else {
459     $log->warn("Have lookup object but can't invoke lookup method");
460     }
461 dpavlin 15 } else {
462     return $format;
463     }
464     } else {
465     return;
466     }
467     }
468    
469    
470 dpavlin 13 =head2 fill_in_to_arr
471    
472     Similar to C<fill_in>, but returns array of all repeatable fields. Usable
473     for fields which have lookups, so they shouldn't be parsed but rather
474     C<fill_id>ed.
475    
476     my @arr = $webpac->fill_in_to_arr($rec,'[v900];;[v250^a]');
477    
478     =cut
479    
480     sub fill_in_to_arr {
481     my $self = shift;
482    
483     my ($rec, $format_utf8) = @_;
484    
485     my $log = $self->_get_logger();
486    
487     $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
488     return if (! $format_utf8);
489    
490     my $i = 0;
491     my @arr;
492    
493     while (my @v = $self->fill_in($rec,$format_utf8,$i++)) {
494     push @arr, @v;
495     }
496    
497     $log->debug("format '$format_utf8' returned ",--$i," elements: ", sub { join(" | ",@arr) }) if (@arr);
498    
499     return @arr;
500     }
501    
502 dpavlin 15
503     =head2 get_data
504    
505     Returns value from record.
506    
507     my $text = $self->get_data(\$rec,$f,$sf,$i,\$found);
508    
509     Arguments are:
510     record reference C<$rec>,
511     field C<$f>,
512     optional subfiled C<$sf>,
513     index for repeatable values C<$i>.
514    
515     Optinal variable C<$found> will be incremeted if there
516     is field.
517    
518     Returns value or empty string.
519    
520     =cut
521    
522     sub get_data {
523     my $self = shift;
524    
525     my ($rec,$f,$sf,$i,$found) = @_;
526    
527     if ($$rec->{$f}) {
528     return '' if (! $$rec->{$f}->[$i]);
529     no strict 'refs';
530     if ($sf && $$rec->{$f}->[$i]->{$sf}) {
531     $$found++ if (defined($$found));
532     return $$rec->{$f}->[$i]->{$sf};
533 dpavlin 64 } elsif (! $sf && $$rec->{$f}->[$i]) {
534 dpavlin 15 $$found++ if (defined($$found));
535     # it still might have subfield, just
536     # not specified, so we'll dump all
537     if ($$rec->{$f}->[$i] =~ /HASH/o) {
538     my $out;
539     foreach my $k (keys %{$$rec->{$f}->[$i]}) {
540     $out .= $$rec->{$f}->[$i]->{$k}." ";
541     }
542     return $out;
543     } else {
544     return $$rec->{$f}->[$i];
545     }
546 dpavlin 64 } else {
547     return '';
548 dpavlin 15 }
549     } else {
550     return '';
551     }
552     }
553    
554    
555     =head2 apply_format
556    
557     Apply format specified in tag with C<format_name="name"> and
558     C<format_delimiter=";;">.
559    
560     my $text = $webpac->apply_format($format_name,$format_delimiter,$data);
561    
562     Formats can contain C<lookup{...}> if you need them.
563    
564     =cut
565    
566     sub apply_format {
567     my $self = shift;
568    
569     my ($name,$delimiter,$data) = @_;
570    
571     my $log = $self->_get_logger();
572    
573     if (! $self->{'import_xml'}->{'format'}->{$name}) {
574     $log->warn("<format name=\"$name\"> is not defined in ",$self->{'import_xml_file'});
575     return $data;
576     }
577    
578     $log->warn("no delimiter for format $name") if (! $delimiter);
579    
580     my $format = $self->_x($self->{'import_xml'}->{'format'}->{$name}->{'content'}) || $log->logdie("can't find format '$name'");
581    
582     my @data = split(/\Q$delimiter\E/, $data);
583    
584     my $out = sprintf($format, @data);
585     $log->debug("using format $name [$format] on $data to produce: $out");
586    
587     if ($self->{'lookup_regex'} && $out =~ $self->{'lookup_regex'}) {
588 dpavlin 31 return $self->{'lookup'}->lookup($out);
589 dpavlin 15 } else {
590     return $out;
591     }
592    
593     }
594    
595 dpavlin 13 =head2 sort_arr
596    
597     Sort array ignoring case and html in data
598    
599     my @sorted = $webpac->sort_arr(@unsorted);
600    
601     =cut
602    
603     sub sort_arr {
604     my $self = shift;
605    
606     my $log = $self->_get_logger();
607    
608     # FIXME add Schwartzian Transformation?
609    
610     my @sorted = sort {
611     $a =~ s#<[^>]+/*>##;
612     $b =~ s#<[^>]+/*>##;
613     lc($b) cmp lc($a)
614     } @_;
615     $log->debug("sorted values: ",sub { join(", ",@sorted) });
616    
617     return @sorted;
618     }
619    
620    
621 dpavlin 15 =head1 INTERNAL METHODS
622    
623 dpavlin 13 =head2 _sort_by_order
624    
625     Sort xml tags data structure accoding to C<order=""> attribute.
626    
627     =cut
628    
629     sub _sort_by_order {
630     my $self = shift;
631    
632     my $va = $self->{'import_xml'}->{'indexer'}->{$a}->{'order'} ||
633     $self->{'import_xml'}->{'indexer'}->{$a};
634     my $vb = $self->{'import_xml'}->{'indexer'}->{$b}->{'order'} ||
635     $self->{'import_xml'}->{'indexer'}->{$b};
636    
637     return $va <=> $vb;
638     }
639    
640     =head2 _x
641    
642 dpavlin 15 Convert strings from C<conf/normalize/*.xml> encoding into application
643     specific encoding (optinally specified using C<code_page> to C<new>
644     constructor).
645 dpavlin 13
646     my $text = $n->_x('normalize text string');
647    
648     This is a stub so that other modules doesn't have to implement it.
649    
650     =cut
651    
652     sub _x {
653     my $self = shift;
654     return shift;
655     }
656    
657    
658 dpavlin 10 =head1 AUTHOR
659    
660     Dobrica Pavlinusic, C<< <dpavlin@rot13.org> >>
661    
662     =head1 COPYRIGHT & LICENSE
663    
664     Copyright 2005 Dobrica Pavlinusic, All Rights Reserved.
665    
666     This program is free software; you can redistribute it and/or modify it
667     under the same terms as Perl itself.
668    
669     =cut
670    
671 dpavlin 209 1; # End of WebPAC::Normalize

  ViewVC Help
Powered by ViewVC 1.1.26