/[webpac2]/trunk/lib/WebPAC/Normalize.pm
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Annotation of /trunk/lib/WebPAC/Normalize.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 295 - (hide annotations)
Mon Dec 19 15:34:47 2005 UTC (17 years, 1 month ago) by dpavlin
File size: 16454 byte(s)
 r11795@llin:  dpavlin | 2005-12-19 16:35:30 +0100
 fix regex filter, moved development version to real one [2.07]

1 dpavlin 10 package WebPAC::Normalize;
2    
3     use warnings;
4     use strict;
5 dpavlin 29 use base 'WebPAC::Common';
6 dpavlin 13 use Data::Dumper;
7 dpavlin 10
8     =head1 NAME
9    
10 dpavlin 15 WebPAC::Normalize - data mungling for normalisation
11 dpavlin 10
12     =head1 VERSION
13    
14 dpavlin 295 Version 0.07
15 dpavlin 10
16     =cut
17    
18 dpavlin 295 our $VERSION = '0.07';
19 dpavlin 10
20     =head1 SYNOPSIS
21    
22 dpavlin 15 This package contains code that mungle data to produce normalized format.
23 dpavlin 10
24 dpavlin 15 It contains several assumptions:
25    
26     =over
27    
28     =item *
29    
30     format of fields is defined using C<v123^a> notation for repeatable fields
31     or C<s123^a> for single (or first) value, where C<123> is field number and
32     C<a> is subfield.
33    
34     =item *
35    
36     source data records (C<$rec>) have unique identifiers in field C<000>
37    
38     =item *
39    
40     optional C<eval{length('v123^a') == 3}> tag at B<beginning of format> will be
41     perl code that is evaluated before producing output (value of field will be
42     interpolated before that)
43    
44     =item *
45    
46     optional C<filter{filter_name}> at B<begining of format> will apply perl
47     code defined as code ref on format after field substitution to producing
48     output
49    
50 dpavlin 260 There is one built-in filter called C<regex> which can be use like this:
51    
52     filter{regex(s/foo/bar/)}
53    
54 dpavlin 15 =item *
55    
56     optional C<lookup{...}> will be then performed. See C<WebPAC::Lookups>.
57    
58     =item *
59    
60     at end, optional C<format>s rules are resolved. Format rules are similar to
61     C<sprintf> and can also contain C<lookup{...}> which is performed after
62     values are inserted in format.
63    
64     =back
65    
66     This also describes order in which transformations are applied (eval,
67     filter, lookup, format) which is important to undestand when deciding how to
68     solve your data mungling and normalisation process.
69    
70    
71    
72    
73 dpavlin 10 =head1 FUNCTIONS
74    
75 dpavlin 13 =head2 new
76 dpavlin 10
77 dpavlin 13 Create new normalisation object
78    
79     my $n = new WebPAC::Normalize::Something(
80 dpavlin 15 filter => {
81     'filter_name_1' => sub {
82     # filter code
83     return length($_);
84     }, ...
85     },
86 dpavlin 29 db => $db_obj,
87 dpavlin 13 lookup_regex => $lookup->regex,
88 dpavlin 31 lookup => $lookup_obj,
89 dpavlin 219 prefix => 'foobar',
90 dpavlin 13 );
91    
92 dpavlin 15 Parametar C<filter> defines user supplied snippets of perl code which can
93     be use with C<filter{...}> notation.
94    
95 dpavlin 219 C<prefix> is used to form filename for database record (to support multiple
96     source files which are joined in one database).
97    
98 dpavlin 13 Recommended parametar C<lookup_regex> is used to enable parsing of lookups
99 dpavlin 31 in structures. If you pass this parametar, you must also pass C<lookup>
100     which is C<WebPAC::Lookup> object.
101 dpavlin 13
102 dpavlin 10 =cut
103    
104 dpavlin 13 sub new {
105     my $class = shift;
106     my $self = {@_};
107     bless($self, $class);
108    
109 dpavlin 31 my $r = $self->{'lookup_regex'} ? 1 : 0;
110     my $l = $self->{'lookup'} ? 1 : 0;
111    
112     my $log = $self->_get_logger();
113    
114     # those two must be in pair
115     if ( ($r & $l) != ($r || $l) ) {
116     my $log = $self->_get_logger();
117     $log->logdie("lookup_regex and lookup must be in pair");
118     }
119    
120     $log->logdie("lookup must be WebPAC::Lookup object") if ($self->{'lookup'} && ! $self->{'lookup'}->isa('WebPAC::Lookup'));
121    
122 dpavlin 219 $log->warn("no prefix defined. please check that!") unless ($self->{'prefix'});
123    
124 dpavlin 252 $log->debug("using lookup regex: ", $self->{lookup_regex}) if ($r && $l);
125    
126 dpavlin 295 if (! $self->{filter} || ! $self->{filter}->{regex}) {
127 dpavlin 260 $log->debug("adding built-in filter regex");
128     $self->{filter}->{regex} = sub {
129     my ($val, $regex) = @_;
130     eval "\$val =~ $regex";
131     return $val;
132     };
133     }
134    
135 dpavlin 13 $self ? return $self : return undef;
136 dpavlin 10 }
137    
138 dpavlin 13
139     =head2 data_structure
140    
141     Create in-memory data structure which represents normalized layout from
142     C<conf/normalize/*.xml>.
143    
144     This structures are used to produce output.
145    
146 dpavlin 70 my $ds = $webpac->data_structure($rec);
147 dpavlin 13
148     =cut
149    
150     sub data_structure {
151     my $self = shift;
152    
153     my $log = $self->_get_logger();
154    
155     my $rec = shift;
156     $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
157    
158 dpavlin 125 $log->debug("data_structure rec = ", sub { Dumper($rec) });
159    
160     $log->logdie("need unique ID (mfn) in field 000 of record ", sub { Dumper($rec) } ) unless (defined($rec->{'000'}));
161    
162 dpavlin 219 my $id = $rec->{'000'}->[0] || $log->logdie("field 000 isn't array!");
163 dpavlin 125
164 dpavlin 13 my $cache_file;
165    
166 dpavlin 18 if ($self->{'db'}) {
167 dpavlin 219 my $ds = $self->{'db'}->load_ds( id => $id, prefix => $self->{prefix} );
168 dpavlin 70 $log->debug("load_ds( rec = ", sub { Dumper($rec) }, ") = ", sub { Dumper($ds) });
169     return $ds if ($ds);
170 dpavlin 29 $log->debug("cache miss, creating");
171 dpavlin 13 }
172    
173     my @sorted_tags;
174     if ($self->{tags_by_order}) {
175     @sorted_tags = @{$self->{tags_by_order}};
176     } else {
177     @sorted_tags = sort { $self->_sort_by_order } keys %{$self->{'import_xml'}->{'indexer'}};
178     $self->{tags_by_order} = \@sorted_tags;
179     }
180    
181 dpavlin 70 my $ds;
182 dpavlin 13
183     $log->debug("tags: ",sub { join(", ",@sorted_tags) });
184    
185     foreach my $field (@sorted_tags) {
186    
187     my $row;
188    
189     #print "field $field [",$self->{'tag'},"] = ",Dumper($self->{'import_xml'}->{'indexer'}->{$field}->{$self->{'tag'}});
190    
191     foreach my $tag (@{$self->{'import_xml'}->{'indexer'}->{$field}->{$self->{'tag'}}}) {
192 dpavlin 38 my $format;
193 dpavlin 13
194 dpavlin 38 $log->logdie("expected tag HASH and got $tag") unless (ref($tag) eq 'HASH');
195     $format = $tag->{'value'} || $tag->{'content'};
196    
197 dpavlin 13 $log->debug("format: $format");
198    
199     my @v;
200     if ($self->{'lookup_regex'} && $format =~ $self->{'lookup_regex'}) {
201     @v = $self->fill_in_to_arr($rec,$format);
202     } else {
203     @v = $self->parse_to_arr($rec,$format);
204     }
205     next if (! @v);
206    
207     if ($tag->{'sort'}) {
208     @v = $self->sort_arr(@v);
209     }
210    
211     # use format?
212     if ($tag->{'format_name'}) {
213     @v = map { $self->apply_format($tag->{'format_name'},$tag->{'format_delimiter'},$_) } @v;
214     }
215    
216     # delimiter will join repeatable fields
217     if ($tag->{'delimiter'}) {
218     @v = ( join($tag->{'delimiter'}, @v) );
219     }
220    
221     # default types
222 dpavlin 74 my @types = qw(display search);
223 dpavlin 13 # override by type attribute
224     @types = ( $tag->{'type'} ) if ($tag->{'type'});
225    
226     foreach my $type (@types) {
227     # append to previous line?
228 dpavlin 252 $log->debug("type: $type ",sub { join(" ",@v) }, " ", $row->{'append'} || 'no append');
229 dpavlin 13 if ($tag->{'append'}) {
230    
231     # I will delimit appended part with
232     # delimiter (or ,)
233     my $d = $tag->{'delimiter'};
234     # default delimiter
235     $d ||= " ";
236    
237     my $last = pop @{$row->{$type}};
238     $d = "" if (! $last);
239     $last .= $d . join($d, @v);
240     push @{$row->{$type}}, $last;
241    
242     } else {
243     push @{$row->{$type}}, @v;
244     }
245     }
246    
247    
248     }
249    
250     if ($row) {
251     $row->{'tag'} = $field;
252    
253     # TODO: name_sigular, name_plural
254     my $name = $self->{'import_xml'}->{'indexer'}->{$field}->{'name'};
255 dpavlin 70 my $row_name = $name ? $self->_x($name) : $field;
256 dpavlin 13
257     # post-sort all values in field
258     if ($self->{'import_xml'}->{'indexer'}->{$field}->{'sort'}) {
259     $log->warn("sort at field tag not implemented");
260     }
261    
262 dpavlin 70 $ds->{$row_name} = $row;
263 dpavlin 13
264     $log->debug("row $field: ",sub { Dumper($row) });
265     }
266    
267     }
268    
269 dpavlin 22 $self->{'db'}->save_ds(
270 dpavlin 219 id => $id,
271 dpavlin 70 ds => $ds,
272 dpavlin 219 prefix => $self->{prefix},
273 dpavlin 18 ) if ($self->{'db'});
274 dpavlin 13
275 dpavlin 70 $log->debug("ds: ", sub { Dumper($ds) });
276 dpavlin 29
277 dpavlin 70 $log->logconfess("data structure returned is not array any more!") if wantarray;
278 dpavlin 13
279 dpavlin 70 return $ds;
280    
281 dpavlin 13 }
282    
283     =head2 parse
284    
285     Perform smart parsing of string, skipping delimiters for fields which aren't
286     defined. It can also eval code in format starting with C<eval{...}> and
287     return output or nothing depending on eval code.
288    
289     my $text = $webpac->parse($rec,'eval{"v901^a" eq "Deskriptor"}descriptor: v250^a', $i);
290    
291 dpavlin 260 Filters are implemented here. While simple form of filters looks like this:
292    
293     filter{name_of_filter}
294    
295     but, filters can also have variable number of parametars like this:
296    
297     filter{name_of_filter(param,param,param)}
298    
299 dpavlin 13 =cut
300    
301 dpavlin 261 my $warn_once;
302    
303 dpavlin 13 sub parse {
304     my $self = shift;
305    
306     my ($rec, $format_utf8, $i) = @_;
307    
308     return if (! $format_utf8);
309    
310     my $log = $self->_get_logger();
311    
312     $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
313    
314     $i = 0 if (! $i);
315    
316     my $format = $self->_x($format_utf8) || $log->logconfess("can't convert '$format_utf8' from UTF-8 to ",$self->{'code_page'});
317    
318     my @out;
319    
320     $log->debug("format: $format");
321    
322     my $eval_code;
323     # remove eval{...} from beginning
324     $eval_code = $1 if ($format =~ s/^eval{([^}]+)}//s);
325    
326     my $filter_name;
327     # remove filter{...} from beginning
328     $filter_name = $1 if ($format =~ s/^filter{([^}]+)}//s);
329    
330     my $prefix;
331     my $all_found=0;
332    
333     while ($format =~ s/^(.*?)(v|s)(\d+)(?:\^(\w))?//s) {
334    
335     my $del = $1 || '';
336     $prefix ||= $del if ($all_found == 0);
337    
338     # repeatable index
339     my $r = $i;
340     $r = 0 if (lc("$2") eq 's');
341    
342     my $found = 0;
343     my $tmp = $self->get_data(\$rec,$3,$4,$r,\$found);
344    
345     if ($found) {
346     push @out, $del;
347     push @out, $tmp;
348     $all_found += $found;
349     }
350     }
351    
352     return if (! $all_found);
353    
354     my $out = join('',@out);
355    
356     if ($out) {
357     # add rest of format (suffix)
358     $out .= $format;
359    
360     # add prefix if not there
361     $out = $prefix . $out if ($out !~ m/^\Q$prefix\E/);
362    
363     $log->debug("result: $out");
364     }
365    
366     if ($eval_code) {
367     my $eval = $self->fill_in($rec,$eval_code,$i) || return;
368     $log->debug("about to eval{$eval} format: $out");
369     return if (! $self->_eval($eval));
370     }
371    
372 dpavlin 260 if ($filter_name) {
373     my @filter_args;
374     if ($filter_name =~ s/(\w+)\((.*)\)/$1/) {
375     @filter_args = split(/,/, $2);
376     }
377     if ($self->{'filter'}->{$filter_name}) {
378     $log->debug("about to filter{$filter_name} format: $out with arguments: ", join(",", @filter_args));
379     unshift @filter_args, $out;
380     $out = $self->{'filter'}->{$filter_name}->(@filter_args);
381     return unless(defined($out));
382     $log->debug("filter result: $out");
383 dpavlin 261 } elsif (! $warn_once->{$filter_name}) {
384 dpavlin 260 $log->warn("trying to use undefined filter $filter_name");
385 dpavlin 261 $warn_once->{$filter_name}++;
386 dpavlin 260 }
387 dpavlin 13 }
388    
389     return $out;
390     }
391    
392     =head2 parse_to_arr
393    
394     Similar to C<parse>, but returns array of all repeatable fields
395    
396     my @arr = $webpac->parse_to_arr($rec,'v250^a');
397    
398     =cut
399    
400     sub parse_to_arr {
401     my $self = shift;
402    
403     my ($rec, $format_utf8) = @_;
404    
405     my $log = $self->_get_logger();
406    
407     $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
408     return if (! $format_utf8);
409    
410     my $i = 0;
411     my @arr;
412    
413     while (my $v = $self->parse($rec,$format_utf8,$i++)) {
414     push @arr, $v;
415     }
416    
417     $log->debug("format '$format_utf8' returned ",--$i," elements: ", sub { join(" | ",@arr) }) if (@arr);
418    
419     return @arr;
420     }
421    
422 dpavlin 15
423     =head2 fill_in
424    
425     Workhourse of all: takes record from in-memory structure of database and
426     strings with placeholders and returns string or array of with substituted
427     values from record.
428    
429     my $text = $webpac->fill_in($rec,'v250^a');
430    
431     Optional argument is ordinal number for repeatable fields. By default,
432     it's assume to be first repeatable field (fields are perl array, so first
433     element is 0).
434     Following example will read second value from repeatable field.
435    
436     my $text = $webpac->fill_in($rec,'Title: v250^a',1);
437    
438     This function B<does not> perform parsing of format to inteligenty skip
439     delimiters before fields which aren't used.
440    
441     This method will automatically decode UTF-8 string to local code page
442     if needed.
443    
444     =cut
445    
446     sub fill_in {
447     my $self = shift;
448    
449     my $log = $self->_get_logger();
450    
451     my $rec = shift || $log->logconfess("need data record");
452     my $format = shift || $log->logconfess("need format to parse");
453     # iteration (for repeatable fields)
454     my $i = shift || 0;
455    
456     $log->logdie("infitite loop in format $format") if ($i > ($self->{'max_mfn'} || 9999));
457    
458     # FIXME remove for speedup?
459     $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
460    
461     if (utf8::is_utf8($format)) {
462     $format = $self->_x($format);
463     }
464    
465     my $found = 0;
466    
467     my $eval_code;
468     # remove eval{...} from beginning
469     $eval_code = $1 if ($format =~ s/^eval{([^}]+)}//s);
470    
471     my $filter_name;
472     # remove filter{...} from beginning
473     $filter_name = $1 if ($format =~ s/^filter{([^}]+)}//s);
474    
475     # do actual replacement of placeholders
476     # repeatable fields
477     $format =~ s/v(\d+)(?:\^(\w))?/$self->get_data(\$rec,$1,$2,$i,\$found)/ges;
478     # non-repeatable fields
479     $format =~ s/s(\d+)(?:\^(\w))?/$self->get_data(\$rec,$1,$2,0,\$found)/ges;
480    
481     if ($found) {
482     $log->debug("format: $format");
483     if ($eval_code) {
484     my $eval = $self->fill_in($rec,$eval_code,$i);
485     return if (! $self->_eval($eval));
486     }
487     if ($filter_name && $self->{'filter'}->{$filter_name}) {
488     $log->debug("filter '$filter_name' for $format");
489     $format = $self->{'filter'}->{$filter_name}->($format);
490     return unless(defined($format));
491     $log->debug("filter result: $format");
492     }
493     # do we have lookups?
494     if ($self->{'lookup'}) {
495 dpavlin 31 if ($self->{'lookup'}->can('lookup')) {
496 dpavlin 252 my @lookup = $self->{lookup}->lookup($format);
497 dpavlin 253 $log->debug("lookup $format", join(", ", @lookup));
498 dpavlin 252 return @lookup;
499 dpavlin 31 } else {
500     $log->warn("Have lookup object but can't invoke lookup method");
501     }
502 dpavlin 15 } else {
503     return $format;
504     }
505     } else {
506     return;
507     }
508     }
509    
510    
511 dpavlin 13 =head2 fill_in_to_arr
512    
513     Similar to C<fill_in>, but returns array of all repeatable fields. Usable
514     for fields which have lookups, so they shouldn't be parsed but rather
515     C<fill_id>ed.
516    
517     my @arr = $webpac->fill_in_to_arr($rec,'[v900];;[v250^a]');
518    
519     =cut
520    
521     sub fill_in_to_arr {
522     my $self = shift;
523    
524     my ($rec, $format_utf8) = @_;
525    
526     my $log = $self->_get_logger();
527    
528     $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
529     return if (! $format_utf8);
530    
531     my $i = 0;
532     my @arr;
533    
534     while (my @v = $self->fill_in($rec,$format_utf8,$i++)) {
535     push @arr, @v;
536     }
537    
538     $log->debug("format '$format_utf8' returned ",--$i," elements: ", sub { join(" | ",@arr) }) if (@arr);
539    
540     return @arr;
541     }
542    
543 dpavlin 15
544     =head2 get_data
545    
546     Returns value from record.
547    
548     my $text = $self->get_data(\$rec,$f,$sf,$i,\$found);
549    
550     Arguments are:
551     record reference C<$rec>,
552     field C<$f>,
553     optional subfiled C<$sf>,
554     index for repeatable values C<$i>.
555    
556     Optinal variable C<$found> will be incremeted if there
557     is field.
558    
559     Returns value or empty string.
560    
561     =cut
562    
563     sub get_data {
564     my $self = shift;
565    
566     my ($rec,$f,$sf,$i,$found) = @_;
567    
568     if ($$rec->{$f}) {
569     return '' if (! $$rec->{$f}->[$i]);
570     no strict 'refs';
571     if ($sf && $$rec->{$f}->[$i]->{$sf}) {
572     $$found++ if (defined($$found));
573     return $$rec->{$f}->[$i]->{$sf};
574 dpavlin 64 } elsif (! $sf && $$rec->{$f}->[$i]) {
575 dpavlin 15 $$found++ if (defined($$found));
576     # it still might have subfield, just
577     # not specified, so we'll dump all
578     if ($$rec->{$f}->[$i] =~ /HASH/o) {
579     my $out;
580     foreach my $k (keys %{$$rec->{$f}->[$i]}) {
581     $out .= $$rec->{$f}->[$i]->{$k}." ";
582     }
583     return $out;
584     } else {
585     return $$rec->{$f}->[$i];
586     }
587 dpavlin 64 } else {
588     return '';
589 dpavlin 15 }
590     } else {
591     return '';
592     }
593     }
594    
595    
596     =head2 apply_format
597    
598     Apply format specified in tag with C<format_name="name"> and
599     C<format_delimiter=";;">.
600    
601     my $text = $webpac->apply_format($format_name,$format_delimiter,$data);
602    
603     Formats can contain C<lookup{...}> if you need them.
604    
605     =cut
606    
607     sub apply_format {
608     my $self = shift;
609    
610     my ($name,$delimiter,$data) = @_;
611    
612     my $log = $self->_get_logger();
613    
614     if (! $self->{'import_xml'}->{'format'}->{$name}) {
615     $log->warn("<format name=\"$name\"> is not defined in ",$self->{'import_xml_file'});
616     return $data;
617     }
618    
619     $log->warn("no delimiter for format $name") if (! $delimiter);
620    
621     my $format = $self->_x($self->{'import_xml'}->{'format'}->{$name}->{'content'}) || $log->logdie("can't find format '$name'");
622    
623     my @data = split(/\Q$delimiter\E/, $data);
624    
625     my $out = sprintf($format, @data);
626     $log->debug("using format $name [$format] on $data to produce: $out");
627    
628     if ($self->{'lookup_regex'} && $out =~ $self->{'lookup_regex'}) {
629 dpavlin 31 return $self->{'lookup'}->lookup($out);
630 dpavlin 15 } else {
631     return $out;
632     }
633    
634     }
635    
636 dpavlin 13 =head2 sort_arr
637    
638     Sort array ignoring case and html in data
639    
640     my @sorted = $webpac->sort_arr(@unsorted);
641    
642     =cut
643    
644     sub sort_arr {
645     my $self = shift;
646    
647     my $log = $self->_get_logger();
648    
649     # FIXME add Schwartzian Transformation?
650    
651     my @sorted = sort {
652     $a =~ s#<[^>]+/*>##;
653     $b =~ s#<[^>]+/*>##;
654     lc($b) cmp lc($a)
655     } @_;
656     $log->debug("sorted values: ",sub { join(", ",@sorted) });
657    
658     return @sorted;
659     }
660    
661    
662 dpavlin 15 =head1 INTERNAL METHODS
663    
664 dpavlin 13 =head2 _sort_by_order
665    
666     Sort xml tags data structure accoding to C<order=""> attribute.
667    
668     =cut
669    
670     sub _sort_by_order {
671     my $self = shift;
672    
673     my $va = $self->{'import_xml'}->{'indexer'}->{$a}->{'order'} ||
674     $self->{'import_xml'}->{'indexer'}->{$a};
675     my $vb = $self->{'import_xml'}->{'indexer'}->{$b}->{'order'} ||
676     $self->{'import_xml'}->{'indexer'}->{$b};
677    
678     return $va <=> $vb;
679     }
680    
681     =head2 _x
682    
683 dpavlin 15 Convert strings from C<conf/normalize/*.xml> encoding into application
684     specific encoding (optinally specified using C<code_page> to C<new>
685     constructor).
686 dpavlin 13
687     my $text = $n->_x('normalize text string');
688    
689     This is a stub so that other modules doesn't have to implement it.
690    
691     =cut
692    
693     sub _x {
694     my $self = shift;
695     return shift;
696     }
697    
698    
699 dpavlin 10 =head1 AUTHOR
700    
701     Dobrica Pavlinusic, C<< <dpavlin@rot13.org> >>
702    
703     =head1 COPYRIGHT & LICENSE
704    
705     Copyright 2005 Dobrica Pavlinusic, All Rights Reserved.
706    
707     This program is free software; you can redistribute it and/or modify it
708     under the same terms as Perl itself.
709    
710     =cut
711    
712 dpavlin 209 1; # End of WebPAC::Normalize

  ViewVC Help
Powered by ViewVC 1.1.26