/[webpac2]/trunk/lib/WebPAC/Normalize.pm
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Annotation of /trunk/lib/WebPAC/Normalize.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 15 - (hide annotations)
Sun Jul 17 10:42:23 2005 UTC (18 years, 8 months ago) by dpavlin
File size: 16320 byte(s)
WebPAC::Common cleanup, most code moved to WebPAC::Normalize. Added
documentation about order of data mungling when normalising data.

1 dpavlin 10 package WebPAC::Normalize;
2    
3     use warnings;
4     use strict;
5 dpavlin 13 use Data::Dumper;
6 dpavlin 14 use Storable;
7 dpavlin 10
8     =head1 NAME
9    
10 dpavlin 15 WebPAC::Normalize - data mungling for normalisation
11 dpavlin 10
12     =head1 VERSION
13    
14     Version 0.01
15    
16     =cut
17    
18     our $VERSION = '0.01';
19    
20     =head1 SYNOPSIS
21    
22 dpavlin 15 This package contains code that mungle data to produce normalized format.
23 dpavlin 10
24 dpavlin 15 It contains several assumptions:
25    
26     =over
27    
28     =item *
29    
30     format of fields is defined using C<v123^a> notation for repeatable fields
31     or C<s123^a> for single (or first) value, where C<123> is field number and
32     C<a> is subfield.
33    
34     =item *
35    
36     source data records (C<$rec>) have unique identifiers in field C<000>
37    
38     =item *
39    
40     optional C<eval{length('v123^a') == 3}> tag at B<beginning of format> will be
41     perl code that is evaluated before producing output (value of field will be
42     interpolated before that)
43    
44     =item *
45    
46     optional C<filter{filter_name}> at B<begining of format> will apply perl
47     code defined as code ref on format after field substitution to producing
48     output
49    
50     =item *
51    
52     optional C<lookup{...}> will be then performed. See C<WebPAC::Lookups>.
53    
54     =item *
55    
56     at end, optional C<format>s rules are resolved. Format rules are similar to
57     C<sprintf> and can also contain C<lookup{...}> which is performed after
58     values are inserted in format.
59    
60     =back
61    
62     This also describes order in which transformations are applied (eval,
63     filter, lookup, format) which is important to undestand when deciding how to
64     solve your data mungling and normalisation process.
65    
66    
67    
68    
69 dpavlin 10 =head1 FUNCTIONS
70    
71 dpavlin 13 =head2 new
72 dpavlin 10
73 dpavlin 13 Create new normalisation object
74    
75     my $n = new WebPAC::Normalize::Something(
76 dpavlin 15 filter => {
77     'filter_name_1' => sub {
78     # filter code
79     return length($_);
80     }, ...
81     },
82 dpavlin 13 cache_data_structure => './cache/ds/',
83     lookup_regex => $lookup->regex,
84     );
85    
86 dpavlin 15 Parametar C<filter> defines user supplied snippets of perl code which can
87     be use with C<filter{...}> notation.
88    
89 dpavlin 13 Optional parameter C<cache_data_structure> defines path to directory
90     in which cache file for C<data_structure> call will be created.
91    
92     Recommended parametar C<lookup_regex> is used to enable parsing of lookups
93     in structures.
94    
95 dpavlin 10 =cut
96    
97 dpavlin 13 sub new {
98     my $class = shift;
99     my $self = {@_};
100     bless($self, $class);
101    
102     $self->setup_cache_dir( $self->{'cache_data_structure'} );
103    
104     $self ? return $self : return undef;
105 dpavlin 10 }
106    
107 dpavlin 13 =head2 setup_cache_dir
108    
109     Check if specified cache directory exist, and if not, disable caching.
110    
111     $setup_cache_dir('./cache/ds/');
112    
113     If you pass false or zero value to this function, it will disable
114     cacheing.
115    
116     =cut
117    
118     sub setup_cache_dir {
119     my $self = shift;
120    
121     my $dir = shift;
122    
123     my $log = $self->_get_logger();
124    
125     if ($dir) {
126     my $msg;
127     if (! -e $dir) {
128     $msg = "doesn't exist";
129     } elsif (! -d $dir) {
130     $msg = "is not directory";
131     } elsif (! -w $dir) {
132     $msg = "not writable";
133     }
134    
135     if ($msg) {
136     undef $self->{'cache_data_structure'};
137     $log->warn("cache_data_structure $dir $msg, disabling...");
138     } else {
139     $log->debug("using cache dir $dir");
140     }
141     } else {
142     $log->debug("disabling cache");
143     undef $self->{'cache_data_structure'};
144     }
145     }
146    
147    
148     =head2 data_structure
149    
150     Create in-memory data structure which represents normalized layout from
151     C<conf/normalize/*.xml>.
152    
153     This structures are used to produce output.
154    
155     my @ds = $webpac->data_structure($rec);
156    
157     B<Note: historical oddity follows>
158    
159     This method will also set C<< $webpac->{'currnet_filename'} >> if there is
160     C<< <filename> >> tag and C<< $webpac->{'headline'} >> if there is
161     C<< <headline> >> tag.
162    
163     =cut
164    
165     sub data_structure {
166     my $self = shift;
167    
168     my $log = $self->_get_logger();
169    
170     my $rec = shift;
171     $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
172    
173     my $cache_file;
174    
175     if (my $cache_path = $self->{'cache_data_structure'}) {
176     my $id = $rec->{'000'};
177     $id = $rec->{'000'}->[0] if ($id =~ m/^ARRAY/o);
178     unless (defined($id)) {
179     $log->warn("Can't use cache_data_structure on records without unique identifier in field 000");
180     undef $self->{'cache_data_structure'};
181     } else {
182     $cache_file = "$cache_path/$id";
183     if (-r $cache_file) {
184     my $ds_ref = retrieve($cache_file);
185     if ($ds_ref) {
186     $log->debug("cache hit: $cache_file");
187     my $ok = 1;
188     foreach my $f (qw(current_filename headline)) {
189     if ($ds_ref->{$f}) {
190     $self->{$f} = $ds_ref->{$f};
191     } else {
192     $ok = 0;
193     }
194     };
195     if ($ok && $ds_ref->{'ds'}) {
196     return @{ $ds_ref->{'ds'} };
197     } else {
198     $log->warn("cache_data_structure $cache_path corrupt. Use rm $cache_path/* to re-create it on next run!");
199     undef $self->{'cache_data_structure'};
200     }
201     }
202     }
203     }
204     }
205    
206     undef $self->{'currnet_filename'};
207     undef $self->{'headline'};
208    
209     my @sorted_tags;
210     if ($self->{tags_by_order}) {
211     @sorted_tags = @{$self->{tags_by_order}};
212     } else {
213     @sorted_tags = sort { $self->_sort_by_order } keys %{$self->{'import_xml'}->{'indexer'}};
214     $self->{tags_by_order} = \@sorted_tags;
215     }
216    
217     my @ds;
218    
219     $log->debug("tags: ",sub { join(", ",@sorted_tags) });
220    
221     foreach my $field (@sorted_tags) {
222    
223     my $row;
224    
225     #print "field $field [",$self->{'tag'},"] = ",Dumper($self->{'import_xml'}->{'indexer'}->{$field}->{$self->{'tag'}});
226    
227     foreach my $tag (@{$self->{'import_xml'}->{'indexer'}->{$field}->{$self->{'tag'}}}) {
228     my $format = $tag->{'value'} || $tag->{'content'};
229    
230     $log->debug("format: $format");
231    
232     my @v;
233     if ($self->{'lookup_regex'} && $format =~ $self->{'lookup_regex'}) {
234     @v = $self->fill_in_to_arr($rec,$format);
235     } else {
236     @v = $self->parse_to_arr($rec,$format);
237     }
238     next if (! @v);
239    
240     if ($tag->{'sort'}) {
241     @v = $self->sort_arr(@v);
242     }
243    
244     # use format?
245     if ($tag->{'format_name'}) {
246     @v = map { $self->apply_format($tag->{'format_name'},$tag->{'format_delimiter'},$_) } @v;
247     }
248    
249     if ($field eq 'filename') {
250     $self->{'current_filename'} = join('',@v);
251     $log->debug("filename: ",$self->{'current_filename'});
252     } elsif ($field eq 'headline') {
253     $self->{'headline'} .= join('',@v);
254     $log->debug("headline: ",$self->{'headline'});
255     next; # don't return headline in data_structure!
256     }
257    
258     # delimiter will join repeatable fields
259     if ($tag->{'delimiter'}) {
260     @v = ( join($tag->{'delimiter'}, @v) );
261     }
262    
263     # default types
264     my @types = qw(display swish);
265     # override by type attribute
266     @types = ( $tag->{'type'} ) if ($tag->{'type'});
267    
268     foreach my $type (@types) {
269     # append to previous line?
270     $log->debug("type: $type ",sub { join(" ",@v) }, $row->{'append'} || 'no append');
271     if ($tag->{'append'}) {
272    
273     # I will delimit appended part with
274     # delimiter (or ,)
275     my $d = $tag->{'delimiter'};
276     # default delimiter
277     $d ||= " ";
278    
279     my $last = pop @{$row->{$type}};
280     $d = "" if (! $last);
281     $last .= $d . join($d, @v);
282     push @{$row->{$type}}, $last;
283    
284     } else {
285     push @{$row->{$type}}, @v;
286     }
287     }
288    
289    
290     }
291    
292     if ($row) {
293     $row->{'tag'} = $field;
294    
295     # TODO: name_sigular, name_plural
296     my $name = $self->{'import_xml'}->{'indexer'}->{$field}->{'name'};
297     $row->{'name'} = $name ? $self->_x($name) : $field;
298    
299     # post-sort all values in field
300     if ($self->{'import_xml'}->{'indexer'}->{$field}->{'sort'}) {
301     $log->warn("sort at field tag not implemented");
302     }
303    
304     push @ds, $row;
305    
306     $log->debug("row $field: ",sub { Dumper($row) });
307     }
308    
309     }
310    
311     if ($cache_file) {
312     store {
313     ds => \@ds,
314     current_filename => $self->{'current_filename'},
315     headline => $self->{'headline'},
316     }, $cache_file;
317     $log->debug("created storable cache file $cache_file");
318     }
319    
320     return @ds;
321    
322     }
323    
324     =head2 parse
325    
326     Perform smart parsing of string, skipping delimiters for fields which aren't
327     defined. It can also eval code in format starting with C<eval{...}> and
328     return output or nothing depending on eval code.
329    
330     my $text = $webpac->parse($rec,'eval{"v901^a" eq "Deskriptor"}descriptor: v250^a', $i);
331    
332     =cut
333    
334     sub parse {
335     my $self = shift;
336    
337     my ($rec, $format_utf8, $i) = @_;
338    
339     return if (! $format_utf8);
340    
341     my $log = $self->_get_logger();
342    
343     $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
344    
345     $i = 0 if (! $i);
346    
347     my $format = $self->_x($format_utf8) || $log->logconfess("can't convert '$format_utf8' from UTF-8 to ",$self->{'code_page'});
348    
349     my @out;
350    
351     $log->debug("format: $format");
352    
353     my $eval_code;
354     # remove eval{...} from beginning
355     $eval_code = $1 if ($format =~ s/^eval{([^}]+)}//s);
356    
357     my $filter_name;
358     # remove filter{...} from beginning
359     $filter_name = $1 if ($format =~ s/^filter{([^}]+)}//s);
360    
361     my $prefix;
362     my $all_found=0;
363    
364     while ($format =~ s/^(.*?)(v|s)(\d+)(?:\^(\w))?//s) {
365    
366     my $del = $1 || '';
367     $prefix ||= $del if ($all_found == 0);
368    
369     # repeatable index
370     my $r = $i;
371     $r = 0 if (lc("$2") eq 's');
372    
373     my $found = 0;
374     my $tmp = $self->get_data(\$rec,$3,$4,$r,\$found);
375    
376     if ($found) {
377     push @out, $del;
378     push @out, $tmp;
379     $all_found += $found;
380     }
381     }
382    
383     return if (! $all_found);
384    
385     my $out = join('',@out);
386    
387     if ($out) {
388     # add rest of format (suffix)
389     $out .= $format;
390    
391     # add prefix if not there
392     $out = $prefix . $out if ($out !~ m/^\Q$prefix\E/);
393    
394     $log->debug("result: $out");
395     }
396    
397     if ($eval_code) {
398     my $eval = $self->fill_in($rec,$eval_code,$i) || return;
399     $log->debug("about to eval{$eval} format: $out");
400     return if (! $self->_eval($eval));
401     }
402    
403     if ($filter_name && $self->{'filter'}->{$filter_name}) {
404     $log->debug("about to filter{$filter_name} format: $out");
405     $out = $self->{'filter'}->{$filter_name}->($out);
406     return unless(defined($out));
407     $log->debug("filter result: $out");
408     }
409    
410     return $out;
411     }
412    
413     =head2 parse_to_arr
414    
415     Similar to C<parse>, but returns array of all repeatable fields
416    
417     my @arr = $webpac->parse_to_arr($rec,'v250^a');
418    
419     =cut
420    
421     sub parse_to_arr {
422     my $self = shift;
423    
424     my ($rec, $format_utf8) = @_;
425    
426     my $log = $self->_get_logger();
427    
428     $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
429     return if (! $format_utf8);
430    
431     my $i = 0;
432     my @arr;
433    
434     while (my $v = $self->parse($rec,$format_utf8,$i++)) {
435     push @arr, $v;
436     }
437    
438     $log->debug("format '$format_utf8' returned ",--$i," elements: ", sub { join(" | ",@arr) }) if (@arr);
439    
440     return @arr;
441     }
442    
443 dpavlin 15
444     =head2 fill_in
445    
446     Workhourse of all: takes record from in-memory structure of database and
447     strings with placeholders and returns string or array of with substituted
448     values from record.
449    
450     my $text = $webpac->fill_in($rec,'v250^a');
451    
452     Optional argument is ordinal number for repeatable fields. By default,
453     it's assume to be first repeatable field (fields are perl array, so first
454     element is 0).
455     Following example will read second value from repeatable field.
456    
457     my $text = $webpac->fill_in($rec,'Title: v250^a',1);
458    
459     This function B<does not> perform parsing of format to inteligenty skip
460     delimiters before fields which aren't used.
461    
462     This method will automatically decode UTF-8 string to local code page
463     if needed.
464    
465     =cut
466    
467     sub fill_in {
468     my $self = shift;
469    
470     my $log = $self->_get_logger();
471    
472     my $rec = shift || $log->logconfess("need data record");
473     my $format = shift || $log->logconfess("need format to parse");
474     # iteration (for repeatable fields)
475     my $i = shift || 0;
476    
477     $log->logdie("infitite loop in format $format") if ($i > ($self->{'max_mfn'} || 9999));
478    
479     # FIXME remove for speedup?
480     $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
481    
482     if (utf8::is_utf8($format)) {
483     $format = $self->_x($format);
484     }
485    
486     my $found = 0;
487    
488     my $eval_code;
489     # remove eval{...} from beginning
490     $eval_code = $1 if ($format =~ s/^eval{([^}]+)}//s);
491    
492     my $filter_name;
493     # remove filter{...} from beginning
494     $filter_name = $1 if ($format =~ s/^filter{([^}]+)}//s);
495    
496     # do actual replacement of placeholders
497     # repeatable fields
498     $format =~ s/v(\d+)(?:\^(\w))?/$self->get_data(\$rec,$1,$2,$i,\$found)/ges;
499     # non-repeatable fields
500     $format =~ s/s(\d+)(?:\^(\w))?/$self->get_data(\$rec,$1,$2,0,\$found)/ges;
501    
502     if ($found) {
503     $log->debug("format: $format");
504     if ($eval_code) {
505     my $eval = $self->fill_in($rec,$eval_code,$i);
506     return if (! $self->_eval($eval));
507     }
508     if ($filter_name && $self->{'filter'}->{$filter_name}) {
509     $log->debug("filter '$filter_name' for $format");
510     $format = $self->{'filter'}->{$filter_name}->($format);
511     return unless(defined($format));
512     $log->debug("filter result: $format");
513     }
514     # do we have lookups?
515     if ($self->{'lookup'}) {
516     return $self->lookup($format);
517     } else {
518     return $format;
519     }
520     } else {
521     return;
522     }
523     }
524    
525    
526 dpavlin 13 =head2 fill_in_to_arr
527    
528     Similar to C<fill_in>, but returns array of all repeatable fields. Usable
529     for fields which have lookups, so they shouldn't be parsed but rather
530     C<fill_id>ed.
531    
532     my @arr = $webpac->fill_in_to_arr($rec,'[v900];;[v250^a]');
533    
534     =cut
535    
536     sub fill_in_to_arr {
537     my $self = shift;
538    
539     my ($rec, $format_utf8) = @_;
540    
541     my $log = $self->_get_logger();
542    
543     $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
544     return if (! $format_utf8);
545    
546     my $i = 0;
547     my @arr;
548    
549     while (my @v = $self->fill_in($rec,$format_utf8,$i++)) {
550     push @arr, @v;
551     }
552    
553     $log->debug("format '$format_utf8' returned ",--$i," elements: ", sub { join(" | ",@arr) }) if (@arr);
554    
555     return @arr;
556     }
557    
558 dpavlin 15
559     =head2 get_data
560    
561     Returns value from record.
562    
563     my $text = $self->get_data(\$rec,$f,$sf,$i,\$found);
564    
565     Arguments are:
566     record reference C<$rec>,
567     field C<$f>,
568     optional subfiled C<$sf>,
569     index for repeatable values C<$i>.
570    
571     Optinal variable C<$found> will be incremeted if there
572     is field.
573    
574     Returns value or empty string.
575    
576     =cut
577    
578     sub get_data {
579     my $self = shift;
580    
581     my ($rec,$f,$sf,$i,$found) = @_;
582    
583     if ($$rec->{$f}) {
584     return '' if (! $$rec->{$f}->[$i]);
585     no strict 'refs';
586     if ($sf && $$rec->{$f}->[$i]->{$sf}) {
587     $$found++ if (defined($$found));
588     return $$rec->{$f}->[$i]->{$sf};
589     } elsif ($$rec->{$f}->[$i]) {
590     $$found++ if (defined($$found));
591     # it still might have subfield, just
592     # not specified, so we'll dump all
593     if ($$rec->{$f}->[$i] =~ /HASH/o) {
594     my $out;
595     foreach my $k (keys %{$$rec->{$f}->[$i]}) {
596     $out .= $$rec->{$f}->[$i]->{$k}." ";
597     }
598     return $out;
599     } else {
600     return $$rec->{$f}->[$i];
601     }
602     }
603     } else {
604     return '';
605     }
606     }
607    
608    
609     =head2 apply_format
610    
611     Apply format specified in tag with C<format_name="name"> and
612     C<format_delimiter=";;">.
613    
614     my $text = $webpac->apply_format($format_name,$format_delimiter,$data);
615    
616     Formats can contain C<lookup{...}> if you need them.
617    
618     =cut
619    
620     sub apply_format {
621     my $self = shift;
622    
623     my ($name,$delimiter,$data) = @_;
624    
625     my $log = $self->_get_logger();
626    
627     if (! $self->{'import_xml'}->{'format'}->{$name}) {
628     $log->warn("<format name=\"$name\"> is not defined in ",$self->{'import_xml_file'});
629     return $data;
630     }
631    
632     $log->warn("no delimiter for format $name") if (! $delimiter);
633    
634     my $format = $self->_x($self->{'import_xml'}->{'format'}->{$name}->{'content'}) || $log->logdie("can't find format '$name'");
635    
636     my @data = split(/\Q$delimiter\E/, $data);
637    
638     my $out = sprintf($format, @data);
639     $log->debug("using format $name [$format] on $data to produce: $out");
640    
641     if ($self->{'lookup_regex'} && $out =~ $self->{'lookup_regex'}) {
642     return $self->lookup($out);
643     } else {
644     return $out;
645     }
646    
647     }
648    
649 dpavlin 13 =head2 sort_arr
650    
651     Sort array ignoring case and html in data
652    
653     my @sorted = $webpac->sort_arr(@unsorted);
654    
655     =cut
656    
657     sub sort_arr {
658     my $self = shift;
659    
660     my $log = $self->_get_logger();
661    
662     # FIXME add Schwartzian Transformation?
663    
664     my @sorted = sort {
665     $a =~ s#<[^>]+/*>##;
666     $b =~ s#<[^>]+/*>##;
667     lc($b) cmp lc($a)
668     } @_;
669     $log->debug("sorted values: ",sub { join(", ",@sorted) });
670    
671     return @sorted;
672     }
673    
674    
675 dpavlin 15 =head1 INTERNAL METHODS
676    
677 dpavlin 13 =head2 _sort_by_order
678    
679     Sort xml tags data structure accoding to C<order=""> attribute.
680    
681     =cut
682    
683     sub _sort_by_order {
684     my $self = shift;
685    
686     my $va = $self->{'import_xml'}->{'indexer'}->{$a}->{'order'} ||
687     $self->{'import_xml'}->{'indexer'}->{$a};
688     my $vb = $self->{'import_xml'}->{'indexer'}->{$b}->{'order'} ||
689     $self->{'import_xml'}->{'indexer'}->{$b};
690    
691     return $va <=> $vb;
692     }
693    
694     =head2 _x
695    
696 dpavlin 15 Convert strings from C<conf/normalize/*.xml> encoding into application
697     specific encoding (optinally specified using C<code_page> to C<new>
698     constructor).
699 dpavlin 13
700     my $text = $n->_x('normalize text string');
701    
702     This is a stub so that other modules doesn't have to implement it.
703    
704     =cut
705    
706     sub _x {
707     my $self = shift;
708     return shift;
709     }
710    
711    
712 dpavlin 10 =head1 AUTHOR
713    
714     Dobrica Pavlinusic, C<< <dpavlin@rot13.org> >>
715    
716     =head1 COPYRIGHT & LICENSE
717    
718     Copyright 2005 Dobrica Pavlinusic, All Rights Reserved.
719    
720     This program is free software; you can redistribute it and/or modify it
721     under the same terms as Perl itself.
722    
723     =cut
724    
725     1; # End of WebPAC::DB

  ViewVC Help
Powered by ViewVC 1.1.26