/[webpac2]/trunk/lib/WebPAC/Normalize.pm
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Annotation of /trunk/lib/WebPAC/Normalize.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 18 - (hide annotations)
Sun Jul 17 14:53:37 2005 UTC (18 years, 9 months ago) by dpavlin
File size: 14441 byte(s)
first cut into WebPAC::DB

1 dpavlin 10 package WebPAC::Normalize;
2    
3     use warnings;
4     use strict;
5 dpavlin 13 use Data::Dumper;
6 dpavlin 10
7     =head1 NAME
8    
9 dpavlin 15 WebPAC::Normalize - data mungling for normalisation
10 dpavlin 10
11     =head1 VERSION
12    
13     Version 0.01
14    
15     =cut
16    
17     our $VERSION = '0.01';
18    
19     =head1 SYNOPSIS
20    
21 dpavlin 15 This package contains code that mungle data to produce normalized format.
22 dpavlin 10
23 dpavlin 15 It contains several assumptions:
24    
25     =over
26    
27     =item *
28    
29     format of fields is defined using C<v123^a> notation for repeatable fields
30     or C<s123^a> for single (or first) value, where C<123> is field number and
31     C<a> is subfield.
32    
33     =item *
34    
35     source data records (C<$rec>) have unique identifiers in field C<000>
36    
37     =item *
38    
39     optional C<eval{length('v123^a') == 3}> tag at B<beginning of format> will be
40     perl code that is evaluated before producing output (value of field will be
41     interpolated before that)
42    
43     =item *
44    
45     optional C<filter{filter_name}> at B<begining of format> will apply perl
46     code defined as code ref on format after field substitution to producing
47     output
48    
49     =item *
50    
51     optional C<lookup{...}> will be then performed. See C<WebPAC::Lookups>.
52    
53     =item *
54    
55     at end, optional C<format>s rules are resolved. Format rules are similar to
56     C<sprintf> and can also contain C<lookup{...}> which is performed after
57     values are inserted in format.
58    
59     =back
60    
61     This also describes order in which transformations are applied (eval,
62     filter, lookup, format) which is important to undestand when deciding how to
63     solve your data mungling and normalisation process.
64    
65    
66    
67    
68 dpavlin 10 =head1 FUNCTIONS
69    
70 dpavlin 13 =head2 new
71 dpavlin 10
72 dpavlin 13 Create new normalisation object
73    
74     my $n = new WebPAC::Normalize::Something(
75 dpavlin 15 filter => {
76     'filter_name_1' => sub {
77     # filter code
78     return length($_);
79     }, ...
80     },
81 dpavlin 18 db => $webpac_db_obj,
82 dpavlin 13 lookup_regex => $lookup->regex,
83     );
84    
85 dpavlin 15 Parametar C<filter> defines user supplied snippets of perl code which can
86     be use with C<filter{...}> notation.
87    
88 dpavlin 13 Recommended parametar C<lookup_regex> is used to enable parsing of lookups
89     in structures.
90    
91 dpavlin 10 =cut
92    
93 dpavlin 13 sub new {
94     my $class = shift;
95     my $self = {@_};
96     bless($self, $class);
97    
98     $self ? return $self : return undef;
99 dpavlin 10 }
100    
101 dpavlin 13
102     =head2 data_structure
103    
104     Create in-memory data structure which represents normalized layout from
105     C<conf/normalize/*.xml>.
106    
107     This structures are used to produce output.
108    
109     my @ds = $webpac->data_structure($rec);
110    
111     B<Note: historical oddity follows>
112    
113     This method will also set C<< $webpac->{'currnet_filename'} >> if there is
114     C<< <filename> >> tag and C<< $webpac->{'headline'} >> if there is
115     C<< <headline> >> tag.
116    
117     =cut
118    
119     sub data_structure {
120     my $self = shift;
121    
122     my $log = $self->_get_logger();
123    
124     my $rec = shift;
125     $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
126    
127     my $cache_file;
128    
129 dpavlin 18 if ($self->{'db'}) {
130     my @ds = $self->{'db'}->get_ds($rec);
131     return @ds if (@ds);
132 dpavlin 13 }
133    
134     undef $self->{'currnet_filename'};
135     undef $self->{'headline'};
136    
137     my @sorted_tags;
138     if ($self->{tags_by_order}) {
139     @sorted_tags = @{$self->{tags_by_order}};
140     } else {
141     @sorted_tags = sort { $self->_sort_by_order } keys %{$self->{'import_xml'}->{'indexer'}};
142     $self->{tags_by_order} = \@sorted_tags;
143     }
144    
145     my @ds;
146    
147     $log->debug("tags: ",sub { join(", ",@sorted_tags) });
148    
149     foreach my $field (@sorted_tags) {
150    
151     my $row;
152    
153     #print "field $field [",$self->{'tag'},"] = ",Dumper($self->{'import_xml'}->{'indexer'}->{$field}->{$self->{'tag'}});
154    
155     foreach my $tag (@{$self->{'import_xml'}->{'indexer'}->{$field}->{$self->{'tag'}}}) {
156     my $format = $tag->{'value'} || $tag->{'content'};
157    
158     $log->debug("format: $format");
159    
160     my @v;
161     if ($self->{'lookup_regex'} && $format =~ $self->{'lookup_regex'}) {
162     @v = $self->fill_in_to_arr($rec,$format);
163     } else {
164     @v = $self->parse_to_arr($rec,$format);
165     }
166     next if (! @v);
167    
168     if ($tag->{'sort'}) {
169     @v = $self->sort_arr(@v);
170     }
171    
172     # use format?
173     if ($tag->{'format_name'}) {
174     @v = map { $self->apply_format($tag->{'format_name'},$tag->{'format_delimiter'},$_) } @v;
175     }
176    
177     if ($field eq 'filename') {
178     $self->{'current_filename'} = join('',@v);
179     $log->debug("filename: ",$self->{'current_filename'});
180     } elsif ($field eq 'headline') {
181     $self->{'headline'} .= join('',@v);
182     $log->debug("headline: ",$self->{'headline'});
183     next; # don't return headline in data_structure!
184     }
185    
186     # delimiter will join repeatable fields
187     if ($tag->{'delimiter'}) {
188     @v = ( join($tag->{'delimiter'}, @v) );
189     }
190    
191     # default types
192     my @types = qw(display swish);
193     # override by type attribute
194     @types = ( $tag->{'type'} ) if ($tag->{'type'});
195    
196     foreach my $type (@types) {
197     # append to previous line?
198     $log->debug("type: $type ",sub { join(" ",@v) }, $row->{'append'} || 'no append');
199     if ($tag->{'append'}) {
200    
201     # I will delimit appended part with
202     # delimiter (or ,)
203     my $d = $tag->{'delimiter'};
204     # default delimiter
205     $d ||= " ";
206    
207     my $last = pop @{$row->{$type}};
208     $d = "" if (! $last);
209     $last .= $d . join($d, @v);
210     push @{$row->{$type}}, $last;
211    
212     } else {
213     push @{$row->{$type}}, @v;
214     }
215     }
216    
217    
218     }
219    
220     if ($row) {
221     $row->{'tag'} = $field;
222    
223     # TODO: name_sigular, name_plural
224     my $name = $self->{'import_xml'}->{'indexer'}->{$field}->{'name'};
225     $row->{'name'} = $name ? $self->_x($name) : $field;
226    
227     # post-sort all values in field
228     if ($self->{'import_xml'}->{'indexer'}->{$field}->{'sort'}) {
229     $log->warn("sort at field tag not implemented");
230     }
231    
232     push @ds, $row;
233    
234     $log->debug("row $field: ",sub { Dumper($row) });
235     }
236    
237     }
238    
239 dpavlin 18 $self->{'db'}->put_gs(
240     ds => \@ds,
241     current_filename => $self->{'current_filename'},
242     headline => $self->{'headline'},
243     ) if ($self->{'db'});
244 dpavlin 13
245     return @ds;
246    
247     }
248    
249     =head2 parse
250    
251     Perform smart parsing of string, skipping delimiters for fields which aren't
252     defined. It can also eval code in format starting with C<eval{...}> and
253     return output or nothing depending on eval code.
254    
255     my $text = $webpac->parse($rec,'eval{"v901^a" eq "Deskriptor"}descriptor: v250^a', $i);
256    
257     =cut
258    
259     sub parse {
260     my $self = shift;
261    
262     my ($rec, $format_utf8, $i) = @_;
263    
264     return if (! $format_utf8);
265    
266     my $log = $self->_get_logger();
267    
268     $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
269    
270     $i = 0 if (! $i);
271    
272     my $format = $self->_x($format_utf8) || $log->logconfess("can't convert '$format_utf8' from UTF-8 to ",$self->{'code_page'});
273    
274     my @out;
275    
276     $log->debug("format: $format");
277    
278     my $eval_code;
279     # remove eval{...} from beginning
280     $eval_code = $1 if ($format =~ s/^eval{([^}]+)}//s);
281    
282     my $filter_name;
283     # remove filter{...} from beginning
284     $filter_name = $1 if ($format =~ s/^filter{([^}]+)}//s);
285    
286     my $prefix;
287     my $all_found=0;
288    
289     while ($format =~ s/^(.*?)(v|s)(\d+)(?:\^(\w))?//s) {
290    
291     my $del = $1 || '';
292     $prefix ||= $del if ($all_found == 0);
293    
294     # repeatable index
295     my $r = $i;
296     $r = 0 if (lc("$2") eq 's');
297    
298     my $found = 0;
299     my $tmp = $self->get_data(\$rec,$3,$4,$r,\$found);
300    
301     if ($found) {
302     push @out, $del;
303     push @out, $tmp;
304     $all_found += $found;
305     }
306     }
307    
308     return if (! $all_found);
309    
310     my $out = join('',@out);
311    
312     if ($out) {
313     # add rest of format (suffix)
314     $out .= $format;
315    
316     # add prefix if not there
317     $out = $prefix . $out if ($out !~ m/^\Q$prefix\E/);
318    
319     $log->debug("result: $out");
320     }
321    
322     if ($eval_code) {
323     my $eval = $self->fill_in($rec,$eval_code,$i) || return;
324     $log->debug("about to eval{$eval} format: $out");
325     return if (! $self->_eval($eval));
326     }
327    
328     if ($filter_name && $self->{'filter'}->{$filter_name}) {
329     $log->debug("about to filter{$filter_name} format: $out");
330     $out = $self->{'filter'}->{$filter_name}->($out);
331     return unless(defined($out));
332     $log->debug("filter result: $out");
333     }
334    
335     return $out;
336     }
337    
338     =head2 parse_to_arr
339    
340     Similar to C<parse>, but returns array of all repeatable fields
341    
342     my @arr = $webpac->parse_to_arr($rec,'v250^a');
343    
344     =cut
345    
346     sub parse_to_arr {
347     my $self = shift;
348    
349     my ($rec, $format_utf8) = @_;
350    
351     my $log = $self->_get_logger();
352    
353     $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
354     return if (! $format_utf8);
355    
356     my $i = 0;
357     my @arr;
358    
359     while (my $v = $self->parse($rec,$format_utf8,$i++)) {
360     push @arr, $v;
361     }
362    
363     $log->debug("format '$format_utf8' returned ",--$i," elements: ", sub { join(" | ",@arr) }) if (@arr);
364    
365     return @arr;
366     }
367    
368 dpavlin 15
369     =head2 fill_in
370    
371     Workhourse of all: takes record from in-memory structure of database and
372     strings with placeholders and returns string or array of with substituted
373     values from record.
374    
375     my $text = $webpac->fill_in($rec,'v250^a');
376    
377     Optional argument is ordinal number for repeatable fields. By default,
378     it's assume to be first repeatable field (fields are perl array, so first
379     element is 0).
380     Following example will read second value from repeatable field.
381    
382     my $text = $webpac->fill_in($rec,'Title: v250^a',1);
383    
384     This function B<does not> perform parsing of format to inteligenty skip
385     delimiters before fields which aren't used.
386    
387     This method will automatically decode UTF-8 string to local code page
388     if needed.
389    
390     =cut
391    
392     sub fill_in {
393     my $self = shift;
394    
395     my $log = $self->_get_logger();
396    
397     my $rec = shift || $log->logconfess("need data record");
398     my $format = shift || $log->logconfess("need format to parse");
399     # iteration (for repeatable fields)
400     my $i = shift || 0;
401    
402     $log->logdie("infitite loop in format $format") if ($i > ($self->{'max_mfn'} || 9999));
403    
404     # FIXME remove for speedup?
405     $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
406    
407     if (utf8::is_utf8($format)) {
408     $format = $self->_x($format);
409     }
410    
411     my $found = 0;
412    
413     my $eval_code;
414     # remove eval{...} from beginning
415     $eval_code = $1 if ($format =~ s/^eval{([^}]+)}//s);
416    
417     my $filter_name;
418     # remove filter{...} from beginning
419     $filter_name = $1 if ($format =~ s/^filter{([^}]+)}//s);
420    
421     # do actual replacement of placeholders
422     # repeatable fields
423     $format =~ s/v(\d+)(?:\^(\w))?/$self->get_data(\$rec,$1,$2,$i,\$found)/ges;
424     # non-repeatable fields
425     $format =~ s/s(\d+)(?:\^(\w))?/$self->get_data(\$rec,$1,$2,0,\$found)/ges;
426    
427     if ($found) {
428     $log->debug("format: $format");
429     if ($eval_code) {
430     my $eval = $self->fill_in($rec,$eval_code,$i);
431     return if (! $self->_eval($eval));
432     }
433     if ($filter_name && $self->{'filter'}->{$filter_name}) {
434     $log->debug("filter '$filter_name' for $format");
435     $format = $self->{'filter'}->{$filter_name}->($format);
436     return unless(defined($format));
437     $log->debug("filter result: $format");
438     }
439     # do we have lookups?
440     if ($self->{'lookup'}) {
441     return $self->lookup($format);
442     } else {
443     return $format;
444     }
445     } else {
446     return;
447     }
448     }
449    
450    
451 dpavlin 13 =head2 fill_in_to_arr
452    
453     Similar to C<fill_in>, but returns array of all repeatable fields. Usable
454     for fields which have lookups, so they shouldn't be parsed but rather
455     C<fill_id>ed.
456    
457     my @arr = $webpac->fill_in_to_arr($rec,'[v900];;[v250^a]');
458    
459     =cut
460    
461     sub fill_in_to_arr {
462     my $self = shift;
463    
464     my ($rec, $format_utf8) = @_;
465    
466     my $log = $self->_get_logger();
467    
468     $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
469     return if (! $format_utf8);
470    
471     my $i = 0;
472     my @arr;
473    
474     while (my @v = $self->fill_in($rec,$format_utf8,$i++)) {
475     push @arr, @v;
476     }
477    
478     $log->debug("format '$format_utf8' returned ",--$i," elements: ", sub { join(" | ",@arr) }) if (@arr);
479    
480     return @arr;
481     }
482    
483 dpavlin 15
484     =head2 get_data
485    
486     Returns value from record.
487    
488     my $text = $self->get_data(\$rec,$f,$sf,$i,\$found);
489    
490     Arguments are:
491     record reference C<$rec>,
492     field C<$f>,
493     optional subfiled C<$sf>,
494     index for repeatable values C<$i>.
495    
496     Optinal variable C<$found> will be incremeted if there
497     is field.
498    
499     Returns value or empty string.
500    
501     =cut
502    
503     sub get_data {
504     my $self = shift;
505    
506     my ($rec,$f,$sf,$i,$found) = @_;
507    
508     if ($$rec->{$f}) {
509     return '' if (! $$rec->{$f}->[$i]);
510     no strict 'refs';
511     if ($sf && $$rec->{$f}->[$i]->{$sf}) {
512     $$found++ if (defined($$found));
513     return $$rec->{$f}->[$i]->{$sf};
514     } elsif ($$rec->{$f}->[$i]) {
515     $$found++ if (defined($$found));
516     # it still might have subfield, just
517     # not specified, so we'll dump all
518     if ($$rec->{$f}->[$i] =~ /HASH/o) {
519     my $out;
520     foreach my $k (keys %{$$rec->{$f}->[$i]}) {
521     $out .= $$rec->{$f}->[$i]->{$k}." ";
522     }
523     return $out;
524     } else {
525     return $$rec->{$f}->[$i];
526     }
527     }
528     } else {
529     return '';
530     }
531     }
532    
533    
534     =head2 apply_format
535    
536     Apply format specified in tag with C<format_name="name"> and
537     C<format_delimiter=";;">.
538    
539     my $text = $webpac->apply_format($format_name,$format_delimiter,$data);
540    
541     Formats can contain C<lookup{...}> if you need them.
542    
543     =cut
544    
545     sub apply_format {
546     my $self = shift;
547    
548     my ($name,$delimiter,$data) = @_;
549    
550     my $log = $self->_get_logger();
551    
552     if (! $self->{'import_xml'}->{'format'}->{$name}) {
553     $log->warn("<format name=\"$name\"> is not defined in ",$self->{'import_xml_file'});
554     return $data;
555     }
556    
557     $log->warn("no delimiter for format $name") if (! $delimiter);
558    
559     my $format = $self->_x($self->{'import_xml'}->{'format'}->{$name}->{'content'}) || $log->logdie("can't find format '$name'");
560    
561     my @data = split(/\Q$delimiter\E/, $data);
562    
563     my $out = sprintf($format, @data);
564     $log->debug("using format $name [$format] on $data to produce: $out");
565    
566     if ($self->{'lookup_regex'} && $out =~ $self->{'lookup_regex'}) {
567     return $self->lookup($out);
568     } else {
569     return $out;
570     }
571    
572     }
573    
574 dpavlin 13 =head2 sort_arr
575    
576     Sort array ignoring case and html in data
577    
578     my @sorted = $webpac->sort_arr(@unsorted);
579    
580     =cut
581    
582     sub sort_arr {
583     my $self = shift;
584    
585     my $log = $self->_get_logger();
586    
587     # FIXME add Schwartzian Transformation?
588    
589     my @sorted = sort {
590     $a =~ s#<[^>]+/*>##;
591     $b =~ s#<[^>]+/*>##;
592     lc($b) cmp lc($a)
593     } @_;
594     $log->debug("sorted values: ",sub { join(", ",@sorted) });
595    
596     return @sorted;
597     }
598    
599    
600 dpavlin 15 =head1 INTERNAL METHODS
601    
602 dpavlin 13 =head2 _sort_by_order
603    
604     Sort xml tags data structure accoding to C<order=""> attribute.
605    
606     =cut
607    
608     sub _sort_by_order {
609     my $self = shift;
610    
611     my $va = $self->{'import_xml'}->{'indexer'}->{$a}->{'order'} ||
612     $self->{'import_xml'}->{'indexer'}->{$a};
613     my $vb = $self->{'import_xml'}->{'indexer'}->{$b}->{'order'} ||
614     $self->{'import_xml'}->{'indexer'}->{$b};
615    
616     return $va <=> $vb;
617     }
618    
619     =head2 _x
620    
621 dpavlin 15 Convert strings from C<conf/normalize/*.xml> encoding into application
622     specific encoding (optinally specified using C<code_page> to C<new>
623     constructor).
624 dpavlin 13
625     my $text = $n->_x('normalize text string');
626    
627     This is a stub so that other modules doesn't have to implement it.
628    
629     =cut
630    
631     sub _x {
632     my $self = shift;
633     return shift;
634     }
635    
636    
637 dpavlin 10 =head1 AUTHOR
638    
639     Dobrica Pavlinusic, C<< <dpavlin@rot13.org> >>
640    
641     =head1 COPYRIGHT & LICENSE
642    
643     Copyright 2005 Dobrica Pavlinusic, All Rights Reserved.
644    
645     This program is free software; you can redistribute it and/or modify it
646     under the same terms as Perl itself.
647    
648     =cut
649    
650     1; # End of WebPAC::DB

  ViewVC Help
Powered by ViewVC 1.1.26