/[webpac2]/trunk/lib/WebPAC/Lookup/Normalize.pm
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Contents of /trunk/lib/WebPAC/Lookup/Normalize.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 536 - (show annotations)
Mon Jun 26 16:39:51 2006 UTC (17 years, 9 months ago) by dpavlin
File size: 18439 byte(s)
 r719@llin:  dpavlin | 2006-06-26 18:40:57 +0200
 big refacture: depriciate and remove all normalisation formats except .pl sets (but
 old code is still available in WebPAC::Lookup::Normalize because lookups use it) [2.20]

1 package WebPAC::Lookup::Normalize;
2
3 use warnings;
4 use strict;
5 use blib;
6 use WebPAC::Common;
7 use base 'WebPAC::Common';
8 use Data::Dumper;
9
10 =head1 NAME
11
12 WebPAC::Lookup::Normalize - data mungling for normalisation
13
14 =head1 VERSION
15
16 Version 0.09
17
18 =cut
19
20 our $VERSION = '0.09';
21
22 =head1 SYNOPSIS
23
24 This package contains code that mungle data to produce normalized format.
25
26 B<WARNING:>
27
28 This code is obsolete. It moved to here so that I don't have to re-write
29 L<WebPAC::Lookup> to use set configuration files (using L<WebPAC::Normalize>)
30 just yet. But it will dissapear real soon!
31
32 It contains several assumptions:
33
34 =over
35
36 =item *
37
38 format of fields is defined using C<v123^a> notation for repeatable fields
39 or C<s123^a> for single (or first) value, where C<123> is field number and
40 C<a> is subfield.
41
42 =item *
43
44 source data records (C<$rec>) have unique identifiers in field C<000>
45
46 =item *
47
48 optional C<eval{length('v123^a') == 3}> tag at B<beginning of format> will be
49 perl code that is evaluated before producing output (value of field will be
50 interpolated before that)
51
52 =item *
53
54 optional C<filter{filter_name}> at B<begining of format> will apply perl
55 code defined as code ref on format after field substitution to producing
56 output
57
58 There is one built-in filter called C<regex> which can be use like this:
59
60 filter{regex(s/foo/bar/)}
61
62 =item *
63
64 optional C<lookup{...}> will be then performed. See C<WebPAC::Lookups>.
65
66 =item *
67
68 at end, optional C<format>s rules are resolved. Format rules are similar to
69 C<sprintf> and can also contain C<lookup{...}> which is performed after
70 values are inserted in format.
71
72 =back
73
74 This also describes order in which transformations are applied (eval,
75 filter, lookup, format) which is important to undestand when deciding how to
76 solve your data mungling and normalisation process.
77
78
79
80
81 =head1 FUNCTIONS
82
83 =head2 new
84
85 Create new normalisation object
86
87 my $n = new WebPAC::Lookup::Normalize::Something(
88 filter => {
89 'filter_name_1' => sub {
90 # filter code
91 return length($_);
92 }, ...
93 },
94 db => $db_obj,
95 lookup_regex => $lookup->regex,
96 lookup => $lookup_obj,
97 prefix => 'foobar',
98 );
99
100 Parametar C<filter> defines user supplied snippets of perl code which can
101 be use with C<filter{...}> notation.
102
103 C<prefix> is used to form filename for database record (to support multiple
104 source files which are joined in one database).
105
106 Recommended parametar C<lookup_regex> is used to enable parsing of lookups
107 in structures. If you pass this parametar, you must also pass C<lookup>
108 which is C<WebPAC::Lookup> object.
109
110 =cut
111
112 sub new {
113 my $class = shift;
114 my $self = {@_};
115 bless($self, $class);
116
117 my $r = $self->{'lookup_regex'} ? 1 : 0;
118 my $l = $self->{'lookup'} ? 1 : 0;
119
120 my $log = $self->_get_logger();
121
122 # those two must be in pair
123 if ( ($r & $l) != ($r || $l) ) {
124 my $log = $self->_get_logger();
125 $log->logdie("lookup_regex and lookup must be in pair");
126 }
127
128 $log->logdie("lookup must be WebPAC::Lookup object") if ($self->{'lookup'} && ! $self->{'lookup'}->isa('WebPAC::Lookup'));
129
130 $log->warn("no prefix defined. please check that!") unless ($self->{'prefix'});
131
132 $log->debug("using lookup regex: ", $self->{lookup_regex}) if ($r && $l);
133
134 if (! $self->{filter} || ! $self->{filter}->{regex}) {
135 $log->debug("adding built-in filter regex");
136 $self->{filter}->{regex} = sub {
137 my ($val, $regex) = @_;
138 eval "\$val =~ $regex";
139 return $val;
140 };
141 }
142
143 $self ? return $self : return undef;
144 }
145
146 =head2 all_tags
147
148 Returns all tags in document in specified order
149
150 my $sorted_tags = $self->all_tags();
151
152 =cut
153
154 sub all_tags {
155 my $self = shift;
156
157 if (! $self->{_tags_by_order}) {
158
159 my $log = $self->_get_logger;
160 # sanity check
161 $log->logdie("can't find self->{inport_xml}->{indexer}") unless ($self->{import_xml}->{indexer});
162
163 my @tags = keys %{ $self->{'import_xml'}->{'indexer'}};
164 $log->debug("unsorted tags: " . join(", ", @tags));
165
166 @tags = sort { $self->_sort_by_order } @tags;
167
168 $log->debug("sorted tags: " . join(",", @tags) );
169
170 $self->{_tags_by_order} = \@tags;
171 }
172
173 return $self->{_tags_by_order};
174 }
175
176
177
178 =head2 data_structure
179
180 Create in-memory data structure which represents normalized layout from
181 C<conf/normalize/*.xml>.
182
183 This structures are used to produce output.
184
185 my $ds = $webpac->data_structure($rec);
186
187 =cut
188
189 sub data_structure {
190 my $self = shift;
191
192 my $log = $self->_get_logger();
193
194 my $rec = shift;
195 $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
196
197 $log->debug("data_structure rec = ", sub { Dumper($rec) });
198
199 $log->logdie("need unique ID (mfn) in field 000 of record " . Dumper($rec) ) unless (defined($rec->{'000'}));
200
201 my $id = $rec->{'000'}->[0] || $log->logdie("field 000 isn't array!");
202
203 my $cache_file;
204
205 if ($self->{'db'}) {
206 my $ds = $self->{'db'}->load_ds( id => $id, prefix => $self->{prefix} );
207 $log->debug("load_ds( rec = ", sub { Dumper($rec) }, ") = ", sub { Dumper($ds) });
208 return $ds if ($ds);
209 $log->debug("cache miss, creating");
210 }
211
212 my $tags = $self->all_tags();
213
214 $log->debug("tags: ",sub { join(", ",@{ $tags }) });
215
216 my $ds;
217
218 foreach my $field (@{ $tags }) {
219
220 my $row;
221
222 #print "field $field [",$self->{'tag'},"] = ",Dumper($self->{'import_xml'}->{'indexer'}->{$field}->{$self->{'tag'}});
223
224 foreach my $tag (@{$self->{'import_xml'}->{'indexer'}->{$field}->{$self->{'tag'}}}) {
225 my $format;
226
227 $log->logdie("expected tag HASH and got $tag") unless (ref($tag) eq 'HASH');
228 $format = $tag->{'value'} || $tag->{'content'};
229
230 my @v;
231 if ($self->{'lookup_regex'} && $format =~ $self->{'lookup_regex'}) {
232 @v = $self->_rec_to_arr($rec,$format,'fill_in');
233 } else {
234 @v = $self->_rec_to_arr($rec,$format,'parse');
235 }
236 if (! @v) {
237 $log->debug("$field <",$self->{tag},"> format: $format no values");
238 next;
239 } else {
240 $log->debug("$field <",$self->{tag},"> format: $format values: ", join(",", @v));
241 }
242
243 if ($tag->{'sort'}) {
244 @v = $self->sort_arr(@v);
245 }
246
247 # use format?
248 if ($tag->{'format_name'}) {
249 @v = map { $self->apply_format($tag->{'format_name'},$tag->{'format_delimiter'},$_) } @v;
250 }
251
252 # delimiter will join repeatable fields
253 if ($tag->{'delimiter'}) {
254 @v = ( join($tag->{'delimiter'}, @v) );
255 }
256
257 # default types
258 my @types = qw(display search);
259 # override by type attribute
260 @types = ( $tag->{'type'} ) if ($tag->{'type'});
261
262 foreach my $type (@types) {
263 # append to previous line?
264 $log->debug("tag $field / $type [",sub { join(",",@v) }, "] ", $row->{'append'} || 'no append');
265 if ($tag->{'append'}) {
266
267 # I will delimit appended part with
268 # delimiter (or ,)
269 my $d = $tag->{'delimiter'};
270 # default delimiter
271 $d ||= " ";
272
273 my $last = pop @{$row->{$type}};
274 $d = "" if (! $last);
275 $last .= $d . join($d, @v);
276 push @{$row->{$type}}, $last;
277
278 } else {
279 push @{$row->{$type}}, @v;
280 }
281 }
282
283
284 }
285
286 if ($row) {
287 $row->{'tag'} = $field;
288
289 # TODO: name_sigular, name_plural
290 my $name = $self->{'import_xml'}->{'indexer'}->{$field}->{'name'};
291 my $row_name = $name ? $self->_x($name) : $field;
292
293 # post-sort all values in field
294 if ($self->{'import_xml'}->{'indexer'}->{$field}->{'sort'}) {
295 $log->warn("sort at field tag not implemented");
296 }
297
298 $ds->{$row_name} = $row;
299
300 $log->debug("row $field: ",sub { Dumper($row) });
301 }
302
303 }
304
305 $self->{'db'}->save_ds(
306 id => $id,
307 ds => $ds,
308 prefix => $self->{prefix},
309 ) if ($self->{'db'});
310
311 $log->debug("ds: ", sub { Dumper($ds) });
312
313 $log->logconfess("data structure returned is not array any more!") if wantarray;
314
315 return $ds;
316
317 }
318
319 =head2 parse
320
321 Perform smart parsing of string, skipping delimiters for fields which aren't
322 defined. It can also eval code in format starting with C<eval{...}> and
323 return output or nothing depending on eval code.
324
325 my $text = $webpac->parse($rec,'eval{"v901^a" eq "Deskriptor"}descriptor: v250^a', $i);
326
327 Filters are implemented here. While simple form of filters looks like this:
328
329 filter{name_of_filter}
330
331 but, filters can also have variable number of parametars like this:
332
333 filter{name_of_filter(param,param,param)}
334
335 =cut
336
337 my $warn_once;
338
339 sub parse {
340 my $self = shift;
341
342 my ($rec, $format_utf8, $i, $rec_size) = @_;
343
344 return if (! $format_utf8);
345
346 my $log = $self->_get_logger();
347
348 $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
349
350 $i = 0 if (! $i);
351
352 my $format = $self->_x($format_utf8) || $log->logconfess("can't convert '$format_utf8' from UTF-8 to ",$self->{'code_page'});
353
354 my @out;
355
356 $log->debug("format: $format [$i]");
357
358 my $eval_code;
359 # remove eval{...} from beginning
360 $eval_code = $1 if ($format =~ s/^eval{([^}]+)}//s);
361
362 my $filter_name;
363 # remove filter{...} from beginning
364 $filter_name = $1 if ($format =~ s/^filter{([^}]+)}//s);
365
366 # did we found any (att all) field from format in row?
367 my $found_any;
368 # prefix before first field which we preserve it $found_any
369 my $prefix;
370
371 my $f_step = 1;
372
373 while ($format =~ s/^(.*?)(v|s)(\d+)(?:\^(\w))?//s) {
374
375 my $del = $1 || '';
376 $prefix = $del if ($f_step == 1);
377
378 my $fld_type = lc($2);
379
380 # repeatable index
381 my $r = $i;
382 if ($fld_type eq 's') {
383 if ($found_any->{'v'}) {
384 $r = 0;
385 } else {
386 return;
387 }
388 }
389
390 my $found = 0;
391 my $tmp = $self->get_data(\$rec,$3,$4,$r,\$found,$rec_size);
392
393 if ($found) {
394 $found_any->{$fld_type} += $found;
395
396 # we will skip delimiter before first occurence of field!
397 push @out, $del unless($found_any->{$fld_type} == 1);
398 push @out, $tmp if ($tmp);
399 }
400 $f_step++;
401 }
402
403 # test if any fields found?
404 return if (! $found_any->{'v'} && ! $found_any->{'s'});
405
406 my $out = join('',@out);
407
408 if ($out) {
409 # add rest of format (suffix)
410 $out .= $format;
411
412 # add prefix if not there
413 $out = $prefix . $out if ($out !~ m/^\Q$prefix\E/);
414
415 $log->debug("result: $out");
416 }
417
418 if ($eval_code) {
419 my $eval = $self->fill_in($rec,$eval_code,$i) || return;
420 $log->debug("about to eval{$eval} format: $out");
421 return if (! $self->_eval($eval));
422 }
423
424 if ($filter_name) {
425 my @filter_args;
426 if ($filter_name =~ s/(\w+)\((.*)\)/$1/) {
427 @filter_args = split(/,/, $2);
428 }
429 if ($self->{'filter'}->{$filter_name}) {
430 $log->debug("about to filter{$filter_name} format: $out with arguments: ", join(",", @filter_args));
431 unshift @filter_args, $out;
432 $out = $self->{'filter'}->{$filter_name}->(@filter_args);
433 return unless(defined($out));
434 $log->debug("filter result: $out");
435 } elsif (! $warn_once->{$filter_name}) {
436 $log->warn("trying to use undefined filter $filter_name");
437 $warn_once->{$filter_name}++;
438 }
439 }
440
441 return $out;
442 }
443
444 =head2 fill_in
445
446 Workhourse of all: takes record from in-memory structure of database and
447 strings with placeholders and returns string or array of with substituted
448 values from record.
449
450 my $text = $webpac->fill_in($rec,'v250^a');
451
452 Optional argument is ordinal number for repeatable fields. By default,
453 it's assume to be first repeatable field (fields are perl array, so first
454 element is 0).
455 Following example will read second value from repeatable field.
456
457 my $text = $webpac->fill_in($rec,'Title: v250^a',1);
458
459 This function B<does not> perform parsing of format to inteligenty skip
460 delimiters before fields which aren't used.
461
462 This method will automatically decode UTF-8 string to local code page
463 if needed.
464
465 There is optional parametar C<$record_size> which can be used to get sizes of
466 all C<field^subfield> combinations in this format.
467
468 my $text = $webpac->fill_in($rec,'got: v900^a v900^x',0,\$rec_size);
469
470 =cut
471
472 sub fill_in {
473 my $self = shift;
474
475 my $log = $self->_get_logger();
476
477 my ($rec,$format,$i,$rec_size) = @_;
478
479 $log->logconfess("need data record") unless ($rec);
480 $log->logconfess("need format to parse") unless($format);
481
482 # iteration (for repeatable fields)
483 $i ||= 0;
484
485 $log->logdie("infitite loop in format $format") if ($i > ($self->{'max_mfn'} || 9999));
486
487 # FIXME remove for speedup?
488 $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
489
490 if (utf8::is_utf8($format)) {
491 $format = $self->_x($format);
492 }
493
494 my $found = 0;
495 my $just_single = 1;
496
497 my $eval_code;
498 # remove eval{...} from beginning
499 $eval_code = $1 if ($format =~ s/^eval{([^}]+)}//s);
500
501 my $filter_name;
502 # remove filter{...} from beginning
503 $filter_name = $1 if ($format =~ s/^filter{([^}]+)}//s);
504
505 {
506 # fix warnings
507 no warnings 'uninitialized';
508
509 # do actual replacement of placeholders
510 # repeatable fields
511 if ($format =~ s/v(\d+)(?:\^(\w))?/$self->get_data(\$rec,$1,$2,$i,\$found,$rec_size)/ges) {
512 $just_single = 0;
513 }
514
515 # non-repeatable fields
516 if ($format =~ s/s(\d+)(?:\^(\w))?/$self->get_data(\$rec,$1,$2,0,\$found,$rec_size)/ges) {
517 return if ($i > 0 && $just_single);
518 }
519 }
520
521 if ($found) {
522 $log->debug("format: $format");
523 if ($eval_code) {
524 my $eval = $self->fill_in($rec,$eval_code,$i);
525 return if (! $self->_eval($eval));
526 }
527 if ($filter_name && $self->{'filter'}->{$filter_name}) {
528 $log->debug("filter '$filter_name' for $format");
529 $format = $self->{'filter'}->{$filter_name}->($format);
530 return unless(defined($format));
531 $log->debug("filter result: $format");
532 }
533 # do we have lookups?
534 if ($self->{'lookup'}) {
535 if ($self->{'lookup'}->can('lookup')) {
536 my @lookup = $self->{lookup}->lookup($format);
537 $log->debug("lookup $format", join(", ", @lookup));
538 return @lookup;
539 } else {
540 $log->warn("Have lookup object but can't invoke lookup method");
541 }
542 } else {
543 return $format;
544 }
545 } else {
546 return;
547 }
548 }
549
550
551 =head2 _rec_to_arr
552
553 Similar to C<parse> and C<fill_in>, but returns array of all repeatable fields. Usable
554 for fields which have lookups, so they shouldn't be parsed but rather
555 C<paste>d or C<fill_id>ed. Last argument is name of operation: C<paste> or C<fill_in>.
556
557 my @arr = $webpac->fill_in_to_arr($rec,'[v900];;[v250^a]','paste');
558
559 =cut
560
561 sub _rec_to_arr {
562 my $self = shift;
563
564 my ($rec, $format_utf8, $code) = @_;
565
566 my $log = $self->_get_logger();
567
568 $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
569 return if (! $format_utf8);
570
571 $log->debug("using $code on $format_utf8");
572
573 my $i = 0;
574 my $max = 0;
575 my @arr;
576 my $rec_size = {};
577
578 while ($i <= $max) {
579 my @v = $self->$code($rec,$format_utf8,$i++,\$rec_size);
580 if ($rec_size) {
581 foreach my $f (keys %{ $rec_size }) {
582 $max = $rec_size->{$f} if ($rec_size->{$f} > $max);
583 }
584 $log->debug("max set to $max");
585 undef $rec_size;
586 }
587 if (@v) {
588 push @arr, @v;
589 } else {
590 push @arr, '' if ($max > $i);
591 }
592 }
593
594 $log->debug("format '$format_utf8' returned ",--$i," elements: ", sub { join(" | ",@arr) }) if (@arr);
595
596 return @arr;
597 }
598
599
600 =head2 get_data
601
602 Returns value from record.
603
604 my $text = $self->get_data(\$rec,$f,$sf,$i,\$found,\$rec_size);
605
606 Required arguments are:
607
608 =over 8
609
610 =item C<$rec>
611
612 record reference
613
614 =item C<$f>
615
616 field
617
618 =item C<$sf>
619
620 optional subfield
621
622 =item C<$i>
623
624 index offset for repeatable values ( 0 ... $rec_size->{'400^a'} )
625
626 =item C<$found>
627
628 optional variable that will be incremeted if preset
629
630 =item C<$rec_size>
631
632 hash to hold maximum occurances of C<field^subfield> combinations
633 (which can be accessed using keys in same format)
634
635 =back
636
637 Returns value or empty string, updates C<$found> and C<rec_size>
638 if present.
639
640 =cut
641
642 sub get_data {
643 my $self = shift;
644
645 my ($rec,$f,$sf,$i,$found,$cache) = @_;
646
647 return '' unless ($$rec->{$f} && ref($$rec->{$f}) eq 'ARRAY');
648
649 if (defined($$cache)) {
650 $$cache->{ $f . ( $sf ? '^' . $sf : '' ) } ||= scalar @{ $$rec->{$f} };
651 }
652
653 return '' unless ($$rec->{$f}->[$i]);
654
655 {
656 no strict 'refs';
657 if (defined($sf)) {
658 $$found++ if (defined($$found) && $$rec->{$f}->[$i]->{$sf});
659 return $$rec->{$f}->[$i]->{$sf};
660 } else {
661 $$found++ if (defined($$found));
662 # it still might have subfields, just
663 # not specified, so we'll dump some debug info
664 if ($$rec->{$f}->[$i] =~ /HASH/o) {
665 my $out;
666 foreach my $k (keys %{$$rec->{$f}->[$i]}) {
667 my $v = $$rec->{$f}->[$i]->{$k};
668 $out .= '$' . $k .':' . $v if ($v);
669 }
670 return $out;
671 } else {
672 return $$rec->{$f}->[$i];
673 }
674 }
675 }
676 }
677
678
679 =head2 apply_format
680
681 Apply format specified in tag with C<format_name="name"> and
682 C<format_delimiter=";;">.
683
684 my $text = $webpac->apply_format($format_name,$format_delimiter,$data);
685
686 Formats can contain C<lookup{...}> if you need them.
687
688 =cut
689
690 sub apply_format {
691 my $self = shift;
692
693 my ($name,$delimiter,$data) = @_;
694
695 my $log = $self->_get_logger();
696
697 if (! $self->{'import_xml'}->{'format'}->{$name}) {
698 $log->warn("<format name=\"$name\"> is not defined in ",$self->{'import_xml_file'});
699 return $data;
700 }
701
702 $log->warn("no delimiter for format $name") if (! $delimiter);
703
704 my $format = $self->_x($self->{'import_xml'}->{'format'}->{$name}->{'content'}) || $log->logdie("can't find format '$name'");
705
706 my @data = split(/\Q$delimiter\E/, $data);
707
708 my $out = sprintf($format, @data);
709 $log->debug("using format $name [$format] on $data to produce: $out");
710
711 if ($self->{'lookup_regex'} && $out =~ $self->{'lookup_regex'}) {
712 return $self->{'lookup'}->lookup($out);
713 } else {
714 return $out;
715 }
716
717 }
718
719 =head2 sort_arr
720
721 Sort array ignoring case and html in data
722
723 my @sorted = $webpac->sort_arr(@unsorted);
724
725 =cut
726
727 sub sort_arr {
728 my $self = shift;
729
730 my $log = $self->_get_logger();
731
732 # FIXME add Schwartzian Transformation?
733
734 my @sorted = sort {
735 $a =~ s#<[^>]+/*>##;
736 $b =~ s#<[^>]+/*>##;
737 lc($b) cmp lc($a)
738 } @_;
739 $log->debug("sorted values: ",sub { join(", ",@sorted) });
740
741 return @sorted;
742 }
743
744
745 =head1 INTERNAL METHODS
746
747 =head2 _sort_by_order
748
749 Sort xml tags data structure accoding to C<order=""> attribute.
750
751 =cut
752
753 sub _sort_by_order {
754 my $self = shift;
755
756 my $va = $self->{'import_xml'}->{'indexer'}->{$a}->{'order'} ||
757 $self->{'import_xml'}->{'indexer'}->{$a};
758 my $vb = $self->{'import_xml'}->{'indexer'}->{$b}->{'order'} ||
759 $self->{'import_xml'}->{'indexer'}->{$b};
760
761 return $va <=> $vb;
762 }
763
764 =head2 _x
765
766 Convert strings from C<conf/normalize/*.xml> encoding into application
767 specific encoding (optinally specified using C<code_page> to C<new>
768 constructor).
769
770 my $text = $n->_x('normalize text string');
771
772 This is a stub so that other modules doesn't have to implement it.
773
774 =cut
775
776 sub _x {
777 my $self = shift;
778 return shift;
779 }
780
781
782 =head1 AUTHOR
783
784 Dobrica Pavlinusic, C<< <dpavlin@rot13.org> >>
785
786 =head1 COPYRIGHT & LICENSE
787
788 Copyright 2005 Dobrica Pavlinusic, All Rights Reserved.
789
790 This program is free software; you can redistribute it and/or modify it
791 under the same terms as Perl itself.
792
793 =cut
794
795 1; # End of WebPAC::Lookup::Normalize

  ViewVC Help
Powered by ViewVC 1.1.26