/[webpac2]/trunk/lib/WebPAC/Normalize.pm
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Contents of /trunk/lib/WebPAC/Normalize.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 433 - (show annotations)
Mon Apr 17 16:01:12 2006 UTC (16 years, 9 months ago) by dpavlin
File size: 18168 byte(s)
 r524@llin:  dpavlin | 2006-04-17 18:01:04 +0200
 added all_tags() to get sorted list of all tags in input xml

1 package WebPAC::Normalize;
2
3 use warnings;
4 use strict;
5 use blib;
6 use WebPAC::Common;
7 use base 'WebPAC::Common';
8 use Data::Dumper;
9
10 =head1 NAME
11
12 WebPAC::Normalize - data mungling for normalisation
13
14 =head1 VERSION
15
16 Version 0.09
17
18 =cut
19
20 our $VERSION = '0.09';
21
22 =head1 SYNOPSIS
23
24 This package contains code that mungle data to produce normalized format.
25
26 It contains several assumptions:
27
28 =over
29
30 =item *
31
32 format of fields is defined using C<v123^a> notation for repeatable fields
33 or C<s123^a> for single (or first) value, where C<123> is field number and
34 C<a> is subfield.
35
36 =item *
37
38 source data records (C<$rec>) have unique identifiers in field C<000>
39
40 =item *
41
42 optional C<eval{length('v123^a') == 3}> tag at B<beginning of format> will be
43 perl code that is evaluated before producing output (value of field will be
44 interpolated before that)
45
46 =item *
47
48 optional C<filter{filter_name}> at B<begining of format> will apply perl
49 code defined as code ref on format after field substitution to producing
50 output
51
52 There is one built-in filter called C<regex> which can be use like this:
53
54 filter{regex(s/foo/bar/)}
55
56 =item *
57
58 optional C<lookup{...}> will be then performed. See C<WebPAC::Lookups>.
59
60 =item *
61
62 at end, optional C<format>s rules are resolved. Format rules are similar to
63 C<sprintf> and can also contain C<lookup{...}> which is performed after
64 values are inserted in format.
65
66 =back
67
68 This also describes order in which transformations are applied (eval,
69 filter, lookup, format) which is important to undestand when deciding how to
70 solve your data mungling and normalisation process.
71
72
73
74
75 =head1 FUNCTIONS
76
77 =head2 new
78
79 Create new normalisation object
80
81 my $n = new WebPAC::Normalize::Something(
82 filter => {
83 'filter_name_1' => sub {
84 # filter code
85 return length($_);
86 }, ...
87 },
88 db => $db_obj,
89 lookup_regex => $lookup->regex,
90 lookup => $lookup_obj,
91 prefix => 'foobar',
92 );
93
94 Parametar C<filter> defines user supplied snippets of perl code which can
95 be use with C<filter{...}> notation.
96
97 C<prefix> is used to form filename for database record (to support multiple
98 source files which are joined in one database).
99
100 Recommended parametar C<lookup_regex> is used to enable parsing of lookups
101 in structures. If you pass this parametar, you must also pass C<lookup>
102 which is C<WebPAC::Lookup> object.
103
104 =cut
105
106 sub new {
107 my $class = shift;
108 my $self = {@_};
109 bless($self, $class);
110
111 my $r = $self->{'lookup_regex'} ? 1 : 0;
112 my $l = $self->{'lookup'} ? 1 : 0;
113
114 my $log = $self->_get_logger();
115
116 # those two must be in pair
117 if ( ($r & $l) != ($r || $l) ) {
118 my $log = $self->_get_logger();
119 $log->logdie("lookup_regex and lookup must be in pair");
120 }
121
122 $log->logdie("lookup must be WebPAC::Lookup object") if ($self->{'lookup'} && ! $self->{'lookup'}->isa('WebPAC::Lookup'));
123
124 $log->warn("no prefix defined. please check that!") unless ($self->{'prefix'});
125
126 $log->debug("using lookup regex: ", $self->{lookup_regex}) if ($r && $l);
127
128 if (! $self->{filter} || ! $self->{filter}->{regex}) {
129 $log->debug("adding built-in filter regex");
130 $self->{filter}->{regex} = sub {
131 my ($val, $regex) = @_;
132 eval "\$val =~ $regex";
133 return $val;
134 };
135 }
136
137 $self ? return $self : return undef;
138 }
139
140 =head2 all_tags
141
142 Returns all tags in document in specified order
143
144 my $sorted_tags = $self->all_tags();
145
146 =cut
147
148 sub all_tags {
149 my $self = shift;
150
151 if (! $self->{_tags_by_order}) {
152
153 my $log = $self->_get_logger;
154 # sanity check
155 $log->logdie("can't find self->{inport_xml}->{indexer}") unless ($self->{import_xml}->{indexer});
156
157 my @tags = keys %{ $self->{'import_xml'}->{'indexer'}};
158 $log->debug("unsorted tags: " . join(", ", @tags));
159
160 @tags = sort { $self->_sort_by_order } @tags;
161
162 $log->debug("sorted tags: " . join(",", @tags) );
163
164 $self->{_tags_by_order} = \@tags;
165 }
166
167 return $self->{_tags_by_order};
168 }
169
170
171
172 =head2 data_structure
173
174 Create in-memory data structure which represents normalized layout from
175 C<conf/normalize/*.xml>.
176
177 This structures are used to produce output.
178
179 my $ds = $webpac->data_structure($rec);
180
181 =cut
182
183 sub data_structure {
184 my $self = shift;
185
186 my $log = $self->_get_logger();
187
188 my $rec = shift;
189 $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
190
191 $log->debug("data_structure rec = ", sub { Dumper($rec) });
192
193 $log->logdie("need unique ID (mfn) in field 000 of record " . Dumper($rec) ) unless (defined($rec->{'000'}));
194
195 my $id = $rec->{'000'}->[0] || $log->logdie("field 000 isn't array!");
196
197 my $cache_file;
198
199 if ($self->{'db'}) {
200 my $ds = $self->{'db'}->load_ds( id => $id, prefix => $self->{prefix} );
201 $log->debug("load_ds( rec = ", sub { Dumper($rec) }, ") = ", sub { Dumper($ds) });
202 return $ds if ($ds);
203 $log->debug("cache miss, creating");
204 }
205
206 my $tags = $self->all_tags();
207
208 $log->debug("tags: ",sub { join(", ",@{ $tags }) });
209
210 my $ds;
211
212 foreach my $field (@{ $tags }) {
213
214 my $row;
215
216 #print "field $field [",$self->{'tag'},"] = ",Dumper($self->{'import_xml'}->{'indexer'}->{$field}->{$self->{'tag'}});
217
218 foreach my $tag (@{$self->{'import_xml'}->{'indexer'}->{$field}->{$self->{'tag'}}}) {
219 my $format;
220
221 $log->logdie("expected tag HASH and got $tag") unless (ref($tag) eq 'HASH');
222 $format = $tag->{'value'} || $tag->{'content'};
223
224 my @v;
225 if ($self->{'lookup_regex'} && $format =~ $self->{'lookup_regex'}) {
226 @v = $self->_rec_to_arr($rec,$format,'fill_in');
227 } else {
228 @v = $self->_rec_to_arr($rec,$format,'parse');
229 }
230 if (! @v) {
231 $log->debug("$field <",$self->{tag},"> format: $format no values");
232 next;
233 } else {
234 $log->debug("$field <",$self->{tag},"> format: $format values: ", join(",", @v));
235 }
236
237 if ($tag->{'sort'}) {
238 @v = $self->sort_arr(@v);
239 }
240
241 # use format?
242 if ($tag->{'format_name'}) {
243 @v = map { $self->apply_format($tag->{'format_name'},$tag->{'format_delimiter'},$_) } @v;
244 }
245
246 # delimiter will join repeatable fields
247 if ($tag->{'delimiter'}) {
248 @v = ( join($tag->{'delimiter'}, @v) );
249 }
250
251 # default types
252 my @types = qw(display search);
253 # override by type attribute
254 @types = ( $tag->{'type'} ) if ($tag->{'type'});
255
256 foreach my $type (@types) {
257 # append to previous line?
258 $log->debug("tag $field / $type [",sub { join(",",@v) }, "] ", $row->{'append'} || 'no append');
259 if ($tag->{'append'}) {
260
261 # I will delimit appended part with
262 # delimiter (or ,)
263 my $d = $tag->{'delimiter'};
264 # default delimiter
265 $d ||= " ";
266
267 my $last = pop @{$row->{$type}};
268 $d = "" if (! $last);
269 $last .= $d . join($d, @v);
270 push @{$row->{$type}}, $last;
271
272 } else {
273 push @{$row->{$type}}, @v;
274 }
275 }
276
277
278 }
279
280 if ($row) {
281 $row->{'tag'} = $field;
282
283 # TODO: name_sigular, name_plural
284 my $name = $self->{'import_xml'}->{'indexer'}->{$field}->{'name'};
285 my $row_name = $name ? $self->_x($name) : $field;
286
287 # post-sort all values in field
288 if ($self->{'import_xml'}->{'indexer'}->{$field}->{'sort'}) {
289 $log->warn("sort at field tag not implemented");
290 }
291
292 $ds->{$row_name} = $row;
293
294 $log->debug("row $field: ",sub { Dumper($row) });
295 }
296
297 }
298
299 $self->{'db'}->save_ds(
300 id => $id,
301 ds => $ds,
302 prefix => $self->{prefix},
303 ) if ($self->{'db'});
304
305 $log->debug("ds: ", sub { Dumper($ds) });
306
307 $log->logconfess("data structure returned is not array any more!") if wantarray;
308
309 return $ds;
310
311 }
312
313 =head2 parse
314
315 Perform smart parsing of string, skipping delimiters for fields which aren't
316 defined. It can also eval code in format starting with C<eval{...}> and
317 return output or nothing depending on eval code.
318
319 my $text = $webpac->parse($rec,'eval{"v901^a" eq "Deskriptor"}descriptor: v250^a', $i);
320
321 Filters are implemented here. While simple form of filters looks like this:
322
323 filter{name_of_filter}
324
325 but, filters can also have variable number of parametars like this:
326
327 filter{name_of_filter(param,param,param)}
328
329 =cut
330
331 my $warn_once;
332
333 sub parse {
334 my $self = shift;
335
336 my ($rec, $format_utf8, $i, $rec_size) = @_;
337
338 return if (! $format_utf8);
339
340 my $log = $self->_get_logger();
341
342 $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
343
344 $i = 0 if (! $i);
345
346 my $format = $self->_x($format_utf8) || $log->logconfess("can't convert '$format_utf8' from UTF-8 to ",$self->{'code_page'});
347
348 my @out;
349
350 $log->debug("format: $format [$i]");
351
352 my $eval_code;
353 # remove eval{...} from beginning
354 $eval_code = $1 if ($format =~ s/^eval{([^}]+)}//s);
355
356 my $filter_name;
357 # remove filter{...} from beginning
358 $filter_name = $1 if ($format =~ s/^filter{([^}]+)}//s);
359
360 # did we found any (att all) field from format in row?
361 my $found_any;
362 # prefix before first field which we preserve it $found_any
363 my $prefix;
364
365 my $f_step = 1;
366
367 while ($format =~ s/^(.*?)(v|s)(\d+)(?:\^(\w))?//s) {
368
369 my $del = $1 || '';
370 $prefix = $del if ($f_step == 1);
371
372 my $fld_type = lc($2);
373
374 # repeatable index
375 my $r = $i;
376 if ($fld_type eq 's') {
377 if ($found_any->{'v'}) {
378 $r = 0;
379 } else {
380 return;
381 }
382 }
383
384 my $found = 0;
385 my $tmp = $self->get_data(\$rec,$3,$4,$r,\$found,$rec_size);
386
387 if ($found) {
388 $found_any->{$fld_type} += $found;
389
390 # we will skip delimiter before first occurence of field!
391 push @out, $del unless($found_any->{$fld_type} == 1);
392 push @out, $tmp;
393 }
394 $f_step++;
395 }
396
397 # test if any fields found?
398 return if (! $found_any->{'v'} && ! $found_any->{'s'});
399
400 my $out = join('',@out);
401
402 if ($out) {
403 # add rest of format (suffix)
404 $out .= $format;
405
406 # add prefix if not there
407 $out = $prefix . $out if ($out !~ m/^\Q$prefix\E/);
408
409 $log->debug("result: $out");
410 }
411
412 if ($eval_code) {
413 my $eval = $self->fill_in($rec,$eval_code,$i) || return;
414 $log->debug("about to eval{$eval} format: $out");
415 return if (! $self->_eval($eval));
416 }
417
418 if ($filter_name) {
419 my @filter_args;
420 if ($filter_name =~ s/(\w+)\((.*)\)/$1/) {
421 @filter_args = split(/,/, $2);
422 }
423 if ($self->{'filter'}->{$filter_name}) {
424 $log->debug("about to filter{$filter_name} format: $out with arguments: ", join(",", @filter_args));
425 unshift @filter_args, $out;
426 $out = $self->{'filter'}->{$filter_name}->(@filter_args);
427 return unless(defined($out));
428 $log->debug("filter result: $out");
429 } elsif (! $warn_once->{$filter_name}) {
430 $log->warn("trying to use undefined filter $filter_name");
431 $warn_once->{$filter_name}++;
432 }
433 }
434
435 return $out;
436 }
437
438 =head2 fill_in
439
440 Workhourse of all: takes record from in-memory structure of database and
441 strings with placeholders and returns string or array of with substituted
442 values from record.
443
444 my $text = $webpac->fill_in($rec,'v250^a');
445
446 Optional argument is ordinal number for repeatable fields. By default,
447 it's assume to be first repeatable field (fields are perl array, so first
448 element is 0).
449 Following example will read second value from repeatable field.
450
451 my $text = $webpac->fill_in($rec,'Title: v250^a',1);
452
453 This function B<does not> perform parsing of format to inteligenty skip
454 delimiters before fields which aren't used.
455
456 This method will automatically decode UTF-8 string to local code page
457 if needed.
458
459 There is optional parametar C<$record_size> which can be used to get sizes of
460 all C<field^subfield> combinations in this format.
461
462 my $text = $webpac->fill_in($rec,'got: v900^a v900^x',0,\$rec_size);
463
464 =cut
465
466 sub fill_in {
467 my $self = shift;
468
469 my $log = $self->_get_logger();
470
471 my ($rec,$format,$i,$rec_size) = @_;
472
473 $log->logconfess("need data record") unless ($rec);
474 $log->logconfess("need format to parse") unless($format);
475
476 # iteration (for repeatable fields)
477 $i ||= 0;
478
479 $log->logdie("infitite loop in format $format") if ($i > ($self->{'max_mfn'} || 9999));
480
481 # FIXME remove for speedup?
482 $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
483
484 if (utf8::is_utf8($format)) {
485 $format = $self->_x($format);
486 }
487
488 my $found = 0;
489 my $just_single = 1;
490
491 my $eval_code;
492 # remove eval{...} from beginning
493 $eval_code = $1 if ($format =~ s/^eval{([^}]+)}//s);
494
495 my $filter_name;
496 # remove filter{...} from beginning
497 $filter_name = $1 if ($format =~ s/^filter{([^}]+)}//s);
498
499 {
500 # fix warnings
501 no warnings 'uninitialized';
502
503 # do actual replacement of placeholders
504 # repeatable fields
505 if ($format =~ s/v(\d+)(?:\^(\w))?/$self->get_data(\$rec,$1,$2,$i,\$found,$rec_size)/ges) {
506 $just_single = 0;
507 }
508
509 # non-repeatable fields
510 if ($format =~ s/s(\d+)(?:\^(\w))?/$self->get_data(\$rec,$1,$2,0,\$found,$rec_size)/ges) {
511 return if ($i > 0 && $just_single);
512 }
513 }
514
515 if ($found) {
516 $log->debug("format: $format");
517 if ($eval_code) {
518 my $eval = $self->fill_in($rec,$eval_code,$i);
519 return if (! $self->_eval($eval));
520 }
521 if ($filter_name && $self->{'filter'}->{$filter_name}) {
522 $log->debug("filter '$filter_name' for $format");
523 $format = $self->{'filter'}->{$filter_name}->($format);
524 return unless(defined($format));
525 $log->debug("filter result: $format");
526 }
527 # do we have lookups?
528 if ($self->{'lookup'}) {
529 if ($self->{'lookup'}->can('lookup')) {
530 my @lookup = $self->{lookup}->lookup($format);
531 $log->debug("lookup $format", join(", ", @lookup));
532 return @lookup;
533 } else {
534 $log->warn("Have lookup object but can't invoke lookup method");
535 }
536 } else {
537 return $format;
538 }
539 } else {
540 return;
541 }
542 }
543
544
545 =head2 _rec_to_arr
546
547 Similar to C<parse> and C<fill_in>, but returns array of all repeatable fields. Usable
548 for fields which have lookups, so they shouldn't be parsed but rather
549 C<paste>d or C<fill_id>ed. Last argument is name of operation: C<paste> or C<fill_in>.
550
551 my @arr = $webpac->fill_in_to_arr($rec,'[v900];;[v250^a]','paste');
552
553 =cut
554
555 sub _rec_to_arr {
556 my $self = shift;
557
558 my ($rec, $format_utf8, $code) = @_;
559
560 my $log = $self->_get_logger();
561
562 $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
563 return if (! $format_utf8);
564
565 $log->debug("using $code on $format_utf8");
566
567 my $i = 0;
568 my $max = 0;
569 my @arr;
570 my $rec_size = {};
571
572 while ($i <= $max) {
573 my @v = $self->$code($rec,$format_utf8,$i++,\$rec_size);
574 if ($rec_size) {
575 foreach my $f (keys %{ $rec_size }) {
576 $max = $rec_size->{$f} if ($rec_size->{$f} > $max);
577 }
578 $log->debug("max set to $max");
579 undef $rec_size;
580 }
581 if (@v) {
582 push @arr, @v;
583 } else {
584 push @arr, '' if ($max > $i);
585 }
586 }
587
588 $log->debug("format '$format_utf8' returned ",--$i," elements: ", sub { join(" | ",@arr) }) if (@arr);
589
590 return @arr;
591 }
592
593
594 =head2 get_data
595
596 Returns value from record.
597
598 my $text = $self->get_data(\$rec,$f,$sf,$i,\$found,\$rec_size);
599
600 Required arguments are:
601
602 =over 8
603
604 =item C<$rec>
605
606 record reference
607
608 =item C<$f>
609
610 field
611
612 =item C<$sf>
613
614 optional subfield
615
616 =item C<$i>
617
618 index offset for repeatable values ( 0 ... $rec_size->{'400^a'} )
619
620 =item C<$found>
621
622 optional variable that will be incremeted if preset
623
624 =item C<$rec_size>
625
626 hash to hold maximum occurances of C<field^subfield> combinations
627 (which can be accessed using keys in same format)
628
629 =back
630
631 Returns value or empty string, updates C<$found> and C<rec_size>
632 if present.
633
634 =cut
635
636 sub get_data {
637 my $self = shift;
638
639 my ($rec,$f,$sf,$i,$found,$cache) = @_;
640
641 return '' unless ($$rec->{$f} && ref($$rec->{$f}) eq 'ARRAY');
642
643 if (defined($$cache)) {
644 $$cache->{ $f . ( $sf ? '^' . $sf : '' ) } ||= scalar @{ $$rec->{$f} };
645 }
646
647 return '' unless ($$rec->{$f}->[$i]);
648
649 {
650 no strict 'refs';
651 if (defined($sf)) {
652 $$found++ if (defined($$found) && $$rec->{$f}->[$i]->{$sf});
653 return $$rec->{$f}->[$i]->{$sf};
654 } else {
655 $$found++ if (defined($$found));
656 # it still might have subfields, just
657 # not specified, so we'll dump some debug info
658 if ($$rec->{$f}->[$i] =~ /HASH/o) {
659 my $out;
660 foreach my $k (keys %{$$rec->{$f}->[$i]}) {
661 $out .= '$' . $k .':' . $$rec->{$f}->[$i]->{$k}." ";
662 }
663 return $out;
664 } else {
665 return $$rec->{$f}->[$i];
666 }
667 }
668 }
669 }
670
671
672 =head2 apply_format
673
674 Apply format specified in tag with C<format_name="name"> and
675 C<format_delimiter=";;">.
676
677 my $text = $webpac->apply_format($format_name,$format_delimiter,$data);
678
679 Formats can contain C<lookup{...}> if you need them.
680
681 =cut
682
683 sub apply_format {
684 my $self = shift;
685
686 my ($name,$delimiter,$data) = @_;
687
688 my $log = $self->_get_logger();
689
690 if (! $self->{'import_xml'}->{'format'}->{$name}) {
691 $log->warn("<format name=\"$name\"> is not defined in ",$self->{'import_xml_file'});
692 return $data;
693 }
694
695 $log->warn("no delimiter for format $name") if (! $delimiter);
696
697 my $format = $self->_x($self->{'import_xml'}->{'format'}->{$name}->{'content'}) || $log->logdie("can't find format '$name'");
698
699 my @data = split(/\Q$delimiter\E/, $data);
700
701 my $out = sprintf($format, @data);
702 $log->debug("using format $name [$format] on $data to produce: $out");
703
704 if ($self->{'lookup_regex'} && $out =~ $self->{'lookup_regex'}) {
705 return $self->{'lookup'}->lookup($out);
706 } else {
707 return $out;
708 }
709
710 }
711
712 =head2 sort_arr
713
714 Sort array ignoring case and html in data
715
716 my @sorted = $webpac->sort_arr(@unsorted);
717
718 =cut
719
720 sub sort_arr {
721 my $self = shift;
722
723 my $log = $self->_get_logger();
724
725 # FIXME add Schwartzian Transformation?
726
727 my @sorted = sort {
728 $a =~ s#<[^>]+/*>##;
729 $b =~ s#<[^>]+/*>##;
730 lc($b) cmp lc($a)
731 } @_;
732 $log->debug("sorted values: ",sub { join(", ",@sorted) });
733
734 return @sorted;
735 }
736
737
738 =head1 INTERNAL METHODS
739
740 =head2 _sort_by_order
741
742 Sort xml tags data structure accoding to C<order=""> attribute.
743
744 =cut
745
746 sub _sort_by_order {
747 my $self = shift;
748
749 my $va = $self->{'import_xml'}->{'indexer'}->{$a}->{'order'} ||
750 $self->{'import_xml'}->{'indexer'}->{$a};
751 my $vb = $self->{'import_xml'}->{'indexer'}->{$b}->{'order'} ||
752 $self->{'import_xml'}->{'indexer'}->{$b};
753
754 return $va <=> $vb;
755 }
756
757 =head2 _x
758
759 Convert strings from C<conf/normalize/*.xml> encoding into application
760 specific encoding (optinally specified using C<code_page> to C<new>
761 constructor).
762
763 my $text = $n->_x('normalize text string');
764
765 This is a stub so that other modules doesn't have to implement it.
766
767 =cut
768
769 sub _x {
770 my $self = shift;
771 return shift;
772 }
773
774
775 =head1 AUTHOR
776
777 Dobrica Pavlinusic, C<< <dpavlin@rot13.org> >>
778
779 =head1 COPYRIGHT & LICENSE
780
781 Copyright 2005 Dobrica Pavlinusic, All Rights Reserved.
782
783 This program is free software; you can redistribute it and/or modify it
784 under the same terms as Perl itself.
785
786 =cut
787
788 1; # End of WebPAC::Normalize

  ViewVC Help
Powered by ViewVC 1.1.26