/[webpac2]/trunk/lib/WebPAC/Normalize.pm
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Contents of /trunk/lib/WebPAC/Normalize.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 371 - (show annotations)
Sun Jan 8 21:16:27 2006 UTC (17 years, 1 month ago) by dpavlin
File size: 17970 byte(s)
 r409@llin:  dpavlin | 2006-01-08 22:16:39 +0100
 collect record sizes

1 package WebPAC::Normalize;
2
3 use warnings;
4 use strict;
5 use blib;
6 use WebPAC::Common;
7 use base 'WebPAC::Common';
8 use Data::Dumper;
9
10 =head1 NAME
11
12 WebPAC::Normalize - data mungling for normalisation
13
14 =head1 VERSION
15
16 Version 0.08
17
18 =cut
19
20 our $VERSION = '0.08';
21
22 =head1 SYNOPSIS
23
24 This package contains code that mungle data to produce normalized format.
25
26 It contains several assumptions:
27
28 =over
29
30 =item *
31
32 format of fields is defined using C<v123^a> notation for repeatable fields
33 or C<s123^a> for single (or first) value, where C<123> is field number and
34 C<a> is subfield.
35
36 =item *
37
38 source data records (C<$rec>) have unique identifiers in field C<000>
39
40 =item *
41
42 optional C<eval{length('v123^a') == 3}> tag at B<beginning of format> will be
43 perl code that is evaluated before producing output (value of field will be
44 interpolated before that)
45
46 =item *
47
48 optional C<filter{filter_name}> at B<begining of format> will apply perl
49 code defined as code ref on format after field substitution to producing
50 output
51
52 There is one built-in filter called C<regex> which can be use like this:
53
54 filter{regex(s/foo/bar/)}
55
56 =item *
57
58 optional C<lookup{...}> will be then performed. See C<WebPAC::Lookups>.
59
60 =item *
61
62 at end, optional C<format>s rules are resolved. Format rules are similar to
63 C<sprintf> and can also contain C<lookup{...}> which is performed after
64 values are inserted in format.
65
66 =back
67
68 This also describes order in which transformations are applied (eval,
69 filter, lookup, format) which is important to undestand when deciding how to
70 solve your data mungling and normalisation process.
71
72
73
74
75 =head1 FUNCTIONS
76
77 =head2 new
78
79 Create new normalisation object
80
81 my $n = new WebPAC::Normalize::Something(
82 filter => {
83 'filter_name_1' => sub {
84 # filter code
85 return length($_);
86 }, ...
87 },
88 db => $db_obj,
89 lookup_regex => $lookup->regex,
90 lookup => $lookup_obj,
91 prefix => 'foobar',
92 );
93
94 Parametar C<filter> defines user supplied snippets of perl code which can
95 be use with C<filter{...}> notation.
96
97 C<prefix> is used to form filename for database record (to support multiple
98 source files which are joined in one database).
99
100 Recommended parametar C<lookup_regex> is used to enable parsing of lookups
101 in structures. If you pass this parametar, you must also pass C<lookup>
102 which is C<WebPAC::Lookup> object.
103
104 =cut
105
106 sub new {
107 my $class = shift;
108 my $self = {@_};
109 bless($self, $class);
110
111 my $r = $self->{'lookup_regex'} ? 1 : 0;
112 my $l = $self->{'lookup'} ? 1 : 0;
113
114 my $log = $self->_get_logger();
115
116 # those two must be in pair
117 if ( ($r & $l) != ($r || $l) ) {
118 my $log = $self->_get_logger();
119 $log->logdie("lookup_regex and lookup must be in pair");
120 }
121
122 $log->logdie("lookup must be WebPAC::Lookup object") if ($self->{'lookup'} && ! $self->{'lookup'}->isa('WebPAC::Lookup'));
123
124 $log->warn("no prefix defined. please check that!") unless ($self->{'prefix'});
125
126 $log->debug("using lookup regex: ", $self->{lookup_regex}) if ($r && $l);
127
128 if (! $self->{filter} || ! $self->{filter}->{regex}) {
129 $log->debug("adding built-in filter regex");
130 $self->{filter}->{regex} = sub {
131 my ($val, $regex) = @_;
132 eval "\$val =~ $regex";
133 return $val;
134 };
135 }
136
137 $self ? return $self : return undef;
138 }
139
140
141 =head2 data_structure
142
143 Create in-memory data structure which represents normalized layout from
144 C<conf/normalize/*.xml>.
145
146 This structures are used to produce output.
147
148 my $ds = $webpac->data_structure($rec);
149
150 =cut
151
152 sub data_structure {
153 my $self = shift;
154
155 my $log = $self->_get_logger();
156
157 my $rec = shift;
158 $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
159
160 $log->debug("data_structure rec = ", sub { Dumper($rec) });
161
162 $log->logdie("need unique ID (mfn) in field 000 of record " . Dumper($rec) ) unless (defined($rec->{'000'}));
163
164 my $id = $rec->{'000'}->[0] || $log->logdie("field 000 isn't array!");
165
166 my $cache_file;
167
168 if ($self->{'db'}) {
169 my $ds = $self->{'db'}->load_ds( id => $id, prefix => $self->{prefix} );
170 $log->debug("load_ds( rec = ", sub { Dumper($rec) }, ") = ", sub { Dumper($ds) });
171 return $ds if ($ds);
172 $log->debug("cache miss, creating");
173 }
174
175 my @sorted_tags;
176 if ($self->{tags_by_order}) {
177 @sorted_tags = @{$self->{tags_by_order}};
178 } else {
179 @sorted_tags = sort { $self->_sort_by_order } keys %{$self->{'import_xml'}->{'indexer'}};
180 $self->{tags_by_order} = \@sorted_tags;
181 }
182
183 my $ds;
184
185 $log->debug("tags: ",sub { join(", ",@sorted_tags) });
186
187 foreach my $field (@sorted_tags) {
188
189 my $row;
190
191 #print "field $field [",$self->{'tag'},"] = ",Dumper($self->{'import_xml'}->{'indexer'}->{$field}->{$self->{'tag'}});
192
193 foreach my $tag (@{$self->{'import_xml'}->{'indexer'}->{$field}->{$self->{'tag'}}}) {
194 my $format;
195
196 $log->logdie("expected tag HASH and got $tag") unless (ref($tag) eq 'HASH');
197 $format = $tag->{'value'} || $tag->{'content'};
198
199 my @v;
200 if ($self->{'lookup_regex'} && $format =~ $self->{'lookup_regex'}) {
201 @v = $self->fill_in_to_arr($rec,$format);
202 } else {
203 @v = $self->parse_to_arr($rec,$format);
204 }
205 if (! @v) {
206 $log->debug("$field <",$self->{tag},"> format: $format no values");
207 # next;
208 } else {
209 $log->debug("$field <",$self->{tag},"> format: $format values: ", join(",", @v));
210 }
211
212 if ($tag->{'sort'}) {
213 @v = $self->sort_arr(@v);
214 }
215
216 # use format?
217 if ($tag->{'format_name'}) {
218 @v = map { $self->apply_format($tag->{'format_name'},$tag->{'format_delimiter'},$_) } @v;
219 }
220
221 # delimiter will join repeatable fields
222 if ($tag->{'delimiter'}) {
223 @v = ( join($tag->{'delimiter'}, @v) );
224 }
225
226 # default types
227 my @types = qw(display search);
228 # override by type attribute
229 @types = ( $tag->{'type'} ) if ($tag->{'type'});
230
231 foreach my $type (@types) {
232 # append to previous line?
233 $log->debug("tag $field / $type [",sub { join(",",@v) }, "] ", $row->{'append'} || 'no append');
234 if ($tag->{'append'}) {
235
236 # I will delimit appended part with
237 # delimiter (or ,)
238 my $d = $tag->{'delimiter'};
239 # default delimiter
240 $d ||= " ";
241
242 my $last = pop @{$row->{$type}};
243 $d = "" if (! $last);
244 $last .= $d . join($d, @v);
245 push @{$row->{$type}}, $last;
246
247 } else {
248 push @{$row->{$type}}, @v;
249 }
250 }
251
252
253 }
254
255 if ($row) {
256 $row->{'tag'} = $field;
257
258 # TODO: name_sigular, name_plural
259 my $name = $self->{'import_xml'}->{'indexer'}->{$field}->{'name'};
260 my $row_name = $name ? $self->_x($name) : $field;
261
262 # post-sort all values in field
263 if ($self->{'import_xml'}->{'indexer'}->{$field}->{'sort'}) {
264 $log->warn("sort at field tag not implemented");
265 }
266
267 $ds->{$row_name} = $row;
268
269 $log->debug("row $field: ",sub { Dumper($row) });
270 }
271
272 }
273
274 $self->{'db'}->save_ds(
275 id => $id,
276 ds => $ds,
277 prefix => $self->{prefix},
278 ) if ($self->{'db'});
279
280 $log->debug("ds: ", sub { Dumper($ds) });
281
282 $log->logconfess("data structure returned is not array any more!") if wantarray;
283
284 return $ds;
285
286 }
287
288 =head2 parse
289
290 Perform smart parsing of string, skipping delimiters for fields which aren't
291 defined. It can also eval code in format starting with C<eval{...}> and
292 return output or nothing depending on eval code.
293
294 my $text = $webpac->parse($rec,'eval{"v901^a" eq "Deskriptor"}descriptor: v250^a', $i);
295
296 Filters are implemented here. While simple form of filters looks like this:
297
298 filter{name_of_filter}
299
300 but, filters can also have variable number of parametars like this:
301
302 filter{name_of_filter(param,param,param)}
303
304 =cut
305
306 my $warn_once;
307
308 sub parse {
309 my $self = shift;
310
311 my ($rec, $format_utf8, $i, $rec_size) = @_;
312
313 return if (! $format_utf8);
314
315 my $log = $self->_get_logger();
316
317 $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
318
319 $i = 0 if (! $i);
320
321 my $format = $self->_x($format_utf8) || $log->logconfess("can't convert '$format_utf8' from UTF-8 to ",$self->{'code_page'});
322
323 my @out;
324
325 $log->debug("format: $format [$i]");
326
327 my $eval_code;
328 # remove eval{...} from beginning
329 $eval_code = $1 if ($format =~ s/^eval{([^}]+)}//s);
330
331 my $filter_name;
332 # remove filter{...} from beginning
333 $filter_name = $1 if ($format =~ s/^filter{([^}]+)}//s);
334
335 # did we found any (att all) field from format in row?
336 my $found_any;
337 # prefix before first field which we preserve it $found_any
338 my $prefix;
339
340 my $f_step = 1;
341
342 while ($format =~ s/^(.*?)(v|s)(\d+)(?:\^(\w))?//s) {
343
344 my $del = $1 || '';
345 $prefix = $del if ($f_step == 1);
346
347 my $fld_type = lc($2);
348
349 # repeatable index
350 my $r = $i;
351 if ($fld_type eq 's') {
352 if ($found_any->{'v'}) {
353 $r = 0;
354 } else {
355 return;
356 }
357 }
358
359 my $found = 0;
360 my $tmp = $self->get_data(\$rec,$3,$4,$r,\$found,$rec_size);
361
362 if ($found) {
363 $found_any->{$fld_type} += $found;
364
365 # we will skip delimiter before first occurence of field!
366 push @out, $del unless($found_any->{$fld_type} == 1);
367 push @out, $tmp;
368 }
369 $f_step++;
370 }
371
372 # test if any fields found?
373 return if (! $found_any->{'v'} && ! $found_any->{'s'});
374
375 my $out = join('',@out);
376
377 if ($out) {
378 # add rest of format (suffix)
379 $out .= $format;
380
381 # add prefix if not there
382 $out = $prefix . $out if ($out !~ m/^\Q$prefix\E/);
383
384 $log->debug("result: $out");
385 }
386
387 if ($eval_code) {
388 my $eval = $self->fill_in($rec,$eval_code,$i) || return;
389 $log->debug("about to eval{$eval} format: $out");
390 return if (! $self->_eval($eval));
391 }
392
393 if ($filter_name) {
394 my @filter_args;
395 if ($filter_name =~ s/(\w+)\((.*)\)/$1/) {
396 @filter_args = split(/,/, $2);
397 }
398 if ($self->{'filter'}->{$filter_name}) {
399 $log->debug("about to filter{$filter_name} format: $out with arguments: ", join(",", @filter_args));
400 unshift @filter_args, $out;
401 $out = $self->{'filter'}->{$filter_name}->(@filter_args);
402 return unless(defined($out));
403 $log->debug("filter result: $out");
404 } elsif (! $warn_once->{$filter_name}) {
405 $log->warn("trying to use undefined filter $filter_name");
406 $warn_once->{$filter_name}++;
407 }
408 }
409
410 return $out;
411 }
412
413 =head2 parse_to_arr
414
415 Similar to C<parse>, but returns array of all repeatable fields
416
417 my @arr = $webpac->parse_to_arr($rec,'v250^a');
418
419 =cut
420
421 sub parse_to_arr {
422 my $self = shift;
423
424 my ($rec, $format_utf8) = @_;
425
426 my $log = $self->_get_logger();
427
428 $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
429 return if (! $format_utf8);
430
431 my $i = 0;
432 my @arr;
433
434 my $rec_size = { '_' => '_' };
435
436 while (my $v = $self->parse($rec,$format_utf8,$i++,\$rec_size)) {
437 push @arr, $v;
438 warn "parse rec_size = ", Dumper($rec_size);
439 }
440
441 $log->debug("format '$format_utf8' returned ",--$i," elements: ", sub { join(" | ",@arr) }) if (@arr);
442
443 return @arr;
444 }
445
446
447 =head2 fill_in
448
449 Workhourse of all: takes record from in-memory structure of database and
450 strings with placeholders and returns string or array of with substituted
451 values from record.
452
453 my $text = $webpac->fill_in($rec,'v250^a');
454
455 Optional argument is ordinal number for repeatable fields. By default,
456 it's assume to be first repeatable field (fields are perl array, so first
457 element is 0).
458 Following example will read second value from repeatable field.
459
460 my $text = $webpac->fill_in($rec,'Title: v250^a',1);
461
462 This function B<does not> perform parsing of format to inteligenty skip
463 delimiters before fields which aren't used.
464
465 This method will automatically decode UTF-8 string to local code page
466 if needed.
467
468 There is optional parametar C<$record_size> which can be used to get sizes of
469 all C<field^subfield> combinations in this format.
470
471 my $text = $webpac->fill_in($rec,'got: v900^a v900^x',0,\$rec_size);
472
473 =cut
474
475 sub fill_in {
476 my $self = shift;
477
478 my $log = $self->_get_logger();
479
480 my ($rec,$format,$i,$rec_size) = @_;
481
482 $log->logconfess("need data record") unless ($rec);
483 $log->logconfess("need format to parse") unless($format);
484
485 # iteration (for repeatable fields)
486 $i ||= 0;
487
488 $log->logdie("infitite loop in format $format") if ($i > ($self->{'max_mfn'} || 9999));
489
490 # FIXME remove for speedup?
491 $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
492
493 if (utf8::is_utf8($format)) {
494 $format = $self->_x($format);
495 }
496
497 my $found = 0;
498 my $just_single = 1;
499
500 my $eval_code;
501 # remove eval{...} from beginning
502 $eval_code = $1 if ($format =~ s/^eval{([^}]+)}//s);
503
504 my $filter_name;
505 # remove filter{...} from beginning
506 $filter_name = $1 if ($format =~ s/^filter{([^}]+)}//s);
507
508 # do actual replacement of placeholders
509 # repeatable fields
510 if ($format =~ s/v(\d+)(?:\^(\w))?/$self->get_data(\$rec,$1,$2,$i,\$found,$rec_size)/ges) {
511 $just_single = 0;
512 }
513
514 # non-repeatable fields
515 if ($format =~ s/s(\d+)(?:\^(\w))?/$self->get_data(\$rec,$1,$2,0,\$found,$rec_size)/ges) {
516 return if ($i > 0 && $just_single);
517 }
518
519 if ($found) {
520 $log->debug("format: $format");
521 if ($eval_code) {
522 my $eval = $self->fill_in($rec,$eval_code,$i);
523 return if (! $self->_eval($eval));
524 }
525 if ($filter_name && $self->{'filter'}->{$filter_name}) {
526 $log->debug("filter '$filter_name' for $format");
527 $format = $self->{'filter'}->{$filter_name}->($format);
528 return unless(defined($format));
529 $log->debug("filter result: $format");
530 }
531 # do we have lookups?
532 if ($self->{'lookup'}) {
533 if ($self->{'lookup'}->can('lookup')) {
534 my @lookup = $self->{lookup}->lookup($format);
535 $log->debug("lookup $format", join(", ", @lookup));
536 return @lookup;
537 } else {
538 $log->warn("Have lookup object but can't invoke lookup method");
539 }
540 } else {
541 return $format;
542 }
543 } else {
544 return;
545 }
546 }
547
548
549 =head2 fill_in_to_arr
550
551 Similar to C<fill_in>, but returns array of all repeatable fields. Usable
552 for fields which have lookups, so they shouldn't be parsed but rather
553 C<fill_id>ed.
554
555 my @arr = $webpac->fill_in_to_arr($rec,'[v900];;[v250^a]');
556
557 =cut
558
559 sub fill_in_to_arr {
560 my $self = shift;
561
562 my ($rec, $format_utf8) = @_;
563
564 my $log = $self->_get_logger();
565
566 $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
567 return if (! $format_utf8);
568
569 my $i = 0;
570 my @arr;
571
572 my $rec_size;
573
574 while (my $v = $self->fill_in($rec,$format_utf8,$i,\$rec_size)) {
575 push @arr, $v;
576 warn "rec_size = ", Dumper($rec_size);
577 }
578
579 $log->debug("format '$format_utf8' returned ",--$i," elements: ", sub { join(" | ",@arr) }) if (@arr);
580
581 return @arr;
582 }
583
584
585 =head2 get_data
586
587 Returns value from record.
588
589 my $text = $self->get_data(\$rec,$f,$sf,$i,\$found,\$rec_size);
590
591 Required arguments are:
592
593 =over 8
594
595 =item C<$rec>
596
597 record reference
598
599 =item C<$f>
600
601 field
602
603 =item C<$sf>
604
605 optional subfield
606
607 =item C<$i>
608
609 index offset for repeatable values ( 0 ... $rec_size->{'400^a'} )
610
611 =item C<$found>
612
613 optional variable that will be incremeted if preset
614
615 =item C<$rec_size>
616
617 hash to hold maximum occurances of C<field^subfield> combinations
618 (which can be accessed using keys in same format)
619
620 =back
621
622 Returns value or empty string, updates C<$found> and C<rec_size>
623 if present.
624
625 =cut
626
627 sub get_data {
628 my $self = shift;
629
630 my ($rec,$f,$sf,$i,$found,$cache) = @_;
631
632 return '' unless ($$rec->{$f} && ref($$rec->{$f}) eq 'ARRAY');
633
634 if (defined($$cache)) {
635 $$cache->{ $f . ( $sf ? '^' . $sf : '' ) } ||= scalar @{ $$rec->{$f} };
636 }
637
638 return '' unless ($$rec->{$f}->[$i]);
639
640 {
641 no strict 'refs';
642 if (defined($sf)) {
643 $$found++ if (defined($$found) && $$rec->{$f}->[$i]->{$sf});
644 return $$rec->{$f}->[$i]->{$sf};
645 } else {
646 $$found++ if (defined($$found));
647 # it still might have subfields, just
648 # not specified, so we'll dump some debug info
649 if ($$rec->{$f}->[$i] =~ /HASH/o) {
650 my $out;
651 foreach my $k (keys %{$$rec->{$f}->[$i]}) {
652 $out .= '$' . $k .':' . $$rec->{$f}->[$i]->{$k}." ";
653 }
654 return $out;
655 } else {
656 return $$rec->{$f}->[$i];
657 }
658 }
659 }
660 }
661
662
663 =head2 apply_format
664
665 Apply format specified in tag with C<format_name="name"> and
666 C<format_delimiter=";;">.
667
668 my $text = $webpac->apply_format($format_name,$format_delimiter,$data);
669
670 Formats can contain C<lookup{...}> if you need them.
671
672 =cut
673
674 sub apply_format {
675 my $self = shift;
676
677 my ($name,$delimiter,$data) = @_;
678
679 my $log = $self->_get_logger();
680
681 if (! $self->{'import_xml'}->{'format'}->{$name}) {
682 $log->warn("<format name=\"$name\"> is not defined in ",$self->{'import_xml_file'});
683 return $data;
684 }
685
686 $log->warn("no delimiter for format $name") if (! $delimiter);
687
688 my $format = $self->_x($self->{'import_xml'}->{'format'}->{$name}->{'content'}) || $log->logdie("can't find format '$name'");
689
690 my @data = split(/\Q$delimiter\E/, $data);
691
692 my $out = sprintf($format, @data);
693 $log->debug("using format $name [$format] on $data to produce: $out");
694
695 if ($self->{'lookup_regex'} && $out =~ $self->{'lookup_regex'}) {
696 return $self->{'lookup'}->lookup($out);
697 } else {
698 return $out;
699 }
700
701 }
702
703 =head2 sort_arr
704
705 Sort array ignoring case and html in data
706
707 my @sorted = $webpac->sort_arr(@unsorted);
708
709 =cut
710
711 sub sort_arr {
712 my $self = shift;
713
714 my $log = $self->_get_logger();
715
716 # FIXME add Schwartzian Transformation?
717
718 my @sorted = sort {
719 $a =~ s#<[^>]+/*>##;
720 $b =~ s#<[^>]+/*>##;
721 lc($b) cmp lc($a)
722 } @_;
723 $log->debug("sorted values: ",sub { join(", ",@sorted) });
724
725 return @sorted;
726 }
727
728
729 =head1 INTERNAL METHODS
730
731 =head2 _sort_by_order
732
733 Sort xml tags data structure accoding to C<order=""> attribute.
734
735 =cut
736
737 sub _sort_by_order {
738 my $self = shift;
739
740 my $va = $self->{'import_xml'}->{'indexer'}->{$a}->{'order'} ||
741 $self->{'import_xml'}->{'indexer'}->{$a};
742 my $vb = $self->{'import_xml'}->{'indexer'}->{$b}->{'order'} ||
743 $self->{'import_xml'}->{'indexer'}->{$b};
744
745 return $va <=> $vb;
746 }
747
748 =head2 _x
749
750 Convert strings from C<conf/normalize/*.xml> encoding into application
751 specific encoding (optinally specified using C<code_page> to C<new>
752 constructor).
753
754 my $text = $n->_x('normalize text string');
755
756 This is a stub so that other modules doesn't have to implement it.
757
758 =cut
759
760 sub _x {
761 my $self = shift;
762 return shift;
763 }
764
765
766 =head1 AUTHOR
767
768 Dobrica Pavlinusic, C<< <dpavlin@rot13.org> >>
769
770 =head1 COPYRIGHT & LICENSE
771
772 Copyright 2005 Dobrica Pavlinusic, All Rights Reserved.
773
774 This program is free software; you can redistribute it and/or modify it
775 under the same terms as Perl itself.
776
777 =cut
778
779 1; # End of WebPAC::Normalize

  ViewVC Help
Powered by ViewVC 1.1.26