/[webpac2]/trunk/lib/WebPAC/Normalize.pm
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Contents of /trunk/lib/WebPAC/Normalize.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 261 - (show annotations)
Fri Dec 16 16:00:18 2005 UTC (18 years, 4 months ago) by dpavlin
File size: 16517 byte(s)
 r11729@llin:  dpavlin | 2005-12-16 21:00:26 +0100
 warn about non-defined filters just once

1 package WebPAC::Normalize;
2
3 use warnings;
4 use strict;
5 use base 'WebPAC::Common';
6 use Data::Dumper;
7
8 =head1 NAME
9
10 WebPAC::Normalize - data mungling for normalisation
11
12 =head1 VERSION
13
14 Version 0.06
15
16 =cut
17
18 our $VERSION = '0.06';
19
20 =head1 SYNOPSIS
21
22 This package contains code that mungle data to produce normalized format.
23
24 It contains several assumptions:
25
26 =over
27
28 =item *
29
30 format of fields is defined using C<v123^a> notation for repeatable fields
31 or C<s123^a> for single (or first) value, where C<123> is field number and
32 C<a> is subfield.
33
34 =item *
35
36 source data records (C<$rec>) have unique identifiers in field C<000>
37
38 =item *
39
40 optional C<eval{length('v123^a') == 3}> tag at B<beginning of format> will be
41 perl code that is evaluated before producing output (value of field will be
42 interpolated before that)
43
44 =item *
45
46 optional C<filter{filter_name}> at B<begining of format> will apply perl
47 code defined as code ref on format after field substitution to producing
48 output
49
50 There is one built-in filter called C<regex> which can be use like this:
51
52 filter{regex(s/foo/bar/)}
53
54 =item *
55
56 optional C<lookup{...}> will be then performed. See C<WebPAC::Lookups>.
57
58 =item *
59
60 at end, optional C<format>s rules are resolved. Format rules are similar to
61 C<sprintf> and can also contain C<lookup{...}> which is performed after
62 values are inserted in format.
63
64 =back
65
66 This also describes order in which transformations are applied (eval,
67 filter, lookup, format) which is important to undestand when deciding how to
68 solve your data mungling and normalisation process.
69
70
71
72
73 =head1 FUNCTIONS
74
75 =head2 new
76
77 Create new normalisation object
78
79 my $n = new WebPAC::Normalize::Something(
80 filter => {
81 'filter_name_1' => sub {
82 # filter code
83 return length($_);
84 }, ...
85 },
86 db => $db_obj,
87 lookup_regex => $lookup->regex,
88 lookup => $lookup_obj,
89 prefix => 'foobar',
90 );
91
92 Parametar C<filter> defines user supplied snippets of perl code which can
93 be use with C<filter{...}> notation.
94
95 C<prefix> is used to form filename for database record (to support multiple
96 source files which are joined in one database).
97
98 Recommended parametar C<lookup_regex> is used to enable parsing of lookups
99 in structures. If you pass this parametar, you must also pass C<lookup>
100 which is C<WebPAC::Lookup> object.
101
102 =cut
103
104 sub new {
105 my $class = shift;
106 my $self = {@_};
107 bless($self, $class);
108
109 my $r = $self->{'lookup_regex'} ? 1 : 0;
110 my $l = $self->{'lookup'} ? 1 : 0;
111
112 my $log = $self->_get_logger();
113
114 # those two must be in pair
115 if ( ($r & $l) != ($r || $l) ) {
116 my $log = $self->_get_logger();
117 $log->logdie("lookup_regex and lookup must be in pair");
118 }
119
120 $log->logdie("lookup must be WebPAC::Lookup object") if ($self->{'lookup'} && ! $self->{'lookup'}->isa('WebPAC::Lookup'));
121
122 $log->warn("no prefix defined. please check that!") unless ($self->{'prefix'});
123
124 $log->debug("using lookup regex: ", $self->{lookup_regex}) if ($r && $l);
125
126 if ($self->{filter} && ! $self->{filter}->{regex}) {
127 $log->debug("adding built-in filter regex");
128 $self->{filter}->{regex} = sub {
129 my ($val, $regex) = @_;
130 eval "\$val =~ $regex";
131 return $val;
132 };
133 }
134
135 $self ? return $self : return undef;
136 }
137
138
139 =head2 data_structure
140
141 Create in-memory data structure which represents normalized layout from
142 C<conf/normalize/*.xml>.
143
144 This structures are used to produce output.
145
146 my $ds = $webpac->data_structure($rec);
147
148 =cut
149
150 sub data_structure {
151 my $self = shift;
152
153 my $log = $self->_get_logger();
154
155 my $rec = shift;
156 $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
157
158 $log->debug("data_structure rec = ", sub { Dumper($rec) });
159
160 $log->logdie("need unique ID (mfn) in field 000 of record ", sub { Dumper($rec) } ) unless (defined($rec->{'000'}));
161
162 my $id = $rec->{'000'}->[0] || $log->logdie("field 000 isn't array!");
163
164 my $cache_file;
165
166 if ($self->{'db'}) {
167 my $ds = $self->{'db'}->load_ds( id => $id, prefix => $self->{prefix} );
168 $log->debug("load_ds( rec = ", sub { Dumper($rec) }, ") = ", sub { Dumper($ds) });
169 return $ds if ($ds);
170 $log->debug("cache miss, creating");
171 }
172
173 undef $self->{'currnet_filename'};
174 undef $self->{'headline'};
175
176 my @sorted_tags;
177 if ($self->{tags_by_order}) {
178 @sorted_tags = @{$self->{tags_by_order}};
179 } else {
180 @sorted_tags = sort { $self->_sort_by_order } keys %{$self->{'import_xml'}->{'indexer'}};
181 $self->{tags_by_order} = \@sorted_tags;
182 }
183
184 my $ds;
185
186 $log->debug("tags: ",sub { join(", ",@sorted_tags) });
187
188 foreach my $field (@sorted_tags) {
189
190 my $row;
191
192 #print "field $field [",$self->{'tag'},"] = ",Dumper($self->{'import_xml'}->{'indexer'}->{$field}->{$self->{'tag'}});
193
194 foreach my $tag (@{$self->{'import_xml'}->{'indexer'}->{$field}->{$self->{'tag'}}}) {
195 my $format;
196
197 $log->logdie("expected tag HASH and got $tag") unless (ref($tag) eq 'HASH');
198 $format = $tag->{'value'} || $tag->{'content'};
199
200 $log->debug("format: $format");
201
202 my @v;
203 if ($self->{'lookup_regex'} && $format =~ $self->{'lookup_regex'}) {
204 @v = $self->fill_in_to_arr($rec,$format);
205 } else {
206 @v = $self->parse_to_arr($rec,$format);
207 }
208 next if (! @v);
209
210 if ($tag->{'sort'}) {
211 @v = $self->sort_arr(@v);
212 }
213
214 # use format?
215 if ($tag->{'format_name'}) {
216 @v = map { $self->apply_format($tag->{'format_name'},$tag->{'format_delimiter'},$_) } @v;
217 }
218
219 # delimiter will join repeatable fields
220 if ($tag->{'delimiter'}) {
221 @v = ( join($tag->{'delimiter'}, @v) );
222 }
223
224 # default types
225 my @types = qw(display search);
226 # override by type attribute
227 @types = ( $tag->{'type'} ) if ($tag->{'type'});
228
229 foreach my $type (@types) {
230 # append to previous line?
231 $log->debug("type: $type ",sub { join(" ",@v) }, " ", $row->{'append'} || 'no append');
232 if ($tag->{'append'}) {
233
234 # I will delimit appended part with
235 # delimiter (or ,)
236 my $d = $tag->{'delimiter'};
237 # default delimiter
238 $d ||= " ";
239
240 my $last = pop @{$row->{$type}};
241 $d = "" if (! $last);
242 $last .= $d . join($d, @v);
243 push @{$row->{$type}}, $last;
244
245 } else {
246 push @{$row->{$type}}, @v;
247 }
248 }
249
250
251 }
252
253 if ($row) {
254 $row->{'tag'} = $field;
255
256 # TODO: name_sigular, name_plural
257 my $name = $self->{'import_xml'}->{'indexer'}->{$field}->{'name'};
258 my $row_name = $name ? $self->_x($name) : $field;
259
260 # post-sort all values in field
261 if ($self->{'import_xml'}->{'indexer'}->{$field}->{'sort'}) {
262 $log->warn("sort at field tag not implemented");
263 }
264
265 $ds->{$row_name} = $row;
266
267 $log->debug("row $field: ",sub { Dumper($row) });
268 }
269
270 }
271
272 $self->{'db'}->save_ds(
273 id => $id,
274 ds => $ds,
275 prefix => $self->{prefix},
276 ) if ($self->{'db'});
277
278 $log->debug("ds: ", sub { Dumper($ds) });
279
280 $log->logconfess("data structure returned is not array any more!") if wantarray;
281
282 return $ds;
283
284 }
285
286 =head2 parse
287
288 Perform smart parsing of string, skipping delimiters for fields which aren't
289 defined. It can also eval code in format starting with C<eval{...}> and
290 return output or nothing depending on eval code.
291
292 my $text = $webpac->parse($rec,'eval{"v901^a" eq "Deskriptor"}descriptor: v250^a', $i);
293
294 Filters are implemented here. While simple form of filters looks like this:
295
296 filter{name_of_filter}
297
298 but, filters can also have variable number of parametars like this:
299
300 filter{name_of_filter(param,param,param)}
301
302 =cut
303
304 my $warn_once;
305
306 sub parse {
307 my $self = shift;
308
309 my ($rec, $format_utf8, $i) = @_;
310
311 return if (! $format_utf8);
312
313 my $log = $self->_get_logger();
314
315 $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
316
317 $i = 0 if (! $i);
318
319 my $format = $self->_x($format_utf8) || $log->logconfess("can't convert '$format_utf8' from UTF-8 to ",$self->{'code_page'});
320
321 my @out;
322
323 $log->debug("format: $format");
324
325 my $eval_code;
326 # remove eval{...} from beginning
327 $eval_code = $1 if ($format =~ s/^eval{([^}]+)}//s);
328
329 my $filter_name;
330 # remove filter{...} from beginning
331 $filter_name = $1 if ($format =~ s/^filter{([^}]+)}//s);
332
333 my $prefix;
334 my $all_found=0;
335
336 while ($format =~ s/^(.*?)(v|s)(\d+)(?:\^(\w))?//s) {
337
338 my $del = $1 || '';
339 $prefix ||= $del if ($all_found == 0);
340
341 # repeatable index
342 my $r = $i;
343 $r = 0 if (lc("$2") eq 's');
344
345 my $found = 0;
346 my $tmp = $self->get_data(\$rec,$3,$4,$r,\$found);
347
348 if ($found) {
349 push @out, $del;
350 push @out, $tmp;
351 $all_found += $found;
352 }
353 }
354
355 return if (! $all_found);
356
357 my $out = join('',@out);
358
359 if ($out) {
360 # add rest of format (suffix)
361 $out .= $format;
362
363 # add prefix if not there
364 $out = $prefix . $out if ($out !~ m/^\Q$prefix\E/);
365
366 $log->debug("result: $out");
367 }
368
369 if ($eval_code) {
370 my $eval = $self->fill_in($rec,$eval_code,$i) || return;
371 $log->debug("about to eval{$eval} format: $out");
372 return if (! $self->_eval($eval));
373 }
374
375 if ($filter_name) {
376 my @filter_args;
377 if ($filter_name =~ s/(\w+)\((.*)\)/$1/) {
378 @filter_args = split(/,/, $2);
379 }
380 if ($self->{'filter'}->{$filter_name}) {
381 $log->debug("about to filter{$filter_name} format: $out with arguments: ", join(",", @filter_args));
382 unshift @filter_args, $out;
383 $out = $self->{'filter'}->{$filter_name}->(@filter_args);
384 return unless(defined($out));
385 $log->debug("filter result: $out");
386 } elsif (! $warn_once->{$filter_name}) {
387 $log->warn("trying to use undefined filter $filter_name");
388 $warn_once->{$filter_name}++;
389 }
390 }
391
392 return $out;
393 }
394
395 =head2 parse_to_arr
396
397 Similar to C<parse>, but returns array of all repeatable fields
398
399 my @arr = $webpac->parse_to_arr($rec,'v250^a');
400
401 =cut
402
403 sub parse_to_arr {
404 my $self = shift;
405
406 my ($rec, $format_utf8) = @_;
407
408 my $log = $self->_get_logger();
409
410 $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
411 return if (! $format_utf8);
412
413 my $i = 0;
414 my @arr;
415
416 while (my $v = $self->parse($rec,$format_utf8,$i++)) {
417 push @arr, $v;
418 }
419
420 $log->debug("format '$format_utf8' returned ",--$i," elements: ", sub { join(" | ",@arr) }) if (@arr);
421
422 return @arr;
423 }
424
425
426 =head2 fill_in
427
428 Workhourse of all: takes record from in-memory structure of database and
429 strings with placeholders and returns string or array of with substituted
430 values from record.
431
432 my $text = $webpac->fill_in($rec,'v250^a');
433
434 Optional argument is ordinal number for repeatable fields. By default,
435 it's assume to be first repeatable field (fields are perl array, so first
436 element is 0).
437 Following example will read second value from repeatable field.
438
439 my $text = $webpac->fill_in($rec,'Title: v250^a',1);
440
441 This function B<does not> perform parsing of format to inteligenty skip
442 delimiters before fields which aren't used.
443
444 This method will automatically decode UTF-8 string to local code page
445 if needed.
446
447 =cut
448
449 sub fill_in {
450 my $self = shift;
451
452 my $log = $self->_get_logger();
453
454 my $rec = shift || $log->logconfess("need data record");
455 my $format = shift || $log->logconfess("need format to parse");
456 # iteration (for repeatable fields)
457 my $i = shift || 0;
458
459 $log->logdie("infitite loop in format $format") if ($i > ($self->{'max_mfn'} || 9999));
460
461 # FIXME remove for speedup?
462 $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
463
464 if (utf8::is_utf8($format)) {
465 $format = $self->_x($format);
466 }
467
468 my $found = 0;
469
470 my $eval_code;
471 # remove eval{...} from beginning
472 $eval_code = $1 if ($format =~ s/^eval{([^}]+)}//s);
473
474 my $filter_name;
475 # remove filter{...} from beginning
476 $filter_name = $1 if ($format =~ s/^filter{([^}]+)}//s);
477
478 # do actual replacement of placeholders
479 # repeatable fields
480 $format =~ s/v(\d+)(?:\^(\w))?/$self->get_data(\$rec,$1,$2,$i,\$found)/ges;
481 # non-repeatable fields
482 $format =~ s/s(\d+)(?:\^(\w))?/$self->get_data(\$rec,$1,$2,0,\$found)/ges;
483
484 if ($found) {
485 $log->debug("format: $format");
486 if ($eval_code) {
487 my $eval = $self->fill_in($rec,$eval_code,$i);
488 return if (! $self->_eval($eval));
489 }
490 if ($filter_name && $self->{'filter'}->{$filter_name}) {
491 $log->debug("filter '$filter_name' for $format");
492 $format = $self->{'filter'}->{$filter_name}->($format);
493 return unless(defined($format));
494 $log->debug("filter result: $format");
495 }
496 # do we have lookups?
497 if ($self->{'lookup'}) {
498 if ($self->{'lookup'}->can('lookup')) {
499 my @lookup = $self->{lookup}->lookup($format);
500 $log->debug("lookup $format", join(", ", @lookup));
501 return @lookup;
502 } else {
503 $log->warn("Have lookup object but can't invoke lookup method");
504 }
505 } else {
506 return $format;
507 }
508 } else {
509 return;
510 }
511 }
512
513
514 =head2 fill_in_to_arr
515
516 Similar to C<fill_in>, but returns array of all repeatable fields. Usable
517 for fields which have lookups, so they shouldn't be parsed but rather
518 C<fill_id>ed.
519
520 my @arr = $webpac->fill_in_to_arr($rec,'[v900];;[v250^a]');
521
522 =cut
523
524 sub fill_in_to_arr {
525 my $self = shift;
526
527 my ($rec, $format_utf8) = @_;
528
529 my $log = $self->_get_logger();
530
531 $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
532 return if (! $format_utf8);
533
534 my $i = 0;
535 my @arr;
536
537 while (my @v = $self->fill_in($rec,$format_utf8,$i++)) {
538 push @arr, @v;
539 }
540
541 $log->debug("format '$format_utf8' returned ",--$i," elements: ", sub { join(" | ",@arr) }) if (@arr);
542
543 return @arr;
544 }
545
546
547 =head2 get_data
548
549 Returns value from record.
550
551 my $text = $self->get_data(\$rec,$f,$sf,$i,\$found);
552
553 Arguments are:
554 record reference C<$rec>,
555 field C<$f>,
556 optional subfiled C<$sf>,
557 index for repeatable values C<$i>.
558
559 Optinal variable C<$found> will be incremeted if there
560 is field.
561
562 Returns value or empty string.
563
564 =cut
565
566 sub get_data {
567 my $self = shift;
568
569 my ($rec,$f,$sf,$i,$found) = @_;
570
571 if ($$rec->{$f}) {
572 return '' if (! $$rec->{$f}->[$i]);
573 no strict 'refs';
574 if ($sf && $$rec->{$f}->[$i]->{$sf}) {
575 $$found++ if (defined($$found));
576 return $$rec->{$f}->[$i]->{$sf};
577 } elsif (! $sf && $$rec->{$f}->[$i]) {
578 $$found++ if (defined($$found));
579 # it still might have subfield, just
580 # not specified, so we'll dump all
581 if ($$rec->{$f}->[$i] =~ /HASH/o) {
582 my $out;
583 foreach my $k (keys %{$$rec->{$f}->[$i]}) {
584 $out .= $$rec->{$f}->[$i]->{$k}." ";
585 }
586 return $out;
587 } else {
588 return $$rec->{$f}->[$i];
589 }
590 } else {
591 return '';
592 }
593 } else {
594 return '';
595 }
596 }
597
598
599 =head2 apply_format
600
601 Apply format specified in tag with C<format_name="name"> and
602 C<format_delimiter=";;">.
603
604 my $text = $webpac->apply_format($format_name,$format_delimiter,$data);
605
606 Formats can contain C<lookup{...}> if you need them.
607
608 =cut
609
610 sub apply_format {
611 my $self = shift;
612
613 my ($name,$delimiter,$data) = @_;
614
615 my $log = $self->_get_logger();
616
617 if (! $self->{'import_xml'}->{'format'}->{$name}) {
618 $log->warn("<format name=\"$name\"> is not defined in ",$self->{'import_xml_file'});
619 return $data;
620 }
621
622 $log->warn("no delimiter for format $name") if (! $delimiter);
623
624 my $format = $self->_x($self->{'import_xml'}->{'format'}->{$name}->{'content'}) || $log->logdie("can't find format '$name'");
625
626 my @data = split(/\Q$delimiter\E/, $data);
627
628 my $out = sprintf($format, @data);
629 $log->debug("using format $name [$format] on $data to produce: $out");
630
631 if ($self->{'lookup_regex'} && $out =~ $self->{'lookup_regex'}) {
632 return $self->{'lookup'}->lookup($out);
633 } else {
634 return $out;
635 }
636
637 }
638
639 =head2 sort_arr
640
641 Sort array ignoring case and html in data
642
643 my @sorted = $webpac->sort_arr(@unsorted);
644
645 =cut
646
647 sub sort_arr {
648 my $self = shift;
649
650 my $log = $self->_get_logger();
651
652 # FIXME add Schwartzian Transformation?
653
654 my @sorted = sort {
655 $a =~ s#<[^>]+/*>##;
656 $b =~ s#<[^>]+/*>##;
657 lc($b) cmp lc($a)
658 } @_;
659 $log->debug("sorted values: ",sub { join(", ",@sorted) });
660
661 return @sorted;
662 }
663
664
665 =head1 INTERNAL METHODS
666
667 =head2 _sort_by_order
668
669 Sort xml tags data structure accoding to C<order=""> attribute.
670
671 =cut
672
673 sub _sort_by_order {
674 my $self = shift;
675
676 my $va = $self->{'import_xml'}->{'indexer'}->{$a}->{'order'} ||
677 $self->{'import_xml'}->{'indexer'}->{$a};
678 my $vb = $self->{'import_xml'}->{'indexer'}->{$b}->{'order'} ||
679 $self->{'import_xml'}->{'indexer'}->{$b};
680
681 return $va <=> $vb;
682 }
683
684 =head2 _x
685
686 Convert strings from C<conf/normalize/*.xml> encoding into application
687 specific encoding (optinally specified using C<code_page> to C<new>
688 constructor).
689
690 my $text = $n->_x('normalize text string');
691
692 This is a stub so that other modules doesn't have to implement it.
693
694 =cut
695
696 sub _x {
697 my $self = shift;
698 return shift;
699 }
700
701
702 =head1 AUTHOR
703
704 Dobrica Pavlinusic, C<< <dpavlin@rot13.org> >>
705
706 =head1 COPYRIGHT & LICENSE
707
708 Copyright 2005 Dobrica Pavlinusic, All Rights Reserved.
709
710 This program is free software; you can redistribute it and/or modify it
711 under the same terms as Perl itself.
712
713 =cut
714
715 1; # End of WebPAC::Normalize

  ViewVC Help
Powered by ViewVC 1.1.26