/[webpac2]/trunk/lib/WebPAC/Normalize.pm
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Contents of /trunk/lib/WebPAC/Normalize.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 364 - (show annotations)
Sun Jan 8 20:27:11 2006 UTC (18 years, 2 months ago) by dpavlin
File size: 17136 byte(s)
 r393@llin:  dpavlin | 2006-01-08 20:50:40 +0100
 better logging and minor fix to fill_arr

1 package WebPAC::Normalize;
2
3 use warnings;
4 use strict;
5 use base 'WebPAC::Common';
6 use Data::Dumper;
7
8 =head1 NAME
9
10 WebPAC::Normalize - data mungling for normalisation
11
12 =head1 VERSION
13
14 Version 0.08
15
16 =cut
17
18 our $VERSION = '0.08';
19
20 =head1 SYNOPSIS
21
22 This package contains code that mungle data to produce normalized format.
23
24 It contains several assumptions:
25
26 =over
27
28 =item *
29
30 format of fields is defined using C<v123^a> notation for repeatable fields
31 or C<s123^a> for single (or first) value, where C<123> is field number and
32 C<a> is subfield.
33
34 =item *
35
36 source data records (C<$rec>) have unique identifiers in field C<000>
37
38 =item *
39
40 optional C<eval{length('v123^a') == 3}> tag at B<beginning of format> will be
41 perl code that is evaluated before producing output (value of field will be
42 interpolated before that)
43
44 =item *
45
46 optional C<filter{filter_name}> at B<begining of format> will apply perl
47 code defined as code ref on format after field substitution to producing
48 output
49
50 There is one built-in filter called C<regex> which can be use like this:
51
52 filter{regex(s/foo/bar/)}
53
54 =item *
55
56 optional C<lookup{...}> will be then performed. See C<WebPAC::Lookups>.
57
58 =item *
59
60 at end, optional C<format>s rules are resolved. Format rules are similar to
61 C<sprintf> and can also contain C<lookup{...}> which is performed after
62 values are inserted in format.
63
64 =back
65
66 This also describes order in which transformations are applied (eval,
67 filter, lookup, format) which is important to undestand when deciding how to
68 solve your data mungling and normalisation process.
69
70
71
72
73 =head1 FUNCTIONS
74
75 =head2 new
76
77 Create new normalisation object
78
79 my $n = new WebPAC::Normalize::Something(
80 filter => {
81 'filter_name_1' => sub {
82 # filter code
83 return length($_);
84 }, ...
85 },
86 db => $db_obj,
87 lookup_regex => $lookup->regex,
88 lookup => $lookup_obj,
89 prefix => 'foobar',
90 );
91
92 Parametar C<filter> defines user supplied snippets of perl code which can
93 be use with C<filter{...}> notation.
94
95 C<prefix> is used to form filename for database record (to support multiple
96 source files which are joined in one database).
97
98 Recommended parametar C<lookup_regex> is used to enable parsing of lookups
99 in structures. If you pass this parametar, you must also pass C<lookup>
100 which is C<WebPAC::Lookup> object.
101
102 =cut
103
104 sub new {
105 my $class = shift;
106 my $self = {@_};
107 bless($self, $class);
108
109 my $r = $self->{'lookup_regex'} ? 1 : 0;
110 my $l = $self->{'lookup'} ? 1 : 0;
111
112 my $log = $self->_get_logger();
113
114 # those two must be in pair
115 if ( ($r & $l) != ($r || $l) ) {
116 my $log = $self->_get_logger();
117 $log->logdie("lookup_regex and lookup must be in pair");
118 }
119
120 $log->logdie("lookup must be WebPAC::Lookup object") if ($self->{'lookup'} && ! $self->{'lookup'}->isa('WebPAC::Lookup'));
121
122 $log->warn("no prefix defined. please check that!") unless ($self->{'prefix'});
123
124 $log->debug("using lookup regex: ", $self->{lookup_regex}) if ($r && $l);
125
126 if (! $self->{filter} || ! $self->{filter}->{regex}) {
127 $log->debug("adding built-in filter regex");
128 $self->{filter}->{regex} = sub {
129 my ($val, $regex) = @_;
130 eval "\$val =~ $regex";
131 return $val;
132 };
133 }
134
135 $self ? return $self : return undef;
136 }
137
138
139 =head2 data_structure
140
141 Create in-memory data structure which represents normalized layout from
142 C<conf/normalize/*.xml>.
143
144 This structures are used to produce output.
145
146 my $ds = $webpac->data_structure($rec);
147
148 =cut
149
150 sub data_structure {
151 my $self = shift;
152
153 my $log = $self->_get_logger();
154
155 my $rec = shift;
156 $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
157
158 $log->debug("data_structure rec = ", sub { Dumper($rec) });
159
160 $log->logdie("need unique ID (mfn) in field 000 of record " . Dumper($rec) ) unless (defined($rec->{'000'}));
161
162 my $id = $rec->{'000'}->[0] || $log->logdie("field 000 isn't array!");
163
164 my $cache_file;
165
166 if ($self->{'db'}) {
167 my $ds = $self->{'db'}->load_ds( id => $id, prefix => $self->{prefix} );
168 $log->debug("load_ds( rec = ", sub { Dumper($rec) }, ") = ", sub { Dumper($ds) });
169 return $ds if ($ds);
170 $log->debug("cache miss, creating");
171 }
172
173 my @sorted_tags;
174 if ($self->{tags_by_order}) {
175 @sorted_tags = @{$self->{tags_by_order}};
176 } else {
177 @sorted_tags = sort { $self->_sort_by_order } keys %{$self->{'import_xml'}->{'indexer'}};
178 $self->{tags_by_order} = \@sorted_tags;
179 }
180
181 my $ds;
182
183 $log->debug("tags: ",sub { join(", ",@sorted_tags) });
184
185 foreach my $field (@sorted_tags) {
186
187 my $row;
188
189 #print "field $field [",$self->{'tag'},"] = ",Dumper($self->{'import_xml'}->{'indexer'}->{$field}->{$self->{'tag'}});
190
191 foreach my $tag (@{$self->{'import_xml'}->{'indexer'}->{$field}->{$self->{'tag'}}}) {
192 my $format;
193
194 $log->logdie("expected tag HASH and got $tag") unless (ref($tag) eq 'HASH');
195 $format = $tag->{'value'} || $tag->{'content'};
196
197 my @v;
198 if ($self->{'lookup_regex'} && $format =~ $self->{'lookup_regex'}) {
199 @v = $self->fill_in_to_arr($rec,$format);
200 } else {
201 @v = $self->parse_to_arr($rec,$format);
202 }
203 if (! @v) {
204 $log->debug("$field <",$self->{tag},"> format: $format no values");
205 # next;
206 } else {
207 $log->debug("$field <",$self->{tag},"> format: $format values: ", join(",", @v));
208 }
209
210 if ($tag->{'sort'}) {
211 @v = $self->sort_arr(@v);
212 }
213
214 # use format?
215 if ($tag->{'format_name'}) {
216 @v = map { $self->apply_format($tag->{'format_name'},$tag->{'format_delimiter'},$_) } @v;
217 }
218
219 # delimiter will join repeatable fields
220 if ($tag->{'delimiter'}) {
221 @v = ( join($tag->{'delimiter'}, @v) );
222 }
223
224 # default types
225 my @types = qw(display search);
226 # override by type attribute
227 @types = ( $tag->{'type'} ) if ($tag->{'type'});
228
229 foreach my $type (@types) {
230 # append to previous line?
231 $log->debug("tag $field / $type [",sub { join(",",@v) }, "] ", $row->{'append'} || 'no append');
232 if ($tag->{'append'}) {
233
234 # I will delimit appended part with
235 # delimiter (or ,)
236 my $d = $tag->{'delimiter'};
237 # default delimiter
238 $d ||= " ";
239
240 my $last = pop @{$row->{$type}};
241 $d = "" if (! $last);
242 $last .= $d . join($d, @v);
243 push @{$row->{$type}}, $last;
244
245 } else {
246 push @{$row->{$type}}, @v;
247 }
248 }
249
250
251 }
252
253 if ($row) {
254 $row->{'tag'} = $field;
255
256 # TODO: name_sigular, name_plural
257 my $name = $self->{'import_xml'}->{'indexer'}->{$field}->{'name'};
258 my $row_name = $name ? $self->_x($name) : $field;
259
260 # post-sort all values in field
261 if ($self->{'import_xml'}->{'indexer'}->{$field}->{'sort'}) {
262 $log->warn("sort at field tag not implemented");
263 }
264
265 $ds->{$row_name} = $row;
266
267 $log->debug("row $field: ",sub { Dumper($row) });
268 }
269
270 }
271
272 $self->{'db'}->save_ds(
273 id => $id,
274 ds => $ds,
275 prefix => $self->{prefix},
276 ) if ($self->{'db'});
277
278 $log->debug("ds: ", sub { Dumper($ds) });
279
280 $log->logconfess("data structure returned is not array any more!") if wantarray;
281
282 return $ds;
283
284 }
285
286 =head2 parse
287
288 Perform smart parsing of string, skipping delimiters for fields which aren't
289 defined. It can also eval code in format starting with C<eval{...}> and
290 return output or nothing depending on eval code.
291
292 my $text = $webpac->parse($rec,'eval{"v901^a" eq "Deskriptor"}descriptor: v250^a', $i);
293
294 Filters are implemented here. While simple form of filters looks like this:
295
296 filter{name_of_filter}
297
298 but, filters can also have variable number of parametars like this:
299
300 filter{name_of_filter(param,param,param)}
301
302 =cut
303
304 my $warn_once;
305
306 sub parse {
307 my $self = shift;
308
309 my ($rec, $format_utf8, $i) = @_;
310
311 return if (! $format_utf8);
312
313 my $log = $self->_get_logger();
314
315 $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
316
317 $i = 0 if (! $i);
318
319 my $format = $self->_x($format_utf8) || $log->logconfess("can't convert '$format_utf8' from UTF-8 to ",$self->{'code_page'});
320
321 my @out;
322
323 $log->debug("format: $format [$i]");
324
325 my $eval_code;
326 # remove eval{...} from beginning
327 $eval_code = $1 if ($format =~ s/^eval{([^}]+)}//s);
328
329 my $filter_name;
330 # remove filter{...} from beginning
331 $filter_name = $1 if ($format =~ s/^filter{([^}]+)}//s);
332
333 # did we found any (att all) field from format in row?
334 my $found_any;
335 # prefix before first field which we preserve it $found_any
336 my $prefix;
337
338 my $f_step = 1;
339
340 while ($format =~ s/^(.*?)(v|s)(\d+)(?:\^(\w))?//s) {
341
342 my $del = $1 || '';
343 $prefix = $del if ($f_step == 1);
344
345 my $fld_type = lc($2);
346
347 # repeatable index
348 my $r = $i;
349 if ($fld_type eq 's') {
350 if ($found_any->{'v'}) {
351 $r = 0;
352 } else {
353 return;
354 }
355 }
356
357 my $found = 0;
358 my $tmp = $self->get_data(\$rec,$3,$4,$r,\$found);
359
360 if ($found) {
361 $found_any->{$fld_type} += $found;
362
363 # we will skip delimiter before first occurence of field!
364 push @out, $del unless($found_any->{$fld_type} == 1);
365 push @out, $tmp;
366 }
367 $f_step++;
368 }
369
370 # test if any fields found?
371 return if (! $found_any->{'v'} && ! $found_any->{'s'});
372
373 my $out = join('',@out);
374
375 if ($out) {
376 # add rest of format (suffix)
377 $out .= $format;
378
379 # add prefix if not there
380 $out = $prefix . $out if ($out !~ m/^\Q$prefix\E/);
381
382 $log->debug("result: $out");
383 }
384
385 if ($eval_code) {
386 my $eval = $self->fill_in($rec,$eval_code,$i) || return;
387 $log->debug("about to eval{$eval} format: $out");
388 return if (! $self->_eval($eval));
389 }
390
391 if ($filter_name) {
392 my @filter_args;
393 if ($filter_name =~ s/(\w+)\((.*)\)/$1/) {
394 @filter_args = split(/,/, $2);
395 }
396 if ($self->{'filter'}->{$filter_name}) {
397 $log->debug("about to filter{$filter_name} format: $out with arguments: ", join(",", @filter_args));
398 unshift @filter_args, $out;
399 $out = $self->{'filter'}->{$filter_name}->(@filter_args);
400 return unless(defined($out));
401 $log->debug("filter result: $out");
402 } elsif (! $warn_once->{$filter_name}) {
403 $log->warn("trying to use undefined filter $filter_name");
404 $warn_once->{$filter_name}++;
405 }
406 }
407
408 return $out;
409 }
410
411 =head2 parse_to_arr
412
413 Similar to C<parse>, but returns array of all repeatable fields
414
415 my @arr = $webpac->parse_to_arr($rec,'v250^a');
416
417 =cut
418
419 sub parse_to_arr {
420 my $self = shift;
421
422 my ($rec, $format_utf8) = @_;
423
424 my $log = $self->_get_logger();
425
426 $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
427 return if (! $format_utf8);
428
429 my $i = 0;
430 my @arr;
431
432 while (my $v = $self->parse($rec,$format_utf8,$i++)) {
433 push @arr, $v;
434 }
435
436 $log->debug("format '$format_utf8' returned ",--$i," elements: ", sub { join(" | ",@arr) }) if (@arr);
437
438 return @arr;
439 }
440
441
442 =head2 fill_in
443
444 Workhourse of all: takes record from in-memory structure of database and
445 strings with placeholders and returns string or array of with substituted
446 values from record.
447
448 my $text = $webpac->fill_in($rec,'v250^a');
449
450 Optional argument is ordinal number for repeatable fields. By default,
451 it's assume to be first repeatable field (fields are perl array, so first
452 element is 0).
453 Following example will read second value from repeatable field.
454
455 my $text = $webpac->fill_in($rec,'Title: v250^a',1);
456
457 This function B<does not> perform parsing of format to inteligenty skip
458 delimiters before fields which aren't used.
459
460 This method will automatically decode UTF-8 string to local code page
461 if needed.
462
463 =cut
464
465 sub fill_in {
466 my $self = shift;
467
468 my $log = $self->_get_logger();
469
470 my $rec = shift || $log->logconfess("need data record");
471 my $format = shift || $log->logconfess("need format to parse");
472 # iteration (for repeatable fields)
473 my $i = shift || 0;
474
475 $log->logdie("infitite loop in format $format") if ($i > ($self->{'max_mfn'} || 9999));
476
477 # FIXME remove for speedup?
478 $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
479
480 if (utf8::is_utf8($format)) {
481 $format = $self->_x($format);
482 }
483
484 my $found = 0;
485 my $just_single = 1;
486
487 my $eval_code;
488 # remove eval{...} from beginning
489 $eval_code = $1 if ($format =~ s/^eval{([^}]+)}//s);
490
491 my $filter_name;
492 # remove filter{...} from beginning
493 $filter_name = $1 if ($format =~ s/^filter{([^}]+)}//s);
494
495 # do actual replacement of placeholders
496 # repeatable fields
497 if ($format =~ s/v(\d+)(?:\^(\w))?/$self->get_data(\$rec,$1,$2,$i,\$found)/ges) {
498 $just_single = 0;
499 }
500
501 # non-repeatable fields
502 if ($format =~ s/s(\d+)(?:\^(\w))?/$self->get_data(\$rec,$1,$2,0,\$found)/ges) {
503 return if ($i > 0 && $just_single);
504 }
505
506 if ($found) {
507 $log->debug("format: $format");
508 if ($eval_code) {
509 my $eval = $self->fill_in($rec,$eval_code,$i);
510 return if (! $self->_eval($eval));
511 }
512 if ($filter_name && $self->{'filter'}->{$filter_name}) {
513 $log->debug("filter '$filter_name' for $format");
514 $format = $self->{'filter'}->{$filter_name}->($format);
515 return unless(defined($format));
516 $log->debug("filter result: $format");
517 }
518 # do we have lookups?
519 if ($self->{'lookup'}) {
520 if ($self->{'lookup'}->can('lookup')) {
521 my @lookup = $self->{lookup}->lookup($format);
522 $log->debug("lookup $format", join(", ", @lookup));
523 return @lookup;
524 } else {
525 $log->warn("Have lookup object but can't invoke lookup method");
526 }
527 } else {
528 return $format;
529 }
530 } else {
531 return;
532 }
533 }
534
535
536 =head2 fill_in_to_arr
537
538 Similar to C<fill_in>, but returns array of all repeatable fields. Usable
539 for fields which have lookups, so they shouldn't be parsed but rather
540 C<fill_id>ed.
541
542 my @arr = $webpac->fill_in_to_arr($rec,'[v900];;[v250^a]');
543
544 =cut
545
546 sub fill_in_to_arr {
547 my $self = shift;
548
549 my ($rec, $format_utf8) = @_;
550
551 my $log = $self->_get_logger();
552
553 $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
554 return if (! $format_utf8);
555
556 my $i = 0;
557 my @arr;
558
559 while (my $v = $self->fill_in($rec,$format_utf8,$i++)) {
560 push @arr, $v;
561 }
562
563 $log->debug("format '$format_utf8' returned ",--$i," elements: ", sub { join(" | ",@arr) }) if (@arr);
564
565 return @arr;
566 }
567
568
569 =head2 get_data
570
571 Returns value from record.
572
573 my $text = $self->get_data(\$rec,$f,$sf,$i,\$found);
574
575 Arguments are:
576 record reference C<$rec>,
577 field C<$f>,
578 optional subfiled C<$sf>,
579 index for repeatable values C<$i>.
580
581 Optinal variable C<$found> will be incremeted if there
582 is field.
583
584 Returns value or empty string.
585
586 =cut
587
588 sub get_data {
589 my $self = shift;
590
591 my ($rec,$f,$sf,$i,$found) = @_;
592
593 if ($$rec->{$f}) {
594 return '' if (! $$rec->{$f}->[$i]);
595 no strict 'refs';
596 if ($sf && $$rec->{$f}->[$i]->{$sf}) {
597 $$found++ if (defined($$found));
598 return $$rec->{$f}->[$i]->{$sf};
599 } elsif (! $sf && $$rec->{$f}->[$i]) {
600 $$found++ if (defined($$found));
601 # it still might have subfield, just
602 # not specified, so we'll dump all
603 if ($$rec->{$f}->[$i] =~ /HASH/o) {
604 my $out;
605 foreach my $k (keys %{$$rec->{$f}->[$i]}) {
606 my $v = $$rec->{$f}->[$i]->{$k};
607 $out .= "$v " if ($v);
608 }
609 return $out;
610 } else {
611 return $$rec->{$f}->[$i];
612 }
613 } else {
614 return '';
615 }
616 } else {
617 return '';
618 }
619 }
620
621
622 =head2 apply_format
623
624 Apply format specified in tag with C<format_name="name"> and
625 C<format_delimiter=";;">.
626
627 my $text = $webpac->apply_format($format_name,$format_delimiter,$data);
628
629 Formats can contain C<lookup{...}> if you need them.
630
631 =cut
632
633 sub apply_format {
634 my $self = shift;
635
636 my ($name,$delimiter,$data) = @_;
637
638 my $log = $self->_get_logger();
639
640 if (! $self->{'import_xml'}->{'format'}->{$name}) {
641 $log->warn("<format name=\"$name\"> is not defined in ",$self->{'import_xml_file'});
642 return $data;
643 }
644
645 $log->warn("no delimiter for format $name") if (! $delimiter);
646
647 my $format = $self->_x($self->{'import_xml'}->{'format'}->{$name}->{'content'}) || $log->logdie("can't find format '$name'");
648
649 my @data = split(/\Q$delimiter\E/, $data);
650
651 my $out = sprintf($format, @data);
652 $log->debug("using format $name [$format] on $data to produce: $out");
653
654 if ($self->{'lookup_regex'} && $out =~ $self->{'lookup_regex'}) {
655 return $self->{'lookup'}->lookup($out);
656 } else {
657 return $out;
658 }
659
660 }
661
662 =head2 sort_arr
663
664 Sort array ignoring case and html in data
665
666 my @sorted = $webpac->sort_arr(@unsorted);
667
668 =cut
669
670 sub sort_arr {
671 my $self = shift;
672
673 my $log = $self->_get_logger();
674
675 # FIXME add Schwartzian Transformation?
676
677 my @sorted = sort {
678 $a =~ s#<[^>]+/*>##;
679 $b =~ s#<[^>]+/*>##;
680 lc($b) cmp lc($a)
681 } @_;
682 $log->debug("sorted values: ",sub { join(", ",@sorted) });
683
684 return @sorted;
685 }
686
687
688 =head1 INTERNAL METHODS
689
690 =head2 _sort_by_order
691
692 Sort xml tags data structure accoding to C<order=""> attribute.
693
694 =cut
695
696 sub _sort_by_order {
697 my $self = shift;
698
699 my $va = $self->{'import_xml'}->{'indexer'}->{$a}->{'order'} ||
700 $self->{'import_xml'}->{'indexer'}->{$a};
701 my $vb = $self->{'import_xml'}->{'indexer'}->{$b}->{'order'} ||
702 $self->{'import_xml'}->{'indexer'}->{$b};
703
704 return $va <=> $vb;
705 }
706
707 =head2 _x
708
709 Convert strings from C<conf/normalize/*.xml> encoding into application
710 specific encoding (optinally specified using C<code_page> to C<new>
711 constructor).
712
713 my $text = $n->_x('normalize text string');
714
715 This is a stub so that other modules doesn't have to implement it.
716
717 =cut
718
719 sub _x {
720 my $self = shift;
721 return shift;
722 }
723
724
725 =head1 AUTHOR
726
727 Dobrica Pavlinusic, C<< <dpavlin@rot13.org> >>
728
729 =head1 COPYRIGHT & LICENSE
730
731 Copyright 2005 Dobrica Pavlinusic, All Rights Reserved.
732
733 This program is free software; you can redistribute it and/or modify it
734 under the same terms as Perl itself.
735
736 =cut
737
738 1; # End of WebPAC::Normalize

  ViewVC Help
Powered by ViewVC 1.1.26