/[webpac2]/trunk/lib/WebPAC/Normalize.pm
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Contents of /trunk/lib/WebPAC/Normalize.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 344 - (show annotations)
Sat Jan 7 02:05:55 2006 UTC (17 years ago) by dpavlin
File size: 16958 byte(s)
 r356@llin:  dpavlin | 2006-01-07 01:05:14 +0100
 fix failing test

1 package WebPAC::Normalize;
2
3 use warnings;
4 use strict;
5 use base 'WebPAC::Common';
6 use Data::Dumper;
7
8 =head1 NAME
9
10 WebPAC::Normalize - data mungling for normalisation
11
12 =head1 VERSION
13
14 Version 0.08
15
16 =cut
17
18 our $VERSION = '0.08';
19
20 =head1 SYNOPSIS
21
22 This package contains code that mungle data to produce normalized format.
23
24 It contains several assumptions:
25
26 =over
27
28 =item *
29
30 format of fields is defined using C<v123^a> notation for repeatable fields
31 or C<s123^a> for single (or first) value, where C<123> is field number and
32 C<a> is subfield.
33
34 =item *
35
36 source data records (C<$rec>) have unique identifiers in field C<000>
37
38 =item *
39
40 optional C<eval{length('v123^a') == 3}> tag at B<beginning of format> will be
41 perl code that is evaluated before producing output (value of field will be
42 interpolated before that)
43
44 =item *
45
46 optional C<filter{filter_name}> at B<begining of format> will apply perl
47 code defined as code ref on format after field substitution to producing
48 output
49
50 There is one built-in filter called C<regex> which can be use like this:
51
52 filter{regex(s/foo/bar/)}
53
54 =item *
55
56 optional C<lookup{...}> will be then performed. See C<WebPAC::Lookups>.
57
58 =item *
59
60 at end, optional C<format>s rules are resolved. Format rules are similar to
61 C<sprintf> and can also contain C<lookup{...}> which is performed after
62 values are inserted in format.
63
64 =back
65
66 This also describes order in which transformations are applied (eval,
67 filter, lookup, format) which is important to undestand when deciding how to
68 solve your data mungling and normalisation process.
69
70
71
72
73 =head1 FUNCTIONS
74
75 =head2 new
76
77 Create new normalisation object
78
79 my $n = new WebPAC::Normalize::Something(
80 filter => {
81 'filter_name_1' => sub {
82 # filter code
83 return length($_);
84 }, ...
85 },
86 db => $db_obj,
87 lookup_regex => $lookup->regex,
88 lookup => $lookup_obj,
89 prefix => 'foobar',
90 );
91
92 Parametar C<filter> defines user supplied snippets of perl code which can
93 be use with C<filter{...}> notation.
94
95 C<prefix> is used to form filename for database record (to support multiple
96 source files which are joined in one database).
97
98 Recommended parametar C<lookup_regex> is used to enable parsing of lookups
99 in structures. If you pass this parametar, you must also pass C<lookup>
100 which is C<WebPAC::Lookup> object.
101
102 =cut
103
104 sub new {
105 my $class = shift;
106 my $self = {@_};
107 bless($self, $class);
108
109 my $r = $self->{'lookup_regex'} ? 1 : 0;
110 my $l = $self->{'lookup'} ? 1 : 0;
111
112 my $log = $self->_get_logger();
113
114 # those two must be in pair
115 if ( ($r & $l) != ($r || $l) ) {
116 my $log = $self->_get_logger();
117 $log->logdie("lookup_regex and lookup must be in pair");
118 }
119
120 $log->logdie("lookup must be WebPAC::Lookup object") if ($self->{'lookup'} && ! $self->{'lookup'}->isa('WebPAC::Lookup'));
121
122 $log->warn("no prefix defined. please check that!") unless ($self->{'prefix'});
123
124 $log->debug("using lookup regex: ", $self->{lookup_regex}) if ($r && $l);
125
126 if (! $self->{filter} || ! $self->{filter}->{regex}) {
127 $log->debug("adding built-in filter regex");
128 $self->{filter}->{regex} = sub {
129 my ($val, $regex) = @_;
130 eval "\$val =~ $regex";
131 return $val;
132 };
133 }
134
135 $self ? return $self : return undef;
136 }
137
138
139 =head2 data_structure
140
141 Create in-memory data structure which represents normalized layout from
142 C<conf/normalize/*.xml>.
143
144 This structures are used to produce output.
145
146 my $ds = $webpac->data_structure($rec);
147
148 =cut
149
150 sub data_structure {
151 my $self = shift;
152
153 my $log = $self->_get_logger();
154
155 my $rec = shift;
156 $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
157
158 $log->debug("data_structure rec = ", sub { Dumper($rec) });
159
160 $log->logdie("need unique ID (mfn) in field 000 of record " . Dumper($rec) ) unless (defined($rec->{'000'}));
161
162 my $id = $rec->{'000'}->[0] || $log->logdie("field 000 isn't array!");
163
164 my $cache_file;
165
166 if ($self->{'db'}) {
167 my $ds = $self->{'db'}->load_ds( id => $id, prefix => $self->{prefix} );
168 $log->debug("load_ds( rec = ", sub { Dumper($rec) }, ") = ", sub { Dumper($ds) });
169 return $ds if ($ds);
170 $log->debug("cache miss, creating");
171 }
172
173 my @sorted_tags;
174 if ($self->{tags_by_order}) {
175 @sorted_tags = @{$self->{tags_by_order}};
176 } else {
177 @sorted_tags = sort { $self->_sort_by_order } keys %{$self->{'import_xml'}->{'indexer'}};
178 $self->{tags_by_order} = \@sorted_tags;
179 }
180
181 my $ds;
182
183 $log->debug("tags: ",sub { join(", ",@sorted_tags) });
184
185 foreach my $field (@sorted_tags) {
186
187 my $row;
188
189 #print "field $field [",$self->{'tag'},"] = ",Dumper($self->{'import_xml'}->{'indexer'}->{$field}->{$self->{'tag'}});
190
191 foreach my $tag (@{$self->{'import_xml'}->{'indexer'}->{$field}->{$self->{'tag'}}}) {
192 my $format;
193
194 $log->logdie("expected tag HASH and got $tag") unless (ref($tag) eq 'HASH');
195 $format = $tag->{'value'} || $tag->{'content'};
196
197 $log->debug("format: $format");
198
199 my @v;
200 if ($self->{'lookup_regex'} && $format =~ $self->{'lookup_regex'}) {
201 @v = $self->fill_in_to_arr($rec,$format);
202 } else {
203 @v = $self->parse_to_arr($rec,$format);
204 }
205 next if (! @v);
206
207 if ($tag->{'sort'}) {
208 @v = $self->sort_arr(@v);
209 }
210
211 # use format?
212 if ($tag->{'format_name'}) {
213 @v = map { $self->apply_format($tag->{'format_name'},$tag->{'format_delimiter'},$_) } @v;
214 }
215
216 # delimiter will join repeatable fields
217 if ($tag->{'delimiter'}) {
218 @v = ( join($tag->{'delimiter'}, @v) );
219 }
220
221 # default types
222 my @types = qw(display search);
223 # override by type attribute
224 @types = ( $tag->{'type'} ) if ($tag->{'type'});
225
226 foreach my $type (@types) {
227 # append to previous line?
228 $log->debug("type: $type ",sub { join(" ",@v) }, " ", $row->{'append'} || 'no append');
229 if ($tag->{'append'}) {
230
231 # I will delimit appended part with
232 # delimiter (or ,)
233 my $d = $tag->{'delimiter'};
234 # default delimiter
235 $d ||= " ";
236
237 my $last = pop @{$row->{$type}};
238 $d = "" if (! $last);
239 $last .= $d . join($d, @v);
240 push @{$row->{$type}}, $last;
241
242 } else {
243 push @{$row->{$type}}, @v;
244 }
245 }
246
247
248 }
249
250 if ($row) {
251 $row->{'tag'} = $field;
252
253 # TODO: name_sigular, name_plural
254 my $name = $self->{'import_xml'}->{'indexer'}->{$field}->{'name'};
255 my $row_name = $name ? $self->_x($name) : $field;
256
257 # post-sort all values in field
258 if ($self->{'import_xml'}->{'indexer'}->{$field}->{'sort'}) {
259 $log->warn("sort at field tag not implemented");
260 }
261
262 $ds->{$row_name} = $row;
263
264 $log->debug("row $field: ",sub { Dumper($row) });
265 }
266
267 }
268
269 $self->{'db'}->save_ds(
270 id => $id,
271 ds => $ds,
272 prefix => $self->{prefix},
273 ) if ($self->{'db'});
274
275 $log->debug("ds: ", sub { Dumper($ds) });
276
277 $log->logconfess("data structure returned is not array any more!") if wantarray;
278
279 return $ds;
280
281 }
282
283 =head2 parse
284
285 Perform smart parsing of string, skipping delimiters for fields which aren't
286 defined. It can also eval code in format starting with C<eval{...}> and
287 return output or nothing depending on eval code.
288
289 my $text = $webpac->parse($rec,'eval{"v901^a" eq "Deskriptor"}descriptor: v250^a', $i);
290
291 Filters are implemented here. While simple form of filters looks like this:
292
293 filter{name_of_filter}
294
295 but, filters can also have variable number of parametars like this:
296
297 filter{name_of_filter(param,param,param)}
298
299 =cut
300
301 my $warn_once;
302
303 sub parse {
304 my $self = shift;
305
306 my ($rec, $format_utf8, $i) = @_;
307
308 return if (! $format_utf8);
309
310 my $log = $self->_get_logger();
311
312 $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
313
314 $i = 0 if (! $i);
315
316 my $format = $self->_x($format_utf8) || $log->logconfess("can't convert '$format_utf8' from UTF-8 to ",$self->{'code_page'});
317
318 my @out;
319
320 $log->debug("format: $format [$i]");
321
322 my $eval_code;
323 # remove eval{...} from beginning
324 $eval_code = $1 if ($format =~ s/^eval{([^}]+)}//s);
325
326 my $filter_name;
327 # remove filter{...} from beginning
328 $filter_name = $1 if ($format =~ s/^filter{([^}]+)}//s);
329
330 # did we found any (att all) field from format in row?
331 my $found_any;
332 # prefix before first field which we preserve it $found_any
333 my $prefix;
334
335 my $f_step = 1;
336
337 while ($format =~ s/^(.*?)(v|s)(\d+)(?:\^(\w))?//s) {
338
339 my $del = $1 || '';
340 $prefix = $del if ($f_step == 1);
341
342 my $fld_type = lc($2);
343
344 # repeatable index
345 my $r = $i;
346 if ($fld_type eq 's') {
347 if ($found_any->{'v'}) {
348 $r = 0;
349 } else {
350 return;
351 }
352 }
353
354 my $found = 0;
355 my $tmp = $self->get_data(\$rec,$3,$4,$r,\$found);
356
357 if ($found) {
358 $found_any->{$fld_type} += $found;
359
360 # we will skip delimiter before first occurence of field!
361 push @out, $del unless($found_any->{$fld_type} == 1);
362 push @out, $tmp;
363 }
364 $f_step++;
365 }
366
367 # test if any fields found?
368 return if (! $found_any->{'v'} && ! $found_any->{'s'});
369
370 my $out = join('',@out);
371
372 if ($out) {
373 # add rest of format (suffix)
374 $out .= $format;
375
376 # add prefix if not there
377 $out = $prefix . $out if ($out !~ m/^\Q$prefix\E/);
378
379 $log->debug("result: $out");
380 }
381
382 if ($eval_code) {
383 my $eval = $self->fill_in($rec,$eval_code,$i) || return;
384 $log->debug("about to eval{$eval} format: $out");
385 return if (! $self->_eval($eval));
386 }
387
388 if ($filter_name) {
389 my @filter_args;
390 if ($filter_name =~ s/(\w+)\((.*)\)/$1/) {
391 @filter_args = split(/,/, $2);
392 }
393 if ($self->{'filter'}->{$filter_name}) {
394 $log->debug("about to filter{$filter_name} format: $out with arguments: ", join(",", @filter_args));
395 unshift @filter_args, $out;
396 $out = $self->{'filter'}->{$filter_name}->(@filter_args);
397 return unless(defined($out));
398 $log->debug("filter result: $out");
399 } elsif (! $warn_once->{$filter_name}) {
400 $log->warn("trying to use undefined filter $filter_name");
401 $warn_once->{$filter_name}++;
402 }
403 }
404
405 return $out;
406 }
407
408 =head2 parse_to_arr
409
410 Similar to C<parse>, but returns array of all repeatable fields
411
412 my @arr = $webpac->parse_to_arr($rec,'v250^a');
413
414 =cut
415
416 sub parse_to_arr {
417 my $self = shift;
418
419 my ($rec, $format_utf8) = @_;
420
421 my $log = $self->_get_logger();
422
423 $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
424 return if (! $format_utf8);
425
426 my $i = 0;
427 my @arr;
428
429 while (my $v = $self->parse($rec,$format_utf8,$i++)) {
430 push @arr, $v;
431 }
432
433 $log->debug("format '$format_utf8' returned ",--$i," elements: ", sub { join(" | ",@arr) }) if (@arr);
434
435 return @arr;
436 }
437
438
439 =head2 fill_in
440
441 Workhourse of all: takes record from in-memory structure of database and
442 strings with placeholders and returns string or array of with substituted
443 values from record.
444
445 my $text = $webpac->fill_in($rec,'v250^a');
446
447 Optional argument is ordinal number for repeatable fields. By default,
448 it's assume to be first repeatable field (fields are perl array, so first
449 element is 0).
450 Following example will read second value from repeatable field.
451
452 my $text = $webpac->fill_in($rec,'Title: v250^a',1);
453
454 This function B<does not> perform parsing of format to inteligenty skip
455 delimiters before fields which aren't used.
456
457 This method will automatically decode UTF-8 string to local code page
458 if needed.
459
460 =cut
461
462 sub fill_in {
463 my $self = shift;
464
465 my $log = $self->_get_logger();
466
467 my $rec = shift || $log->logconfess("need data record");
468 my $format = shift || $log->logconfess("need format to parse");
469 # iteration (for repeatable fields)
470 my $i = shift || 0;
471
472 $log->logdie("infitite loop in format $format") if ($i > ($self->{'max_mfn'} || 9999));
473
474 # FIXME remove for speedup?
475 $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
476
477 if (utf8::is_utf8($format)) {
478 $format = $self->_x($format);
479 }
480
481 my $found = 0;
482 my $just_single = 1;
483
484 my $eval_code;
485 # remove eval{...} from beginning
486 $eval_code = $1 if ($format =~ s/^eval{([^}]+)}//s);
487
488 my $filter_name;
489 # remove filter{...} from beginning
490 $filter_name = $1 if ($format =~ s/^filter{([^}]+)}//s);
491
492 # do actual replacement of placeholders
493 # repeatable fields
494 if ($format =~ s/v(\d+)(?:\^(\w))?/$self->get_data(\$rec,$1,$2,$i,\$found)/ges) {
495 $just_single = 0;
496 }
497
498 # non-repeatable fields
499 if ($format =~ s/s(\d+)(?:\^(\w))?/$self->get_data(\$rec,$1,$2,0,\$found)/ges) {
500 return if ($i > 0 && $just_single);
501 }
502
503 if ($found) {
504 $log->debug("format: $format");
505 if ($eval_code) {
506 my $eval = $self->fill_in($rec,$eval_code,$i);
507 return if (! $self->_eval($eval));
508 }
509 if ($filter_name && $self->{'filter'}->{$filter_name}) {
510 $log->debug("filter '$filter_name' for $format");
511 $format = $self->{'filter'}->{$filter_name}->($format);
512 return unless(defined($format));
513 $log->debug("filter result: $format");
514 }
515 # do we have lookups?
516 if ($self->{'lookup'}) {
517 if ($self->{'lookup'}->can('lookup')) {
518 my @lookup = $self->{lookup}->lookup($format);
519 $log->debug("lookup $format", join(", ", @lookup));
520 return @lookup;
521 } else {
522 $log->warn("Have lookup object but can't invoke lookup method");
523 }
524 } else {
525 return $format;
526 }
527 } else {
528 return;
529 }
530 }
531
532
533 =head2 fill_in_to_arr
534
535 Similar to C<fill_in>, but returns array of all repeatable fields. Usable
536 for fields which have lookups, so they shouldn't be parsed but rather
537 C<fill_id>ed.
538
539 my @arr = $webpac->fill_in_to_arr($rec,'[v900];;[v250^a]');
540
541 =cut
542
543 sub fill_in_to_arr {
544 my $self = shift;
545
546 my ($rec, $format_utf8) = @_;
547
548 my $log = $self->_get_logger();
549
550 $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
551 return if (! $format_utf8);
552
553 my $i = 0;
554 my @arr;
555
556 while (my @v = $self->fill_in($rec,$format_utf8,$i++)) {
557 push @arr, @v;
558 }
559
560 $log->debug("format '$format_utf8' returned ",--$i," elements: ", sub { join(" | ",@arr) }) if (@arr);
561
562 return @arr;
563 }
564
565
566 =head2 get_data
567
568 Returns value from record.
569
570 my $text = $self->get_data(\$rec,$f,$sf,$i,\$found);
571
572 Arguments are:
573 record reference C<$rec>,
574 field C<$f>,
575 optional subfiled C<$sf>,
576 index for repeatable values C<$i>.
577
578 Optinal variable C<$found> will be incremeted if there
579 is field.
580
581 Returns value or empty string.
582
583 =cut
584
585 sub get_data {
586 my $self = shift;
587
588 my ($rec,$f,$sf,$i,$found) = @_;
589
590 if ($$rec->{$f}) {
591 return '' if (! $$rec->{$f}->[$i]);
592 no strict 'refs';
593 if ($sf && $$rec->{$f}->[$i]->{$sf}) {
594 $$found++ if (defined($$found));
595 return $$rec->{$f}->[$i]->{$sf};
596 } elsif (! $sf && $$rec->{$f}->[$i]) {
597 $$found++ if (defined($$found));
598 # it still might have subfield, just
599 # not specified, so we'll dump all
600 if ($$rec->{$f}->[$i] =~ /HASH/o) {
601 my $out;
602 foreach my $k (keys %{$$rec->{$f}->[$i]}) {
603 $out .= $$rec->{$f}->[$i]->{$k}." ";
604 }
605 return $out;
606 } else {
607 return $$rec->{$f}->[$i];
608 }
609 } else {
610 return '';
611 }
612 } else {
613 return '';
614 }
615 }
616
617
618 =head2 apply_format
619
620 Apply format specified in tag with C<format_name="name"> and
621 C<format_delimiter=";;">.
622
623 my $text = $webpac->apply_format($format_name,$format_delimiter,$data);
624
625 Formats can contain C<lookup{...}> if you need them.
626
627 =cut
628
629 sub apply_format {
630 my $self = shift;
631
632 my ($name,$delimiter,$data) = @_;
633
634 my $log = $self->_get_logger();
635
636 if (! $self->{'import_xml'}->{'format'}->{$name}) {
637 $log->warn("<format name=\"$name\"> is not defined in ",$self->{'import_xml_file'});
638 return $data;
639 }
640
641 $log->warn("no delimiter for format $name") if (! $delimiter);
642
643 my $format = $self->_x($self->{'import_xml'}->{'format'}->{$name}->{'content'}) || $log->logdie("can't find format '$name'");
644
645 my @data = split(/\Q$delimiter\E/, $data);
646
647 my $out = sprintf($format, @data);
648 $log->debug("using format $name [$format] on $data to produce: $out");
649
650 if ($self->{'lookup_regex'} && $out =~ $self->{'lookup_regex'}) {
651 return $self->{'lookup'}->lookup($out);
652 } else {
653 return $out;
654 }
655
656 }
657
658 =head2 sort_arr
659
660 Sort array ignoring case and html in data
661
662 my @sorted = $webpac->sort_arr(@unsorted);
663
664 =cut
665
666 sub sort_arr {
667 my $self = shift;
668
669 my $log = $self->_get_logger();
670
671 # FIXME add Schwartzian Transformation?
672
673 my @sorted = sort {
674 $a =~ s#<[^>]+/*>##;
675 $b =~ s#<[^>]+/*>##;
676 lc($b) cmp lc($a)
677 } @_;
678 $log->debug("sorted values: ",sub { join(", ",@sorted) });
679
680 return @sorted;
681 }
682
683
684 =head1 INTERNAL METHODS
685
686 =head2 _sort_by_order
687
688 Sort xml tags data structure accoding to C<order=""> attribute.
689
690 =cut
691
692 sub _sort_by_order {
693 my $self = shift;
694
695 my $va = $self->{'import_xml'}->{'indexer'}->{$a}->{'order'} ||
696 $self->{'import_xml'}->{'indexer'}->{$a};
697 my $vb = $self->{'import_xml'}->{'indexer'}->{$b}->{'order'} ||
698 $self->{'import_xml'}->{'indexer'}->{$b};
699
700 return $va <=> $vb;
701 }
702
703 =head2 _x
704
705 Convert strings from C<conf/normalize/*.xml> encoding into application
706 specific encoding (optinally specified using C<code_page> to C<new>
707 constructor).
708
709 my $text = $n->_x('normalize text string');
710
711 This is a stub so that other modules doesn't have to implement it.
712
713 =cut
714
715 sub _x {
716 my $self = shift;
717 return shift;
718 }
719
720
721 =head1 AUTHOR
722
723 Dobrica Pavlinusic, C<< <dpavlin@rot13.org> >>
724
725 =head1 COPYRIGHT & LICENSE
726
727 Copyright 2005 Dobrica Pavlinusic, All Rights Reserved.
728
729 This program is free software; you can redistribute it and/or modify it
730 under the same terms as Perl itself.
731
732 =cut
733
734 1; # End of WebPAC::Normalize

  ViewVC Help
Powered by ViewVC 1.1.26