/[webpac2]/trunk/lib/WebPAC/Normalize.pm
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Contents of /trunk/lib/WebPAC/Normalize.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 333 - (show annotations)
Sat Dec 31 13:42:11 2005 UTC (18 years, 3 months ago) by dpavlin
File size: 16778 byte(s)
try to fix infinite loop (not working)

1 package WebPAC::Normalize;
2
3 use warnings;
4 use strict;
5 use base 'WebPAC::Common';
6 use Data::Dumper;
7
8 =head1 NAME
9
10 WebPAC::Normalize - data mungling for normalisation
11
12 =head1 VERSION
13
14 Version 0.08
15
16 =cut
17
18 our $VERSION = '0.08';
19
20 =head1 SYNOPSIS
21
22 This package contains code that mungle data to produce normalized format.
23
24 It contains several assumptions:
25
26 =over
27
28 =item *
29
30 format of fields is defined using C<v123^a> notation for repeatable fields
31 or C<s123^a> for single (or first) value, where C<123> is field number and
32 C<a> is subfield.
33
34 =item *
35
36 source data records (C<$rec>) have unique identifiers in field C<000>
37
38 =item *
39
40 optional C<eval{length('v123^a') == 3}> tag at B<beginning of format> will be
41 perl code that is evaluated before producing output (value of field will be
42 interpolated before that)
43
44 =item *
45
46 optional C<filter{filter_name}> at B<begining of format> will apply perl
47 code defined as code ref on format after field substitution to producing
48 output
49
50 There is one built-in filter called C<regex> which can be use like this:
51
52 filter{regex(s/foo/bar/)}
53
54 =item *
55
56 optional C<lookup{...}> will be then performed. See C<WebPAC::Lookups>.
57
58 =item *
59
60 at end, optional C<format>s rules are resolved. Format rules are similar to
61 C<sprintf> and can also contain C<lookup{...}> which is performed after
62 values are inserted in format.
63
64 =back
65
66 This also describes order in which transformations are applied (eval,
67 filter, lookup, format) which is important to undestand when deciding how to
68 solve your data mungling and normalisation process.
69
70
71
72
73 =head1 FUNCTIONS
74
75 =head2 new
76
77 Create new normalisation object
78
79 my $n = new WebPAC::Normalize::Something(
80 filter => {
81 'filter_name_1' => sub {
82 # filter code
83 return length($_);
84 }, ...
85 },
86 db => $db_obj,
87 lookup_regex => $lookup->regex,
88 lookup => $lookup_obj,
89 prefix => 'foobar',
90 );
91
92 Parametar C<filter> defines user supplied snippets of perl code which can
93 be use with C<filter{...}> notation.
94
95 C<prefix> is used to form filename for database record (to support multiple
96 source files which are joined in one database).
97
98 Recommended parametar C<lookup_regex> is used to enable parsing of lookups
99 in structures. If you pass this parametar, you must also pass C<lookup>
100 which is C<WebPAC::Lookup> object.
101
102 =cut
103
104 sub new {
105 my $class = shift;
106 my $self = {@_};
107 bless($self, $class);
108
109 my $r = $self->{'lookup_regex'} ? 1 : 0;
110 my $l = $self->{'lookup'} ? 1 : 0;
111
112 my $log = $self->_get_logger();
113
114 # those two must be in pair
115 if ( ($r & $l) != ($r || $l) ) {
116 my $log = $self->_get_logger();
117 $log->logdie("lookup_regex and lookup must be in pair");
118 }
119
120 $log->logdie("lookup must be WebPAC::Lookup object") if ($self->{'lookup'} && ! $self->{'lookup'}->isa('WebPAC::Lookup'));
121
122 $log->warn("no prefix defined. please check that!") unless ($self->{'prefix'});
123
124 $log->debug("using lookup regex: ", $self->{lookup_regex}) if ($r && $l);
125
126 if (! $self->{filter} || ! $self->{filter}->{regex}) {
127 $log->debug("adding built-in filter regex");
128 $self->{filter}->{regex} = sub {
129 my ($val, $regex) = @_;
130 eval "\$val =~ $regex";
131 return $val;
132 };
133 }
134
135 $self ? return $self : return undef;
136 }
137
138
139 =head2 data_structure
140
141 Create in-memory data structure which represents normalized layout from
142 C<conf/normalize/*.xml>.
143
144 This structures are used to produce output.
145
146 my $ds = $webpac->data_structure($rec);
147
148 =cut
149
150 sub data_structure {
151 my $self = shift;
152
153 my $log = $self->_get_logger();
154
155 my $rec = shift;
156 $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
157
158 $log->debug("data_structure rec = ", sub { Dumper($rec) });
159
160 $log->logdie("need unique ID (mfn) in field 000 of record " . Dumper($rec) ) unless (defined($rec->{'000'}));
161
162 my $id = $rec->{'000'}->[0] || $log->logdie("field 000 isn't array!");
163
164 my $cache_file;
165
166 if ($self->{'db'}) {
167 my $ds = $self->{'db'}->load_ds( id => $id, prefix => $self->{prefix} );
168 $log->debug("load_ds( rec = ", sub { Dumper($rec) }, ") = ", sub { Dumper($ds) });
169 return $ds if ($ds);
170 $log->debug("cache miss, creating");
171 }
172
173 my @sorted_tags;
174 if ($self->{tags_by_order}) {
175 @sorted_tags = @{$self->{tags_by_order}};
176 } else {
177 @sorted_tags = sort { $self->_sort_by_order } keys %{$self->{'import_xml'}->{'indexer'}};
178 $self->{tags_by_order} = \@sorted_tags;
179 }
180
181 my $ds;
182
183 $log->debug("tags: ",sub { join(", ",@sorted_tags) });
184
185 foreach my $field (@sorted_tags) {
186
187 my $row;
188
189 #print "field $field [",$self->{'tag'},"] = ",Dumper($self->{'import_xml'}->{'indexer'}->{$field}->{$self->{'tag'}});
190
191 foreach my $tag (@{$self->{'import_xml'}->{'indexer'}->{$field}->{$self->{'tag'}}}) {
192 my $format;
193
194 $log->logdie("expected tag HASH and got $tag") unless (ref($tag) eq 'HASH');
195 $format = $tag->{'value'} || $tag->{'content'};
196
197 $log->debug("format: $format");
198
199 my @v;
200 if ($self->{'lookup_regex'} && $format =~ $self->{'lookup_regex'}) {
201 @v = $self->fill_in_to_arr($rec,$format);
202 } else {
203 @v = $self->parse_to_arr($rec,$format);
204 }
205 next if (! @v);
206
207 if ($tag->{'sort'}) {
208 @v = $self->sort_arr(@v);
209 }
210
211 # use format?
212 if ($tag->{'format_name'}) {
213 @v = map { $self->apply_format($tag->{'format_name'},$tag->{'format_delimiter'},$_) } @v;
214 }
215
216 # delimiter will join repeatable fields
217 if ($tag->{'delimiter'}) {
218 @v = ( join($tag->{'delimiter'}, @v) );
219 }
220
221 # default types
222 my @types = qw(display search);
223 # override by type attribute
224 @types = ( $tag->{'type'} ) if ($tag->{'type'});
225
226 foreach my $type (@types) {
227 # append to previous line?
228 $log->debug("type: $type ",sub { join(" ",@v) }, " ", $row->{'append'} || 'no append');
229 if ($tag->{'append'}) {
230
231 # I will delimit appended part with
232 # delimiter (or ,)
233 my $d = $tag->{'delimiter'};
234 # default delimiter
235 $d ||= " ";
236
237 my $last = pop @{$row->{$type}};
238 $d = "" if (! $last);
239 $last .= $d . join($d, @v);
240 push @{$row->{$type}}, $last;
241
242 } else {
243 push @{$row->{$type}}, @v;
244 }
245 }
246
247
248 }
249
250 if ($row) {
251 $row->{'tag'} = $field;
252
253 # TODO: name_sigular, name_plural
254 my $name = $self->{'import_xml'}->{'indexer'}->{$field}->{'name'};
255 my $row_name = $name ? $self->_x($name) : $field;
256
257 # post-sort all values in field
258 if ($self->{'import_xml'}->{'indexer'}->{$field}->{'sort'}) {
259 $log->warn("sort at field tag not implemented");
260 }
261
262 $ds->{$row_name} = $row;
263
264 $log->debug("row $field: ",sub { Dumper($row) });
265 }
266
267 }
268
269 $self->{'db'}->save_ds(
270 id => $id,
271 ds => $ds,
272 prefix => $self->{prefix},
273 ) if ($self->{'db'});
274
275 $log->debug("ds: ", sub { Dumper($ds) });
276
277 $log->logconfess("data structure returned is not array any more!") if wantarray;
278
279 return $ds;
280
281 }
282
283 =head2 parse
284
285 Perform smart parsing of string, skipping delimiters for fields which aren't
286 defined. It can also eval code in format starting with C<eval{...}> and
287 return output or nothing depending on eval code.
288
289 my $text = $webpac->parse($rec,'eval{"v901^a" eq "Deskriptor"}descriptor: v250^a', $i);
290
291 Filters are implemented here. While simple form of filters looks like this:
292
293 filter{name_of_filter}
294
295 but, filters can also have variable number of parametars like this:
296
297 filter{name_of_filter(param,param,param)}
298
299 =cut
300
301 my $warn_once;
302
303 sub parse {
304 my $self = shift;
305
306 my ($rec, $format_utf8, $i) = @_;
307
308 return if (! $format_utf8);
309
310 my $log = $self->_get_logger();
311
312 $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
313
314 $i = 0 if (! $i);
315
316 my $format = $self->_x($format_utf8) || $log->logconfess("can't convert '$format_utf8' from UTF-8 to ",$self->{'code_page'});
317
318 my @out;
319
320 $log->debug("format: $format");
321
322 my $eval_code;
323 # remove eval{...} from beginning
324 $eval_code = $1 if ($format =~ s/^eval{([^}]+)}//s);
325
326 my $filter_name;
327 # remove filter{...} from beginning
328 $filter_name = $1 if ($format =~ s/^filter{([^}]+)}//s);
329
330 # did we found any (att all) field from format in row?
331 my $found_any = 0;
332 # prefix before first field which we preserve it $found_any
333 my $prefix;
334
335 my $f_step = 1;
336
337 while ($format =~ s/^(.*?)(v|s)(\d+)(?:\^(\w))?//s) {
338
339 my $del = $1 || '';
340 $prefix = $del if ($f_step == 1);
341
342 # repeatable index
343 my $r = $i;
344 $r = 0 if (lc("$2") eq 's');
345
346 my $found = 0;
347 my $tmp = $self->get_data(\$rec,$3,$4,$r,\$found);
348
349 if ($found) {
350 $found_any += $found;
351
352 # we will skip delimiter before first occurence of field!
353 push @out, $del unless($found_any == 1);
354 push @out, $tmp;
355 }
356 $f_step++;
357 }
358
359 return if (! $found_any);
360
361 my $out = join('',@out);
362
363 if ($out) {
364 # add rest of format (suffix)
365 $out .= $format;
366
367 # add prefix if not there
368 $out = $prefix . $out if ($out !~ m/^\Q$prefix\E/);
369
370 $log->debug("result: $out");
371 }
372
373 if ($eval_code) {
374 my $eval = $self->fill_in($rec,$eval_code,$i) || return;
375 $log->debug("about to eval{$eval} format: $out");
376 return if (! $self->_eval($eval));
377 }
378
379 if ($filter_name) {
380 my @filter_args;
381 if ($filter_name =~ s/(\w+)\((.*)\)/$1/) {
382 @filter_args = split(/,/, $2);
383 }
384 if ($self->{'filter'}->{$filter_name}) {
385 $log->debug("about to filter{$filter_name} format: $out with arguments: ", join(",", @filter_args));
386 unshift @filter_args, $out;
387 $out = $self->{'filter'}->{$filter_name}->(@filter_args);
388 return unless(defined($out));
389 $log->debug("filter result: $out");
390 } elsif (! $warn_once->{$filter_name}) {
391 $log->warn("trying to use undefined filter $filter_name");
392 $warn_once->{$filter_name}++;
393 }
394 }
395
396 return $out;
397 }
398
399 =head2 parse_to_arr
400
401 Similar to C<parse>, but returns array of all repeatable fields
402
403 my @arr = $webpac->parse_to_arr($rec,'v250^a');
404
405 =cut
406
407 sub parse_to_arr {
408 my $self = shift;
409
410 my ($rec, $format_utf8) = @_;
411
412 my $log = $self->_get_logger();
413
414 $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
415 return if (! $format_utf8);
416
417 my $i = 0;
418 my @arr;
419
420 while (my $v = $self->parse($rec,$format_utf8,$i++)) {
421 push @arr, $v;
422 }
423
424 $log->debug("format '$format_utf8' returned ",--$i," elements: ", sub { join(" | ",@arr) }) if (@arr);
425
426 return @arr;
427 }
428
429
430 =head2 fill_in
431
432 Workhourse of all: takes record from in-memory structure of database and
433 strings with placeholders and returns string or array of with substituted
434 values from record.
435
436 my $text = $webpac->fill_in($rec,'v250^a');
437
438 Optional argument is ordinal number for repeatable fields. By default,
439 it's assume to be first repeatable field (fields are perl array, so first
440 element is 0).
441 Following example will read second value from repeatable field.
442
443 my $text = $webpac->fill_in($rec,'Title: v250^a',1);
444
445 This function B<does not> perform parsing of format to inteligenty skip
446 delimiters before fields which aren't used.
447
448 This method will automatically decode UTF-8 string to local code page
449 if needed.
450
451 =cut
452
453 sub fill_in {
454 my $self = shift;
455
456 my $log = $self->_get_logger();
457
458 my $rec = shift || $log->logconfess("need data record");
459 my $format = shift || $log->logconfess("need format to parse");
460 # iteration (for repeatable fields)
461 my $i = shift || 0;
462
463 $log->logdie("infitite loop in format $format") if ($i > ($self->{'max_mfn'} || 9999));
464
465 # FIXME remove for speedup?
466 $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
467
468 if (utf8::is_utf8($format)) {
469 $format = $self->_x($format);
470 }
471
472 my $found = 0;
473 my $just_single = 1;
474
475 my $eval_code;
476 # remove eval{...} from beginning
477 $eval_code = $1 if ($format =~ s/^eval{([^}]+)}//s);
478
479 my $filter_name;
480 # remove filter{...} from beginning
481 $filter_name = $1 if ($format =~ s/^filter{([^}]+)}//s);
482
483 # do actual replacement of placeholders
484 # repeatable fields
485 if ($format =~ s/v(\d+)(?:\^(\w))?/$self->get_data(\$rec,$1,$2,$i,\$found)/ges) {
486 $just_single = 0;
487 }
488
489 # non-repeatable fields
490 if ($format =~ s/s(\d+)(?:\^(\w))?/$self->get_data(\$rec,$1,$2,0,\$found)/ges) {
491 return if ($i > 0 && $just_single);
492 }
493
494 if ($found) {
495 $log->debug("format: $format");
496 if ($eval_code) {
497 my $eval = $self->fill_in($rec,$eval_code,$i);
498 return if (! $self->_eval($eval));
499 }
500 if ($filter_name && $self->{'filter'}->{$filter_name}) {
501 $log->debug("filter '$filter_name' for $format");
502 $format = $self->{'filter'}->{$filter_name}->($format);
503 return unless(defined($format));
504 $log->debug("filter result: $format");
505 }
506 # do we have lookups?
507 if ($self->{'lookup'}) {
508 if ($self->{'lookup'}->can('lookup')) {
509 my @lookup = $self->{lookup}->lookup($format);
510 $log->debug("lookup $format", join(", ", @lookup));
511 return @lookup;
512 } else {
513 $log->warn("Have lookup object but can't invoke lookup method");
514 }
515 } else {
516 return $format;
517 }
518 } else {
519 return;
520 }
521 }
522
523
524 =head2 fill_in_to_arr
525
526 Similar to C<fill_in>, but returns array of all repeatable fields. Usable
527 for fields which have lookups, so they shouldn't be parsed but rather
528 C<fill_id>ed.
529
530 my @arr = $webpac->fill_in_to_arr($rec,'[v900];;[v250^a]');
531
532 =cut
533
534 sub fill_in_to_arr {
535 my $self = shift;
536
537 my ($rec, $format_utf8) = @_;
538
539 my $log = $self->_get_logger();
540
541 $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
542 return if (! $format_utf8);
543
544 my $i = 0;
545 my @arr;
546
547 while (my @v = $self->fill_in($rec,$format_utf8,$i++)) {
548 push @arr, @v;
549 }
550
551 $log->debug("format '$format_utf8' returned ",--$i," elements: ", sub { join(" | ",@arr) }) if (@arr);
552
553 return @arr;
554 }
555
556
557 =head2 get_data
558
559 Returns value from record.
560
561 my $text = $self->get_data(\$rec,$f,$sf,$i,\$found);
562
563 Arguments are:
564 record reference C<$rec>,
565 field C<$f>,
566 optional subfiled C<$sf>,
567 index for repeatable values C<$i>.
568
569 Optinal variable C<$found> will be incremeted if there
570 is field.
571
572 Returns value or empty string.
573
574 =cut
575
576 sub get_data {
577 my $self = shift;
578
579 my ($rec,$f,$sf,$i,$found) = @_;
580
581 if ($$rec->{$f}) {
582 return '' if (! $$rec->{$f}->[$i]);
583 no strict 'refs';
584 if ($sf && $$rec->{$f}->[$i]->{$sf}) {
585 $$found++ if (defined($$found));
586 return $$rec->{$f}->[$i]->{$sf};
587 } elsif (! $sf && $$rec->{$f}->[$i]) {
588 $$found++ if (defined($$found));
589 # it still might have subfield, just
590 # not specified, so we'll dump all
591 if ($$rec->{$f}->[$i] =~ /HASH/o) {
592 my $out;
593 foreach my $k (keys %{$$rec->{$f}->[$i]}) {
594 $out .= $$rec->{$f}->[$i]->{$k}." ";
595 }
596 return $out;
597 } else {
598 return $$rec->{$f}->[$i];
599 }
600 } else {
601 return '';
602 }
603 } else {
604 return '';
605 }
606 }
607
608
609 =head2 apply_format
610
611 Apply format specified in tag with C<format_name="name"> and
612 C<format_delimiter=";;">.
613
614 my $text = $webpac->apply_format($format_name,$format_delimiter,$data);
615
616 Formats can contain C<lookup{...}> if you need them.
617
618 =cut
619
620 sub apply_format {
621 my $self = shift;
622
623 my ($name,$delimiter,$data) = @_;
624
625 my $log = $self->_get_logger();
626
627 if (! $self->{'import_xml'}->{'format'}->{$name}) {
628 $log->warn("<format name=\"$name\"> is not defined in ",$self->{'import_xml_file'});
629 return $data;
630 }
631
632 $log->warn("no delimiter for format $name") if (! $delimiter);
633
634 my $format = $self->_x($self->{'import_xml'}->{'format'}->{$name}->{'content'}) || $log->logdie("can't find format '$name'");
635
636 my @data = split(/\Q$delimiter\E/, $data);
637
638 my $out = sprintf($format, @data);
639 $log->debug("using format $name [$format] on $data to produce: $out");
640
641 if ($self->{'lookup_regex'} && $out =~ $self->{'lookup_regex'}) {
642 return $self->{'lookup'}->lookup($out);
643 } else {
644 return $out;
645 }
646
647 }
648
649 =head2 sort_arr
650
651 Sort array ignoring case and html in data
652
653 my @sorted = $webpac->sort_arr(@unsorted);
654
655 =cut
656
657 sub sort_arr {
658 my $self = shift;
659
660 my $log = $self->_get_logger();
661
662 # FIXME add Schwartzian Transformation?
663
664 my @sorted = sort {
665 $a =~ s#<[^>]+/*>##;
666 $b =~ s#<[^>]+/*>##;
667 lc($b) cmp lc($a)
668 } @_;
669 $log->debug("sorted values: ",sub { join(", ",@sorted) });
670
671 return @sorted;
672 }
673
674
675 =head1 INTERNAL METHODS
676
677 =head2 _sort_by_order
678
679 Sort xml tags data structure accoding to C<order=""> attribute.
680
681 =cut
682
683 sub _sort_by_order {
684 my $self = shift;
685
686 my $va = $self->{'import_xml'}->{'indexer'}->{$a}->{'order'} ||
687 $self->{'import_xml'}->{'indexer'}->{$a};
688 my $vb = $self->{'import_xml'}->{'indexer'}->{$b}->{'order'} ||
689 $self->{'import_xml'}->{'indexer'}->{$b};
690
691 return $va <=> $vb;
692 }
693
694 =head2 _x
695
696 Convert strings from C<conf/normalize/*.xml> encoding into application
697 specific encoding (optinally specified using C<code_page> to C<new>
698 constructor).
699
700 my $text = $n->_x('normalize text string');
701
702 This is a stub so that other modules doesn't have to implement it.
703
704 =cut
705
706 sub _x {
707 my $self = shift;
708 return shift;
709 }
710
711
712 =head1 AUTHOR
713
714 Dobrica Pavlinusic, C<< <dpavlin@rot13.org> >>
715
716 =head1 COPYRIGHT & LICENSE
717
718 Copyright 2005 Dobrica Pavlinusic, All Rights Reserved.
719
720 This program is free software; you can redistribute it and/or modify it
721 under the same terms as Perl itself.
722
723 =cut
724
725 1; # End of WebPAC::Normalize

  ViewVC Help
Powered by ViewVC 1.1.26