/[webpac2]/trunk/lib/WebPAC/Normalize.pm
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Contents of /trunk/lib/WebPAC/Normalize.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 295 - (show annotations)
Mon Dec 19 15:34:47 2005 UTC (18 years, 3 months ago) by dpavlin
File size: 16454 byte(s)
 r11795@llin:  dpavlin | 2005-12-19 16:35:30 +0100
 fix regex filter, moved development version to real one [2.07]

1 package WebPAC::Normalize;
2
3 use warnings;
4 use strict;
5 use base 'WebPAC::Common';
6 use Data::Dumper;
7
8 =head1 NAME
9
10 WebPAC::Normalize - data mungling for normalisation
11
12 =head1 VERSION
13
14 Version 0.07
15
16 =cut
17
18 our $VERSION = '0.07';
19
20 =head1 SYNOPSIS
21
22 This package contains code that mungle data to produce normalized format.
23
24 It contains several assumptions:
25
26 =over
27
28 =item *
29
30 format of fields is defined using C<v123^a> notation for repeatable fields
31 or C<s123^a> for single (or first) value, where C<123> is field number and
32 C<a> is subfield.
33
34 =item *
35
36 source data records (C<$rec>) have unique identifiers in field C<000>
37
38 =item *
39
40 optional C<eval{length('v123^a') == 3}> tag at B<beginning of format> will be
41 perl code that is evaluated before producing output (value of field will be
42 interpolated before that)
43
44 =item *
45
46 optional C<filter{filter_name}> at B<begining of format> will apply perl
47 code defined as code ref on format after field substitution to producing
48 output
49
50 There is one built-in filter called C<regex> which can be use like this:
51
52 filter{regex(s/foo/bar/)}
53
54 =item *
55
56 optional C<lookup{...}> will be then performed. See C<WebPAC::Lookups>.
57
58 =item *
59
60 at end, optional C<format>s rules are resolved. Format rules are similar to
61 C<sprintf> and can also contain C<lookup{...}> which is performed after
62 values are inserted in format.
63
64 =back
65
66 This also describes order in which transformations are applied (eval,
67 filter, lookup, format) which is important to undestand when deciding how to
68 solve your data mungling and normalisation process.
69
70
71
72
73 =head1 FUNCTIONS
74
75 =head2 new
76
77 Create new normalisation object
78
79 my $n = new WebPAC::Normalize::Something(
80 filter => {
81 'filter_name_1' => sub {
82 # filter code
83 return length($_);
84 }, ...
85 },
86 db => $db_obj,
87 lookup_regex => $lookup->regex,
88 lookup => $lookup_obj,
89 prefix => 'foobar',
90 );
91
92 Parametar C<filter> defines user supplied snippets of perl code which can
93 be use with C<filter{...}> notation.
94
95 C<prefix> is used to form filename for database record (to support multiple
96 source files which are joined in one database).
97
98 Recommended parametar C<lookup_regex> is used to enable parsing of lookups
99 in structures. If you pass this parametar, you must also pass C<lookup>
100 which is C<WebPAC::Lookup> object.
101
102 =cut
103
104 sub new {
105 my $class = shift;
106 my $self = {@_};
107 bless($self, $class);
108
109 my $r = $self->{'lookup_regex'} ? 1 : 0;
110 my $l = $self->{'lookup'} ? 1 : 0;
111
112 my $log = $self->_get_logger();
113
114 # those two must be in pair
115 if ( ($r & $l) != ($r || $l) ) {
116 my $log = $self->_get_logger();
117 $log->logdie("lookup_regex and lookup must be in pair");
118 }
119
120 $log->logdie("lookup must be WebPAC::Lookup object") if ($self->{'lookup'} && ! $self->{'lookup'}->isa('WebPAC::Lookup'));
121
122 $log->warn("no prefix defined. please check that!") unless ($self->{'prefix'});
123
124 $log->debug("using lookup regex: ", $self->{lookup_regex}) if ($r && $l);
125
126 if (! $self->{filter} || ! $self->{filter}->{regex}) {
127 $log->debug("adding built-in filter regex");
128 $self->{filter}->{regex} = sub {
129 my ($val, $regex) = @_;
130 eval "\$val =~ $regex";
131 return $val;
132 };
133 }
134
135 $self ? return $self : return undef;
136 }
137
138
139 =head2 data_structure
140
141 Create in-memory data structure which represents normalized layout from
142 C<conf/normalize/*.xml>.
143
144 This structures are used to produce output.
145
146 my $ds = $webpac->data_structure($rec);
147
148 =cut
149
150 sub data_structure {
151 my $self = shift;
152
153 my $log = $self->_get_logger();
154
155 my $rec = shift;
156 $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
157
158 $log->debug("data_structure rec = ", sub { Dumper($rec) });
159
160 $log->logdie("need unique ID (mfn) in field 000 of record ", sub { Dumper($rec) } ) unless (defined($rec->{'000'}));
161
162 my $id = $rec->{'000'}->[0] || $log->logdie("field 000 isn't array!");
163
164 my $cache_file;
165
166 if ($self->{'db'}) {
167 my $ds = $self->{'db'}->load_ds( id => $id, prefix => $self->{prefix} );
168 $log->debug("load_ds( rec = ", sub { Dumper($rec) }, ") = ", sub { Dumper($ds) });
169 return $ds if ($ds);
170 $log->debug("cache miss, creating");
171 }
172
173 my @sorted_tags;
174 if ($self->{tags_by_order}) {
175 @sorted_tags = @{$self->{tags_by_order}};
176 } else {
177 @sorted_tags = sort { $self->_sort_by_order } keys %{$self->{'import_xml'}->{'indexer'}};
178 $self->{tags_by_order} = \@sorted_tags;
179 }
180
181 my $ds;
182
183 $log->debug("tags: ",sub { join(", ",@sorted_tags) });
184
185 foreach my $field (@sorted_tags) {
186
187 my $row;
188
189 #print "field $field [",$self->{'tag'},"] = ",Dumper($self->{'import_xml'}->{'indexer'}->{$field}->{$self->{'tag'}});
190
191 foreach my $tag (@{$self->{'import_xml'}->{'indexer'}->{$field}->{$self->{'tag'}}}) {
192 my $format;
193
194 $log->logdie("expected tag HASH and got $tag") unless (ref($tag) eq 'HASH');
195 $format = $tag->{'value'} || $tag->{'content'};
196
197 $log->debug("format: $format");
198
199 my @v;
200 if ($self->{'lookup_regex'} && $format =~ $self->{'lookup_regex'}) {
201 @v = $self->fill_in_to_arr($rec,$format);
202 } else {
203 @v = $self->parse_to_arr($rec,$format);
204 }
205 next if (! @v);
206
207 if ($tag->{'sort'}) {
208 @v = $self->sort_arr(@v);
209 }
210
211 # use format?
212 if ($tag->{'format_name'}) {
213 @v = map { $self->apply_format($tag->{'format_name'},$tag->{'format_delimiter'},$_) } @v;
214 }
215
216 # delimiter will join repeatable fields
217 if ($tag->{'delimiter'}) {
218 @v = ( join($tag->{'delimiter'}, @v) );
219 }
220
221 # default types
222 my @types = qw(display search);
223 # override by type attribute
224 @types = ( $tag->{'type'} ) if ($tag->{'type'});
225
226 foreach my $type (@types) {
227 # append to previous line?
228 $log->debug("type: $type ",sub { join(" ",@v) }, " ", $row->{'append'} || 'no append');
229 if ($tag->{'append'}) {
230
231 # I will delimit appended part with
232 # delimiter (or ,)
233 my $d = $tag->{'delimiter'};
234 # default delimiter
235 $d ||= " ";
236
237 my $last = pop @{$row->{$type}};
238 $d = "" if (! $last);
239 $last .= $d . join($d, @v);
240 push @{$row->{$type}}, $last;
241
242 } else {
243 push @{$row->{$type}}, @v;
244 }
245 }
246
247
248 }
249
250 if ($row) {
251 $row->{'tag'} = $field;
252
253 # TODO: name_sigular, name_plural
254 my $name = $self->{'import_xml'}->{'indexer'}->{$field}->{'name'};
255 my $row_name = $name ? $self->_x($name) : $field;
256
257 # post-sort all values in field
258 if ($self->{'import_xml'}->{'indexer'}->{$field}->{'sort'}) {
259 $log->warn("sort at field tag not implemented");
260 }
261
262 $ds->{$row_name} = $row;
263
264 $log->debug("row $field: ",sub { Dumper($row) });
265 }
266
267 }
268
269 $self->{'db'}->save_ds(
270 id => $id,
271 ds => $ds,
272 prefix => $self->{prefix},
273 ) if ($self->{'db'});
274
275 $log->debug("ds: ", sub { Dumper($ds) });
276
277 $log->logconfess("data structure returned is not array any more!") if wantarray;
278
279 return $ds;
280
281 }
282
283 =head2 parse
284
285 Perform smart parsing of string, skipping delimiters for fields which aren't
286 defined. It can also eval code in format starting with C<eval{...}> and
287 return output or nothing depending on eval code.
288
289 my $text = $webpac->parse($rec,'eval{"v901^a" eq "Deskriptor"}descriptor: v250^a', $i);
290
291 Filters are implemented here. While simple form of filters looks like this:
292
293 filter{name_of_filter}
294
295 but, filters can also have variable number of parametars like this:
296
297 filter{name_of_filter(param,param,param)}
298
299 =cut
300
301 my $warn_once;
302
303 sub parse {
304 my $self = shift;
305
306 my ($rec, $format_utf8, $i) = @_;
307
308 return if (! $format_utf8);
309
310 my $log = $self->_get_logger();
311
312 $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
313
314 $i = 0 if (! $i);
315
316 my $format = $self->_x($format_utf8) || $log->logconfess("can't convert '$format_utf8' from UTF-8 to ",$self->{'code_page'});
317
318 my @out;
319
320 $log->debug("format: $format");
321
322 my $eval_code;
323 # remove eval{...} from beginning
324 $eval_code = $1 if ($format =~ s/^eval{([^}]+)}//s);
325
326 my $filter_name;
327 # remove filter{...} from beginning
328 $filter_name = $1 if ($format =~ s/^filter{([^}]+)}//s);
329
330 my $prefix;
331 my $all_found=0;
332
333 while ($format =~ s/^(.*?)(v|s)(\d+)(?:\^(\w))?//s) {
334
335 my $del = $1 || '';
336 $prefix ||= $del if ($all_found == 0);
337
338 # repeatable index
339 my $r = $i;
340 $r = 0 if (lc("$2") eq 's');
341
342 my $found = 0;
343 my $tmp = $self->get_data(\$rec,$3,$4,$r,\$found);
344
345 if ($found) {
346 push @out, $del;
347 push @out, $tmp;
348 $all_found += $found;
349 }
350 }
351
352 return if (! $all_found);
353
354 my $out = join('',@out);
355
356 if ($out) {
357 # add rest of format (suffix)
358 $out .= $format;
359
360 # add prefix if not there
361 $out = $prefix . $out if ($out !~ m/^\Q$prefix\E/);
362
363 $log->debug("result: $out");
364 }
365
366 if ($eval_code) {
367 my $eval = $self->fill_in($rec,$eval_code,$i) || return;
368 $log->debug("about to eval{$eval} format: $out");
369 return if (! $self->_eval($eval));
370 }
371
372 if ($filter_name) {
373 my @filter_args;
374 if ($filter_name =~ s/(\w+)\((.*)\)/$1/) {
375 @filter_args = split(/,/, $2);
376 }
377 if ($self->{'filter'}->{$filter_name}) {
378 $log->debug("about to filter{$filter_name} format: $out with arguments: ", join(",", @filter_args));
379 unshift @filter_args, $out;
380 $out = $self->{'filter'}->{$filter_name}->(@filter_args);
381 return unless(defined($out));
382 $log->debug("filter result: $out");
383 } elsif (! $warn_once->{$filter_name}) {
384 $log->warn("trying to use undefined filter $filter_name");
385 $warn_once->{$filter_name}++;
386 }
387 }
388
389 return $out;
390 }
391
392 =head2 parse_to_arr
393
394 Similar to C<parse>, but returns array of all repeatable fields
395
396 my @arr = $webpac->parse_to_arr($rec,'v250^a');
397
398 =cut
399
400 sub parse_to_arr {
401 my $self = shift;
402
403 my ($rec, $format_utf8) = @_;
404
405 my $log = $self->_get_logger();
406
407 $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
408 return if (! $format_utf8);
409
410 my $i = 0;
411 my @arr;
412
413 while (my $v = $self->parse($rec,$format_utf8,$i++)) {
414 push @arr, $v;
415 }
416
417 $log->debug("format '$format_utf8' returned ",--$i," elements: ", sub { join(" | ",@arr) }) if (@arr);
418
419 return @arr;
420 }
421
422
423 =head2 fill_in
424
425 Workhourse of all: takes record from in-memory structure of database and
426 strings with placeholders and returns string or array of with substituted
427 values from record.
428
429 my $text = $webpac->fill_in($rec,'v250^a');
430
431 Optional argument is ordinal number for repeatable fields. By default,
432 it's assume to be first repeatable field (fields are perl array, so first
433 element is 0).
434 Following example will read second value from repeatable field.
435
436 my $text = $webpac->fill_in($rec,'Title: v250^a',1);
437
438 This function B<does not> perform parsing of format to inteligenty skip
439 delimiters before fields which aren't used.
440
441 This method will automatically decode UTF-8 string to local code page
442 if needed.
443
444 =cut
445
446 sub fill_in {
447 my $self = shift;
448
449 my $log = $self->_get_logger();
450
451 my $rec = shift || $log->logconfess("need data record");
452 my $format = shift || $log->logconfess("need format to parse");
453 # iteration (for repeatable fields)
454 my $i = shift || 0;
455
456 $log->logdie("infitite loop in format $format") if ($i > ($self->{'max_mfn'} || 9999));
457
458 # FIXME remove for speedup?
459 $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
460
461 if (utf8::is_utf8($format)) {
462 $format = $self->_x($format);
463 }
464
465 my $found = 0;
466
467 my $eval_code;
468 # remove eval{...} from beginning
469 $eval_code = $1 if ($format =~ s/^eval{([^}]+)}//s);
470
471 my $filter_name;
472 # remove filter{...} from beginning
473 $filter_name = $1 if ($format =~ s/^filter{([^}]+)}//s);
474
475 # do actual replacement of placeholders
476 # repeatable fields
477 $format =~ s/v(\d+)(?:\^(\w))?/$self->get_data(\$rec,$1,$2,$i,\$found)/ges;
478 # non-repeatable fields
479 $format =~ s/s(\d+)(?:\^(\w))?/$self->get_data(\$rec,$1,$2,0,\$found)/ges;
480
481 if ($found) {
482 $log->debug("format: $format");
483 if ($eval_code) {
484 my $eval = $self->fill_in($rec,$eval_code,$i);
485 return if (! $self->_eval($eval));
486 }
487 if ($filter_name && $self->{'filter'}->{$filter_name}) {
488 $log->debug("filter '$filter_name' for $format");
489 $format = $self->{'filter'}->{$filter_name}->($format);
490 return unless(defined($format));
491 $log->debug("filter result: $format");
492 }
493 # do we have lookups?
494 if ($self->{'lookup'}) {
495 if ($self->{'lookup'}->can('lookup')) {
496 my @lookup = $self->{lookup}->lookup($format);
497 $log->debug("lookup $format", join(", ", @lookup));
498 return @lookup;
499 } else {
500 $log->warn("Have lookup object but can't invoke lookup method");
501 }
502 } else {
503 return $format;
504 }
505 } else {
506 return;
507 }
508 }
509
510
511 =head2 fill_in_to_arr
512
513 Similar to C<fill_in>, but returns array of all repeatable fields. Usable
514 for fields which have lookups, so they shouldn't be parsed but rather
515 C<fill_id>ed.
516
517 my @arr = $webpac->fill_in_to_arr($rec,'[v900];;[v250^a]');
518
519 =cut
520
521 sub fill_in_to_arr {
522 my $self = shift;
523
524 my ($rec, $format_utf8) = @_;
525
526 my $log = $self->_get_logger();
527
528 $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
529 return if (! $format_utf8);
530
531 my $i = 0;
532 my @arr;
533
534 while (my @v = $self->fill_in($rec,$format_utf8,$i++)) {
535 push @arr, @v;
536 }
537
538 $log->debug("format '$format_utf8' returned ",--$i," elements: ", sub { join(" | ",@arr) }) if (@arr);
539
540 return @arr;
541 }
542
543
544 =head2 get_data
545
546 Returns value from record.
547
548 my $text = $self->get_data(\$rec,$f,$sf,$i,\$found);
549
550 Arguments are:
551 record reference C<$rec>,
552 field C<$f>,
553 optional subfiled C<$sf>,
554 index for repeatable values C<$i>.
555
556 Optinal variable C<$found> will be incremeted if there
557 is field.
558
559 Returns value or empty string.
560
561 =cut
562
563 sub get_data {
564 my $self = shift;
565
566 my ($rec,$f,$sf,$i,$found) = @_;
567
568 if ($$rec->{$f}) {
569 return '' if (! $$rec->{$f}->[$i]);
570 no strict 'refs';
571 if ($sf && $$rec->{$f}->[$i]->{$sf}) {
572 $$found++ if (defined($$found));
573 return $$rec->{$f}->[$i]->{$sf};
574 } elsif (! $sf && $$rec->{$f}->[$i]) {
575 $$found++ if (defined($$found));
576 # it still might have subfield, just
577 # not specified, so we'll dump all
578 if ($$rec->{$f}->[$i] =~ /HASH/o) {
579 my $out;
580 foreach my $k (keys %{$$rec->{$f}->[$i]}) {
581 $out .= $$rec->{$f}->[$i]->{$k}." ";
582 }
583 return $out;
584 } else {
585 return $$rec->{$f}->[$i];
586 }
587 } else {
588 return '';
589 }
590 } else {
591 return '';
592 }
593 }
594
595
596 =head2 apply_format
597
598 Apply format specified in tag with C<format_name="name"> and
599 C<format_delimiter=";;">.
600
601 my $text = $webpac->apply_format($format_name,$format_delimiter,$data);
602
603 Formats can contain C<lookup{...}> if you need them.
604
605 =cut
606
607 sub apply_format {
608 my $self = shift;
609
610 my ($name,$delimiter,$data) = @_;
611
612 my $log = $self->_get_logger();
613
614 if (! $self->{'import_xml'}->{'format'}->{$name}) {
615 $log->warn("<format name=\"$name\"> is not defined in ",$self->{'import_xml_file'});
616 return $data;
617 }
618
619 $log->warn("no delimiter for format $name") if (! $delimiter);
620
621 my $format = $self->_x($self->{'import_xml'}->{'format'}->{$name}->{'content'}) || $log->logdie("can't find format '$name'");
622
623 my @data = split(/\Q$delimiter\E/, $data);
624
625 my $out = sprintf($format, @data);
626 $log->debug("using format $name [$format] on $data to produce: $out");
627
628 if ($self->{'lookup_regex'} && $out =~ $self->{'lookup_regex'}) {
629 return $self->{'lookup'}->lookup($out);
630 } else {
631 return $out;
632 }
633
634 }
635
636 =head2 sort_arr
637
638 Sort array ignoring case and html in data
639
640 my @sorted = $webpac->sort_arr(@unsorted);
641
642 =cut
643
644 sub sort_arr {
645 my $self = shift;
646
647 my $log = $self->_get_logger();
648
649 # FIXME add Schwartzian Transformation?
650
651 my @sorted = sort {
652 $a =~ s#<[^>]+/*>##;
653 $b =~ s#<[^>]+/*>##;
654 lc($b) cmp lc($a)
655 } @_;
656 $log->debug("sorted values: ",sub { join(", ",@sorted) });
657
658 return @sorted;
659 }
660
661
662 =head1 INTERNAL METHODS
663
664 =head2 _sort_by_order
665
666 Sort xml tags data structure accoding to C<order=""> attribute.
667
668 =cut
669
670 sub _sort_by_order {
671 my $self = shift;
672
673 my $va = $self->{'import_xml'}->{'indexer'}->{$a}->{'order'} ||
674 $self->{'import_xml'}->{'indexer'}->{$a};
675 my $vb = $self->{'import_xml'}->{'indexer'}->{$b}->{'order'} ||
676 $self->{'import_xml'}->{'indexer'}->{$b};
677
678 return $va <=> $vb;
679 }
680
681 =head2 _x
682
683 Convert strings from C<conf/normalize/*.xml> encoding into application
684 specific encoding (optinally specified using C<code_page> to C<new>
685 constructor).
686
687 my $text = $n->_x('normalize text string');
688
689 This is a stub so that other modules doesn't have to implement it.
690
691 =cut
692
693 sub _x {
694 my $self = shift;
695 return shift;
696 }
697
698
699 =head1 AUTHOR
700
701 Dobrica Pavlinusic, C<< <dpavlin@rot13.org> >>
702
703 =head1 COPYRIGHT & LICENSE
704
705 Copyright 2005 Dobrica Pavlinusic, All Rights Reserved.
706
707 This program is free software; you can redistribute it and/or modify it
708 under the same terms as Perl itself.
709
710 =cut
711
712 1; # End of WebPAC::Normalize

  ViewVC Help
Powered by ViewVC 1.1.26