/[webpac2]/trunk/lib/WebPAC/Normalize.pm
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Contents of /trunk/lib/WebPAC/Normalize.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 317 - (show annotations)
Fri Dec 23 21:37:05 2005 UTC (18 years, 4 months ago) by dpavlin
File size: 16678 byte(s)
 r12234@llin:  dpavlin | 2005-12-23 23:38:41 +0100
 bug fix to skip delimiter before first occurence of field in format

1 package WebPAC::Normalize;
2
3 use warnings;
4 use strict;
5 use base 'WebPAC::Common';
6 use Data::Dumper;
7
8 =head1 NAME
9
10 WebPAC::Normalize - data mungling for normalisation
11
12 =head1 VERSION
13
14 Version 0.08
15
16 =cut
17
18 our $VERSION = '0.08';
19
20 =head1 SYNOPSIS
21
22 This package contains code that mungle data to produce normalized format.
23
24 It contains several assumptions:
25
26 =over
27
28 =item *
29
30 format of fields is defined using C<v123^a> notation for repeatable fields
31 or C<s123^a> for single (or first) value, where C<123> is field number and
32 C<a> is subfield.
33
34 =item *
35
36 source data records (C<$rec>) have unique identifiers in field C<000>
37
38 =item *
39
40 optional C<eval{length('v123^a') == 3}> tag at B<beginning of format> will be
41 perl code that is evaluated before producing output (value of field will be
42 interpolated before that)
43
44 =item *
45
46 optional C<filter{filter_name}> at B<begining of format> will apply perl
47 code defined as code ref on format after field substitution to producing
48 output
49
50 There is one built-in filter called C<regex> which can be use like this:
51
52 filter{regex(s/foo/bar/)}
53
54 =item *
55
56 optional C<lookup{...}> will be then performed. See C<WebPAC::Lookups>.
57
58 =item *
59
60 at end, optional C<format>s rules are resolved. Format rules are similar to
61 C<sprintf> and can also contain C<lookup{...}> which is performed after
62 values are inserted in format.
63
64 =back
65
66 This also describes order in which transformations are applied (eval,
67 filter, lookup, format) which is important to undestand when deciding how to
68 solve your data mungling and normalisation process.
69
70
71
72
73 =head1 FUNCTIONS
74
75 =head2 new
76
77 Create new normalisation object
78
79 my $n = new WebPAC::Normalize::Something(
80 filter => {
81 'filter_name_1' => sub {
82 # filter code
83 return length($_);
84 }, ...
85 },
86 db => $db_obj,
87 lookup_regex => $lookup->regex,
88 lookup => $lookup_obj,
89 prefix => 'foobar',
90 );
91
92 Parametar C<filter> defines user supplied snippets of perl code which can
93 be use with C<filter{...}> notation.
94
95 C<prefix> is used to form filename for database record (to support multiple
96 source files which are joined in one database).
97
98 Recommended parametar C<lookup_regex> is used to enable parsing of lookups
99 in structures. If you pass this parametar, you must also pass C<lookup>
100 which is C<WebPAC::Lookup> object.
101
102 =cut
103
104 sub new {
105 my $class = shift;
106 my $self = {@_};
107 bless($self, $class);
108
109 my $r = $self->{'lookup_regex'} ? 1 : 0;
110 my $l = $self->{'lookup'} ? 1 : 0;
111
112 my $log = $self->_get_logger();
113
114 # those two must be in pair
115 if ( ($r & $l) != ($r || $l) ) {
116 my $log = $self->_get_logger();
117 $log->logdie("lookup_regex and lookup must be in pair");
118 }
119
120 $log->logdie("lookup must be WebPAC::Lookup object") if ($self->{'lookup'} && ! $self->{'lookup'}->isa('WebPAC::Lookup'));
121
122 $log->warn("no prefix defined. please check that!") unless ($self->{'prefix'});
123
124 $log->debug("using lookup regex: ", $self->{lookup_regex}) if ($r && $l);
125
126 if (! $self->{filter} || ! $self->{filter}->{regex}) {
127 $log->debug("adding built-in filter regex");
128 $self->{filter}->{regex} = sub {
129 my ($val, $regex) = @_;
130 eval "\$val =~ $regex";
131 return $val;
132 };
133 }
134
135 $self ? return $self : return undef;
136 }
137
138
139 =head2 data_structure
140
141 Create in-memory data structure which represents normalized layout from
142 C<conf/normalize/*.xml>.
143
144 This structures are used to produce output.
145
146 my $ds = $webpac->data_structure($rec);
147
148 =cut
149
150 sub data_structure {
151 my $self = shift;
152
153 my $log = $self->_get_logger();
154
155 my $rec = shift;
156 $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
157
158 $log->debug("data_structure rec = ", sub { Dumper($rec) });
159
160 $log->logdie("need unique ID (mfn) in field 000 of record " . Dumper($rec) ) unless (defined($rec->{'000'}));
161
162 my $id = $rec->{'000'}->[0] || $log->logdie("field 000 isn't array!");
163
164 my $cache_file;
165
166 if ($self->{'db'}) {
167 my $ds = $self->{'db'}->load_ds( id => $id, prefix => $self->{prefix} );
168 $log->debug("load_ds( rec = ", sub { Dumper($rec) }, ") = ", sub { Dumper($ds) });
169 return $ds if ($ds);
170 $log->debug("cache miss, creating");
171 }
172
173 my @sorted_tags;
174 if ($self->{tags_by_order}) {
175 @sorted_tags = @{$self->{tags_by_order}};
176 } else {
177 @sorted_tags = sort { $self->_sort_by_order } keys %{$self->{'import_xml'}->{'indexer'}};
178 $self->{tags_by_order} = \@sorted_tags;
179 }
180
181 my $ds;
182
183 $log->debug("tags: ",sub { join(", ",@sorted_tags) });
184
185 foreach my $field (@sorted_tags) {
186
187 my $row;
188
189 #print "field $field [",$self->{'tag'},"] = ",Dumper($self->{'import_xml'}->{'indexer'}->{$field}->{$self->{'tag'}});
190
191 foreach my $tag (@{$self->{'import_xml'}->{'indexer'}->{$field}->{$self->{'tag'}}}) {
192 my $format;
193
194 $log->logdie("expected tag HASH and got $tag") unless (ref($tag) eq 'HASH');
195 $format = $tag->{'value'} || $tag->{'content'};
196
197 $log->debug("format: $format");
198
199 my @v;
200 if ($self->{'lookup_regex'} && $format =~ $self->{'lookup_regex'}) {
201 @v = $self->fill_in_to_arr($rec,$format);
202 } else {
203 @v = $self->parse_to_arr($rec,$format);
204 }
205 next if (! @v);
206
207 if ($tag->{'sort'}) {
208 @v = $self->sort_arr(@v);
209 }
210
211 # use format?
212 if ($tag->{'format_name'}) {
213 @v = map { $self->apply_format($tag->{'format_name'},$tag->{'format_delimiter'},$_) } @v;
214 }
215
216 # delimiter will join repeatable fields
217 if ($tag->{'delimiter'}) {
218 @v = ( join($tag->{'delimiter'}, @v) );
219 }
220
221 # default types
222 my @types = qw(display search);
223 # override by type attribute
224 @types = ( $tag->{'type'} ) if ($tag->{'type'});
225
226 foreach my $type (@types) {
227 # append to previous line?
228 $log->debug("type: $type ",sub { join(" ",@v) }, " ", $row->{'append'} || 'no append');
229 if ($tag->{'append'}) {
230
231 # I will delimit appended part with
232 # delimiter (or ,)
233 my $d = $tag->{'delimiter'};
234 # default delimiter
235 $d ||= " ";
236
237 my $last = pop @{$row->{$type}};
238 $d = "" if (! $last);
239 $last .= $d . join($d, @v);
240 push @{$row->{$type}}, $last;
241
242 } else {
243 push @{$row->{$type}}, @v;
244 }
245 }
246
247
248 }
249
250 if ($row) {
251 $row->{'tag'} = $field;
252
253 # TODO: name_sigular, name_plural
254 my $name = $self->{'import_xml'}->{'indexer'}->{$field}->{'name'};
255 my $row_name = $name ? $self->_x($name) : $field;
256
257 # post-sort all values in field
258 if ($self->{'import_xml'}->{'indexer'}->{$field}->{'sort'}) {
259 $log->warn("sort at field tag not implemented");
260 }
261
262 $ds->{$row_name} = $row;
263
264 $log->debug("row $field: ",sub { Dumper($row) });
265 }
266
267 }
268
269 $self->{'db'}->save_ds(
270 id => $id,
271 ds => $ds,
272 prefix => $self->{prefix},
273 ) if ($self->{'db'});
274
275 $log->debug("ds: ", sub { Dumper($ds) });
276
277 $log->logconfess("data structure returned is not array any more!") if wantarray;
278
279 return $ds;
280
281 }
282
283 =head2 parse
284
285 Perform smart parsing of string, skipping delimiters for fields which aren't
286 defined. It can also eval code in format starting with C<eval{...}> and
287 return output or nothing depending on eval code.
288
289 my $text = $webpac->parse($rec,'eval{"v901^a" eq "Deskriptor"}descriptor: v250^a', $i);
290
291 Filters are implemented here. While simple form of filters looks like this:
292
293 filter{name_of_filter}
294
295 but, filters can also have variable number of parametars like this:
296
297 filter{name_of_filter(param,param,param)}
298
299 =cut
300
301 my $warn_once;
302
303 sub parse {
304 my $self = shift;
305
306 my ($rec, $format_utf8, $i) = @_;
307
308 return if (! $format_utf8);
309
310 my $log = $self->_get_logger();
311
312 $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
313
314 $i = 0 if (! $i);
315
316 my $format = $self->_x($format_utf8) || $log->logconfess("can't convert '$format_utf8' from UTF-8 to ",$self->{'code_page'});
317
318 my @out;
319
320 $log->debug("format: $format");
321
322 my $eval_code;
323 # remove eval{...} from beginning
324 $eval_code = $1 if ($format =~ s/^eval{([^}]+)}//s);
325
326 my $filter_name;
327 # remove filter{...} from beginning
328 $filter_name = $1 if ($format =~ s/^filter{([^}]+)}//s);
329
330 # did we found any (att all) field from format in row?
331 my $found_any = 0;
332 # prefix before first field which we preserve it $found_any
333 my $prefix;
334
335 my $f_step = 1;
336
337 while ($format =~ s/^(.*?)(v|s)(\d+)(?:\^(\w))?//s) {
338
339 my $del = $1 || '';
340 $prefix = $del if ($f_step == 1);
341
342 # repeatable index
343 my $r = $i;
344 $r = 0 if (lc("$2") eq 's');
345
346 my $found = 0;
347 my $tmp = $self->get_data(\$rec,$3,$4,$r,\$found);
348
349 if ($found) {
350 $found_any += $found;
351
352 # we will skip delimiter before first occurence of field!
353 push @out, $del unless($found_any == 1);
354 push @out, $tmp;
355 }
356 $f_step++;
357 }
358
359 return if (! $found_any);
360
361 my $out = join('',@out);
362
363 if ($out) {
364 # add rest of format (suffix)
365 $out .= $format;
366
367 # add prefix if not there
368 $out = $prefix . $out if ($out !~ m/^\Q$prefix\E/);
369
370 $log->debug("result: $out");
371 }
372
373 if ($eval_code) {
374 my $eval = $self->fill_in($rec,$eval_code,$i) || return;
375 $log->debug("about to eval{$eval} format: $out");
376 return if (! $self->_eval($eval));
377 }
378
379 if ($filter_name) {
380 my @filter_args;
381 if ($filter_name =~ s/(\w+)\((.*)\)/$1/) {
382 @filter_args = split(/,/, $2);
383 }
384 if ($self->{'filter'}->{$filter_name}) {
385 $log->debug("about to filter{$filter_name} format: $out with arguments: ", join(",", @filter_args));
386 unshift @filter_args, $out;
387 $out = $self->{'filter'}->{$filter_name}->(@filter_args);
388 return unless(defined($out));
389 $log->debug("filter result: $out");
390 } elsif (! $warn_once->{$filter_name}) {
391 $log->warn("trying to use undefined filter $filter_name");
392 $warn_once->{$filter_name}++;
393 }
394 }
395
396 return $out;
397 }
398
399 =head2 parse_to_arr
400
401 Similar to C<parse>, but returns array of all repeatable fields
402
403 my @arr = $webpac->parse_to_arr($rec,'v250^a');
404
405 =cut
406
407 sub parse_to_arr {
408 my $self = shift;
409
410 my ($rec, $format_utf8) = @_;
411
412 my $log = $self->_get_logger();
413
414 $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
415 return if (! $format_utf8);
416
417 my $i = 0;
418 my @arr;
419
420 while (my $v = $self->parse($rec,$format_utf8,$i++)) {
421 push @arr, $v;
422 }
423
424 $log->debug("format '$format_utf8' returned ",--$i," elements: ", sub { join(" | ",@arr) }) if (@arr);
425
426 return @arr;
427 }
428
429
430 =head2 fill_in
431
432 Workhourse of all: takes record from in-memory structure of database and
433 strings with placeholders and returns string or array of with substituted
434 values from record.
435
436 my $text = $webpac->fill_in($rec,'v250^a');
437
438 Optional argument is ordinal number for repeatable fields. By default,
439 it's assume to be first repeatable field (fields are perl array, so first
440 element is 0).
441 Following example will read second value from repeatable field.
442
443 my $text = $webpac->fill_in($rec,'Title: v250^a',1);
444
445 This function B<does not> perform parsing of format to inteligenty skip
446 delimiters before fields which aren't used.
447
448 This method will automatically decode UTF-8 string to local code page
449 if needed.
450
451 =cut
452
453 sub fill_in {
454 my $self = shift;
455
456 my $log = $self->_get_logger();
457
458 my $rec = shift || $log->logconfess("need data record");
459 my $format = shift || $log->logconfess("need format to parse");
460 # iteration (for repeatable fields)
461 my $i = shift || 0;
462
463 $log->logdie("infitite loop in format $format") if ($i > ($self->{'max_mfn'} || 9999));
464
465 # FIXME remove for speedup?
466 $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
467
468 if (utf8::is_utf8($format)) {
469 $format = $self->_x($format);
470 }
471
472 my $found = 0;
473
474 my $eval_code;
475 # remove eval{...} from beginning
476 $eval_code = $1 if ($format =~ s/^eval{([^}]+)}//s);
477
478 my $filter_name;
479 # remove filter{...} from beginning
480 $filter_name = $1 if ($format =~ s/^filter{([^}]+)}//s);
481
482 # do actual replacement of placeholders
483 # repeatable fields
484 $format =~ s/v(\d+)(?:\^(\w))?/$self->get_data(\$rec,$1,$2,$i,\$found)/ges;
485 # non-repeatable fields
486 $format =~ s/s(\d+)(?:\^(\w))?/$self->get_data(\$rec,$1,$2,0,\$found)/ges;
487
488 if ($found) {
489 $log->debug("format: $format");
490 if ($eval_code) {
491 my $eval = $self->fill_in($rec,$eval_code,$i);
492 return if (! $self->_eval($eval));
493 }
494 if ($filter_name && $self->{'filter'}->{$filter_name}) {
495 $log->debug("filter '$filter_name' for $format");
496 $format = $self->{'filter'}->{$filter_name}->($format);
497 return unless(defined($format));
498 $log->debug("filter result: $format");
499 }
500 # do we have lookups?
501 if ($self->{'lookup'}) {
502 if ($self->{'lookup'}->can('lookup')) {
503 my @lookup = $self->{lookup}->lookup($format);
504 $log->debug("lookup $format", join(", ", @lookup));
505 return @lookup;
506 } else {
507 $log->warn("Have lookup object but can't invoke lookup method");
508 }
509 } else {
510 return $format;
511 }
512 } else {
513 return;
514 }
515 }
516
517
518 =head2 fill_in_to_arr
519
520 Similar to C<fill_in>, but returns array of all repeatable fields. Usable
521 for fields which have lookups, so they shouldn't be parsed but rather
522 C<fill_id>ed.
523
524 my @arr = $webpac->fill_in_to_arr($rec,'[v900];;[v250^a]');
525
526 =cut
527
528 sub fill_in_to_arr {
529 my $self = shift;
530
531 my ($rec, $format_utf8) = @_;
532
533 my $log = $self->_get_logger();
534
535 $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
536 return if (! $format_utf8);
537
538 my $i = 0;
539 my @arr;
540
541 while (my @v = $self->fill_in($rec,$format_utf8,$i++)) {
542 push @arr, @v;
543 }
544
545 $log->debug("format '$format_utf8' returned ",--$i," elements: ", sub { join(" | ",@arr) }) if (@arr);
546
547 return @arr;
548 }
549
550
551 =head2 get_data
552
553 Returns value from record.
554
555 my $text = $self->get_data(\$rec,$f,$sf,$i,\$found);
556
557 Arguments are:
558 record reference C<$rec>,
559 field C<$f>,
560 optional subfiled C<$sf>,
561 index for repeatable values C<$i>.
562
563 Optinal variable C<$found> will be incremeted if there
564 is field.
565
566 Returns value or empty string.
567
568 =cut
569
570 sub get_data {
571 my $self = shift;
572
573 my ($rec,$f,$sf,$i,$found) = @_;
574
575 if ($$rec->{$f}) {
576 return '' if (! $$rec->{$f}->[$i]);
577 no strict 'refs';
578 if ($sf && $$rec->{$f}->[$i]->{$sf}) {
579 $$found++ if (defined($$found));
580 return $$rec->{$f}->[$i]->{$sf};
581 } elsif (! $sf && $$rec->{$f}->[$i]) {
582 $$found++ if (defined($$found));
583 # it still might have subfield, just
584 # not specified, so we'll dump all
585 if ($$rec->{$f}->[$i] =~ /HASH/o) {
586 my $out;
587 foreach my $k (keys %{$$rec->{$f}->[$i]}) {
588 $out .= $$rec->{$f}->[$i]->{$k}." ";
589 }
590 return $out;
591 } else {
592 return $$rec->{$f}->[$i];
593 }
594 } else {
595 return '';
596 }
597 } else {
598 return '';
599 }
600 }
601
602
603 =head2 apply_format
604
605 Apply format specified in tag with C<format_name="name"> and
606 C<format_delimiter=";;">.
607
608 my $text = $webpac->apply_format($format_name,$format_delimiter,$data);
609
610 Formats can contain C<lookup{...}> if you need them.
611
612 =cut
613
614 sub apply_format {
615 my $self = shift;
616
617 my ($name,$delimiter,$data) = @_;
618
619 my $log = $self->_get_logger();
620
621 if (! $self->{'import_xml'}->{'format'}->{$name}) {
622 $log->warn("<format name=\"$name\"> is not defined in ",$self->{'import_xml_file'});
623 return $data;
624 }
625
626 $log->warn("no delimiter for format $name") if (! $delimiter);
627
628 my $format = $self->_x($self->{'import_xml'}->{'format'}->{$name}->{'content'}) || $log->logdie("can't find format '$name'");
629
630 my @data = split(/\Q$delimiter\E/, $data);
631
632 my $out = sprintf($format, @data);
633 $log->debug("using format $name [$format] on $data to produce: $out");
634
635 if ($self->{'lookup_regex'} && $out =~ $self->{'lookup_regex'}) {
636 return $self->{'lookup'}->lookup($out);
637 } else {
638 return $out;
639 }
640
641 }
642
643 =head2 sort_arr
644
645 Sort array ignoring case and html in data
646
647 my @sorted = $webpac->sort_arr(@unsorted);
648
649 =cut
650
651 sub sort_arr {
652 my $self = shift;
653
654 my $log = $self->_get_logger();
655
656 # FIXME add Schwartzian Transformation?
657
658 my @sorted = sort {
659 $a =~ s#<[^>]+/*>##;
660 $b =~ s#<[^>]+/*>##;
661 lc($b) cmp lc($a)
662 } @_;
663 $log->debug("sorted values: ",sub { join(", ",@sorted) });
664
665 return @sorted;
666 }
667
668
669 =head1 INTERNAL METHODS
670
671 =head2 _sort_by_order
672
673 Sort xml tags data structure accoding to C<order=""> attribute.
674
675 =cut
676
677 sub _sort_by_order {
678 my $self = shift;
679
680 my $va = $self->{'import_xml'}->{'indexer'}->{$a}->{'order'} ||
681 $self->{'import_xml'}->{'indexer'}->{$a};
682 my $vb = $self->{'import_xml'}->{'indexer'}->{$b}->{'order'} ||
683 $self->{'import_xml'}->{'indexer'}->{$b};
684
685 return $va <=> $vb;
686 }
687
688 =head2 _x
689
690 Convert strings from C<conf/normalize/*.xml> encoding into application
691 specific encoding (optinally specified using C<code_page> to C<new>
692 constructor).
693
694 my $text = $n->_x('normalize text string');
695
696 This is a stub so that other modules doesn't have to implement it.
697
698 =cut
699
700 sub _x {
701 my $self = shift;
702 return shift;
703 }
704
705
706 =head1 AUTHOR
707
708 Dobrica Pavlinusic, C<< <dpavlin@rot13.org> >>
709
710 =head1 COPYRIGHT & LICENSE
711
712 Copyright 2005 Dobrica Pavlinusic, All Rights Reserved.
713
714 This program is free software; you can redistribute it and/or modify it
715 under the same terms as Perl itself.
716
717 =cut
718
719 1; # End of WebPAC::Normalize

  ViewVC Help
Powered by ViewVC 1.1.26