/[webpac2]/trunk/lib/WebPAC/Normalize.pm
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Contents of /trunk/lib/WebPAC/Normalize.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 31 - (show annotations)
Sun Jul 24 15:03:11 2005 UTC (18 years, 8 months ago) by dpavlin
File size: 15304 byte(s)
re-worked logging, added no_log option to disable logging

1 package WebPAC::Normalize;
2
3 use warnings;
4 use strict;
5 use base 'WebPAC::Common';
6 use Data::Dumper;
7
8 =head1 NAME
9
10 WebPAC::Normalize - data mungling for normalisation
11
12 =head1 VERSION
13
14 Version 0.01
15
16 =cut
17
18 our $VERSION = '0.01';
19
20 =head1 SYNOPSIS
21
22 This package contains code that mungle data to produce normalized format.
23
24 It contains several assumptions:
25
26 =over
27
28 =item *
29
30 format of fields is defined using C<v123^a> notation for repeatable fields
31 or C<s123^a> for single (or first) value, where C<123> is field number and
32 C<a> is subfield.
33
34 =item *
35
36 source data records (C<$rec>) have unique identifiers in field C<000>
37
38 =item *
39
40 optional C<eval{length('v123^a') == 3}> tag at B<beginning of format> will be
41 perl code that is evaluated before producing output (value of field will be
42 interpolated before that)
43
44 =item *
45
46 optional C<filter{filter_name}> at B<begining of format> will apply perl
47 code defined as code ref on format after field substitution to producing
48 output
49
50 =item *
51
52 optional C<lookup{...}> will be then performed. See C<WebPAC::Lookups>.
53
54 =item *
55
56 at end, optional C<format>s rules are resolved. Format rules are similar to
57 C<sprintf> and can also contain C<lookup{...}> which is performed after
58 values are inserted in format.
59
60 =back
61
62 This also describes order in which transformations are applied (eval,
63 filter, lookup, format) which is important to undestand when deciding how to
64 solve your data mungling and normalisation process.
65
66
67
68
69 =head1 FUNCTIONS
70
71 =head2 new
72
73 Create new normalisation object
74
75 my $n = new WebPAC::Normalize::Something(
76 filter => {
77 'filter_name_1' => sub {
78 # filter code
79 return length($_);
80 }, ...
81 },
82 db => $db_obj,
83 lookup_regex => $lookup->regex,
84 lookup => $lookup_obj,
85 );
86
87 Parametar C<filter> defines user supplied snippets of perl code which can
88 be use with C<filter{...}> notation.
89
90 Recommended parametar C<lookup_regex> is used to enable parsing of lookups
91 in structures. If you pass this parametar, you must also pass C<lookup>
92 which is C<WebPAC::Lookup> object.
93
94 =cut
95
96 sub new {
97 my $class = shift;
98 my $self = {@_};
99 bless($self, $class);
100
101 my $r = $self->{'lookup_regex'} ? 1 : 0;
102 my $l = $self->{'lookup'} ? 1 : 0;
103
104 my $log = $self->_get_logger();
105
106 # those two must be in pair
107 if ( ($r & $l) != ($r || $l) ) {
108 my $log = $self->_get_logger();
109 $log->logdie("lookup_regex and lookup must be in pair");
110 }
111
112 $log->logdie("lookup must be WebPAC::Lookup object") if ($self->{'lookup'} && ! $self->{'lookup'}->isa('WebPAC::Lookup'));
113
114 $self ? return $self : return undef;
115 }
116
117
118 =head2 data_structure
119
120 Create in-memory data structure which represents normalized layout from
121 C<conf/normalize/*.xml>.
122
123 This structures are used to produce output.
124
125 my @ds = $webpac->data_structure($rec);
126
127 B<Note: historical oddity follows>
128
129 This method will also set C<< $webpac->{'currnet_filename'} >> if there is
130 C<< <filename> >> tag and C<< $webpac->{'headline'} >> if there is
131 C<< <headline> >> tag.
132
133 =cut
134
135 sub data_structure {
136 my $self = shift;
137
138 my $log = $self->_get_logger();
139
140 my $rec = shift;
141 $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
142
143 my $cache_file;
144
145 if ($self->{'db'}) {
146 my @ds = $self->{'db'}->load_ds($rec);
147 $log->debug("load_ds( rec = ", sub { Dumper($rec) }, ") = ", sub { Dumper(@ds) });
148 return @ds if ($#ds > 0);
149 $log->debug("cache miss, creating");
150 }
151
152 undef $self->{'currnet_filename'};
153 undef $self->{'headline'};
154
155 my @sorted_tags;
156 if ($self->{tags_by_order}) {
157 @sorted_tags = @{$self->{tags_by_order}};
158 } else {
159 @sorted_tags = sort { $self->_sort_by_order } keys %{$self->{'import_xml'}->{'indexer'}};
160 $self->{tags_by_order} = \@sorted_tags;
161 }
162
163 my @ds;
164
165 $log->debug("tags: ",sub { join(", ",@sorted_tags) });
166
167 foreach my $field (@sorted_tags) {
168
169 my $row;
170
171 #print "field $field [",$self->{'tag'},"] = ",Dumper($self->{'import_xml'}->{'indexer'}->{$field}->{$self->{'tag'}});
172
173 foreach my $tag (@{$self->{'import_xml'}->{'indexer'}->{$field}->{$self->{'tag'}}}) {
174 my $format = $tag->{'value'} || $tag->{'content'};
175
176 $log->debug("format: $format");
177
178 my @v;
179 if ($self->{'lookup_regex'} && $format =~ $self->{'lookup_regex'}) {
180 @v = $self->fill_in_to_arr($rec,$format);
181 } else {
182 @v = $self->parse_to_arr($rec,$format);
183 }
184 next if (! @v);
185
186 if ($tag->{'sort'}) {
187 @v = $self->sort_arr(@v);
188 }
189
190 # use format?
191 if ($tag->{'format_name'}) {
192 @v = map { $self->apply_format($tag->{'format_name'},$tag->{'format_delimiter'},$_) } @v;
193 }
194
195 if ($field eq 'filename') {
196 $self->{'current_filename'} = join('',@v);
197 $log->debug("filename: ",$self->{'current_filename'});
198 } elsif ($field eq 'headline') {
199 $self->{'headline'} .= join('',@v);
200 $log->debug("headline: ",$self->{'headline'});
201 next; # don't return headline in data_structure!
202 }
203
204 # delimiter will join repeatable fields
205 if ($tag->{'delimiter'}) {
206 @v = ( join($tag->{'delimiter'}, @v) );
207 }
208
209 # default types
210 my @types = qw(display swish);
211 # override by type attribute
212 @types = ( $tag->{'type'} ) if ($tag->{'type'});
213
214 foreach my $type (@types) {
215 # append to previous line?
216 $log->debug("type: $type ",sub { join(" ",@v) }, $row->{'append'} || 'no append');
217 if ($tag->{'append'}) {
218
219 # I will delimit appended part with
220 # delimiter (or ,)
221 my $d = $tag->{'delimiter'};
222 # default delimiter
223 $d ||= " ";
224
225 my $last = pop @{$row->{$type}};
226 $d = "" if (! $last);
227 $last .= $d . join($d, @v);
228 push @{$row->{$type}}, $last;
229
230 } else {
231 push @{$row->{$type}}, @v;
232 }
233 }
234
235
236 }
237
238 if ($row) {
239 $row->{'tag'} = $field;
240
241 # TODO: name_sigular, name_plural
242 my $name = $self->{'import_xml'}->{'indexer'}->{$field}->{'name'};
243 $row->{'name'} = $name ? $self->_x($name) : $field;
244
245 # post-sort all values in field
246 if ($self->{'import_xml'}->{'indexer'}->{$field}->{'sort'}) {
247 $log->warn("sort at field tag not implemented");
248 }
249
250 push @ds, $row;
251
252 $log->debug("row $field: ",sub { Dumper($row) });
253 }
254
255 }
256
257 $self->{'db'}->save_ds(
258 ds => \@ds,
259 current_filename => $self->{'current_filename'},
260 headline => $self->{'headline'},
261 ) if ($self->{'db'});
262
263 $log->debug("ds: ", sub { Dumper(@ds) });
264
265 return @ds;
266
267 }
268
269 =head2 parse
270
271 Perform smart parsing of string, skipping delimiters for fields which aren't
272 defined. It can also eval code in format starting with C<eval{...}> and
273 return output or nothing depending on eval code.
274
275 my $text = $webpac->parse($rec,'eval{"v901^a" eq "Deskriptor"}descriptor: v250^a', $i);
276
277 =cut
278
279 sub parse {
280 my $self = shift;
281
282 my ($rec, $format_utf8, $i) = @_;
283
284 return if (! $format_utf8);
285
286 my $log = $self->_get_logger();
287
288 $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
289
290 $i = 0 if (! $i);
291
292 my $format = $self->_x($format_utf8) || $log->logconfess("can't convert '$format_utf8' from UTF-8 to ",$self->{'code_page'});
293
294 my @out;
295
296 $log->debug("format: $format");
297
298 my $eval_code;
299 # remove eval{...} from beginning
300 $eval_code = $1 if ($format =~ s/^eval{([^}]+)}//s);
301
302 my $filter_name;
303 # remove filter{...} from beginning
304 $filter_name = $1 if ($format =~ s/^filter{([^}]+)}//s);
305
306 my $prefix;
307 my $all_found=0;
308
309 while ($format =~ s/^(.*?)(v|s)(\d+)(?:\^(\w))?//s) {
310
311 my $del = $1 || '';
312 $prefix ||= $del if ($all_found == 0);
313
314 # repeatable index
315 my $r = $i;
316 $r = 0 if (lc("$2") eq 's');
317
318 my $found = 0;
319 my $tmp = $self->get_data(\$rec,$3,$4,$r,\$found);
320
321 if ($found) {
322 push @out, $del;
323 push @out, $tmp;
324 $all_found += $found;
325 }
326 }
327
328 return if (! $all_found);
329
330 my $out = join('',@out);
331
332 if ($out) {
333 # add rest of format (suffix)
334 $out .= $format;
335
336 # add prefix if not there
337 $out = $prefix . $out if ($out !~ m/^\Q$prefix\E/);
338
339 $log->debug("result: $out");
340 }
341
342 if ($eval_code) {
343 my $eval = $self->fill_in($rec,$eval_code,$i) || return;
344 $log->debug("about to eval{$eval} format: $out");
345 return if (! $self->_eval($eval));
346 }
347
348 if ($filter_name && $self->{'filter'}->{$filter_name}) {
349 $log->debug("about to filter{$filter_name} format: $out");
350 $out = $self->{'filter'}->{$filter_name}->($out);
351 return unless(defined($out));
352 $log->debug("filter result: $out");
353 }
354
355 return $out;
356 }
357
358 =head2 parse_to_arr
359
360 Similar to C<parse>, but returns array of all repeatable fields
361
362 my @arr = $webpac->parse_to_arr($rec,'v250^a');
363
364 =cut
365
366 sub parse_to_arr {
367 my $self = shift;
368
369 my ($rec, $format_utf8) = @_;
370
371 my $log = $self->_get_logger();
372
373 $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
374 return if (! $format_utf8);
375
376 my $i = 0;
377 my @arr;
378
379 while (my $v = $self->parse($rec,$format_utf8,$i++)) {
380 push @arr, $v;
381 }
382
383 $log->debug("format '$format_utf8' returned ",--$i," elements: ", sub { join(" | ",@arr) }) if (@arr);
384
385 return @arr;
386 }
387
388
389 =head2 fill_in
390
391 Workhourse of all: takes record from in-memory structure of database and
392 strings with placeholders and returns string or array of with substituted
393 values from record.
394
395 my $text = $webpac->fill_in($rec,'v250^a');
396
397 Optional argument is ordinal number for repeatable fields. By default,
398 it's assume to be first repeatable field (fields are perl array, so first
399 element is 0).
400 Following example will read second value from repeatable field.
401
402 my $text = $webpac->fill_in($rec,'Title: v250^a',1);
403
404 This function B<does not> perform parsing of format to inteligenty skip
405 delimiters before fields which aren't used.
406
407 This method will automatically decode UTF-8 string to local code page
408 if needed.
409
410 =cut
411
412 sub fill_in {
413 my $self = shift;
414
415 my $log = $self->_get_logger();
416
417 my $rec = shift || $log->logconfess("need data record");
418 my $format = shift || $log->logconfess("need format to parse");
419 # iteration (for repeatable fields)
420 my $i = shift || 0;
421
422 $log->logdie("infitite loop in format $format") if ($i > ($self->{'max_mfn'} || 9999));
423
424 # FIXME remove for speedup?
425 $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
426
427 if (utf8::is_utf8($format)) {
428 $format = $self->_x($format);
429 }
430
431 my $found = 0;
432
433 my $eval_code;
434 # remove eval{...} from beginning
435 $eval_code = $1 if ($format =~ s/^eval{([^}]+)}//s);
436
437 my $filter_name;
438 # remove filter{...} from beginning
439 $filter_name = $1 if ($format =~ s/^filter{([^}]+)}//s);
440
441 # do actual replacement of placeholders
442 # repeatable fields
443 $format =~ s/v(\d+)(?:\^(\w))?/$self->get_data(\$rec,$1,$2,$i,\$found)/ges;
444 # non-repeatable fields
445 $format =~ s/s(\d+)(?:\^(\w))?/$self->get_data(\$rec,$1,$2,0,\$found)/ges;
446
447 if ($found) {
448 $log->debug("format: $format");
449 if ($eval_code) {
450 my $eval = $self->fill_in($rec,$eval_code,$i);
451 return if (! $self->_eval($eval));
452 }
453 if ($filter_name && $self->{'filter'}->{$filter_name}) {
454 $log->debug("filter '$filter_name' for $format");
455 $format = $self->{'filter'}->{$filter_name}->($format);
456 return unless(defined($format));
457 $log->debug("filter result: $format");
458 }
459 # do we have lookups?
460 if ($self->{'lookup'}) {
461 if ($self->{'lookup'}->can('lookup')) {
462 return $self->{'lookup'}->lookup($format);
463 } else {
464 $log->warn("Have lookup object but can't invoke lookup method");
465 }
466 } else {
467 return $format;
468 }
469 } else {
470 return;
471 }
472 }
473
474
475 =head2 fill_in_to_arr
476
477 Similar to C<fill_in>, but returns array of all repeatable fields. Usable
478 for fields which have lookups, so they shouldn't be parsed but rather
479 C<fill_id>ed.
480
481 my @arr = $webpac->fill_in_to_arr($rec,'[v900];;[v250^a]');
482
483 =cut
484
485 sub fill_in_to_arr {
486 my $self = shift;
487
488 my ($rec, $format_utf8) = @_;
489
490 my $log = $self->_get_logger();
491
492 $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
493 return if (! $format_utf8);
494
495 my $i = 0;
496 my @arr;
497
498 while (my @v = $self->fill_in($rec,$format_utf8,$i++)) {
499 push @arr, @v;
500 }
501
502 $log->debug("format '$format_utf8' returned ",--$i," elements: ", sub { join(" | ",@arr) }) if (@arr);
503
504 return @arr;
505 }
506
507
508 =head2 get_data
509
510 Returns value from record.
511
512 my $text = $self->get_data(\$rec,$f,$sf,$i,\$found);
513
514 Arguments are:
515 record reference C<$rec>,
516 field C<$f>,
517 optional subfiled C<$sf>,
518 index for repeatable values C<$i>.
519
520 Optinal variable C<$found> will be incremeted if there
521 is field.
522
523 Returns value or empty string.
524
525 =cut
526
527 sub get_data {
528 my $self = shift;
529
530 my ($rec,$f,$sf,$i,$found) = @_;
531
532 if ($$rec->{$f}) {
533 return '' if (! $$rec->{$f}->[$i]);
534 no strict 'refs';
535 if ($sf && $$rec->{$f}->[$i]->{$sf}) {
536 $$found++ if (defined($$found));
537 return $$rec->{$f}->[$i]->{$sf};
538 } elsif ($$rec->{$f}->[$i]) {
539 $$found++ if (defined($$found));
540 # it still might have subfield, just
541 # not specified, so we'll dump all
542 if ($$rec->{$f}->[$i] =~ /HASH/o) {
543 my $out;
544 foreach my $k (keys %{$$rec->{$f}->[$i]}) {
545 $out .= $$rec->{$f}->[$i]->{$k}." ";
546 }
547 return $out;
548 } else {
549 return $$rec->{$f}->[$i];
550 }
551 }
552 } else {
553 return '';
554 }
555 }
556
557
558 =head2 apply_format
559
560 Apply format specified in tag with C<format_name="name"> and
561 C<format_delimiter=";;">.
562
563 my $text = $webpac->apply_format($format_name,$format_delimiter,$data);
564
565 Formats can contain C<lookup{...}> if you need them.
566
567 =cut
568
569 sub apply_format {
570 my $self = shift;
571
572 my ($name,$delimiter,$data) = @_;
573
574 my $log = $self->_get_logger();
575
576 if (! $self->{'import_xml'}->{'format'}->{$name}) {
577 $log->warn("<format name=\"$name\"> is not defined in ",$self->{'import_xml_file'});
578 return $data;
579 }
580
581 $log->warn("no delimiter for format $name") if (! $delimiter);
582
583 my $format = $self->_x($self->{'import_xml'}->{'format'}->{$name}->{'content'}) || $log->logdie("can't find format '$name'");
584
585 my @data = split(/\Q$delimiter\E/, $data);
586
587 my $out = sprintf($format, @data);
588 $log->debug("using format $name [$format] on $data to produce: $out");
589
590 if ($self->{'lookup_regex'} && $out =~ $self->{'lookup_regex'}) {
591 return $self->{'lookup'}->lookup($out);
592 } else {
593 return $out;
594 }
595
596 }
597
598 =head2 sort_arr
599
600 Sort array ignoring case and html in data
601
602 my @sorted = $webpac->sort_arr(@unsorted);
603
604 =cut
605
606 sub sort_arr {
607 my $self = shift;
608
609 my $log = $self->_get_logger();
610
611 # FIXME add Schwartzian Transformation?
612
613 my @sorted = sort {
614 $a =~ s#<[^>]+/*>##;
615 $b =~ s#<[^>]+/*>##;
616 lc($b) cmp lc($a)
617 } @_;
618 $log->debug("sorted values: ",sub { join(", ",@sorted) });
619
620 return @sorted;
621 }
622
623
624 =head1 INTERNAL METHODS
625
626 =head2 _sort_by_order
627
628 Sort xml tags data structure accoding to C<order=""> attribute.
629
630 =cut
631
632 sub _sort_by_order {
633 my $self = shift;
634
635 my $va = $self->{'import_xml'}->{'indexer'}->{$a}->{'order'} ||
636 $self->{'import_xml'}->{'indexer'}->{$a};
637 my $vb = $self->{'import_xml'}->{'indexer'}->{$b}->{'order'} ||
638 $self->{'import_xml'}->{'indexer'}->{$b};
639
640 return $va <=> $vb;
641 }
642
643 =head2 _x
644
645 Convert strings from C<conf/normalize/*.xml> encoding into application
646 specific encoding (optinally specified using C<code_page> to C<new>
647 constructor).
648
649 my $text = $n->_x('normalize text string');
650
651 This is a stub so that other modules doesn't have to implement it.
652
653 =cut
654
655 sub _x {
656 my $self = shift;
657 return shift;
658 }
659
660
661 =head1 AUTHOR
662
663 Dobrica Pavlinusic, C<< <dpavlin@rot13.org> >>
664
665 =head1 COPYRIGHT & LICENSE
666
667 Copyright 2005 Dobrica Pavlinusic, All Rights Reserved.
668
669 This program is free software; you can redistribute it and/or modify it
670 under the same terms as Perl itself.
671
672 =cut
673
674 1; # End of WebPAC::DB

  ViewVC Help
Powered by ViewVC 1.1.26