/[webpac2]/trunk/lib/WebPAC/Normalize.pm
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Contents of /trunk/lib/WebPAC/Normalize.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 38 - (show annotations)
Sat Nov 12 21:21:50 2005 UTC (18 years, 4 months ago) by dpavlin
File size: 15397 byte(s)
added ForceContent so that tags without attributes work, added strict checking

1 package WebPAC::Normalize;
2
3 use warnings;
4 use strict;
5 use base 'WebPAC::Common';
6 use Data::Dumper;
7
8 =head1 NAME
9
10 WebPAC::Normalize - data mungling for normalisation
11
12 =head1 VERSION
13
14 Version 0.01
15
16 =cut
17
18 our $VERSION = '0.01';
19
20 =head1 SYNOPSIS
21
22 This package contains code that mungle data to produce normalized format.
23
24 It contains several assumptions:
25
26 =over
27
28 =item *
29
30 format of fields is defined using C<v123^a> notation for repeatable fields
31 or C<s123^a> for single (or first) value, where C<123> is field number and
32 C<a> is subfield.
33
34 =item *
35
36 source data records (C<$rec>) have unique identifiers in field C<000>
37
38 =item *
39
40 optional C<eval{length('v123^a') == 3}> tag at B<beginning of format> will be
41 perl code that is evaluated before producing output (value of field will be
42 interpolated before that)
43
44 =item *
45
46 optional C<filter{filter_name}> at B<begining of format> will apply perl
47 code defined as code ref on format after field substitution to producing
48 output
49
50 =item *
51
52 optional C<lookup{...}> will be then performed. See C<WebPAC::Lookups>.
53
54 =item *
55
56 at end, optional C<format>s rules are resolved. Format rules are similar to
57 C<sprintf> and can also contain C<lookup{...}> which is performed after
58 values are inserted in format.
59
60 =back
61
62 This also describes order in which transformations are applied (eval,
63 filter, lookup, format) which is important to undestand when deciding how to
64 solve your data mungling and normalisation process.
65
66
67
68
69 =head1 FUNCTIONS
70
71 =head2 new
72
73 Create new normalisation object
74
75 my $n = new WebPAC::Normalize::Something(
76 filter => {
77 'filter_name_1' => sub {
78 # filter code
79 return length($_);
80 }, ...
81 },
82 db => $db_obj,
83 lookup_regex => $lookup->regex,
84 lookup => $lookup_obj,
85 );
86
87 Parametar C<filter> defines user supplied snippets of perl code which can
88 be use with C<filter{...}> notation.
89
90 Recommended parametar C<lookup_regex> is used to enable parsing of lookups
91 in structures. If you pass this parametar, you must also pass C<lookup>
92 which is C<WebPAC::Lookup> object.
93
94 =cut
95
96 sub new {
97 my $class = shift;
98 my $self = {@_};
99 bless($self, $class);
100
101 my $r = $self->{'lookup_regex'} ? 1 : 0;
102 my $l = $self->{'lookup'} ? 1 : 0;
103
104 my $log = $self->_get_logger();
105
106 # those two must be in pair
107 if ( ($r & $l) != ($r || $l) ) {
108 my $log = $self->_get_logger();
109 $log->logdie("lookup_regex and lookup must be in pair");
110 }
111
112 $log->logdie("lookup must be WebPAC::Lookup object") if ($self->{'lookup'} && ! $self->{'lookup'}->isa('WebPAC::Lookup'));
113
114 $self ? return $self : return undef;
115 }
116
117
118 =head2 data_structure
119
120 Create in-memory data structure which represents normalized layout from
121 C<conf/normalize/*.xml>.
122
123 This structures are used to produce output.
124
125 my @ds = $webpac->data_structure($rec);
126
127 B<Note: historical oddity follows>
128
129 This method will also set C<< $webpac->{'currnet_filename'} >> if there is
130 C<< <filename> >> tag and C<< $webpac->{'headline'} >> if there is
131 C<< <headline> >> tag.
132
133 =cut
134
135 sub data_structure {
136 my $self = shift;
137
138 my $log = $self->_get_logger();
139
140 my $rec = shift;
141 $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
142
143 my $cache_file;
144
145 if ($self->{'db'}) {
146 my @ds = $self->{'db'}->load_ds($rec);
147 $log->debug("load_ds( rec = ", sub { Dumper($rec) }, ") = ", sub { Dumper(@ds) });
148 return @ds if ($#ds > 0);
149 $log->debug("cache miss, creating");
150 }
151
152 undef $self->{'currnet_filename'};
153 undef $self->{'headline'};
154
155 my @sorted_tags;
156 if ($self->{tags_by_order}) {
157 @sorted_tags = @{$self->{tags_by_order}};
158 } else {
159 @sorted_tags = sort { $self->_sort_by_order } keys %{$self->{'import_xml'}->{'indexer'}};
160 $self->{tags_by_order} = \@sorted_tags;
161 }
162
163 my @ds;
164
165 $log->debug("tags: ",sub { join(", ",@sorted_tags) });
166
167 foreach my $field (@sorted_tags) {
168
169 my $row;
170
171 #print "field $field [",$self->{'tag'},"] = ",Dumper($self->{'import_xml'}->{'indexer'}->{$field}->{$self->{'tag'}});
172
173 foreach my $tag (@{$self->{'import_xml'}->{'indexer'}->{$field}->{$self->{'tag'}}}) {
174 my $format;
175
176 $log->logdie("expected tag HASH and got $tag") unless (ref($tag) eq 'HASH');
177 $format = $tag->{'value'} || $tag->{'content'};
178
179 $log->debug("format: $format");
180
181 my @v;
182 if ($self->{'lookup_regex'} && $format =~ $self->{'lookup_regex'}) {
183 @v = $self->fill_in_to_arr($rec,$format);
184 } else {
185 @v = $self->parse_to_arr($rec,$format);
186 }
187 next if (! @v);
188
189 if ($tag->{'sort'}) {
190 @v = $self->sort_arr(@v);
191 }
192
193 # use format?
194 if ($tag->{'format_name'}) {
195 @v = map { $self->apply_format($tag->{'format_name'},$tag->{'format_delimiter'},$_) } @v;
196 }
197
198 if ($field eq 'filename') {
199 $self->{'current_filename'} = join('',@v);
200 $log->debug("filename: ",$self->{'current_filename'});
201 } elsif ($field eq 'headline') {
202 $self->{'headline'} .= join('',@v);
203 $log->debug("headline: ",$self->{'headline'});
204 next; # don't return headline in data_structure!
205 }
206
207 # delimiter will join repeatable fields
208 if ($tag->{'delimiter'}) {
209 @v = ( join($tag->{'delimiter'}, @v) );
210 }
211
212 # default types
213 my @types = qw(display swish);
214 # override by type attribute
215 @types = ( $tag->{'type'} ) if ($tag->{'type'});
216
217 foreach my $type (@types) {
218 # append to previous line?
219 $log->debug("type: $type ",sub { join(" ",@v) }, $row->{'append'} || 'no append');
220 if ($tag->{'append'}) {
221
222 # I will delimit appended part with
223 # delimiter (or ,)
224 my $d = $tag->{'delimiter'};
225 # default delimiter
226 $d ||= " ";
227
228 my $last = pop @{$row->{$type}};
229 $d = "" if (! $last);
230 $last .= $d . join($d, @v);
231 push @{$row->{$type}}, $last;
232
233 } else {
234 push @{$row->{$type}}, @v;
235 }
236 }
237
238
239 }
240
241 if ($row) {
242 $row->{'tag'} = $field;
243
244 # TODO: name_sigular, name_plural
245 my $name = $self->{'import_xml'}->{'indexer'}->{$field}->{'name'};
246 $row->{'name'} = $name ? $self->_x($name) : $field;
247
248 # post-sort all values in field
249 if ($self->{'import_xml'}->{'indexer'}->{$field}->{'sort'}) {
250 $log->warn("sort at field tag not implemented");
251 }
252
253 push @ds, $row;
254
255 $log->debug("row $field: ",sub { Dumper($row) });
256 }
257
258 }
259
260 $self->{'db'}->save_ds(
261 ds => \@ds,
262 current_filename => $self->{'current_filename'},
263 headline => $self->{'headline'},
264 ) if ($self->{'db'});
265
266 $log->debug("ds: ", sub { Dumper(@ds) });
267
268 return @ds;
269
270 }
271
272 =head2 parse
273
274 Perform smart parsing of string, skipping delimiters for fields which aren't
275 defined. It can also eval code in format starting with C<eval{...}> and
276 return output or nothing depending on eval code.
277
278 my $text = $webpac->parse($rec,'eval{"v901^a" eq "Deskriptor"}descriptor: v250^a', $i);
279
280 =cut
281
282 sub parse {
283 my $self = shift;
284
285 my ($rec, $format_utf8, $i) = @_;
286
287 return if (! $format_utf8);
288
289 my $log = $self->_get_logger();
290
291 $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
292
293 $i = 0 if (! $i);
294
295 my $format = $self->_x($format_utf8) || $log->logconfess("can't convert '$format_utf8' from UTF-8 to ",$self->{'code_page'});
296
297 my @out;
298
299 $log->debug("format: $format");
300
301 my $eval_code;
302 # remove eval{...} from beginning
303 $eval_code = $1 if ($format =~ s/^eval{([^}]+)}//s);
304
305 my $filter_name;
306 # remove filter{...} from beginning
307 $filter_name = $1 if ($format =~ s/^filter{([^}]+)}//s);
308
309 my $prefix;
310 my $all_found=0;
311
312 while ($format =~ s/^(.*?)(v|s)(\d+)(?:\^(\w))?//s) {
313
314 my $del = $1 || '';
315 $prefix ||= $del if ($all_found == 0);
316
317 # repeatable index
318 my $r = $i;
319 $r = 0 if (lc("$2") eq 's');
320
321 my $found = 0;
322 my $tmp = $self->get_data(\$rec,$3,$4,$r,\$found);
323
324 if ($found) {
325 push @out, $del;
326 push @out, $tmp;
327 $all_found += $found;
328 }
329 }
330
331 return if (! $all_found);
332
333 my $out = join('',@out);
334
335 if ($out) {
336 # add rest of format (suffix)
337 $out .= $format;
338
339 # add prefix if not there
340 $out = $prefix . $out if ($out !~ m/^\Q$prefix\E/);
341
342 $log->debug("result: $out");
343 }
344
345 if ($eval_code) {
346 my $eval = $self->fill_in($rec,$eval_code,$i) || return;
347 $log->debug("about to eval{$eval} format: $out");
348 return if (! $self->_eval($eval));
349 }
350
351 if ($filter_name && $self->{'filter'}->{$filter_name}) {
352 $log->debug("about to filter{$filter_name} format: $out");
353 $out = $self->{'filter'}->{$filter_name}->($out);
354 return unless(defined($out));
355 $log->debug("filter result: $out");
356 }
357
358 return $out;
359 }
360
361 =head2 parse_to_arr
362
363 Similar to C<parse>, but returns array of all repeatable fields
364
365 my @arr = $webpac->parse_to_arr($rec,'v250^a');
366
367 =cut
368
369 sub parse_to_arr {
370 my $self = shift;
371
372 my ($rec, $format_utf8) = @_;
373
374 my $log = $self->_get_logger();
375
376 $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
377 return if (! $format_utf8);
378
379 my $i = 0;
380 my @arr;
381
382 while (my $v = $self->parse($rec,$format_utf8,$i++)) {
383 push @arr, $v;
384 }
385
386 $log->debug("format '$format_utf8' returned ",--$i," elements: ", sub { join(" | ",@arr) }) if (@arr);
387
388 return @arr;
389 }
390
391
392 =head2 fill_in
393
394 Workhourse of all: takes record from in-memory structure of database and
395 strings with placeholders and returns string or array of with substituted
396 values from record.
397
398 my $text = $webpac->fill_in($rec,'v250^a');
399
400 Optional argument is ordinal number for repeatable fields. By default,
401 it's assume to be first repeatable field (fields are perl array, so first
402 element is 0).
403 Following example will read second value from repeatable field.
404
405 my $text = $webpac->fill_in($rec,'Title: v250^a',1);
406
407 This function B<does not> perform parsing of format to inteligenty skip
408 delimiters before fields which aren't used.
409
410 This method will automatically decode UTF-8 string to local code page
411 if needed.
412
413 =cut
414
415 sub fill_in {
416 my $self = shift;
417
418 my $log = $self->_get_logger();
419
420 my $rec = shift || $log->logconfess("need data record");
421 my $format = shift || $log->logconfess("need format to parse");
422 # iteration (for repeatable fields)
423 my $i = shift || 0;
424
425 $log->logdie("infitite loop in format $format") if ($i > ($self->{'max_mfn'} || 9999));
426
427 # FIXME remove for speedup?
428 $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
429
430 if (utf8::is_utf8($format)) {
431 $format = $self->_x($format);
432 }
433
434 my $found = 0;
435
436 my $eval_code;
437 # remove eval{...} from beginning
438 $eval_code = $1 if ($format =~ s/^eval{([^}]+)}//s);
439
440 my $filter_name;
441 # remove filter{...} from beginning
442 $filter_name = $1 if ($format =~ s/^filter{([^}]+)}//s);
443
444 # do actual replacement of placeholders
445 # repeatable fields
446 $format =~ s/v(\d+)(?:\^(\w))?/$self->get_data(\$rec,$1,$2,$i,\$found)/ges;
447 # non-repeatable fields
448 $format =~ s/s(\d+)(?:\^(\w))?/$self->get_data(\$rec,$1,$2,0,\$found)/ges;
449
450 if ($found) {
451 $log->debug("format: $format");
452 if ($eval_code) {
453 my $eval = $self->fill_in($rec,$eval_code,$i);
454 return if (! $self->_eval($eval));
455 }
456 if ($filter_name && $self->{'filter'}->{$filter_name}) {
457 $log->debug("filter '$filter_name' for $format");
458 $format = $self->{'filter'}->{$filter_name}->($format);
459 return unless(defined($format));
460 $log->debug("filter result: $format");
461 }
462 # do we have lookups?
463 if ($self->{'lookup'}) {
464 if ($self->{'lookup'}->can('lookup')) {
465 return $self->{'lookup'}->lookup($format);
466 } else {
467 $log->warn("Have lookup object but can't invoke lookup method");
468 }
469 } else {
470 return $format;
471 }
472 } else {
473 return;
474 }
475 }
476
477
478 =head2 fill_in_to_arr
479
480 Similar to C<fill_in>, but returns array of all repeatable fields. Usable
481 for fields which have lookups, so they shouldn't be parsed but rather
482 C<fill_id>ed.
483
484 my @arr = $webpac->fill_in_to_arr($rec,'[v900];;[v250^a]');
485
486 =cut
487
488 sub fill_in_to_arr {
489 my $self = shift;
490
491 my ($rec, $format_utf8) = @_;
492
493 my $log = $self->_get_logger();
494
495 $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
496 return if (! $format_utf8);
497
498 my $i = 0;
499 my @arr;
500
501 while (my @v = $self->fill_in($rec,$format_utf8,$i++)) {
502 push @arr, @v;
503 }
504
505 $log->debug("format '$format_utf8' returned ",--$i," elements: ", sub { join(" | ",@arr) }) if (@arr);
506
507 return @arr;
508 }
509
510
511 =head2 get_data
512
513 Returns value from record.
514
515 my $text = $self->get_data(\$rec,$f,$sf,$i,\$found);
516
517 Arguments are:
518 record reference C<$rec>,
519 field C<$f>,
520 optional subfiled C<$sf>,
521 index for repeatable values C<$i>.
522
523 Optinal variable C<$found> will be incremeted if there
524 is field.
525
526 Returns value or empty string.
527
528 =cut
529
530 sub get_data {
531 my $self = shift;
532
533 my ($rec,$f,$sf,$i,$found) = @_;
534
535 if ($$rec->{$f}) {
536 return '' if (! $$rec->{$f}->[$i]);
537 no strict 'refs';
538 if ($sf && $$rec->{$f}->[$i]->{$sf}) {
539 $$found++ if (defined($$found));
540 return $$rec->{$f}->[$i]->{$sf};
541 } elsif ($$rec->{$f}->[$i]) {
542 $$found++ if (defined($$found));
543 # it still might have subfield, just
544 # not specified, so we'll dump all
545 if ($$rec->{$f}->[$i] =~ /HASH/o) {
546 my $out;
547 foreach my $k (keys %{$$rec->{$f}->[$i]}) {
548 $out .= $$rec->{$f}->[$i]->{$k}." ";
549 }
550 return $out;
551 } else {
552 return $$rec->{$f}->[$i];
553 }
554 }
555 } else {
556 return '';
557 }
558 }
559
560
561 =head2 apply_format
562
563 Apply format specified in tag with C<format_name="name"> and
564 C<format_delimiter=";;">.
565
566 my $text = $webpac->apply_format($format_name,$format_delimiter,$data);
567
568 Formats can contain C<lookup{...}> if you need them.
569
570 =cut
571
572 sub apply_format {
573 my $self = shift;
574
575 my ($name,$delimiter,$data) = @_;
576
577 my $log = $self->_get_logger();
578
579 if (! $self->{'import_xml'}->{'format'}->{$name}) {
580 $log->warn("<format name=\"$name\"> is not defined in ",$self->{'import_xml_file'});
581 return $data;
582 }
583
584 $log->warn("no delimiter for format $name") if (! $delimiter);
585
586 my $format = $self->_x($self->{'import_xml'}->{'format'}->{$name}->{'content'}) || $log->logdie("can't find format '$name'");
587
588 my @data = split(/\Q$delimiter\E/, $data);
589
590 my $out = sprintf($format, @data);
591 $log->debug("using format $name [$format] on $data to produce: $out");
592
593 if ($self->{'lookup_regex'} && $out =~ $self->{'lookup_regex'}) {
594 return $self->{'lookup'}->lookup($out);
595 } else {
596 return $out;
597 }
598
599 }
600
601 =head2 sort_arr
602
603 Sort array ignoring case and html in data
604
605 my @sorted = $webpac->sort_arr(@unsorted);
606
607 =cut
608
609 sub sort_arr {
610 my $self = shift;
611
612 my $log = $self->_get_logger();
613
614 # FIXME add Schwartzian Transformation?
615
616 my @sorted = sort {
617 $a =~ s#<[^>]+/*>##;
618 $b =~ s#<[^>]+/*>##;
619 lc($b) cmp lc($a)
620 } @_;
621 $log->debug("sorted values: ",sub { join(", ",@sorted) });
622
623 return @sorted;
624 }
625
626
627 =head1 INTERNAL METHODS
628
629 =head2 _sort_by_order
630
631 Sort xml tags data structure accoding to C<order=""> attribute.
632
633 =cut
634
635 sub _sort_by_order {
636 my $self = shift;
637
638 my $va = $self->{'import_xml'}->{'indexer'}->{$a}->{'order'} ||
639 $self->{'import_xml'}->{'indexer'}->{$a};
640 my $vb = $self->{'import_xml'}->{'indexer'}->{$b}->{'order'} ||
641 $self->{'import_xml'}->{'indexer'}->{$b};
642
643 return $va <=> $vb;
644 }
645
646 =head2 _x
647
648 Convert strings from C<conf/normalize/*.xml> encoding into application
649 specific encoding (optinally specified using C<code_page> to C<new>
650 constructor).
651
652 my $text = $n->_x('normalize text string');
653
654 This is a stub so that other modules doesn't have to implement it.
655
656 =cut
657
658 sub _x {
659 my $self = shift;
660 return shift;
661 }
662
663
664 =head1 AUTHOR
665
666 Dobrica Pavlinusic, C<< <dpavlin@rot13.org> >>
667
668 =head1 COPYRIGHT & LICENSE
669
670 Copyright 2005 Dobrica Pavlinusic, All Rights Reserved.
671
672 This program is free software; you can redistribute it and/or modify it
673 under the same terms as Perl itself.
674
675 =cut
676
677 1; # End of WebPAC::DB

  ViewVC Help
Powered by ViewVC 1.1.26