/[webpac2]/trunk/lib/WebPAC/Normalize.pm
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Contents of /trunk/lib/WebPAC/Normalize.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 64 - (show annotations)
Tue Nov 15 16:56:44 2005 UTC (18 years, 4 months ago) by dpavlin
File size: 15574 byte(s)
 r8894@llin:  dpavlin | 2005-11-15 17:56:56 +0100
 fixed WebPAC::Normalize::get_data to work when called with subfield which
 doesn't exist, added tests

1 package WebPAC::Normalize;
2
3 use warnings;
4 use strict;
5 use base 'WebPAC::Common';
6 use Data::Dumper;
7
8 =head1 NAME
9
10 WebPAC::Normalize - data mungling for normalisation
11
12 =head1 VERSION
13
14 Version 0.01
15
16 =cut
17
18 our $VERSION = '0.01';
19
20 =head1 SYNOPSIS
21
22 This package contains code that mungle data to produce normalized format.
23
24 It contains several assumptions:
25
26 =over
27
28 =item *
29
30 format of fields is defined using C<v123^a> notation for repeatable fields
31 or C<s123^a> for single (or first) value, where C<123> is field number and
32 C<a> is subfield.
33
34 =item *
35
36 source data records (C<$rec>) have unique identifiers in field C<000>
37
38 =item *
39
40 optional C<eval{length('v123^a') == 3}> tag at B<beginning of format> will be
41 perl code that is evaluated before producing output (value of field will be
42 interpolated before that)
43
44 =item *
45
46 optional C<filter{filter_name}> at B<begining of format> will apply perl
47 code defined as code ref on format after field substitution to producing
48 output
49
50 =item *
51
52 optional C<lookup{...}> will be then performed. See C<WebPAC::Lookups>.
53
54 =item *
55
56 at end, optional C<format>s rules are resolved. Format rules are similar to
57 C<sprintf> and can also contain C<lookup{...}> which is performed after
58 values are inserted in format.
59
60 =back
61
62 This also describes order in which transformations are applied (eval,
63 filter, lookup, format) which is important to undestand when deciding how to
64 solve your data mungling and normalisation process.
65
66
67
68
69 =head1 FUNCTIONS
70
71 =head2 new
72
73 Create new normalisation object
74
75 my $n = new WebPAC::Normalize::Something(
76 filter => {
77 'filter_name_1' => sub {
78 # filter code
79 return length($_);
80 }, ...
81 },
82 db => $db_obj,
83 lookup_regex => $lookup->regex,
84 lookup => $lookup_obj,
85 );
86
87 Parametar C<filter> defines user supplied snippets of perl code which can
88 be use with C<filter{...}> notation.
89
90 Recommended parametar C<lookup_regex> is used to enable parsing of lookups
91 in structures. If you pass this parametar, you must also pass C<lookup>
92 which is C<WebPAC::Lookup> object.
93
94 =cut
95
96 sub new {
97 my $class = shift;
98 my $self = {@_};
99 bless($self, $class);
100
101 my $r = $self->{'lookup_regex'} ? 1 : 0;
102 my $l = $self->{'lookup'} ? 1 : 0;
103
104 my $log = $self->_get_logger();
105
106 # those two must be in pair
107 if ( ($r & $l) != ($r || $l) ) {
108 my $log = $self->_get_logger();
109 $log->logdie("lookup_regex and lookup must be in pair");
110 }
111
112 $log->logdie("lookup must be WebPAC::Lookup object") if ($self->{'lookup'} && ! $self->{'lookup'}->isa('WebPAC::Lookup'));
113
114 $self ? return $self : return undef;
115 }
116
117
118 =head2 data_structure
119
120 Create in-memory data structure which represents normalized layout from
121 C<conf/normalize/*.xml>.
122
123 This structures are used to produce output.
124
125 my @ds = $webpac->data_structure($rec);
126
127 B<Note: historical oddity follows>
128
129 This method will also set C<< $webpac->{'currnet_filename'} >> if there is
130 C<< <filename> >> tag and C<< $webpac->{'headline'} >> if there is
131 C<< <headline> >> tag.
132
133 =cut
134
135 sub data_structure {
136 my $self = shift;
137
138 my $log = $self->_get_logger();
139
140 my $rec = shift;
141 $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
142
143 my $cache_file;
144
145 if ($self->{'db'}) {
146 my @ds = $self->{'db'}->load_ds($rec);
147 $log->debug("load_ds( rec = ", sub { Dumper($rec) }, ") = ", sub { Dumper(@ds) });
148 return @ds if ($#ds > 0);
149 $log->debug("cache miss, creating");
150 }
151
152 undef $self->{'currnet_filename'};
153 undef $self->{'headline'};
154
155 my @sorted_tags;
156 if ($self->{tags_by_order}) {
157 @sorted_tags = @{$self->{tags_by_order}};
158 } else {
159 @sorted_tags = sort { $self->_sort_by_order } keys %{$self->{'import_xml'}->{'indexer'}};
160 $self->{tags_by_order} = \@sorted_tags;
161 }
162
163 my @ds;
164
165 $log->debug("tags: ",sub { join(", ",@sorted_tags) });
166
167 foreach my $field (@sorted_tags) {
168
169 my $row;
170
171 #print "field $field [",$self->{'tag'},"] = ",Dumper($self->{'import_xml'}->{'indexer'}->{$field}->{$self->{'tag'}});
172
173 foreach my $tag (@{$self->{'import_xml'}->{'indexer'}->{$field}->{$self->{'tag'}}}) {
174 my $format;
175
176 $log->logdie("expected tag HASH and got $tag") unless (ref($tag) eq 'HASH');
177 $format = $tag->{'value'} || $tag->{'content'};
178
179 $log->debug("format: $format");
180
181 my @v;
182 if ($self->{'lookup_regex'} && $format =~ $self->{'lookup_regex'}) {
183 @v = $self->fill_in_to_arr($rec,$format);
184 } else {
185 @v = $self->parse_to_arr($rec,$format);
186 }
187 next if (! @v);
188
189 if ($tag->{'sort'}) {
190 @v = $self->sort_arr(@v);
191 }
192
193 # use format?
194 if ($tag->{'format_name'}) {
195 @v = map { $self->apply_format($tag->{'format_name'},$tag->{'format_delimiter'},$_) } @v;
196 }
197
198 if ($field eq 'filename') {
199 $self->{'current_filename'} = join('',@v);
200 $log->debug("filename: ",$self->{'current_filename'});
201 } elsif ($field eq 'headline') {
202 $self->{'headline'} .= join('',@v);
203 $log->debug("headline: ",$self->{'headline'});
204 next; # don't return headline in data_structure!
205 }
206
207 # delimiter will join repeatable fields
208 if ($tag->{'delimiter'}) {
209 @v = ( join($tag->{'delimiter'}, @v) );
210 }
211
212 # default types
213 my @types = qw(display swish);
214 # override by type attribute
215 @types = ( $tag->{'type'} ) if ($tag->{'type'});
216
217 foreach my $type (@types) {
218 # append to previous line?
219 $log->debug("type: $type ",sub { join(" ",@v) }, $row->{'append'} || 'no append');
220 if ($tag->{'append'}) {
221
222 # I will delimit appended part with
223 # delimiter (or ,)
224 my $d = $tag->{'delimiter'};
225 # default delimiter
226 $d ||= " ";
227
228 my $last = pop @{$row->{$type}};
229 $d = "" if (! $last);
230 $last .= $d . join($d, @v);
231 push @{$row->{$type}}, $last;
232
233 } else {
234 push @{$row->{$type}}, @v;
235 }
236 }
237
238
239 }
240
241 if ($row) {
242 $row->{'tag'} = $field;
243
244 # TODO: name_sigular, name_plural
245 my $name = $self->{'import_xml'}->{'indexer'}->{$field}->{'name'};
246 $row->{'name'} = $name ? $self->_x($name) : $field;
247
248 # post-sort all values in field
249 if ($self->{'import_xml'}->{'indexer'}->{$field}->{'sort'}) {
250 $log->warn("sort at field tag not implemented");
251 }
252
253 push @ds, $row;
254
255 $log->debug("row $field: ",sub { Dumper($row) });
256 }
257
258 }
259
260 $log->logdie("there is no current_filename defined! Do you have filename tag in conf/normalize/?.xml") unless ($self->{'current_filename'});
261
262 $self->{'db'}->save_ds(
263 ds => \@ds,
264 current_filename => $self->{'current_filename'},
265 headline => $self->{'headline'},
266 ) if ($self->{'db'});
267
268 $log->debug("ds: ", sub { Dumper(@ds) });
269
270 return @ds;
271
272 }
273
274 =head2 parse
275
276 Perform smart parsing of string, skipping delimiters for fields which aren't
277 defined. It can also eval code in format starting with C<eval{...}> and
278 return output or nothing depending on eval code.
279
280 my $text = $webpac->parse($rec,'eval{"v901^a" eq "Deskriptor"}descriptor: v250^a', $i);
281
282 =cut
283
284 sub parse {
285 my $self = shift;
286
287 my ($rec, $format_utf8, $i) = @_;
288
289 return if (! $format_utf8);
290
291 my $log = $self->_get_logger();
292
293 $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
294
295 $i = 0 if (! $i);
296
297 my $format = $self->_x($format_utf8) || $log->logconfess("can't convert '$format_utf8' from UTF-8 to ",$self->{'code_page'});
298
299 my @out;
300
301 $log->debug("format: $format");
302
303 my $eval_code;
304 # remove eval{...} from beginning
305 $eval_code = $1 if ($format =~ s/^eval{([^}]+)}//s);
306
307 my $filter_name;
308 # remove filter{...} from beginning
309 $filter_name = $1 if ($format =~ s/^filter{([^}]+)}//s);
310
311 my $prefix;
312 my $all_found=0;
313
314 while ($format =~ s/^(.*?)(v|s)(\d+)(?:\^(\w))?//s) {
315
316 my $del = $1 || '';
317 $prefix ||= $del if ($all_found == 0);
318
319 # repeatable index
320 my $r = $i;
321 $r = 0 if (lc("$2") eq 's');
322
323 my $found = 0;
324 my $tmp = $self->get_data(\$rec,$3,$4,$r,\$found);
325
326 if ($found) {
327 push @out, $del;
328 push @out, $tmp;
329 $all_found += $found;
330 }
331 }
332
333 return if (! $all_found);
334
335 my $out = join('',@out);
336
337 if ($out) {
338 # add rest of format (suffix)
339 $out .= $format;
340
341 # add prefix if not there
342 $out = $prefix . $out if ($out !~ m/^\Q$prefix\E/);
343
344 $log->debug("result: $out");
345 }
346
347 if ($eval_code) {
348 my $eval = $self->fill_in($rec,$eval_code,$i) || return;
349 $log->debug("about to eval{$eval} format: $out");
350 return if (! $self->_eval($eval));
351 }
352
353 if ($filter_name && $self->{'filter'}->{$filter_name}) {
354 $log->debug("about to filter{$filter_name} format: $out");
355 $out = $self->{'filter'}->{$filter_name}->($out);
356 return unless(defined($out));
357 $log->debug("filter result: $out");
358 }
359
360 return $out;
361 }
362
363 =head2 parse_to_arr
364
365 Similar to C<parse>, but returns array of all repeatable fields
366
367 my @arr = $webpac->parse_to_arr($rec,'v250^a');
368
369 =cut
370
371 sub parse_to_arr {
372 my $self = shift;
373
374 my ($rec, $format_utf8) = @_;
375
376 my $log = $self->_get_logger();
377
378 $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
379 return if (! $format_utf8);
380
381 my $i = 0;
382 my @arr;
383
384 while (my $v = $self->parse($rec,$format_utf8,$i++)) {
385 push @arr, $v;
386 }
387
388 $log->debug("format '$format_utf8' returned ",--$i," elements: ", sub { join(" | ",@arr) }) if (@arr);
389
390 return @arr;
391 }
392
393
394 =head2 fill_in
395
396 Workhourse of all: takes record from in-memory structure of database and
397 strings with placeholders and returns string or array of with substituted
398 values from record.
399
400 my $text = $webpac->fill_in($rec,'v250^a');
401
402 Optional argument is ordinal number for repeatable fields. By default,
403 it's assume to be first repeatable field (fields are perl array, so first
404 element is 0).
405 Following example will read second value from repeatable field.
406
407 my $text = $webpac->fill_in($rec,'Title: v250^a',1);
408
409 This function B<does not> perform parsing of format to inteligenty skip
410 delimiters before fields which aren't used.
411
412 This method will automatically decode UTF-8 string to local code page
413 if needed.
414
415 =cut
416
417 sub fill_in {
418 my $self = shift;
419
420 my $log = $self->_get_logger();
421
422 my $rec = shift || $log->logconfess("need data record");
423 my $format = shift || $log->logconfess("need format to parse");
424 # iteration (for repeatable fields)
425 my $i = shift || 0;
426
427 $log->logdie("infitite loop in format $format") if ($i > ($self->{'max_mfn'} || 9999));
428
429 # FIXME remove for speedup?
430 $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
431
432 if (utf8::is_utf8($format)) {
433 $format = $self->_x($format);
434 }
435
436 my $found = 0;
437
438 my $eval_code;
439 # remove eval{...} from beginning
440 $eval_code = $1 if ($format =~ s/^eval{([^}]+)}//s);
441
442 my $filter_name;
443 # remove filter{...} from beginning
444 $filter_name = $1 if ($format =~ s/^filter{([^}]+)}//s);
445
446 # do actual replacement of placeholders
447 # repeatable fields
448 $format =~ s/v(\d+)(?:\^(\w))?/$self->get_data(\$rec,$1,$2,$i,\$found)/ges;
449 # non-repeatable fields
450 $format =~ s/s(\d+)(?:\^(\w))?/$self->get_data(\$rec,$1,$2,0,\$found)/ges;
451
452 if ($found) {
453 $log->debug("format: $format");
454 if ($eval_code) {
455 my $eval = $self->fill_in($rec,$eval_code,$i);
456 return if (! $self->_eval($eval));
457 }
458 if ($filter_name && $self->{'filter'}->{$filter_name}) {
459 $log->debug("filter '$filter_name' for $format");
460 $format = $self->{'filter'}->{$filter_name}->($format);
461 return unless(defined($format));
462 $log->debug("filter result: $format");
463 }
464 # do we have lookups?
465 if ($self->{'lookup'}) {
466 if ($self->{'lookup'}->can('lookup')) {
467 return $self->{'lookup'}->lookup($format);
468 } else {
469 $log->warn("Have lookup object but can't invoke lookup method");
470 }
471 } else {
472 return $format;
473 }
474 } else {
475 return;
476 }
477 }
478
479
480 =head2 fill_in_to_arr
481
482 Similar to C<fill_in>, but returns array of all repeatable fields. Usable
483 for fields which have lookups, so they shouldn't be parsed but rather
484 C<fill_id>ed.
485
486 my @arr = $webpac->fill_in_to_arr($rec,'[v900];;[v250^a]');
487
488 =cut
489
490 sub fill_in_to_arr {
491 my $self = shift;
492
493 my ($rec, $format_utf8) = @_;
494
495 my $log = $self->_get_logger();
496
497 $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
498 return if (! $format_utf8);
499
500 my $i = 0;
501 my @arr;
502
503 while (my @v = $self->fill_in($rec,$format_utf8,$i++)) {
504 push @arr, @v;
505 }
506
507 $log->debug("format '$format_utf8' returned ",--$i," elements: ", sub { join(" | ",@arr) }) if (@arr);
508
509 return @arr;
510 }
511
512
513 =head2 get_data
514
515 Returns value from record.
516
517 my $text = $self->get_data(\$rec,$f,$sf,$i,\$found);
518
519 Arguments are:
520 record reference C<$rec>,
521 field C<$f>,
522 optional subfiled C<$sf>,
523 index for repeatable values C<$i>.
524
525 Optinal variable C<$found> will be incremeted if there
526 is field.
527
528 Returns value or empty string.
529
530 =cut
531
532 sub get_data {
533 my $self = shift;
534
535 my ($rec,$f,$sf,$i,$found) = @_;
536
537 if ($$rec->{$f}) {
538 return '' if (! $$rec->{$f}->[$i]);
539 no strict 'refs';
540 if ($sf && $$rec->{$f}->[$i]->{$sf}) {
541 $$found++ if (defined($$found));
542 return $$rec->{$f}->[$i]->{$sf};
543 } elsif (! $sf && $$rec->{$f}->[$i]) {
544 $$found++ if (defined($$found));
545 # it still might have subfield, just
546 # not specified, so we'll dump all
547 if ($$rec->{$f}->[$i] =~ /HASH/o) {
548 my $out;
549 foreach my $k (keys %{$$rec->{$f}->[$i]}) {
550 $out .= $$rec->{$f}->[$i]->{$k}." ";
551 }
552 return $out;
553 } else {
554 return $$rec->{$f}->[$i];
555 }
556 } else {
557 return '';
558 }
559 } else {
560 return '';
561 }
562 }
563
564
565 =head2 apply_format
566
567 Apply format specified in tag with C<format_name="name"> and
568 C<format_delimiter=";;">.
569
570 my $text = $webpac->apply_format($format_name,$format_delimiter,$data);
571
572 Formats can contain C<lookup{...}> if you need them.
573
574 =cut
575
576 sub apply_format {
577 my $self = shift;
578
579 my ($name,$delimiter,$data) = @_;
580
581 my $log = $self->_get_logger();
582
583 if (! $self->{'import_xml'}->{'format'}->{$name}) {
584 $log->warn("<format name=\"$name\"> is not defined in ",$self->{'import_xml_file'});
585 return $data;
586 }
587
588 $log->warn("no delimiter for format $name") if (! $delimiter);
589
590 my $format = $self->_x($self->{'import_xml'}->{'format'}->{$name}->{'content'}) || $log->logdie("can't find format '$name'");
591
592 my @data = split(/\Q$delimiter\E/, $data);
593
594 my $out = sprintf($format, @data);
595 $log->debug("using format $name [$format] on $data to produce: $out");
596
597 if ($self->{'lookup_regex'} && $out =~ $self->{'lookup_regex'}) {
598 return $self->{'lookup'}->lookup($out);
599 } else {
600 return $out;
601 }
602
603 }
604
605 =head2 sort_arr
606
607 Sort array ignoring case and html in data
608
609 my @sorted = $webpac->sort_arr(@unsorted);
610
611 =cut
612
613 sub sort_arr {
614 my $self = shift;
615
616 my $log = $self->_get_logger();
617
618 # FIXME add Schwartzian Transformation?
619
620 my @sorted = sort {
621 $a =~ s#<[^>]+/*>##;
622 $b =~ s#<[^>]+/*>##;
623 lc($b) cmp lc($a)
624 } @_;
625 $log->debug("sorted values: ",sub { join(", ",@sorted) });
626
627 return @sorted;
628 }
629
630
631 =head1 INTERNAL METHODS
632
633 =head2 _sort_by_order
634
635 Sort xml tags data structure accoding to C<order=""> attribute.
636
637 =cut
638
639 sub _sort_by_order {
640 my $self = shift;
641
642 my $va = $self->{'import_xml'}->{'indexer'}->{$a}->{'order'} ||
643 $self->{'import_xml'}->{'indexer'}->{$a};
644 my $vb = $self->{'import_xml'}->{'indexer'}->{$b}->{'order'} ||
645 $self->{'import_xml'}->{'indexer'}->{$b};
646
647 return $va <=> $vb;
648 }
649
650 =head2 _x
651
652 Convert strings from C<conf/normalize/*.xml> encoding into application
653 specific encoding (optinally specified using C<code_page> to C<new>
654 constructor).
655
656 my $text = $n->_x('normalize text string');
657
658 This is a stub so that other modules doesn't have to implement it.
659
660 =cut
661
662 sub _x {
663 my $self = shift;
664 return shift;
665 }
666
667
668 =head1 AUTHOR
669
670 Dobrica Pavlinusic, C<< <dpavlin@rot13.org> >>
671
672 =head1 COPYRIGHT & LICENSE
673
674 Copyright 2005 Dobrica Pavlinusic, All Rights Reserved.
675
676 This program is free software; you can redistribute it and/or modify it
677 under the same terms as Perl itself.
678
679 =cut
680
681 1; # End of WebPAC::DB

  ViewVC Help
Powered by ViewVC 1.1.26