/[webpac2]/trunk/lib/WebPAC/Normalize.pm
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Contents of /trunk/lib/WebPAC/Normalize.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 70 - (show annotations)
Sat Nov 19 23:48:24 2005 UTC (18 years, 4 months ago) by dpavlin
File size: 15658 byte(s)
 r8980@llin:  dpavlin | 2005-11-20 00:49:22 +0100
 implement data_structure that returns HASH and not ARRAY.
 
 Little explanation for this rationale:
 
 Array was needed back in WebPAC v1 because order of tags in import_xml was
 important. However, since we are no longer depending on order of tags in
 input/*.xml, hash is much better choice.

1 package WebPAC::Normalize;
2
3 use warnings;
4 use strict;
5 use base 'WebPAC::Common';
6 use Data::Dumper;
7
8 =head1 NAME
9
10 WebPAC::Normalize - data mungling for normalisation
11
12 =head1 VERSION
13
14 Version 0.02
15
16 =cut
17
18 our $VERSION = '0.02';
19
20 =head1 SYNOPSIS
21
22 This package contains code that mungle data to produce normalized format.
23
24 It contains several assumptions:
25
26 =over
27
28 =item *
29
30 format of fields is defined using C<v123^a> notation for repeatable fields
31 or C<s123^a> for single (or first) value, where C<123> is field number and
32 C<a> is subfield.
33
34 =item *
35
36 source data records (C<$rec>) have unique identifiers in field C<000>
37
38 =item *
39
40 optional C<eval{length('v123^a') == 3}> tag at B<beginning of format> will be
41 perl code that is evaluated before producing output (value of field will be
42 interpolated before that)
43
44 =item *
45
46 optional C<filter{filter_name}> at B<begining of format> will apply perl
47 code defined as code ref on format after field substitution to producing
48 output
49
50 =item *
51
52 optional C<lookup{...}> will be then performed. See C<WebPAC::Lookups>.
53
54 =item *
55
56 at end, optional C<format>s rules are resolved. Format rules are similar to
57 C<sprintf> and can also contain C<lookup{...}> which is performed after
58 values are inserted in format.
59
60 =back
61
62 This also describes order in which transformations are applied (eval,
63 filter, lookup, format) which is important to undestand when deciding how to
64 solve your data mungling and normalisation process.
65
66
67
68
69 =head1 FUNCTIONS
70
71 =head2 new
72
73 Create new normalisation object
74
75 my $n = new WebPAC::Normalize::Something(
76 filter => {
77 'filter_name_1' => sub {
78 # filter code
79 return length($_);
80 }, ...
81 },
82 db => $db_obj,
83 lookup_regex => $lookup->regex,
84 lookup => $lookup_obj,
85 );
86
87 Parametar C<filter> defines user supplied snippets of perl code which can
88 be use with C<filter{...}> notation.
89
90 Recommended parametar C<lookup_regex> is used to enable parsing of lookups
91 in structures. If you pass this parametar, you must also pass C<lookup>
92 which is C<WebPAC::Lookup> object.
93
94 =cut
95
96 sub new {
97 my $class = shift;
98 my $self = {@_};
99 bless($self, $class);
100
101 my $r = $self->{'lookup_regex'} ? 1 : 0;
102 my $l = $self->{'lookup'} ? 1 : 0;
103
104 my $log = $self->_get_logger();
105
106 # those two must be in pair
107 if ( ($r & $l) != ($r || $l) ) {
108 my $log = $self->_get_logger();
109 $log->logdie("lookup_regex and lookup must be in pair");
110 }
111
112 $log->logdie("lookup must be WebPAC::Lookup object") if ($self->{'lookup'} && ! $self->{'lookup'}->isa('WebPAC::Lookup'));
113
114 $self ? return $self : return undef;
115 }
116
117
118 =head2 data_structure
119
120 Create in-memory data structure which represents normalized layout from
121 C<conf/normalize/*.xml>.
122
123 This structures are used to produce output.
124
125 my $ds = $webpac->data_structure($rec);
126
127 B<Note: historical oddity follows>
128
129 This method will also set C<< $webpac->{'currnet_filename'} >> if there is
130 C<< <filename> >> tag and C<< $webpac->{'headline'} >> if there is
131 C<< <headline> >> tag.
132
133 =cut
134
135 sub data_structure {
136 my $self = shift;
137
138 my $log = $self->_get_logger();
139
140 my $rec = shift;
141 $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
142
143 my $cache_file;
144
145 if ($self->{'db'}) {
146 my $ds = $self->{'db'}->load_ds($rec);
147 $log->debug("load_ds( rec = ", sub { Dumper($rec) }, ") = ", sub { Dumper($ds) });
148 return $ds if ($ds);
149 $log->debug("cache miss, creating");
150 }
151
152 undef $self->{'currnet_filename'};
153 undef $self->{'headline'};
154
155 my @sorted_tags;
156 if ($self->{tags_by_order}) {
157 @sorted_tags = @{$self->{tags_by_order}};
158 } else {
159 @sorted_tags = sort { $self->_sort_by_order } keys %{$self->{'import_xml'}->{'indexer'}};
160 $self->{tags_by_order} = \@sorted_tags;
161 }
162
163 my $ds;
164
165 $log->debug("tags: ",sub { join(", ",@sorted_tags) });
166
167 foreach my $field (@sorted_tags) {
168
169 my $row;
170
171 #print "field $field [",$self->{'tag'},"] = ",Dumper($self->{'import_xml'}->{'indexer'}->{$field}->{$self->{'tag'}});
172
173 foreach my $tag (@{$self->{'import_xml'}->{'indexer'}->{$field}->{$self->{'tag'}}}) {
174 my $format;
175
176 $log->logdie("expected tag HASH and got $tag") unless (ref($tag) eq 'HASH');
177 $format = $tag->{'value'} || $tag->{'content'};
178
179 $log->debug("format: $format");
180
181 my @v;
182 if ($self->{'lookup_regex'} && $format =~ $self->{'lookup_regex'}) {
183 @v = $self->fill_in_to_arr($rec,$format);
184 } else {
185 @v = $self->parse_to_arr($rec,$format);
186 }
187 next if (! @v);
188
189 if ($tag->{'sort'}) {
190 @v = $self->sort_arr(@v);
191 }
192
193 # use format?
194 if ($tag->{'format_name'}) {
195 @v = map { $self->apply_format($tag->{'format_name'},$tag->{'format_delimiter'},$_) } @v;
196 }
197
198 if ($field eq 'filename') {
199 $self->{'current_filename'} = join('',@v);
200 $log->debug("filename: ",$self->{'current_filename'});
201 } elsif ($field eq 'headline') {
202 $self->{'headline'} .= join('',@v);
203 $log->debug("headline: ",$self->{'headline'});
204 next; # don't return headline in data_structure!
205 }
206
207 # delimiter will join repeatable fields
208 if ($tag->{'delimiter'}) {
209 @v = ( join($tag->{'delimiter'}, @v) );
210 }
211
212 # default types
213 my @types = qw(display swish);
214 # override by type attribute
215 @types = ( $tag->{'type'} ) if ($tag->{'type'});
216
217 foreach my $type (@types) {
218 # append to previous line?
219 $log->debug("type: $type ",sub { join(" ",@v) }, $row->{'append'} || 'no append');
220 if ($tag->{'append'}) {
221
222 # I will delimit appended part with
223 # delimiter (or ,)
224 my $d = $tag->{'delimiter'};
225 # default delimiter
226 $d ||= " ";
227
228 my $last = pop @{$row->{$type}};
229 $d = "" if (! $last);
230 $last .= $d . join($d, @v);
231 push @{$row->{$type}}, $last;
232
233 } else {
234 push @{$row->{$type}}, @v;
235 }
236 }
237
238
239 }
240
241 if ($row) {
242 $row->{'tag'} = $field;
243
244 # TODO: name_sigular, name_plural
245 my $name = $self->{'import_xml'}->{'indexer'}->{$field}->{'name'};
246 my $row_name = $name ? $self->_x($name) : $field;
247
248 # post-sort all values in field
249 if ($self->{'import_xml'}->{'indexer'}->{$field}->{'sort'}) {
250 $log->warn("sort at field tag not implemented");
251 }
252
253 $ds->{$row_name} = $row;
254
255 $log->debug("row $field: ",sub { Dumper($row) });
256 }
257
258 }
259
260 $log->logdie("there is no current_filename defined! Do you have filename tag in conf/normalize/?.xml") unless ($self->{'current_filename'});
261
262 $self->{'db'}->save_ds(
263 ds => $ds,
264 current_filename => $self->{'current_filename'},
265 headline => $self->{'headline'},
266 ) if ($self->{'db'});
267
268 $log->debug("ds: ", sub { Dumper($ds) });
269
270 $log->logconfess("data structure returned is not array any more!") if wantarray;
271
272 return $ds;
273
274 }
275
276 =head2 parse
277
278 Perform smart parsing of string, skipping delimiters for fields which aren't
279 defined. It can also eval code in format starting with C<eval{...}> and
280 return output or nothing depending on eval code.
281
282 my $text = $webpac->parse($rec,'eval{"v901^a" eq "Deskriptor"}descriptor: v250^a', $i);
283
284 =cut
285
286 sub parse {
287 my $self = shift;
288
289 my ($rec, $format_utf8, $i) = @_;
290
291 return if (! $format_utf8);
292
293 my $log = $self->_get_logger();
294
295 $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
296
297 $i = 0 if (! $i);
298
299 my $format = $self->_x($format_utf8) || $log->logconfess("can't convert '$format_utf8' from UTF-8 to ",$self->{'code_page'});
300
301 my @out;
302
303 $log->debug("format: $format");
304
305 my $eval_code;
306 # remove eval{...} from beginning
307 $eval_code = $1 if ($format =~ s/^eval{([^}]+)}//s);
308
309 my $filter_name;
310 # remove filter{...} from beginning
311 $filter_name = $1 if ($format =~ s/^filter{([^}]+)}//s);
312
313 my $prefix;
314 my $all_found=0;
315
316 while ($format =~ s/^(.*?)(v|s)(\d+)(?:\^(\w))?//s) {
317
318 my $del = $1 || '';
319 $prefix ||= $del if ($all_found == 0);
320
321 # repeatable index
322 my $r = $i;
323 $r = 0 if (lc("$2") eq 's');
324
325 my $found = 0;
326 my $tmp = $self->get_data(\$rec,$3,$4,$r,\$found);
327
328 if ($found) {
329 push @out, $del;
330 push @out, $tmp;
331 $all_found += $found;
332 }
333 }
334
335 return if (! $all_found);
336
337 my $out = join('',@out);
338
339 if ($out) {
340 # add rest of format (suffix)
341 $out .= $format;
342
343 # add prefix if not there
344 $out = $prefix . $out if ($out !~ m/^\Q$prefix\E/);
345
346 $log->debug("result: $out");
347 }
348
349 if ($eval_code) {
350 my $eval = $self->fill_in($rec,$eval_code,$i) || return;
351 $log->debug("about to eval{$eval} format: $out");
352 return if (! $self->_eval($eval));
353 }
354
355 if ($filter_name && $self->{'filter'}->{$filter_name}) {
356 $log->debug("about to filter{$filter_name} format: $out");
357 $out = $self->{'filter'}->{$filter_name}->($out);
358 return unless(defined($out));
359 $log->debug("filter result: $out");
360 }
361
362 return $out;
363 }
364
365 =head2 parse_to_arr
366
367 Similar to C<parse>, but returns array of all repeatable fields
368
369 my @arr = $webpac->parse_to_arr($rec,'v250^a');
370
371 =cut
372
373 sub parse_to_arr {
374 my $self = shift;
375
376 my ($rec, $format_utf8) = @_;
377
378 my $log = $self->_get_logger();
379
380 $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
381 return if (! $format_utf8);
382
383 my $i = 0;
384 my @arr;
385
386 while (my $v = $self->parse($rec,$format_utf8,$i++)) {
387 push @arr, $v;
388 }
389
390 $log->debug("format '$format_utf8' returned ",--$i," elements: ", sub { join(" | ",@arr) }) if (@arr);
391
392 return @arr;
393 }
394
395
396 =head2 fill_in
397
398 Workhourse of all: takes record from in-memory structure of database and
399 strings with placeholders and returns string or array of with substituted
400 values from record.
401
402 my $text = $webpac->fill_in($rec,'v250^a');
403
404 Optional argument is ordinal number for repeatable fields. By default,
405 it's assume to be first repeatable field (fields are perl array, so first
406 element is 0).
407 Following example will read second value from repeatable field.
408
409 my $text = $webpac->fill_in($rec,'Title: v250^a',1);
410
411 This function B<does not> perform parsing of format to inteligenty skip
412 delimiters before fields which aren't used.
413
414 This method will automatically decode UTF-8 string to local code page
415 if needed.
416
417 =cut
418
419 sub fill_in {
420 my $self = shift;
421
422 my $log = $self->_get_logger();
423
424 my $rec = shift || $log->logconfess("need data record");
425 my $format = shift || $log->logconfess("need format to parse");
426 # iteration (for repeatable fields)
427 my $i = shift || 0;
428
429 $log->logdie("infitite loop in format $format") if ($i > ($self->{'max_mfn'} || 9999));
430
431 # FIXME remove for speedup?
432 $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
433
434 if (utf8::is_utf8($format)) {
435 $format = $self->_x($format);
436 }
437
438 my $found = 0;
439
440 my $eval_code;
441 # remove eval{...} from beginning
442 $eval_code = $1 if ($format =~ s/^eval{([^}]+)}//s);
443
444 my $filter_name;
445 # remove filter{...} from beginning
446 $filter_name = $1 if ($format =~ s/^filter{([^}]+)}//s);
447
448 # do actual replacement of placeholders
449 # repeatable fields
450 $format =~ s/v(\d+)(?:\^(\w))?/$self->get_data(\$rec,$1,$2,$i,\$found)/ges;
451 # non-repeatable fields
452 $format =~ s/s(\d+)(?:\^(\w))?/$self->get_data(\$rec,$1,$2,0,\$found)/ges;
453
454 if ($found) {
455 $log->debug("format: $format");
456 if ($eval_code) {
457 my $eval = $self->fill_in($rec,$eval_code,$i);
458 return if (! $self->_eval($eval));
459 }
460 if ($filter_name && $self->{'filter'}->{$filter_name}) {
461 $log->debug("filter '$filter_name' for $format");
462 $format = $self->{'filter'}->{$filter_name}->($format);
463 return unless(defined($format));
464 $log->debug("filter result: $format");
465 }
466 # do we have lookups?
467 if ($self->{'lookup'}) {
468 if ($self->{'lookup'}->can('lookup')) {
469 return $self->{'lookup'}->lookup($format);
470 } else {
471 $log->warn("Have lookup object but can't invoke lookup method");
472 }
473 } else {
474 return $format;
475 }
476 } else {
477 return;
478 }
479 }
480
481
482 =head2 fill_in_to_arr
483
484 Similar to C<fill_in>, but returns array of all repeatable fields. Usable
485 for fields which have lookups, so they shouldn't be parsed but rather
486 C<fill_id>ed.
487
488 my @arr = $webpac->fill_in_to_arr($rec,'[v900];;[v250^a]');
489
490 =cut
491
492 sub fill_in_to_arr {
493 my $self = shift;
494
495 my ($rec, $format_utf8) = @_;
496
497 my $log = $self->_get_logger();
498
499 $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
500 return if (! $format_utf8);
501
502 my $i = 0;
503 my @arr;
504
505 while (my @v = $self->fill_in($rec,$format_utf8,$i++)) {
506 push @arr, @v;
507 }
508
509 $log->debug("format '$format_utf8' returned ",--$i," elements: ", sub { join(" | ",@arr) }) if (@arr);
510
511 return @arr;
512 }
513
514
515 =head2 get_data
516
517 Returns value from record.
518
519 my $text = $self->get_data(\$rec,$f,$sf,$i,\$found);
520
521 Arguments are:
522 record reference C<$rec>,
523 field C<$f>,
524 optional subfiled C<$sf>,
525 index for repeatable values C<$i>.
526
527 Optinal variable C<$found> will be incremeted if there
528 is field.
529
530 Returns value or empty string.
531
532 =cut
533
534 sub get_data {
535 my $self = shift;
536
537 my ($rec,$f,$sf,$i,$found) = @_;
538
539 if ($$rec->{$f}) {
540 return '' if (! $$rec->{$f}->[$i]);
541 no strict 'refs';
542 if ($sf && $$rec->{$f}->[$i]->{$sf}) {
543 $$found++ if (defined($$found));
544 return $$rec->{$f}->[$i]->{$sf};
545 } elsif (! $sf && $$rec->{$f}->[$i]) {
546 $$found++ if (defined($$found));
547 # it still might have subfield, just
548 # not specified, so we'll dump all
549 if ($$rec->{$f}->[$i] =~ /HASH/o) {
550 my $out;
551 foreach my $k (keys %{$$rec->{$f}->[$i]}) {
552 $out .= $$rec->{$f}->[$i]->{$k}." ";
553 }
554 return $out;
555 } else {
556 return $$rec->{$f}->[$i];
557 }
558 } else {
559 return '';
560 }
561 } else {
562 return '';
563 }
564 }
565
566
567 =head2 apply_format
568
569 Apply format specified in tag with C<format_name="name"> and
570 C<format_delimiter=";;">.
571
572 my $text = $webpac->apply_format($format_name,$format_delimiter,$data);
573
574 Formats can contain C<lookup{...}> if you need them.
575
576 =cut
577
578 sub apply_format {
579 my $self = shift;
580
581 my ($name,$delimiter,$data) = @_;
582
583 my $log = $self->_get_logger();
584
585 if (! $self->{'import_xml'}->{'format'}->{$name}) {
586 $log->warn("<format name=\"$name\"> is not defined in ",$self->{'import_xml_file'});
587 return $data;
588 }
589
590 $log->warn("no delimiter for format $name") if (! $delimiter);
591
592 my $format = $self->_x($self->{'import_xml'}->{'format'}->{$name}->{'content'}) || $log->logdie("can't find format '$name'");
593
594 my @data = split(/\Q$delimiter\E/, $data);
595
596 my $out = sprintf($format, @data);
597 $log->debug("using format $name [$format] on $data to produce: $out");
598
599 if ($self->{'lookup_regex'} && $out =~ $self->{'lookup_regex'}) {
600 return $self->{'lookup'}->lookup($out);
601 } else {
602 return $out;
603 }
604
605 }
606
607 =head2 sort_arr
608
609 Sort array ignoring case and html in data
610
611 my @sorted = $webpac->sort_arr(@unsorted);
612
613 =cut
614
615 sub sort_arr {
616 my $self = shift;
617
618 my $log = $self->_get_logger();
619
620 # FIXME add Schwartzian Transformation?
621
622 my @sorted = sort {
623 $a =~ s#<[^>]+/*>##;
624 $b =~ s#<[^>]+/*>##;
625 lc($b) cmp lc($a)
626 } @_;
627 $log->debug("sorted values: ",sub { join(", ",@sorted) });
628
629 return @sorted;
630 }
631
632
633 =head1 INTERNAL METHODS
634
635 =head2 _sort_by_order
636
637 Sort xml tags data structure accoding to C<order=""> attribute.
638
639 =cut
640
641 sub _sort_by_order {
642 my $self = shift;
643
644 my $va = $self->{'import_xml'}->{'indexer'}->{$a}->{'order'} ||
645 $self->{'import_xml'}->{'indexer'}->{$a};
646 my $vb = $self->{'import_xml'}->{'indexer'}->{$b}->{'order'} ||
647 $self->{'import_xml'}->{'indexer'}->{$b};
648
649 return $va <=> $vb;
650 }
651
652 =head2 _x
653
654 Convert strings from C<conf/normalize/*.xml> encoding into application
655 specific encoding (optinally specified using C<code_page> to C<new>
656 constructor).
657
658 my $text = $n->_x('normalize text string');
659
660 This is a stub so that other modules doesn't have to implement it.
661
662 =cut
663
664 sub _x {
665 my $self = shift;
666 return shift;
667 }
668
669
670 =head1 AUTHOR
671
672 Dobrica Pavlinusic, C<< <dpavlin@rot13.org> >>
673
674 =head1 COPYRIGHT & LICENSE
675
676 Copyright 2005 Dobrica Pavlinusic, All Rights Reserved.
677
678 This program is free software; you can redistribute it and/or modify it
679 under the same terms as Perl itself.
680
681 =cut
682
683 1; # End of WebPAC::DB

  ViewVC Help
Powered by ViewVC 1.1.26