/[webpac2]/trunk/lib/WebPAC/Normalize.pm
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Contents of /trunk/lib/WebPAC/Normalize.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 15 - (show annotations)
Sun Jul 17 10:42:23 2005 UTC (18 years, 8 months ago) by dpavlin
File size: 16320 byte(s)
WebPAC::Common cleanup, most code moved to WebPAC::Normalize. Added
documentation about order of data mungling when normalising data.

1 package WebPAC::Normalize;
2
3 use warnings;
4 use strict;
5 use Data::Dumper;
6 use Storable;
7
8 =head1 NAME
9
10 WebPAC::Normalize - data mungling for normalisation
11
12 =head1 VERSION
13
14 Version 0.01
15
16 =cut
17
18 our $VERSION = '0.01';
19
20 =head1 SYNOPSIS
21
22 This package contains code that mungle data to produce normalized format.
23
24 It contains several assumptions:
25
26 =over
27
28 =item *
29
30 format of fields is defined using C<v123^a> notation for repeatable fields
31 or C<s123^a> for single (or first) value, where C<123> is field number and
32 C<a> is subfield.
33
34 =item *
35
36 source data records (C<$rec>) have unique identifiers in field C<000>
37
38 =item *
39
40 optional C<eval{length('v123^a') == 3}> tag at B<beginning of format> will be
41 perl code that is evaluated before producing output (value of field will be
42 interpolated before that)
43
44 =item *
45
46 optional C<filter{filter_name}> at B<begining of format> will apply perl
47 code defined as code ref on format after field substitution to producing
48 output
49
50 =item *
51
52 optional C<lookup{...}> will be then performed. See C<WebPAC::Lookups>.
53
54 =item *
55
56 at end, optional C<format>s rules are resolved. Format rules are similar to
57 C<sprintf> and can also contain C<lookup{...}> which is performed after
58 values are inserted in format.
59
60 =back
61
62 This also describes order in which transformations are applied (eval,
63 filter, lookup, format) which is important to undestand when deciding how to
64 solve your data mungling and normalisation process.
65
66
67
68
69 =head1 FUNCTIONS
70
71 =head2 new
72
73 Create new normalisation object
74
75 my $n = new WebPAC::Normalize::Something(
76 filter => {
77 'filter_name_1' => sub {
78 # filter code
79 return length($_);
80 }, ...
81 },
82 cache_data_structure => './cache/ds/',
83 lookup_regex => $lookup->regex,
84 );
85
86 Parametar C<filter> defines user supplied snippets of perl code which can
87 be use with C<filter{...}> notation.
88
89 Optional parameter C<cache_data_structure> defines path to directory
90 in which cache file for C<data_structure> call will be created.
91
92 Recommended parametar C<lookup_regex> is used to enable parsing of lookups
93 in structures.
94
95 =cut
96
97 sub new {
98 my $class = shift;
99 my $self = {@_};
100 bless($self, $class);
101
102 $self->setup_cache_dir( $self->{'cache_data_structure'} );
103
104 $self ? return $self : return undef;
105 }
106
107 =head2 setup_cache_dir
108
109 Check if specified cache directory exist, and if not, disable caching.
110
111 $setup_cache_dir('./cache/ds/');
112
113 If you pass false or zero value to this function, it will disable
114 cacheing.
115
116 =cut
117
118 sub setup_cache_dir {
119 my $self = shift;
120
121 my $dir = shift;
122
123 my $log = $self->_get_logger();
124
125 if ($dir) {
126 my $msg;
127 if (! -e $dir) {
128 $msg = "doesn't exist";
129 } elsif (! -d $dir) {
130 $msg = "is not directory";
131 } elsif (! -w $dir) {
132 $msg = "not writable";
133 }
134
135 if ($msg) {
136 undef $self->{'cache_data_structure'};
137 $log->warn("cache_data_structure $dir $msg, disabling...");
138 } else {
139 $log->debug("using cache dir $dir");
140 }
141 } else {
142 $log->debug("disabling cache");
143 undef $self->{'cache_data_structure'};
144 }
145 }
146
147
148 =head2 data_structure
149
150 Create in-memory data structure which represents normalized layout from
151 C<conf/normalize/*.xml>.
152
153 This structures are used to produce output.
154
155 my @ds = $webpac->data_structure($rec);
156
157 B<Note: historical oddity follows>
158
159 This method will also set C<< $webpac->{'currnet_filename'} >> if there is
160 C<< <filename> >> tag and C<< $webpac->{'headline'} >> if there is
161 C<< <headline> >> tag.
162
163 =cut
164
165 sub data_structure {
166 my $self = shift;
167
168 my $log = $self->_get_logger();
169
170 my $rec = shift;
171 $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
172
173 my $cache_file;
174
175 if (my $cache_path = $self->{'cache_data_structure'}) {
176 my $id = $rec->{'000'};
177 $id = $rec->{'000'}->[0] if ($id =~ m/^ARRAY/o);
178 unless (defined($id)) {
179 $log->warn("Can't use cache_data_structure on records without unique identifier in field 000");
180 undef $self->{'cache_data_structure'};
181 } else {
182 $cache_file = "$cache_path/$id";
183 if (-r $cache_file) {
184 my $ds_ref = retrieve($cache_file);
185 if ($ds_ref) {
186 $log->debug("cache hit: $cache_file");
187 my $ok = 1;
188 foreach my $f (qw(current_filename headline)) {
189 if ($ds_ref->{$f}) {
190 $self->{$f} = $ds_ref->{$f};
191 } else {
192 $ok = 0;
193 }
194 };
195 if ($ok && $ds_ref->{'ds'}) {
196 return @{ $ds_ref->{'ds'} };
197 } else {
198 $log->warn("cache_data_structure $cache_path corrupt. Use rm $cache_path/* to re-create it on next run!");
199 undef $self->{'cache_data_structure'};
200 }
201 }
202 }
203 }
204 }
205
206 undef $self->{'currnet_filename'};
207 undef $self->{'headline'};
208
209 my @sorted_tags;
210 if ($self->{tags_by_order}) {
211 @sorted_tags = @{$self->{tags_by_order}};
212 } else {
213 @sorted_tags = sort { $self->_sort_by_order } keys %{$self->{'import_xml'}->{'indexer'}};
214 $self->{tags_by_order} = \@sorted_tags;
215 }
216
217 my @ds;
218
219 $log->debug("tags: ",sub { join(", ",@sorted_tags) });
220
221 foreach my $field (@sorted_tags) {
222
223 my $row;
224
225 #print "field $field [",$self->{'tag'},"] = ",Dumper($self->{'import_xml'}->{'indexer'}->{$field}->{$self->{'tag'}});
226
227 foreach my $tag (@{$self->{'import_xml'}->{'indexer'}->{$field}->{$self->{'tag'}}}) {
228 my $format = $tag->{'value'} || $tag->{'content'};
229
230 $log->debug("format: $format");
231
232 my @v;
233 if ($self->{'lookup_regex'} && $format =~ $self->{'lookup_regex'}) {
234 @v = $self->fill_in_to_arr($rec,$format);
235 } else {
236 @v = $self->parse_to_arr($rec,$format);
237 }
238 next if (! @v);
239
240 if ($tag->{'sort'}) {
241 @v = $self->sort_arr(@v);
242 }
243
244 # use format?
245 if ($tag->{'format_name'}) {
246 @v = map { $self->apply_format($tag->{'format_name'},$tag->{'format_delimiter'},$_) } @v;
247 }
248
249 if ($field eq 'filename') {
250 $self->{'current_filename'} = join('',@v);
251 $log->debug("filename: ",$self->{'current_filename'});
252 } elsif ($field eq 'headline') {
253 $self->{'headline'} .= join('',@v);
254 $log->debug("headline: ",$self->{'headline'});
255 next; # don't return headline in data_structure!
256 }
257
258 # delimiter will join repeatable fields
259 if ($tag->{'delimiter'}) {
260 @v = ( join($tag->{'delimiter'}, @v) );
261 }
262
263 # default types
264 my @types = qw(display swish);
265 # override by type attribute
266 @types = ( $tag->{'type'} ) if ($tag->{'type'});
267
268 foreach my $type (@types) {
269 # append to previous line?
270 $log->debug("type: $type ",sub { join(" ",@v) }, $row->{'append'} || 'no append');
271 if ($tag->{'append'}) {
272
273 # I will delimit appended part with
274 # delimiter (or ,)
275 my $d = $tag->{'delimiter'};
276 # default delimiter
277 $d ||= " ";
278
279 my $last = pop @{$row->{$type}};
280 $d = "" if (! $last);
281 $last .= $d . join($d, @v);
282 push @{$row->{$type}}, $last;
283
284 } else {
285 push @{$row->{$type}}, @v;
286 }
287 }
288
289
290 }
291
292 if ($row) {
293 $row->{'tag'} = $field;
294
295 # TODO: name_sigular, name_plural
296 my $name = $self->{'import_xml'}->{'indexer'}->{$field}->{'name'};
297 $row->{'name'} = $name ? $self->_x($name) : $field;
298
299 # post-sort all values in field
300 if ($self->{'import_xml'}->{'indexer'}->{$field}->{'sort'}) {
301 $log->warn("sort at field tag not implemented");
302 }
303
304 push @ds, $row;
305
306 $log->debug("row $field: ",sub { Dumper($row) });
307 }
308
309 }
310
311 if ($cache_file) {
312 store {
313 ds => \@ds,
314 current_filename => $self->{'current_filename'},
315 headline => $self->{'headline'},
316 }, $cache_file;
317 $log->debug("created storable cache file $cache_file");
318 }
319
320 return @ds;
321
322 }
323
324 =head2 parse
325
326 Perform smart parsing of string, skipping delimiters for fields which aren't
327 defined. It can also eval code in format starting with C<eval{...}> and
328 return output or nothing depending on eval code.
329
330 my $text = $webpac->parse($rec,'eval{"v901^a" eq "Deskriptor"}descriptor: v250^a', $i);
331
332 =cut
333
334 sub parse {
335 my $self = shift;
336
337 my ($rec, $format_utf8, $i) = @_;
338
339 return if (! $format_utf8);
340
341 my $log = $self->_get_logger();
342
343 $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
344
345 $i = 0 if (! $i);
346
347 my $format = $self->_x($format_utf8) || $log->logconfess("can't convert '$format_utf8' from UTF-8 to ",$self->{'code_page'});
348
349 my @out;
350
351 $log->debug("format: $format");
352
353 my $eval_code;
354 # remove eval{...} from beginning
355 $eval_code = $1 if ($format =~ s/^eval{([^}]+)}//s);
356
357 my $filter_name;
358 # remove filter{...} from beginning
359 $filter_name = $1 if ($format =~ s/^filter{([^}]+)}//s);
360
361 my $prefix;
362 my $all_found=0;
363
364 while ($format =~ s/^(.*?)(v|s)(\d+)(?:\^(\w))?//s) {
365
366 my $del = $1 || '';
367 $prefix ||= $del if ($all_found == 0);
368
369 # repeatable index
370 my $r = $i;
371 $r = 0 if (lc("$2") eq 's');
372
373 my $found = 0;
374 my $tmp = $self->get_data(\$rec,$3,$4,$r,\$found);
375
376 if ($found) {
377 push @out, $del;
378 push @out, $tmp;
379 $all_found += $found;
380 }
381 }
382
383 return if (! $all_found);
384
385 my $out = join('',@out);
386
387 if ($out) {
388 # add rest of format (suffix)
389 $out .= $format;
390
391 # add prefix if not there
392 $out = $prefix . $out if ($out !~ m/^\Q$prefix\E/);
393
394 $log->debug("result: $out");
395 }
396
397 if ($eval_code) {
398 my $eval = $self->fill_in($rec,$eval_code,$i) || return;
399 $log->debug("about to eval{$eval} format: $out");
400 return if (! $self->_eval($eval));
401 }
402
403 if ($filter_name && $self->{'filter'}->{$filter_name}) {
404 $log->debug("about to filter{$filter_name} format: $out");
405 $out = $self->{'filter'}->{$filter_name}->($out);
406 return unless(defined($out));
407 $log->debug("filter result: $out");
408 }
409
410 return $out;
411 }
412
413 =head2 parse_to_arr
414
415 Similar to C<parse>, but returns array of all repeatable fields
416
417 my @arr = $webpac->parse_to_arr($rec,'v250^a');
418
419 =cut
420
421 sub parse_to_arr {
422 my $self = shift;
423
424 my ($rec, $format_utf8) = @_;
425
426 my $log = $self->_get_logger();
427
428 $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
429 return if (! $format_utf8);
430
431 my $i = 0;
432 my @arr;
433
434 while (my $v = $self->parse($rec,$format_utf8,$i++)) {
435 push @arr, $v;
436 }
437
438 $log->debug("format '$format_utf8' returned ",--$i," elements: ", sub { join(" | ",@arr) }) if (@arr);
439
440 return @arr;
441 }
442
443
444 =head2 fill_in
445
446 Workhourse of all: takes record from in-memory structure of database and
447 strings with placeholders and returns string or array of with substituted
448 values from record.
449
450 my $text = $webpac->fill_in($rec,'v250^a');
451
452 Optional argument is ordinal number for repeatable fields. By default,
453 it's assume to be first repeatable field (fields are perl array, so first
454 element is 0).
455 Following example will read second value from repeatable field.
456
457 my $text = $webpac->fill_in($rec,'Title: v250^a',1);
458
459 This function B<does not> perform parsing of format to inteligenty skip
460 delimiters before fields which aren't used.
461
462 This method will automatically decode UTF-8 string to local code page
463 if needed.
464
465 =cut
466
467 sub fill_in {
468 my $self = shift;
469
470 my $log = $self->_get_logger();
471
472 my $rec = shift || $log->logconfess("need data record");
473 my $format = shift || $log->logconfess("need format to parse");
474 # iteration (for repeatable fields)
475 my $i = shift || 0;
476
477 $log->logdie("infitite loop in format $format") if ($i > ($self->{'max_mfn'} || 9999));
478
479 # FIXME remove for speedup?
480 $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
481
482 if (utf8::is_utf8($format)) {
483 $format = $self->_x($format);
484 }
485
486 my $found = 0;
487
488 my $eval_code;
489 # remove eval{...} from beginning
490 $eval_code = $1 if ($format =~ s/^eval{([^}]+)}//s);
491
492 my $filter_name;
493 # remove filter{...} from beginning
494 $filter_name = $1 if ($format =~ s/^filter{([^}]+)}//s);
495
496 # do actual replacement of placeholders
497 # repeatable fields
498 $format =~ s/v(\d+)(?:\^(\w))?/$self->get_data(\$rec,$1,$2,$i,\$found)/ges;
499 # non-repeatable fields
500 $format =~ s/s(\d+)(?:\^(\w))?/$self->get_data(\$rec,$1,$2,0,\$found)/ges;
501
502 if ($found) {
503 $log->debug("format: $format");
504 if ($eval_code) {
505 my $eval = $self->fill_in($rec,$eval_code,$i);
506 return if (! $self->_eval($eval));
507 }
508 if ($filter_name && $self->{'filter'}->{$filter_name}) {
509 $log->debug("filter '$filter_name' for $format");
510 $format = $self->{'filter'}->{$filter_name}->($format);
511 return unless(defined($format));
512 $log->debug("filter result: $format");
513 }
514 # do we have lookups?
515 if ($self->{'lookup'}) {
516 return $self->lookup($format);
517 } else {
518 return $format;
519 }
520 } else {
521 return;
522 }
523 }
524
525
526 =head2 fill_in_to_arr
527
528 Similar to C<fill_in>, but returns array of all repeatable fields. Usable
529 for fields which have lookups, so they shouldn't be parsed but rather
530 C<fill_id>ed.
531
532 my @arr = $webpac->fill_in_to_arr($rec,'[v900];;[v250^a]');
533
534 =cut
535
536 sub fill_in_to_arr {
537 my $self = shift;
538
539 my ($rec, $format_utf8) = @_;
540
541 my $log = $self->_get_logger();
542
543 $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
544 return if (! $format_utf8);
545
546 my $i = 0;
547 my @arr;
548
549 while (my @v = $self->fill_in($rec,$format_utf8,$i++)) {
550 push @arr, @v;
551 }
552
553 $log->debug("format '$format_utf8' returned ",--$i," elements: ", sub { join(" | ",@arr) }) if (@arr);
554
555 return @arr;
556 }
557
558
559 =head2 get_data
560
561 Returns value from record.
562
563 my $text = $self->get_data(\$rec,$f,$sf,$i,\$found);
564
565 Arguments are:
566 record reference C<$rec>,
567 field C<$f>,
568 optional subfiled C<$sf>,
569 index for repeatable values C<$i>.
570
571 Optinal variable C<$found> will be incremeted if there
572 is field.
573
574 Returns value or empty string.
575
576 =cut
577
578 sub get_data {
579 my $self = shift;
580
581 my ($rec,$f,$sf,$i,$found) = @_;
582
583 if ($$rec->{$f}) {
584 return '' if (! $$rec->{$f}->[$i]);
585 no strict 'refs';
586 if ($sf && $$rec->{$f}->[$i]->{$sf}) {
587 $$found++ if (defined($$found));
588 return $$rec->{$f}->[$i]->{$sf};
589 } elsif ($$rec->{$f}->[$i]) {
590 $$found++ if (defined($$found));
591 # it still might have subfield, just
592 # not specified, so we'll dump all
593 if ($$rec->{$f}->[$i] =~ /HASH/o) {
594 my $out;
595 foreach my $k (keys %{$$rec->{$f}->[$i]}) {
596 $out .= $$rec->{$f}->[$i]->{$k}." ";
597 }
598 return $out;
599 } else {
600 return $$rec->{$f}->[$i];
601 }
602 }
603 } else {
604 return '';
605 }
606 }
607
608
609 =head2 apply_format
610
611 Apply format specified in tag with C<format_name="name"> and
612 C<format_delimiter=";;">.
613
614 my $text = $webpac->apply_format($format_name,$format_delimiter,$data);
615
616 Formats can contain C<lookup{...}> if you need them.
617
618 =cut
619
620 sub apply_format {
621 my $self = shift;
622
623 my ($name,$delimiter,$data) = @_;
624
625 my $log = $self->_get_logger();
626
627 if (! $self->{'import_xml'}->{'format'}->{$name}) {
628 $log->warn("<format name=\"$name\"> is not defined in ",$self->{'import_xml_file'});
629 return $data;
630 }
631
632 $log->warn("no delimiter for format $name") if (! $delimiter);
633
634 my $format = $self->_x($self->{'import_xml'}->{'format'}->{$name}->{'content'}) || $log->logdie("can't find format '$name'");
635
636 my @data = split(/\Q$delimiter\E/, $data);
637
638 my $out = sprintf($format, @data);
639 $log->debug("using format $name [$format] on $data to produce: $out");
640
641 if ($self->{'lookup_regex'} && $out =~ $self->{'lookup_regex'}) {
642 return $self->lookup($out);
643 } else {
644 return $out;
645 }
646
647 }
648
649 =head2 sort_arr
650
651 Sort array ignoring case and html in data
652
653 my @sorted = $webpac->sort_arr(@unsorted);
654
655 =cut
656
657 sub sort_arr {
658 my $self = shift;
659
660 my $log = $self->_get_logger();
661
662 # FIXME add Schwartzian Transformation?
663
664 my @sorted = sort {
665 $a =~ s#<[^>]+/*>##;
666 $b =~ s#<[^>]+/*>##;
667 lc($b) cmp lc($a)
668 } @_;
669 $log->debug("sorted values: ",sub { join(", ",@sorted) });
670
671 return @sorted;
672 }
673
674
675 =head1 INTERNAL METHODS
676
677 =head2 _sort_by_order
678
679 Sort xml tags data structure accoding to C<order=""> attribute.
680
681 =cut
682
683 sub _sort_by_order {
684 my $self = shift;
685
686 my $va = $self->{'import_xml'}->{'indexer'}->{$a}->{'order'} ||
687 $self->{'import_xml'}->{'indexer'}->{$a};
688 my $vb = $self->{'import_xml'}->{'indexer'}->{$b}->{'order'} ||
689 $self->{'import_xml'}->{'indexer'}->{$b};
690
691 return $va <=> $vb;
692 }
693
694 =head2 _x
695
696 Convert strings from C<conf/normalize/*.xml> encoding into application
697 specific encoding (optinally specified using C<code_page> to C<new>
698 constructor).
699
700 my $text = $n->_x('normalize text string');
701
702 This is a stub so that other modules doesn't have to implement it.
703
704 =cut
705
706 sub _x {
707 my $self = shift;
708 return shift;
709 }
710
711
712 =head1 AUTHOR
713
714 Dobrica Pavlinusic, C<< <dpavlin@rot13.org> >>
715
716 =head1 COPYRIGHT & LICENSE
717
718 Copyright 2005 Dobrica Pavlinusic, All Rights Reserved.
719
720 This program is free software; you can redistribute it and/or modify it
721 under the same terms as Perl itself.
722
723 =cut
724
725 1; # End of WebPAC::DB

  ViewVC Help
Powered by ViewVC 1.1.26