/[webpac2]/trunk/lib/WebPAC/Normalize.pm
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Contents of /trunk/lib/WebPAC/Normalize.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 219 - (show annotations)
Mon Dec 5 17:48:08 2005 UTC (18 years, 4 months ago) by dpavlin
File size: 15470 byte(s)
 r11541@llin:  dpavlin | 2005-12-05 16:47:44 +0100
 added prefix [0.04]

1 package WebPAC::Normalize;
2
3 use warnings;
4 use strict;
5 use base 'WebPAC::Common';
6 use Data::Dumper;
7
8 =head1 NAME
9
10 WebPAC::Normalize - data mungling for normalisation
11
12 =head1 VERSION
13
14 Version 0.04
15
16 =cut
17
18 our $VERSION = '0.04';
19
20 =head1 SYNOPSIS
21
22 This package contains code that mungle data to produce normalized format.
23
24 It contains several assumptions:
25
26 =over
27
28 =item *
29
30 format of fields is defined using C<v123^a> notation for repeatable fields
31 or C<s123^a> for single (or first) value, where C<123> is field number and
32 C<a> is subfield.
33
34 =item *
35
36 source data records (C<$rec>) have unique identifiers in field C<000>
37
38 =item *
39
40 optional C<eval{length('v123^a') == 3}> tag at B<beginning of format> will be
41 perl code that is evaluated before producing output (value of field will be
42 interpolated before that)
43
44 =item *
45
46 optional C<filter{filter_name}> at B<begining of format> will apply perl
47 code defined as code ref on format after field substitution to producing
48 output
49
50 =item *
51
52 optional C<lookup{...}> will be then performed. See C<WebPAC::Lookups>.
53
54 =item *
55
56 at end, optional C<format>s rules are resolved. Format rules are similar to
57 C<sprintf> and can also contain C<lookup{...}> which is performed after
58 values are inserted in format.
59
60 =back
61
62 This also describes order in which transformations are applied (eval,
63 filter, lookup, format) which is important to undestand when deciding how to
64 solve your data mungling and normalisation process.
65
66
67
68
69 =head1 FUNCTIONS
70
71 =head2 new
72
73 Create new normalisation object
74
75 my $n = new WebPAC::Normalize::Something(
76 filter => {
77 'filter_name_1' => sub {
78 # filter code
79 return length($_);
80 }, ...
81 },
82 db => $db_obj,
83 lookup_regex => $lookup->regex,
84 lookup => $lookup_obj,
85 prefix => 'foobar',
86 );
87
88 Parametar C<filter> defines user supplied snippets of perl code which can
89 be use with C<filter{...}> notation.
90
91 C<prefix> is used to form filename for database record (to support multiple
92 source files which are joined in one database).
93
94 Recommended parametar C<lookup_regex> is used to enable parsing of lookups
95 in structures. If you pass this parametar, you must also pass C<lookup>
96 which is C<WebPAC::Lookup> object.
97
98 =cut
99
100 sub new {
101 my $class = shift;
102 my $self = {@_};
103 bless($self, $class);
104
105 my $r = $self->{'lookup_regex'} ? 1 : 0;
106 my $l = $self->{'lookup'} ? 1 : 0;
107
108 my $log = $self->_get_logger();
109
110 # those two must be in pair
111 if ( ($r & $l) != ($r || $l) ) {
112 my $log = $self->_get_logger();
113 $log->logdie("lookup_regex and lookup must be in pair");
114 }
115
116 $log->logdie("lookup must be WebPAC::Lookup object") if ($self->{'lookup'} && ! $self->{'lookup'}->isa('WebPAC::Lookup'));
117
118 $log->warn("no prefix defined. please check that!") unless ($self->{'prefix'});
119
120 $self ? return $self : return undef;
121 }
122
123
124 =head2 data_structure
125
126 Create in-memory data structure which represents normalized layout from
127 C<conf/normalize/*.xml>.
128
129 This structures are used to produce output.
130
131 my $ds = $webpac->data_structure($rec);
132
133 =cut
134
135 sub data_structure {
136 my $self = shift;
137
138 my $log = $self->_get_logger();
139
140 my $rec = shift;
141 $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
142
143 $log->debug("data_structure rec = ", sub { Dumper($rec) });
144
145 $log->logdie("need unique ID (mfn) in field 000 of record ", sub { Dumper($rec) } ) unless (defined($rec->{'000'}));
146
147 my $id = $rec->{'000'}->[0] || $log->logdie("field 000 isn't array!");
148
149 my $cache_file;
150
151 if ($self->{'db'}) {
152 my $ds = $self->{'db'}->load_ds( id => $id, prefix => $self->{prefix} );
153 $log->debug("load_ds( rec = ", sub { Dumper($rec) }, ") = ", sub { Dumper($ds) });
154 return $ds if ($ds);
155 $log->debug("cache miss, creating");
156 }
157
158 undef $self->{'currnet_filename'};
159 undef $self->{'headline'};
160
161 my @sorted_tags;
162 if ($self->{tags_by_order}) {
163 @sorted_tags = @{$self->{tags_by_order}};
164 } else {
165 @sorted_tags = sort { $self->_sort_by_order } keys %{$self->{'import_xml'}->{'indexer'}};
166 $self->{tags_by_order} = \@sorted_tags;
167 }
168
169 my $ds;
170
171 $log->debug("tags: ",sub { join(", ",@sorted_tags) });
172
173 foreach my $field (@sorted_tags) {
174
175 my $row;
176
177 #print "field $field [",$self->{'tag'},"] = ",Dumper($self->{'import_xml'}->{'indexer'}->{$field}->{$self->{'tag'}});
178
179 foreach my $tag (@{$self->{'import_xml'}->{'indexer'}->{$field}->{$self->{'tag'}}}) {
180 my $format;
181
182 $log->logdie("expected tag HASH and got $tag") unless (ref($tag) eq 'HASH');
183 $format = $tag->{'value'} || $tag->{'content'};
184
185 $log->debug("format: $format");
186
187 my @v;
188 if ($self->{'lookup_regex'} && $format =~ $self->{'lookup_regex'}) {
189 @v = $self->fill_in_to_arr($rec,$format);
190 } else {
191 @v = $self->parse_to_arr($rec,$format);
192 }
193 next if (! @v);
194
195 if ($tag->{'sort'}) {
196 @v = $self->sort_arr(@v);
197 }
198
199 # use format?
200 if ($tag->{'format_name'}) {
201 @v = map { $self->apply_format($tag->{'format_name'},$tag->{'format_delimiter'},$_) } @v;
202 }
203
204 # delimiter will join repeatable fields
205 if ($tag->{'delimiter'}) {
206 @v = ( join($tag->{'delimiter'}, @v) );
207 }
208
209 # default types
210 my @types = qw(display search);
211 # override by type attribute
212 @types = ( $tag->{'type'} ) if ($tag->{'type'});
213
214 foreach my $type (@types) {
215 # append to previous line?
216 $log->debug("type: $type ",sub { join(" ",@v) }, $row->{'append'} || 'no append');
217 if ($tag->{'append'}) {
218
219 # I will delimit appended part with
220 # delimiter (or ,)
221 my $d = $tag->{'delimiter'};
222 # default delimiter
223 $d ||= " ";
224
225 my $last = pop @{$row->{$type}};
226 $d = "" if (! $last);
227 $last .= $d . join($d, @v);
228 push @{$row->{$type}}, $last;
229
230 } else {
231 push @{$row->{$type}}, @v;
232 }
233 }
234
235
236 }
237
238 if ($row) {
239 $row->{'tag'} = $field;
240
241 # TODO: name_sigular, name_plural
242 my $name = $self->{'import_xml'}->{'indexer'}->{$field}->{'name'};
243 my $row_name = $name ? $self->_x($name) : $field;
244
245 # post-sort all values in field
246 if ($self->{'import_xml'}->{'indexer'}->{$field}->{'sort'}) {
247 $log->warn("sort at field tag not implemented");
248 }
249
250 $ds->{$row_name} = $row;
251
252 $log->debug("row $field: ",sub { Dumper($row) });
253 }
254
255 }
256
257 $self->{'db'}->save_ds(
258 id => $id,
259 ds => $ds,
260 prefix => $self->{prefix},
261 ) if ($self->{'db'});
262
263 $log->debug("ds: ", sub { Dumper($ds) });
264
265 $log->logconfess("data structure returned is not array any more!") if wantarray;
266
267 return $ds;
268
269 }
270
271 =head2 parse
272
273 Perform smart parsing of string, skipping delimiters for fields which aren't
274 defined. It can also eval code in format starting with C<eval{...}> and
275 return output or nothing depending on eval code.
276
277 my $text = $webpac->parse($rec,'eval{"v901^a" eq "Deskriptor"}descriptor: v250^a', $i);
278
279 =cut
280
281 sub parse {
282 my $self = shift;
283
284 my ($rec, $format_utf8, $i) = @_;
285
286 return if (! $format_utf8);
287
288 my $log = $self->_get_logger();
289
290 $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
291
292 $i = 0 if (! $i);
293
294 my $format = $self->_x($format_utf8) || $log->logconfess("can't convert '$format_utf8' from UTF-8 to ",$self->{'code_page'});
295
296 my @out;
297
298 $log->debug("format: $format");
299
300 my $eval_code;
301 # remove eval{...} from beginning
302 $eval_code = $1 if ($format =~ s/^eval{([^}]+)}//s);
303
304 my $filter_name;
305 # remove filter{...} from beginning
306 $filter_name = $1 if ($format =~ s/^filter{([^}]+)}//s);
307
308 my $prefix;
309 my $all_found=0;
310
311 while ($format =~ s/^(.*?)(v|s)(\d+)(?:\^(\w))?//s) {
312
313 my $del = $1 || '';
314 $prefix ||= $del if ($all_found == 0);
315
316 # repeatable index
317 my $r = $i;
318 $r = 0 if (lc("$2") eq 's');
319
320 my $found = 0;
321 my $tmp = $self->get_data(\$rec,$3,$4,$r,\$found);
322
323 if ($found) {
324 push @out, $del;
325 push @out, $tmp;
326 $all_found += $found;
327 }
328 }
329
330 return if (! $all_found);
331
332 my $out = join('',@out);
333
334 if ($out) {
335 # add rest of format (suffix)
336 $out .= $format;
337
338 # add prefix if not there
339 $out = $prefix . $out if ($out !~ m/^\Q$prefix\E/);
340
341 $log->debug("result: $out");
342 }
343
344 if ($eval_code) {
345 my $eval = $self->fill_in($rec,$eval_code,$i) || return;
346 $log->debug("about to eval{$eval} format: $out");
347 return if (! $self->_eval($eval));
348 }
349
350 if ($filter_name && $self->{'filter'}->{$filter_name}) {
351 $log->debug("about to filter{$filter_name} format: $out");
352 $out = $self->{'filter'}->{$filter_name}->($out);
353 return unless(defined($out));
354 $log->debug("filter result: $out");
355 }
356
357 return $out;
358 }
359
360 =head2 parse_to_arr
361
362 Similar to C<parse>, but returns array of all repeatable fields
363
364 my @arr = $webpac->parse_to_arr($rec,'v250^a');
365
366 =cut
367
368 sub parse_to_arr {
369 my $self = shift;
370
371 my ($rec, $format_utf8) = @_;
372
373 my $log = $self->_get_logger();
374
375 $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
376 return if (! $format_utf8);
377
378 my $i = 0;
379 my @arr;
380
381 while (my $v = $self->parse($rec,$format_utf8,$i++)) {
382 push @arr, $v;
383 }
384
385 $log->debug("format '$format_utf8' returned ",--$i," elements: ", sub { join(" | ",@arr) }) if (@arr);
386
387 return @arr;
388 }
389
390
391 =head2 fill_in
392
393 Workhourse of all: takes record from in-memory structure of database and
394 strings with placeholders and returns string or array of with substituted
395 values from record.
396
397 my $text = $webpac->fill_in($rec,'v250^a');
398
399 Optional argument is ordinal number for repeatable fields. By default,
400 it's assume to be first repeatable field (fields are perl array, so first
401 element is 0).
402 Following example will read second value from repeatable field.
403
404 my $text = $webpac->fill_in($rec,'Title: v250^a',1);
405
406 This function B<does not> perform parsing of format to inteligenty skip
407 delimiters before fields which aren't used.
408
409 This method will automatically decode UTF-8 string to local code page
410 if needed.
411
412 =cut
413
414 sub fill_in {
415 my $self = shift;
416
417 my $log = $self->_get_logger();
418
419 my $rec = shift || $log->logconfess("need data record");
420 my $format = shift || $log->logconfess("need format to parse");
421 # iteration (for repeatable fields)
422 my $i = shift || 0;
423
424 $log->logdie("infitite loop in format $format") if ($i > ($self->{'max_mfn'} || 9999));
425
426 # FIXME remove for speedup?
427 $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
428
429 if (utf8::is_utf8($format)) {
430 $format = $self->_x($format);
431 }
432
433 my $found = 0;
434
435 my $eval_code;
436 # remove eval{...} from beginning
437 $eval_code = $1 if ($format =~ s/^eval{([^}]+)}//s);
438
439 my $filter_name;
440 # remove filter{...} from beginning
441 $filter_name = $1 if ($format =~ s/^filter{([^}]+)}//s);
442
443 # do actual replacement of placeholders
444 # repeatable fields
445 $format =~ s/v(\d+)(?:\^(\w))?/$self->get_data(\$rec,$1,$2,$i,\$found)/ges;
446 # non-repeatable fields
447 $format =~ s/s(\d+)(?:\^(\w))?/$self->get_data(\$rec,$1,$2,0,\$found)/ges;
448
449 if ($found) {
450 $log->debug("format: $format");
451 if ($eval_code) {
452 my $eval = $self->fill_in($rec,$eval_code,$i);
453 return if (! $self->_eval($eval));
454 }
455 if ($filter_name && $self->{'filter'}->{$filter_name}) {
456 $log->debug("filter '$filter_name' for $format");
457 $format = $self->{'filter'}->{$filter_name}->($format);
458 return unless(defined($format));
459 $log->debug("filter result: $format");
460 }
461 # do we have lookups?
462 if ($self->{'lookup'}) {
463 if ($self->{'lookup'}->can('lookup')) {
464 return $self->{'lookup'}->lookup($format);
465 } else {
466 $log->warn("Have lookup object but can't invoke lookup method");
467 }
468 } else {
469 return $format;
470 }
471 } else {
472 return;
473 }
474 }
475
476
477 =head2 fill_in_to_arr
478
479 Similar to C<fill_in>, but returns array of all repeatable fields. Usable
480 for fields which have lookups, so they shouldn't be parsed but rather
481 C<fill_id>ed.
482
483 my @arr = $webpac->fill_in_to_arr($rec,'[v900];;[v250^a]');
484
485 =cut
486
487 sub fill_in_to_arr {
488 my $self = shift;
489
490 my ($rec, $format_utf8) = @_;
491
492 my $log = $self->_get_logger();
493
494 $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
495 return if (! $format_utf8);
496
497 my $i = 0;
498 my @arr;
499
500 while (my @v = $self->fill_in($rec,$format_utf8,$i++)) {
501 push @arr, @v;
502 }
503
504 $log->debug("format '$format_utf8' returned ",--$i," elements: ", sub { join(" | ",@arr) }) if (@arr);
505
506 return @arr;
507 }
508
509
510 =head2 get_data
511
512 Returns value from record.
513
514 my $text = $self->get_data(\$rec,$f,$sf,$i,\$found);
515
516 Arguments are:
517 record reference C<$rec>,
518 field C<$f>,
519 optional subfiled C<$sf>,
520 index for repeatable values C<$i>.
521
522 Optinal variable C<$found> will be incremeted if there
523 is field.
524
525 Returns value or empty string.
526
527 =cut
528
529 sub get_data {
530 my $self = shift;
531
532 my ($rec,$f,$sf,$i,$found) = @_;
533
534 if ($$rec->{$f}) {
535 return '' if (! $$rec->{$f}->[$i]);
536 no strict 'refs';
537 if ($sf && $$rec->{$f}->[$i]->{$sf}) {
538 $$found++ if (defined($$found));
539 return $$rec->{$f}->[$i]->{$sf};
540 } elsif (! $sf && $$rec->{$f}->[$i]) {
541 $$found++ if (defined($$found));
542 # it still might have subfield, just
543 # not specified, so we'll dump all
544 if ($$rec->{$f}->[$i] =~ /HASH/o) {
545 my $out;
546 foreach my $k (keys %{$$rec->{$f}->[$i]}) {
547 $out .= $$rec->{$f}->[$i]->{$k}." ";
548 }
549 return $out;
550 } else {
551 return $$rec->{$f}->[$i];
552 }
553 } else {
554 return '';
555 }
556 } else {
557 return '';
558 }
559 }
560
561
562 =head2 apply_format
563
564 Apply format specified in tag with C<format_name="name"> and
565 C<format_delimiter=";;">.
566
567 my $text = $webpac->apply_format($format_name,$format_delimiter,$data);
568
569 Formats can contain C<lookup{...}> if you need them.
570
571 =cut
572
573 sub apply_format {
574 my $self = shift;
575
576 my ($name,$delimiter,$data) = @_;
577
578 my $log = $self->_get_logger();
579
580 if (! $self->{'import_xml'}->{'format'}->{$name}) {
581 $log->warn("<format name=\"$name\"> is not defined in ",$self->{'import_xml_file'});
582 return $data;
583 }
584
585 $log->warn("no delimiter for format $name") if (! $delimiter);
586
587 my $format = $self->_x($self->{'import_xml'}->{'format'}->{$name}->{'content'}) || $log->logdie("can't find format '$name'");
588
589 my @data = split(/\Q$delimiter\E/, $data);
590
591 my $out = sprintf($format, @data);
592 $log->debug("using format $name [$format] on $data to produce: $out");
593
594 if ($self->{'lookup_regex'} && $out =~ $self->{'lookup_regex'}) {
595 return $self->{'lookup'}->lookup($out);
596 } else {
597 return $out;
598 }
599
600 }
601
602 =head2 sort_arr
603
604 Sort array ignoring case and html in data
605
606 my @sorted = $webpac->sort_arr(@unsorted);
607
608 =cut
609
610 sub sort_arr {
611 my $self = shift;
612
613 my $log = $self->_get_logger();
614
615 # FIXME add Schwartzian Transformation?
616
617 my @sorted = sort {
618 $a =~ s#<[^>]+/*>##;
619 $b =~ s#<[^>]+/*>##;
620 lc($b) cmp lc($a)
621 } @_;
622 $log->debug("sorted values: ",sub { join(", ",@sorted) });
623
624 return @sorted;
625 }
626
627
628 =head1 INTERNAL METHODS
629
630 =head2 _sort_by_order
631
632 Sort xml tags data structure accoding to C<order=""> attribute.
633
634 =cut
635
636 sub _sort_by_order {
637 my $self = shift;
638
639 my $va = $self->{'import_xml'}->{'indexer'}->{$a}->{'order'} ||
640 $self->{'import_xml'}->{'indexer'}->{$a};
641 my $vb = $self->{'import_xml'}->{'indexer'}->{$b}->{'order'} ||
642 $self->{'import_xml'}->{'indexer'}->{$b};
643
644 return $va <=> $vb;
645 }
646
647 =head2 _x
648
649 Convert strings from C<conf/normalize/*.xml> encoding into application
650 specific encoding (optinally specified using C<code_page> to C<new>
651 constructor).
652
653 my $text = $n->_x('normalize text string');
654
655 This is a stub so that other modules doesn't have to implement it.
656
657 =cut
658
659 sub _x {
660 my $self = shift;
661 return shift;
662 }
663
664
665 =head1 AUTHOR
666
667 Dobrica Pavlinusic, C<< <dpavlin@rot13.org> >>
668
669 =head1 COPYRIGHT & LICENSE
670
671 Copyright 2005 Dobrica Pavlinusic, All Rights Reserved.
672
673 This program is free software; you can redistribute it and/or modify it
674 under the same terms as Perl itself.
675
676 =cut
677
678 1; # End of WebPAC::Normalize

  ViewVC Help
Powered by ViewVC 1.1.26