/[webpac2]/trunk/lib/WebPAC/Normalize.pm
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Contents of /trunk/lib/WebPAC/Normalize.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 253 - (show annotations)
Thu Dec 15 17:01:10 2005 UTC (17 years, 1 month ago) by dpavlin
File size: 15631 byte(s)
 r11712@llin:  dpavlin | 2005-12-15 21:01:03 +0100
 lookups now work [2.00_3]

1 package WebPAC::Normalize;
2
3 use warnings;
4 use strict;
5 use base 'WebPAC::Common';
6 use Data::Dumper;
7
8 =head1 NAME
9
10 WebPAC::Normalize - data mungling for normalisation
11
12 =head1 VERSION
13
14 Version 0.05
15
16 =cut
17
18 our $VERSION = '0.05';
19
20 =head1 SYNOPSIS
21
22 This package contains code that mungle data to produce normalized format.
23
24 It contains several assumptions:
25
26 =over
27
28 =item *
29
30 format of fields is defined using C<v123^a> notation for repeatable fields
31 or C<s123^a> for single (or first) value, where C<123> is field number and
32 C<a> is subfield.
33
34 =item *
35
36 source data records (C<$rec>) have unique identifiers in field C<000>
37
38 =item *
39
40 optional C<eval{length('v123^a') == 3}> tag at B<beginning of format> will be
41 perl code that is evaluated before producing output (value of field will be
42 interpolated before that)
43
44 =item *
45
46 optional C<filter{filter_name}> at B<begining of format> will apply perl
47 code defined as code ref on format after field substitution to producing
48 output
49
50 =item *
51
52 optional C<lookup{...}> will be then performed. See C<WebPAC::Lookups>.
53
54 =item *
55
56 at end, optional C<format>s rules are resolved. Format rules are similar to
57 C<sprintf> and can also contain C<lookup{...}> which is performed after
58 values are inserted in format.
59
60 =back
61
62 This also describes order in which transformations are applied (eval,
63 filter, lookup, format) which is important to undestand when deciding how to
64 solve your data mungling and normalisation process.
65
66
67
68
69 =head1 FUNCTIONS
70
71 =head2 new
72
73 Create new normalisation object
74
75 my $n = new WebPAC::Normalize::Something(
76 filter => {
77 'filter_name_1' => sub {
78 # filter code
79 return length($_);
80 }, ...
81 },
82 db => $db_obj,
83 lookup_regex => $lookup->regex,
84 lookup => $lookup_obj,
85 prefix => 'foobar',
86 );
87
88 Parametar C<filter> defines user supplied snippets of perl code which can
89 be use with C<filter{...}> notation.
90
91 C<prefix> is used to form filename for database record (to support multiple
92 source files which are joined in one database).
93
94 Recommended parametar C<lookup_regex> is used to enable parsing of lookups
95 in structures. If you pass this parametar, you must also pass C<lookup>
96 which is C<WebPAC::Lookup> object.
97
98 =cut
99
100 sub new {
101 my $class = shift;
102 my $self = {@_};
103 bless($self, $class);
104
105 my $r = $self->{'lookup_regex'} ? 1 : 0;
106 my $l = $self->{'lookup'} ? 1 : 0;
107
108 my $log = $self->_get_logger();
109
110 # those two must be in pair
111 if ( ($r & $l) != ($r || $l) ) {
112 my $log = $self->_get_logger();
113 $log->logdie("lookup_regex and lookup must be in pair");
114 }
115
116 $log->logdie("lookup must be WebPAC::Lookup object") if ($self->{'lookup'} && ! $self->{'lookup'}->isa('WebPAC::Lookup'));
117
118 $log->warn("no prefix defined. please check that!") unless ($self->{'prefix'});
119
120 $log->debug("using lookup regex: ", $self->{lookup_regex}) if ($r && $l);
121
122 $self ? return $self : return undef;
123 }
124
125
126 =head2 data_structure
127
128 Create in-memory data structure which represents normalized layout from
129 C<conf/normalize/*.xml>.
130
131 This structures are used to produce output.
132
133 my $ds = $webpac->data_structure($rec);
134
135 =cut
136
137 sub data_structure {
138 my $self = shift;
139
140 my $log = $self->_get_logger();
141
142 my $rec = shift;
143 $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
144
145 $log->debug("data_structure rec = ", sub { Dumper($rec) });
146
147 $log->logdie("need unique ID (mfn) in field 000 of record ", sub { Dumper($rec) } ) unless (defined($rec->{'000'}));
148
149 my $id = $rec->{'000'}->[0] || $log->logdie("field 000 isn't array!");
150
151 my $cache_file;
152
153 if ($self->{'db'}) {
154 my $ds = $self->{'db'}->load_ds( id => $id, prefix => $self->{prefix} );
155 $log->debug("load_ds( rec = ", sub { Dumper($rec) }, ") = ", sub { Dumper($ds) });
156 return $ds if ($ds);
157 $log->debug("cache miss, creating");
158 }
159
160 undef $self->{'currnet_filename'};
161 undef $self->{'headline'};
162
163 my @sorted_tags;
164 if ($self->{tags_by_order}) {
165 @sorted_tags = @{$self->{tags_by_order}};
166 } else {
167 @sorted_tags = sort { $self->_sort_by_order } keys %{$self->{'import_xml'}->{'indexer'}};
168 $self->{tags_by_order} = \@sorted_tags;
169 }
170
171 my $ds;
172
173 $log->debug("tags: ",sub { join(", ",@sorted_tags) });
174
175 foreach my $field (@sorted_tags) {
176
177 my $row;
178
179 #print "field $field [",$self->{'tag'},"] = ",Dumper($self->{'import_xml'}->{'indexer'}->{$field}->{$self->{'tag'}});
180
181 foreach my $tag (@{$self->{'import_xml'}->{'indexer'}->{$field}->{$self->{'tag'}}}) {
182 my $format;
183
184 $log->logdie("expected tag HASH and got $tag") unless (ref($tag) eq 'HASH');
185 $format = $tag->{'value'} || $tag->{'content'};
186
187 $log->debug("format: $format");
188
189 my @v;
190 if ($self->{'lookup_regex'} && $format =~ $self->{'lookup_regex'}) {
191 @v = $self->fill_in_to_arr($rec,$format);
192 } else {
193 @v = $self->parse_to_arr($rec,$format);
194 }
195 next if (! @v);
196
197 if ($tag->{'sort'}) {
198 @v = $self->sort_arr(@v);
199 }
200
201 # use format?
202 if ($tag->{'format_name'}) {
203 @v = map { $self->apply_format($tag->{'format_name'},$tag->{'format_delimiter'},$_) } @v;
204 }
205
206 # delimiter will join repeatable fields
207 if ($tag->{'delimiter'}) {
208 @v = ( join($tag->{'delimiter'}, @v) );
209 }
210
211 # default types
212 my @types = qw(display search);
213 # override by type attribute
214 @types = ( $tag->{'type'} ) if ($tag->{'type'});
215
216 foreach my $type (@types) {
217 # append to previous line?
218 $log->debug("type: $type ",sub { join(" ",@v) }, " ", $row->{'append'} || 'no append');
219 if ($tag->{'append'}) {
220
221 # I will delimit appended part with
222 # delimiter (or ,)
223 my $d = $tag->{'delimiter'};
224 # default delimiter
225 $d ||= " ";
226
227 my $last = pop @{$row->{$type}};
228 $d = "" if (! $last);
229 $last .= $d . join($d, @v);
230 push @{$row->{$type}}, $last;
231
232 } else {
233 push @{$row->{$type}}, @v;
234 }
235 }
236
237
238 }
239
240 if ($row) {
241 $row->{'tag'} = $field;
242
243 # TODO: name_sigular, name_plural
244 my $name = $self->{'import_xml'}->{'indexer'}->{$field}->{'name'};
245 my $row_name = $name ? $self->_x($name) : $field;
246
247 # post-sort all values in field
248 if ($self->{'import_xml'}->{'indexer'}->{$field}->{'sort'}) {
249 $log->warn("sort at field tag not implemented");
250 }
251
252 $ds->{$row_name} = $row;
253
254 $log->debug("row $field: ",sub { Dumper($row) });
255 }
256
257 }
258
259 $self->{'db'}->save_ds(
260 id => $id,
261 ds => $ds,
262 prefix => $self->{prefix},
263 ) if ($self->{'db'});
264
265 $log->debug("ds: ", sub { Dumper($ds) });
266
267 $log->logconfess("data structure returned is not array any more!") if wantarray;
268
269 return $ds;
270
271 }
272
273 =head2 parse
274
275 Perform smart parsing of string, skipping delimiters for fields which aren't
276 defined. It can also eval code in format starting with C<eval{...}> and
277 return output or nothing depending on eval code.
278
279 my $text = $webpac->parse($rec,'eval{"v901^a" eq "Deskriptor"}descriptor: v250^a', $i);
280
281 =cut
282
283 sub parse {
284 my $self = shift;
285
286 my ($rec, $format_utf8, $i) = @_;
287
288 return if (! $format_utf8);
289
290 my $log = $self->_get_logger();
291
292 $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
293
294 $i = 0 if (! $i);
295
296 my $format = $self->_x($format_utf8) || $log->logconfess("can't convert '$format_utf8' from UTF-8 to ",$self->{'code_page'});
297
298 my @out;
299
300 $log->debug("format: $format");
301
302 my $eval_code;
303 # remove eval{...} from beginning
304 $eval_code = $1 if ($format =~ s/^eval{([^}]+)}//s);
305
306 my $filter_name;
307 # remove filter{...} from beginning
308 $filter_name = $1 if ($format =~ s/^filter{([^}]+)}//s);
309
310 my $prefix;
311 my $all_found=0;
312
313 while ($format =~ s/^(.*?)(v|s)(\d+)(?:\^(\w))?//s) {
314
315 my $del = $1 || '';
316 $prefix ||= $del if ($all_found == 0);
317
318 # repeatable index
319 my $r = $i;
320 $r = 0 if (lc("$2") eq 's');
321
322 my $found = 0;
323 my $tmp = $self->get_data(\$rec,$3,$4,$r,\$found);
324
325 if ($found) {
326 push @out, $del;
327 push @out, $tmp;
328 $all_found += $found;
329 }
330 }
331
332 return if (! $all_found);
333
334 my $out = join('',@out);
335
336 if ($out) {
337 # add rest of format (suffix)
338 $out .= $format;
339
340 # add prefix if not there
341 $out = $prefix . $out if ($out !~ m/^\Q$prefix\E/);
342
343 $log->debug("result: $out");
344 }
345
346 if ($eval_code) {
347 my $eval = $self->fill_in($rec,$eval_code,$i) || return;
348 $log->debug("about to eval{$eval} format: $out");
349 return if (! $self->_eval($eval));
350 }
351
352 if ($filter_name && $self->{'filter'}->{$filter_name}) {
353 $log->debug("about to filter{$filter_name} format: $out");
354 $out = $self->{'filter'}->{$filter_name}->($out);
355 return unless(defined($out));
356 $log->debug("filter result: $out");
357 }
358
359 return $out;
360 }
361
362 =head2 parse_to_arr
363
364 Similar to C<parse>, but returns array of all repeatable fields
365
366 my @arr = $webpac->parse_to_arr($rec,'v250^a');
367
368 =cut
369
370 sub parse_to_arr {
371 my $self = shift;
372
373 my ($rec, $format_utf8) = @_;
374
375 my $log = $self->_get_logger();
376
377 $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
378 return if (! $format_utf8);
379
380 my $i = 0;
381 my @arr;
382
383 while (my $v = $self->parse($rec,$format_utf8,$i++)) {
384 push @arr, $v;
385 }
386
387 $log->debug("format '$format_utf8' returned ",--$i," elements: ", sub { join(" | ",@arr) }) if (@arr);
388
389 return @arr;
390 }
391
392
393 =head2 fill_in
394
395 Workhourse of all: takes record from in-memory structure of database and
396 strings with placeholders and returns string or array of with substituted
397 values from record.
398
399 my $text = $webpac->fill_in($rec,'v250^a');
400
401 Optional argument is ordinal number for repeatable fields. By default,
402 it's assume to be first repeatable field (fields are perl array, so first
403 element is 0).
404 Following example will read second value from repeatable field.
405
406 my $text = $webpac->fill_in($rec,'Title: v250^a',1);
407
408 This function B<does not> perform parsing of format to inteligenty skip
409 delimiters before fields which aren't used.
410
411 This method will automatically decode UTF-8 string to local code page
412 if needed.
413
414 =cut
415
416 sub fill_in {
417 my $self = shift;
418
419 my $log = $self->_get_logger();
420
421 my $rec = shift || $log->logconfess("need data record");
422 my $format = shift || $log->logconfess("need format to parse");
423 # iteration (for repeatable fields)
424 my $i = shift || 0;
425
426 $log->logdie("infitite loop in format $format") if ($i > ($self->{'max_mfn'} || 9999));
427
428 # FIXME remove for speedup?
429 $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
430
431 if (utf8::is_utf8($format)) {
432 $format = $self->_x($format);
433 }
434
435 my $found = 0;
436
437 my $eval_code;
438 # remove eval{...} from beginning
439 $eval_code = $1 if ($format =~ s/^eval{([^}]+)}//s);
440
441 my $filter_name;
442 # remove filter{...} from beginning
443 $filter_name = $1 if ($format =~ s/^filter{([^}]+)}//s);
444
445 # do actual replacement of placeholders
446 # repeatable fields
447 $format =~ s/v(\d+)(?:\^(\w))?/$self->get_data(\$rec,$1,$2,$i,\$found)/ges;
448 # non-repeatable fields
449 $format =~ s/s(\d+)(?:\^(\w))?/$self->get_data(\$rec,$1,$2,0,\$found)/ges;
450
451 if ($found) {
452 $log->debug("format: $format");
453 if ($eval_code) {
454 my $eval = $self->fill_in($rec,$eval_code,$i);
455 return if (! $self->_eval($eval));
456 }
457 if ($filter_name && $self->{'filter'}->{$filter_name}) {
458 $log->debug("filter '$filter_name' for $format");
459 $format = $self->{'filter'}->{$filter_name}->($format);
460 return unless(defined($format));
461 $log->debug("filter result: $format");
462 }
463 # do we have lookups?
464 if ($self->{'lookup'}) {
465 if ($self->{'lookup'}->can('lookup')) {
466 my @lookup = $self->{lookup}->lookup($format);
467 $log->debug("lookup $format", join(", ", @lookup));
468 return @lookup;
469 } else {
470 $log->warn("Have lookup object but can't invoke lookup method");
471 }
472 } else {
473 return $format;
474 }
475 } else {
476 return;
477 }
478 }
479
480
481 =head2 fill_in_to_arr
482
483 Similar to C<fill_in>, but returns array of all repeatable fields. Usable
484 for fields which have lookups, so they shouldn't be parsed but rather
485 C<fill_id>ed.
486
487 my @arr = $webpac->fill_in_to_arr($rec,'[v900];;[v250^a]');
488
489 =cut
490
491 sub fill_in_to_arr {
492 my $self = shift;
493
494 my ($rec, $format_utf8) = @_;
495
496 my $log = $self->_get_logger();
497
498 $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
499 return if (! $format_utf8);
500
501 my $i = 0;
502 my @arr;
503
504 while (my @v = $self->fill_in($rec,$format_utf8,$i++)) {
505 push @arr, @v;
506 }
507
508 $log->debug("format '$format_utf8' returned ",--$i," elements: ", sub { join(" | ",@arr) }) if (@arr);
509
510 return @arr;
511 }
512
513
514 =head2 get_data
515
516 Returns value from record.
517
518 my $text = $self->get_data(\$rec,$f,$sf,$i,\$found);
519
520 Arguments are:
521 record reference C<$rec>,
522 field C<$f>,
523 optional subfiled C<$sf>,
524 index for repeatable values C<$i>.
525
526 Optinal variable C<$found> will be incremeted if there
527 is field.
528
529 Returns value or empty string.
530
531 =cut
532
533 sub get_data {
534 my $self = shift;
535
536 my ($rec,$f,$sf,$i,$found) = @_;
537
538 if ($$rec->{$f}) {
539 return '' if (! $$rec->{$f}->[$i]);
540 no strict 'refs';
541 if ($sf && $$rec->{$f}->[$i]->{$sf}) {
542 $$found++ if (defined($$found));
543 return $$rec->{$f}->[$i]->{$sf};
544 } elsif (! $sf && $$rec->{$f}->[$i]) {
545 $$found++ if (defined($$found));
546 # it still might have subfield, just
547 # not specified, so we'll dump all
548 if ($$rec->{$f}->[$i] =~ /HASH/o) {
549 my $out;
550 foreach my $k (keys %{$$rec->{$f}->[$i]}) {
551 $out .= $$rec->{$f}->[$i]->{$k}." ";
552 }
553 return $out;
554 } else {
555 return $$rec->{$f}->[$i];
556 }
557 } else {
558 return '';
559 }
560 } else {
561 return '';
562 }
563 }
564
565
566 =head2 apply_format
567
568 Apply format specified in tag with C<format_name="name"> and
569 C<format_delimiter=";;">.
570
571 my $text = $webpac->apply_format($format_name,$format_delimiter,$data);
572
573 Formats can contain C<lookup{...}> if you need them.
574
575 =cut
576
577 sub apply_format {
578 my $self = shift;
579
580 my ($name,$delimiter,$data) = @_;
581
582 my $log = $self->_get_logger();
583
584 if (! $self->{'import_xml'}->{'format'}->{$name}) {
585 $log->warn("<format name=\"$name\"> is not defined in ",$self->{'import_xml_file'});
586 return $data;
587 }
588
589 $log->warn("no delimiter for format $name") if (! $delimiter);
590
591 my $format = $self->_x($self->{'import_xml'}->{'format'}->{$name}->{'content'}) || $log->logdie("can't find format '$name'");
592
593 my @data = split(/\Q$delimiter\E/, $data);
594
595 my $out = sprintf($format, @data);
596 $log->debug("using format $name [$format] on $data to produce: $out");
597
598 if ($self->{'lookup_regex'} && $out =~ $self->{'lookup_regex'}) {
599 return $self->{'lookup'}->lookup($out);
600 } else {
601 return $out;
602 }
603
604 }
605
606 =head2 sort_arr
607
608 Sort array ignoring case and html in data
609
610 my @sorted = $webpac->sort_arr(@unsorted);
611
612 =cut
613
614 sub sort_arr {
615 my $self = shift;
616
617 my $log = $self->_get_logger();
618
619 # FIXME add Schwartzian Transformation?
620
621 my @sorted = sort {
622 $a =~ s#<[^>]+/*>##;
623 $b =~ s#<[^>]+/*>##;
624 lc($b) cmp lc($a)
625 } @_;
626 $log->debug("sorted values: ",sub { join(", ",@sorted) });
627
628 return @sorted;
629 }
630
631
632 =head1 INTERNAL METHODS
633
634 =head2 _sort_by_order
635
636 Sort xml tags data structure accoding to C<order=""> attribute.
637
638 =cut
639
640 sub _sort_by_order {
641 my $self = shift;
642
643 my $va = $self->{'import_xml'}->{'indexer'}->{$a}->{'order'} ||
644 $self->{'import_xml'}->{'indexer'}->{$a};
645 my $vb = $self->{'import_xml'}->{'indexer'}->{$b}->{'order'} ||
646 $self->{'import_xml'}->{'indexer'}->{$b};
647
648 return $va <=> $vb;
649 }
650
651 =head2 _x
652
653 Convert strings from C<conf/normalize/*.xml> encoding into application
654 specific encoding (optinally specified using C<code_page> to C<new>
655 constructor).
656
657 my $text = $n->_x('normalize text string');
658
659 This is a stub so that other modules doesn't have to implement it.
660
661 =cut
662
663 sub _x {
664 my $self = shift;
665 return shift;
666 }
667
668
669 =head1 AUTHOR
670
671 Dobrica Pavlinusic, C<< <dpavlin@rot13.org> >>
672
673 =head1 COPYRIGHT & LICENSE
674
675 Copyright 2005 Dobrica Pavlinusic, All Rights Reserved.
676
677 This program is free software; you can redistribute it and/or modify it
678 under the same terms as Perl itself.
679
680 =cut
681
682 1; # End of WebPAC::Normalize

  ViewVC Help
Powered by ViewVC 1.1.26