/[webpac2]/trunk/lib/WebPAC/Normalize.pm
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Contents of /trunk/lib/WebPAC/Normalize.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 29 - (show annotations)
Sun Jul 24 11:17:44 2005 UTC (18 years, 8 months ago) by dpavlin
File size: 14636 byte(s)
some logging improvements and sample configuration file

1 package WebPAC::Normalize;
2
3 use warnings;
4 use strict;
5 use base 'WebPAC::Common';
6 use Data::Dumper;
7
8 =head1 NAME
9
10 WebPAC::Normalize - data mungling for normalisation
11
12 =head1 VERSION
13
14 Version 0.01
15
16 =cut
17
18 our $VERSION = '0.01';
19
20 =head1 SYNOPSIS
21
22 This package contains code that mungle data to produce normalized format.
23
24 It contains several assumptions:
25
26 =over
27
28 =item *
29
30 format of fields is defined using C<v123^a> notation for repeatable fields
31 or C<s123^a> for single (or first) value, where C<123> is field number and
32 C<a> is subfield.
33
34 =item *
35
36 source data records (C<$rec>) have unique identifiers in field C<000>
37
38 =item *
39
40 optional C<eval{length('v123^a') == 3}> tag at B<beginning of format> will be
41 perl code that is evaluated before producing output (value of field will be
42 interpolated before that)
43
44 =item *
45
46 optional C<filter{filter_name}> at B<begining of format> will apply perl
47 code defined as code ref on format after field substitution to producing
48 output
49
50 =item *
51
52 optional C<lookup{...}> will be then performed. See C<WebPAC::Lookups>.
53
54 =item *
55
56 at end, optional C<format>s rules are resolved. Format rules are similar to
57 C<sprintf> and can also contain C<lookup{...}> which is performed after
58 values are inserted in format.
59
60 =back
61
62 This also describes order in which transformations are applied (eval,
63 filter, lookup, format) which is important to undestand when deciding how to
64 solve your data mungling and normalisation process.
65
66
67
68
69 =head1 FUNCTIONS
70
71 =head2 new
72
73 Create new normalisation object
74
75 my $n = new WebPAC::Normalize::Something(
76 filter => {
77 'filter_name_1' => sub {
78 # filter code
79 return length($_);
80 }, ...
81 },
82 db => $db_obj,
83 lookup_regex => $lookup->regex,
84 );
85
86 Parametar C<filter> defines user supplied snippets of perl code which can
87 be use with C<filter{...}> notation.
88
89 Recommended parametar C<lookup_regex> is used to enable parsing of lookups
90 in structures.
91
92 =cut
93
94 sub new {
95 my $class = shift;
96 my $self = {@_};
97 bless($self, $class);
98
99 $self ? return $self : return undef;
100 }
101
102
103 =head2 data_structure
104
105 Create in-memory data structure which represents normalized layout from
106 C<conf/normalize/*.xml>.
107
108 This structures are used to produce output.
109
110 my @ds = $webpac->data_structure($rec);
111
112 B<Note: historical oddity follows>
113
114 This method will also set C<< $webpac->{'currnet_filename'} >> if there is
115 C<< <filename> >> tag and C<< $webpac->{'headline'} >> if there is
116 C<< <headline> >> tag.
117
118 =cut
119
120 sub data_structure {
121 my $self = shift;
122
123 my $log = $self->_get_logger();
124
125 my $rec = shift;
126 $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
127
128 my $cache_file;
129
130 if ($self->{'db'}) {
131 my @ds = $self->{'db'}->load_ds($rec);
132 $log->debug("load_ds( rec = ", sub { Dumper($rec) }, ") = ", sub { Dumper(@ds) });
133 return @ds if ($#ds > 0);
134 $log->debug("cache miss, creating");
135 }
136
137 undef $self->{'currnet_filename'};
138 undef $self->{'headline'};
139
140 my @sorted_tags;
141 if ($self->{tags_by_order}) {
142 @sorted_tags = @{$self->{tags_by_order}};
143 } else {
144 @sorted_tags = sort { $self->_sort_by_order } keys %{$self->{'import_xml'}->{'indexer'}};
145 $self->{tags_by_order} = \@sorted_tags;
146 }
147
148 my @ds;
149
150 $log->debug("tags: ",sub { join(", ",@sorted_tags) });
151
152 foreach my $field (@sorted_tags) {
153
154 my $row;
155
156 #print "field $field [",$self->{'tag'},"] = ",Dumper($self->{'import_xml'}->{'indexer'}->{$field}->{$self->{'tag'}});
157
158 foreach my $tag (@{$self->{'import_xml'}->{'indexer'}->{$field}->{$self->{'tag'}}}) {
159 my $format = $tag->{'value'} || $tag->{'content'};
160
161 $log->debug("format: $format");
162
163 my @v;
164 if ($self->{'lookup_regex'} && $format =~ $self->{'lookup_regex'}) {
165 @v = $self->fill_in_to_arr($rec,$format);
166 } else {
167 @v = $self->parse_to_arr($rec,$format);
168 }
169 next if (! @v);
170
171 if ($tag->{'sort'}) {
172 @v = $self->sort_arr(@v);
173 }
174
175 # use format?
176 if ($tag->{'format_name'}) {
177 @v = map { $self->apply_format($tag->{'format_name'},$tag->{'format_delimiter'},$_) } @v;
178 }
179
180 if ($field eq 'filename') {
181 $self->{'current_filename'} = join('',@v);
182 $log->debug("filename: ",$self->{'current_filename'});
183 } elsif ($field eq 'headline') {
184 $self->{'headline'} .= join('',@v);
185 $log->debug("headline: ",$self->{'headline'});
186 next; # don't return headline in data_structure!
187 }
188
189 # delimiter will join repeatable fields
190 if ($tag->{'delimiter'}) {
191 @v = ( join($tag->{'delimiter'}, @v) );
192 }
193
194 # default types
195 my @types = qw(display swish);
196 # override by type attribute
197 @types = ( $tag->{'type'} ) if ($tag->{'type'});
198
199 foreach my $type (@types) {
200 # append to previous line?
201 $log->debug("type: $type ",sub { join(" ",@v) }, $row->{'append'} || 'no append');
202 if ($tag->{'append'}) {
203
204 # I will delimit appended part with
205 # delimiter (or ,)
206 my $d = $tag->{'delimiter'};
207 # default delimiter
208 $d ||= " ";
209
210 my $last = pop @{$row->{$type}};
211 $d = "" if (! $last);
212 $last .= $d . join($d, @v);
213 push @{$row->{$type}}, $last;
214
215 } else {
216 push @{$row->{$type}}, @v;
217 }
218 }
219
220
221 }
222
223 if ($row) {
224 $row->{'tag'} = $field;
225
226 # TODO: name_sigular, name_plural
227 my $name = $self->{'import_xml'}->{'indexer'}->{$field}->{'name'};
228 $row->{'name'} = $name ? $self->_x($name) : $field;
229
230 # post-sort all values in field
231 if ($self->{'import_xml'}->{'indexer'}->{$field}->{'sort'}) {
232 $log->warn("sort at field tag not implemented");
233 }
234
235 push @ds, $row;
236
237 $log->debug("row $field: ",sub { Dumper($row) });
238 }
239
240 }
241
242 $self->{'db'}->save_ds(
243 ds => \@ds,
244 current_filename => $self->{'current_filename'},
245 headline => $self->{'headline'},
246 ) if ($self->{'db'});
247
248 $log->debug("ds: ", sub { Dumper(@ds) });
249
250 return @ds;
251
252 }
253
254 =head2 parse
255
256 Perform smart parsing of string, skipping delimiters for fields which aren't
257 defined. It can also eval code in format starting with C<eval{...}> and
258 return output or nothing depending on eval code.
259
260 my $text = $webpac->parse($rec,'eval{"v901^a" eq "Deskriptor"}descriptor: v250^a', $i);
261
262 =cut
263
264 sub parse {
265 my $self = shift;
266
267 my ($rec, $format_utf8, $i) = @_;
268
269 return if (! $format_utf8);
270
271 my $log = $self->_get_logger();
272
273 $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
274
275 $i = 0 if (! $i);
276
277 my $format = $self->_x($format_utf8) || $log->logconfess("can't convert '$format_utf8' from UTF-8 to ",$self->{'code_page'});
278
279 my @out;
280
281 $log->debug("format: $format");
282
283 my $eval_code;
284 # remove eval{...} from beginning
285 $eval_code = $1 if ($format =~ s/^eval{([^}]+)}//s);
286
287 my $filter_name;
288 # remove filter{...} from beginning
289 $filter_name = $1 if ($format =~ s/^filter{([^}]+)}//s);
290
291 my $prefix;
292 my $all_found=0;
293
294 while ($format =~ s/^(.*?)(v|s)(\d+)(?:\^(\w))?//s) {
295
296 my $del = $1 || '';
297 $prefix ||= $del if ($all_found == 0);
298
299 # repeatable index
300 my $r = $i;
301 $r = 0 if (lc("$2") eq 's');
302
303 my $found = 0;
304 my $tmp = $self->get_data(\$rec,$3,$4,$r,\$found);
305
306 if ($found) {
307 push @out, $del;
308 push @out, $tmp;
309 $all_found += $found;
310 }
311 }
312
313 return if (! $all_found);
314
315 my $out = join('',@out);
316
317 if ($out) {
318 # add rest of format (suffix)
319 $out .= $format;
320
321 # add prefix if not there
322 $out = $prefix . $out if ($out !~ m/^\Q$prefix\E/);
323
324 $log->debug("result: $out");
325 }
326
327 if ($eval_code) {
328 my $eval = $self->fill_in($rec,$eval_code,$i) || return;
329 $log->debug("about to eval{$eval} format: $out");
330 return if (! $self->_eval($eval));
331 }
332
333 if ($filter_name && $self->{'filter'}->{$filter_name}) {
334 $log->debug("about to filter{$filter_name} format: $out");
335 $out = $self->{'filter'}->{$filter_name}->($out);
336 return unless(defined($out));
337 $log->debug("filter result: $out");
338 }
339
340 return $out;
341 }
342
343 =head2 parse_to_arr
344
345 Similar to C<parse>, but returns array of all repeatable fields
346
347 my @arr = $webpac->parse_to_arr($rec,'v250^a');
348
349 =cut
350
351 sub parse_to_arr {
352 my $self = shift;
353
354 my ($rec, $format_utf8) = @_;
355
356 my $log = $self->_get_logger();
357
358 $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
359 return if (! $format_utf8);
360
361 my $i = 0;
362 my @arr;
363
364 while (my $v = $self->parse($rec,$format_utf8,$i++)) {
365 push @arr, $v;
366 }
367
368 $log->debug("format '$format_utf8' returned ",--$i," elements: ", sub { join(" | ",@arr) }) if (@arr);
369
370 return @arr;
371 }
372
373
374 =head2 fill_in
375
376 Workhourse of all: takes record from in-memory structure of database and
377 strings with placeholders and returns string or array of with substituted
378 values from record.
379
380 my $text = $webpac->fill_in($rec,'v250^a');
381
382 Optional argument is ordinal number for repeatable fields. By default,
383 it's assume to be first repeatable field (fields are perl array, so first
384 element is 0).
385 Following example will read second value from repeatable field.
386
387 my $text = $webpac->fill_in($rec,'Title: v250^a',1);
388
389 This function B<does not> perform parsing of format to inteligenty skip
390 delimiters before fields which aren't used.
391
392 This method will automatically decode UTF-8 string to local code page
393 if needed.
394
395 =cut
396
397 sub fill_in {
398 my $self = shift;
399
400 my $log = $self->_get_logger();
401
402 my $rec = shift || $log->logconfess("need data record");
403 my $format = shift || $log->logconfess("need format to parse");
404 # iteration (for repeatable fields)
405 my $i = shift || 0;
406
407 $log->logdie("infitite loop in format $format") if ($i > ($self->{'max_mfn'} || 9999));
408
409 # FIXME remove for speedup?
410 $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
411
412 if (utf8::is_utf8($format)) {
413 $format = $self->_x($format);
414 }
415
416 my $found = 0;
417
418 my $eval_code;
419 # remove eval{...} from beginning
420 $eval_code = $1 if ($format =~ s/^eval{([^}]+)}//s);
421
422 my $filter_name;
423 # remove filter{...} from beginning
424 $filter_name = $1 if ($format =~ s/^filter{([^}]+)}//s);
425
426 # do actual replacement of placeholders
427 # repeatable fields
428 $format =~ s/v(\d+)(?:\^(\w))?/$self->get_data(\$rec,$1,$2,$i,\$found)/ges;
429 # non-repeatable fields
430 $format =~ s/s(\d+)(?:\^(\w))?/$self->get_data(\$rec,$1,$2,0,\$found)/ges;
431
432 if ($found) {
433 $log->debug("format: $format");
434 if ($eval_code) {
435 my $eval = $self->fill_in($rec,$eval_code,$i);
436 return if (! $self->_eval($eval));
437 }
438 if ($filter_name && $self->{'filter'}->{$filter_name}) {
439 $log->debug("filter '$filter_name' for $format");
440 $format = $self->{'filter'}->{$filter_name}->($format);
441 return unless(defined($format));
442 $log->debug("filter result: $format");
443 }
444 # do we have lookups?
445 if ($self->{'lookup'}) {
446 return $self->lookup($format);
447 } else {
448 return $format;
449 }
450 } else {
451 return;
452 }
453 }
454
455
456 =head2 fill_in_to_arr
457
458 Similar to C<fill_in>, but returns array of all repeatable fields. Usable
459 for fields which have lookups, so they shouldn't be parsed but rather
460 C<fill_id>ed.
461
462 my @arr = $webpac->fill_in_to_arr($rec,'[v900];;[v250^a]');
463
464 =cut
465
466 sub fill_in_to_arr {
467 my $self = shift;
468
469 my ($rec, $format_utf8) = @_;
470
471 my $log = $self->_get_logger();
472
473 $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
474 return if (! $format_utf8);
475
476 my $i = 0;
477 my @arr;
478
479 while (my @v = $self->fill_in($rec,$format_utf8,$i++)) {
480 push @arr, @v;
481 }
482
483 $log->debug("format '$format_utf8' returned ",--$i," elements: ", sub { join(" | ",@arr) }) if (@arr);
484
485 return @arr;
486 }
487
488
489 =head2 get_data
490
491 Returns value from record.
492
493 my $text = $self->get_data(\$rec,$f,$sf,$i,\$found);
494
495 Arguments are:
496 record reference C<$rec>,
497 field C<$f>,
498 optional subfiled C<$sf>,
499 index for repeatable values C<$i>.
500
501 Optinal variable C<$found> will be incremeted if there
502 is field.
503
504 Returns value or empty string.
505
506 =cut
507
508 sub get_data {
509 my $self = shift;
510
511 my ($rec,$f,$sf,$i,$found) = @_;
512
513 if ($$rec->{$f}) {
514 return '' if (! $$rec->{$f}->[$i]);
515 no strict 'refs';
516 if ($sf && $$rec->{$f}->[$i]->{$sf}) {
517 $$found++ if (defined($$found));
518 return $$rec->{$f}->[$i]->{$sf};
519 } elsif ($$rec->{$f}->[$i]) {
520 $$found++ if (defined($$found));
521 # it still might have subfield, just
522 # not specified, so we'll dump all
523 if ($$rec->{$f}->[$i] =~ /HASH/o) {
524 my $out;
525 foreach my $k (keys %{$$rec->{$f}->[$i]}) {
526 $out .= $$rec->{$f}->[$i]->{$k}." ";
527 }
528 return $out;
529 } else {
530 return $$rec->{$f}->[$i];
531 }
532 }
533 } else {
534 return '';
535 }
536 }
537
538
539 =head2 apply_format
540
541 Apply format specified in tag with C<format_name="name"> and
542 C<format_delimiter=";;">.
543
544 my $text = $webpac->apply_format($format_name,$format_delimiter,$data);
545
546 Formats can contain C<lookup{...}> if you need them.
547
548 =cut
549
550 sub apply_format {
551 my $self = shift;
552
553 my ($name,$delimiter,$data) = @_;
554
555 my $log = $self->_get_logger();
556
557 if (! $self->{'import_xml'}->{'format'}->{$name}) {
558 $log->warn("<format name=\"$name\"> is not defined in ",$self->{'import_xml_file'});
559 return $data;
560 }
561
562 $log->warn("no delimiter for format $name") if (! $delimiter);
563
564 my $format = $self->_x($self->{'import_xml'}->{'format'}->{$name}->{'content'}) || $log->logdie("can't find format '$name'");
565
566 my @data = split(/\Q$delimiter\E/, $data);
567
568 my $out = sprintf($format, @data);
569 $log->debug("using format $name [$format] on $data to produce: $out");
570
571 if ($self->{'lookup_regex'} && $out =~ $self->{'lookup_regex'}) {
572 return $self->lookup($out);
573 } else {
574 return $out;
575 }
576
577 }
578
579 =head2 sort_arr
580
581 Sort array ignoring case and html in data
582
583 my @sorted = $webpac->sort_arr(@unsorted);
584
585 =cut
586
587 sub sort_arr {
588 my $self = shift;
589
590 my $log = $self->_get_logger();
591
592 # FIXME add Schwartzian Transformation?
593
594 my @sorted = sort {
595 $a =~ s#<[^>]+/*>##;
596 $b =~ s#<[^>]+/*>##;
597 lc($b) cmp lc($a)
598 } @_;
599 $log->debug("sorted values: ",sub { join(", ",@sorted) });
600
601 return @sorted;
602 }
603
604
605 =head1 INTERNAL METHODS
606
607 =head2 _sort_by_order
608
609 Sort xml tags data structure accoding to C<order=""> attribute.
610
611 =cut
612
613 sub _sort_by_order {
614 my $self = shift;
615
616 my $va = $self->{'import_xml'}->{'indexer'}->{$a}->{'order'} ||
617 $self->{'import_xml'}->{'indexer'}->{$a};
618 my $vb = $self->{'import_xml'}->{'indexer'}->{$b}->{'order'} ||
619 $self->{'import_xml'}->{'indexer'}->{$b};
620
621 return $va <=> $vb;
622 }
623
624 =head2 _x
625
626 Convert strings from C<conf/normalize/*.xml> encoding into application
627 specific encoding (optinally specified using C<code_page> to C<new>
628 constructor).
629
630 my $text = $n->_x('normalize text string');
631
632 This is a stub so that other modules doesn't have to implement it.
633
634 =cut
635
636 sub _x {
637 my $self = shift;
638 return shift;
639 }
640
641
642 =head1 AUTHOR
643
644 Dobrica Pavlinusic, C<< <dpavlin@rot13.org> >>
645
646 =head1 COPYRIGHT & LICENSE
647
648 Copyright 2005 Dobrica Pavlinusic, All Rights Reserved.
649
650 This program is free software; you can redistribute it and/or modify it
651 under the same terms as Perl itself.
652
653 =cut
654
655 1; # End of WebPAC::DB

  ViewVC Help
Powered by ViewVC 1.1.26