/[webpac2]/trunk/lib/WebPAC/Normalize.pm
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Annotation of /trunk/lib/WebPAC/Normalize.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 566 - (hide annotations)
Sun Jul 2 21:17:54 2006 UTC (16 years, 7 months ago) by dpavlin
File size: 13421 byte(s)
test split_rec_on corner cases, and fix one

1 dpavlin 10 package WebPAC::Normalize;
2 dpavlin 536 use Exporter 'import';
3     @EXPORT = qw/
4 dpavlin 538 _set_rec _set_lookup
5     _get_ds _clean_ds
6 dpavlin 554 _debug
7 dpavlin 538
8 dpavlin 536 tag search display
9 dpavlin 547 marc marc_indicators marc_repeatable_subfield
10 dpavlin 564 marc_compose marc_leader
11 dpavlin 540
12 dpavlin 536 rec1 rec2 rec
13     regex prefix suffix surround
14     first lookup join_with
15 dpavlin 562
16     split_rec_on
17 dpavlin 536 /;
18 dpavlin 10
19     use warnings;
20     use strict;
21 dpavlin 536
22     #use base qw/WebPAC::Common/;
23 dpavlin 550 use Data::Dump qw/dump/;
24 dpavlin 541 use Encode qw/from_to/;
25 dpavlin 10
26 dpavlin 550 # debugging warn(s)
27     my $debug = 0;
28    
29    
30 dpavlin 10 =head1 NAME
31    
32 dpavlin 536 WebPAC::Normalize - describe normalisaton rules using sets
33 dpavlin 10
34     =head1 VERSION
35    
36 dpavlin 564 Version 0.09
37 dpavlin 10
38     =cut
39    
40 dpavlin 564 our $VERSION = '0.09';
41 dpavlin 10
42     =head1 SYNOPSIS
43    
44 dpavlin 536 This module uses C<conf/normalize/*.pl> files to perform normalisation
45     from input records using perl functions which are specialized for set
46     processing.
47 dpavlin 10
48 dpavlin 536 Sets are implemented as arrays, and normalisation file is valid perl, which
49     means that you check it's validity before running WebPAC using
50     C<perl -c normalize.pl>.
51 dpavlin 15
52 dpavlin 536 Normalisation can generate multiple output normalized data. For now, supported output
53 dpavlin 540 types (on the left side of definition) are: C<tag>, C<display>, C<search> and
54 dpavlin 547 C<marc>.
55 dpavlin 15
56 dpavlin 10 =head1 FUNCTIONS
57    
58 dpavlin 538 Functions which start with C<_> are private and used by WebPAC internally.
59     All other functions are available for use within normalisation rules.
60    
61 dpavlin 536 =head2 data_structure
62 dpavlin 10
63 dpavlin 536 Return data structure
64 dpavlin 13
65 dpavlin 538 my $ds = WebPAC::Normalize::data_structure(
66 dpavlin 536 lookup => $lookup->lookup_hash,
67     row => $row,
68     rules => $normalize_pl_config,
69 dpavlin 541 marc_encoding => 'utf-8',
70 dpavlin 13 );
71    
72 dpavlin 540 Options C<lookup>, C<row>, C<rules> and C<log> are mandatory while all
73     other are optional.
74    
75 dpavlin 536 This function will B<die> if normalizastion can't be evaled.
76 dpavlin 15
77 dpavlin 538 Since this function isn't exported you have to call it with
78     C<WebPAC::Normalize::data_structure>.
79    
80 dpavlin 10 =cut
81    
82 dpavlin 536 sub data_structure {
83     my $arg = {@_};
84 dpavlin 13
85 dpavlin 536 die "need row argument" unless ($arg->{row});
86     die "need normalisation argument" unless ($arg->{rules});
87 dpavlin 31
88 dpavlin 536 no strict 'subs';
89 dpavlin 538 _set_lookup( $arg->{lookup} );
90     _set_rec( $arg->{row} );
91 dpavlin 541 _clean_ds( %{ $arg } );
92 dpavlin 536 eval "$arg->{rules}";
93     die "error evaling $arg->{rules}: $@\n" if ($@);
94 dpavlin 540
95 dpavlin 538 return _get_ds();
96 dpavlin 10 }
97    
98 dpavlin 538 =head2 _set_rec
99 dpavlin 13
100 dpavlin 536 Set current record hash
101 dpavlin 433
102 dpavlin 538 _set_rec( $rec );
103 dpavlin 433
104     =cut
105    
106 dpavlin 536 my $rec;
107 dpavlin 433
108 dpavlin 538 sub _set_rec {
109 dpavlin 536 $rec = shift or die "no record hash";
110 dpavlin 433 }
111    
112 dpavlin 538 =head2 _get_ds
113    
114     Return hash formatted as data structure
115    
116     my $ds = _get_ds();
117    
118     =cut
119    
120 dpavlin 547 my ($out,$marc_record, $marc_encoding, $marc_repeatable_subfield, $marc_indicators);
121 dpavlin 538
122     sub _get_ds {
123     return $out;
124     }
125    
126     =head2 _clean_ds
127    
128     Clean data structure hash for next record
129    
130     _clean_ds();
131    
132     =cut
133    
134     sub _clean_ds {
135 dpavlin 541 my $a = {@_};
136 dpavlin 550 ($out,$marc_record, $marc_encoding, $marc_repeatable_subfield, $marc_indicators) = ();
137 dpavlin 541 $marc_encoding = $a->{marc_encoding};
138 dpavlin 538 }
139    
140     =head2 _set_lookup
141    
142     Set current lookup hash
143    
144     _set_lookup( $lookup );
145    
146     =cut
147    
148     my $lookup;
149    
150     sub _set_lookup {
151     $lookup = shift;
152     }
153    
154 dpavlin 547 =head2 _get_marc_fields
155 dpavlin 540
156 dpavlin 547 Get all fields defined by calls to C<marc>
157 dpavlin 540
158 dpavlin 547 $marc->add_fields( WebPAC::Normalize:_get_marc_fields() );
159 dpavlin 540
160 dpavlin 543 We are using I<magic> which detect repeatable fields only from
161     sequence of field/subfield data generated by normalization.
162    
163 dpavlin 554 Repeatable field is created when there is second occurence of same subfield or
164     if any of indicators are different.
165 dpavlin 543
166 dpavlin 554 This is sane for most cases. Something like:
167 dpavlin 543
168 dpavlin 554 900a-1 900b-1 900c-1
169     900a-2 900b-2
170     900a-3
171    
172     will be created from any combination of:
173    
174     900a-1 900a-2 900a-3 900b-1 900b-2 900c-1
175    
176     and following rules:
177    
178     marc('900','a', rec('200','a') );
179     marc('900','b', rec('200','b') );
180     marc('900','c', rec('200','c') );
181    
182     which might not be what you have in mind. If you need repeatable subfield,
183     define it using C<marc_repeatable_subfield> like this:
184    
185     ....
186    
187 dpavlin 540 =cut
188    
189 dpavlin 547 sub _get_marc_fields {
190 dpavlin 550
191 dpavlin 551 return if (! $marc_record || ref($marc_record) ne 'ARRAY' || $#{ $marc_record } < 0);
192 dpavlin 550
193     # first, sort all existing fields
194     # XXX might not be needed, but modern perl might randomize elements in hash
195     my @sorted_marc_record = sort {
196     $a->[0] . $a->[3] cmp $b->[0] . $b->[3]
197     } @{ $marc_record };
198    
199 dpavlin 562 @sorted_marc_record = @{ $marc_record }; ### FIXME disable sorting
200    
201 dpavlin 550 # output marc fields
202 dpavlin 542 my @m;
203 dpavlin 550
204     # count unique field-subfields (used for offset when walking to next subfield)
205     my $u;
206     map { $u->{ $_->[0] . $_->[3] }++ } @sorted_marc_record;
207    
208     if ($debug) {
209     warn "## marc_repeatable_subfield ", dump( $marc_repeatable_subfield ), $/;
210     warn "## marc_record ", dump( $marc_record ), $/;
211     warn "## sorted_marc_record ", dump( \@sorted_marc_record ), $/;
212     warn "## subfield count ", dump( $u ), $/;
213     }
214    
215     my $len = $#sorted_marc_record;
216     my $visited;
217     my $i = 0;
218     my $field;
219    
220     foreach ( 0 .. $len ) {
221    
222     # find next element which isn't visited
223     while ($visited->{$i}) {
224     $i = ($i + 1) % ($len + 1);
225 dpavlin 542 }
226    
227 dpavlin 550 # mark it visited
228     $visited->{$i}++;
229    
230     my $row = $sorted_marc_record[$i];
231    
232     # field and subfield which is key for
233     # marc_repeatable_subfield and u
234     my $fsf = $row->[0] . $row->[3];
235    
236     if ($debug > 1) {
237    
238     print "### field so far [", $#$field, "] : ", dump( $field ), " ", $field ? 'T' : 'F', $/;
239     print "### this [$i]: ", dump( $row ),$/;
240     print "### sf: ", $row->[3], " vs ", $field->[3],
241     $marc_repeatable_subfield->{ $row->[0] . $row->[3] } ? ' (repeatable)' : '', $/,
242     if ($#$field >= 0);
243    
244     }
245    
246     # if field exists
247     if ( $#$field >= 0 ) {
248     if (
249     $row->[0] ne $field->[0] || # field
250     $row->[1] ne $field->[1] || # i1
251     $row->[2] ne $field->[2] # i2
252     ) {
253     push @m, $field;
254     warn "## saved/1 ", dump( $field ),$/ if ($debug);
255     $field = $row;
256    
257     } elsif (
258     ( $row->[3] lt $field->[-2] ) # subfield which is not next (e.g. a after c)
259     ||
260     ( $row->[3] eq $field->[-2] && # same subfield, but not repeatable
261     ! $marc_repeatable_subfield->{ $fsf }
262     )
263     ) {
264     push @m, $field;
265     warn "## saved/2 ", dump( $field ),$/ if ($debug);
266     $field = $row;
267    
268     } else {
269     # append new subfields to existing field
270     push @$field, ( $row->[3], $row->[4] );
271     }
272     } else {
273     # insert first field
274     $field = $row;
275     }
276    
277     if (! $marc_repeatable_subfield->{ $fsf }) {
278     # make step to next subfield
279     $i = ($i + $u->{ $fsf } ) % ($len + 1);
280     }
281 dpavlin 542 }
282    
283 dpavlin 550 if ($#$field >= 0) {
284     push @m, $field;
285     warn "## saved/3 ", dump( $field ),$/ if ($debug);
286     }
287 dpavlin 542
288     return @m;
289 dpavlin 540 }
290    
291 dpavlin 554 =head2 _debug
292    
293     Change level of debug warnings
294    
295     _debug( 2 );
296    
297     =cut
298    
299     sub _debug {
300     my $l = shift;
301     return $debug unless defined($l);
302 dpavlin 562 warn "debug level $l" if ($l > 0);
303 dpavlin 554 $debug = $l;
304     }
305    
306 dpavlin 540 =head1 Functions to create C<data_structure>
307    
308     Those functions generally have to first in your normalization file.
309    
310 dpavlin 536 =head2 tag
311 dpavlin 433
312 dpavlin 536 Define new tag for I<search> and I<display>.
313 dpavlin 433
314 dpavlin 536 tag('Title', rec('200','a') );
315 dpavlin 13
316    
317     =cut
318    
319 dpavlin 536 sub tag {
320     my $name = shift or die "tag needs name as first argument";
321     my @o = grep { defined($_) && $_ ne '' } @_;
322     return unless (@o);
323     $out->{$name}->{tag} = $name;
324     $out->{$name}->{search} = \@o;
325     $out->{$name}->{display} = \@o;
326     }
327 dpavlin 13
328 dpavlin 536 =head2 display
329 dpavlin 13
330 dpavlin 536 Define tag just for I<display>
331 dpavlin 125
332 dpavlin 536 @v = display('Title', rec('200','a') );
333 dpavlin 125
334 dpavlin 536 =cut
335 dpavlin 125
336 dpavlin 536 sub display {
337     my $name = shift or die "display needs name as first argument";
338     my @o = grep { defined($_) && $_ ne '' } @_;
339     return unless (@o);
340     $out->{$name}->{tag} = $name;
341     $out->{$name}->{display} = \@o;
342     }
343 dpavlin 13
344 dpavlin 536 =head2 search
345 dpavlin 13
346 dpavlin 536 Prepare values just for I<search>
347 dpavlin 13
348 dpavlin 536 @v = search('Title', rec('200','a') );
349 dpavlin 433
350 dpavlin 536 =cut
351 dpavlin 13
352 dpavlin 536 sub search {
353     my $name = shift or die "search needs name as first argument";
354     my @o = grep { defined($_) && $_ ne '' } @_;
355     return unless (@o);
356     $out->{$name}->{tag} = $name;
357     $out->{$name}->{search} = \@o;
358 dpavlin 13 }
359    
360 dpavlin 564 =head2 marc_leader
361    
362     Setup fields within MARC leader or get leader
363    
364     marc_leader('05','c');
365     my $leader = marc_leader();
366    
367     =cut
368    
369     sub marc_leader {
370     my ($offset,$value) = @_;
371    
372     if ($offset) {
373     $out->{' leader'}->{ $offset } = $value;
374     } else {
375     return $out->{' leader'};
376     }
377     }
378    
379 dpavlin 547 =head2 marc
380 dpavlin 540
381     Save value for MARC field
382    
383 dpavlin 547 marc('900','a', rec('200','a') );
384 dpavlin 540
385     =cut
386    
387 dpavlin 547 sub marc {
388     my $f = shift or die "marc needs field";
389     die "marc field must be numer" unless ($f =~ /^\d+$/);
390 dpavlin 540
391 dpavlin 547 my $sf = shift or die "marc needs subfield";
392 dpavlin 540
393 dpavlin 541 foreach (@_) {
394     my $v = $_; # make var read-write for Encode
395 dpavlin 543 next unless (defined($v) && $v !~ /^\s*$/);
396 dpavlin 541 from_to($v, 'iso-8859-2', $marc_encoding) if ($marc_encoding);
397 dpavlin 548 my ($i1,$i2) = defined($marc_indicators->{$f}) ? @{ $marc_indicators->{$f} } : (' ',' ');
398     push @{ $marc_record }, [ $f, $i1, $i2, $sf => $v ];
399 dpavlin 540 }
400     }
401    
402 dpavlin 547 =head2 marc_repeatable_subfield
403    
404     Save values for MARC repetable subfield
405    
406     marc_repeatable_subfield('910', 'z', rec('909') );
407    
408     =cut
409    
410     sub marc_repeatable_subfield {
411 dpavlin 550 my ($f,$sf) = @_;
412     die "marc_repeatable_subfield need field and subfield!\n" unless ($f && $sf);
413     $marc_repeatable_subfield->{ $f . $sf }++;
414 dpavlin 547 marc(@_);
415     }
416    
417     =head2 marc_indicators
418    
419     Set both indicators for MARC field
420    
421     marc_indicators('900', ' ', 1);
422    
423     Any indicator value other than C<0-9> will be treated as undefined.
424    
425     =cut
426    
427     sub marc_indicators {
428     my $f = shift || die "marc_indicators need field!\n";
429     my ($i1,$i2) = @_;
430     die "marc_indicators($f, ...) need i1!\n" unless(defined($i1));
431     die "marc_indicators($f, $i1, ...) need i2!\n" unless(defined($i2));
432    
433     $i1 = ' ' if ($i1 !~ /^\d$/);
434     $i2 = ' ' if ($i2 !~ /^\d$/);
435 dpavlin 548 @{ $marc_indicators->{$f} } = ($i1,$i2);
436 dpavlin 547 }
437    
438 dpavlin 562 =head2 marc_compose
439 dpavlin 547
440 dpavlin 562 Save values for each MARC subfield explicitly
441    
442     marc_compose('900',
443     'a', rec('200','a')
444     'b', rec('201','a')
445     'a', rec('200','b')
446     'c', rec('200','c')
447     );
448    
449     =cut
450    
451     sub marc_compose {
452     my $f = shift or die "marc_compose needs field";
453     die "marc_compose field must be numer" unless ($f =~ /^\d+$/);
454    
455     my ($i1,$i2) = defined($marc_indicators->{$f}) ? @{ $marc_indicators->{$f} } : (' ',' ');
456     my $m = [ $f, $i1, $i2 ];
457    
458     while (@_) {
459     my $sf = shift or die "marc_compose $f needs subfield";
460 dpavlin 565 my $v = shift;
461 dpavlin 562
462     next unless (defined($v) && $v !~ /^\s*$/);
463     from_to($v, 'iso-8859-2', $marc_encoding) if ($marc_encoding);
464     push @$m, ( $sf, $v );
465     warn "## ++ marc_compose($f,$sf,$v) ", dump( $m ) if ($debug > 1);
466     }
467    
468     warn "## marc_compose(d) ", dump( $m ) if ($debug > 1);
469    
470 dpavlin 565 push @{ $marc_record }, $m if ($#{$m} > 2);
471 dpavlin 562 }
472    
473    
474 dpavlin 540 =head1 Functions to extract data from input
475    
476     This function should be used inside functions to create C<data_structure> described
477     above.
478    
479 dpavlin 536 =head2 rec1
480 dpavlin 371
481 dpavlin 536 Return all values in some field
482 dpavlin 371
483 dpavlin 536 @v = rec1('200')
484 dpavlin 15
485 dpavlin 536 TODO: order of values is probably same as in source data, need to investigate that
486 dpavlin 15
487 dpavlin 536 =cut
488 dpavlin 15
489 dpavlin 536 sub rec1 {
490     my $f = shift;
491     return unless (defined($rec) && defined($rec->{$f}));
492     if (ref($rec->{$f}) eq 'ARRAY') {
493     return map {
494     if (ref($_) eq 'HASH') {
495     values %{$_};
496 dpavlin 31 } else {
497 dpavlin 536 $_;
498 dpavlin 31 }
499 dpavlin 536 } @{ $rec->{$f} };
500     } elsif( defined($rec->{$f}) ) {
501     return $rec->{$f};
502 dpavlin 15 }
503     }
504    
505 dpavlin 536 =head2 rec2
506 dpavlin 15
507 dpavlin 536 Return all values in specific field and subfield
508 dpavlin 13
509 dpavlin 536 @v = rec2('200','a')
510 dpavlin 13
511     =cut
512    
513 dpavlin 536 sub rec2 {
514     my $f = shift;
515     return unless (defined($rec && $rec->{$f}));
516     my $sf = shift;
517     return map { $_->{$sf} } grep { ref($_) eq 'HASH' && $_->{$sf} } @{ $rec->{$f} };
518     }
519 dpavlin 13
520 dpavlin 536 =head2 rec
521 dpavlin 13
522 dpavlin 536 syntaxtic sugar for
523 dpavlin 13
524 dpavlin 536 @v = rec('200')
525     @v = rec('200','a')
526 dpavlin 13
527 dpavlin 536 =cut
528 dpavlin 373
529 dpavlin 536 sub rec {
530     if ($#_ == 0) {
531     return rec1(@_);
532     } elsif ($#_ == 1) {
533     return rec2(@_);
534 dpavlin 13 }
535     }
536    
537 dpavlin 536 =head2 regex
538 dpavlin 15
539 dpavlin 536 Apply regex to some or all values
540 dpavlin 15
541 dpavlin 536 @v = regex( 's/foo/bar/g', @v );
542 dpavlin 15
543     =cut
544    
545 dpavlin 536 sub regex {
546     my $r = shift;
547     my @out;
548 dpavlin 550 #warn "r: $r\n", dump(\@_);
549 dpavlin 536 foreach my $t (@_) {
550     next unless ($t);
551     eval "\$t =~ $r";
552     push @out, $t if ($t && $t ne '');
553 dpavlin 368 }
554 dpavlin 536 return @out;
555 dpavlin 15 }
556    
557 dpavlin 536 =head2 prefix
558 dpavlin 15
559 dpavlin 536 Prefix all values with a string
560 dpavlin 15
561 dpavlin 536 @v = prefix( 'my_', @v );
562 dpavlin 15
563     =cut
564    
565 dpavlin 536 sub prefix {
566     my $p = shift or die "prefix needs string as first argument";
567     return map { $p . $_ } grep { defined($_) } @_;
568     }
569 dpavlin 15
570 dpavlin 536 =head2 suffix
571 dpavlin 15
572 dpavlin 536 suffix all values with a string
573 dpavlin 15
574 dpavlin 536 @v = suffix( '_my', @v );
575 dpavlin 15
576 dpavlin 536 =cut
577 dpavlin 15
578 dpavlin 536 sub suffix {
579     my $s = shift or die "suffix needs string as first argument";
580     return map { $_ . $s } grep { defined($_) } @_;
581 dpavlin 15 }
582    
583 dpavlin 536 =head2 surround
584 dpavlin 13
585 dpavlin 536 surround all values with a two strings
586 dpavlin 13
587 dpavlin 536 @v = surround( 'prefix_', '_suffix', @v );
588 dpavlin 13
589     =cut
590    
591 dpavlin 536 sub surround {
592     my $p = shift or die "surround need prefix as first argument";
593     my $s = shift or die "surround needs suffix as second argument";
594     return map { $p . $_ . $s } grep { defined($_) } @_;
595 dpavlin 13 }
596    
597 dpavlin 536 =head2 first
598 dpavlin 13
599 dpavlin 536 Return first element
600 dpavlin 15
601 dpavlin 536 $v = first( @v );
602 dpavlin 13
603     =cut
604    
605 dpavlin 536 sub first {
606     my $r = shift;
607     return $r;
608 dpavlin 13 }
609    
610 dpavlin 536 =head2 lookup
611 dpavlin 13
612 dpavlin 536 Consult lookup hashes for some value
613 dpavlin 13
614 dpavlin 536 @v = lookup( $v );
615     @v = lookup( @v );
616 dpavlin 13
617     =cut
618    
619 dpavlin 536 sub lookup {
620     my $k = shift or return;
621     return unless (defined($lookup->{$k}));
622     if (ref($lookup->{$k}) eq 'ARRAY') {
623     return @{ $lookup->{$k} };
624     } else {
625     return $lookup->{$k};
626     }
627 dpavlin 13 }
628    
629 dpavlin 536 =head2 join_with
630 dpavlin 13
631 dpavlin 536 Joins walues with some delimiter
632 dpavlin 10
633 dpavlin 536 $v = join_with(", ", @v);
634 dpavlin 10
635 dpavlin 536 =cut
636 dpavlin 10
637 dpavlin 536 sub join_with {
638     my $d = shift;
639     return join($d, grep { defined($_) && $_ ne '' } @_);
640     }
641 dpavlin 10
642 dpavlin 562 =head2 split_rec_on
643    
644     Split record subfield on some regex and take one of parts out
645    
646     $a_before_semi_column =
647     split_rec_on('200','a', /\s*;\s*/, $part);
648    
649     C<$part> is optional number of element. First element is
650     B<1>, not 0!
651    
652     If there is no C<$part> parameter or C<$part> is 0, this function will
653     return all values produced by splitting.
654    
655     =cut
656    
657     sub split_rec_on {
658     die "split_rec_on need (fld,sf,regex[,part]" if ($#_ < 2);
659    
660     my ($fld, $sf, $regex, $part) = @_;
661     warn "### regex ", ref($regex), $regex if ($debug > 2);
662    
663     my @r = rec( $fld, $sf );
664     my $v = shift @r;
665     warn "### first rec($fld,$sf) = ",dump($v) if ($debug > 2);
666    
667 dpavlin 566 return '' if( ! defined($v) || $v =~ /^\s*$/);
668    
669 dpavlin 562 my @s = split( $regex, $v );
670     warn "## split_rec_on($fld,$sf,$regex,$part) = ",dump(@s) if ($debug > 1);
671 dpavlin 566 if ($part && $part > 0) {
672 dpavlin 562 return $s[ $part - 1 ];
673     } else {
674 dpavlin 566 return [ @s ];
675 dpavlin 562 }
676     }
677    
678 dpavlin 536 # END
679     1;

  ViewVC Help
Powered by ViewVC 1.1.26