/[webpac2]/trunk/lib/WebPAC/Normalize.pm
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Contents of /trunk/lib/WebPAC/Normalize.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 583 - (show annotations)
Wed Jul 5 00:12:08 2006 UTC (17 years, 8 months ago) by dpavlin
File size: 17538 byte(s)
rec and join_with now return '' if there are no results, so they are safe to
use inside marc_compose

1 package WebPAC::Normalize;
2 use Exporter 'import';
3 @EXPORT = qw/
4 _set_rec _set_lookup
5 _get_ds _clean_ds
6 _debug
7
8 tag search display
9 marc marc_indicators marc_repeatable_subfield
10 marc_compose marc_leader
11 marc_duplicate marc_remove
12
13 rec1 rec2 rec
14 regex prefix suffix surround
15 first lookup join_with
16
17 split_rec_on
18 /;
19
20 use warnings;
21 use strict;
22
23 #use base qw/WebPAC::Common/;
24 use Data::Dump qw/dump/;
25 use Encode qw/from_to/;
26 use Storable qw/dclone/;
27
28 # debugging warn(s)
29 my $debug = 0;
30
31
32 =head1 NAME
33
34 WebPAC::Normalize - describe normalisaton rules using sets
35
36 =head1 VERSION
37
38 Version 0.11
39
40 =cut
41
42 our $VERSION = '0.11';
43
44 =head1 SYNOPSIS
45
46 This module uses C<conf/normalize/*.pl> files to perform normalisation
47 from input records using perl functions which are specialized for set
48 processing.
49
50 Sets are implemented as arrays, and normalisation file is valid perl, which
51 means that you check it's validity before running WebPAC using
52 C<perl -c normalize.pl>.
53
54 Normalisation can generate multiple output normalized data. For now, supported output
55 types (on the left side of definition) are: C<tag>, C<display>, C<search> and
56 C<marc>.
57
58 =head1 FUNCTIONS
59
60 Functions which start with C<_> are private and used by WebPAC internally.
61 All other functions are available for use within normalisation rules.
62
63 =head2 data_structure
64
65 Return data structure
66
67 my $ds = WebPAC::Normalize::data_structure(
68 lookup => $lookup->lookup_hash,
69 row => $row,
70 rules => $normalize_pl_config,
71 marc_encoding => 'utf-8',
72 );
73
74 Options C<lookup>, C<row>, C<rules> and C<log> are mandatory while all
75 other are optional.
76
77 This function will B<die> if normalizastion can't be evaled.
78
79 Since this function isn't exported you have to call it with
80 C<WebPAC::Normalize::data_structure>.
81
82 =cut
83
84 sub data_structure {
85 my $arg = {@_};
86
87 die "need row argument" unless ($arg->{row});
88 die "need normalisation argument" unless ($arg->{rules});
89
90 no strict 'subs';
91 _set_lookup( $arg->{lookup} );
92 _set_rec( $arg->{row} );
93 _clean_ds( %{ $arg } );
94 eval "$arg->{rules}";
95 die "error evaling $arg->{rules}: $@\n" if ($@);
96
97 return _get_ds();
98 }
99
100 =head2 _set_rec
101
102 Set current record hash
103
104 _set_rec( $rec );
105
106 =cut
107
108 my $rec;
109
110 sub _set_rec {
111 $rec = shift or die "no record hash";
112 }
113
114 =head2 _get_ds
115
116 Return hash formatted as data structure
117
118 my $ds = _get_ds();
119
120 =cut
121
122 my ($out, $marc_record, $marc_encoding, $marc_repeatable_subfield, $marc_indicators);
123 my ($marc_record_offset, $marc_fetch_offset) = (0, 0);
124
125 sub _get_ds {
126 return $out;
127 }
128
129 =head2 _clean_ds
130
131 Clean data structure hash for next record
132
133 _clean_ds();
134
135 =cut
136
137 sub _clean_ds {
138 my $a = {@_};
139 ($out,$marc_record, $marc_encoding, $marc_repeatable_subfield, $marc_indicators) = ();
140 ($marc_record_offset, $marc_fetch_offset) = (0,0);
141 $marc_encoding = $a->{marc_encoding};
142 }
143
144 =head2 _set_lookup
145
146 Set current lookup hash
147
148 _set_lookup( $lookup );
149
150 =cut
151
152 my $lookup;
153
154 sub _set_lookup {
155 $lookup = shift;
156 }
157
158 =head2 _get_marc_fields
159
160 Get all fields defined by calls to C<marc>
161
162 $marc->add_fields( WebPAC::Normalize:_get_marc_fields() );
163
164 We are using I<magic> which detect repeatable fields only from
165 sequence of field/subfield data generated by normalization.
166
167 Repeatable field is created when there is second occurence of same subfield or
168 if any of indicators are different.
169
170 This is sane for most cases. Something like:
171
172 900a-1 900b-1 900c-1
173 900a-2 900b-2
174 900a-3
175
176 will be created from any combination of:
177
178 900a-1 900a-2 900a-3 900b-1 900b-2 900c-1
179
180 and following rules:
181
182 marc('900','a', rec('200','a') );
183 marc('900','b', rec('200','b') );
184 marc('900','c', rec('200','c') );
185
186 which might not be what you have in mind. If you need repeatable subfield,
187 define it using C<marc_repeatable_subfield> like this:
188
189 marc_repeatable_subfield('900','a');
190 marc('900','a', rec('200','a') );
191 marc('900','b', rec('200','b') );
192 marc('900','c', rec('200','c') );
193
194 will create:
195
196 900a-1 900a-2 900a-3 900b-1 900c-1
197 900b-2
198
199 There is also support for returning next or specific using:
200
201 while (my $mf = WebPAC::Normalize:_get_marc_fields( fetch_next => 1 ) ) {
202 # do something with $mf
203 }
204
205 will always return fields from next MARC record or
206
207 my $mf = WebPAC::Normalize::_get_marc_fields( offset => 42 );
208
209 will return 42th copy record (if it exists).
210
211 =cut
212
213 sub _get_marc_fields {
214
215 my $arg = {@_};
216 warn "### _get_marc_fields arg: ", dump($arg), $/ if ($debug > 2);
217 my $offset = $marc_fetch_offset;
218 if ($arg->{offset}) {
219 $offset = $arg->{offset};
220 } elsif($arg->{fetch_next}) {
221 $marc_fetch_offset++;
222 }
223
224 return if (! $marc_record || ref($marc_record) ne 'ARRAY');
225
226 warn "### full marc_record = ", dump( @{ $marc_record }), $/ if ($debug > 2);
227
228 my $marc_rec = $marc_record->[ $offset ];
229
230 warn "## _get_marc_fields (at offset: $offset) -- marc_record = ", dump( @$marc_rec ), $/ if ($debug > 1);
231
232 return if (! $marc_rec || ref($marc_rec) ne 'ARRAY' || $#{ $marc_rec } < 0);
233
234 # first, sort all existing fields
235 # XXX might not be needed, but modern perl might randomize elements in hash
236 my @sorted_marc_record = sort {
237 $a->[0] . ( $a->[3] || '' ) cmp $b->[0] . ( $b->[3] || '')
238 } @{ $marc_rec };
239
240 @sorted_marc_record = @{ $marc_rec }; ### FIXME disable sorting
241
242 # output marc fields
243 my @m;
244
245 # count unique field-subfields (used for offset when walking to next subfield)
246 my $u;
247 map { $u->{ $_->[0] . ( $_->[3] || '') }++ } @sorted_marc_record;
248
249 if ($debug) {
250 warn "## marc_repeatable_subfield = ", dump( $marc_repeatable_subfield ), $/ if ( $marc_repeatable_subfield );
251 warn "## marc_record[$offset] = ", dump( $marc_rec ), $/;
252 warn "## sorted_marc_record = ", dump( \@sorted_marc_record ), $/;
253 warn "## subfield count = ", dump( $u ), $/;
254 }
255
256 my $len = $#sorted_marc_record;
257 my $visited;
258 my $i = 0;
259 my $field;
260
261 foreach ( 0 .. $len ) {
262
263 # find next element which isn't visited
264 while ($visited->{$i}) {
265 $i = ($i + 1) % ($len + 1);
266 }
267
268 # mark it visited
269 $visited->{$i}++;
270
271 my $row = dclone( $sorted_marc_record[$i] );
272
273 # field and subfield which is key for
274 # marc_repeatable_subfield and u
275 my $fsf = $row->[0] . ( $row->[3] || '' );
276
277 if ($debug > 1) {
278
279 print "### field so far [", $#$field, "] : ", dump( $field ), " ", $field ? 'T' : 'F', $/;
280 print "### this [$i]: ", dump( $row ),$/;
281 print "### sf: ", $row->[3], " vs ", $field->[3],
282 $marc_repeatable_subfield->{ $row->[0] . $row->[3] } ? ' (repeatable)' : '', $/,
283 if ($#$field >= 0);
284
285 }
286
287 # if field exists
288 if ( $#$field >= 0 ) {
289 if (
290 $row->[0] ne $field->[0] || # field
291 $row->[1] ne $field->[1] || # i1
292 $row->[2] ne $field->[2] # i2
293 ) {
294 push @m, $field;
295 warn "## saved/1 ", dump( $field ),$/ if ($debug);
296 $field = $row;
297
298 } elsif (
299 ( $row->[3] lt $field->[-2] ) # subfield which is not next (e.g. a after c)
300 ||
301 ( $row->[3] eq $field->[-2] && # same subfield, but not repeatable
302 ! $marc_repeatable_subfield->{ $fsf }
303 )
304 ) {
305 push @m, $field;
306 warn "## saved/2 ", dump( $field ),$/ if ($debug);
307 $field = $row;
308
309 } else {
310 # append new subfields to existing field
311 push @$field, ( $row->[3], $row->[4] );
312 }
313 } else {
314 # insert first field
315 $field = $row;
316 }
317
318 if (! $marc_repeatable_subfield->{ $fsf }) {
319 # make step to next subfield
320 $i = ($i + $u->{ $fsf } ) % ($len + 1);
321 }
322 }
323
324 if ($#$field >= 0) {
325 push @m, $field;
326 warn "## saved/3 ", dump( $field ),$/ if ($debug);
327 }
328
329 return \@m;
330 }
331
332 =head2 _debug
333
334 Change level of debug warnings
335
336 _debug( 2 );
337
338 =cut
339
340 sub _debug {
341 my $l = shift;
342 return $debug unless defined($l);
343 warn "debug level $l",$/ if ($l > 0);
344 $debug = $l;
345 }
346
347 =head1 Functions to create C<data_structure>
348
349 Those functions generally have to first in your normalization file.
350
351 =head2 tag
352
353 Define new tag for I<search> and I<display>.
354
355 tag('Title', rec('200','a') );
356
357
358 =cut
359
360 sub tag {
361 my $name = shift or die "tag needs name as first argument";
362 my @o = grep { defined($_) && $_ ne '' } @_;
363 return unless (@o);
364 $out->{$name}->{tag} = $name;
365 $out->{$name}->{search} = \@o;
366 $out->{$name}->{display} = \@o;
367 }
368
369 =head2 display
370
371 Define tag just for I<display>
372
373 @v = display('Title', rec('200','a') );
374
375 =cut
376
377 sub display {
378 my $name = shift or die "display needs name as first argument";
379 my @o = grep { defined($_) && $_ ne '' } @_;
380 return unless (@o);
381 $out->{$name}->{tag} = $name;
382 $out->{$name}->{display} = \@o;
383 }
384
385 =head2 search
386
387 Prepare values just for I<search>
388
389 @v = search('Title', rec('200','a') );
390
391 =cut
392
393 sub search {
394 my $name = shift or die "search needs name as first argument";
395 my @o = grep { defined($_) && $_ ne '' } @_;
396 return unless (@o);
397 $out->{$name}->{tag} = $name;
398 $out->{$name}->{search} = \@o;
399 }
400
401 =head2 marc_leader
402
403 Setup fields within MARC leader or get leader
404
405 marc_leader('05','c');
406 my $leader = marc_leader();
407
408 =cut
409
410 sub marc_leader {
411 my ($offset,$value) = @_;
412
413 if ($offset) {
414 $out->{' leader'}->{ $offset } = $value;
415 } else {
416 return $out->{' leader'};
417 }
418 }
419
420 =head2 marc
421
422 Save value for MARC field
423
424 marc('900','a', rec('200','a') );
425 marc('001', rec('000') );
426
427 =cut
428
429 sub marc {
430 my $f = shift or die "marc needs field";
431 die "marc field must be numer" unless ($f =~ /^\d+$/);
432
433 my $sf;
434 if ($f >= 10) {
435 $sf = shift or die "marc needs subfield";
436 }
437
438 foreach (@_) {
439 my $v = $_; # make var read-write for Encode
440 next unless (defined($v) && $v !~ /^\s*$/);
441 from_to($v, 'iso-8859-2', $marc_encoding) if ($marc_encoding);
442 my ($i1,$i2) = defined($marc_indicators->{$f}) ? @{ $marc_indicators->{$f} } : (' ',' ');
443 if (defined $sf) {
444 push @{ $marc_record->[ $marc_record_offset ] }, [ $f, $i1, $i2, $sf => $v ];
445 } else {
446 push @{ $marc_record->[ $marc_record_offset ] }, [ $f, $v ];
447 }
448 }
449 }
450
451 =head2 marc_repeatable_subfield
452
453 Save values for MARC repetable subfield
454
455 marc_repeatable_subfield('910', 'z', rec('909') );
456
457 =cut
458
459 sub marc_repeatable_subfield {
460 my ($f,$sf) = @_;
461 die "marc_repeatable_subfield need field and subfield!\n" unless ($f && $sf);
462 $marc_repeatable_subfield->{ $f . $sf }++;
463 marc(@_);
464 }
465
466 =head2 marc_indicators
467
468 Set both indicators for MARC field
469
470 marc_indicators('900', ' ', 1);
471
472 Any indicator value other than C<0-9> will be treated as undefined.
473
474 =cut
475
476 sub marc_indicators {
477 my $f = shift || die "marc_indicators need field!\n";
478 my ($i1,$i2) = @_;
479 die "marc_indicators($f, ...) need i1!\n" unless(defined($i1));
480 die "marc_indicators($f, $i1, ...) need i2!\n" unless(defined($i2));
481
482 $i1 = ' ' if ($i1 !~ /^\d$/);
483 $i2 = ' ' if ($i2 !~ /^\d$/);
484 @{ $marc_indicators->{$f} } = ($i1,$i2);
485 }
486
487 =head2 marc_compose
488
489 Save values for each MARC subfield explicitly
490
491 marc_compose('900',
492 'a', rec('200','a')
493 'b', rec('201','a')
494 'a', rec('200','b')
495 'c', rec('200','c')
496 );
497
498 =cut
499
500 sub marc_compose {
501 my $f = shift or die "marc_compose needs field";
502 die "marc_compose field must be numer" unless ($f =~ /^\d+$/);
503
504 my ($i1,$i2) = defined($marc_indicators->{$f}) ? @{ $marc_indicators->{$f} } : (' ',' ');
505 my $m = [ $f, $i1, $i2 ];
506
507 warn "### marc_compose input subfields = ", dump(@_),$/ if ($debug > 2);
508
509 while (@_) {
510 my $sf = shift or die "marc_compose $f needs subfield";
511 my $v = shift;
512
513 next unless (defined($v) && $v !~ /^\s*$/);
514 from_to($v, 'iso-8859-2', $marc_encoding) if ($marc_encoding);
515 push @$m, ( $sf, $v );
516 warn "## ++ marc_compose($f,$sf,$v) ", dump( $m ),$/ if ($debug > 1);
517 }
518
519 warn "## marc_compose(d) ", dump( $m ),$/ if ($debug > 1);
520
521 push @{ $marc_record->[ $marc_record_offset ] }, $m if ($#{$m} > 2);
522 }
523
524 =head2 marc_duplicate
525
526 Generate copy of current MARC record and continue working on copy
527
528 marc_duplicate();
529
530 Copies can be accessed using C<< _get_marc_fields( fetch_next => 1 ) >> or
531 C<< _get_marc_fields( offset => 42 ) >>.
532
533 =cut
534
535 sub marc_duplicate {
536 my $m = $marc_record->[ -1 ];
537 die "can't duplicate record which isn't defined" unless ($m);
538 push @{ $marc_record }, dclone( $m );
539 warn "## marc_duplicate = ", dump(@$marc_record), $/ if ($debug > 1);
540 $marc_record_offset = $#{ $marc_record };
541 warn "## marc_record_offset = $marc_record_offset", $/ if ($debug > 1);
542 }
543
544 =head2 marc_remove
545
546 Remove some field or subfield from MARC record.
547
548 marc_remove('200');
549 marc_remove('200','a');
550
551 This will erase field C<200> or C<200^a> from current MARC record.
552
553 This is useful after calling C<marc_duplicate> or on it's own (but, you
554 should probably just remove that subfield definition if you are not
555 using C<marc_duplicate>).
556
557 FIXME: support fields < 10.
558
559 =cut
560
561 sub marc_remove {
562 my ($f, $sf) = @_;
563
564 die "marc_remove needs record number" unless defined($f);
565
566 my $marc = $marc_record->[ $marc_record_offset ];
567
568 warn "### marc_remove before = ", dump( $marc ), $/ if ($debug > 2);
569
570 my $i = 0;
571 foreach ( 0 .. $#{ $marc } ) {
572 last unless (defined $marc->[$i]);
573 warn "#### working on ",dump( @{ $marc->[$i] }), $/ if ($debug > 3);
574 if ($marc->[$i]->[0] eq $f) {
575 if (! defined $sf) {
576 # remove whole field
577 splice @$marc, $i, 1;
578 warn "#### slice \@\$marc, $i, 1 = ",dump( @{ $marc }), $/ if ($debug > 3);
579 $i--;
580 } else {
581 foreach my $j ( 0 .. (( $#{ $marc->[$i] } - 3 ) / 2) ) {
582 my $o = ($j * 2) + 3;
583 if ($marc->[$i]->[$o] eq $sf) {
584 # remove subfield
585 splice @{$marc->[$i]}, $o, 2;
586 warn "#### slice \@{\$marc->[$i]}, $o, 2 = ", dump( @{ $marc }), $/ if ($debug > 3);
587 # is record now empty?
588 if ($#{ $marc->[$i] } == 2) {
589 splice @$marc, $i, 1;
590 warn "#### slice \@\$marc, $i, 1 = ", dump( @{ $marc }), $/ if ($debug > 3);
591 $i--;
592 };
593 }
594 }
595 }
596 }
597 $i++;
598 }
599
600 warn "### marc_remove($f", $sf ? ",$sf" : "", ") after = ", dump( $marc ), $/ if ($debug > 2);
601
602 $marc_record->[ $marc_record_offset ] = $marc;
603
604 warn "## full marc_record = ", dump( @{ $marc_record }), $/ if ($debug > 1);
605 }
606
607 =head1 Functions to extract data from input
608
609 This function should be used inside functions to create C<data_structure> described
610 above.
611
612 =head2 rec1
613
614 Return all values in some field
615
616 @v = rec1('200')
617
618 TODO: order of values is probably same as in source data, need to investigate that
619
620 =cut
621
622 sub rec1 {
623 my $f = shift;
624 warn "rec1($f) = ", dump( $rec->{$f} ), $/ if ($debug > 1);
625 return unless (defined($rec) && defined($rec->{$f}));
626 warn "rec1($f) = ", dump( $rec->{$f} ), $/ if ($debug > 1);
627 if (ref($rec->{$f}) eq 'ARRAY') {
628 return map {
629 if (ref($_) eq 'HASH') {
630 values %{$_};
631 } else {
632 $_;
633 }
634 } @{ $rec->{$f} };
635 } elsif( defined($rec->{$f}) ) {
636 return $rec->{$f};
637 }
638 }
639
640 =head2 rec2
641
642 Return all values in specific field and subfield
643
644 @v = rec2('200','a')
645
646 =cut
647
648 sub rec2 {
649 my $f = shift;
650 return unless (defined($rec && $rec->{$f}));
651 my $sf = shift;
652 return map { $_->{$sf} } grep { ref($_) eq 'HASH' && $_->{$sf} } @{ $rec->{$f} };
653 }
654
655 =head2 rec
656
657 syntaxtic sugar for
658
659 @v = rec('200')
660 @v = rec('200','a')
661
662 =cut
663
664 sub rec {
665 my @out;
666 if ($#_ == 0) {
667 @out = rec1(@_);
668 } elsif ($#_ == 1) {
669 @out = rec2(@_);
670 }
671 if (@out) {
672 return @out;
673 } else {
674 return '';
675 }
676 }
677
678 =head2 regex
679
680 Apply regex to some or all values
681
682 @v = regex( 's/foo/bar/g', @v );
683
684 =cut
685
686 sub regex {
687 my $r = shift;
688 my @out;
689 #warn "r: $r\n", dump(\@_);
690 foreach my $t (@_) {
691 next unless ($t);
692 eval "\$t =~ $r";
693 push @out, $t if ($t && $t ne '');
694 }
695 return @out;
696 }
697
698 =head2 prefix
699
700 Prefix all values with a string
701
702 @v = prefix( 'my_', @v );
703
704 =cut
705
706 sub prefix {
707 my $p = shift or die "prefix needs string as first argument";
708 return map { $p . $_ } grep { defined($_) } @_;
709 }
710
711 =head2 suffix
712
713 suffix all values with a string
714
715 @v = suffix( '_my', @v );
716
717 =cut
718
719 sub suffix {
720 my $s = shift or die "suffix needs string as first argument";
721 return map { $_ . $s } grep { defined($_) } @_;
722 }
723
724 =head2 surround
725
726 surround all values with a two strings
727
728 @v = surround( 'prefix_', '_suffix', @v );
729
730 =cut
731
732 sub surround {
733 my $p = shift or die "surround need prefix as first argument";
734 my $s = shift or die "surround needs suffix as second argument";
735 return map { $p . $_ . $s } grep { defined($_) } @_;
736 }
737
738 =head2 first
739
740 Return first element
741
742 $v = first( @v );
743
744 =cut
745
746 sub first {
747 my $r = shift;
748 return $r;
749 }
750
751 =head2 lookup
752
753 Consult lookup hashes for some value
754
755 @v = lookup( $v );
756 @v = lookup( @v );
757
758 =cut
759
760 sub lookup {
761 my $k = shift or return;
762 return unless (defined($lookup->{$k}));
763 if (ref($lookup->{$k}) eq 'ARRAY') {
764 return @{ $lookup->{$k} };
765 } else {
766 return $lookup->{$k};
767 }
768 }
769
770 =head2 join_with
771
772 Joins walues with some delimiter
773
774 $v = join_with(", ", @v);
775
776 =cut
777
778 sub join_with {
779 my $d = shift;
780 my $v = join($d, grep { defined($_) && $_ ne '' } @_);
781 return '' unless defined($v);
782 return $v;
783 }
784
785 =head2 split_rec_on
786
787 Split record subfield on some regex and take one of parts out
788
789 $a_before_semi_column =
790 split_rec_on('200','a', /\s*;\s*/, $part);
791
792 C<$part> is optional number of element. First element is
793 B<1>, not 0!
794
795 If there is no C<$part> parameter or C<$part> is 0, this function will
796 return all values produced by splitting.
797
798 =cut
799
800 sub split_rec_on {
801 die "split_rec_on need (fld,sf,regex[,part]" if ($#_ < 2);
802
803 my ($fld, $sf, $regex, $part) = @_;
804 warn "### regex ", ref($regex), $regex, $/ if ($debug > 2);
805
806 my @r = rec( $fld, $sf );
807 my $v = shift @r;
808 warn "### first rec($fld,$sf) = ",dump($v),$/ if ($debug > 2);
809
810 return '' if( ! defined($v) || $v =~ /^\s*$/);
811
812 my @s = split( $regex, $v );
813 warn "## split_rec_on($fld,$sf,$regex,$part) = ",dump(@s),$/ if ($debug > 1);
814 if ($part && $part > 0) {
815 return $s[ $part - 1 ];
816 } else {
817 return @s;
818 }
819 }
820
821 # END
822 1;

  ViewVC Help
Powered by ViewVC 1.1.26