/[webpac2]/trunk/lib/WebPAC/Normalize.pm
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Contents of /trunk/lib/WebPAC/Normalize.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 579 - (show annotations)
Tue Jul 4 11:08:43 2006 UTC (17 years, 9 months ago) by dpavlin
File size: 17355 byte(s)
 r798@llin:  dpavlin | 2006-07-04 13:08:44 +0200
 changed _get_marc_fields to return arrayref, tests and fix for marc_remove(field)

1 package WebPAC::Normalize;
2 use Exporter 'import';
3 @EXPORT = qw/
4 _set_rec _set_lookup
5 _get_ds _clean_ds
6 _debug
7
8 tag search display
9 marc marc_indicators marc_repeatable_subfield
10 marc_compose marc_leader
11 marc_duplicate marc_remove
12
13 rec1 rec2 rec
14 regex prefix suffix surround
15 first lookup join_with
16
17 split_rec_on
18 /;
19
20 use warnings;
21 use strict;
22
23 #use base qw/WebPAC::Common/;
24 use Data::Dump qw/dump/;
25 use Encode qw/from_to/;
26 use Storable qw/dclone/;
27
28 # debugging warn(s)
29 my $debug = 0;
30
31
32 =head1 NAME
33
34 WebPAC::Normalize - describe normalisaton rules using sets
35
36 =head1 VERSION
37
38 Version 0.11
39
40 =cut
41
42 our $VERSION = '0.11';
43
44 =head1 SYNOPSIS
45
46 This module uses C<conf/normalize/*.pl> files to perform normalisation
47 from input records using perl functions which are specialized for set
48 processing.
49
50 Sets are implemented as arrays, and normalisation file is valid perl, which
51 means that you check it's validity before running WebPAC using
52 C<perl -c normalize.pl>.
53
54 Normalisation can generate multiple output normalized data. For now, supported output
55 types (on the left side of definition) are: C<tag>, C<display>, C<search> and
56 C<marc>.
57
58 =head1 FUNCTIONS
59
60 Functions which start with C<_> are private and used by WebPAC internally.
61 All other functions are available for use within normalisation rules.
62
63 =head2 data_structure
64
65 Return data structure
66
67 my $ds = WebPAC::Normalize::data_structure(
68 lookup => $lookup->lookup_hash,
69 row => $row,
70 rules => $normalize_pl_config,
71 marc_encoding => 'utf-8',
72 );
73
74 Options C<lookup>, C<row>, C<rules> and C<log> are mandatory while all
75 other are optional.
76
77 This function will B<die> if normalizastion can't be evaled.
78
79 Since this function isn't exported you have to call it with
80 C<WebPAC::Normalize::data_structure>.
81
82 =cut
83
84 sub data_structure {
85 my $arg = {@_};
86
87 die "need row argument" unless ($arg->{row});
88 die "need normalisation argument" unless ($arg->{rules});
89
90 no strict 'subs';
91 _set_lookup( $arg->{lookup} );
92 _set_rec( $arg->{row} );
93 _clean_ds( %{ $arg } );
94 eval "$arg->{rules}";
95 die "error evaling $arg->{rules}: $@\n" if ($@);
96
97 return _get_ds();
98 }
99
100 =head2 _set_rec
101
102 Set current record hash
103
104 _set_rec( $rec );
105
106 =cut
107
108 my $rec;
109
110 sub _set_rec {
111 $rec = shift or die "no record hash";
112 }
113
114 =head2 _get_ds
115
116 Return hash formatted as data structure
117
118 my $ds = _get_ds();
119
120 =cut
121
122 my ($out, $marc_record, $marc_encoding, $marc_repeatable_subfield, $marc_indicators);
123 my ($marc_record_offset, $marc_fetch_offset) = (0, 0);
124
125 sub _get_ds {
126 return $out;
127 }
128
129 =head2 _clean_ds
130
131 Clean data structure hash for next record
132
133 _clean_ds();
134
135 =cut
136
137 sub _clean_ds {
138 my $a = {@_};
139 ($out,$marc_record, $marc_encoding, $marc_repeatable_subfield, $marc_indicators) = ();
140 ($marc_record_offset, $marc_fetch_offset) = (0,0);
141 $marc_encoding = $a->{marc_encoding};
142 }
143
144 =head2 _set_lookup
145
146 Set current lookup hash
147
148 _set_lookup( $lookup );
149
150 =cut
151
152 my $lookup;
153
154 sub _set_lookup {
155 $lookup = shift;
156 }
157
158 =head2 _get_marc_fields
159
160 Get all fields defined by calls to C<marc>
161
162 $marc->add_fields( WebPAC::Normalize:_get_marc_fields() );
163
164 We are using I<magic> which detect repeatable fields only from
165 sequence of field/subfield data generated by normalization.
166
167 Repeatable field is created when there is second occurence of same subfield or
168 if any of indicators are different.
169
170 This is sane for most cases. Something like:
171
172 900a-1 900b-1 900c-1
173 900a-2 900b-2
174 900a-3
175
176 will be created from any combination of:
177
178 900a-1 900a-2 900a-3 900b-1 900b-2 900c-1
179
180 and following rules:
181
182 marc('900','a', rec('200','a') );
183 marc('900','b', rec('200','b') );
184 marc('900','c', rec('200','c') );
185
186 which might not be what you have in mind. If you need repeatable subfield,
187 define it using C<marc_repeatable_subfield> like this:
188
189 marc_repeatable_subfield('900','a');
190 marc('900','a', rec('200','a') );
191 marc('900','b', rec('200','b') );
192 marc('900','c', rec('200','c') );
193
194 will create:
195
196 900a-1 900a-2 900a-3 900b-1 900c-1
197 900b-2
198
199 There is also support for returning next or specific using:
200
201 while (my $mf = WebPAC::Normalize:_get_marc_fields( fetch_next => 1 ) ) {
202 # do something with $mf
203 }
204
205 will always return fields from next MARC record or
206
207 my $mf = WebPAC::Normalize::_get_marc_fields( offset => 42 );
208
209 will return 42th copy record (if it exists).
210
211 =cut
212
213 sub _get_marc_fields {
214
215 my $arg = {@_};
216 warn "### _get_marc_fields arg: ", dump($arg), $/ if ($debug > 2);
217 my $offset = $marc_fetch_offset;
218 if ($arg->{offset}) {
219 $offset = $arg->{offset};
220 } elsif($arg->{fetch_next}) {
221 $marc_fetch_offset++;
222 }
223
224 return if (! $marc_record || ref($marc_record) ne 'ARRAY');
225
226 warn "### full marc_record = ", dump( @{ $marc_record }), $/ if ($debug > 2);
227
228 my $marc_rec = $marc_record->[ $offset ];
229
230 warn "## _get_marc_fields (at offset: $offset) -- marc_record = ", dump( @$marc_rec ), $/ if ($debug > 1);
231
232 return if (! $marc_rec || ref($marc_rec) ne 'ARRAY' || $#{ $marc_rec } < 0);
233
234 # first, sort all existing fields
235 # XXX might not be needed, but modern perl might randomize elements in hash
236 my @sorted_marc_record = sort {
237 $a->[0] . ( $a->[3] || '' ) cmp $b->[0] . ( $b->[3] || '')
238 } @{ $marc_rec };
239
240 @sorted_marc_record = @{ $marc_rec }; ### FIXME disable sorting
241
242 # output marc fields
243 my @m;
244
245 # count unique field-subfields (used for offset when walking to next subfield)
246 my $u;
247 map { $u->{ $_->[0] . ( $_->[3] || '') }++ } @sorted_marc_record;
248
249 if ($debug) {
250 warn "## marc_repeatable_subfield = ", dump( $marc_repeatable_subfield ), $/ if ( $marc_repeatable_subfield );
251 warn "## marc_record[$offset] = ", dump( $marc_rec ), $/;
252 warn "## sorted_marc_record = ", dump( \@sorted_marc_record ), $/;
253 warn "## subfield count = ", dump( $u ), $/;
254 }
255
256 my $len = $#sorted_marc_record;
257 my $visited;
258 my $i = 0;
259 my $field;
260
261 foreach ( 0 .. $len ) {
262
263 # find next element which isn't visited
264 while ($visited->{$i}) {
265 $i = ($i + 1) % ($len + 1);
266 }
267
268 # mark it visited
269 $visited->{$i}++;
270
271 my $row = dclone( $sorted_marc_record[$i] );
272
273 # field and subfield which is key for
274 # marc_repeatable_subfield and u
275 my $fsf = $row->[0] . ( $row->[3] || '' );
276
277 if ($debug > 1) {
278
279 print "### field so far [", $#$field, "] : ", dump( $field ), " ", $field ? 'T' : 'F', $/;
280 print "### this [$i]: ", dump( $row ),$/;
281 print "### sf: ", $row->[3], " vs ", $field->[3],
282 $marc_repeatable_subfield->{ $row->[0] . $row->[3] } ? ' (repeatable)' : '', $/,
283 if ($#$field >= 0);
284
285 }
286
287 # if field exists
288 if ( $#$field >= 0 ) {
289 if (
290 $row->[0] ne $field->[0] || # field
291 $row->[1] ne $field->[1] || # i1
292 $row->[2] ne $field->[2] # i2
293 ) {
294 push @m, $field;
295 warn "## saved/1 ", dump( $field ),$/ if ($debug);
296 $field = $row;
297
298 } elsif (
299 ( $row->[3] lt $field->[-2] ) # subfield which is not next (e.g. a after c)
300 ||
301 ( $row->[3] eq $field->[-2] && # same subfield, but not repeatable
302 ! $marc_repeatable_subfield->{ $fsf }
303 )
304 ) {
305 push @m, $field;
306 warn "## saved/2 ", dump( $field ),$/ if ($debug);
307 $field = $row;
308
309 } else {
310 # append new subfields to existing field
311 push @$field, ( $row->[3], $row->[4] );
312 }
313 } else {
314 # insert first field
315 $field = $row;
316 }
317
318 if (! $marc_repeatable_subfield->{ $fsf }) {
319 # make step to next subfield
320 $i = ($i + $u->{ $fsf } ) % ($len + 1);
321 }
322 }
323
324 if ($#$field >= 0) {
325 push @m, $field;
326 warn "## saved/3 ", dump( $field ),$/ if ($debug);
327 }
328
329 return \@m;
330 }
331
332 =head2 _debug
333
334 Change level of debug warnings
335
336 _debug( 2 );
337
338 =cut
339
340 sub _debug {
341 my $l = shift;
342 return $debug unless defined($l);
343 warn "debug level $l",$/ if ($l > 0);
344 $debug = $l;
345 }
346
347 =head1 Functions to create C<data_structure>
348
349 Those functions generally have to first in your normalization file.
350
351 =head2 tag
352
353 Define new tag for I<search> and I<display>.
354
355 tag('Title', rec('200','a') );
356
357
358 =cut
359
360 sub tag {
361 my $name = shift or die "tag needs name as first argument";
362 my @o = grep { defined($_) && $_ ne '' } @_;
363 return unless (@o);
364 $out->{$name}->{tag} = $name;
365 $out->{$name}->{search} = \@o;
366 $out->{$name}->{display} = \@o;
367 }
368
369 =head2 display
370
371 Define tag just for I<display>
372
373 @v = display('Title', rec('200','a') );
374
375 =cut
376
377 sub display {
378 my $name = shift or die "display needs name as first argument";
379 my @o = grep { defined($_) && $_ ne '' } @_;
380 return unless (@o);
381 $out->{$name}->{tag} = $name;
382 $out->{$name}->{display} = \@o;
383 }
384
385 =head2 search
386
387 Prepare values just for I<search>
388
389 @v = search('Title', rec('200','a') );
390
391 =cut
392
393 sub search {
394 my $name = shift or die "search needs name as first argument";
395 my @o = grep { defined($_) && $_ ne '' } @_;
396 return unless (@o);
397 $out->{$name}->{tag} = $name;
398 $out->{$name}->{search} = \@o;
399 }
400
401 =head2 marc_leader
402
403 Setup fields within MARC leader or get leader
404
405 marc_leader('05','c');
406 my $leader = marc_leader();
407
408 =cut
409
410 sub marc_leader {
411 my ($offset,$value) = @_;
412
413 if ($offset) {
414 $out->{' leader'}->{ $offset } = $value;
415 } else {
416 return $out->{' leader'};
417 }
418 }
419
420 =head2 marc
421
422 Save value for MARC field
423
424 marc('900','a', rec('200','a') );
425 marc('001', rec('000') );
426
427 =cut
428
429 sub marc {
430 my $f = shift or die "marc needs field";
431 die "marc field must be numer" unless ($f =~ /^\d+$/);
432
433 my $sf;
434 if ($f >= 10) {
435 $sf = shift or die "marc needs subfield";
436 }
437
438 foreach (@_) {
439 my $v = $_; # make var read-write for Encode
440 next unless (defined($v) && $v !~ /^\s*$/);
441 from_to($v, 'iso-8859-2', $marc_encoding) if ($marc_encoding);
442 my ($i1,$i2) = defined($marc_indicators->{$f}) ? @{ $marc_indicators->{$f} } : (' ',' ');
443 if (defined $sf) {
444 push @{ $marc_record->[ $marc_record_offset ] }, [ $f, $i1, $i2, $sf => $v ];
445 } else {
446 push @{ $marc_record->[ $marc_record_offset ] }, [ $f, $v ];
447 }
448 }
449 }
450
451 =head2 marc_repeatable_subfield
452
453 Save values for MARC repetable subfield
454
455 marc_repeatable_subfield('910', 'z', rec('909') );
456
457 =cut
458
459 sub marc_repeatable_subfield {
460 my ($f,$sf) = @_;
461 die "marc_repeatable_subfield need field and subfield!\n" unless ($f && $sf);
462 $marc_repeatable_subfield->{ $f . $sf }++;
463 marc(@_);
464 }
465
466 =head2 marc_indicators
467
468 Set both indicators for MARC field
469
470 marc_indicators('900', ' ', 1);
471
472 Any indicator value other than C<0-9> will be treated as undefined.
473
474 =cut
475
476 sub marc_indicators {
477 my $f = shift || die "marc_indicators need field!\n";
478 my ($i1,$i2) = @_;
479 die "marc_indicators($f, ...) need i1!\n" unless(defined($i1));
480 die "marc_indicators($f, $i1, ...) need i2!\n" unless(defined($i2));
481
482 $i1 = ' ' if ($i1 !~ /^\d$/);
483 $i2 = ' ' if ($i2 !~ /^\d$/);
484 @{ $marc_indicators->{$f} } = ($i1,$i2);
485 }
486
487 =head2 marc_compose
488
489 Save values for each MARC subfield explicitly
490
491 marc_compose('900',
492 'a', rec('200','a')
493 'b', rec('201','a')
494 'a', rec('200','b')
495 'c', rec('200','c')
496 );
497
498 =cut
499
500 sub marc_compose {
501 my $f = shift or die "marc_compose needs field";
502 die "marc_compose field must be numer" unless ($f =~ /^\d+$/);
503
504 my ($i1,$i2) = defined($marc_indicators->{$f}) ? @{ $marc_indicators->{$f} } : (' ',' ');
505 my $m = [ $f, $i1, $i2 ];
506
507 while (@_) {
508 my $sf = shift or die "marc_compose $f needs subfield";
509 my $v = shift;
510
511 next unless (defined($v) && $v !~ /^\s*$/);
512 from_to($v, 'iso-8859-2', $marc_encoding) if ($marc_encoding);
513 push @$m, ( $sf, $v );
514 warn "## ++ marc_compose($f,$sf,$v) ", dump( $m ),$/ if ($debug > 1);
515 }
516
517 warn "## marc_compose(d) ", dump( $m ),$/ if ($debug > 1);
518
519 push @{ $marc_record->[ $marc_record_offset ] }, $m if ($#{$m} > 2);
520 }
521
522 =head2 marc_duplicate
523
524 Generate copy of current MARC record and continue working on copy
525
526 marc_duplicate();
527
528 Copies can be accessed using C<< _get_marc_fields( fetch_next => 1 ) >> or
529 C<< _get_marc_fields( offset => 42 ) >>.
530
531 =cut
532
533 sub marc_duplicate {
534 my $m = $marc_record->[ -1 ];
535 die "can't duplicate record which isn't defined" unless ($m);
536 push @{ $marc_record }, dclone( $m );
537 warn "## marc_duplicate = ", dump(@$marc_record), $/ if ($debug > 1);
538 $marc_record_offset = $#{ $marc_record };
539 warn "## marc_record_offset = $marc_record_offset", $/ if ($debug > 1);
540 }
541
542 =head2 marc_remove
543
544 Remove some field or subfield from MARC record.
545
546 marc_remove('200');
547 marc_remove('200','a');
548
549 This will erase field C<200> or C<200^a> from current MARC record.
550
551 This is useful after calling C<marc_duplicate> or on it's own (but, you
552 should probably just remove that subfield definition if you are not
553 using C<marc_duplicate>).
554
555 FIXME: support fields < 10.
556
557 =cut
558
559 sub marc_remove {
560 my ($f, $sf) = @_;
561
562 die "marc_remove needs record number" unless defined($f);
563
564 my $marc = $marc_record->[ $marc_record_offset ];
565
566 warn "### marc_remove before = ", dump( $marc ), $/ if ($debug > 2);
567
568 my $i = 0;
569 foreach ( 0 .. $#{ $marc } ) {
570 last unless (defined $marc->[$i]);
571 warn "#### working on ",dump( @{ $marc->[$i] }), $/ if ($debug > 3);
572 if ($marc->[$i]->[0] eq $f) {
573 if (! defined $sf) {
574 # remove whole field
575 splice @$marc, $i, 1;
576 warn "#### slice \@\$marc, $i, 1 = ",dump( @{ $marc }), $/ if ($debug > 3);
577 $i--;
578 } else {
579 foreach my $j ( 0 .. (( $#{ $marc->[$i] } - 3 ) / 2) ) {
580 my $o = ($j * 2) + 3;
581 if ($marc->[$i]->[$o] eq $sf) {
582 # remove subfield
583 splice @{$marc->[$i]}, $o, 2;
584 warn "#### slice \@{\$marc->[$i]}, $o, 2 = ", dump( @{ $marc }), $/ if ($debug > 3);
585 # is record now empty?
586 if ($#{ $marc->[$i] } == 2) {
587 splice @$marc, $i, 1;
588 warn "#### slice \@\$marc, $i, 1 = ", dump( @{ $marc }), $/ if ($debug > 3);
589 $i--;
590 };
591 }
592 }
593 }
594 }
595 $i++;
596 }
597
598 warn "### marc_remove($f", $sf ? ",$sf" : "", ") after = ", dump( $marc ), $/ if ($debug > 2);
599
600 $marc_record->[ $marc_record_offset ] = $marc;
601
602 warn "## full marc_record = ", dump( @{ $marc_record }), $/ if ($debug > 1);
603 }
604
605 =head1 Functions to extract data from input
606
607 This function should be used inside functions to create C<data_structure> described
608 above.
609
610 =head2 rec1
611
612 Return all values in some field
613
614 @v = rec1('200')
615
616 TODO: order of values is probably same as in source data, need to investigate that
617
618 =cut
619
620 sub rec1 {
621 my $f = shift;
622 warn "rec1($f) = ", dump( $rec->{$f} ), $/ if ($debug > 1);
623 return unless (defined($rec) && defined($rec->{$f}));
624 warn "rec1($f) = ", dump( $rec->{$f} ), $/ if ($debug > 1);
625 if (ref($rec->{$f}) eq 'ARRAY') {
626 return map {
627 if (ref($_) eq 'HASH') {
628 values %{$_};
629 } else {
630 $_;
631 }
632 } @{ $rec->{$f} };
633 } elsif( defined($rec->{$f}) ) {
634 return $rec->{$f};
635 }
636 }
637
638 =head2 rec2
639
640 Return all values in specific field and subfield
641
642 @v = rec2('200','a')
643
644 =cut
645
646 sub rec2 {
647 my $f = shift;
648 return unless (defined($rec && $rec->{$f}));
649 my $sf = shift;
650 return map { $_->{$sf} } grep { ref($_) eq 'HASH' && $_->{$sf} } @{ $rec->{$f} };
651 }
652
653 =head2 rec
654
655 syntaxtic sugar for
656
657 @v = rec('200')
658 @v = rec('200','a')
659
660 =cut
661
662 sub rec {
663 if ($#_ == 0) {
664 return rec1(@_);
665 } elsif ($#_ == 1) {
666 return rec2(@_);
667 }
668 }
669
670 =head2 regex
671
672 Apply regex to some or all values
673
674 @v = regex( 's/foo/bar/g', @v );
675
676 =cut
677
678 sub regex {
679 my $r = shift;
680 my @out;
681 #warn "r: $r\n", dump(\@_);
682 foreach my $t (@_) {
683 next unless ($t);
684 eval "\$t =~ $r";
685 push @out, $t if ($t && $t ne '');
686 }
687 return @out;
688 }
689
690 =head2 prefix
691
692 Prefix all values with a string
693
694 @v = prefix( 'my_', @v );
695
696 =cut
697
698 sub prefix {
699 my $p = shift or die "prefix needs string as first argument";
700 return map { $p . $_ } grep { defined($_) } @_;
701 }
702
703 =head2 suffix
704
705 suffix all values with a string
706
707 @v = suffix( '_my', @v );
708
709 =cut
710
711 sub suffix {
712 my $s = shift or die "suffix needs string as first argument";
713 return map { $_ . $s } grep { defined($_) } @_;
714 }
715
716 =head2 surround
717
718 surround all values with a two strings
719
720 @v = surround( 'prefix_', '_suffix', @v );
721
722 =cut
723
724 sub surround {
725 my $p = shift or die "surround need prefix as first argument";
726 my $s = shift or die "surround needs suffix as second argument";
727 return map { $p . $_ . $s } grep { defined($_) } @_;
728 }
729
730 =head2 first
731
732 Return first element
733
734 $v = first( @v );
735
736 =cut
737
738 sub first {
739 my $r = shift;
740 return $r;
741 }
742
743 =head2 lookup
744
745 Consult lookup hashes for some value
746
747 @v = lookup( $v );
748 @v = lookup( @v );
749
750 =cut
751
752 sub lookup {
753 my $k = shift or return;
754 return unless (defined($lookup->{$k}));
755 if (ref($lookup->{$k}) eq 'ARRAY') {
756 return @{ $lookup->{$k} };
757 } else {
758 return $lookup->{$k};
759 }
760 }
761
762 =head2 join_with
763
764 Joins walues with some delimiter
765
766 $v = join_with(", ", @v);
767
768 =cut
769
770 sub join_with {
771 my $d = shift;
772 return join($d, grep { defined($_) && $_ ne '' } @_);
773 }
774
775 =head2 split_rec_on
776
777 Split record subfield on some regex and take one of parts out
778
779 $a_before_semi_column =
780 split_rec_on('200','a', /\s*;\s*/, $part);
781
782 C<$part> is optional number of element. First element is
783 B<1>, not 0!
784
785 If there is no C<$part> parameter or C<$part> is 0, this function will
786 return all values produced by splitting.
787
788 =cut
789
790 sub split_rec_on {
791 die "split_rec_on need (fld,sf,regex[,part]" if ($#_ < 2);
792
793 my ($fld, $sf, $regex, $part) = @_;
794 warn "### regex ", ref($regex), $regex, $/ if ($debug > 2);
795
796 my @r = rec( $fld, $sf );
797 my $v = shift @r;
798 warn "### first rec($fld,$sf) = ",dump($v),$/ if ($debug > 2);
799
800 return '' if( ! defined($v) || $v =~ /^\s*$/);
801
802 my @s = split( $regex, $v );
803 warn "## split_rec_on($fld,$sf,$regex,$part) = ",dump(@s),$/ if ($debug > 1);
804 if ($part && $part > 0) {
805 return $s[ $part - 1 ];
806 } else {
807 return @s;
808 }
809 }
810
811 # END
812 1;

  ViewVC Help
Powered by ViewVC 1.1.26