/[webpac2]/trunk/lib/WebPAC/Normalize.pm
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Contents of /trunk/lib/WebPAC/Normalize.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 574 - (show annotations)
Mon Jul 3 21:08:07 2006 UTC (16 years, 6 months ago) by dpavlin
File size: 17340 byte(s)
added marc_duplicate and marc_remove

1 package WebPAC::Normalize;
2 use Exporter 'import';
3 @EXPORT = qw/
4 _set_rec _set_lookup
5 _get_ds _clean_ds
6 _debug
7
8 tag search display
9 marc marc_indicators marc_repeatable_subfield
10 marc_compose marc_leader
11 marc_duplicate marc_remove
12
13 rec1 rec2 rec
14 regex prefix suffix surround
15 first lookup join_with
16
17 split_rec_on
18 /;
19
20 use warnings;
21 use strict;
22
23 #use base qw/WebPAC::Common/;
24 use Data::Dump qw/dump/;
25 use Encode qw/from_to/;
26 use Storable qw/dclone/;
27
28 # debugging warn(s)
29 my $debug = 0;
30
31
32 =head1 NAME
33
34 WebPAC::Normalize - describe normalisaton rules using sets
35
36 =head1 VERSION
37
38 Version 0.10
39
40 =cut
41
42 our $VERSION = '0.10';
43
44 =head1 SYNOPSIS
45
46 This module uses C<conf/normalize/*.pl> files to perform normalisation
47 from input records using perl functions which are specialized for set
48 processing.
49
50 Sets are implemented as arrays, and normalisation file is valid perl, which
51 means that you check it's validity before running WebPAC using
52 C<perl -c normalize.pl>.
53
54 Normalisation can generate multiple output normalized data. For now, supported output
55 types (on the left side of definition) are: C<tag>, C<display>, C<search> and
56 C<marc>.
57
58 =head1 FUNCTIONS
59
60 Functions which start with C<_> are private and used by WebPAC internally.
61 All other functions are available for use within normalisation rules.
62
63 =head2 data_structure
64
65 Return data structure
66
67 my $ds = WebPAC::Normalize::data_structure(
68 lookup => $lookup->lookup_hash,
69 row => $row,
70 rules => $normalize_pl_config,
71 marc_encoding => 'utf-8',
72 );
73
74 Options C<lookup>, C<row>, C<rules> and C<log> are mandatory while all
75 other are optional.
76
77 This function will B<die> if normalizastion can't be evaled.
78
79 Since this function isn't exported you have to call it with
80 C<WebPAC::Normalize::data_structure>.
81
82 =cut
83
84 sub data_structure {
85 my $arg = {@_};
86
87 die "need row argument" unless ($arg->{row});
88 die "need normalisation argument" unless ($arg->{rules});
89
90 no strict 'subs';
91 _set_lookup( $arg->{lookup} );
92 _set_rec( $arg->{row} );
93 _clean_ds( %{ $arg } );
94 eval "$arg->{rules}";
95 die "error evaling $arg->{rules}: $@\n" if ($@);
96
97 return _get_ds();
98 }
99
100 =head2 _set_rec
101
102 Set current record hash
103
104 _set_rec( $rec );
105
106 =cut
107
108 my $rec;
109
110 sub _set_rec {
111 $rec = shift or die "no record hash";
112 }
113
114 =head2 _get_ds
115
116 Return hash formatted as data structure
117
118 my $ds = _get_ds();
119
120 =cut
121
122 my ($out, $marc_record, $marc_encoding, $marc_repeatable_subfield, $marc_indicators);
123 my ($marc_record_offset, $marc_fetch_offset) = (0, 0);
124
125 sub _get_ds {
126 return $out;
127 }
128
129 =head2 _clean_ds
130
131 Clean data structure hash for next record
132
133 _clean_ds();
134
135 =cut
136
137 sub _clean_ds {
138 my $a = {@_};
139 ($out,$marc_record, $marc_encoding, $marc_repeatable_subfield, $marc_indicators) = ();
140 ($marc_record_offset, $marc_fetch_offset) = (0,0);
141 $marc_encoding = $a->{marc_encoding};
142 }
143
144 =head2 _set_lookup
145
146 Set current lookup hash
147
148 _set_lookup( $lookup );
149
150 =cut
151
152 my $lookup;
153
154 sub _set_lookup {
155 $lookup = shift;
156 }
157
158 =head2 _get_marc_fields
159
160 Get all fields defined by calls to C<marc>
161
162 $marc->add_fields( WebPAC::Normalize:_get_marc_fields() );
163
164 We are using I<magic> which detect repeatable fields only from
165 sequence of field/subfield data generated by normalization.
166
167 Repeatable field is created when there is second occurence of same subfield or
168 if any of indicators are different.
169
170 This is sane for most cases. Something like:
171
172 900a-1 900b-1 900c-1
173 900a-2 900b-2
174 900a-3
175
176 will be created from any combination of:
177
178 900a-1 900a-2 900a-3 900b-1 900b-2 900c-1
179
180 and following rules:
181
182 marc('900','a', rec('200','a') );
183 marc('900','b', rec('200','b') );
184 marc('900','c', rec('200','c') );
185
186 which might not be what you have in mind. If you need repeatable subfield,
187 define it using C<marc_repeatable_subfield> like this:
188
189 marc_repeatable_subfield('900','a');
190 marc('900','a', rec('200','a') );
191 marc('900','b', rec('200','b') );
192 marc('900','c', rec('200','c') );
193
194 will create:
195
196 900a-1 900a-2 900a-3 900b-1 900c-1
197 900b-2
198
199 There is also support for returning next or specific using:
200
201 while (my $mf = WebPAC::Normalize:_get_marc_fields( fetch_next => 1 ) ) {
202 # do something with $mf
203 }
204
205 will always return fields from next MARC record or
206
207 my $mf = WebPAC::Normalize::_get_marc_fields( offset => 42 );
208
209 will return 42th copy record (if it exists).
210
211 =cut
212
213 sub _get_marc_fields {
214
215 my $arg = {@_};
216 warn "### _get_marc_fields arg: ", dump($arg), $/ if ($debug > 2);
217 my $offset = $marc_fetch_offset;
218 if ($arg->{offset}) {
219 $offset = $arg->{offset};
220 } elsif($arg->{fetch_next}) {
221 $marc_fetch_offset++;
222 }
223
224 return if (! $marc_record || ref($marc_record) ne 'ARRAY');
225
226 warn "### full marc_record = ", dump( @{ $marc_record }), $/ if ($debug > 2);
227
228 my $marc_rec = $marc_record->[ $offset ];
229
230 warn "## _get_marc_fields (at offset: $offset) -- marc_record = ", dump( @$marc_rec ), $/ if ($debug > 1);
231
232 return if (! $marc_rec || ref($marc_rec) ne 'ARRAY' || $#{ $marc_rec } < 0);
233
234 # first, sort all existing fields
235 # XXX might not be needed, but modern perl might randomize elements in hash
236 my @sorted_marc_record = sort {
237 $a->[0] . ( $a->[3] || '' ) cmp $b->[0] . ( $b->[3] || '')
238 } @{ $marc_rec };
239
240 @sorted_marc_record = @{ $marc_rec }; ### FIXME disable sorting
241
242 # output marc fields
243 my @m;
244
245 # count unique field-subfields (used for offset when walking to next subfield)
246 my $u;
247 map { $u->{ $_->[0] . ( $_->[3] || '') }++ } @sorted_marc_record;
248
249 if ($debug) {
250 warn "## marc_repeatable_subfield = ", dump( $marc_repeatable_subfield ), $/ if ( $marc_repeatable_subfield );
251 warn "## marc_record[$offset] = ", dump( $marc_rec ), $/;
252 warn "## sorted_marc_record = ", dump( \@sorted_marc_record ), $/;
253 warn "## subfield count = ", dump( $u ), $/;
254 }
255
256 my $len = $#sorted_marc_record;
257 my $visited;
258 my $i = 0;
259 my $field;
260
261 foreach ( 0 .. $len ) {
262
263 # find next element which isn't visited
264 while ($visited->{$i}) {
265 $i = ($i + 1) % ($len + 1);
266 }
267
268 # mark it visited
269 $visited->{$i}++;
270
271 my $row = dclone( $sorted_marc_record[$i] );
272
273 # field and subfield which is key for
274 # marc_repeatable_subfield and u
275 my $fsf = $row->[0] . ( $row->[3] || '' );
276
277 if ($debug > 1) {
278
279 print "### field so far [", $#$field, "] : ", dump( $field ), " ", $field ? 'T' : 'F', $/;
280 print "### this [$i]: ", dump( $row ),$/;
281 print "### sf: ", $row->[3], " vs ", $field->[3],
282 $marc_repeatable_subfield->{ $row->[0] . $row->[3] } ? ' (repeatable)' : '', $/,
283 if ($#$field >= 0);
284
285 }
286
287 # if field exists
288 if ( $#$field >= 0 ) {
289 if (
290 $row->[0] ne $field->[0] || # field
291 $row->[1] ne $field->[1] || # i1
292 $row->[2] ne $field->[2] # i2
293 ) {
294 push @m, $field;
295 warn "## saved/1 ", dump( $field ),$/ if ($debug);
296 $field = $row;
297
298 } elsif (
299 ( $row->[3] lt $field->[-2] ) # subfield which is not next (e.g. a after c)
300 ||
301 ( $row->[3] eq $field->[-2] && # same subfield, but not repeatable
302 ! $marc_repeatable_subfield->{ $fsf }
303 )
304 ) {
305 push @m, $field;
306 warn "## saved/2 ", dump( $field ),$/ if ($debug);
307 $field = $row;
308
309 } else {
310 # append new subfields to existing field
311 push @$field, ( $row->[3], $row->[4] );
312 }
313 } else {
314 # insert first field
315 $field = $row;
316 }
317
318 if (! $marc_repeatable_subfield->{ $fsf }) {
319 # make step to next subfield
320 $i = ($i + $u->{ $fsf } ) % ($len + 1);
321 }
322 }
323
324 if ($#$field >= 0) {
325 push @m, $field;
326 warn "## saved/3 ", dump( $field ),$/ if ($debug);
327 }
328
329 return @m;
330 }
331
332 =head2 _debug
333
334 Change level of debug warnings
335
336 _debug( 2 );
337
338 =cut
339
340 sub _debug {
341 my $l = shift;
342 return $debug unless defined($l);
343 warn "debug level $l",$/ if ($l > 0);
344 $debug = $l;
345 }
346
347 =head1 Functions to create C<data_structure>
348
349 Those functions generally have to first in your normalization file.
350
351 =head2 tag
352
353 Define new tag for I<search> and I<display>.
354
355 tag('Title', rec('200','a') );
356
357
358 =cut
359
360 sub tag {
361 my $name = shift or die "tag needs name as first argument";
362 my @o = grep { defined($_) && $_ ne '' } @_;
363 return unless (@o);
364 $out->{$name}->{tag} = $name;
365 $out->{$name}->{search} = \@o;
366 $out->{$name}->{display} = \@o;
367 }
368
369 =head2 display
370
371 Define tag just for I<display>
372
373 @v = display('Title', rec('200','a') );
374
375 =cut
376
377 sub display {
378 my $name = shift or die "display needs name as first argument";
379 my @o = grep { defined($_) && $_ ne '' } @_;
380 return unless (@o);
381 $out->{$name}->{tag} = $name;
382 $out->{$name}->{display} = \@o;
383 }
384
385 =head2 search
386
387 Prepare values just for I<search>
388
389 @v = search('Title', rec('200','a') );
390
391 =cut
392
393 sub search {
394 my $name = shift or die "search needs name as first argument";
395 my @o = grep { defined($_) && $_ ne '' } @_;
396 return unless (@o);
397 $out->{$name}->{tag} = $name;
398 $out->{$name}->{search} = \@o;
399 }
400
401 =head2 marc_leader
402
403 Setup fields within MARC leader or get leader
404
405 marc_leader('05','c');
406 my $leader = marc_leader();
407
408 =cut
409
410 sub marc_leader {
411 my ($offset,$value) = @_;
412
413 if ($offset) {
414 $out->{' leader'}->{ $offset } = $value;
415 } else {
416 return $out->{' leader'};
417 }
418 }
419
420 =head2 marc
421
422 Save value for MARC field
423
424 marc('900','a', rec('200','a') );
425 marc('001', rec('000') );
426
427 =cut
428
429 sub marc {
430 my $f = shift or die "marc needs field";
431 die "marc field must be numer" unless ($f =~ /^\d+$/);
432
433 my $sf;
434 if ($f >= 10) {
435 $sf = shift or die "marc needs subfield";
436 }
437
438 foreach (@_) {
439 my $v = $_; # make var read-write for Encode
440 next unless (defined($v) && $v !~ /^\s*$/);
441 from_to($v, 'iso-8859-2', $marc_encoding) if ($marc_encoding);
442 my ($i1,$i2) = defined($marc_indicators->{$f}) ? @{ $marc_indicators->{$f} } : (' ',' ');
443 if (defined $sf) {
444 push @{ $marc_record->[ $marc_record_offset ] }, [ $f, $i1, $i2, $sf => $v ];
445 } else {
446 push @{ $marc_record->[ $marc_record_offset ] }, [ $f, $v ];
447 }
448 }
449 }
450
451 =head2 marc_repeatable_subfield
452
453 Save values for MARC repetable subfield
454
455 marc_repeatable_subfield('910', 'z', rec('909') );
456
457 =cut
458
459 sub marc_repeatable_subfield {
460 my ($f,$sf) = @_;
461 die "marc_repeatable_subfield need field and subfield!\n" unless ($f && $sf);
462 $marc_repeatable_subfield->{ $f . $sf }++;
463 marc(@_);
464 }
465
466 =head2 marc_indicators
467
468 Set both indicators for MARC field
469
470 marc_indicators('900', ' ', 1);
471
472 Any indicator value other than C<0-9> will be treated as undefined.
473
474 =cut
475
476 sub marc_indicators {
477 my $f = shift || die "marc_indicators need field!\n";
478 my ($i1,$i2) = @_;
479 die "marc_indicators($f, ...) need i1!\n" unless(defined($i1));
480 die "marc_indicators($f, $i1, ...) need i2!\n" unless(defined($i2));
481
482 $i1 = ' ' if ($i1 !~ /^\d$/);
483 $i2 = ' ' if ($i2 !~ /^\d$/);
484 @{ $marc_indicators->{$f} } = ($i1,$i2);
485 }
486
487 =head2 marc_compose
488
489 Save values for each MARC subfield explicitly
490
491 marc_compose('900',
492 'a', rec('200','a')
493 'b', rec('201','a')
494 'a', rec('200','b')
495 'c', rec('200','c')
496 );
497
498 =cut
499
500 sub marc_compose {
501 my $f = shift or die "marc_compose needs field";
502 die "marc_compose field must be numer" unless ($f =~ /^\d+$/);
503
504 my ($i1,$i2) = defined($marc_indicators->{$f}) ? @{ $marc_indicators->{$f} } : (' ',' ');
505 my $m = [ $f, $i1, $i2 ];
506
507 while (@_) {
508 my $sf = shift or die "marc_compose $f needs subfield";
509 my $v = shift;
510
511 next unless (defined($v) && $v !~ /^\s*$/);
512 from_to($v, 'iso-8859-2', $marc_encoding) if ($marc_encoding);
513 push @$m, ( $sf, $v );
514 warn "## ++ marc_compose($f,$sf,$v) ", dump( $m ),$/ if ($debug > 1);
515 }
516
517 warn "## marc_compose(d) ", dump( $m ),$/ if ($debug > 1);
518
519 push @{ $marc_record->[ $marc_record_offset ] }, $m if ($#{$m} > 2);
520 }
521
522 =head2 marc_duplicate
523
524 Generate copy of current MARC record and continue working on copy
525
526 marc_duplicate();
527
528 Copies can be accessed using C<< _get_marc_fields( fetch_next => 1 ) >> or
529 C<< _get_marc_fields( offset => 42 ) >>.
530
531 =cut
532
533 sub marc_duplicate {
534 my $m = $marc_record->[ -1 ];
535 die "can't duplicate record which isn't defined" unless ($m);
536 push @{ $marc_record }, dclone( $m );
537 warn "## marc_duplicate = ", dump(@$marc_record), $/ if ($debug > 1);
538 $marc_record_offset = $#{ $marc_record };
539 warn "## marc_record_offset = $marc_record_offset", $/ if ($debug > 1);
540 }
541
542 =head2 marc_remove
543
544 Remove some field or subfield from MARC record.
545
546 marc_remove('200');
547 marc_remove('200','a');
548
549 This will erase field C<200> or C<200^a> from current MARC record.
550
551 This is useful after calling C<marc_duplicate> or on it's own (but, you
552 should probably just remove that subfield definition if you are not
553 using C<marc_duplicate>).
554
555 FIXME: support fields < 10.
556
557 =cut
558
559 sub marc_remove {
560 my ($f, $sf) = @_;
561
562 die "marc_remove needs record number" unless defined($f);
563
564 my $marc = $marc_record->[ $marc_record_offset ];
565
566 warn "### marc_remove before = ", dump( $marc ), $/ if ($debug > 2);
567
568 foreach my $i ( 0 .. $#{ $marc } ) {
569 last unless (defined $marc->[$i]);
570 warn "#### working on ",dump( @{ $marc->[$i] }), $/ if ($debug > 3);
571 if ($marc->[$i]->[0] eq $f) {
572 if (! defined $sf) {
573 # remove whole field
574 splice @$marc, $i, 1;
575 warn "#### slice \@\$marc, $i, 1 = ",dump( @{ $marc }), $/ if ($debug > 3);
576 $i--;
577 } else {
578 foreach my $j ( 0 .. (( $#{ $marc->[$i] } - 3 ) / 2) ) {
579 my $o = ($j * 2) + 3;
580 if ($marc->[$i]->[$o] eq $sf) {
581 # remove subfield
582 splice @{$marc->[$i]}, $o, 2;
583 warn "#### slice \@{\$marc->[$i]}, $o, 2 = ", dump( @{ $marc }), $/ if ($debug > 3);
584 # is record now empty?
585 if ($#{ $marc->[$i] } == 2) {
586 splice @$marc, $i, 1;
587 warn "#### slice \@\$marc, $i, 1 = ", dump( @{ $marc }), $/ if ($debug > 3);
588 $i--;
589 };
590 }
591 }
592 }
593 }
594 }
595
596 warn "### marc_remove($f", $sf ? ",$sf" : "", ") after = ", dump( $marc ), $/ if ($debug > 2);
597
598 $marc_record->[ $marc_record_offset ] = $marc;
599
600 warn "## full marc_record = ", dump( @{ $marc_record }), $/ if ($debug > 1);
601 }
602
603 =head1 Functions to extract data from input
604
605 This function should be used inside functions to create C<data_structure> described
606 above.
607
608 =head2 rec1
609
610 Return all values in some field
611
612 @v = rec1('200')
613
614 TODO: order of values is probably same as in source data, need to investigate that
615
616 =cut
617
618 sub rec1 {
619 my $f = shift;
620 warn "rec1($f) = ", dump( $rec->{$f} ), $/ if ($debug > 1);
621 return unless (defined($rec) && defined($rec->{$f}));
622 warn "rec1($f) = ", dump( $rec->{$f} ), $/ if ($debug > 1);
623 if (ref($rec->{$f}) eq 'ARRAY') {
624 return map {
625 if (ref($_) eq 'HASH') {
626 values %{$_};
627 } else {
628 $_;
629 }
630 } @{ $rec->{$f} };
631 } elsif( defined($rec->{$f}) ) {
632 return $rec->{$f};
633 }
634 }
635
636 =head2 rec2
637
638 Return all values in specific field and subfield
639
640 @v = rec2('200','a')
641
642 =cut
643
644 sub rec2 {
645 my $f = shift;
646 return unless (defined($rec && $rec->{$f}));
647 my $sf = shift;
648 return map { $_->{$sf} } grep { ref($_) eq 'HASH' && $_->{$sf} } @{ $rec->{$f} };
649 }
650
651 =head2 rec
652
653 syntaxtic sugar for
654
655 @v = rec('200')
656 @v = rec('200','a')
657
658 =cut
659
660 sub rec {
661 if ($#_ == 0) {
662 return rec1(@_);
663 } elsif ($#_ == 1) {
664 return rec2(@_);
665 }
666 }
667
668 =head2 regex
669
670 Apply regex to some or all values
671
672 @v = regex( 's/foo/bar/g', @v );
673
674 =cut
675
676 sub regex {
677 my $r = shift;
678 my @out;
679 #warn "r: $r\n", dump(\@_);
680 foreach my $t (@_) {
681 next unless ($t);
682 eval "\$t =~ $r";
683 push @out, $t if ($t && $t ne '');
684 }
685 return @out;
686 }
687
688 =head2 prefix
689
690 Prefix all values with a string
691
692 @v = prefix( 'my_', @v );
693
694 =cut
695
696 sub prefix {
697 my $p = shift or die "prefix needs string as first argument";
698 return map { $p . $_ } grep { defined($_) } @_;
699 }
700
701 =head2 suffix
702
703 suffix all values with a string
704
705 @v = suffix( '_my', @v );
706
707 =cut
708
709 sub suffix {
710 my $s = shift or die "suffix needs string as first argument";
711 return map { $_ . $s } grep { defined($_) } @_;
712 }
713
714 =head2 surround
715
716 surround all values with a two strings
717
718 @v = surround( 'prefix_', '_suffix', @v );
719
720 =cut
721
722 sub surround {
723 my $p = shift or die "surround need prefix as first argument";
724 my $s = shift or die "surround needs suffix as second argument";
725 return map { $p . $_ . $s } grep { defined($_) } @_;
726 }
727
728 =head2 first
729
730 Return first element
731
732 $v = first( @v );
733
734 =cut
735
736 sub first {
737 my $r = shift;
738 return $r;
739 }
740
741 =head2 lookup
742
743 Consult lookup hashes for some value
744
745 @v = lookup( $v );
746 @v = lookup( @v );
747
748 =cut
749
750 sub lookup {
751 my $k = shift or return;
752 return unless (defined($lookup->{$k}));
753 if (ref($lookup->{$k}) eq 'ARRAY') {
754 return @{ $lookup->{$k} };
755 } else {
756 return $lookup->{$k};
757 }
758 }
759
760 =head2 join_with
761
762 Joins walues with some delimiter
763
764 $v = join_with(", ", @v);
765
766 =cut
767
768 sub join_with {
769 my $d = shift;
770 return join($d, grep { defined($_) && $_ ne '' } @_);
771 }
772
773 =head2 split_rec_on
774
775 Split record subfield on some regex and take one of parts out
776
777 $a_before_semi_column =
778 split_rec_on('200','a', /\s*;\s*/, $part);
779
780 C<$part> is optional number of element. First element is
781 B<1>, not 0!
782
783 If there is no C<$part> parameter or C<$part> is 0, this function will
784 return all values produced by splitting.
785
786 =cut
787
788 sub split_rec_on {
789 die "split_rec_on need (fld,sf,regex[,part]" if ($#_ < 2);
790
791 my ($fld, $sf, $regex, $part) = @_;
792 warn "### regex ", ref($regex), $regex, $/ if ($debug > 2);
793
794 my @r = rec( $fld, $sf );
795 my $v = shift @r;
796 warn "### first rec($fld,$sf) = ",dump($v),$/ if ($debug > 2);
797
798 return '' if( ! defined($v) || $v =~ /^\s*$/);
799
800 my @s = split( $regex, $v );
801 warn "## split_rec_on($fld,$sf,$regex,$part) = ",dump(@s),$/ if ($debug > 1);
802 if ($part && $part > 0) {
803 return $s[ $part - 1 ];
804 } else {
805 return @s;
806 }
807 }
808
809 # END
810 1;

  ViewVC Help
Powered by ViewVC 1.1.26