/[webpac2]/trunk/lib/WebPAC/Normalize.pm
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Contents of /trunk/lib/WebPAC/Normalize.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 592 - (show annotations)
Sun Jul 9 15:22:30 2006 UTC (17 years, 8 months ago) by dpavlin
File size: 17647 byte(s)
 r822@llin:  dpavlin | 2006-07-09 17:14:07 +0200
 prefix doesn't die if first parametar is undef

1 package WebPAC::Normalize;
2 use Exporter 'import';
3 @EXPORT = qw/
4 _set_rec _set_lookup
5 _get_ds _clean_ds
6 _debug
7
8 tag search display
9 marc marc_indicators marc_repeatable_subfield
10 marc_compose marc_leader
11 marc_duplicate marc_remove
12
13 rec1 rec2 rec
14 regex prefix suffix surround
15 first lookup join_with
16
17 split_rec_on
18 /;
19
20 use warnings;
21 use strict;
22
23 #use base qw/WebPAC::Common/;
24 use Data::Dump qw/dump/;
25 use Encode qw/from_to/;
26 use Storable qw/dclone/;
27
28 # debugging warn(s)
29 my $debug = 0;
30
31
32 =head1 NAME
33
34 WebPAC::Normalize - describe normalisaton rules using sets
35
36 =head1 VERSION
37
38 Version 0.12
39
40 =cut
41
42 our $VERSION = '0.12';
43
44 =head1 SYNOPSIS
45
46 This module uses C<conf/normalize/*.pl> files to perform normalisation
47 from input records using perl functions which are specialized for set
48 processing.
49
50 Sets are implemented as arrays, and normalisation file is valid perl, which
51 means that you check it's validity before running WebPAC using
52 C<perl -c normalize.pl>.
53
54 Normalisation can generate multiple output normalized data. For now, supported output
55 types (on the left side of definition) are: C<tag>, C<display>, C<search> and
56 C<marc>.
57
58 =head1 FUNCTIONS
59
60 Functions which start with C<_> are private and used by WebPAC internally.
61 All other functions are available for use within normalisation rules.
62
63 =head2 data_structure
64
65 Return data structure
66
67 my $ds = WebPAC::Normalize::data_structure(
68 lookup => $lookup->lookup_hash,
69 row => $row,
70 rules => $normalize_pl_config,
71 marc_encoding => 'utf-8',
72 );
73
74 Options C<lookup>, C<row>, C<rules> and C<log> are mandatory while all
75 other are optional.
76
77 This function will B<die> if normalizastion can't be evaled.
78
79 Since this function isn't exported you have to call it with
80 C<WebPAC::Normalize::data_structure>.
81
82 =cut
83
84 sub data_structure {
85 my $arg = {@_};
86
87 die "need row argument" unless ($arg->{row});
88 die "need normalisation argument" unless ($arg->{rules});
89
90 no strict 'subs';
91 _set_lookup( $arg->{lookup} );
92 _set_rec( $arg->{row} );
93 _clean_ds( %{ $arg } );
94 eval "$arg->{rules}";
95 die "error evaling $arg->{rules}: $@\n" if ($@);
96
97 return _get_ds();
98 }
99
100 =head2 _set_rec
101
102 Set current record hash
103
104 _set_rec( $rec );
105
106 =cut
107
108 my $rec;
109
110 sub _set_rec {
111 $rec = shift or die "no record hash";
112 }
113
114 =head2 _get_ds
115
116 Return hash formatted as data structure
117
118 my $ds = _get_ds();
119
120 =cut
121
122 my ($out, $marc_record, $marc_encoding, $marc_repeatable_subfield, $marc_indicators);
123 my ($marc_record_offset, $marc_fetch_offset) = (0, 0);
124
125 sub _get_ds {
126 return $out;
127 }
128
129 =head2 _clean_ds
130
131 Clean data structure hash for next record
132
133 _clean_ds();
134
135 =cut
136
137 sub _clean_ds {
138 my $a = {@_};
139 ($out,$marc_record, $marc_encoding, $marc_repeatable_subfield, $marc_indicators) = ();
140 ($marc_record_offset, $marc_fetch_offset) = (0,0);
141 $marc_encoding = $a->{marc_encoding};
142 }
143
144 =head2 _set_lookup
145
146 Set current lookup hash
147
148 _set_lookup( $lookup );
149
150 =cut
151
152 my $lookup;
153
154 sub _set_lookup {
155 $lookup = shift;
156 }
157
158 =head2 _get_marc_fields
159
160 Get all fields defined by calls to C<marc>
161
162 $marc->add_fields( WebPAC::Normalize:_get_marc_fields() );
163
164 We are using I<magic> which detect repeatable fields only from
165 sequence of field/subfield data generated by normalization.
166
167 Repeatable field is created when there is second occurence of same subfield or
168 if any of indicators are different.
169
170 This is sane for most cases. Something like:
171
172 900a-1 900b-1 900c-1
173 900a-2 900b-2
174 900a-3
175
176 will be created from any combination of:
177
178 900a-1 900a-2 900a-3 900b-1 900b-2 900c-1
179
180 and following rules:
181
182 marc('900','a', rec('200','a') );
183 marc('900','b', rec('200','b') );
184 marc('900','c', rec('200','c') );
185
186 which might not be what you have in mind. If you need repeatable subfield,
187 define it using C<marc_repeatable_subfield> like this:
188
189 marc_repeatable_subfield('900','a');
190 marc('900','a', rec('200','a') );
191 marc('900','b', rec('200','b') );
192 marc('900','c', rec('200','c') );
193
194 will create:
195
196 900a-1 900a-2 900a-3 900b-1 900c-1
197 900b-2
198
199 There is also support for returning next or specific using:
200
201 while (my $mf = WebPAC::Normalize:_get_marc_fields( fetch_next => 1 ) ) {
202 # do something with $mf
203 }
204
205 will always return fields from next MARC record or
206
207 my $mf = WebPAC::Normalize::_get_marc_fields( offset => 42 );
208
209 will return 42th copy record (if it exists).
210
211 =cut
212
213 sub _get_marc_fields {
214
215 my $arg = {@_};
216 warn "### _get_marc_fields arg: ", dump($arg), $/ if ($debug > 2);
217 my $offset = $marc_fetch_offset;
218 if ($arg->{offset}) {
219 $offset = $arg->{offset};
220 } elsif($arg->{fetch_next}) {
221 $marc_fetch_offset++;
222 }
223
224 return if (! $marc_record || ref($marc_record) ne 'ARRAY');
225
226 warn "### full marc_record = ", dump( @{ $marc_record }), $/ if ($debug > 2);
227
228 my $marc_rec = $marc_record->[ $offset ];
229
230 warn "## _get_marc_fields (at offset: $offset) -- marc_record = ", dump( @$marc_rec ), $/ if ($debug > 1);
231
232 return if (! $marc_rec || ref($marc_rec) ne 'ARRAY' || $#{ $marc_rec } < 0);
233
234 # first, sort all existing fields
235 # XXX might not be needed, but modern perl might randomize elements in hash
236 my @sorted_marc_record = sort {
237 $a->[0] . ( $a->[3] || '' ) cmp $b->[0] . ( $b->[3] || '')
238 } @{ $marc_rec };
239
240 @sorted_marc_record = @{ $marc_rec }; ### FIXME disable sorting
241
242 # output marc fields
243 my @m;
244
245 # count unique field-subfields (used for offset when walking to next subfield)
246 my $u;
247 map { $u->{ $_->[0] . ( $_->[3] || '') }++ } @sorted_marc_record;
248
249 if ($debug) {
250 warn "## marc_repeatable_subfield = ", dump( $marc_repeatable_subfield ), $/ if ( $marc_repeatable_subfield );
251 warn "## marc_record[$offset] = ", dump( $marc_rec ), $/;
252 warn "## sorted_marc_record = ", dump( \@sorted_marc_record ), $/;
253 warn "## subfield count = ", dump( $u ), $/;
254 }
255
256 my $len = $#sorted_marc_record;
257 my $visited;
258 my $i = 0;
259 my $field;
260
261 foreach ( 0 .. $len ) {
262
263 # find next element which isn't visited
264 while ($visited->{$i}) {
265 $i = ($i + 1) % ($len + 1);
266 }
267
268 # mark it visited
269 $visited->{$i}++;
270
271 my $row = dclone( $sorted_marc_record[$i] );
272
273 # field and subfield which is key for
274 # marc_repeatable_subfield and u
275 my $fsf = $row->[0] . ( $row->[3] || '' );
276
277 if ($debug > 1) {
278
279 print "### field so far [", $#$field, "] : ", dump( $field ), " ", $field ? 'T' : 'F', $/;
280 print "### this [$i]: ", dump( $row ),$/;
281 print "### sf: ", $row->[3], " vs ", $field->[3],
282 $marc_repeatable_subfield->{ $row->[0] . $row->[3] } ? ' (repeatable)' : '', $/,
283 if ($#$field >= 0);
284
285 }
286
287 # if field exists
288 if ( $#$field >= 0 ) {
289 if (
290 $row->[0] ne $field->[0] || # field
291 $row->[1] ne $field->[1] || # i1
292 $row->[2] ne $field->[2] # i2
293 ) {
294 push @m, $field;
295 warn "## saved/1 ", dump( $field ),$/ if ($debug);
296 $field = $row;
297
298 } elsif (
299 ( $row->[3] lt $field->[-2] ) # subfield which is not next (e.g. a after c)
300 ||
301 ( $row->[3] eq $field->[-2] && # same subfield, but not repeatable
302 ! $marc_repeatable_subfield->{ $fsf }
303 )
304 ) {
305 push @m, $field;
306 warn "## saved/2 ", dump( $field ),$/ if ($debug);
307 $field = $row;
308
309 } else {
310 # append new subfields to existing field
311 push @$field, ( $row->[3], $row->[4] );
312 }
313 } else {
314 # insert first field
315 $field = $row;
316 }
317
318 if (! $marc_repeatable_subfield->{ $fsf }) {
319 # make step to next subfield
320 $i = ($i + $u->{ $fsf } ) % ($len + 1);
321 }
322 }
323
324 if ($#$field >= 0) {
325 push @m, $field;
326 warn "## saved/3 ", dump( $field ),$/ if ($debug);
327 }
328
329 return \@m;
330 }
331
332 =head2 _debug
333
334 Change level of debug warnings
335
336 _debug( 2 );
337
338 =cut
339
340 sub _debug {
341 my $l = shift;
342 return $debug unless defined($l);
343 warn "debug level $l",$/ if ($l > 0);
344 $debug = $l;
345 }
346
347 =head1 Functions to create C<data_structure>
348
349 Those functions generally have to first in your normalization file.
350
351 =head2 tag
352
353 Define new tag for I<search> and I<display>.
354
355 tag('Title', rec('200','a') );
356
357
358 =cut
359
360 sub tag {
361 my $name = shift or die "tag needs name as first argument";
362 my @o = grep { defined($_) && $_ ne '' } @_;
363 return unless (@o);
364 $out->{$name}->{tag} = $name;
365 $out->{$name}->{search} = \@o;
366 $out->{$name}->{display} = \@o;
367 }
368
369 =head2 display
370
371 Define tag just for I<display>
372
373 @v = display('Title', rec('200','a') );
374
375 =cut
376
377 sub display {
378 my $name = shift or die "display needs name as first argument";
379 my @o = grep { defined($_) && $_ ne '' } @_;
380 return unless (@o);
381 $out->{$name}->{tag} = $name;
382 $out->{$name}->{display} = \@o;
383 }
384
385 =head2 search
386
387 Prepare values just for I<search>
388
389 @v = search('Title', rec('200','a') );
390
391 =cut
392
393 sub search {
394 my $name = shift or die "search needs name as first argument";
395 my @o = grep { defined($_) && $_ ne '' } @_;
396 return unless (@o);
397 $out->{$name}->{tag} = $name;
398 $out->{$name}->{search} = \@o;
399 }
400
401 =head2 marc_leader
402
403 Setup fields within MARC leader or get leader
404
405 marc_leader('05','c');
406 my $leader = marc_leader();
407
408 =cut
409
410 sub marc_leader {
411 my ($offset,$value) = @_;
412
413 if ($offset) {
414 $out->{' leader'}->{ $offset } = $value;
415 } else {
416 return $out->{' leader'};
417 }
418 }
419
420 =head2 marc
421
422 Save value for MARC field
423
424 marc('900','a', rec('200','a') );
425 marc('001', rec('000') );
426
427 =cut
428
429 sub marc {
430 my $f = shift or die "marc needs field";
431 die "marc field must be numer" unless ($f =~ /^\d+$/);
432
433 my $sf;
434 if ($f >= 10) {
435 $sf = shift or die "marc needs subfield";
436 }
437
438 foreach (@_) {
439 my $v = $_; # make var read-write for Encode
440 next unless (defined($v) && $v !~ /^\s*$/);
441 from_to($v, 'iso-8859-2', $marc_encoding) if ($marc_encoding);
442 my ($i1,$i2) = defined($marc_indicators->{$f}) ? @{ $marc_indicators->{$f} } : (' ',' ');
443 if (defined $sf) {
444 push @{ $marc_record->[ $marc_record_offset ] }, [ $f, $i1, $i2, $sf => $v ];
445 } else {
446 push @{ $marc_record->[ $marc_record_offset ] }, [ $f, $v ];
447 }
448 }
449 }
450
451 =head2 marc_repeatable_subfield
452
453 Save values for MARC repetable subfield
454
455 marc_repeatable_subfield('910', 'z', rec('909') );
456
457 =cut
458
459 sub marc_repeatable_subfield {
460 my ($f,$sf) = @_;
461 die "marc_repeatable_subfield need field and subfield!\n" unless ($f && $sf);
462 $marc_repeatable_subfield->{ $f . $sf }++;
463 marc(@_);
464 }
465
466 =head2 marc_indicators
467
468 Set both indicators for MARC field
469
470 marc_indicators('900', ' ', 1);
471
472 Any indicator value other than C<0-9> will be treated as undefined.
473
474 =cut
475
476 sub marc_indicators {
477 my $f = shift || die "marc_indicators need field!\n";
478 my ($i1,$i2) = @_;
479 die "marc_indicators($f, ...) need i1!\n" unless(defined($i1));
480 die "marc_indicators($f, $i1, ...) need i2!\n" unless(defined($i2));
481
482 $i1 = ' ' if ($i1 !~ /^\d$/);
483 $i2 = ' ' if ($i2 !~ /^\d$/);
484 @{ $marc_indicators->{$f} } = ($i1,$i2);
485 }
486
487 =head2 marc_compose
488
489 Save values for each MARC subfield explicitly
490
491 marc_compose('900',
492 'a', rec('200','a')
493 'b', rec('201','a')
494 'a', rec('200','b')
495 'c', rec('200','c')
496 );
497
498 =cut
499
500 sub marc_compose {
501 my $f = shift or die "marc_compose needs field";
502 die "marc_compose field must be numer" unless ($f =~ /^\d+$/);
503
504 my ($i1,$i2) = defined($marc_indicators->{$f}) ? @{ $marc_indicators->{$f} } : (' ',' ');
505 my $m = [ $f, $i1, $i2 ];
506
507 warn "### marc_compose input subfields = ", dump(@_),$/ if ($debug > 2);
508
509 while (@_) {
510 my $sf = shift or die "marc_compose $f needs subfield";
511 my $v = shift;
512
513 next unless (defined($v) && $v !~ /^\s*$/);
514 from_to($v, 'iso-8859-2', $marc_encoding) if ($marc_encoding);
515 push @$m, ( $sf, $v );
516 warn "## ++ marc_compose($f,$sf,$v) ", dump( $m ),$/ if ($debug > 1);
517 }
518
519 warn "## marc_compose current marc = ", dump( $m ),$/ if ($debug > 1);
520
521 push @{ $marc_record->[ $marc_record_offset ] }, $m if ($#{$m} > 2);
522 }
523
524 =head2 marc_duplicate
525
526 Generate copy of current MARC record and continue working on copy
527
528 marc_duplicate();
529
530 Copies can be accessed using C<< _get_marc_fields( fetch_next => 1 ) >> or
531 C<< _get_marc_fields( offset => 42 ) >>.
532
533 =cut
534
535 sub marc_duplicate {
536 my $m = $marc_record->[ -1 ];
537 die "can't duplicate record which isn't defined" unless ($m);
538 push @{ $marc_record }, dclone( $m );
539 warn "## marc_duplicate = ", dump(@$marc_record), $/ if ($debug > 1);
540 $marc_record_offset = $#{ $marc_record };
541 warn "## marc_record_offset = $marc_record_offset", $/ if ($debug > 1);
542 }
543
544 =head2 marc_remove
545
546 Remove some field or subfield from MARC record.
547
548 marc_remove('200');
549 marc_remove('200','a');
550
551 This will erase field C<200> or C<200^a> from current MARC record.
552
553 This is useful after calling C<marc_duplicate> or on it's own (but, you
554 should probably just remove that subfield definition if you are not
555 using C<marc_duplicate>).
556
557 FIXME: support fields < 10.
558
559 =cut
560
561 sub marc_remove {
562 my ($f, $sf) = @_;
563
564 die "marc_remove needs record number" unless defined($f);
565
566 my $marc = $marc_record->[ $marc_record_offset ];
567
568 warn "### marc_remove before = ", dump( $marc ), $/ if ($debug > 2);
569
570 my $i = 0;
571 foreach ( 0 .. $#{ $marc } ) {
572 last unless (defined $marc->[$i]);
573 warn "#### working on ",dump( @{ $marc->[$i] }), $/ if ($debug > 3);
574 if ($marc->[$i]->[0] eq $f) {
575 if (! defined $sf) {
576 # remove whole field
577 splice @$marc, $i, 1;
578 warn "#### slice \@\$marc, $i, 1 = ",dump( @{ $marc }), $/ if ($debug > 3);
579 $i--;
580 } else {
581 foreach my $j ( 0 .. (( $#{ $marc->[$i] } - 3 ) / 2) ) {
582 my $o = ($j * 2) + 3;
583 if ($marc->[$i]->[$o] eq $sf) {
584 # remove subfield
585 splice @{$marc->[$i]}, $o, 2;
586 warn "#### slice \@{\$marc->[$i]}, $o, 2 = ", dump( @{ $marc }), $/ if ($debug > 3);
587 # is record now empty?
588 if ($#{ $marc->[$i] } == 2) {
589 splice @$marc, $i, 1;
590 warn "#### slice \@\$marc, $i, 1 = ", dump( @{ $marc }), $/ if ($debug > 3);
591 $i--;
592 };
593 }
594 }
595 }
596 }
597 $i++;
598 }
599
600 warn "### marc_remove($f", $sf ? ",$sf" : "", ") after = ", dump( $marc ), $/ if ($debug > 2);
601
602 $marc_record->[ $marc_record_offset ] = $marc;
603
604 warn "## full marc_record = ", dump( @{ $marc_record }), $/ if ($debug > 1);
605 }
606
607 =head1 Functions to extract data from input
608
609 This function should be used inside functions to create C<data_structure> described
610 above.
611
612 =head2 rec1
613
614 Return all values in some field
615
616 @v = rec1('200')
617
618 TODO: order of values is probably same as in source data, need to investigate that
619
620 =cut
621
622 sub rec1 {
623 my $f = shift;
624 warn "rec1($f) = ", dump( $rec->{$f} ), $/ if ($debug > 1);
625 return unless (defined($rec) && defined($rec->{$f}));
626 warn "rec1($f) = ", dump( $rec->{$f} ), $/ if ($debug > 1);
627 if (ref($rec->{$f}) eq 'ARRAY') {
628 return map {
629 if (ref($_) eq 'HASH') {
630 values %{$_};
631 } else {
632 $_;
633 }
634 } @{ $rec->{$f} };
635 } elsif( defined($rec->{$f}) ) {
636 return $rec->{$f};
637 }
638 }
639
640 =head2 rec2
641
642 Return all values in specific field and subfield
643
644 @v = rec2('200','a')
645
646 =cut
647
648 sub rec2 {
649 my $f = shift;
650 return unless (defined($rec && $rec->{$f}));
651 my $sf = shift;
652 return map {
653 if (ref($_->{$sf}) eq 'ARRAY') {
654 @{ $_->{$sf} };
655 } else {
656 $_->{$sf};
657 }
658 } grep { ref($_) eq 'HASH' && $_->{$sf} } @{ $rec->{$f} };
659 }
660
661 =head2 rec
662
663 syntaxtic sugar for
664
665 @v = rec('200')
666 @v = rec('200','a')
667
668 =cut
669
670 sub rec {
671 my @out;
672 if ($#_ == 0) {
673 @out = rec1(@_);
674 } elsif ($#_ == 1) {
675 @out = rec2(@_);
676 }
677 if (@out) {
678 return @out;
679 } else {
680 return '';
681 }
682 }
683
684 =head2 regex
685
686 Apply regex to some or all values
687
688 @v = regex( 's/foo/bar/g', @v );
689
690 =cut
691
692 sub regex {
693 my $r = shift;
694 my @out;
695 #warn "r: $r\n", dump(\@_);
696 foreach my $t (@_) {
697 next unless ($t);
698 eval "\$t =~ $r";
699 push @out, $t if ($t && $t ne '');
700 }
701 return @out;
702 }
703
704 =head2 prefix
705
706 Prefix all values with a string
707
708 @v = prefix( 'my_', @v );
709
710 =cut
711
712 sub prefix {
713 my $p = shift or return;
714 return map { $p . $_ } grep { defined($_) } @_;
715 }
716
717 =head2 suffix
718
719 suffix all values with a string
720
721 @v = suffix( '_my', @v );
722
723 =cut
724
725 sub suffix {
726 my $s = shift or die "suffix needs string as first argument";
727 return map { $_ . $s } grep { defined($_) } @_;
728 }
729
730 =head2 surround
731
732 surround all values with a two strings
733
734 @v = surround( 'prefix_', '_suffix', @v );
735
736 =cut
737
738 sub surround {
739 my $p = shift or die "surround need prefix as first argument";
740 my $s = shift or die "surround needs suffix as second argument";
741 return map { $p . $_ . $s } grep { defined($_) } @_;
742 }
743
744 =head2 first
745
746 Return first element
747
748 $v = first( @v );
749
750 =cut
751
752 sub first {
753 my $r = shift;
754 return $r;
755 }
756
757 =head2 lookup
758
759 Consult lookup hashes for some value
760
761 @v = lookup( $v );
762 @v = lookup( @v );
763
764 =cut
765
766 sub lookup {
767 my $k = shift or return;
768 return unless (defined($lookup->{$k}));
769 if (ref($lookup->{$k}) eq 'ARRAY') {
770 return @{ $lookup->{$k} };
771 } else {
772 return $lookup->{$k};
773 }
774 }
775
776 =head2 join_with
777
778 Joins walues with some delimiter
779
780 $v = join_with(", ", @v);
781
782 =cut
783
784 sub join_with {
785 my $d = shift;
786 warn "### join_with('$d',",dump(@_),")\n" if ($debug > 2);
787 my $v = join($d, grep { defined($_) && $_ ne '' } @_);
788 return '' unless defined($v);
789 return $v;
790 }
791
792 =head2 split_rec_on
793
794 Split record subfield on some regex and take one of parts out
795
796 $a_before_semi_column =
797 split_rec_on('200','a', /\s*;\s*/, $part);
798
799 C<$part> is optional number of element. First element is
800 B<1>, not 0!
801
802 If there is no C<$part> parameter or C<$part> is 0, this function will
803 return all values produced by splitting.
804
805 =cut
806
807 sub split_rec_on {
808 die "split_rec_on need (fld,sf,regex[,part]" if ($#_ < 2);
809
810 my ($fld, $sf, $regex, $part) = @_;
811 warn "### regex ", ref($regex), $regex, $/ if ($debug > 2);
812
813 my @r = rec( $fld, $sf );
814 my $v = shift @r;
815 warn "### first rec($fld,$sf) = ",dump($v),$/ if ($debug > 2);
816
817 return '' if( ! defined($v) || $v =~ /^\s*$/);
818
819 my @s = split( $regex, $v );
820 warn "## split_rec_on($fld,$sf,$regex,$part) = ",dump(@s),$/ if ($debug > 1);
821 if ($part && $part > 0) {
822 return $s[ $part - 1 ];
823 } else {
824 return @s;
825 }
826 }
827
828 # END
829 1;

  ViewVC Help
Powered by ViewVC 1.1.26