/[webpac2]/trunk/lib/WebPAC/Normalize.pm
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Contents of /trunk/lib/WebPAC/Normalize.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 571 - (show annotations)
Mon Jul 3 14:30:22 2006 UTC (16 years, 7 months ago) by dpavlin
File size: 13689 byte(s)
marc() now supports fields < 10 which don't have indicators and subfields

1 package WebPAC::Normalize;
2 use Exporter 'import';
3 @EXPORT = qw/
4 _set_rec _set_lookup
5 _get_ds _clean_ds
6 _debug
7
8 tag search display
9 marc marc_indicators marc_repeatable_subfield
10 marc_compose marc_leader
11
12 rec1 rec2 rec
13 regex prefix suffix surround
14 first lookup join_with
15
16 split_rec_on
17 /;
18
19 use warnings;
20 use strict;
21
22 #use base qw/WebPAC::Common/;
23 use Data::Dump qw/dump/;
24 use Encode qw/from_to/;
25
26 # debugging warn(s)
27 my $debug = 0;
28
29
30 =head1 NAME
31
32 WebPAC::Normalize - describe normalisaton rules using sets
33
34 =head1 VERSION
35
36 Version 0.09
37
38 =cut
39
40 our $VERSION = '0.09';
41
42 =head1 SYNOPSIS
43
44 This module uses C<conf/normalize/*.pl> files to perform normalisation
45 from input records using perl functions which are specialized for set
46 processing.
47
48 Sets are implemented as arrays, and normalisation file is valid perl, which
49 means that you check it's validity before running WebPAC using
50 C<perl -c normalize.pl>.
51
52 Normalisation can generate multiple output normalized data. For now, supported output
53 types (on the left side of definition) are: C<tag>, C<display>, C<search> and
54 C<marc>.
55
56 =head1 FUNCTIONS
57
58 Functions which start with C<_> are private and used by WebPAC internally.
59 All other functions are available for use within normalisation rules.
60
61 =head2 data_structure
62
63 Return data structure
64
65 my $ds = WebPAC::Normalize::data_structure(
66 lookup => $lookup->lookup_hash,
67 row => $row,
68 rules => $normalize_pl_config,
69 marc_encoding => 'utf-8',
70 );
71
72 Options C<lookup>, C<row>, C<rules> and C<log> are mandatory while all
73 other are optional.
74
75 This function will B<die> if normalizastion can't be evaled.
76
77 Since this function isn't exported you have to call it with
78 C<WebPAC::Normalize::data_structure>.
79
80 =cut
81
82 sub data_structure {
83 my $arg = {@_};
84
85 die "need row argument" unless ($arg->{row});
86 die "need normalisation argument" unless ($arg->{rules});
87
88 no strict 'subs';
89 _set_lookup( $arg->{lookup} );
90 _set_rec( $arg->{row} );
91 _clean_ds( %{ $arg } );
92 eval "$arg->{rules}";
93 die "error evaling $arg->{rules}: $@\n" if ($@);
94
95 return _get_ds();
96 }
97
98 =head2 _set_rec
99
100 Set current record hash
101
102 _set_rec( $rec );
103
104 =cut
105
106 my $rec;
107
108 sub _set_rec {
109 $rec = shift or die "no record hash";
110 }
111
112 =head2 _get_ds
113
114 Return hash formatted as data structure
115
116 my $ds = _get_ds();
117
118 =cut
119
120 my ($out,$marc_record, $marc_encoding, $marc_repeatable_subfield, $marc_indicators);
121
122 sub _get_ds {
123 return $out;
124 }
125
126 =head2 _clean_ds
127
128 Clean data structure hash for next record
129
130 _clean_ds();
131
132 =cut
133
134 sub _clean_ds {
135 my $a = {@_};
136 ($out,$marc_record, $marc_encoding, $marc_repeatable_subfield, $marc_indicators) = ();
137 $marc_encoding = $a->{marc_encoding};
138 }
139
140 =head2 _set_lookup
141
142 Set current lookup hash
143
144 _set_lookup( $lookup );
145
146 =cut
147
148 my $lookup;
149
150 sub _set_lookup {
151 $lookup = shift;
152 }
153
154 =head2 _get_marc_fields
155
156 Get all fields defined by calls to C<marc>
157
158 $marc->add_fields( WebPAC::Normalize:_get_marc_fields() );
159
160 We are using I<magic> which detect repeatable fields only from
161 sequence of field/subfield data generated by normalization.
162
163 Repeatable field is created when there is second occurence of same subfield or
164 if any of indicators are different.
165
166 This is sane for most cases. Something like:
167
168 900a-1 900b-1 900c-1
169 900a-2 900b-2
170 900a-3
171
172 will be created from any combination of:
173
174 900a-1 900a-2 900a-3 900b-1 900b-2 900c-1
175
176 and following rules:
177
178 marc('900','a', rec('200','a') );
179 marc('900','b', rec('200','b') );
180 marc('900','c', rec('200','c') );
181
182 which might not be what you have in mind. If you need repeatable subfield,
183 define it using C<marc_repeatable_subfield> like this:
184
185 ....
186
187 =cut
188
189 sub _get_marc_fields {
190
191 return if (! $marc_record || ref($marc_record) ne 'ARRAY' || $#{ $marc_record } < 0);
192
193 # first, sort all existing fields
194 # XXX might not be needed, but modern perl might randomize elements in hash
195 my @sorted_marc_record = sort {
196 $a->[0] . $a->[3] cmp $b->[0] . $b->[3]
197 } @{ $marc_record };
198
199 @sorted_marc_record = @{ $marc_record }; ### FIXME disable sorting
200
201 # output marc fields
202 my @m;
203
204 # count unique field-subfields (used for offset when walking to next subfield)
205 my $u;
206 map { $u->{ $_->[0] . $_->[3] }++ } @sorted_marc_record;
207
208 if ($debug) {
209 warn "## marc_repeatable_subfield ", dump( $marc_repeatable_subfield ), $/;
210 warn "## marc_record ", dump( $marc_record ), $/;
211 warn "## sorted_marc_record ", dump( \@sorted_marc_record ), $/;
212 warn "## subfield count ", dump( $u ), $/;
213 }
214
215 my $len = $#sorted_marc_record;
216 my $visited;
217 my $i = 0;
218 my $field;
219
220 foreach ( 0 .. $len ) {
221
222 # find next element which isn't visited
223 while ($visited->{$i}) {
224 $i = ($i + 1) % ($len + 1);
225 }
226
227 # mark it visited
228 $visited->{$i}++;
229
230 my $row = $sorted_marc_record[$i];
231
232 # field and subfield which is key for
233 # marc_repeatable_subfield and u
234 my $fsf = $row->[0] . $row->[3];
235
236 if ($debug > 1) {
237
238 print "### field so far [", $#$field, "] : ", dump( $field ), " ", $field ? 'T' : 'F', $/;
239 print "### this [$i]: ", dump( $row ),$/;
240 print "### sf: ", $row->[3], " vs ", $field->[3],
241 $marc_repeatable_subfield->{ $row->[0] . $row->[3] } ? ' (repeatable)' : '', $/,
242 if ($#$field >= 0);
243
244 }
245
246 # if field exists
247 if ( $#$field >= 0 ) {
248 if (
249 $row->[0] ne $field->[0] || # field
250 $row->[1] ne $field->[1] || # i1
251 $row->[2] ne $field->[2] # i2
252 ) {
253 push @m, $field;
254 warn "## saved/1 ", dump( $field ),$/ if ($debug);
255 $field = $row;
256
257 } elsif (
258 ( $row->[3] lt $field->[-2] ) # subfield which is not next (e.g. a after c)
259 ||
260 ( $row->[3] eq $field->[-2] && # same subfield, but not repeatable
261 ! $marc_repeatable_subfield->{ $fsf }
262 )
263 ) {
264 push @m, $field;
265 warn "## saved/2 ", dump( $field ),$/ if ($debug);
266 $field = $row;
267
268 } else {
269 # append new subfields to existing field
270 push @$field, ( $row->[3], $row->[4] );
271 }
272 } else {
273 # insert first field
274 $field = $row;
275 }
276
277 if (! $marc_repeatable_subfield->{ $fsf }) {
278 # make step to next subfield
279 $i = ($i + $u->{ $fsf } ) % ($len + 1);
280 }
281 }
282
283 if ($#$field >= 0) {
284 push @m, $field;
285 warn "## saved/3 ", dump( $field ),$/ if ($debug);
286 }
287
288 return @m;
289 }
290
291 =head2 _debug
292
293 Change level of debug warnings
294
295 _debug( 2 );
296
297 =cut
298
299 sub _debug {
300 my $l = shift;
301 return $debug unless defined($l);
302 warn "debug level $l",$/ if ($l > 0);
303 $debug = $l;
304 }
305
306 =head1 Functions to create C<data_structure>
307
308 Those functions generally have to first in your normalization file.
309
310 =head2 tag
311
312 Define new tag for I<search> and I<display>.
313
314 tag('Title', rec('200','a') );
315
316
317 =cut
318
319 sub tag {
320 my $name = shift or die "tag needs name as first argument";
321 my @o = grep { defined($_) && $_ ne '' } @_;
322 return unless (@o);
323 $out->{$name}->{tag} = $name;
324 $out->{$name}->{search} = \@o;
325 $out->{$name}->{display} = \@o;
326 }
327
328 =head2 display
329
330 Define tag just for I<display>
331
332 @v = display('Title', rec('200','a') );
333
334 =cut
335
336 sub display {
337 my $name = shift or die "display needs name as first argument";
338 my @o = grep { defined($_) && $_ ne '' } @_;
339 return unless (@o);
340 $out->{$name}->{tag} = $name;
341 $out->{$name}->{display} = \@o;
342 }
343
344 =head2 search
345
346 Prepare values just for I<search>
347
348 @v = search('Title', rec('200','a') );
349
350 =cut
351
352 sub search {
353 my $name = shift or die "search needs name as first argument";
354 my @o = grep { defined($_) && $_ ne '' } @_;
355 return unless (@o);
356 $out->{$name}->{tag} = $name;
357 $out->{$name}->{search} = \@o;
358 }
359
360 =head2 marc_leader
361
362 Setup fields within MARC leader or get leader
363
364 marc_leader('05','c');
365 my $leader = marc_leader();
366
367 =cut
368
369 sub marc_leader {
370 my ($offset,$value) = @_;
371
372 if ($offset) {
373 $out->{' leader'}->{ $offset } = $value;
374 } else {
375 return $out->{' leader'};
376 }
377 }
378
379 =head2 marc
380
381 Save value for MARC field
382
383 marc('900','a', rec('200','a') );
384 marc('001', rec('000') );
385
386 =cut
387
388 sub marc {
389 my $f = shift or die "marc needs field";
390 die "marc field must be numer" unless ($f =~ /^\d+$/);
391
392 my $sf;
393 if ($f >= 10) {
394 $sf = shift or die "marc needs subfield";
395 }
396
397 foreach (@_) {
398 my $v = $_; # make var read-write for Encode
399 next unless (defined($v) && $v !~ /^\s*$/);
400 from_to($v, 'iso-8859-2', $marc_encoding) if ($marc_encoding);
401 my ($i1,$i2) = defined($marc_indicators->{$f}) ? @{ $marc_indicators->{$f} } : (' ',' ');
402 if (defined $sf) {
403 push @{ $marc_record }, [ $f, $i1, $i2, $sf => $v ];
404 } else {
405 push @{ $marc_record }, [ $f, $v ];
406 }
407 }
408 }
409
410 =head2 marc_repeatable_subfield
411
412 Save values for MARC repetable subfield
413
414 marc_repeatable_subfield('910', 'z', rec('909') );
415
416 =cut
417
418 sub marc_repeatable_subfield {
419 my ($f,$sf) = @_;
420 die "marc_repeatable_subfield need field and subfield!\n" unless ($f && $sf);
421 $marc_repeatable_subfield->{ $f . $sf }++;
422 marc(@_);
423 }
424
425 =head2 marc_indicators
426
427 Set both indicators for MARC field
428
429 marc_indicators('900', ' ', 1);
430
431 Any indicator value other than C<0-9> will be treated as undefined.
432
433 =cut
434
435 sub marc_indicators {
436 my $f = shift || die "marc_indicators need field!\n";
437 my ($i1,$i2) = @_;
438 die "marc_indicators($f, ...) need i1!\n" unless(defined($i1));
439 die "marc_indicators($f, $i1, ...) need i2!\n" unless(defined($i2));
440
441 $i1 = ' ' if ($i1 !~ /^\d$/);
442 $i2 = ' ' if ($i2 !~ /^\d$/);
443 @{ $marc_indicators->{$f} } = ($i1,$i2);
444 }
445
446 =head2 marc_compose
447
448 Save values for each MARC subfield explicitly
449
450 marc_compose('900',
451 'a', rec('200','a')
452 'b', rec('201','a')
453 'a', rec('200','b')
454 'c', rec('200','c')
455 );
456
457 =cut
458
459 sub marc_compose {
460 my $f = shift or die "marc_compose needs field";
461 die "marc_compose field must be numer" unless ($f =~ /^\d+$/);
462
463 my ($i1,$i2) = defined($marc_indicators->{$f}) ? @{ $marc_indicators->{$f} } : (' ',' ');
464 my $m = [ $f, $i1, $i2 ];
465
466 while (@_) {
467 my $sf = shift or die "marc_compose $f needs subfield";
468 my $v = shift;
469
470 next unless (defined($v) && $v !~ /^\s*$/);
471 from_to($v, 'iso-8859-2', $marc_encoding) if ($marc_encoding);
472 push @$m, ( $sf, $v );
473 warn "## ++ marc_compose($f,$sf,$v) ", dump( $m ),$/ if ($debug > 1);
474 }
475
476 warn "## marc_compose(d) ", dump( $m ),$/ if ($debug > 1);
477
478 push @{ $marc_record }, $m if ($#{$m} > 2);
479 }
480
481
482 =head1 Functions to extract data from input
483
484 This function should be used inside functions to create C<data_structure> described
485 above.
486
487 =head2 rec1
488
489 Return all values in some field
490
491 @v = rec1('200')
492
493 TODO: order of values is probably same as in source data, need to investigate that
494
495 =cut
496
497 sub rec1 {
498 my $f = shift;
499 warn "rec1($f) = ", dump( $rec->{$f} ), $/ if ($debug > 1);
500 return unless (defined($rec) && defined($rec->{$f}));
501 warn "rec1($f) = ", dump( $rec->{$f} ), $/ if ($debug > 1);
502 if (ref($rec->{$f}) eq 'ARRAY') {
503 return map {
504 if (ref($_) eq 'HASH') {
505 values %{$_};
506 } else {
507 $_;
508 }
509 } @{ $rec->{$f} };
510 } elsif( defined($rec->{$f}) ) {
511 return $rec->{$f};
512 }
513 }
514
515 =head2 rec2
516
517 Return all values in specific field and subfield
518
519 @v = rec2('200','a')
520
521 =cut
522
523 sub rec2 {
524 my $f = shift;
525 return unless (defined($rec && $rec->{$f}));
526 my $sf = shift;
527 return map { $_->{$sf} } grep { ref($_) eq 'HASH' && $_->{$sf} } @{ $rec->{$f} };
528 }
529
530 =head2 rec
531
532 syntaxtic sugar for
533
534 @v = rec('200')
535 @v = rec('200','a')
536
537 =cut
538
539 sub rec {
540 if ($#_ == 0) {
541 return rec1(@_);
542 } elsif ($#_ == 1) {
543 return rec2(@_);
544 }
545 }
546
547 =head2 regex
548
549 Apply regex to some or all values
550
551 @v = regex( 's/foo/bar/g', @v );
552
553 =cut
554
555 sub regex {
556 my $r = shift;
557 my @out;
558 #warn "r: $r\n", dump(\@_);
559 foreach my $t (@_) {
560 next unless ($t);
561 eval "\$t =~ $r";
562 push @out, $t if ($t && $t ne '');
563 }
564 return @out;
565 }
566
567 =head2 prefix
568
569 Prefix all values with a string
570
571 @v = prefix( 'my_', @v );
572
573 =cut
574
575 sub prefix {
576 my $p = shift or die "prefix needs string as first argument";
577 return map { $p . $_ } grep { defined($_) } @_;
578 }
579
580 =head2 suffix
581
582 suffix all values with a string
583
584 @v = suffix( '_my', @v );
585
586 =cut
587
588 sub suffix {
589 my $s = shift or die "suffix needs string as first argument";
590 return map { $_ . $s } grep { defined($_) } @_;
591 }
592
593 =head2 surround
594
595 surround all values with a two strings
596
597 @v = surround( 'prefix_', '_suffix', @v );
598
599 =cut
600
601 sub surround {
602 my $p = shift or die "surround need prefix as first argument";
603 my $s = shift or die "surround needs suffix as second argument";
604 return map { $p . $_ . $s } grep { defined($_) } @_;
605 }
606
607 =head2 first
608
609 Return first element
610
611 $v = first( @v );
612
613 =cut
614
615 sub first {
616 my $r = shift;
617 return $r;
618 }
619
620 =head2 lookup
621
622 Consult lookup hashes for some value
623
624 @v = lookup( $v );
625 @v = lookup( @v );
626
627 =cut
628
629 sub lookup {
630 my $k = shift or return;
631 return unless (defined($lookup->{$k}));
632 if (ref($lookup->{$k}) eq 'ARRAY') {
633 return @{ $lookup->{$k} };
634 } else {
635 return $lookup->{$k};
636 }
637 }
638
639 =head2 join_with
640
641 Joins walues with some delimiter
642
643 $v = join_with(", ", @v);
644
645 =cut
646
647 sub join_with {
648 my $d = shift;
649 return join($d, grep { defined($_) && $_ ne '' } @_);
650 }
651
652 =head2 split_rec_on
653
654 Split record subfield on some regex and take one of parts out
655
656 $a_before_semi_column =
657 split_rec_on('200','a', /\s*;\s*/, $part);
658
659 C<$part> is optional number of element. First element is
660 B<1>, not 0!
661
662 If there is no C<$part> parameter or C<$part> is 0, this function will
663 return all values produced by splitting.
664
665 =cut
666
667 sub split_rec_on {
668 die "split_rec_on need (fld,sf,regex[,part]" if ($#_ < 2);
669
670 my ($fld, $sf, $regex, $part) = @_;
671 warn "### regex ", ref($regex), $regex, $/ if ($debug > 2);
672
673 my @r = rec( $fld, $sf );
674 my $v = shift @r;
675 warn "### first rec($fld,$sf) = ",dump($v),$/ if ($debug > 2);
676
677 return '' if( ! defined($v) || $v =~ /^\s*$/);
678
679 my @s = split( $regex, $v );
680 warn "## split_rec_on($fld,$sf,$regex,$part) = ",dump(@s),$/ if ($debug > 1);
681 if ($part && $part > 0) {
682 return $s[ $part - 1 ];
683 } else {
684 return @s;
685 }
686 }
687
688 # END
689 1;

  ViewVC Help
Powered by ViewVC 1.1.26