/[webpac2]/trunk/lib/WebPAC/Normalize.pm
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Contents of /trunk/lib/WebPAC/Normalize.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 566 - (show annotations)
Sun Jul 2 21:17:54 2006 UTC (17 years, 8 months ago) by dpavlin
File size: 13421 byte(s)
test split_rec_on corner cases, and fix one

1 package WebPAC::Normalize;
2 use Exporter 'import';
3 @EXPORT = qw/
4 _set_rec _set_lookup
5 _get_ds _clean_ds
6 _debug
7
8 tag search display
9 marc marc_indicators marc_repeatable_subfield
10 marc_compose marc_leader
11
12 rec1 rec2 rec
13 regex prefix suffix surround
14 first lookup join_with
15
16 split_rec_on
17 /;
18
19 use warnings;
20 use strict;
21
22 #use base qw/WebPAC::Common/;
23 use Data::Dump qw/dump/;
24 use Encode qw/from_to/;
25
26 # debugging warn(s)
27 my $debug = 0;
28
29
30 =head1 NAME
31
32 WebPAC::Normalize - describe normalisaton rules using sets
33
34 =head1 VERSION
35
36 Version 0.09
37
38 =cut
39
40 our $VERSION = '0.09';
41
42 =head1 SYNOPSIS
43
44 This module uses C<conf/normalize/*.pl> files to perform normalisation
45 from input records using perl functions which are specialized for set
46 processing.
47
48 Sets are implemented as arrays, and normalisation file is valid perl, which
49 means that you check it's validity before running WebPAC using
50 C<perl -c normalize.pl>.
51
52 Normalisation can generate multiple output normalized data. For now, supported output
53 types (on the left side of definition) are: C<tag>, C<display>, C<search> and
54 C<marc>.
55
56 =head1 FUNCTIONS
57
58 Functions which start with C<_> are private and used by WebPAC internally.
59 All other functions are available for use within normalisation rules.
60
61 =head2 data_structure
62
63 Return data structure
64
65 my $ds = WebPAC::Normalize::data_structure(
66 lookup => $lookup->lookup_hash,
67 row => $row,
68 rules => $normalize_pl_config,
69 marc_encoding => 'utf-8',
70 );
71
72 Options C<lookup>, C<row>, C<rules> and C<log> are mandatory while all
73 other are optional.
74
75 This function will B<die> if normalizastion can't be evaled.
76
77 Since this function isn't exported you have to call it with
78 C<WebPAC::Normalize::data_structure>.
79
80 =cut
81
82 sub data_structure {
83 my $arg = {@_};
84
85 die "need row argument" unless ($arg->{row});
86 die "need normalisation argument" unless ($arg->{rules});
87
88 no strict 'subs';
89 _set_lookup( $arg->{lookup} );
90 _set_rec( $arg->{row} );
91 _clean_ds( %{ $arg } );
92 eval "$arg->{rules}";
93 die "error evaling $arg->{rules}: $@\n" if ($@);
94
95 return _get_ds();
96 }
97
98 =head2 _set_rec
99
100 Set current record hash
101
102 _set_rec( $rec );
103
104 =cut
105
106 my $rec;
107
108 sub _set_rec {
109 $rec = shift or die "no record hash";
110 }
111
112 =head2 _get_ds
113
114 Return hash formatted as data structure
115
116 my $ds = _get_ds();
117
118 =cut
119
120 my ($out,$marc_record, $marc_encoding, $marc_repeatable_subfield, $marc_indicators);
121
122 sub _get_ds {
123 return $out;
124 }
125
126 =head2 _clean_ds
127
128 Clean data structure hash for next record
129
130 _clean_ds();
131
132 =cut
133
134 sub _clean_ds {
135 my $a = {@_};
136 ($out,$marc_record, $marc_encoding, $marc_repeatable_subfield, $marc_indicators) = ();
137 $marc_encoding = $a->{marc_encoding};
138 }
139
140 =head2 _set_lookup
141
142 Set current lookup hash
143
144 _set_lookup( $lookup );
145
146 =cut
147
148 my $lookup;
149
150 sub _set_lookup {
151 $lookup = shift;
152 }
153
154 =head2 _get_marc_fields
155
156 Get all fields defined by calls to C<marc>
157
158 $marc->add_fields( WebPAC::Normalize:_get_marc_fields() );
159
160 We are using I<magic> which detect repeatable fields only from
161 sequence of field/subfield data generated by normalization.
162
163 Repeatable field is created when there is second occurence of same subfield or
164 if any of indicators are different.
165
166 This is sane for most cases. Something like:
167
168 900a-1 900b-1 900c-1
169 900a-2 900b-2
170 900a-3
171
172 will be created from any combination of:
173
174 900a-1 900a-2 900a-3 900b-1 900b-2 900c-1
175
176 and following rules:
177
178 marc('900','a', rec('200','a') );
179 marc('900','b', rec('200','b') );
180 marc('900','c', rec('200','c') );
181
182 which might not be what you have in mind. If you need repeatable subfield,
183 define it using C<marc_repeatable_subfield> like this:
184
185 ....
186
187 =cut
188
189 sub _get_marc_fields {
190
191 return if (! $marc_record || ref($marc_record) ne 'ARRAY' || $#{ $marc_record } < 0);
192
193 # first, sort all existing fields
194 # XXX might not be needed, but modern perl might randomize elements in hash
195 my @sorted_marc_record = sort {
196 $a->[0] . $a->[3] cmp $b->[0] . $b->[3]
197 } @{ $marc_record };
198
199 @sorted_marc_record = @{ $marc_record }; ### FIXME disable sorting
200
201 # output marc fields
202 my @m;
203
204 # count unique field-subfields (used for offset when walking to next subfield)
205 my $u;
206 map { $u->{ $_->[0] . $_->[3] }++ } @sorted_marc_record;
207
208 if ($debug) {
209 warn "## marc_repeatable_subfield ", dump( $marc_repeatable_subfield ), $/;
210 warn "## marc_record ", dump( $marc_record ), $/;
211 warn "## sorted_marc_record ", dump( \@sorted_marc_record ), $/;
212 warn "## subfield count ", dump( $u ), $/;
213 }
214
215 my $len = $#sorted_marc_record;
216 my $visited;
217 my $i = 0;
218 my $field;
219
220 foreach ( 0 .. $len ) {
221
222 # find next element which isn't visited
223 while ($visited->{$i}) {
224 $i = ($i + 1) % ($len + 1);
225 }
226
227 # mark it visited
228 $visited->{$i}++;
229
230 my $row = $sorted_marc_record[$i];
231
232 # field and subfield which is key for
233 # marc_repeatable_subfield and u
234 my $fsf = $row->[0] . $row->[3];
235
236 if ($debug > 1) {
237
238 print "### field so far [", $#$field, "] : ", dump( $field ), " ", $field ? 'T' : 'F', $/;
239 print "### this [$i]: ", dump( $row ),$/;
240 print "### sf: ", $row->[3], " vs ", $field->[3],
241 $marc_repeatable_subfield->{ $row->[0] . $row->[3] } ? ' (repeatable)' : '', $/,
242 if ($#$field >= 0);
243
244 }
245
246 # if field exists
247 if ( $#$field >= 0 ) {
248 if (
249 $row->[0] ne $field->[0] || # field
250 $row->[1] ne $field->[1] || # i1
251 $row->[2] ne $field->[2] # i2
252 ) {
253 push @m, $field;
254 warn "## saved/1 ", dump( $field ),$/ if ($debug);
255 $field = $row;
256
257 } elsif (
258 ( $row->[3] lt $field->[-2] ) # subfield which is not next (e.g. a after c)
259 ||
260 ( $row->[3] eq $field->[-2] && # same subfield, but not repeatable
261 ! $marc_repeatable_subfield->{ $fsf }
262 )
263 ) {
264 push @m, $field;
265 warn "## saved/2 ", dump( $field ),$/ if ($debug);
266 $field = $row;
267
268 } else {
269 # append new subfields to existing field
270 push @$field, ( $row->[3], $row->[4] );
271 }
272 } else {
273 # insert first field
274 $field = $row;
275 }
276
277 if (! $marc_repeatable_subfield->{ $fsf }) {
278 # make step to next subfield
279 $i = ($i + $u->{ $fsf } ) % ($len + 1);
280 }
281 }
282
283 if ($#$field >= 0) {
284 push @m, $field;
285 warn "## saved/3 ", dump( $field ),$/ if ($debug);
286 }
287
288 return @m;
289 }
290
291 =head2 _debug
292
293 Change level of debug warnings
294
295 _debug( 2 );
296
297 =cut
298
299 sub _debug {
300 my $l = shift;
301 return $debug unless defined($l);
302 warn "debug level $l" if ($l > 0);
303 $debug = $l;
304 }
305
306 =head1 Functions to create C<data_structure>
307
308 Those functions generally have to first in your normalization file.
309
310 =head2 tag
311
312 Define new tag for I<search> and I<display>.
313
314 tag('Title', rec('200','a') );
315
316
317 =cut
318
319 sub tag {
320 my $name = shift or die "tag needs name as first argument";
321 my @o = grep { defined($_) && $_ ne '' } @_;
322 return unless (@o);
323 $out->{$name}->{tag} = $name;
324 $out->{$name}->{search} = \@o;
325 $out->{$name}->{display} = \@o;
326 }
327
328 =head2 display
329
330 Define tag just for I<display>
331
332 @v = display('Title', rec('200','a') );
333
334 =cut
335
336 sub display {
337 my $name = shift or die "display needs name as first argument";
338 my @o = grep { defined($_) && $_ ne '' } @_;
339 return unless (@o);
340 $out->{$name}->{tag} = $name;
341 $out->{$name}->{display} = \@o;
342 }
343
344 =head2 search
345
346 Prepare values just for I<search>
347
348 @v = search('Title', rec('200','a') );
349
350 =cut
351
352 sub search {
353 my $name = shift or die "search needs name as first argument";
354 my @o = grep { defined($_) && $_ ne '' } @_;
355 return unless (@o);
356 $out->{$name}->{tag} = $name;
357 $out->{$name}->{search} = \@o;
358 }
359
360 =head2 marc_leader
361
362 Setup fields within MARC leader or get leader
363
364 marc_leader('05','c');
365 my $leader = marc_leader();
366
367 =cut
368
369 sub marc_leader {
370 my ($offset,$value) = @_;
371
372 if ($offset) {
373 $out->{' leader'}->{ $offset } = $value;
374 } else {
375 return $out->{' leader'};
376 }
377 }
378
379 =head2 marc
380
381 Save value for MARC field
382
383 marc('900','a', rec('200','a') );
384
385 =cut
386
387 sub marc {
388 my $f = shift or die "marc needs field";
389 die "marc field must be numer" unless ($f =~ /^\d+$/);
390
391 my $sf = shift or die "marc needs subfield";
392
393 foreach (@_) {
394 my $v = $_; # make var read-write for Encode
395 next unless (defined($v) && $v !~ /^\s*$/);
396 from_to($v, 'iso-8859-2', $marc_encoding) if ($marc_encoding);
397 my ($i1,$i2) = defined($marc_indicators->{$f}) ? @{ $marc_indicators->{$f} } : (' ',' ');
398 push @{ $marc_record }, [ $f, $i1, $i2, $sf => $v ];
399 }
400 }
401
402 =head2 marc_repeatable_subfield
403
404 Save values for MARC repetable subfield
405
406 marc_repeatable_subfield('910', 'z', rec('909') );
407
408 =cut
409
410 sub marc_repeatable_subfield {
411 my ($f,$sf) = @_;
412 die "marc_repeatable_subfield need field and subfield!\n" unless ($f && $sf);
413 $marc_repeatable_subfield->{ $f . $sf }++;
414 marc(@_);
415 }
416
417 =head2 marc_indicators
418
419 Set both indicators for MARC field
420
421 marc_indicators('900', ' ', 1);
422
423 Any indicator value other than C<0-9> will be treated as undefined.
424
425 =cut
426
427 sub marc_indicators {
428 my $f = shift || die "marc_indicators need field!\n";
429 my ($i1,$i2) = @_;
430 die "marc_indicators($f, ...) need i1!\n" unless(defined($i1));
431 die "marc_indicators($f, $i1, ...) need i2!\n" unless(defined($i2));
432
433 $i1 = ' ' if ($i1 !~ /^\d$/);
434 $i2 = ' ' if ($i2 !~ /^\d$/);
435 @{ $marc_indicators->{$f} } = ($i1,$i2);
436 }
437
438 =head2 marc_compose
439
440 Save values for each MARC subfield explicitly
441
442 marc_compose('900',
443 'a', rec('200','a')
444 'b', rec('201','a')
445 'a', rec('200','b')
446 'c', rec('200','c')
447 );
448
449 =cut
450
451 sub marc_compose {
452 my $f = shift or die "marc_compose needs field";
453 die "marc_compose field must be numer" unless ($f =~ /^\d+$/);
454
455 my ($i1,$i2) = defined($marc_indicators->{$f}) ? @{ $marc_indicators->{$f} } : (' ',' ');
456 my $m = [ $f, $i1, $i2 ];
457
458 while (@_) {
459 my $sf = shift or die "marc_compose $f needs subfield";
460 my $v = shift;
461
462 next unless (defined($v) && $v !~ /^\s*$/);
463 from_to($v, 'iso-8859-2', $marc_encoding) if ($marc_encoding);
464 push @$m, ( $sf, $v );
465 warn "## ++ marc_compose($f,$sf,$v) ", dump( $m ) if ($debug > 1);
466 }
467
468 warn "## marc_compose(d) ", dump( $m ) if ($debug > 1);
469
470 push @{ $marc_record }, $m if ($#{$m} > 2);
471 }
472
473
474 =head1 Functions to extract data from input
475
476 This function should be used inside functions to create C<data_structure> described
477 above.
478
479 =head2 rec1
480
481 Return all values in some field
482
483 @v = rec1('200')
484
485 TODO: order of values is probably same as in source data, need to investigate that
486
487 =cut
488
489 sub rec1 {
490 my $f = shift;
491 return unless (defined($rec) && defined($rec->{$f}));
492 if (ref($rec->{$f}) eq 'ARRAY') {
493 return map {
494 if (ref($_) eq 'HASH') {
495 values %{$_};
496 } else {
497 $_;
498 }
499 } @{ $rec->{$f} };
500 } elsif( defined($rec->{$f}) ) {
501 return $rec->{$f};
502 }
503 }
504
505 =head2 rec2
506
507 Return all values in specific field and subfield
508
509 @v = rec2('200','a')
510
511 =cut
512
513 sub rec2 {
514 my $f = shift;
515 return unless (defined($rec && $rec->{$f}));
516 my $sf = shift;
517 return map { $_->{$sf} } grep { ref($_) eq 'HASH' && $_->{$sf} } @{ $rec->{$f} };
518 }
519
520 =head2 rec
521
522 syntaxtic sugar for
523
524 @v = rec('200')
525 @v = rec('200','a')
526
527 =cut
528
529 sub rec {
530 if ($#_ == 0) {
531 return rec1(@_);
532 } elsif ($#_ == 1) {
533 return rec2(@_);
534 }
535 }
536
537 =head2 regex
538
539 Apply regex to some or all values
540
541 @v = regex( 's/foo/bar/g', @v );
542
543 =cut
544
545 sub regex {
546 my $r = shift;
547 my @out;
548 #warn "r: $r\n", dump(\@_);
549 foreach my $t (@_) {
550 next unless ($t);
551 eval "\$t =~ $r";
552 push @out, $t if ($t && $t ne '');
553 }
554 return @out;
555 }
556
557 =head2 prefix
558
559 Prefix all values with a string
560
561 @v = prefix( 'my_', @v );
562
563 =cut
564
565 sub prefix {
566 my $p = shift or die "prefix needs string as first argument";
567 return map { $p . $_ } grep { defined($_) } @_;
568 }
569
570 =head2 suffix
571
572 suffix all values with a string
573
574 @v = suffix( '_my', @v );
575
576 =cut
577
578 sub suffix {
579 my $s = shift or die "suffix needs string as first argument";
580 return map { $_ . $s } grep { defined($_) } @_;
581 }
582
583 =head2 surround
584
585 surround all values with a two strings
586
587 @v = surround( 'prefix_', '_suffix', @v );
588
589 =cut
590
591 sub surround {
592 my $p = shift or die "surround need prefix as first argument";
593 my $s = shift or die "surround needs suffix as second argument";
594 return map { $p . $_ . $s } grep { defined($_) } @_;
595 }
596
597 =head2 first
598
599 Return first element
600
601 $v = first( @v );
602
603 =cut
604
605 sub first {
606 my $r = shift;
607 return $r;
608 }
609
610 =head2 lookup
611
612 Consult lookup hashes for some value
613
614 @v = lookup( $v );
615 @v = lookup( @v );
616
617 =cut
618
619 sub lookup {
620 my $k = shift or return;
621 return unless (defined($lookup->{$k}));
622 if (ref($lookup->{$k}) eq 'ARRAY') {
623 return @{ $lookup->{$k} };
624 } else {
625 return $lookup->{$k};
626 }
627 }
628
629 =head2 join_with
630
631 Joins walues with some delimiter
632
633 $v = join_with(", ", @v);
634
635 =cut
636
637 sub join_with {
638 my $d = shift;
639 return join($d, grep { defined($_) && $_ ne '' } @_);
640 }
641
642 =head2 split_rec_on
643
644 Split record subfield on some regex and take one of parts out
645
646 $a_before_semi_column =
647 split_rec_on('200','a', /\s*;\s*/, $part);
648
649 C<$part> is optional number of element. First element is
650 B<1>, not 0!
651
652 If there is no C<$part> parameter or C<$part> is 0, this function will
653 return all values produced by splitting.
654
655 =cut
656
657 sub split_rec_on {
658 die "split_rec_on need (fld,sf,regex[,part]" if ($#_ < 2);
659
660 my ($fld, $sf, $regex, $part) = @_;
661 warn "### regex ", ref($regex), $regex if ($debug > 2);
662
663 my @r = rec( $fld, $sf );
664 my $v = shift @r;
665 warn "### first rec($fld,$sf) = ",dump($v) if ($debug > 2);
666
667 return '' if( ! defined($v) || $v =~ /^\s*$/);
668
669 my @s = split( $regex, $v );
670 warn "## split_rec_on($fld,$sf,$regex,$part) = ",dump(@s) if ($debug > 1);
671 if ($part && $part > 0) {
672 return $s[ $part - 1 ];
673 } else {
674 return [ @s ];
675 }
676 }
677
678 # END
679 1;

  ViewVC Help
Powered by ViewVC 1.1.26