/[webpac2]/trunk/lib/WebPAC/Normalize.pm
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Contents of /trunk/lib/WebPAC/Normalize.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 562 - (show annotations)
Sun Jul 2 16:14:41 2006 UTC (17 years, 9 months ago) by dpavlin
File size: 13107 byte(s)
added marc_compose to specify manually subfield order in MARC and
split_rec_on to split single field into parts based on regex

1 package WebPAC::Normalize;
2 use Exporter 'import';
3 @EXPORT = qw/
4 _set_rec _set_lookup
5 _get_ds _clean_ds
6 _debug
7
8 tag search display
9 marc marc_indicators marc_repeatable_subfield
10 marc_compose
11
12 rec1 rec2 rec
13 regex prefix suffix surround
14 first lookup join_with
15
16 split_rec_on
17 /;
18
19 use warnings;
20 use strict;
21
22 #use base qw/WebPAC::Common/;
23 use Data::Dump qw/dump/;
24 use Encode qw/from_to/;
25
26 # debugging warn(s)
27 my $debug = 0;
28
29
30 =head1 NAME
31
32 WebPAC::Normalize - describe normalisaton rules using sets
33
34 =head1 VERSION
35
36 Version 0.08
37
38 =cut
39
40 our $VERSION = '0.08';
41
42 =head1 SYNOPSIS
43
44 This module uses C<conf/normalize/*.pl> files to perform normalisation
45 from input records using perl functions which are specialized for set
46 processing.
47
48 Sets are implemented as arrays, and normalisation file is valid perl, which
49 means that you check it's validity before running WebPAC using
50 C<perl -c normalize.pl>.
51
52 Normalisation can generate multiple output normalized data. For now, supported output
53 types (on the left side of definition) are: C<tag>, C<display>, C<search> and
54 C<marc>.
55
56 =head1 FUNCTIONS
57
58 Functions which start with C<_> are private and used by WebPAC internally.
59 All other functions are available for use within normalisation rules.
60
61 =head2 data_structure
62
63 Return data structure
64
65 my $ds = WebPAC::Normalize::data_structure(
66 lookup => $lookup->lookup_hash,
67 row => $row,
68 rules => $normalize_pl_config,
69 marc_encoding => 'utf-8',
70 );
71
72 Options C<lookup>, C<row>, C<rules> and C<log> are mandatory while all
73 other are optional.
74
75 This function will B<die> if normalizastion can't be evaled.
76
77 Since this function isn't exported you have to call it with
78 C<WebPAC::Normalize::data_structure>.
79
80 =cut
81
82 sub data_structure {
83 my $arg = {@_};
84
85 die "need row argument" unless ($arg->{row});
86 die "need normalisation argument" unless ($arg->{rules});
87
88 no strict 'subs';
89 _set_lookup( $arg->{lookup} );
90 _set_rec( $arg->{row} );
91 _clean_ds( %{ $arg } );
92 eval "$arg->{rules}";
93 die "error evaling $arg->{rules}: $@\n" if ($@);
94
95 return _get_ds();
96 }
97
98 =head2 _set_rec
99
100 Set current record hash
101
102 _set_rec( $rec );
103
104 =cut
105
106 my $rec;
107
108 sub _set_rec {
109 $rec = shift or die "no record hash";
110 }
111
112 =head2 _get_ds
113
114 Return hash formatted as data structure
115
116 my $ds = _get_ds();
117
118 =cut
119
120 my ($out,$marc_record, $marc_encoding, $marc_repeatable_subfield, $marc_indicators);
121
122 sub _get_ds {
123 return $out;
124 }
125
126 =head2 _clean_ds
127
128 Clean data structure hash for next record
129
130 _clean_ds();
131
132 =cut
133
134 sub _clean_ds {
135 my $a = {@_};
136 ($out,$marc_record, $marc_encoding, $marc_repeatable_subfield, $marc_indicators) = ();
137 $marc_encoding = $a->{marc_encoding};
138 }
139
140 =head2 _set_lookup
141
142 Set current lookup hash
143
144 _set_lookup( $lookup );
145
146 =cut
147
148 my $lookup;
149
150 sub _set_lookup {
151 $lookup = shift;
152 }
153
154 =head2 _get_marc_fields
155
156 Get all fields defined by calls to C<marc>
157
158 $marc->add_fields( WebPAC::Normalize:_get_marc_fields() );
159
160 We are using I<magic> which detect repeatable fields only from
161 sequence of field/subfield data generated by normalization.
162
163 Repeatable field is created when there is second occurence of same subfield or
164 if any of indicators are different.
165
166 This is sane for most cases. Something like:
167
168 900a-1 900b-1 900c-1
169 900a-2 900b-2
170 900a-3
171
172 will be created from any combination of:
173
174 900a-1 900a-2 900a-3 900b-1 900b-2 900c-1
175
176 and following rules:
177
178 marc('900','a', rec('200','a') );
179 marc('900','b', rec('200','b') );
180 marc('900','c', rec('200','c') );
181
182 which might not be what you have in mind. If you need repeatable subfield,
183 define it using C<marc_repeatable_subfield> like this:
184
185 ....
186
187 =cut
188
189 sub _get_marc_fields {
190
191 return if (! $marc_record || ref($marc_record) ne 'ARRAY' || $#{ $marc_record } < 0);
192
193 # first, sort all existing fields
194 # XXX might not be needed, but modern perl might randomize elements in hash
195 my @sorted_marc_record = sort {
196 $a->[0] . $a->[3] cmp $b->[0] . $b->[3]
197 } @{ $marc_record };
198
199 @sorted_marc_record = @{ $marc_record }; ### FIXME disable sorting
200
201 # output marc fields
202 my @m;
203
204 # count unique field-subfields (used for offset when walking to next subfield)
205 my $u;
206 map { $u->{ $_->[0] . $_->[3] }++ } @sorted_marc_record;
207
208 if ($debug) {
209 warn "## marc_repeatable_subfield ", dump( $marc_repeatable_subfield ), $/;
210 warn "## marc_record ", dump( $marc_record ), $/;
211 warn "## sorted_marc_record ", dump( \@sorted_marc_record ), $/;
212 warn "## subfield count ", dump( $u ), $/;
213 }
214
215 my $len = $#sorted_marc_record;
216 my $visited;
217 my $i = 0;
218 my $field;
219
220 foreach ( 0 .. $len ) {
221
222 # find next element which isn't visited
223 while ($visited->{$i}) {
224 $i = ($i + 1) % ($len + 1);
225 }
226
227 # mark it visited
228 $visited->{$i}++;
229
230 my $row = $sorted_marc_record[$i];
231
232 # field and subfield which is key for
233 # marc_repeatable_subfield and u
234 my $fsf = $row->[0] . $row->[3];
235
236 if ($debug > 1) {
237
238 print "### field so far [", $#$field, "] : ", dump( $field ), " ", $field ? 'T' : 'F', $/;
239 print "### this [$i]: ", dump( $row ),$/;
240 print "### sf: ", $row->[3], " vs ", $field->[3],
241 $marc_repeatable_subfield->{ $row->[0] . $row->[3] } ? ' (repeatable)' : '', $/,
242 if ($#$field >= 0);
243
244 }
245
246 # if field exists
247 if ( $#$field >= 0 ) {
248 if (
249 $row->[0] ne $field->[0] || # field
250 $row->[1] ne $field->[1] || # i1
251 $row->[2] ne $field->[2] # i2
252 ) {
253 push @m, $field;
254 warn "## saved/1 ", dump( $field ),$/ if ($debug);
255 $field = $row;
256
257 } elsif (
258 ( $row->[3] lt $field->[-2] ) # subfield which is not next (e.g. a after c)
259 ||
260 ( $row->[3] eq $field->[-2] && # same subfield, but not repeatable
261 ! $marc_repeatable_subfield->{ $fsf }
262 )
263 ) {
264 push @m, $field;
265 warn "## saved/2 ", dump( $field ),$/ if ($debug);
266 $field = $row;
267
268 } else {
269 # append new subfields to existing field
270 push @$field, ( $row->[3], $row->[4] );
271 }
272 } else {
273 # insert first field
274 $field = $row;
275 }
276
277 if (! $marc_repeatable_subfield->{ $fsf }) {
278 # make step to next subfield
279 $i = ($i + $u->{ $fsf } ) % ($len + 1);
280 }
281 }
282
283 if ($#$field >= 0) {
284 push @m, $field;
285 warn "## saved/3 ", dump( $field ),$/ if ($debug);
286 }
287
288 return @m;
289 }
290
291 =head2 _debug
292
293 Change level of debug warnings
294
295 _debug( 2 );
296
297 =cut
298
299 sub _debug {
300 my $l = shift;
301 return $debug unless defined($l);
302 warn "debug level $l" if ($l > 0);
303 $debug = $l;
304 }
305
306 =head1 Functions to create C<data_structure>
307
308 Those functions generally have to first in your normalization file.
309
310 =head2 tag
311
312 Define new tag for I<search> and I<display>.
313
314 tag('Title', rec('200','a') );
315
316
317 =cut
318
319 sub tag {
320 my $name = shift or die "tag needs name as first argument";
321 my @o = grep { defined($_) && $_ ne '' } @_;
322 return unless (@o);
323 $out->{$name}->{tag} = $name;
324 $out->{$name}->{search} = \@o;
325 $out->{$name}->{display} = \@o;
326 }
327
328 =head2 display
329
330 Define tag just for I<display>
331
332 @v = display('Title', rec('200','a') );
333
334 =cut
335
336 sub display {
337 my $name = shift or die "display needs name as first argument";
338 my @o = grep { defined($_) && $_ ne '' } @_;
339 return unless (@o);
340 $out->{$name}->{tag} = $name;
341 $out->{$name}->{display} = \@o;
342 }
343
344 =head2 search
345
346 Prepare values just for I<search>
347
348 @v = search('Title', rec('200','a') );
349
350 =cut
351
352 sub search {
353 my $name = shift or die "search needs name as first argument";
354 my @o = grep { defined($_) && $_ ne '' } @_;
355 return unless (@o);
356 $out->{$name}->{tag} = $name;
357 $out->{$name}->{search} = \@o;
358 }
359
360 =head2 marc
361
362 Save value for MARC field
363
364 marc('900','a', rec('200','a') );
365
366 =cut
367
368 sub marc {
369 my $f = shift or die "marc needs field";
370 die "marc field must be numer" unless ($f =~ /^\d+$/);
371
372 my $sf = shift or die "marc needs subfield";
373
374 foreach (@_) {
375 my $v = $_; # make var read-write for Encode
376 next unless (defined($v) && $v !~ /^\s*$/);
377 from_to($v, 'iso-8859-2', $marc_encoding) if ($marc_encoding);
378 my ($i1,$i2) = defined($marc_indicators->{$f}) ? @{ $marc_indicators->{$f} } : (' ',' ');
379 push @{ $marc_record }, [ $f, $i1, $i2, $sf => $v ];
380 }
381 }
382
383 =head2 marc_repeatable_subfield
384
385 Save values for MARC repetable subfield
386
387 marc_repeatable_subfield('910', 'z', rec('909') );
388
389 =cut
390
391 sub marc_repeatable_subfield {
392 my ($f,$sf) = @_;
393 die "marc_repeatable_subfield need field and subfield!\n" unless ($f && $sf);
394 $marc_repeatable_subfield->{ $f . $sf }++;
395 marc(@_);
396 }
397
398 =head2 marc_indicators
399
400 Set both indicators for MARC field
401
402 marc_indicators('900', ' ', 1);
403
404 Any indicator value other than C<0-9> will be treated as undefined.
405
406 =cut
407
408 sub marc_indicators {
409 my $f = shift || die "marc_indicators need field!\n";
410 my ($i1,$i2) = @_;
411 die "marc_indicators($f, ...) need i1!\n" unless(defined($i1));
412 die "marc_indicators($f, $i1, ...) need i2!\n" unless(defined($i2));
413
414 $i1 = ' ' if ($i1 !~ /^\d$/);
415 $i2 = ' ' if ($i2 !~ /^\d$/);
416 @{ $marc_indicators->{$f} } = ($i1,$i2);
417 }
418
419 =head2 marc_compose
420
421 Save values for each MARC subfield explicitly
422
423 marc_compose('900',
424 'a', rec('200','a')
425 'b', rec('201','a')
426 'a', rec('200','b')
427 'c', rec('200','c')
428 );
429
430 =cut
431
432 sub marc_compose {
433 my $f = shift or die "marc_compose needs field";
434 die "marc_compose field must be numer" unless ($f =~ /^\d+$/);
435
436 my ($i1,$i2) = defined($marc_indicators->{$f}) ? @{ $marc_indicators->{$f} } : (' ',' ');
437 my $m = [ $f, $i1, $i2 ];
438
439 while (@_) {
440 my $sf = shift or die "marc_compose $f needs subfield";
441 my $v = shift or die "marc_compose $f needs value for subfield $sf";
442
443 next unless (defined($v) && $v !~ /^\s*$/);
444 from_to($v, 'iso-8859-2', $marc_encoding) if ($marc_encoding);
445 push @$m, ( $sf, $v );
446 warn "## ++ marc_compose($f,$sf,$v) ", dump( $m ) if ($debug > 1);
447 }
448
449 warn "## marc_compose(d) ", dump( $m ) if ($debug > 1);
450
451 push @{ $marc_record }, $m;
452 }
453
454
455 =head1 Functions to extract data from input
456
457 This function should be used inside functions to create C<data_structure> described
458 above.
459
460 =head2 rec1
461
462 Return all values in some field
463
464 @v = rec1('200')
465
466 TODO: order of values is probably same as in source data, need to investigate that
467
468 =cut
469
470 sub rec1 {
471 my $f = shift;
472 return unless (defined($rec) && defined($rec->{$f}));
473 if (ref($rec->{$f}) eq 'ARRAY') {
474 return map {
475 if (ref($_) eq 'HASH') {
476 values %{$_};
477 } else {
478 $_;
479 }
480 } @{ $rec->{$f} };
481 } elsif( defined($rec->{$f}) ) {
482 return $rec->{$f};
483 }
484 }
485
486 =head2 rec2
487
488 Return all values in specific field and subfield
489
490 @v = rec2('200','a')
491
492 =cut
493
494 sub rec2 {
495 my $f = shift;
496 return unless (defined($rec && $rec->{$f}));
497 my $sf = shift;
498 return map { $_->{$sf} } grep { ref($_) eq 'HASH' && $_->{$sf} } @{ $rec->{$f} };
499 }
500
501 =head2 rec
502
503 syntaxtic sugar for
504
505 @v = rec('200')
506 @v = rec('200','a')
507
508 =cut
509
510 sub rec {
511 if ($#_ == 0) {
512 return rec1(@_);
513 } elsif ($#_ == 1) {
514 return rec2(@_);
515 }
516 }
517
518 =head2 regex
519
520 Apply regex to some or all values
521
522 @v = regex( 's/foo/bar/g', @v );
523
524 =cut
525
526 sub regex {
527 my $r = shift;
528 my @out;
529 #warn "r: $r\n", dump(\@_);
530 foreach my $t (@_) {
531 next unless ($t);
532 eval "\$t =~ $r";
533 push @out, $t if ($t && $t ne '');
534 }
535 return @out;
536 }
537
538 =head2 prefix
539
540 Prefix all values with a string
541
542 @v = prefix( 'my_', @v );
543
544 =cut
545
546 sub prefix {
547 my $p = shift or die "prefix needs string as first argument";
548 return map { $p . $_ } grep { defined($_) } @_;
549 }
550
551 =head2 suffix
552
553 suffix all values with a string
554
555 @v = suffix( '_my', @v );
556
557 =cut
558
559 sub suffix {
560 my $s = shift or die "suffix needs string as first argument";
561 return map { $_ . $s } grep { defined($_) } @_;
562 }
563
564 =head2 surround
565
566 surround all values with a two strings
567
568 @v = surround( 'prefix_', '_suffix', @v );
569
570 =cut
571
572 sub surround {
573 my $p = shift or die "surround need prefix as first argument";
574 my $s = shift or die "surround needs suffix as second argument";
575 return map { $p . $_ . $s } grep { defined($_) } @_;
576 }
577
578 =head2 first
579
580 Return first element
581
582 $v = first( @v );
583
584 =cut
585
586 sub first {
587 my $r = shift;
588 return $r;
589 }
590
591 =head2 lookup
592
593 Consult lookup hashes for some value
594
595 @v = lookup( $v );
596 @v = lookup( @v );
597
598 =cut
599
600 sub lookup {
601 my $k = shift or return;
602 return unless (defined($lookup->{$k}));
603 if (ref($lookup->{$k}) eq 'ARRAY') {
604 return @{ $lookup->{$k} };
605 } else {
606 return $lookup->{$k};
607 }
608 }
609
610 =head2 join_with
611
612 Joins walues with some delimiter
613
614 $v = join_with(", ", @v);
615
616 =cut
617
618 sub join_with {
619 my $d = shift;
620 return join($d, grep { defined($_) && $_ ne '' } @_);
621 }
622
623 =head2 split_rec_on
624
625 Split record subfield on some regex and take one of parts out
626
627 $a_before_semi_column =
628 split_rec_on('200','a', /\s*;\s*/, $part);
629
630 C<$part> is optional number of element. First element is
631 B<1>, not 0!
632
633 If there is no C<$part> parameter or C<$part> is 0, this function will
634 return all values produced by splitting.
635
636 =cut
637
638 sub split_rec_on {
639 die "split_rec_on need (fld,sf,regex[,part]" if ($#_ < 2);
640
641 my ($fld, $sf, $regex, $part) = @_;
642 warn "### regex ", ref($regex), $regex if ($debug > 2);
643
644 my @r = rec( $fld, $sf );
645 my $v = shift @r;
646 warn "### first rec($fld,$sf) = ",dump($v) if ($debug > 2);
647
648 my @s = split( $regex, $v );
649 warn "## split_rec_on($fld,$sf,$regex,$part) = ",dump(@s) if ($debug > 1);
650 if ($part > 0) {
651 return $s[ $part - 1 ];
652 } else {
653 return @s;
654 }
655 }
656
657 # END
658 1;

  ViewVC Help
Powered by ViewVC 1.1.26