/[webpac2]/trunk/lib/WebPAC/Normalize.pm
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Annotation of /trunk/lib/WebPAC/Normalize.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 562 - (hide annotations)
Sun Jul 2 16:14:41 2006 UTC (16 years, 7 months ago) by dpavlin
File size: 13107 byte(s)
added marc_compose to specify manually subfield order in MARC and
split_rec_on to split single field into parts based on regex

1 dpavlin 10 package WebPAC::Normalize;
2 dpavlin 536 use Exporter 'import';
3     @EXPORT = qw/
4 dpavlin 538 _set_rec _set_lookup
5     _get_ds _clean_ds
6 dpavlin 554 _debug
7 dpavlin 538
8 dpavlin 536 tag search display
9 dpavlin 547 marc marc_indicators marc_repeatable_subfield
10 dpavlin 562 marc_compose
11 dpavlin 540
12 dpavlin 536 rec1 rec2 rec
13     regex prefix suffix surround
14     first lookup join_with
15 dpavlin 562
16     split_rec_on
17 dpavlin 536 /;
18 dpavlin 10
19     use warnings;
20     use strict;
21 dpavlin 536
22     #use base qw/WebPAC::Common/;
23 dpavlin 550 use Data::Dump qw/dump/;
24 dpavlin 541 use Encode qw/from_to/;
25 dpavlin 10
26 dpavlin 550 # debugging warn(s)
27     my $debug = 0;
28    
29    
30 dpavlin 10 =head1 NAME
31    
32 dpavlin 536 WebPAC::Normalize - describe normalisaton rules using sets
33 dpavlin 10
34     =head1 VERSION
35    
36 dpavlin 562 Version 0.08
37 dpavlin 10
38     =cut
39    
40 dpavlin 562 our $VERSION = '0.08';
41 dpavlin 10
42     =head1 SYNOPSIS
43    
44 dpavlin 536 This module uses C<conf/normalize/*.pl> files to perform normalisation
45     from input records using perl functions which are specialized for set
46     processing.
47 dpavlin 10
48 dpavlin 536 Sets are implemented as arrays, and normalisation file is valid perl, which
49     means that you check it's validity before running WebPAC using
50     C<perl -c normalize.pl>.
51 dpavlin 15
52 dpavlin 536 Normalisation can generate multiple output normalized data. For now, supported output
53 dpavlin 540 types (on the left side of definition) are: C<tag>, C<display>, C<search> and
54 dpavlin 547 C<marc>.
55 dpavlin 15
56 dpavlin 10 =head1 FUNCTIONS
57    
58 dpavlin 538 Functions which start with C<_> are private and used by WebPAC internally.
59     All other functions are available for use within normalisation rules.
60    
61 dpavlin 536 =head2 data_structure
62 dpavlin 10
63 dpavlin 536 Return data structure
64 dpavlin 13
65 dpavlin 538 my $ds = WebPAC::Normalize::data_structure(
66 dpavlin 536 lookup => $lookup->lookup_hash,
67     row => $row,
68     rules => $normalize_pl_config,
69 dpavlin 541 marc_encoding => 'utf-8',
70 dpavlin 13 );
71    
72 dpavlin 540 Options C<lookup>, C<row>, C<rules> and C<log> are mandatory while all
73     other are optional.
74    
75 dpavlin 536 This function will B<die> if normalizastion can't be evaled.
76 dpavlin 15
77 dpavlin 538 Since this function isn't exported you have to call it with
78     C<WebPAC::Normalize::data_structure>.
79    
80 dpavlin 10 =cut
81    
82 dpavlin 536 sub data_structure {
83     my $arg = {@_};
84 dpavlin 13
85 dpavlin 536 die "need row argument" unless ($arg->{row});
86     die "need normalisation argument" unless ($arg->{rules});
87 dpavlin 31
88 dpavlin 536 no strict 'subs';
89 dpavlin 538 _set_lookup( $arg->{lookup} );
90     _set_rec( $arg->{row} );
91 dpavlin 541 _clean_ds( %{ $arg } );
92 dpavlin 536 eval "$arg->{rules}";
93     die "error evaling $arg->{rules}: $@\n" if ($@);
94 dpavlin 540
95 dpavlin 538 return _get_ds();
96 dpavlin 10 }
97    
98 dpavlin 538 =head2 _set_rec
99 dpavlin 13
100 dpavlin 536 Set current record hash
101 dpavlin 433
102 dpavlin 538 _set_rec( $rec );
103 dpavlin 433
104     =cut
105    
106 dpavlin 536 my $rec;
107 dpavlin 433
108 dpavlin 538 sub _set_rec {
109 dpavlin 536 $rec = shift or die "no record hash";
110 dpavlin 433 }
111    
112 dpavlin 538 =head2 _get_ds
113    
114     Return hash formatted as data structure
115    
116     my $ds = _get_ds();
117    
118     =cut
119    
120 dpavlin 547 my ($out,$marc_record, $marc_encoding, $marc_repeatable_subfield, $marc_indicators);
121 dpavlin 538
122     sub _get_ds {
123     return $out;
124     }
125    
126     =head2 _clean_ds
127    
128     Clean data structure hash for next record
129    
130     _clean_ds();
131    
132     =cut
133    
134     sub _clean_ds {
135 dpavlin 541 my $a = {@_};
136 dpavlin 550 ($out,$marc_record, $marc_encoding, $marc_repeatable_subfield, $marc_indicators) = ();
137 dpavlin 541 $marc_encoding = $a->{marc_encoding};
138 dpavlin 538 }
139    
140     =head2 _set_lookup
141    
142     Set current lookup hash
143    
144     _set_lookup( $lookup );
145    
146     =cut
147    
148     my $lookup;
149    
150     sub _set_lookup {
151     $lookup = shift;
152     }
153    
154 dpavlin 547 =head2 _get_marc_fields
155 dpavlin 540
156 dpavlin 547 Get all fields defined by calls to C<marc>
157 dpavlin 540
158 dpavlin 547 $marc->add_fields( WebPAC::Normalize:_get_marc_fields() );
159 dpavlin 540
160 dpavlin 543 We are using I<magic> which detect repeatable fields only from
161     sequence of field/subfield data generated by normalization.
162    
163 dpavlin 554 Repeatable field is created when there is second occurence of same subfield or
164     if any of indicators are different.
165 dpavlin 543
166 dpavlin 554 This is sane for most cases. Something like:
167 dpavlin 543
168 dpavlin 554 900a-1 900b-1 900c-1
169     900a-2 900b-2
170     900a-3
171    
172     will be created from any combination of:
173    
174     900a-1 900a-2 900a-3 900b-1 900b-2 900c-1
175    
176     and following rules:
177    
178     marc('900','a', rec('200','a') );
179     marc('900','b', rec('200','b') );
180     marc('900','c', rec('200','c') );
181    
182     which might not be what you have in mind. If you need repeatable subfield,
183     define it using C<marc_repeatable_subfield> like this:
184    
185     ....
186    
187 dpavlin 540 =cut
188    
189 dpavlin 547 sub _get_marc_fields {
190 dpavlin 550
191 dpavlin 551 return if (! $marc_record || ref($marc_record) ne 'ARRAY' || $#{ $marc_record } < 0);
192 dpavlin 550
193     # first, sort all existing fields
194     # XXX might not be needed, but modern perl might randomize elements in hash
195     my @sorted_marc_record = sort {
196     $a->[0] . $a->[3] cmp $b->[0] . $b->[3]
197     } @{ $marc_record };
198    
199 dpavlin 562 @sorted_marc_record = @{ $marc_record }; ### FIXME disable sorting
200    
201 dpavlin 550 # output marc fields
202 dpavlin 542 my @m;
203 dpavlin 550
204     # count unique field-subfields (used for offset when walking to next subfield)
205     my $u;
206     map { $u->{ $_->[0] . $_->[3] }++ } @sorted_marc_record;
207    
208     if ($debug) {
209     warn "## marc_repeatable_subfield ", dump( $marc_repeatable_subfield ), $/;
210     warn "## marc_record ", dump( $marc_record ), $/;
211     warn "## sorted_marc_record ", dump( \@sorted_marc_record ), $/;
212     warn "## subfield count ", dump( $u ), $/;
213     }
214    
215     my $len = $#sorted_marc_record;
216     my $visited;
217     my $i = 0;
218     my $field;
219    
220     foreach ( 0 .. $len ) {
221    
222     # find next element which isn't visited
223     while ($visited->{$i}) {
224     $i = ($i + 1) % ($len + 1);
225 dpavlin 542 }
226    
227 dpavlin 550 # mark it visited
228     $visited->{$i}++;
229    
230     my $row = $sorted_marc_record[$i];
231    
232     # field and subfield which is key for
233     # marc_repeatable_subfield and u
234     my $fsf = $row->[0] . $row->[3];
235    
236     if ($debug > 1) {
237    
238     print "### field so far [", $#$field, "] : ", dump( $field ), " ", $field ? 'T' : 'F', $/;
239     print "### this [$i]: ", dump( $row ),$/;
240     print "### sf: ", $row->[3], " vs ", $field->[3],
241     $marc_repeatable_subfield->{ $row->[0] . $row->[3] } ? ' (repeatable)' : '', $/,
242     if ($#$field >= 0);
243    
244     }
245    
246     # if field exists
247     if ( $#$field >= 0 ) {
248     if (
249     $row->[0] ne $field->[0] || # field
250     $row->[1] ne $field->[1] || # i1
251     $row->[2] ne $field->[2] # i2
252     ) {
253     push @m, $field;
254     warn "## saved/1 ", dump( $field ),$/ if ($debug);
255     $field = $row;
256    
257     } elsif (
258     ( $row->[3] lt $field->[-2] ) # subfield which is not next (e.g. a after c)
259     ||
260     ( $row->[3] eq $field->[-2] && # same subfield, but not repeatable
261     ! $marc_repeatable_subfield->{ $fsf }
262     )
263     ) {
264     push @m, $field;
265     warn "## saved/2 ", dump( $field ),$/ if ($debug);
266     $field = $row;
267    
268     } else {
269     # append new subfields to existing field
270     push @$field, ( $row->[3], $row->[4] );
271     }
272     } else {
273     # insert first field
274     $field = $row;
275     }
276    
277     if (! $marc_repeatable_subfield->{ $fsf }) {
278     # make step to next subfield
279     $i = ($i + $u->{ $fsf } ) % ($len + 1);
280     }
281 dpavlin 542 }
282    
283 dpavlin 550 if ($#$field >= 0) {
284     push @m, $field;
285     warn "## saved/3 ", dump( $field ),$/ if ($debug);
286     }
287 dpavlin 542
288     return @m;
289 dpavlin 540 }
290    
291 dpavlin 554 =head2 _debug
292    
293     Change level of debug warnings
294    
295     _debug( 2 );
296    
297     =cut
298    
299     sub _debug {
300     my $l = shift;
301     return $debug unless defined($l);
302 dpavlin 562 warn "debug level $l" if ($l > 0);
303 dpavlin 554 $debug = $l;
304     }
305    
306 dpavlin 540 =head1 Functions to create C<data_structure>
307    
308     Those functions generally have to first in your normalization file.
309    
310 dpavlin 536 =head2 tag
311 dpavlin 433
312 dpavlin 536 Define new tag for I<search> and I<display>.
313 dpavlin 433
314 dpavlin 536 tag('Title', rec('200','a') );
315 dpavlin 13
316    
317     =cut
318    
319 dpavlin 536 sub tag {
320     my $name = shift or die "tag needs name as first argument";
321     my @o = grep { defined($_) && $_ ne '' } @_;
322     return unless (@o);
323     $out->{$name}->{tag} = $name;
324     $out->{$name}->{search} = \@o;
325     $out->{$name}->{display} = \@o;
326     }
327 dpavlin 13
328 dpavlin 536 =head2 display
329 dpavlin 13
330 dpavlin 536 Define tag just for I<display>
331 dpavlin 125
332 dpavlin 536 @v = display('Title', rec('200','a') );
333 dpavlin 125
334 dpavlin 536 =cut
335 dpavlin 125
336 dpavlin 536 sub display {
337     my $name = shift or die "display needs name as first argument";
338     my @o = grep { defined($_) && $_ ne '' } @_;
339     return unless (@o);
340     $out->{$name}->{tag} = $name;
341     $out->{$name}->{display} = \@o;
342     }
343 dpavlin 13
344 dpavlin 536 =head2 search
345 dpavlin 13
346 dpavlin 536 Prepare values just for I<search>
347 dpavlin 13
348 dpavlin 536 @v = search('Title', rec('200','a') );
349 dpavlin 433
350 dpavlin 536 =cut
351 dpavlin 13
352 dpavlin 536 sub search {
353     my $name = shift or die "search needs name as first argument";
354     my @o = grep { defined($_) && $_ ne '' } @_;
355     return unless (@o);
356     $out->{$name}->{tag} = $name;
357     $out->{$name}->{search} = \@o;
358 dpavlin 13 }
359    
360 dpavlin 547 =head2 marc
361 dpavlin 540
362     Save value for MARC field
363    
364 dpavlin 547 marc('900','a', rec('200','a') );
365 dpavlin 540
366     =cut
367    
368 dpavlin 547 sub marc {
369     my $f = shift or die "marc needs field";
370     die "marc field must be numer" unless ($f =~ /^\d+$/);
371 dpavlin 540
372 dpavlin 547 my $sf = shift or die "marc needs subfield";
373 dpavlin 540
374 dpavlin 541 foreach (@_) {
375     my $v = $_; # make var read-write for Encode
376 dpavlin 543 next unless (defined($v) && $v !~ /^\s*$/);
377 dpavlin 541 from_to($v, 'iso-8859-2', $marc_encoding) if ($marc_encoding);
378 dpavlin 548 my ($i1,$i2) = defined($marc_indicators->{$f}) ? @{ $marc_indicators->{$f} } : (' ',' ');
379     push @{ $marc_record }, [ $f, $i1, $i2, $sf => $v ];
380 dpavlin 540 }
381     }
382    
383 dpavlin 547 =head2 marc_repeatable_subfield
384    
385     Save values for MARC repetable subfield
386    
387     marc_repeatable_subfield('910', 'z', rec('909') );
388    
389     =cut
390    
391     sub marc_repeatable_subfield {
392 dpavlin 550 my ($f,$sf) = @_;
393     die "marc_repeatable_subfield need field and subfield!\n" unless ($f && $sf);
394     $marc_repeatable_subfield->{ $f . $sf }++;
395 dpavlin 547 marc(@_);
396     }
397    
398     =head2 marc_indicators
399    
400     Set both indicators for MARC field
401    
402     marc_indicators('900', ' ', 1);
403    
404     Any indicator value other than C<0-9> will be treated as undefined.
405    
406     =cut
407    
408     sub marc_indicators {
409     my $f = shift || die "marc_indicators need field!\n";
410     my ($i1,$i2) = @_;
411     die "marc_indicators($f, ...) need i1!\n" unless(defined($i1));
412     die "marc_indicators($f, $i1, ...) need i2!\n" unless(defined($i2));
413    
414     $i1 = ' ' if ($i1 !~ /^\d$/);
415     $i2 = ' ' if ($i2 !~ /^\d$/);
416 dpavlin 548 @{ $marc_indicators->{$f} } = ($i1,$i2);
417 dpavlin 547 }
418    
419 dpavlin 562 =head2 marc_compose
420 dpavlin 547
421 dpavlin 562 Save values for each MARC subfield explicitly
422    
423     marc_compose('900',
424     'a', rec('200','a')
425     'b', rec('201','a')
426     'a', rec('200','b')
427     'c', rec('200','c')
428     );
429    
430     =cut
431    
432     sub marc_compose {
433     my $f = shift or die "marc_compose needs field";
434     die "marc_compose field must be numer" unless ($f =~ /^\d+$/);
435    
436     my ($i1,$i2) = defined($marc_indicators->{$f}) ? @{ $marc_indicators->{$f} } : (' ',' ');
437     my $m = [ $f, $i1, $i2 ];
438    
439     while (@_) {
440     my $sf = shift or die "marc_compose $f needs subfield";
441     my $v = shift or die "marc_compose $f needs value for subfield $sf";
442    
443     next unless (defined($v) && $v !~ /^\s*$/);
444     from_to($v, 'iso-8859-2', $marc_encoding) if ($marc_encoding);
445     push @$m, ( $sf, $v );
446     warn "## ++ marc_compose($f,$sf,$v) ", dump( $m ) if ($debug > 1);
447     }
448    
449     warn "## marc_compose(d) ", dump( $m ) if ($debug > 1);
450    
451     push @{ $marc_record }, $m;
452     }
453    
454    
455 dpavlin 540 =head1 Functions to extract data from input
456    
457     This function should be used inside functions to create C<data_structure> described
458     above.
459    
460 dpavlin 536 =head2 rec1
461 dpavlin 371
462 dpavlin 536 Return all values in some field
463 dpavlin 371
464 dpavlin 536 @v = rec1('200')
465 dpavlin 15
466 dpavlin 536 TODO: order of values is probably same as in source data, need to investigate that
467 dpavlin 15
468 dpavlin 536 =cut
469 dpavlin 15
470 dpavlin 536 sub rec1 {
471     my $f = shift;
472     return unless (defined($rec) && defined($rec->{$f}));
473     if (ref($rec->{$f}) eq 'ARRAY') {
474     return map {
475     if (ref($_) eq 'HASH') {
476     values %{$_};
477 dpavlin 31 } else {
478 dpavlin 536 $_;
479 dpavlin 31 }
480 dpavlin 536 } @{ $rec->{$f} };
481     } elsif( defined($rec->{$f}) ) {
482     return $rec->{$f};
483 dpavlin 15 }
484     }
485    
486 dpavlin 536 =head2 rec2
487 dpavlin 15
488 dpavlin 536 Return all values in specific field and subfield
489 dpavlin 13
490 dpavlin 536 @v = rec2('200','a')
491 dpavlin 13
492     =cut
493    
494 dpavlin 536 sub rec2 {
495     my $f = shift;
496     return unless (defined($rec && $rec->{$f}));
497     my $sf = shift;
498     return map { $_->{$sf} } grep { ref($_) eq 'HASH' && $_->{$sf} } @{ $rec->{$f} };
499     }
500 dpavlin 13
501 dpavlin 536 =head2 rec
502 dpavlin 13
503 dpavlin 536 syntaxtic sugar for
504 dpavlin 13
505 dpavlin 536 @v = rec('200')
506     @v = rec('200','a')
507 dpavlin 13
508 dpavlin 536 =cut
509 dpavlin 373
510 dpavlin 536 sub rec {
511     if ($#_ == 0) {
512     return rec1(@_);
513     } elsif ($#_ == 1) {
514     return rec2(@_);
515 dpavlin 13 }
516     }
517    
518 dpavlin 536 =head2 regex
519 dpavlin 15
520 dpavlin 536 Apply regex to some or all values
521 dpavlin 15
522 dpavlin 536 @v = regex( 's/foo/bar/g', @v );
523 dpavlin 15
524     =cut
525    
526 dpavlin 536 sub regex {
527     my $r = shift;
528     my @out;
529 dpavlin 550 #warn "r: $r\n", dump(\@_);
530 dpavlin 536 foreach my $t (@_) {
531     next unless ($t);
532     eval "\$t =~ $r";
533     push @out, $t if ($t && $t ne '');
534 dpavlin 368 }
535 dpavlin 536 return @out;
536 dpavlin 15 }
537    
538 dpavlin 536 =head2 prefix
539 dpavlin 15
540 dpavlin 536 Prefix all values with a string
541 dpavlin 15
542 dpavlin 536 @v = prefix( 'my_', @v );
543 dpavlin 15
544     =cut
545    
546 dpavlin 536 sub prefix {
547     my $p = shift or die "prefix needs string as first argument";
548     return map { $p . $_ } grep { defined($_) } @_;
549     }
550 dpavlin 15
551 dpavlin 536 =head2 suffix
552 dpavlin 15
553 dpavlin 536 suffix all values with a string
554 dpavlin 15
555 dpavlin 536 @v = suffix( '_my', @v );
556 dpavlin 15
557 dpavlin 536 =cut
558 dpavlin 15
559 dpavlin 536 sub suffix {
560     my $s = shift or die "suffix needs string as first argument";
561     return map { $_ . $s } grep { defined($_) } @_;
562 dpavlin 15 }
563    
564 dpavlin 536 =head2 surround
565 dpavlin 13
566 dpavlin 536 surround all values with a two strings
567 dpavlin 13
568 dpavlin 536 @v = surround( 'prefix_', '_suffix', @v );
569 dpavlin 13
570     =cut
571    
572 dpavlin 536 sub surround {
573     my $p = shift or die "surround need prefix as first argument";
574     my $s = shift or die "surround needs suffix as second argument";
575     return map { $p . $_ . $s } grep { defined($_) } @_;
576 dpavlin 13 }
577    
578 dpavlin 536 =head2 first
579 dpavlin 13
580 dpavlin 536 Return first element
581 dpavlin 15
582 dpavlin 536 $v = first( @v );
583 dpavlin 13
584     =cut
585    
586 dpavlin 536 sub first {
587     my $r = shift;
588     return $r;
589 dpavlin 13 }
590    
591 dpavlin 536 =head2 lookup
592 dpavlin 13
593 dpavlin 536 Consult lookup hashes for some value
594 dpavlin 13
595 dpavlin 536 @v = lookup( $v );
596     @v = lookup( @v );
597 dpavlin 13
598     =cut
599    
600 dpavlin 536 sub lookup {
601     my $k = shift or return;
602     return unless (defined($lookup->{$k}));
603     if (ref($lookup->{$k}) eq 'ARRAY') {
604     return @{ $lookup->{$k} };
605     } else {
606     return $lookup->{$k};
607     }
608 dpavlin 13 }
609    
610 dpavlin 536 =head2 join_with
611 dpavlin 13
612 dpavlin 536 Joins walues with some delimiter
613 dpavlin 10
614 dpavlin 536 $v = join_with(", ", @v);
615 dpavlin 10
616 dpavlin 536 =cut
617 dpavlin 10
618 dpavlin 536 sub join_with {
619     my $d = shift;
620     return join($d, grep { defined($_) && $_ ne '' } @_);
621     }
622 dpavlin 10
623 dpavlin 562 =head2 split_rec_on
624    
625     Split record subfield on some regex and take one of parts out
626    
627     $a_before_semi_column =
628     split_rec_on('200','a', /\s*;\s*/, $part);
629    
630     C<$part> is optional number of element. First element is
631     B<1>, not 0!
632    
633     If there is no C<$part> parameter or C<$part> is 0, this function will
634     return all values produced by splitting.
635    
636     =cut
637    
638     sub split_rec_on {
639     die "split_rec_on need (fld,sf,regex[,part]" if ($#_ < 2);
640    
641     my ($fld, $sf, $regex, $part) = @_;
642     warn "### regex ", ref($regex), $regex if ($debug > 2);
643    
644     my @r = rec( $fld, $sf );
645     my $v = shift @r;
646     warn "### first rec($fld,$sf) = ",dump($v) if ($debug > 2);
647    
648     my @s = split( $regex, $v );
649     warn "## split_rec_on($fld,$sf,$regex,$part) = ",dump(@s) if ($debug > 1);
650     if ($part > 0) {
651     return $s[ $part - 1 ];
652     } else {
653     return @s;
654     }
655     }
656    
657 dpavlin 536 # END
658     1;

  ViewVC Help
Powered by ViewVC 1.1.26