/[webpac2]/trunk/lib/WebPAC/Normalize.pm
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Annotation of /trunk/lib/WebPAC/Normalize.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 554 - (hide annotations)
Sat Jul 1 10:19:29 2006 UTC (17 years, 9 months ago) by dpavlin
File size: 11273 byte(s)
 r756@llin:  dpavlin | 2006-07-01 12:17:24 +0200
 pod improvements, added _debug

1 dpavlin 10 package WebPAC::Normalize;
2 dpavlin 536 use Exporter 'import';
3     @EXPORT = qw/
4 dpavlin 538 _set_rec _set_lookup
5     _get_ds _clean_ds
6 dpavlin 554 _debug
7 dpavlin 538
8 dpavlin 536 tag search display
9 dpavlin 547 marc marc_indicators marc_repeatable_subfield
10 dpavlin 540
11 dpavlin 536 rec1 rec2 rec
12     regex prefix suffix surround
13     first lookup join_with
14     /;
15 dpavlin 10
16     use warnings;
17     use strict;
18 dpavlin 536
19     #use base qw/WebPAC::Common/;
20 dpavlin 550 use Data::Dump qw/dump/;
21 dpavlin 541 use Encode qw/from_to/;
22 dpavlin 10
23 dpavlin 550 # debugging warn(s)
24     my $debug = 0;
25    
26    
27 dpavlin 10 =head1 NAME
28    
29 dpavlin 536 WebPAC::Normalize - describe normalisaton rules using sets
30 dpavlin 10
31     =head1 VERSION
32    
33 dpavlin 554 Version 0.07
34 dpavlin 10
35     =cut
36    
37 dpavlin 554 our $VERSION = '0.07';
38 dpavlin 10
39     =head1 SYNOPSIS
40    
41 dpavlin 536 This module uses C<conf/normalize/*.pl> files to perform normalisation
42     from input records using perl functions which are specialized for set
43     processing.
44 dpavlin 10
45 dpavlin 536 Sets are implemented as arrays, and normalisation file is valid perl, which
46     means that you check it's validity before running WebPAC using
47     C<perl -c normalize.pl>.
48 dpavlin 15
49 dpavlin 536 Normalisation can generate multiple output normalized data. For now, supported output
50 dpavlin 540 types (on the left side of definition) are: C<tag>, C<display>, C<search> and
51 dpavlin 547 C<marc>.
52 dpavlin 15
53 dpavlin 10 =head1 FUNCTIONS
54    
55 dpavlin 538 Functions which start with C<_> are private and used by WebPAC internally.
56     All other functions are available for use within normalisation rules.
57    
58 dpavlin 536 =head2 data_structure
59 dpavlin 10
60 dpavlin 536 Return data structure
61 dpavlin 13
62 dpavlin 538 my $ds = WebPAC::Normalize::data_structure(
63 dpavlin 536 lookup => $lookup->lookup_hash,
64     row => $row,
65     rules => $normalize_pl_config,
66 dpavlin 541 marc_encoding => 'utf-8',
67 dpavlin 13 );
68    
69 dpavlin 540 Options C<lookup>, C<row>, C<rules> and C<log> are mandatory while all
70     other are optional.
71    
72 dpavlin 536 This function will B<die> if normalizastion can't be evaled.
73 dpavlin 15
74 dpavlin 538 Since this function isn't exported you have to call it with
75     C<WebPAC::Normalize::data_structure>.
76    
77 dpavlin 10 =cut
78    
79 dpavlin 536 sub data_structure {
80     my $arg = {@_};
81 dpavlin 13
82 dpavlin 536 die "need row argument" unless ($arg->{row});
83     die "need normalisation argument" unless ($arg->{rules});
84 dpavlin 31
85 dpavlin 536 no strict 'subs';
86 dpavlin 538 _set_lookup( $arg->{lookup} );
87     _set_rec( $arg->{row} );
88 dpavlin 541 _clean_ds( %{ $arg } );
89 dpavlin 536 eval "$arg->{rules}";
90     die "error evaling $arg->{rules}: $@\n" if ($@);
91 dpavlin 540
92 dpavlin 538 return _get_ds();
93 dpavlin 10 }
94    
95 dpavlin 538 =head2 _set_rec
96 dpavlin 13
97 dpavlin 536 Set current record hash
98 dpavlin 433
99 dpavlin 538 _set_rec( $rec );
100 dpavlin 433
101     =cut
102    
103 dpavlin 536 my $rec;
104 dpavlin 433
105 dpavlin 538 sub _set_rec {
106 dpavlin 536 $rec = shift or die "no record hash";
107 dpavlin 433 }
108    
109 dpavlin 538 =head2 _get_ds
110    
111     Return hash formatted as data structure
112    
113     my $ds = _get_ds();
114    
115     =cut
116    
117 dpavlin 547 my ($out,$marc_record, $marc_encoding, $marc_repeatable_subfield, $marc_indicators);
118 dpavlin 538
119     sub _get_ds {
120     return $out;
121     }
122    
123     =head2 _clean_ds
124    
125     Clean data structure hash for next record
126    
127     _clean_ds();
128    
129     =cut
130    
131     sub _clean_ds {
132 dpavlin 541 my $a = {@_};
133 dpavlin 550 ($out,$marc_record, $marc_encoding, $marc_repeatable_subfield, $marc_indicators) = ();
134 dpavlin 541 $marc_encoding = $a->{marc_encoding};
135 dpavlin 538 }
136    
137     =head2 _set_lookup
138    
139     Set current lookup hash
140    
141     _set_lookup( $lookup );
142    
143     =cut
144    
145     my $lookup;
146    
147     sub _set_lookup {
148     $lookup = shift;
149     }
150    
151 dpavlin 547 =head2 _get_marc_fields
152 dpavlin 540
153 dpavlin 547 Get all fields defined by calls to C<marc>
154 dpavlin 540
155 dpavlin 547 $marc->add_fields( WebPAC::Normalize:_get_marc_fields() );
156 dpavlin 540
157 dpavlin 543 We are using I<magic> which detect repeatable fields only from
158     sequence of field/subfield data generated by normalization.
159    
160 dpavlin 554 Repeatable field is created when there is second occurence of same subfield or
161     if any of indicators are different.
162 dpavlin 543
163 dpavlin 554 This is sane for most cases. Something like:
164 dpavlin 543
165 dpavlin 554 900a-1 900b-1 900c-1
166     900a-2 900b-2
167     900a-3
168    
169     will be created from any combination of:
170    
171     900a-1 900a-2 900a-3 900b-1 900b-2 900c-1
172    
173     and following rules:
174    
175     marc('900','a', rec('200','a') );
176     marc('900','b', rec('200','b') );
177     marc('900','c', rec('200','c') );
178    
179     which might not be what you have in mind. If you need repeatable subfield,
180     define it using C<marc_repeatable_subfield> like this:
181    
182     ....
183    
184 dpavlin 540 =cut
185    
186 dpavlin 547 sub _get_marc_fields {
187 dpavlin 550
188 dpavlin 551 return if (! $marc_record || ref($marc_record) ne 'ARRAY' || $#{ $marc_record } < 0);
189 dpavlin 550
190     # first, sort all existing fields
191     # XXX might not be needed, but modern perl might randomize elements in hash
192     my @sorted_marc_record = sort {
193     $a->[0] . $a->[3] cmp $b->[0] . $b->[3]
194     } @{ $marc_record };
195    
196     # output marc fields
197 dpavlin 542 my @m;
198 dpavlin 550
199     # count unique field-subfields (used for offset when walking to next subfield)
200     my $u;
201     map { $u->{ $_->[0] . $_->[3] }++ } @sorted_marc_record;
202    
203     if ($debug) {
204     warn "## marc_repeatable_subfield ", dump( $marc_repeatable_subfield ), $/;
205     warn "## marc_record ", dump( $marc_record ), $/;
206     warn "## sorted_marc_record ", dump( \@sorted_marc_record ), $/;
207     warn "## subfield count ", dump( $u ), $/;
208     }
209    
210     my $len = $#sorted_marc_record;
211     my $visited;
212     my $i = 0;
213     my $field;
214    
215     foreach ( 0 .. $len ) {
216    
217     # find next element which isn't visited
218     while ($visited->{$i}) {
219     $i = ($i + 1) % ($len + 1);
220 dpavlin 542 }
221    
222 dpavlin 550 # mark it visited
223     $visited->{$i}++;
224    
225     my $row = $sorted_marc_record[$i];
226    
227     # field and subfield which is key for
228     # marc_repeatable_subfield and u
229     my $fsf = $row->[0] . $row->[3];
230    
231     if ($debug > 1) {
232    
233     print "### field so far [", $#$field, "] : ", dump( $field ), " ", $field ? 'T' : 'F', $/;
234     print "### this [$i]: ", dump( $row ),$/;
235     print "### sf: ", $row->[3], " vs ", $field->[3],
236     $marc_repeatable_subfield->{ $row->[0] . $row->[3] } ? ' (repeatable)' : '', $/,
237     if ($#$field >= 0);
238    
239     }
240    
241     # if field exists
242     if ( $#$field >= 0 ) {
243     if (
244     $row->[0] ne $field->[0] || # field
245     $row->[1] ne $field->[1] || # i1
246     $row->[2] ne $field->[2] # i2
247     ) {
248     push @m, $field;
249     warn "## saved/1 ", dump( $field ),$/ if ($debug);
250     $field = $row;
251    
252     } elsif (
253     ( $row->[3] lt $field->[-2] ) # subfield which is not next (e.g. a after c)
254     ||
255     ( $row->[3] eq $field->[-2] && # same subfield, but not repeatable
256     ! $marc_repeatable_subfield->{ $fsf }
257     )
258     ) {
259     push @m, $field;
260     warn "## saved/2 ", dump( $field ),$/ if ($debug);
261     $field = $row;
262    
263     } else {
264     # append new subfields to existing field
265     push @$field, ( $row->[3], $row->[4] );
266     }
267     } else {
268     # insert first field
269     $field = $row;
270     }
271    
272     if (! $marc_repeatable_subfield->{ $fsf }) {
273     # make step to next subfield
274     $i = ($i + $u->{ $fsf } ) % ($len + 1);
275     }
276 dpavlin 542 }
277    
278 dpavlin 550 if ($#$field >= 0) {
279     push @m, $field;
280     warn "## saved/3 ", dump( $field ),$/ if ($debug);
281     }
282 dpavlin 542
283     return @m;
284 dpavlin 540 }
285    
286 dpavlin 554 =head2 _debug
287    
288     Change level of debug warnings
289    
290     _debug( 2 );
291    
292     =cut
293    
294     sub _debug {
295     my $l = shift;
296     return $debug unless defined($l);
297     $debug = $l;
298     }
299    
300 dpavlin 540 =head1 Functions to create C<data_structure>
301    
302     Those functions generally have to first in your normalization file.
303    
304 dpavlin 536 =head2 tag
305 dpavlin 433
306 dpavlin 536 Define new tag for I<search> and I<display>.
307 dpavlin 433
308 dpavlin 536 tag('Title', rec('200','a') );
309 dpavlin 13
310    
311     =cut
312    
313 dpavlin 536 sub tag {
314     my $name = shift or die "tag needs name as first argument";
315     my @o = grep { defined($_) && $_ ne '' } @_;
316     return unless (@o);
317     $out->{$name}->{tag} = $name;
318     $out->{$name}->{search} = \@o;
319     $out->{$name}->{display} = \@o;
320     }
321 dpavlin 13
322 dpavlin 536 =head2 display
323 dpavlin 13
324 dpavlin 536 Define tag just for I<display>
325 dpavlin 125
326 dpavlin 536 @v = display('Title', rec('200','a') );
327 dpavlin 125
328 dpavlin 536 =cut
329 dpavlin 125
330 dpavlin 536 sub display {
331     my $name = shift or die "display needs name as first argument";
332     my @o = grep { defined($_) && $_ ne '' } @_;
333     return unless (@o);
334     $out->{$name}->{tag} = $name;
335     $out->{$name}->{display} = \@o;
336     }
337 dpavlin 13
338 dpavlin 536 =head2 search
339 dpavlin 13
340 dpavlin 536 Prepare values just for I<search>
341 dpavlin 13
342 dpavlin 536 @v = search('Title', rec('200','a') );
343 dpavlin 433
344 dpavlin 536 =cut
345 dpavlin 13
346 dpavlin 536 sub search {
347     my $name = shift or die "search needs name as first argument";
348     my @o = grep { defined($_) && $_ ne '' } @_;
349     return unless (@o);
350     $out->{$name}->{tag} = $name;
351     $out->{$name}->{search} = \@o;
352 dpavlin 13 }
353    
354 dpavlin 547 =head2 marc
355 dpavlin 540
356     Save value for MARC field
357    
358 dpavlin 547 marc('900','a', rec('200','a') );
359 dpavlin 540
360     =cut
361    
362 dpavlin 547 sub marc {
363     my $f = shift or die "marc needs field";
364     die "marc field must be numer" unless ($f =~ /^\d+$/);
365 dpavlin 540
366 dpavlin 547 my $sf = shift or die "marc needs subfield";
367 dpavlin 540
368 dpavlin 541 foreach (@_) {
369     my $v = $_; # make var read-write for Encode
370 dpavlin 543 next unless (defined($v) && $v !~ /^\s*$/);
371 dpavlin 541 from_to($v, 'iso-8859-2', $marc_encoding) if ($marc_encoding);
372 dpavlin 548 my ($i1,$i2) = defined($marc_indicators->{$f}) ? @{ $marc_indicators->{$f} } : (' ',' ');
373     push @{ $marc_record }, [ $f, $i1, $i2, $sf => $v ];
374 dpavlin 540 }
375     }
376    
377 dpavlin 547 =head2 marc_repeatable_subfield
378    
379     Save values for MARC repetable subfield
380    
381     marc_repeatable_subfield('910', 'z', rec('909') );
382    
383     =cut
384    
385     sub marc_repeatable_subfield {
386 dpavlin 550 my ($f,$sf) = @_;
387     die "marc_repeatable_subfield need field and subfield!\n" unless ($f && $sf);
388     $marc_repeatable_subfield->{ $f . $sf }++;
389 dpavlin 547 marc(@_);
390     }
391    
392     =head2 marc_indicators
393    
394     Set both indicators for MARC field
395    
396     marc_indicators('900', ' ', 1);
397    
398     Any indicator value other than C<0-9> will be treated as undefined.
399    
400     =cut
401    
402     sub marc_indicators {
403     my $f = shift || die "marc_indicators need field!\n";
404     my ($i1,$i2) = @_;
405     die "marc_indicators($f, ...) need i1!\n" unless(defined($i1));
406     die "marc_indicators($f, $i1, ...) need i2!\n" unless(defined($i2));
407    
408     $i1 = ' ' if ($i1 !~ /^\d$/);
409     $i2 = ' ' if ($i2 !~ /^\d$/);
410 dpavlin 548 @{ $marc_indicators->{$f} } = ($i1,$i2);
411 dpavlin 547 }
412    
413    
414 dpavlin 540 =head1 Functions to extract data from input
415    
416     This function should be used inside functions to create C<data_structure> described
417     above.
418    
419 dpavlin 536 =head2 rec1
420 dpavlin 371
421 dpavlin 536 Return all values in some field
422 dpavlin 371
423 dpavlin 536 @v = rec1('200')
424 dpavlin 15
425 dpavlin 536 TODO: order of values is probably same as in source data, need to investigate that
426 dpavlin 15
427 dpavlin 536 =cut
428 dpavlin 15
429 dpavlin 536 sub rec1 {
430     my $f = shift;
431     return unless (defined($rec) && defined($rec->{$f}));
432     if (ref($rec->{$f}) eq 'ARRAY') {
433     return map {
434     if (ref($_) eq 'HASH') {
435     values %{$_};
436 dpavlin 31 } else {
437 dpavlin 536 $_;
438 dpavlin 31 }
439 dpavlin 536 } @{ $rec->{$f} };
440     } elsif( defined($rec->{$f}) ) {
441     return $rec->{$f};
442 dpavlin 15 }
443     }
444    
445 dpavlin 536 =head2 rec2
446 dpavlin 15
447 dpavlin 536 Return all values in specific field and subfield
448 dpavlin 13
449 dpavlin 536 @v = rec2('200','a')
450 dpavlin 13
451     =cut
452    
453 dpavlin 536 sub rec2 {
454     my $f = shift;
455     return unless (defined($rec && $rec->{$f}));
456     my $sf = shift;
457     return map { $_->{$sf} } grep { ref($_) eq 'HASH' && $_->{$sf} } @{ $rec->{$f} };
458     }
459 dpavlin 13
460 dpavlin 536 =head2 rec
461 dpavlin 13
462 dpavlin 536 syntaxtic sugar for
463 dpavlin 13
464 dpavlin 536 @v = rec('200')
465     @v = rec('200','a')
466 dpavlin 13
467 dpavlin 536 =cut
468 dpavlin 373
469 dpavlin 536 sub rec {
470     if ($#_ == 0) {
471     return rec1(@_);
472     } elsif ($#_ == 1) {
473     return rec2(@_);
474 dpavlin 13 }
475     }
476    
477 dpavlin 536 =head2 regex
478 dpavlin 15
479 dpavlin 536 Apply regex to some or all values
480 dpavlin 15
481 dpavlin 536 @v = regex( 's/foo/bar/g', @v );
482 dpavlin 15
483     =cut
484    
485 dpavlin 536 sub regex {
486     my $r = shift;
487     my @out;
488 dpavlin 550 #warn "r: $r\n", dump(\@_);
489 dpavlin 536 foreach my $t (@_) {
490     next unless ($t);
491     eval "\$t =~ $r";
492     push @out, $t if ($t && $t ne '');
493 dpavlin 368 }
494 dpavlin 536 return @out;
495 dpavlin 15 }
496    
497 dpavlin 536 =head2 prefix
498 dpavlin 15
499 dpavlin 536 Prefix all values with a string
500 dpavlin 15
501 dpavlin 536 @v = prefix( 'my_', @v );
502 dpavlin 15
503     =cut
504    
505 dpavlin 536 sub prefix {
506     my $p = shift or die "prefix needs string as first argument";
507     return map { $p . $_ } grep { defined($_) } @_;
508     }
509 dpavlin 15
510 dpavlin 536 =head2 suffix
511 dpavlin 15
512 dpavlin 536 suffix all values with a string
513 dpavlin 15
514 dpavlin 536 @v = suffix( '_my', @v );
515 dpavlin 15
516 dpavlin 536 =cut
517 dpavlin 15
518 dpavlin 536 sub suffix {
519     my $s = shift or die "suffix needs string as first argument";
520     return map { $_ . $s } grep { defined($_) } @_;
521 dpavlin 15 }
522    
523 dpavlin 536 =head2 surround
524 dpavlin 13
525 dpavlin 536 surround all values with a two strings
526 dpavlin 13
527 dpavlin 536 @v = surround( 'prefix_', '_suffix', @v );
528 dpavlin 13
529     =cut
530    
531 dpavlin 536 sub surround {
532     my $p = shift or die "surround need prefix as first argument";
533     my $s = shift or die "surround needs suffix as second argument";
534     return map { $p . $_ . $s } grep { defined($_) } @_;
535 dpavlin 13 }
536    
537 dpavlin 536 =head2 first
538 dpavlin 13
539 dpavlin 536 Return first element
540 dpavlin 15
541 dpavlin 536 $v = first( @v );
542 dpavlin 13
543     =cut
544    
545 dpavlin 536 sub first {
546     my $r = shift;
547     return $r;
548 dpavlin 13 }
549    
550 dpavlin 536 =head2 lookup
551 dpavlin 13
552 dpavlin 536 Consult lookup hashes for some value
553 dpavlin 13
554 dpavlin 536 @v = lookup( $v );
555     @v = lookup( @v );
556 dpavlin 13
557     =cut
558    
559 dpavlin 536 sub lookup {
560     my $k = shift or return;
561     return unless (defined($lookup->{$k}));
562     if (ref($lookup->{$k}) eq 'ARRAY') {
563     return @{ $lookup->{$k} };
564     } else {
565     return $lookup->{$k};
566     }
567 dpavlin 13 }
568    
569 dpavlin 536 =head2 join_with
570 dpavlin 13
571 dpavlin 536 Joins walues with some delimiter
572 dpavlin 10
573 dpavlin 536 $v = join_with(", ", @v);
574 dpavlin 10
575 dpavlin 536 =cut
576 dpavlin 10
577 dpavlin 536 sub join_with {
578     my $d = shift;
579     return join($d, grep { defined($_) && $_ ne '' } @_);
580     }
581 dpavlin 10
582 dpavlin 536 # END
583     1;

  ViewVC Help
Powered by ViewVC 1.1.26