/[webpac2]/trunk/lib/WebPAC/Normalize.pm
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Annotation of /trunk/lib/WebPAC/Normalize.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 550 - (hide annotations)
Fri Jun 30 18:48:33 2006 UTC (17 years, 9 months ago) by dpavlin
File size: 10735 byte(s)
 r748@llin:  dpavlin | 2006-06-30 20:48:29 +0200
 re-implement magic again (so that it actually work in all cases consistant).
 Depend on Data::Dump to enable nice output.

1 dpavlin 10 package WebPAC::Normalize;
2 dpavlin 536 use Exporter 'import';
3     @EXPORT = qw/
4 dpavlin 538 _set_rec _set_lookup
5     _get_ds _clean_ds
6    
7 dpavlin 536 tag search display
8 dpavlin 547 marc marc_indicators marc_repeatable_subfield
9 dpavlin 540
10 dpavlin 536 rec1 rec2 rec
11     regex prefix suffix surround
12     first lookup join_with
13     /;
14 dpavlin 10
15     use warnings;
16     use strict;
17 dpavlin 536
18     #use base qw/WebPAC::Common/;
19 dpavlin 550 use Data::Dump qw/dump/;
20 dpavlin 541 use Encode qw/from_to/;
21 dpavlin 10
22 dpavlin 550 # debugging warn(s)
23     my $debug = 0;
24    
25    
26 dpavlin 10 =head1 NAME
27    
28 dpavlin 536 WebPAC::Normalize - describe normalisaton rules using sets
29 dpavlin 10
30     =head1 VERSION
31    
32 dpavlin 541 Version 0.06
33 dpavlin 10
34     =cut
35    
36 dpavlin 541 our $VERSION = '0.06';
37 dpavlin 10
38     =head1 SYNOPSIS
39    
40 dpavlin 536 This module uses C<conf/normalize/*.pl> files to perform normalisation
41     from input records using perl functions which are specialized for set
42     processing.
43 dpavlin 10
44 dpavlin 536 Sets are implemented as arrays, and normalisation file is valid perl, which
45     means that you check it's validity before running WebPAC using
46     C<perl -c normalize.pl>.
47 dpavlin 15
48 dpavlin 536 Normalisation can generate multiple output normalized data. For now, supported output
49 dpavlin 540 types (on the left side of definition) are: C<tag>, C<display>, C<search> and
50 dpavlin 547 C<marc>.
51 dpavlin 15
52 dpavlin 10 =head1 FUNCTIONS
53    
54 dpavlin 538 Functions which start with C<_> are private and used by WebPAC internally.
55     All other functions are available for use within normalisation rules.
56    
57 dpavlin 536 =head2 data_structure
58 dpavlin 10
59 dpavlin 536 Return data structure
60 dpavlin 13
61 dpavlin 538 my $ds = WebPAC::Normalize::data_structure(
62 dpavlin 536 lookup => $lookup->lookup_hash,
63     row => $row,
64     rules => $normalize_pl_config,
65 dpavlin 541 marc_encoding => 'utf-8',
66 dpavlin 13 );
67    
68 dpavlin 540 Options C<lookup>, C<row>, C<rules> and C<log> are mandatory while all
69     other are optional.
70    
71 dpavlin 536 This function will B<die> if normalizastion can't be evaled.
72 dpavlin 15
73 dpavlin 538 Since this function isn't exported you have to call it with
74     C<WebPAC::Normalize::data_structure>.
75    
76 dpavlin 10 =cut
77    
78 dpavlin 536 sub data_structure {
79     my $arg = {@_};
80 dpavlin 13
81 dpavlin 536 die "need row argument" unless ($arg->{row});
82     die "need normalisation argument" unless ($arg->{rules});
83 dpavlin 31
84 dpavlin 536 no strict 'subs';
85 dpavlin 538 _set_lookup( $arg->{lookup} );
86     _set_rec( $arg->{row} );
87 dpavlin 541 _clean_ds( %{ $arg } );
88 dpavlin 536 eval "$arg->{rules}";
89     die "error evaling $arg->{rules}: $@\n" if ($@);
90 dpavlin 540
91 dpavlin 538 return _get_ds();
92 dpavlin 10 }
93    
94 dpavlin 538 =head2 _set_rec
95 dpavlin 13
96 dpavlin 536 Set current record hash
97 dpavlin 433
98 dpavlin 538 _set_rec( $rec );
99 dpavlin 433
100     =cut
101    
102 dpavlin 536 my $rec;
103 dpavlin 433
104 dpavlin 538 sub _set_rec {
105 dpavlin 536 $rec = shift or die "no record hash";
106 dpavlin 433 }
107    
108 dpavlin 538 =head2 _get_ds
109    
110     Return hash formatted as data structure
111    
112     my $ds = _get_ds();
113    
114     =cut
115    
116 dpavlin 547 my ($out,$marc_record, $marc_encoding, $marc_repeatable_subfield, $marc_indicators);
117 dpavlin 538
118     sub _get_ds {
119     return $out;
120     }
121    
122     =head2 _clean_ds
123    
124     Clean data structure hash for next record
125    
126     _clean_ds();
127    
128     =cut
129    
130     sub _clean_ds {
131 dpavlin 541 my $a = {@_};
132 dpavlin 550 ($out,$marc_record, $marc_encoding, $marc_repeatable_subfield, $marc_indicators) = ();
133 dpavlin 541 $marc_encoding = $a->{marc_encoding};
134 dpavlin 538 }
135    
136     =head2 _set_lookup
137    
138     Set current lookup hash
139    
140     _set_lookup( $lookup );
141    
142     =cut
143    
144     my $lookup;
145    
146     sub _set_lookup {
147     $lookup = shift;
148     }
149    
150 dpavlin 547 =head2 _get_marc_fields
151 dpavlin 540
152 dpavlin 547 Get all fields defined by calls to C<marc>
153 dpavlin 540
154 dpavlin 547 $marc->add_fields( WebPAC::Normalize:_get_marc_fields() );
155 dpavlin 540
156 dpavlin 542
157 dpavlin 543
158     We are using I<magic> which detect repeatable fields only from
159     sequence of field/subfield data generated by normalization.
160    
161     Repeatable field is created if there is second occurence of same subfield or
162     if any of indicators are different. This is sane for most cases except for
163     non-repeatable fields with repeatable subfields.
164    
165 dpavlin 547 You can change behaviour of that using C<marc_repeatable_subfield>.
166 dpavlin 543
167 dpavlin 540 =cut
168    
169 dpavlin 547 sub _get_marc_fields {
170 dpavlin 550
171    
172     # first, sort all existing fields
173     # XXX might not be needed, but modern perl might randomize elements in hash
174     my @sorted_marc_record = sort {
175     $a->[0] . $a->[3] cmp $b->[0] . $b->[3]
176     } @{ $marc_record };
177    
178     # output marc fields
179 dpavlin 542 my @m;
180 dpavlin 550
181     # count unique field-subfields (used for offset when walking to next subfield)
182     my $u;
183     map { $u->{ $_->[0] . $_->[3] }++ } @sorted_marc_record;
184    
185     if ($debug) {
186     warn "## marc_repeatable_subfield ", dump( $marc_repeatable_subfield ), $/;
187     warn "## marc_record ", dump( $marc_record ), $/;
188     warn "## sorted_marc_record ", dump( \@sorted_marc_record ), $/;
189     warn "## subfield count ", dump( $u ), $/;
190     }
191    
192     my $len = $#sorted_marc_record;
193     my $visited;
194     my $i = 0;
195     my $field;
196    
197     foreach ( 0 .. $len ) {
198    
199     # find next element which isn't visited
200     while ($visited->{$i}) {
201     $i = ($i + 1) % ($len + 1);
202 dpavlin 542 }
203    
204 dpavlin 550 # mark it visited
205     $visited->{$i}++;
206    
207     my $row = $sorted_marc_record[$i];
208    
209     # field and subfield which is key for
210     # marc_repeatable_subfield and u
211     my $fsf = $row->[0] . $row->[3];
212    
213     if ($debug > 1) {
214    
215     print "### field so far [", $#$field, "] : ", dump( $field ), " ", $field ? 'T' : 'F', $/;
216     print "### this [$i]: ", dump( $row ),$/;
217     print "### sf: ", $row->[3], " vs ", $field->[3],
218     $marc_repeatable_subfield->{ $row->[0] . $row->[3] } ? ' (repeatable)' : '', $/,
219     if ($#$field >= 0);
220    
221     }
222    
223     # if field exists
224     if ( $#$field >= 0 ) {
225     if (
226     $row->[0] ne $field->[0] || # field
227     $row->[1] ne $field->[1] || # i1
228     $row->[2] ne $field->[2] # i2
229     ) {
230     push @m, $field;
231     warn "## saved/1 ", dump( $field ),$/ if ($debug);
232     $field = $row;
233    
234     } elsif (
235     ( $row->[3] lt $field->[-2] ) # subfield which is not next (e.g. a after c)
236     ||
237     ( $row->[3] eq $field->[-2] && # same subfield, but not repeatable
238     ! $marc_repeatable_subfield->{ $fsf }
239     )
240     ) {
241     push @m, $field;
242     warn "## saved/2 ", dump( $field ),$/ if ($debug);
243     $field = $row;
244    
245     } else {
246     # append new subfields to existing field
247     push @$field, ( $row->[3], $row->[4] );
248     }
249     } else {
250     # insert first field
251     $field = $row;
252     }
253    
254     if (! $marc_repeatable_subfield->{ $fsf }) {
255     # make step to next subfield
256     $i = ($i + $u->{ $fsf } ) % ($len + 1);
257     }
258 dpavlin 542 }
259    
260 dpavlin 550 if ($#$field >= 0) {
261     push @m, $field;
262     warn "## saved/3 ", dump( $field ),$/ if ($debug);
263     }
264 dpavlin 542
265     return @m;
266 dpavlin 540 }
267    
268     =head1 Functions to create C<data_structure>
269    
270     Those functions generally have to first in your normalization file.
271    
272 dpavlin 536 =head2 tag
273 dpavlin 433
274 dpavlin 536 Define new tag for I<search> and I<display>.
275 dpavlin 433
276 dpavlin 536 tag('Title', rec('200','a') );
277 dpavlin 13
278    
279     =cut
280    
281 dpavlin 536 sub tag {
282     my $name = shift or die "tag needs name as first argument";
283     my @o = grep { defined($_) && $_ ne '' } @_;
284     return unless (@o);
285     $out->{$name}->{tag} = $name;
286     $out->{$name}->{search} = \@o;
287     $out->{$name}->{display} = \@o;
288     }
289 dpavlin 13
290 dpavlin 536 =head2 display
291 dpavlin 13
292 dpavlin 536 Define tag just for I<display>
293 dpavlin 125
294 dpavlin 536 @v = display('Title', rec('200','a') );
295 dpavlin 125
296 dpavlin 536 =cut
297 dpavlin 125
298 dpavlin 536 sub display {
299     my $name = shift or die "display needs name as first argument";
300     my @o = grep { defined($_) && $_ ne '' } @_;
301     return unless (@o);
302     $out->{$name}->{tag} = $name;
303     $out->{$name}->{display} = \@o;
304     }
305 dpavlin 13
306 dpavlin 536 =head2 search
307 dpavlin 13
308 dpavlin 536 Prepare values just for I<search>
309 dpavlin 13
310 dpavlin 536 @v = search('Title', rec('200','a') );
311 dpavlin 433
312 dpavlin 536 =cut
313 dpavlin 13
314 dpavlin 536 sub search {
315     my $name = shift or die "search needs name as first argument";
316     my @o = grep { defined($_) && $_ ne '' } @_;
317     return unless (@o);
318     $out->{$name}->{tag} = $name;
319     $out->{$name}->{search} = \@o;
320 dpavlin 13 }
321    
322 dpavlin 547 =head2 marc
323 dpavlin 540
324     Save value for MARC field
325    
326 dpavlin 547 marc('900','a', rec('200','a') );
327 dpavlin 540
328     =cut
329    
330 dpavlin 547 sub marc {
331     my $f = shift or die "marc needs field";
332     die "marc field must be numer" unless ($f =~ /^\d+$/);
333 dpavlin 540
334 dpavlin 547 my $sf = shift or die "marc needs subfield";
335 dpavlin 540
336 dpavlin 541 foreach (@_) {
337     my $v = $_; # make var read-write for Encode
338 dpavlin 543 next unless (defined($v) && $v !~ /^\s*$/);
339 dpavlin 541 from_to($v, 'iso-8859-2', $marc_encoding) if ($marc_encoding);
340 dpavlin 548 my ($i1,$i2) = defined($marc_indicators->{$f}) ? @{ $marc_indicators->{$f} } : (' ',' ');
341     push @{ $marc_record }, [ $f, $i1, $i2, $sf => $v ];
342 dpavlin 540 }
343     }
344    
345 dpavlin 547 =head2 marc_repeatable_subfield
346    
347     Save values for MARC repetable subfield
348    
349     marc_repeatable_subfield('910', 'z', rec('909') );
350    
351     =cut
352    
353     sub marc_repeatable_subfield {
354 dpavlin 550 my ($f,$sf) = @_;
355     die "marc_repeatable_subfield need field and subfield!\n" unless ($f && $sf);
356     $marc_repeatable_subfield->{ $f . $sf }++;
357 dpavlin 547 marc(@_);
358     }
359    
360     =head2 marc_indicators
361    
362     Set both indicators for MARC field
363    
364     marc_indicators('900', ' ', 1);
365    
366     Any indicator value other than C<0-9> will be treated as undefined.
367    
368     =cut
369    
370     sub marc_indicators {
371     my $f = shift || die "marc_indicators need field!\n";
372     my ($i1,$i2) = @_;
373     die "marc_indicators($f, ...) need i1!\n" unless(defined($i1));
374     die "marc_indicators($f, $i1, ...) need i2!\n" unless(defined($i2));
375    
376     $i1 = ' ' if ($i1 !~ /^\d$/);
377     $i2 = ' ' if ($i2 !~ /^\d$/);
378 dpavlin 548 @{ $marc_indicators->{$f} } = ($i1,$i2);
379 dpavlin 547 }
380    
381    
382 dpavlin 540 =head1 Functions to extract data from input
383    
384     This function should be used inside functions to create C<data_structure> described
385     above.
386    
387 dpavlin 536 =head2 rec1
388 dpavlin 371
389 dpavlin 536 Return all values in some field
390 dpavlin 371
391 dpavlin 536 @v = rec1('200')
392 dpavlin 15
393 dpavlin 536 TODO: order of values is probably same as in source data, need to investigate that
394 dpavlin 15
395 dpavlin 536 =cut
396 dpavlin 15
397 dpavlin 536 sub rec1 {
398     my $f = shift;
399     return unless (defined($rec) && defined($rec->{$f}));
400     if (ref($rec->{$f}) eq 'ARRAY') {
401     return map {
402     if (ref($_) eq 'HASH') {
403     values %{$_};
404 dpavlin 31 } else {
405 dpavlin 536 $_;
406 dpavlin 31 }
407 dpavlin 536 } @{ $rec->{$f} };
408     } elsif( defined($rec->{$f}) ) {
409     return $rec->{$f};
410 dpavlin 15 }
411     }
412    
413 dpavlin 536 =head2 rec2
414 dpavlin 15
415 dpavlin 536 Return all values in specific field and subfield
416 dpavlin 13
417 dpavlin 536 @v = rec2('200','a')
418 dpavlin 13
419     =cut
420    
421 dpavlin 536 sub rec2 {
422     my $f = shift;
423     return unless (defined($rec && $rec->{$f}));
424     my $sf = shift;
425     return map { $_->{$sf} } grep { ref($_) eq 'HASH' && $_->{$sf} } @{ $rec->{$f} };
426     }
427 dpavlin 13
428 dpavlin 536 =head2 rec
429 dpavlin 13
430 dpavlin 536 syntaxtic sugar for
431 dpavlin 13
432 dpavlin 536 @v = rec('200')
433     @v = rec('200','a')
434 dpavlin 13
435 dpavlin 536 =cut
436 dpavlin 373
437 dpavlin 536 sub rec {
438     if ($#_ == 0) {
439     return rec1(@_);
440     } elsif ($#_ == 1) {
441     return rec2(@_);
442 dpavlin 13 }
443     }
444    
445 dpavlin 536 =head2 regex
446 dpavlin 15
447 dpavlin 536 Apply regex to some or all values
448 dpavlin 15
449 dpavlin 536 @v = regex( 's/foo/bar/g', @v );
450 dpavlin 15
451     =cut
452    
453 dpavlin 536 sub regex {
454     my $r = shift;
455     my @out;
456 dpavlin 550 #warn "r: $r\n", dump(\@_);
457 dpavlin 536 foreach my $t (@_) {
458     next unless ($t);
459     eval "\$t =~ $r";
460     push @out, $t if ($t && $t ne '');
461 dpavlin 368 }
462 dpavlin 536 return @out;
463 dpavlin 15 }
464    
465 dpavlin 536 =head2 prefix
466 dpavlin 15
467 dpavlin 536 Prefix all values with a string
468 dpavlin 15
469 dpavlin 536 @v = prefix( 'my_', @v );
470 dpavlin 15
471     =cut
472    
473 dpavlin 536 sub prefix {
474     my $p = shift or die "prefix needs string as first argument";
475     return map { $p . $_ } grep { defined($_) } @_;
476     }
477 dpavlin 15
478 dpavlin 536 =head2 suffix
479 dpavlin 15
480 dpavlin 536 suffix all values with a string
481 dpavlin 15
482 dpavlin 536 @v = suffix( '_my', @v );
483 dpavlin 15
484 dpavlin 536 =cut
485 dpavlin 15
486 dpavlin 536 sub suffix {
487     my $s = shift or die "suffix needs string as first argument";
488     return map { $_ . $s } grep { defined($_) } @_;
489 dpavlin 15 }
490    
491 dpavlin 536 =head2 surround
492 dpavlin 13
493 dpavlin 536 surround all values with a two strings
494 dpavlin 13
495 dpavlin 536 @v = surround( 'prefix_', '_suffix', @v );
496 dpavlin 13
497     =cut
498    
499 dpavlin 536 sub surround {
500     my $p = shift or die "surround need prefix as first argument";
501     my $s = shift or die "surround needs suffix as second argument";
502     return map { $p . $_ . $s } grep { defined($_) } @_;
503 dpavlin 13 }
504    
505 dpavlin 536 =head2 first
506 dpavlin 13
507 dpavlin 536 Return first element
508 dpavlin 15
509 dpavlin 536 $v = first( @v );
510 dpavlin 13
511     =cut
512    
513 dpavlin 536 sub first {
514     my $r = shift;
515     return $r;
516 dpavlin 13 }
517    
518 dpavlin 536 =head2 lookup
519 dpavlin 13
520 dpavlin 536 Consult lookup hashes for some value
521 dpavlin 13
522 dpavlin 536 @v = lookup( $v );
523     @v = lookup( @v );
524 dpavlin 13
525     =cut
526    
527 dpavlin 536 sub lookup {
528     my $k = shift or return;
529     return unless (defined($lookup->{$k}));
530     if (ref($lookup->{$k}) eq 'ARRAY') {
531     return @{ $lookup->{$k} };
532     } else {
533     return $lookup->{$k};
534     }
535 dpavlin 13 }
536    
537 dpavlin 536 =head2 join_with
538 dpavlin 13
539 dpavlin 536 Joins walues with some delimiter
540 dpavlin 10
541 dpavlin 536 $v = join_with(", ", @v);
542 dpavlin 10
543 dpavlin 536 =cut
544 dpavlin 10
545 dpavlin 536 sub join_with {
546     my $d = shift;
547     return join($d, grep { defined($_) && $_ ne '' } @_);
548     }
549 dpavlin 10
550 dpavlin 536 # END
551     1;

  ViewVC Help
Powered by ViewVC 1.1.26