/[webpac2]/trunk/lib/WebPAC/Normalize.pm
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Annotation of /trunk/lib/WebPAC/Normalize.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 551 - (hide annotations)
Fri Jun 30 20:43:09 2006 UTC (17 years, 10 months ago) by dpavlin
File size: 10822 byte(s)
 r750@llin:  dpavlin | 2006-06-30 22:34:44 +0200
 check if marc_record has values

1 dpavlin 10 package WebPAC::Normalize;
2 dpavlin 536 use Exporter 'import';
3     @EXPORT = qw/
4 dpavlin 538 _set_rec _set_lookup
5     _get_ds _clean_ds
6    
7 dpavlin 536 tag search display
8 dpavlin 547 marc marc_indicators marc_repeatable_subfield
9 dpavlin 540
10 dpavlin 536 rec1 rec2 rec
11     regex prefix suffix surround
12     first lookup join_with
13     /;
14 dpavlin 10
15     use warnings;
16     use strict;
17 dpavlin 536
18     #use base qw/WebPAC::Common/;
19 dpavlin 550 use Data::Dump qw/dump/;
20 dpavlin 541 use Encode qw/from_to/;
21 dpavlin 10
22 dpavlin 550 # debugging warn(s)
23     my $debug = 0;
24    
25    
26 dpavlin 10 =head1 NAME
27    
28 dpavlin 536 WebPAC::Normalize - describe normalisaton rules using sets
29 dpavlin 10
30     =head1 VERSION
31    
32 dpavlin 541 Version 0.06
33 dpavlin 10
34     =cut
35    
36 dpavlin 541 our $VERSION = '0.06';
37 dpavlin 10
38     =head1 SYNOPSIS
39    
40 dpavlin 536 This module uses C<conf/normalize/*.pl> files to perform normalisation
41     from input records using perl functions which are specialized for set
42     processing.
43 dpavlin 10
44 dpavlin 536 Sets are implemented as arrays, and normalisation file is valid perl, which
45     means that you check it's validity before running WebPAC using
46     C<perl -c normalize.pl>.
47 dpavlin 15
48 dpavlin 536 Normalisation can generate multiple output normalized data. For now, supported output
49 dpavlin 540 types (on the left side of definition) are: C<tag>, C<display>, C<search> and
50 dpavlin 547 C<marc>.
51 dpavlin 15
52 dpavlin 10 =head1 FUNCTIONS
53    
54 dpavlin 538 Functions which start with C<_> are private and used by WebPAC internally.
55     All other functions are available for use within normalisation rules.
56    
57 dpavlin 536 =head2 data_structure
58 dpavlin 10
59 dpavlin 536 Return data structure
60 dpavlin 13
61 dpavlin 538 my $ds = WebPAC::Normalize::data_structure(
62 dpavlin 536 lookup => $lookup->lookup_hash,
63     row => $row,
64     rules => $normalize_pl_config,
65 dpavlin 541 marc_encoding => 'utf-8',
66 dpavlin 13 );
67    
68 dpavlin 540 Options C<lookup>, C<row>, C<rules> and C<log> are mandatory while all
69     other are optional.
70    
71 dpavlin 536 This function will B<die> if normalizastion can't be evaled.
72 dpavlin 15
73 dpavlin 538 Since this function isn't exported you have to call it with
74     C<WebPAC::Normalize::data_structure>.
75    
76 dpavlin 10 =cut
77    
78 dpavlin 536 sub data_structure {
79     my $arg = {@_};
80 dpavlin 13
81 dpavlin 536 die "need row argument" unless ($arg->{row});
82     die "need normalisation argument" unless ($arg->{rules});
83 dpavlin 31
84 dpavlin 536 no strict 'subs';
85 dpavlin 538 _set_lookup( $arg->{lookup} );
86     _set_rec( $arg->{row} );
87 dpavlin 541 _clean_ds( %{ $arg } );
88 dpavlin 536 eval "$arg->{rules}";
89     die "error evaling $arg->{rules}: $@\n" if ($@);
90 dpavlin 540
91 dpavlin 538 return _get_ds();
92 dpavlin 10 }
93    
94 dpavlin 538 =head2 _set_rec
95 dpavlin 13
96 dpavlin 536 Set current record hash
97 dpavlin 433
98 dpavlin 538 _set_rec( $rec );
99 dpavlin 433
100     =cut
101    
102 dpavlin 536 my $rec;
103 dpavlin 433
104 dpavlin 538 sub _set_rec {
105 dpavlin 536 $rec = shift or die "no record hash";
106 dpavlin 433 }
107    
108 dpavlin 538 =head2 _get_ds
109    
110     Return hash formatted as data structure
111    
112     my $ds = _get_ds();
113    
114     =cut
115    
116 dpavlin 547 my ($out,$marc_record, $marc_encoding, $marc_repeatable_subfield, $marc_indicators);
117 dpavlin 538
118     sub _get_ds {
119     return $out;
120     }
121    
122     =head2 _clean_ds
123    
124     Clean data structure hash for next record
125    
126     _clean_ds();
127    
128     =cut
129    
130     sub _clean_ds {
131 dpavlin 541 my $a = {@_};
132 dpavlin 550 ($out,$marc_record, $marc_encoding, $marc_repeatable_subfield, $marc_indicators) = ();
133 dpavlin 541 $marc_encoding = $a->{marc_encoding};
134 dpavlin 538 }
135    
136     =head2 _set_lookup
137    
138     Set current lookup hash
139    
140     _set_lookup( $lookup );
141    
142     =cut
143    
144     my $lookup;
145    
146     sub _set_lookup {
147     $lookup = shift;
148     }
149    
150 dpavlin 547 =head2 _get_marc_fields
151 dpavlin 540
152 dpavlin 547 Get all fields defined by calls to C<marc>
153 dpavlin 540
154 dpavlin 547 $marc->add_fields( WebPAC::Normalize:_get_marc_fields() );
155 dpavlin 540
156 dpavlin 542
157 dpavlin 543
158     We are using I<magic> which detect repeatable fields only from
159     sequence of field/subfield data generated by normalization.
160    
161     Repeatable field is created if there is second occurence of same subfield or
162     if any of indicators are different. This is sane for most cases except for
163     non-repeatable fields with repeatable subfields.
164    
165 dpavlin 547 You can change behaviour of that using C<marc_repeatable_subfield>.
166 dpavlin 543
167 dpavlin 540 =cut
168    
169 dpavlin 547 sub _get_marc_fields {
170 dpavlin 550
171 dpavlin 551 return if (! $marc_record || ref($marc_record) ne 'ARRAY' || $#{ $marc_record } < 0);
172 dpavlin 550
173     # first, sort all existing fields
174     # XXX might not be needed, but modern perl might randomize elements in hash
175     my @sorted_marc_record = sort {
176     $a->[0] . $a->[3] cmp $b->[0] . $b->[3]
177     } @{ $marc_record };
178    
179     # output marc fields
180 dpavlin 542 my @m;
181 dpavlin 550
182     # count unique field-subfields (used for offset when walking to next subfield)
183     my $u;
184     map { $u->{ $_->[0] . $_->[3] }++ } @sorted_marc_record;
185    
186     if ($debug) {
187     warn "## marc_repeatable_subfield ", dump( $marc_repeatable_subfield ), $/;
188     warn "## marc_record ", dump( $marc_record ), $/;
189     warn "## sorted_marc_record ", dump( \@sorted_marc_record ), $/;
190     warn "## subfield count ", dump( $u ), $/;
191     }
192    
193     my $len = $#sorted_marc_record;
194     my $visited;
195     my $i = 0;
196     my $field;
197    
198     foreach ( 0 .. $len ) {
199    
200     # find next element which isn't visited
201     while ($visited->{$i}) {
202     $i = ($i + 1) % ($len + 1);
203 dpavlin 542 }
204    
205 dpavlin 550 # mark it visited
206     $visited->{$i}++;
207    
208     my $row = $sorted_marc_record[$i];
209    
210     # field and subfield which is key for
211     # marc_repeatable_subfield and u
212     my $fsf = $row->[0] . $row->[3];
213    
214     if ($debug > 1) {
215    
216     print "### field so far [", $#$field, "] : ", dump( $field ), " ", $field ? 'T' : 'F', $/;
217     print "### this [$i]: ", dump( $row ),$/;
218     print "### sf: ", $row->[3], " vs ", $field->[3],
219     $marc_repeatable_subfield->{ $row->[0] . $row->[3] } ? ' (repeatable)' : '', $/,
220     if ($#$field >= 0);
221    
222     }
223    
224     # if field exists
225     if ( $#$field >= 0 ) {
226     if (
227     $row->[0] ne $field->[0] || # field
228     $row->[1] ne $field->[1] || # i1
229     $row->[2] ne $field->[2] # i2
230     ) {
231     push @m, $field;
232     warn "## saved/1 ", dump( $field ),$/ if ($debug);
233     $field = $row;
234    
235     } elsif (
236     ( $row->[3] lt $field->[-2] ) # subfield which is not next (e.g. a after c)
237     ||
238     ( $row->[3] eq $field->[-2] && # same subfield, but not repeatable
239     ! $marc_repeatable_subfield->{ $fsf }
240     )
241     ) {
242     push @m, $field;
243     warn "## saved/2 ", dump( $field ),$/ if ($debug);
244     $field = $row;
245    
246     } else {
247     # append new subfields to existing field
248     push @$field, ( $row->[3], $row->[4] );
249     }
250     } else {
251     # insert first field
252     $field = $row;
253     }
254    
255     if (! $marc_repeatable_subfield->{ $fsf }) {
256     # make step to next subfield
257     $i = ($i + $u->{ $fsf } ) % ($len + 1);
258     }
259 dpavlin 542 }
260    
261 dpavlin 550 if ($#$field >= 0) {
262     push @m, $field;
263     warn "## saved/3 ", dump( $field ),$/ if ($debug);
264     }
265 dpavlin 542
266     return @m;
267 dpavlin 540 }
268    
269     =head1 Functions to create C<data_structure>
270    
271     Those functions generally have to first in your normalization file.
272    
273 dpavlin 536 =head2 tag
274 dpavlin 433
275 dpavlin 536 Define new tag for I<search> and I<display>.
276 dpavlin 433
277 dpavlin 536 tag('Title', rec('200','a') );
278 dpavlin 13
279    
280     =cut
281    
282 dpavlin 536 sub tag {
283     my $name = shift or die "tag needs name as first argument";
284     my @o = grep { defined($_) && $_ ne '' } @_;
285     return unless (@o);
286     $out->{$name}->{tag} = $name;
287     $out->{$name}->{search} = \@o;
288     $out->{$name}->{display} = \@o;
289     }
290 dpavlin 13
291 dpavlin 536 =head2 display
292 dpavlin 13
293 dpavlin 536 Define tag just for I<display>
294 dpavlin 125
295 dpavlin 536 @v = display('Title', rec('200','a') );
296 dpavlin 125
297 dpavlin 536 =cut
298 dpavlin 125
299 dpavlin 536 sub display {
300     my $name = shift or die "display needs name as first argument";
301     my @o = grep { defined($_) && $_ ne '' } @_;
302     return unless (@o);
303     $out->{$name}->{tag} = $name;
304     $out->{$name}->{display} = \@o;
305     }
306 dpavlin 13
307 dpavlin 536 =head2 search
308 dpavlin 13
309 dpavlin 536 Prepare values just for I<search>
310 dpavlin 13
311 dpavlin 536 @v = search('Title', rec('200','a') );
312 dpavlin 433
313 dpavlin 536 =cut
314 dpavlin 13
315 dpavlin 536 sub search {
316     my $name = shift or die "search needs name as first argument";
317     my @o = grep { defined($_) && $_ ne '' } @_;
318     return unless (@o);
319     $out->{$name}->{tag} = $name;
320     $out->{$name}->{search} = \@o;
321 dpavlin 13 }
322    
323 dpavlin 547 =head2 marc
324 dpavlin 540
325     Save value for MARC field
326    
327 dpavlin 547 marc('900','a', rec('200','a') );
328 dpavlin 540
329     =cut
330    
331 dpavlin 547 sub marc {
332     my $f = shift or die "marc needs field";
333     die "marc field must be numer" unless ($f =~ /^\d+$/);
334 dpavlin 540
335 dpavlin 547 my $sf = shift or die "marc needs subfield";
336 dpavlin 540
337 dpavlin 541 foreach (@_) {
338     my $v = $_; # make var read-write for Encode
339 dpavlin 543 next unless (defined($v) && $v !~ /^\s*$/);
340 dpavlin 541 from_to($v, 'iso-8859-2', $marc_encoding) if ($marc_encoding);
341 dpavlin 548 my ($i1,$i2) = defined($marc_indicators->{$f}) ? @{ $marc_indicators->{$f} } : (' ',' ');
342     push @{ $marc_record }, [ $f, $i1, $i2, $sf => $v ];
343 dpavlin 540 }
344     }
345    
346 dpavlin 547 =head2 marc_repeatable_subfield
347    
348     Save values for MARC repetable subfield
349    
350     marc_repeatable_subfield('910', 'z', rec('909') );
351    
352     =cut
353    
354     sub marc_repeatable_subfield {
355 dpavlin 550 my ($f,$sf) = @_;
356     die "marc_repeatable_subfield need field and subfield!\n" unless ($f && $sf);
357     $marc_repeatable_subfield->{ $f . $sf }++;
358 dpavlin 547 marc(@_);
359     }
360    
361     =head2 marc_indicators
362    
363     Set both indicators for MARC field
364    
365     marc_indicators('900', ' ', 1);
366    
367     Any indicator value other than C<0-9> will be treated as undefined.
368    
369     =cut
370    
371     sub marc_indicators {
372     my $f = shift || die "marc_indicators need field!\n";
373     my ($i1,$i2) = @_;
374     die "marc_indicators($f, ...) need i1!\n" unless(defined($i1));
375     die "marc_indicators($f, $i1, ...) need i2!\n" unless(defined($i2));
376    
377     $i1 = ' ' if ($i1 !~ /^\d$/);
378     $i2 = ' ' if ($i2 !~ /^\d$/);
379 dpavlin 548 @{ $marc_indicators->{$f} } = ($i1,$i2);
380 dpavlin 547 }
381    
382    
383 dpavlin 540 =head1 Functions to extract data from input
384    
385     This function should be used inside functions to create C<data_structure> described
386     above.
387    
388 dpavlin 536 =head2 rec1
389 dpavlin 371
390 dpavlin 536 Return all values in some field
391 dpavlin 371
392 dpavlin 536 @v = rec1('200')
393 dpavlin 15
394 dpavlin 536 TODO: order of values is probably same as in source data, need to investigate that
395 dpavlin 15
396 dpavlin 536 =cut
397 dpavlin 15
398 dpavlin 536 sub rec1 {
399     my $f = shift;
400     return unless (defined($rec) && defined($rec->{$f}));
401     if (ref($rec->{$f}) eq 'ARRAY') {
402     return map {
403     if (ref($_) eq 'HASH') {
404     values %{$_};
405 dpavlin 31 } else {
406 dpavlin 536 $_;
407 dpavlin 31 }
408 dpavlin 536 } @{ $rec->{$f} };
409     } elsif( defined($rec->{$f}) ) {
410     return $rec->{$f};
411 dpavlin 15 }
412     }
413    
414 dpavlin 536 =head2 rec2
415 dpavlin 15
416 dpavlin 536 Return all values in specific field and subfield
417 dpavlin 13
418 dpavlin 536 @v = rec2('200','a')
419 dpavlin 13
420     =cut
421    
422 dpavlin 536 sub rec2 {
423     my $f = shift;
424     return unless (defined($rec && $rec->{$f}));
425     my $sf = shift;
426     return map { $_->{$sf} } grep { ref($_) eq 'HASH' && $_->{$sf} } @{ $rec->{$f} };
427     }
428 dpavlin 13
429 dpavlin 536 =head2 rec
430 dpavlin 13
431 dpavlin 536 syntaxtic sugar for
432 dpavlin 13
433 dpavlin 536 @v = rec('200')
434     @v = rec('200','a')
435 dpavlin 13
436 dpavlin 536 =cut
437 dpavlin 373
438 dpavlin 536 sub rec {
439     if ($#_ == 0) {
440     return rec1(@_);
441     } elsif ($#_ == 1) {
442     return rec2(@_);
443 dpavlin 13 }
444     }
445    
446 dpavlin 536 =head2 regex
447 dpavlin 15
448 dpavlin 536 Apply regex to some or all values
449 dpavlin 15
450 dpavlin 536 @v = regex( 's/foo/bar/g', @v );
451 dpavlin 15
452     =cut
453    
454 dpavlin 536 sub regex {
455     my $r = shift;
456     my @out;
457 dpavlin 550 #warn "r: $r\n", dump(\@_);
458 dpavlin 536 foreach my $t (@_) {
459     next unless ($t);
460     eval "\$t =~ $r";
461     push @out, $t if ($t && $t ne '');
462 dpavlin 368 }
463 dpavlin 536 return @out;
464 dpavlin 15 }
465    
466 dpavlin 536 =head2 prefix
467 dpavlin 15
468 dpavlin 536 Prefix all values with a string
469 dpavlin 15
470 dpavlin 536 @v = prefix( 'my_', @v );
471 dpavlin 15
472     =cut
473    
474 dpavlin 536 sub prefix {
475     my $p = shift or die "prefix needs string as first argument";
476     return map { $p . $_ } grep { defined($_) } @_;
477     }
478 dpavlin 15
479 dpavlin 536 =head2 suffix
480 dpavlin 15
481 dpavlin 536 suffix all values with a string
482 dpavlin 15
483 dpavlin 536 @v = suffix( '_my', @v );
484 dpavlin 15
485 dpavlin 536 =cut
486 dpavlin 15
487 dpavlin 536 sub suffix {
488     my $s = shift or die "suffix needs string as first argument";
489     return map { $_ . $s } grep { defined($_) } @_;
490 dpavlin 15 }
491    
492 dpavlin 536 =head2 surround
493 dpavlin 13
494 dpavlin 536 surround all values with a two strings
495 dpavlin 13
496 dpavlin 536 @v = surround( 'prefix_', '_suffix', @v );
497 dpavlin 13
498     =cut
499    
500 dpavlin 536 sub surround {
501     my $p = shift or die "surround need prefix as first argument";
502     my $s = shift or die "surround needs suffix as second argument";
503     return map { $p . $_ . $s } grep { defined($_) } @_;
504 dpavlin 13 }
505    
506 dpavlin 536 =head2 first
507 dpavlin 13
508 dpavlin 536 Return first element
509 dpavlin 15
510 dpavlin 536 $v = first( @v );
511 dpavlin 13
512     =cut
513    
514 dpavlin 536 sub first {
515     my $r = shift;
516     return $r;
517 dpavlin 13 }
518    
519 dpavlin 536 =head2 lookup
520 dpavlin 13
521 dpavlin 536 Consult lookup hashes for some value
522 dpavlin 13
523 dpavlin 536 @v = lookup( $v );
524     @v = lookup( @v );
525 dpavlin 13
526     =cut
527    
528 dpavlin 536 sub lookup {
529     my $k = shift or return;
530     return unless (defined($lookup->{$k}));
531     if (ref($lookup->{$k}) eq 'ARRAY') {
532     return @{ $lookup->{$k} };
533     } else {
534     return $lookup->{$k};
535     }
536 dpavlin 13 }
537    
538 dpavlin 536 =head2 join_with
539 dpavlin 13
540 dpavlin 536 Joins walues with some delimiter
541 dpavlin 10
542 dpavlin 536 $v = join_with(", ", @v);
543 dpavlin 10
544 dpavlin 536 =cut
545 dpavlin 10
546 dpavlin 536 sub join_with {
547     my $d = shift;
548     return join($d, grep { defined($_) && $_ ne '' } @_);
549     }
550 dpavlin 10
551 dpavlin 536 # END
552     1;

  ViewVC Help
Powered by ViewVC 1.1.26