/[webpac2]/trunk/lib/WebPAC/Normalize.pm
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Annotation of /trunk/lib/WebPAC/Normalize.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 548 - (hide annotations)
Thu Jun 29 23:29:02 2006 UTC (16 years, 7 months ago) by dpavlin
File size: 8870 byte(s)
 r744@llin:  dpavlin | 2006-06-30 01:31:00 +0200
 don't chew indicators with 0 value, removed debugging warning

1 dpavlin 10 package WebPAC::Normalize;
2 dpavlin 536 use Exporter 'import';
3     @EXPORT = qw/
4 dpavlin 538 _set_rec _set_lookup
5     _get_ds _clean_ds
6    
7 dpavlin 536 tag search display
8 dpavlin 547 marc marc_indicators marc_repeatable_subfield
9 dpavlin 540
10 dpavlin 536 rec1 rec2 rec
11     regex prefix suffix surround
12     first lookup join_with
13     /;
14 dpavlin 10
15     use warnings;
16     use strict;
17 dpavlin 536
18     #use base qw/WebPAC::Common/;
19 dpavlin 13 use Data::Dumper;
20 dpavlin 541 use Encode qw/from_to/;
21 dpavlin 10
22     =head1 NAME
23    
24 dpavlin 536 WebPAC::Normalize - describe normalisaton rules using sets
25 dpavlin 10
26     =head1 VERSION
27    
28 dpavlin 541 Version 0.06
29 dpavlin 10
30     =cut
31    
32 dpavlin 541 our $VERSION = '0.06';
33 dpavlin 10
34     =head1 SYNOPSIS
35    
36 dpavlin 536 This module uses C<conf/normalize/*.pl> files to perform normalisation
37     from input records using perl functions which are specialized for set
38     processing.
39 dpavlin 10
40 dpavlin 536 Sets are implemented as arrays, and normalisation file is valid perl, which
41     means that you check it's validity before running WebPAC using
42     C<perl -c normalize.pl>.
43 dpavlin 15
44 dpavlin 536 Normalisation can generate multiple output normalized data. For now, supported output
45 dpavlin 540 types (on the left side of definition) are: C<tag>, C<display>, C<search> and
46 dpavlin 547 C<marc>.
47 dpavlin 15
48 dpavlin 10 =head1 FUNCTIONS
49    
50 dpavlin 538 Functions which start with C<_> are private and used by WebPAC internally.
51     All other functions are available for use within normalisation rules.
52    
53 dpavlin 536 =head2 data_structure
54 dpavlin 10
55 dpavlin 536 Return data structure
56 dpavlin 13
57 dpavlin 538 my $ds = WebPAC::Normalize::data_structure(
58 dpavlin 536 lookup => $lookup->lookup_hash,
59     row => $row,
60     rules => $normalize_pl_config,
61 dpavlin 541 marc_encoding => 'utf-8',
62 dpavlin 13 );
63    
64 dpavlin 540 Options C<lookup>, C<row>, C<rules> and C<log> are mandatory while all
65     other are optional.
66    
67 dpavlin 536 This function will B<die> if normalizastion can't be evaled.
68 dpavlin 15
69 dpavlin 538 Since this function isn't exported you have to call it with
70     C<WebPAC::Normalize::data_structure>.
71    
72 dpavlin 10 =cut
73    
74 dpavlin 536 sub data_structure {
75     my $arg = {@_};
76 dpavlin 13
77 dpavlin 536 die "need row argument" unless ($arg->{row});
78     die "need normalisation argument" unless ($arg->{rules});
79 dpavlin 31
80 dpavlin 536 no strict 'subs';
81 dpavlin 538 _set_lookup( $arg->{lookup} );
82     _set_rec( $arg->{row} );
83 dpavlin 541 _clean_ds( %{ $arg } );
84 dpavlin 536 eval "$arg->{rules}";
85     die "error evaling $arg->{rules}: $@\n" if ($@);
86 dpavlin 540
87 dpavlin 538 return _get_ds();
88 dpavlin 10 }
89    
90 dpavlin 538 =head2 _set_rec
91 dpavlin 13
92 dpavlin 536 Set current record hash
93 dpavlin 433
94 dpavlin 538 _set_rec( $rec );
95 dpavlin 433
96     =cut
97    
98 dpavlin 536 my $rec;
99 dpavlin 433
100 dpavlin 538 sub _set_rec {
101 dpavlin 536 $rec = shift or die "no record hash";
102 dpavlin 433 }
103    
104 dpavlin 538 =head2 _get_ds
105    
106     Return hash formatted as data structure
107    
108     my $ds = _get_ds();
109    
110     =cut
111    
112 dpavlin 547 my ($out,$marc_record, $marc_encoding, $marc_repeatable_subfield, $marc_indicators);
113 dpavlin 538
114     sub _get_ds {
115     return $out;
116     }
117    
118     =head2 _clean_ds
119    
120     Clean data structure hash for next record
121    
122     _clean_ds();
123    
124     =cut
125    
126     sub _clean_ds {
127 dpavlin 541 my $a = {@_};
128 dpavlin 547 ($out,$marc_record, $marc_encoding, $marc_repeatable_subfield, $marc_indicators) = (undef);
129 dpavlin 541 $marc_encoding = $a->{marc_encoding};
130 dpavlin 538 }
131    
132     =head2 _set_lookup
133    
134     Set current lookup hash
135    
136     _set_lookup( $lookup );
137    
138     =cut
139    
140     my $lookup;
141    
142     sub _set_lookup {
143     $lookup = shift;
144     }
145    
146 dpavlin 547 =head2 _get_marc_fields
147 dpavlin 540
148 dpavlin 547 Get all fields defined by calls to C<marc>
149 dpavlin 540
150 dpavlin 547 $marc->add_fields( WebPAC::Normalize:_get_marc_fields() );
151 dpavlin 540
152 dpavlin 542
153 dpavlin 543
154     We are using I<magic> which detect repeatable fields only from
155     sequence of field/subfield data generated by normalization.
156    
157     Repeatable field is created if there is second occurence of same subfield or
158     if any of indicators are different. This is sane for most cases except for
159     non-repeatable fields with repeatable subfields.
160    
161 dpavlin 547 You can change behaviour of that using C<marc_repeatable_subfield>.
162 dpavlin 543
163 dpavlin 540 =cut
164    
165 dpavlin 547 sub _get_marc_fields {
166 dpavlin 542 my @m;
167     my $last;
168 dpavlin 547 foreach my $row (@{ $marc_record }) {
169 dpavlin 543 if ($last &&
170     $last->[0] eq $row->[0] && # check if field is same
171     $last->[1] eq $row->[1] && # check for i1
172 dpavlin 544 $last->[2] eq $row->[2] && # and for i2
173 dpavlin 547 ( $last->[3] ne $row->[3] || # and subfield is different
174     $last->[3] eq $row->[3] && # or subfield is same,
175     $marc_repeatable_subfield->{ $row->[3] } # but is repeatable
176     )
177 dpavlin 543 ) {
178     push @$last, ( $row->[3] , $row->[4] );
179 dpavlin 542 next;
180     } elsif ($last) {
181     push @m, $last;
182     }
183    
184 dpavlin 543 $last = $row;
185 dpavlin 542 }
186    
187     push @m, $last if ($last);
188    
189     return @m;
190 dpavlin 540 }
191    
192     =head1 Functions to create C<data_structure>
193    
194     Those functions generally have to first in your normalization file.
195    
196 dpavlin 536 =head2 tag
197 dpavlin 433
198 dpavlin 536 Define new tag for I<search> and I<display>.
199 dpavlin 433
200 dpavlin 536 tag('Title', rec('200','a') );
201 dpavlin 13
202    
203     =cut
204    
205 dpavlin 536 sub tag {
206     my $name = shift or die "tag needs name as first argument";
207     my @o = grep { defined($_) && $_ ne '' } @_;
208     return unless (@o);
209     $out->{$name}->{tag} = $name;
210     $out->{$name}->{search} = \@o;
211     $out->{$name}->{display} = \@o;
212     }
213 dpavlin 13
214 dpavlin 536 =head2 display
215 dpavlin 13
216 dpavlin 536 Define tag just for I<display>
217 dpavlin 125
218 dpavlin 536 @v = display('Title', rec('200','a') );
219 dpavlin 125
220 dpavlin 536 =cut
221 dpavlin 125
222 dpavlin 536 sub display {
223     my $name = shift or die "display needs name as first argument";
224     my @o = grep { defined($_) && $_ ne '' } @_;
225     return unless (@o);
226     $out->{$name}->{tag} = $name;
227     $out->{$name}->{display} = \@o;
228     }
229 dpavlin 13
230 dpavlin 536 =head2 search
231 dpavlin 13
232 dpavlin 536 Prepare values just for I<search>
233 dpavlin 13
234 dpavlin 536 @v = search('Title', rec('200','a') );
235 dpavlin 433
236 dpavlin 536 =cut
237 dpavlin 13
238 dpavlin 536 sub search {
239     my $name = shift or die "search needs name as first argument";
240     my @o = grep { defined($_) && $_ ne '' } @_;
241     return unless (@o);
242     $out->{$name}->{tag} = $name;
243     $out->{$name}->{search} = \@o;
244 dpavlin 13 }
245    
246 dpavlin 547 =head2 marc
247 dpavlin 540
248     Save value for MARC field
249    
250 dpavlin 547 marc('900','a', rec('200','a') );
251 dpavlin 540
252     =cut
253    
254 dpavlin 547 sub marc {
255     my $f = shift or die "marc needs field";
256     die "marc field must be numer" unless ($f =~ /^\d+$/);
257 dpavlin 540
258 dpavlin 547 my $sf = shift or die "marc needs subfield";
259 dpavlin 540
260 dpavlin 541 foreach (@_) {
261     my $v = $_; # make var read-write for Encode
262 dpavlin 543 next unless (defined($v) && $v !~ /^\s*$/);
263 dpavlin 541 from_to($v, 'iso-8859-2', $marc_encoding) if ($marc_encoding);
264 dpavlin 548 my ($i1,$i2) = defined($marc_indicators->{$f}) ? @{ $marc_indicators->{$f} } : (' ',' ');
265     push @{ $marc_record }, [ $f, $i1, $i2, $sf => $v ];
266 dpavlin 540 }
267     }
268    
269 dpavlin 547 =head2 marc_repeatable_subfield
270    
271     Save values for MARC repetable subfield
272    
273     marc_repeatable_subfield('910', 'z', rec('909') );
274    
275     =cut
276    
277     sub marc_repeatable_subfield {
278     die "marc_repeatable_subfield need subfield!\n" unless (defined($_[1]));
279     $marc_repeatable_subfield->{ $_[1] }++;
280     marc(@_);
281     }
282    
283     =head2 marc_indicators
284    
285     Set both indicators for MARC field
286    
287     marc_indicators('900', ' ', 1);
288    
289     Any indicator value other than C<0-9> will be treated as undefined.
290    
291     =cut
292    
293     sub marc_indicators {
294     my $f = shift || die "marc_indicators need field!\n";
295     my ($i1,$i2) = @_;
296     die "marc_indicators($f, ...) need i1!\n" unless(defined($i1));
297     die "marc_indicators($f, $i1, ...) need i2!\n" unless(defined($i2));
298    
299     $i1 = ' ' if ($i1 !~ /^\d$/);
300     $i2 = ' ' if ($i2 !~ /^\d$/);
301 dpavlin 548 @{ $marc_indicators->{$f} } = ($i1,$i2);
302 dpavlin 547 }
303    
304    
305 dpavlin 540 =head1 Functions to extract data from input
306    
307     This function should be used inside functions to create C<data_structure> described
308     above.
309    
310 dpavlin 536 =head2 rec1
311 dpavlin 371
312 dpavlin 536 Return all values in some field
313 dpavlin 371
314 dpavlin 536 @v = rec1('200')
315 dpavlin 15
316 dpavlin 536 TODO: order of values is probably same as in source data, need to investigate that
317 dpavlin 15
318 dpavlin 536 =cut
319 dpavlin 15
320 dpavlin 536 sub rec1 {
321     my $f = shift;
322     return unless (defined($rec) && defined($rec->{$f}));
323     if (ref($rec->{$f}) eq 'ARRAY') {
324     return map {
325     if (ref($_) eq 'HASH') {
326     values %{$_};
327 dpavlin 31 } else {
328 dpavlin 536 $_;
329 dpavlin 31 }
330 dpavlin 536 } @{ $rec->{$f} };
331     } elsif( defined($rec->{$f}) ) {
332     return $rec->{$f};
333 dpavlin 15 }
334     }
335    
336 dpavlin 536 =head2 rec2
337 dpavlin 15
338 dpavlin 536 Return all values in specific field and subfield
339 dpavlin 13
340 dpavlin 536 @v = rec2('200','a')
341 dpavlin 13
342     =cut
343    
344 dpavlin 536 sub rec2 {
345     my $f = shift;
346     return unless (defined($rec && $rec->{$f}));
347     my $sf = shift;
348     return map { $_->{$sf} } grep { ref($_) eq 'HASH' && $_->{$sf} } @{ $rec->{$f} };
349     }
350 dpavlin 13
351 dpavlin 536 =head2 rec
352 dpavlin 13
353 dpavlin 536 syntaxtic sugar for
354 dpavlin 13
355 dpavlin 536 @v = rec('200')
356     @v = rec('200','a')
357 dpavlin 13
358 dpavlin 536 =cut
359 dpavlin 373
360 dpavlin 536 sub rec {
361     if ($#_ == 0) {
362     return rec1(@_);
363     } elsif ($#_ == 1) {
364     return rec2(@_);
365 dpavlin 13 }
366     }
367    
368 dpavlin 536 =head2 regex
369 dpavlin 15
370 dpavlin 536 Apply regex to some or all values
371 dpavlin 15
372 dpavlin 536 @v = regex( 's/foo/bar/g', @v );
373 dpavlin 15
374     =cut
375    
376 dpavlin 536 sub regex {
377     my $r = shift;
378     my @out;
379     #warn "r: $r\n",Dumper(\@_);
380     foreach my $t (@_) {
381     next unless ($t);
382     eval "\$t =~ $r";
383     push @out, $t if ($t && $t ne '');
384 dpavlin 368 }
385 dpavlin 536 return @out;
386 dpavlin 15 }
387    
388 dpavlin 536 =head2 prefix
389 dpavlin 15
390 dpavlin 536 Prefix all values with a string
391 dpavlin 15
392 dpavlin 536 @v = prefix( 'my_', @v );
393 dpavlin 15
394     =cut
395    
396 dpavlin 536 sub prefix {
397     my $p = shift or die "prefix needs string as first argument";
398     return map { $p . $_ } grep { defined($_) } @_;
399     }
400 dpavlin 15
401 dpavlin 536 =head2 suffix
402 dpavlin 15
403 dpavlin 536 suffix all values with a string
404 dpavlin 15
405 dpavlin 536 @v = suffix( '_my', @v );
406 dpavlin 15
407 dpavlin 536 =cut
408 dpavlin 15
409 dpavlin 536 sub suffix {
410     my $s = shift or die "suffix needs string as first argument";
411     return map { $_ . $s } grep { defined($_) } @_;
412 dpavlin 15 }
413    
414 dpavlin 536 =head2 surround
415 dpavlin 13
416 dpavlin 536 surround all values with a two strings
417 dpavlin 13
418 dpavlin 536 @v = surround( 'prefix_', '_suffix', @v );
419 dpavlin 13
420     =cut
421    
422 dpavlin 536 sub surround {
423     my $p = shift or die "surround need prefix as first argument";
424     my $s = shift or die "surround needs suffix as second argument";
425     return map { $p . $_ . $s } grep { defined($_) } @_;
426 dpavlin 13 }
427    
428 dpavlin 536 =head2 first
429 dpavlin 13
430 dpavlin 536 Return first element
431 dpavlin 15
432 dpavlin 536 $v = first( @v );
433 dpavlin 13
434     =cut
435    
436 dpavlin 536 sub first {
437     my $r = shift;
438     return $r;
439 dpavlin 13 }
440    
441 dpavlin 536 =head2 lookup
442 dpavlin 13
443 dpavlin 536 Consult lookup hashes for some value
444 dpavlin 13
445 dpavlin 536 @v = lookup( $v );
446     @v = lookup( @v );
447 dpavlin 13
448     =cut
449    
450 dpavlin 536 sub lookup {
451     my $k = shift or return;
452     return unless (defined($lookup->{$k}));
453     if (ref($lookup->{$k}) eq 'ARRAY') {
454     return @{ $lookup->{$k} };
455     } else {
456     return $lookup->{$k};
457     }
458 dpavlin 13 }
459    
460 dpavlin 536 =head2 join_with
461 dpavlin 13
462 dpavlin 536 Joins walues with some delimiter
463 dpavlin 10
464 dpavlin 536 $v = join_with(", ", @v);
465 dpavlin 10
466 dpavlin 536 =cut
467 dpavlin 10
468 dpavlin 536 sub join_with {
469     my $d = shift;
470     return join($d, grep { defined($_) && $_ ne '' } @_);
471     }
472 dpavlin 10
473 dpavlin 536 # END
474     1;

  ViewVC Help
Powered by ViewVC 1.1.26