/[webpac2]/trunk/lib/WebPAC/Normalize.pm
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Annotation of /trunk/lib/WebPAC/Normalize.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 547 - (hide annotations)
Thu Jun 29 23:19:26 2006 UTC (17 years, 10 months ago) by dpavlin
File size: 8948 byte(s)
 r742@llin:  dpavlin | 2006-06-30 01:21:24 +0200
 added marc_repetable_subfield and marc_indicators, renamed marc21 to marc [2.23]

1 dpavlin 10 package WebPAC::Normalize;
2 dpavlin 536 use Exporter 'import';
3     @EXPORT = qw/
4 dpavlin 538 _set_rec _set_lookup
5     _get_ds _clean_ds
6    
7 dpavlin 536 tag search display
8 dpavlin 547 marc marc_indicators marc_repeatable_subfield
9 dpavlin 540
10 dpavlin 536 rec1 rec2 rec
11     regex prefix suffix surround
12     first lookup join_with
13     /;
14 dpavlin 10
15     use warnings;
16     use strict;
17 dpavlin 536
18     #use base qw/WebPAC::Common/;
19 dpavlin 13 use Data::Dumper;
20 dpavlin 541 use Encode qw/from_to/;
21 dpavlin 10
22     =head1 NAME
23    
24 dpavlin 536 WebPAC::Normalize - describe normalisaton rules using sets
25 dpavlin 10
26     =head1 VERSION
27    
28 dpavlin 541 Version 0.06
29 dpavlin 10
30     =cut
31    
32 dpavlin 541 our $VERSION = '0.06';
33 dpavlin 10
34     =head1 SYNOPSIS
35    
36 dpavlin 536 This module uses C<conf/normalize/*.pl> files to perform normalisation
37     from input records using perl functions which are specialized for set
38     processing.
39 dpavlin 10
40 dpavlin 536 Sets are implemented as arrays, and normalisation file is valid perl, which
41     means that you check it's validity before running WebPAC using
42     C<perl -c normalize.pl>.
43 dpavlin 15
44 dpavlin 536 Normalisation can generate multiple output normalized data. For now, supported output
45 dpavlin 540 types (on the left side of definition) are: C<tag>, C<display>, C<search> and
46 dpavlin 547 C<marc>.
47 dpavlin 15
48 dpavlin 10 =head1 FUNCTIONS
49    
50 dpavlin 538 Functions which start with C<_> are private and used by WebPAC internally.
51     All other functions are available for use within normalisation rules.
52    
53 dpavlin 536 =head2 data_structure
54 dpavlin 10
55 dpavlin 536 Return data structure
56 dpavlin 13
57 dpavlin 538 my $ds = WebPAC::Normalize::data_structure(
58 dpavlin 536 lookup => $lookup->lookup_hash,
59     row => $row,
60     rules => $normalize_pl_config,
61 dpavlin 541 marc_encoding => 'utf-8',
62 dpavlin 13 );
63    
64 dpavlin 540 Options C<lookup>, C<row>, C<rules> and C<log> are mandatory while all
65     other are optional.
66    
67 dpavlin 536 This function will B<die> if normalizastion can't be evaled.
68 dpavlin 15
69 dpavlin 538 Since this function isn't exported you have to call it with
70     C<WebPAC::Normalize::data_structure>.
71    
72 dpavlin 10 =cut
73    
74 dpavlin 536 sub data_structure {
75     my $arg = {@_};
76 dpavlin 13
77 dpavlin 536 die "need row argument" unless ($arg->{row});
78     die "need normalisation argument" unless ($arg->{rules});
79 dpavlin 31
80 dpavlin 536 no strict 'subs';
81 dpavlin 538 _set_lookup( $arg->{lookup} );
82     _set_rec( $arg->{row} );
83 dpavlin 541 _clean_ds( %{ $arg } );
84 dpavlin 536 eval "$arg->{rules}";
85     die "error evaling $arg->{rules}: $@\n" if ($@);
86 dpavlin 540
87 dpavlin 538 return _get_ds();
88 dpavlin 10 }
89    
90 dpavlin 538 =head2 _set_rec
91 dpavlin 13
92 dpavlin 536 Set current record hash
93 dpavlin 433
94 dpavlin 538 _set_rec( $rec );
95 dpavlin 433
96     =cut
97    
98 dpavlin 536 my $rec;
99 dpavlin 433
100 dpavlin 538 sub _set_rec {
101 dpavlin 536 $rec = shift or die "no record hash";
102 dpavlin 433 }
103    
104 dpavlin 538 =head2 _get_ds
105    
106     Return hash formatted as data structure
107    
108     my $ds = _get_ds();
109    
110     =cut
111    
112 dpavlin 547 my ($out,$marc_record, $marc_encoding, $marc_repeatable_subfield, $marc_indicators);
113 dpavlin 538
114     sub _get_ds {
115     return $out;
116     }
117    
118     =head2 _clean_ds
119    
120     Clean data structure hash for next record
121    
122     _clean_ds();
123    
124     =cut
125    
126     sub _clean_ds {
127 dpavlin 541 my $a = {@_};
128 dpavlin 547 ($out,$marc_record, $marc_encoding, $marc_repeatable_subfield, $marc_indicators) = (undef);
129 dpavlin 541 $marc_encoding = $a->{marc_encoding};
130 dpavlin 538 }
131    
132     =head2 _set_lookup
133    
134     Set current lookup hash
135    
136     _set_lookup( $lookup );
137    
138     =cut
139    
140     my $lookup;
141    
142     sub _set_lookup {
143     $lookup = shift;
144     }
145    
146 dpavlin 547 =head2 _get_marc_fields
147 dpavlin 540
148 dpavlin 547 Get all fields defined by calls to C<marc>
149 dpavlin 540
150 dpavlin 547 $marc->add_fields( WebPAC::Normalize:_get_marc_fields() );
151 dpavlin 540
152 dpavlin 542
153 dpavlin 543
154     We are using I<magic> which detect repeatable fields only from
155     sequence of field/subfield data generated by normalization.
156    
157     Repeatable field is created if there is second occurence of same subfield or
158     if any of indicators are different. This is sane for most cases except for
159     non-repeatable fields with repeatable subfields.
160    
161 dpavlin 547 You can change behaviour of that using C<marc_repeatable_subfield>.
162 dpavlin 543
163 dpavlin 540 =cut
164    
165 dpavlin 547 sub _get_marc_fields {
166 dpavlin 542 my @m;
167     my $last;
168 dpavlin 547 foreach my $row (@{ $marc_record }) {
169 dpavlin 543 if ($last &&
170     $last->[0] eq $row->[0] && # check if field is same
171     $last->[1] eq $row->[1] && # check for i1
172 dpavlin 544 $last->[2] eq $row->[2] && # and for i2
173 dpavlin 547 ( $last->[3] ne $row->[3] || # and subfield is different
174     $last->[3] eq $row->[3] && # or subfield is same,
175     $marc_repeatable_subfield->{ $row->[3] } # but is repeatable
176     )
177 dpavlin 543 ) {
178     push @$last, ( $row->[3] , $row->[4] );
179     warn "## ++ added $row->[0] ^$row->[3] to $last->[0]\n";
180 dpavlin 542 next;
181     } elsif ($last) {
182     push @m, $last;
183     }
184    
185 dpavlin 543 $last = $row;
186 dpavlin 542 }
187    
188     push @m, $last if ($last);
189    
190     return @m;
191 dpavlin 540 }
192    
193     =head1 Functions to create C<data_structure>
194    
195     Those functions generally have to first in your normalization file.
196    
197 dpavlin 536 =head2 tag
198 dpavlin 433
199 dpavlin 536 Define new tag for I<search> and I<display>.
200 dpavlin 433
201 dpavlin 536 tag('Title', rec('200','a') );
202 dpavlin 13
203    
204     =cut
205    
206 dpavlin 536 sub tag {
207     my $name = shift or die "tag needs name as first argument";
208     my @o = grep { defined($_) && $_ ne '' } @_;
209     return unless (@o);
210     $out->{$name}->{tag} = $name;
211     $out->{$name}->{search} = \@o;
212     $out->{$name}->{display} = \@o;
213     }
214 dpavlin 13
215 dpavlin 536 =head2 display
216 dpavlin 13
217 dpavlin 536 Define tag just for I<display>
218 dpavlin 125
219 dpavlin 536 @v = display('Title', rec('200','a') );
220 dpavlin 125
221 dpavlin 536 =cut
222 dpavlin 125
223 dpavlin 536 sub display {
224     my $name = shift or die "display needs name as first argument";
225     my @o = grep { defined($_) && $_ ne '' } @_;
226     return unless (@o);
227     $out->{$name}->{tag} = $name;
228     $out->{$name}->{display} = \@o;
229     }
230 dpavlin 13
231 dpavlin 536 =head2 search
232 dpavlin 13
233 dpavlin 536 Prepare values just for I<search>
234 dpavlin 13
235 dpavlin 536 @v = search('Title', rec('200','a') );
236 dpavlin 433
237 dpavlin 536 =cut
238 dpavlin 13
239 dpavlin 536 sub search {
240     my $name = shift or die "search needs name as first argument";
241     my @o = grep { defined($_) && $_ ne '' } @_;
242     return unless (@o);
243     $out->{$name}->{tag} = $name;
244     $out->{$name}->{search} = \@o;
245 dpavlin 13 }
246    
247 dpavlin 547 =head2 marc
248 dpavlin 540
249     Save value for MARC field
250    
251 dpavlin 547 marc('900','a', rec('200','a') );
252 dpavlin 540
253     =cut
254    
255 dpavlin 547 sub marc {
256     my $f = shift or die "marc needs field";
257     die "marc field must be numer" unless ($f =~ /^\d+$/);
258 dpavlin 540
259 dpavlin 547 my $sf = shift or die "marc needs subfield";
260 dpavlin 540
261 dpavlin 541 foreach (@_) {
262     my $v = $_; # make var read-write for Encode
263 dpavlin 543 next unless (defined($v) && $v !~ /^\s*$/);
264 dpavlin 541 from_to($v, 'iso-8859-2', $marc_encoding) if ($marc_encoding);
265 dpavlin 547 push @{ $marc_record }, [
266     $f,
267     $marc_indicators->{$f}->{i1} || ' ',
268     $marc_indicators->{$f}->{i2} || ' ',
269     $sf => $v
270     ];
271 dpavlin 540 }
272     }
273    
274 dpavlin 547 =head2 marc_repeatable_subfield
275    
276     Save values for MARC repetable subfield
277    
278     marc_repeatable_subfield('910', 'z', rec('909') );
279    
280     =cut
281    
282     sub marc_repeatable_subfield {
283     die "marc_repeatable_subfield need subfield!\n" unless (defined($_[1]));
284     $marc_repeatable_subfield->{ $_[1] }++;
285     marc(@_);
286     }
287    
288     =head2 marc_indicators
289    
290     Set both indicators for MARC field
291    
292     marc_indicators('900', ' ', 1);
293    
294     Any indicator value other than C<0-9> will be treated as undefined.
295    
296     =cut
297    
298     sub marc_indicators {
299     my $f = shift || die "marc_indicators need field!\n";
300     my ($i1,$i2) = @_;
301     die "marc_indicators($f, ...) need i1!\n" unless(defined($i1));
302     die "marc_indicators($f, $i1, ...) need i2!\n" unless(defined($i2));
303    
304     $i1 = ' ' if ($i1 !~ /^\d$/);
305     $i2 = ' ' if ($i2 !~ /^\d$/);
306     $marc_indicators->{$f}->{i1} = $i1;
307     $marc_indicators->{$f}->{i2} = $i2;
308     }
309    
310    
311 dpavlin 540 =head1 Functions to extract data from input
312    
313     This function should be used inside functions to create C<data_structure> described
314     above.
315    
316 dpavlin 536 =head2 rec1
317 dpavlin 371
318 dpavlin 536 Return all values in some field
319 dpavlin 371
320 dpavlin 536 @v = rec1('200')
321 dpavlin 15
322 dpavlin 536 TODO: order of values is probably same as in source data, need to investigate that
323 dpavlin 15
324 dpavlin 536 =cut
325 dpavlin 15
326 dpavlin 536 sub rec1 {
327     my $f = shift;
328     return unless (defined($rec) && defined($rec->{$f}));
329     if (ref($rec->{$f}) eq 'ARRAY') {
330     return map {
331     if (ref($_) eq 'HASH') {
332     values %{$_};
333 dpavlin 31 } else {
334 dpavlin 536 $_;
335 dpavlin 31 }
336 dpavlin 536 } @{ $rec->{$f} };
337     } elsif( defined($rec->{$f}) ) {
338     return $rec->{$f};
339 dpavlin 15 }
340     }
341    
342 dpavlin 536 =head2 rec2
343 dpavlin 15
344 dpavlin 536 Return all values in specific field and subfield
345 dpavlin 13
346 dpavlin 536 @v = rec2('200','a')
347 dpavlin 13
348     =cut
349    
350 dpavlin 536 sub rec2 {
351     my $f = shift;
352     return unless (defined($rec && $rec->{$f}));
353     my $sf = shift;
354     return map { $_->{$sf} } grep { ref($_) eq 'HASH' && $_->{$sf} } @{ $rec->{$f} };
355     }
356 dpavlin 13
357 dpavlin 536 =head2 rec
358 dpavlin 13
359 dpavlin 536 syntaxtic sugar for
360 dpavlin 13
361 dpavlin 536 @v = rec('200')
362     @v = rec('200','a')
363 dpavlin 13
364 dpavlin 536 =cut
365 dpavlin 373
366 dpavlin 536 sub rec {
367     if ($#_ == 0) {
368     return rec1(@_);
369     } elsif ($#_ == 1) {
370     return rec2(@_);
371 dpavlin 13 }
372     }
373    
374 dpavlin 536 =head2 regex
375 dpavlin 15
376 dpavlin 536 Apply regex to some or all values
377 dpavlin 15
378 dpavlin 536 @v = regex( 's/foo/bar/g', @v );
379 dpavlin 15
380     =cut
381    
382 dpavlin 536 sub regex {
383     my $r = shift;
384     my @out;
385     #warn "r: $r\n",Dumper(\@_);
386     foreach my $t (@_) {
387     next unless ($t);
388     eval "\$t =~ $r";
389     push @out, $t if ($t && $t ne '');
390 dpavlin 368 }
391 dpavlin 536 return @out;
392 dpavlin 15 }
393    
394 dpavlin 536 =head2 prefix
395 dpavlin 15
396 dpavlin 536 Prefix all values with a string
397 dpavlin 15
398 dpavlin 536 @v = prefix( 'my_', @v );
399 dpavlin 15
400     =cut
401    
402 dpavlin 536 sub prefix {
403     my $p = shift or die "prefix needs string as first argument";
404     return map { $p . $_ } grep { defined($_) } @_;
405     }
406 dpavlin 15
407 dpavlin 536 =head2 suffix
408 dpavlin 15
409 dpavlin 536 suffix all values with a string
410 dpavlin 15
411 dpavlin 536 @v = suffix( '_my', @v );
412 dpavlin 15
413 dpavlin 536 =cut
414 dpavlin 15
415 dpavlin 536 sub suffix {
416     my $s = shift or die "suffix needs string as first argument";
417     return map { $_ . $s } grep { defined($_) } @_;
418 dpavlin 15 }
419    
420 dpavlin 536 =head2 surround
421 dpavlin 13
422 dpavlin 536 surround all values with a two strings
423 dpavlin 13
424 dpavlin 536 @v = surround( 'prefix_', '_suffix', @v );
425 dpavlin 13
426     =cut
427    
428 dpavlin 536 sub surround {
429     my $p = shift or die "surround need prefix as first argument";
430     my $s = shift or die "surround needs suffix as second argument";
431     return map { $p . $_ . $s } grep { defined($_) } @_;
432 dpavlin 13 }
433    
434 dpavlin 536 =head2 first
435 dpavlin 13
436 dpavlin 536 Return first element
437 dpavlin 15
438 dpavlin 536 $v = first( @v );
439 dpavlin 13
440     =cut
441    
442 dpavlin 536 sub first {
443     my $r = shift;
444     return $r;
445 dpavlin 13 }
446    
447 dpavlin 536 =head2 lookup
448 dpavlin 13
449 dpavlin 536 Consult lookup hashes for some value
450 dpavlin 13
451 dpavlin 536 @v = lookup( $v );
452     @v = lookup( @v );
453 dpavlin 13
454     =cut
455    
456 dpavlin 536 sub lookup {
457     my $k = shift or return;
458     return unless (defined($lookup->{$k}));
459     if (ref($lookup->{$k}) eq 'ARRAY') {
460     return @{ $lookup->{$k} };
461     } else {
462     return $lookup->{$k};
463     }
464 dpavlin 13 }
465    
466 dpavlin 536 =head2 join_with
467 dpavlin 13
468 dpavlin 536 Joins walues with some delimiter
469 dpavlin 10
470 dpavlin 536 $v = join_with(", ", @v);
471 dpavlin 10
472 dpavlin 536 =cut
473 dpavlin 10
474 dpavlin 536 sub join_with {
475     my $d = shift;
476     return join($d, grep { defined($_) && $_ ne '' } @_);
477     }
478 dpavlin 10
479 dpavlin 536 # END
480     1;

  ViewVC Help
Powered by ViewVC 1.1.26