/[webpac2]/trunk/lib/WebPAC/Normalize.pm
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Annotation of /trunk/lib/WebPAC/Normalize.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 544 - (hide annotations)
Thu Jun 29 21:52:51 2006 UTC (16 years, 7 months ago) by dpavlin
File size: 7737 byte(s)
 r736@llin:  dpavlin | 2006-06-29 23:54:24 +0200
 oh, another bit of magic missing...

1 dpavlin 10 package WebPAC::Normalize;
2 dpavlin 536 use Exporter 'import';
3     @EXPORT = qw/
4 dpavlin 538 _set_rec _set_lookup
5     _get_ds _clean_ds
6    
7 dpavlin 536 tag search display
8 dpavlin 540 marc21
9    
10 dpavlin 536 rec1 rec2 rec
11     regex prefix suffix surround
12     first lookup join_with
13     /;
14 dpavlin 10
15     use warnings;
16     use strict;
17 dpavlin 536
18     #use base qw/WebPAC::Common/;
19 dpavlin 13 use Data::Dumper;
20 dpavlin 541 use Encode qw/from_to/;
21 dpavlin 10
22     =head1 NAME
23    
24 dpavlin 536 WebPAC::Normalize - describe normalisaton rules using sets
25 dpavlin 10
26     =head1 VERSION
27    
28 dpavlin 541 Version 0.06
29 dpavlin 10
30     =cut
31    
32 dpavlin 541 our $VERSION = '0.06';
33 dpavlin 10
34     =head1 SYNOPSIS
35    
36 dpavlin 536 This module uses C<conf/normalize/*.pl> files to perform normalisation
37     from input records using perl functions which are specialized for set
38     processing.
39 dpavlin 10
40 dpavlin 536 Sets are implemented as arrays, and normalisation file is valid perl, which
41     means that you check it's validity before running WebPAC using
42     C<perl -c normalize.pl>.
43 dpavlin 15
44 dpavlin 536 Normalisation can generate multiple output normalized data. For now, supported output
45 dpavlin 540 types (on the left side of definition) are: C<tag>, C<display>, C<search> and
46     C<marc21>.
47 dpavlin 15
48 dpavlin 10 =head1 FUNCTIONS
49    
50 dpavlin 538 Functions which start with C<_> are private and used by WebPAC internally.
51     All other functions are available for use within normalisation rules.
52    
53 dpavlin 536 =head2 data_structure
54 dpavlin 10
55 dpavlin 536 Return data structure
56 dpavlin 13
57 dpavlin 538 my $ds = WebPAC::Normalize::data_structure(
58 dpavlin 536 lookup => $lookup->lookup_hash,
59     row => $row,
60     rules => $normalize_pl_config,
61 dpavlin 541 marc_encoding => 'utf-8',
62 dpavlin 13 );
63    
64 dpavlin 540 Options C<lookup>, C<row>, C<rules> and C<log> are mandatory while all
65     other are optional.
66    
67 dpavlin 536 This function will B<die> if normalizastion can't be evaled.
68 dpavlin 15
69 dpavlin 538 Since this function isn't exported you have to call it with
70     C<WebPAC::Normalize::data_structure>.
71    
72 dpavlin 10 =cut
73    
74 dpavlin 536 sub data_structure {
75     my $arg = {@_};
76 dpavlin 13
77 dpavlin 536 die "need row argument" unless ($arg->{row});
78     die "need normalisation argument" unless ($arg->{rules});
79 dpavlin 31
80 dpavlin 536 no strict 'subs';
81 dpavlin 538 _set_lookup( $arg->{lookup} );
82     _set_rec( $arg->{row} );
83 dpavlin 541 _clean_ds( %{ $arg } );
84 dpavlin 536 eval "$arg->{rules}";
85     die "error evaling $arg->{rules}: $@\n" if ($@);
86 dpavlin 540
87 dpavlin 538 return _get_ds();
88 dpavlin 10 }
89    
90 dpavlin 538 =head2 _set_rec
91 dpavlin 13
92 dpavlin 536 Set current record hash
93 dpavlin 433
94 dpavlin 538 _set_rec( $rec );
95 dpavlin 433
96     =cut
97    
98 dpavlin 536 my $rec;
99 dpavlin 433
100 dpavlin 538 sub _set_rec {
101 dpavlin 536 $rec = shift or die "no record hash";
102 dpavlin 433 }
103    
104 dpavlin 538 =head2 _get_ds
105    
106     Return hash formatted as data structure
107    
108     my $ds = _get_ds();
109    
110     =cut
111    
112     my $out;
113 dpavlin 540 my $marc21;
114 dpavlin 541 my $marc_encoding;
115 dpavlin 538
116     sub _get_ds {
117     return $out;
118     }
119    
120     =head2 _clean_ds
121    
122     Clean data structure hash for next record
123    
124     _clean_ds();
125    
126     =cut
127    
128     sub _clean_ds {
129 dpavlin 541 my $a = {@_};
130 dpavlin 538 $out = undef;
131 dpavlin 540 $marc21 = undef;
132 dpavlin 541 $marc_encoding = $a->{marc_encoding};
133 dpavlin 538 }
134    
135     =head2 _set_lookup
136    
137     Set current lookup hash
138    
139     _set_lookup( $lookup );
140    
141     =cut
142    
143     my $lookup;
144    
145     sub _set_lookup {
146     $lookup = shift;
147     }
148    
149 dpavlin 540 =head2 _get_marc21_fields
150    
151     Get all fields defined by calls to C<marc21>
152    
153     $marc->add_fields( WebPAC::Normalize:_get_marc21_fields() );
154    
155 dpavlin 542
156 dpavlin 543
157     We are using I<magic> which detect repeatable fields only from
158     sequence of field/subfield data generated by normalization.
159    
160     Repeatable field is created if there is second occurence of same subfield or
161     if any of indicators are different. This is sane for most cases except for
162     non-repeatable fields with repeatable subfields.
163    
164     B<TODO>: implement exceptions to magic
165    
166 dpavlin 540 =cut
167    
168     sub _get_marc21_fields {
169 dpavlin 542 my @m;
170     my $last;
171     foreach my $row (@{ $marc21 }) {
172 dpavlin 543 if ($last &&
173     $last->[0] eq $row->[0] && # check if field is same
174     $last->[1] eq $row->[1] && # check for i1
175 dpavlin 544 $last->[2] eq $row->[2] && # and for i2
176     $last->[3] ne $row->[3] # and subfield is different
177 dpavlin 543 ) {
178     push @$last, ( $row->[3] , $row->[4] );
179     warn "## ++ added $row->[0] ^$row->[3] to $last->[0]\n";
180 dpavlin 542 next;
181     } elsif ($last) {
182     push @m, $last;
183     }
184    
185 dpavlin 543 $last = $row;
186 dpavlin 542 }
187    
188     push @m, $last if ($last);
189    
190     return @m;
191 dpavlin 540 }
192    
193     =head1 Functions to create C<data_structure>
194    
195     Those functions generally have to first in your normalization file.
196    
197 dpavlin 536 =head2 tag
198 dpavlin 433
199 dpavlin 536 Define new tag for I<search> and I<display>.
200 dpavlin 433
201 dpavlin 536 tag('Title', rec('200','a') );
202 dpavlin 13
203    
204     =cut
205    
206 dpavlin 536 sub tag {
207     my $name = shift or die "tag needs name as first argument";
208     my @o = grep { defined($_) && $_ ne '' } @_;
209     return unless (@o);
210     $out->{$name}->{tag} = $name;
211     $out->{$name}->{search} = \@o;
212     $out->{$name}->{display} = \@o;
213     }
214 dpavlin 13
215 dpavlin 536 =head2 display
216 dpavlin 13
217 dpavlin 536 Define tag just for I<display>
218 dpavlin 125
219 dpavlin 536 @v = display('Title', rec('200','a') );
220 dpavlin 125
221 dpavlin 536 =cut
222 dpavlin 125
223 dpavlin 536 sub display {
224     my $name = shift or die "display needs name as first argument";
225     my @o = grep { defined($_) && $_ ne '' } @_;
226     return unless (@o);
227     $out->{$name}->{tag} = $name;
228     $out->{$name}->{display} = \@o;
229     }
230 dpavlin 13
231 dpavlin 536 =head2 search
232 dpavlin 13
233 dpavlin 536 Prepare values just for I<search>
234 dpavlin 13
235 dpavlin 536 @v = search('Title', rec('200','a') );
236 dpavlin 433
237 dpavlin 536 =cut
238 dpavlin 13
239 dpavlin 536 sub search {
240     my $name = shift or die "search needs name as first argument";
241     my @o = grep { defined($_) && $_ ne '' } @_;
242     return unless (@o);
243     $out->{$name}->{tag} = $name;
244     $out->{$name}->{search} = \@o;
245 dpavlin 13 }
246    
247 dpavlin 540 =head2 marc21
248    
249     Save value for MARC field
250    
251     marc21('900','a', rec('200','a') );
252    
253     =cut
254    
255     sub marc21 {
256     my $f = shift or die "marc21 needs field";
257     die "marc21 field must be numer" unless ($f =~ /^\d+$/);
258    
259     my $sf = shift or die "marc21 needs subfield";
260    
261 dpavlin 541 foreach (@_) {
262     my $v = $_; # make var read-write for Encode
263 dpavlin 543 next unless (defined($v) && $v !~ /^\s*$/);
264 dpavlin 541 from_to($v, 'iso-8859-2', $marc_encoding) if ($marc_encoding);
265 dpavlin 540 push @{ $marc21 }, [ $f, ' ', ' ', $sf => $v ];
266     }
267     }
268    
269     =head1 Functions to extract data from input
270    
271     This function should be used inside functions to create C<data_structure> described
272     above.
273    
274 dpavlin 536 =head2 rec1
275 dpavlin 371
276 dpavlin 536 Return all values in some field
277 dpavlin 371
278 dpavlin 536 @v = rec1('200')
279 dpavlin 15
280 dpavlin 536 TODO: order of values is probably same as in source data, need to investigate that
281 dpavlin 15
282 dpavlin 536 =cut
283 dpavlin 15
284 dpavlin 536 sub rec1 {
285     my $f = shift;
286     return unless (defined($rec) && defined($rec->{$f}));
287     if (ref($rec->{$f}) eq 'ARRAY') {
288     return map {
289     if (ref($_) eq 'HASH') {
290     values %{$_};
291 dpavlin 31 } else {
292 dpavlin 536 $_;
293 dpavlin 31 }
294 dpavlin 536 } @{ $rec->{$f} };
295     } elsif( defined($rec->{$f}) ) {
296     return $rec->{$f};
297 dpavlin 15 }
298     }
299    
300 dpavlin 536 =head2 rec2
301 dpavlin 15
302 dpavlin 536 Return all values in specific field and subfield
303 dpavlin 13
304 dpavlin 536 @v = rec2('200','a')
305 dpavlin 13
306     =cut
307    
308 dpavlin 536 sub rec2 {
309     my $f = shift;
310     return unless (defined($rec && $rec->{$f}));
311     my $sf = shift;
312     return map { $_->{$sf} } grep { ref($_) eq 'HASH' && $_->{$sf} } @{ $rec->{$f} };
313     }
314 dpavlin 13
315 dpavlin 536 =head2 rec
316 dpavlin 13
317 dpavlin 536 syntaxtic sugar for
318 dpavlin 13
319 dpavlin 536 @v = rec('200')
320     @v = rec('200','a')
321 dpavlin 13
322 dpavlin 536 =cut
323 dpavlin 373
324 dpavlin 536 sub rec {
325     if ($#_ == 0) {
326     return rec1(@_);
327     } elsif ($#_ == 1) {
328     return rec2(@_);
329 dpavlin 13 }
330     }
331    
332 dpavlin 536 =head2 regex
333 dpavlin 15
334 dpavlin 536 Apply regex to some or all values
335 dpavlin 15
336 dpavlin 536 @v = regex( 's/foo/bar/g', @v );
337 dpavlin 15
338     =cut
339    
340 dpavlin 536 sub regex {
341     my $r = shift;
342     my @out;
343     #warn "r: $r\n",Dumper(\@_);
344     foreach my $t (@_) {
345     next unless ($t);
346     eval "\$t =~ $r";
347     push @out, $t if ($t && $t ne '');
348 dpavlin 368 }
349 dpavlin 536 return @out;
350 dpavlin 15 }
351    
352 dpavlin 536 =head2 prefix
353 dpavlin 15
354 dpavlin 536 Prefix all values with a string
355 dpavlin 15
356 dpavlin 536 @v = prefix( 'my_', @v );
357 dpavlin 15
358     =cut
359    
360 dpavlin 536 sub prefix {
361     my $p = shift or die "prefix needs string as first argument";
362     return map { $p . $_ } grep { defined($_) } @_;
363     }
364 dpavlin 15
365 dpavlin 536 =head2 suffix
366 dpavlin 15
367 dpavlin 536 suffix all values with a string
368 dpavlin 15
369 dpavlin 536 @v = suffix( '_my', @v );
370 dpavlin 15
371 dpavlin 536 =cut
372 dpavlin 15
373 dpavlin 536 sub suffix {
374     my $s = shift or die "suffix needs string as first argument";
375     return map { $_ . $s } grep { defined($_) } @_;
376 dpavlin 15 }
377    
378 dpavlin 536 =head2 surround
379 dpavlin 13
380 dpavlin 536 surround all values with a two strings
381 dpavlin 13
382 dpavlin 536 @v = surround( 'prefix_', '_suffix', @v );
383 dpavlin 13
384     =cut
385    
386 dpavlin 536 sub surround {
387     my $p = shift or die "surround need prefix as first argument";
388     my $s = shift or die "surround needs suffix as second argument";
389     return map { $p . $_ . $s } grep { defined($_) } @_;
390 dpavlin 13 }
391    
392 dpavlin 536 =head2 first
393 dpavlin 13
394 dpavlin 536 Return first element
395 dpavlin 15
396 dpavlin 536 $v = first( @v );
397 dpavlin 13
398     =cut
399    
400 dpavlin 536 sub first {
401     my $r = shift;
402     return $r;
403 dpavlin 13 }
404    
405 dpavlin 536 =head2 lookup
406 dpavlin 13
407 dpavlin 536 Consult lookup hashes for some value
408 dpavlin 13
409 dpavlin 536 @v = lookup( $v );
410     @v = lookup( @v );
411 dpavlin 13
412     =cut
413    
414 dpavlin 536 sub lookup {
415     my $k = shift or return;
416     return unless (defined($lookup->{$k}));
417     if (ref($lookup->{$k}) eq 'ARRAY') {
418     return @{ $lookup->{$k} };
419     } else {
420     return $lookup->{$k};
421     }
422 dpavlin 13 }
423    
424 dpavlin 536 =head2 join_with
425 dpavlin 13
426 dpavlin 536 Joins walues with some delimiter
427 dpavlin 10
428 dpavlin 536 $v = join_with(", ", @v);
429 dpavlin 10
430 dpavlin 536 =cut
431 dpavlin 10
432 dpavlin 536 sub join_with {
433     my $d = shift;
434     return join($d, grep { defined($_) && $_ ne '' } @_);
435     }
436 dpavlin 10
437 dpavlin 536 # END
438     1;

  ViewVC Help
Powered by ViewVC 1.1.26