/[webpac2]/trunk/lib/WebPAC/Normalize.pm
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Annotation of /trunk/lib/WebPAC/Normalize.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 542 - (hide annotations)
Thu Jun 29 21:18:59 2006 UTC (16 years, 7 months ago) by dpavlin
File size: 7406 byte(s)
 r731@llin:  dpavlin | 2006-06-29 23:02:08 +0200
 implement magic to create fields and repeatable fields (which might be broken for some cases).

1 dpavlin 10 package WebPAC::Normalize;
2 dpavlin 536 use Exporter 'import';
3     @EXPORT = qw/
4 dpavlin 538 _set_rec _set_lookup
5     _get_ds _clean_ds
6    
7 dpavlin 536 tag search display
8 dpavlin 540 marc21
9    
10 dpavlin 536 rec1 rec2 rec
11     regex prefix suffix surround
12     first lookup join_with
13     /;
14 dpavlin 10
15     use warnings;
16     use strict;
17 dpavlin 536
18     #use base qw/WebPAC::Common/;
19 dpavlin 13 use Data::Dumper;
20 dpavlin 541 use Encode qw/from_to/;
21 dpavlin 10
22     =head1 NAME
23    
24 dpavlin 536 WebPAC::Normalize - describe normalisaton rules using sets
25 dpavlin 10
26     =head1 VERSION
27    
28 dpavlin 541 Version 0.06
29 dpavlin 10
30     =cut
31    
32 dpavlin 541 our $VERSION = '0.06';
33 dpavlin 10
34     =head1 SYNOPSIS
35    
36 dpavlin 536 This module uses C<conf/normalize/*.pl> files to perform normalisation
37     from input records using perl functions which are specialized for set
38     processing.
39 dpavlin 10
40 dpavlin 536 Sets are implemented as arrays, and normalisation file is valid perl, which
41     means that you check it's validity before running WebPAC using
42     C<perl -c normalize.pl>.
43 dpavlin 15
44 dpavlin 536 Normalisation can generate multiple output normalized data. For now, supported output
45 dpavlin 540 types (on the left side of definition) are: C<tag>, C<display>, C<search> and
46     C<marc21>.
47 dpavlin 15
48 dpavlin 10 =head1 FUNCTIONS
49    
50 dpavlin 538 Functions which start with C<_> are private and used by WebPAC internally.
51     All other functions are available for use within normalisation rules.
52    
53 dpavlin 536 =head2 data_structure
54 dpavlin 10
55 dpavlin 536 Return data structure
56 dpavlin 13
57 dpavlin 538 my $ds = WebPAC::Normalize::data_structure(
58 dpavlin 536 lookup => $lookup->lookup_hash,
59     row => $row,
60     rules => $normalize_pl_config,
61 dpavlin 541 marc_encoding => 'utf-8',
62 dpavlin 13 );
63    
64 dpavlin 540 Options C<lookup>, C<row>, C<rules> and C<log> are mandatory while all
65     other are optional.
66    
67 dpavlin 536 This function will B<die> if normalizastion can't be evaled.
68 dpavlin 15
69 dpavlin 538 Since this function isn't exported you have to call it with
70     C<WebPAC::Normalize::data_structure>.
71    
72 dpavlin 10 =cut
73    
74 dpavlin 536 sub data_structure {
75     my $arg = {@_};
76 dpavlin 13
77 dpavlin 536 die "need row argument" unless ($arg->{row});
78     die "need normalisation argument" unless ($arg->{rules});
79 dpavlin 31
80 dpavlin 536 no strict 'subs';
81 dpavlin 538 _set_lookup( $arg->{lookup} );
82     _set_rec( $arg->{row} );
83 dpavlin 541 _clean_ds( %{ $arg } );
84 dpavlin 536 eval "$arg->{rules}";
85     die "error evaling $arg->{rules}: $@\n" if ($@);
86 dpavlin 540
87 dpavlin 538 return _get_ds();
88 dpavlin 10 }
89    
90 dpavlin 538 =head2 _set_rec
91 dpavlin 13
92 dpavlin 536 Set current record hash
93 dpavlin 433
94 dpavlin 538 _set_rec( $rec );
95 dpavlin 433
96     =cut
97    
98 dpavlin 536 my $rec;
99 dpavlin 433
100 dpavlin 538 sub _set_rec {
101 dpavlin 536 $rec = shift or die "no record hash";
102 dpavlin 433 }
103    
104 dpavlin 538 =head2 _get_ds
105    
106     Return hash formatted as data structure
107    
108     my $ds = _get_ds();
109    
110     =cut
111    
112     my $out;
113 dpavlin 540 my $marc21;
114 dpavlin 541 my $marc_encoding;
115 dpavlin 538
116     sub _get_ds {
117     return $out;
118     }
119    
120     =head2 _clean_ds
121    
122     Clean data structure hash for next record
123    
124     _clean_ds();
125    
126     =cut
127    
128     sub _clean_ds {
129 dpavlin 541 my $a = {@_};
130 dpavlin 538 $out = undef;
131 dpavlin 540 $marc21 = undef;
132 dpavlin 541 $marc_encoding = $a->{marc_encoding};
133 dpavlin 538 }
134    
135     =head2 _set_lookup
136    
137     Set current lookup hash
138    
139     _set_lookup( $lookup );
140    
141     =cut
142    
143     my $lookup;
144    
145     sub _set_lookup {
146     $lookup = shift;
147     }
148    
149 dpavlin 540 =head2 _get_marc21_fields
150    
151     Get all fields defined by calls to C<marc21>
152    
153     $marc->add_fields( WebPAC::Normalize:_get_marc21_fields() );
154    
155 dpavlin 542 B<TODO>: implement exceptions to magic which unrolls repeated subfields
156     as new field with that subfield.
157    
158 dpavlin 540 =cut
159    
160     sub _get_marc21_fields {
161 dpavlin 542 my @m;
162     my $last;
163     foreach my $row (@{ $marc21 }) {
164     if ($last && (
165     $last->[0] eq $row->[0] || # check if field is same
166     $last->[1] eq $row->[1] || # check for i1
167     $last->[2] eq $row->[2] # and for i2
168     ) ) {
169     $last->[3]->{ $row->[3] } = $row->[4];
170     warn "## ++ added $row->[0] ^$row->[3]\n";
171     next;
172     } elsif ($last) {
173     push @m, $last;
174     }
175    
176     push @m, $row;
177     }
178    
179     push @m, $last if ($last);
180    
181     return @m;
182 dpavlin 540 }
183    
184     =head1 Functions to create C<data_structure>
185    
186     Those functions generally have to first in your normalization file.
187    
188 dpavlin 536 =head2 tag
189 dpavlin 433
190 dpavlin 536 Define new tag for I<search> and I<display>.
191 dpavlin 433
192 dpavlin 536 tag('Title', rec('200','a') );
193 dpavlin 13
194    
195     =cut
196    
197 dpavlin 536 sub tag {
198     my $name = shift or die "tag needs name as first argument";
199     my @o = grep { defined($_) && $_ ne '' } @_;
200     return unless (@o);
201     $out->{$name}->{tag} = $name;
202     $out->{$name}->{search} = \@o;
203     $out->{$name}->{display} = \@o;
204     }
205 dpavlin 13
206 dpavlin 536 =head2 display
207 dpavlin 13
208 dpavlin 536 Define tag just for I<display>
209 dpavlin 125
210 dpavlin 536 @v = display('Title', rec('200','a') );
211 dpavlin 125
212 dpavlin 536 =cut
213 dpavlin 125
214 dpavlin 536 sub display {
215     my $name = shift or die "display needs name as first argument";
216     my @o = grep { defined($_) && $_ ne '' } @_;
217     return unless (@o);
218     $out->{$name}->{tag} = $name;
219     $out->{$name}->{display} = \@o;
220     }
221 dpavlin 13
222 dpavlin 536 =head2 search
223 dpavlin 13
224 dpavlin 536 Prepare values just for I<search>
225 dpavlin 13
226 dpavlin 536 @v = search('Title', rec('200','a') );
227 dpavlin 433
228 dpavlin 536 =cut
229 dpavlin 13
230 dpavlin 536 sub search {
231     my $name = shift or die "search needs name as first argument";
232     my @o = grep { defined($_) && $_ ne '' } @_;
233     return unless (@o);
234     $out->{$name}->{tag} = $name;
235     $out->{$name}->{search} = \@o;
236 dpavlin 13 }
237    
238 dpavlin 540 =head2 marc21
239    
240     Save value for MARC field
241    
242     marc21('900','a', rec('200','a') );
243    
244     =cut
245    
246     sub marc21 {
247     my $f = shift or die "marc21 needs field";
248     die "marc21 field must be numer" unless ($f =~ /^\d+$/);
249    
250     my $sf = shift or die "marc21 needs subfield";
251    
252 dpavlin 541 foreach (@_) {
253     my $v = $_; # make var read-write for Encode
254     next unless (defined($v) && $v !~ /^\s+$/);
255     from_to($v, 'iso-8859-2', $marc_encoding) if ($marc_encoding);
256 dpavlin 540 push @{ $marc21 }, [ $f, ' ', ' ', $sf => $v ];
257     }
258     }
259    
260     =head1 Functions to extract data from input
261    
262     This function should be used inside functions to create C<data_structure> described
263     above.
264    
265 dpavlin 536 =head2 rec1
266 dpavlin 371
267 dpavlin 536 Return all values in some field
268 dpavlin 371
269 dpavlin 536 @v = rec1('200')
270 dpavlin 15
271 dpavlin 536 TODO: order of values is probably same as in source data, need to investigate that
272 dpavlin 15
273 dpavlin 536 =cut
274 dpavlin 15
275 dpavlin 536 sub rec1 {
276     my $f = shift;
277     return unless (defined($rec) && defined($rec->{$f}));
278     if (ref($rec->{$f}) eq 'ARRAY') {
279     return map {
280     if (ref($_) eq 'HASH') {
281     values %{$_};
282 dpavlin 31 } else {
283 dpavlin 536 $_;
284 dpavlin 31 }
285 dpavlin 536 } @{ $rec->{$f} };
286     } elsif( defined($rec->{$f}) ) {
287     return $rec->{$f};
288 dpavlin 15 }
289     }
290    
291 dpavlin 536 =head2 rec2
292 dpavlin 15
293 dpavlin 536 Return all values in specific field and subfield
294 dpavlin 13
295 dpavlin 536 @v = rec2('200','a')
296 dpavlin 13
297     =cut
298    
299 dpavlin 536 sub rec2 {
300     my $f = shift;
301     return unless (defined($rec && $rec->{$f}));
302     my $sf = shift;
303     return map { $_->{$sf} } grep { ref($_) eq 'HASH' && $_->{$sf} } @{ $rec->{$f} };
304     }
305 dpavlin 13
306 dpavlin 536 =head2 rec
307 dpavlin 13
308 dpavlin 536 syntaxtic sugar for
309 dpavlin 13
310 dpavlin 536 @v = rec('200')
311     @v = rec('200','a')
312 dpavlin 13
313 dpavlin 536 =cut
314 dpavlin 373
315 dpavlin 536 sub rec {
316     if ($#_ == 0) {
317     return rec1(@_);
318     } elsif ($#_ == 1) {
319     return rec2(@_);
320 dpavlin 13 }
321     }
322    
323 dpavlin 536 =head2 regex
324 dpavlin 15
325 dpavlin 536 Apply regex to some or all values
326 dpavlin 15
327 dpavlin 536 @v = regex( 's/foo/bar/g', @v );
328 dpavlin 15
329     =cut
330    
331 dpavlin 536 sub regex {
332     my $r = shift;
333     my @out;
334     #warn "r: $r\n",Dumper(\@_);
335     foreach my $t (@_) {
336     next unless ($t);
337     eval "\$t =~ $r";
338     push @out, $t if ($t && $t ne '');
339 dpavlin 368 }
340 dpavlin 536 return @out;
341 dpavlin 15 }
342    
343 dpavlin 536 =head2 prefix
344 dpavlin 15
345 dpavlin 536 Prefix all values with a string
346 dpavlin 15
347 dpavlin 536 @v = prefix( 'my_', @v );
348 dpavlin 15
349     =cut
350    
351 dpavlin 536 sub prefix {
352     my $p = shift or die "prefix needs string as first argument";
353     return map { $p . $_ } grep { defined($_) } @_;
354     }
355 dpavlin 15
356 dpavlin 536 =head2 suffix
357 dpavlin 15
358 dpavlin 536 suffix all values with a string
359 dpavlin 15
360 dpavlin 536 @v = suffix( '_my', @v );
361 dpavlin 15
362 dpavlin 536 =cut
363 dpavlin 15
364 dpavlin 536 sub suffix {
365     my $s = shift or die "suffix needs string as first argument";
366     return map { $_ . $s } grep { defined($_) } @_;
367 dpavlin 15 }
368    
369 dpavlin 536 =head2 surround
370 dpavlin 13
371 dpavlin 536 surround all values with a two strings
372 dpavlin 13
373 dpavlin 536 @v = surround( 'prefix_', '_suffix', @v );
374 dpavlin 13
375     =cut
376    
377 dpavlin 536 sub surround {
378     my $p = shift or die "surround need prefix as first argument";
379     my $s = shift or die "surround needs suffix as second argument";
380     return map { $p . $_ . $s } grep { defined($_) } @_;
381 dpavlin 13 }
382    
383 dpavlin 536 =head2 first
384 dpavlin 13
385 dpavlin 536 Return first element
386 dpavlin 15
387 dpavlin 536 $v = first( @v );
388 dpavlin 13
389     =cut
390    
391 dpavlin 536 sub first {
392     my $r = shift;
393     return $r;
394 dpavlin 13 }
395    
396 dpavlin 536 =head2 lookup
397 dpavlin 13
398 dpavlin 536 Consult lookup hashes for some value
399 dpavlin 13
400 dpavlin 536 @v = lookup( $v );
401     @v = lookup( @v );
402 dpavlin 13
403     =cut
404    
405 dpavlin 536 sub lookup {
406     my $k = shift or return;
407     return unless (defined($lookup->{$k}));
408     if (ref($lookup->{$k}) eq 'ARRAY') {
409     return @{ $lookup->{$k} };
410     } else {
411     return $lookup->{$k};
412     }
413 dpavlin 13 }
414    
415 dpavlin 536 =head2 join_with
416 dpavlin 13
417 dpavlin 536 Joins walues with some delimiter
418 dpavlin 10
419 dpavlin 536 $v = join_with(", ", @v);
420 dpavlin 10
421 dpavlin 536 =cut
422 dpavlin 10
423 dpavlin 536 sub join_with {
424     my $d = shift;
425     return join($d, grep { defined($_) && $_ ne '' } @_);
426     }
427 dpavlin 10
428 dpavlin 536 # END
429     1;

  ViewVC Help
Powered by ViewVC 1.1.26