/[webpac2]/trunk/lib/WebPAC/Normalize.pm
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Annotation of /trunk/lib/WebPAC/Normalize.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 543 - (hide annotations)
Thu Jun 29 21:19:08 2006 UTC (17 years, 9 months ago) by dpavlin
File size: 7678 byte(s)
 r732@llin:  dpavlin | 2006-06-29 23:20:46 +0200
 document magic (that is how WebPAC detects repeatable fields) and fix it to actually work :-)

1 dpavlin 10 package WebPAC::Normalize;
2 dpavlin 536 use Exporter 'import';
3     @EXPORT = qw/
4 dpavlin 538 _set_rec _set_lookup
5     _get_ds _clean_ds
6    
7 dpavlin 536 tag search display
8 dpavlin 540 marc21
9    
10 dpavlin 536 rec1 rec2 rec
11     regex prefix suffix surround
12     first lookup join_with
13     /;
14 dpavlin 10
15     use warnings;
16     use strict;
17 dpavlin 536
18     #use base qw/WebPAC::Common/;
19 dpavlin 13 use Data::Dumper;
20 dpavlin 541 use Encode qw/from_to/;
21 dpavlin 10
22     =head1 NAME
23    
24 dpavlin 536 WebPAC::Normalize - describe normalisaton rules using sets
25 dpavlin 10
26     =head1 VERSION
27    
28 dpavlin 541 Version 0.06
29 dpavlin 10
30     =cut
31    
32 dpavlin 541 our $VERSION = '0.06';
33 dpavlin 10
34     =head1 SYNOPSIS
35    
36 dpavlin 536 This module uses C<conf/normalize/*.pl> files to perform normalisation
37     from input records using perl functions which are specialized for set
38     processing.
39 dpavlin 10
40 dpavlin 536 Sets are implemented as arrays, and normalisation file is valid perl, which
41     means that you check it's validity before running WebPAC using
42     C<perl -c normalize.pl>.
43 dpavlin 15
44 dpavlin 536 Normalisation can generate multiple output normalized data. For now, supported output
45 dpavlin 540 types (on the left side of definition) are: C<tag>, C<display>, C<search> and
46     C<marc21>.
47 dpavlin 15
48 dpavlin 10 =head1 FUNCTIONS
49    
50 dpavlin 538 Functions which start with C<_> are private and used by WebPAC internally.
51     All other functions are available for use within normalisation rules.
52    
53 dpavlin 536 =head2 data_structure
54 dpavlin 10
55 dpavlin 536 Return data structure
56 dpavlin 13
57 dpavlin 538 my $ds = WebPAC::Normalize::data_structure(
58 dpavlin 536 lookup => $lookup->lookup_hash,
59     row => $row,
60     rules => $normalize_pl_config,
61 dpavlin 541 marc_encoding => 'utf-8',
62 dpavlin 13 );
63    
64 dpavlin 540 Options C<lookup>, C<row>, C<rules> and C<log> are mandatory while all
65     other are optional.
66    
67 dpavlin 536 This function will B<die> if normalizastion can't be evaled.
68 dpavlin 15
69 dpavlin 538 Since this function isn't exported you have to call it with
70     C<WebPAC::Normalize::data_structure>.
71    
72 dpavlin 10 =cut
73    
74 dpavlin 536 sub data_structure {
75     my $arg = {@_};
76 dpavlin 13
77 dpavlin 536 die "need row argument" unless ($arg->{row});
78     die "need normalisation argument" unless ($arg->{rules});
79 dpavlin 31
80 dpavlin 536 no strict 'subs';
81 dpavlin 538 _set_lookup( $arg->{lookup} );
82     _set_rec( $arg->{row} );
83 dpavlin 541 _clean_ds( %{ $arg } );
84 dpavlin 536 eval "$arg->{rules}";
85     die "error evaling $arg->{rules}: $@\n" if ($@);
86 dpavlin 540
87 dpavlin 538 return _get_ds();
88 dpavlin 10 }
89    
90 dpavlin 538 =head2 _set_rec
91 dpavlin 13
92 dpavlin 536 Set current record hash
93 dpavlin 433
94 dpavlin 538 _set_rec( $rec );
95 dpavlin 433
96     =cut
97    
98 dpavlin 536 my $rec;
99 dpavlin 433
100 dpavlin 538 sub _set_rec {
101 dpavlin 536 $rec = shift or die "no record hash";
102 dpavlin 433 }
103    
104 dpavlin 538 =head2 _get_ds
105    
106     Return hash formatted as data structure
107    
108     my $ds = _get_ds();
109    
110     =cut
111    
112     my $out;
113 dpavlin 540 my $marc21;
114 dpavlin 541 my $marc_encoding;
115 dpavlin 538
116     sub _get_ds {
117     return $out;
118     }
119    
120     =head2 _clean_ds
121    
122     Clean data structure hash for next record
123    
124     _clean_ds();
125    
126     =cut
127    
128     sub _clean_ds {
129 dpavlin 541 my $a = {@_};
130 dpavlin 538 $out = undef;
131 dpavlin 540 $marc21 = undef;
132 dpavlin 541 $marc_encoding = $a->{marc_encoding};
133 dpavlin 538 }
134    
135     =head2 _set_lookup
136    
137     Set current lookup hash
138    
139     _set_lookup( $lookup );
140    
141     =cut
142    
143     my $lookup;
144    
145     sub _set_lookup {
146     $lookup = shift;
147     }
148    
149 dpavlin 540 =head2 _get_marc21_fields
150    
151     Get all fields defined by calls to C<marc21>
152    
153     $marc->add_fields( WebPAC::Normalize:_get_marc21_fields() );
154    
155 dpavlin 542
156 dpavlin 543
157     We are using I<magic> which detect repeatable fields only from
158     sequence of field/subfield data generated by normalization.
159    
160     Repeatable field is created if there is second occurence of same subfield or
161     if any of indicators are different. This is sane for most cases except for
162     non-repeatable fields with repeatable subfields.
163    
164     B<TODO>: implement exceptions to magic
165    
166 dpavlin 540 =cut
167    
168     sub _get_marc21_fields {
169 dpavlin 542 my @m;
170     my $last;
171     foreach my $row (@{ $marc21 }) {
172 dpavlin 543 if ($last &&
173     $last->[0] eq $row->[0] && # check if field is same
174     $last->[1] eq $row->[1] && # check for i1
175 dpavlin 542 $last->[2] eq $row->[2] # and for i2
176 dpavlin 543 ) {
177     push @$last, ( $row->[3] , $row->[4] );
178     warn "## ++ added $row->[0] ^$row->[3] to $last->[0]\n";
179 dpavlin 542 next;
180     } elsif ($last) {
181     push @m, $last;
182     }
183    
184 dpavlin 543 $last = $row;
185 dpavlin 542 }
186    
187     push @m, $last if ($last);
188    
189     return @m;
190 dpavlin 540 }
191    
192     =head1 Functions to create C<data_structure>
193    
194     Those functions generally have to first in your normalization file.
195    
196 dpavlin 536 =head2 tag
197 dpavlin 433
198 dpavlin 536 Define new tag for I<search> and I<display>.
199 dpavlin 433
200 dpavlin 536 tag('Title', rec('200','a') );
201 dpavlin 13
202    
203     =cut
204    
205 dpavlin 536 sub tag {
206     my $name = shift or die "tag needs name as first argument";
207     my @o = grep { defined($_) && $_ ne '' } @_;
208     return unless (@o);
209     $out->{$name}->{tag} = $name;
210     $out->{$name}->{search} = \@o;
211     $out->{$name}->{display} = \@o;
212     }
213 dpavlin 13
214 dpavlin 536 =head2 display
215 dpavlin 13
216 dpavlin 536 Define tag just for I<display>
217 dpavlin 125
218 dpavlin 536 @v = display('Title', rec('200','a') );
219 dpavlin 125
220 dpavlin 536 =cut
221 dpavlin 125
222 dpavlin 536 sub display {
223     my $name = shift or die "display needs name as first argument";
224     my @o = grep { defined($_) && $_ ne '' } @_;
225     return unless (@o);
226     $out->{$name}->{tag} = $name;
227     $out->{$name}->{display} = \@o;
228     }
229 dpavlin 13
230 dpavlin 536 =head2 search
231 dpavlin 13
232 dpavlin 536 Prepare values just for I<search>
233 dpavlin 13
234 dpavlin 536 @v = search('Title', rec('200','a') );
235 dpavlin 433
236 dpavlin 536 =cut
237 dpavlin 13
238 dpavlin 536 sub search {
239     my $name = shift or die "search needs name as first argument";
240     my @o = grep { defined($_) && $_ ne '' } @_;
241     return unless (@o);
242     $out->{$name}->{tag} = $name;
243     $out->{$name}->{search} = \@o;
244 dpavlin 13 }
245    
246 dpavlin 540 =head2 marc21
247    
248     Save value for MARC field
249    
250     marc21('900','a', rec('200','a') );
251    
252     =cut
253    
254     sub marc21 {
255     my $f = shift or die "marc21 needs field";
256     die "marc21 field must be numer" unless ($f =~ /^\d+$/);
257    
258     my $sf = shift or die "marc21 needs subfield";
259    
260 dpavlin 541 foreach (@_) {
261     my $v = $_; # make var read-write for Encode
262 dpavlin 543 next unless (defined($v) && $v !~ /^\s*$/);
263 dpavlin 541 from_to($v, 'iso-8859-2', $marc_encoding) if ($marc_encoding);
264 dpavlin 540 push @{ $marc21 }, [ $f, ' ', ' ', $sf => $v ];
265     }
266     }
267    
268     =head1 Functions to extract data from input
269    
270     This function should be used inside functions to create C<data_structure> described
271     above.
272    
273 dpavlin 536 =head2 rec1
274 dpavlin 371
275 dpavlin 536 Return all values in some field
276 dpavlin 371
277 dpavlin 536 @v = rec1('200')
278 dpavlin 15
279 dpavlin 536 TODO: order of values is probably same as in source data, need to investigate that
280 dpavlin 15
281 dpavlin 536 =cut
282 dpavlin 15
283 dpavlin 536 sub rec1 {
284     my $f = shift;
285     return unless (defined($rec) && defined($rec->{$f}));
286     if (ref($rec->{$f}) eq 'ARRAY') {
287     return map {
288     if (ref($_) eq 'HASH') {
289     values %{$_};
290 dpavlin 31 } else {
291 dpavlin 536 $_;
292 dpavlin 31 }
293 dpavlin 536 } @{ $rec->{$f} };
294     } elsif( defined($rec->{$f}) ) {
295     return $rec->{$f};
296 dpavlin 15 }
297     }
298    
299 dpavlin 536 =head2 rec2
300 dpavlin 15
301 dpavlin 536 Return all values in specific field and subfield
302 dpavlin 13
303 dpavlin 536 @v = rec2('200','a')
304 dpavlin 13
305     =cut
306    
307 dpavlin 536 sub rec2 {
308     my $f = shift;
309     return unless (defined($rec && $rec->{$f}));
310     my $sf = shift;
311     return map { $_->{$sf} } grep { ref($_) eq 'HASH' && $_->{$sf} } @{ $rec->{$f} };
312     }
313 dpavlin 13
314 dpavlin 536 =head2 rec
315 dpavlin 13
316 dpavlin 536 syntaxtic sugar for
317 dpavlin 13
318 dpavlin 536 @v = rec('200')
319     @v = rec('200','a')
320 dpavlin 13
321 dpavlin 536 =cut
322 dpavlin 373
323 dpavlin 536 sub rec {
324     if ($#_ == 0) {
325     return rec1(@_);
326     } elsif ($#_ == 1) {
327     return rec2(@_);
328 dpavlin 13 }
329     }
330    
331 dpavlin 536 =head2 regex
332 dpavlin 15
333 dpavlin 536 Apply regex to some or all values
334 dpavlin 15
335 dpavlin 536 @v = regex( 's/foo/bar/g', @v );
336 dpavlin 15
337     =cut
338    
339 dpavlin 536 sub regex {
340     my $r = shift;
341     my @out;
342     #warn "r: $r\n",Dumper(\@_);
343     foreach my $t (@_) {
344     next unless ($t);
345     eval "\$t =~ $r";
346     push @out, $t if ($t && $t ne '');
347 dpavlin 368 }
348 dpavlin 536 return @out;
349 dpavlin 15 }
350    
351 dpavlin 536 =head2 prefix
352 dpavlin 15
353 dpavlin 536 Prefix all values with a string
354 dpavlin 15
355 dpavlin 536 @v = prefix( 'my_', @v );
356 dpavlin 15
357     =cut
358    
359 dpavlin 536 sub prefix {
360     my $p = shift or die "prefix needs string as first argument";
361     return map { $p . $_ } grep { defined($_) } @_;
362     }
363 dpavlin 15
364 dpavlin 536 =head2 suffix
365 dpavlin 15
366 dpavlin 536 suffix all values with a string
367 dpavlin 15
368 dpavlin 536 @v = suffix( '_my', @v );
369 dpavlin 15
370 dpavlin 536 =cut
371 dpavlin 15
372 dpavlin 536 sub suffix {
373     my $s = shift or die "suffix needs string as first argument";
374     return map { $_ . $s } grep { defined($_) } @_;
375 dpavlin 15 }
376    
377 dpavlin 536 =head2 surround
378 dpavlin 13
379 dpavlin 536 surround all values with a two strings
380 dpavlin 13
381 dpavlin 536 @v = surround( 'prefix_', '_suffix', @v );
382 dpavlin 13
383     =cut
384    
385 dpavlin 536 sub surround {
386     my $p = shift or die "surround need prefix as first argument";
387     my $s = shift or die "surround needs suffix as second argument";
388     return map { $p . $_ . $s } grep { defined($_) } @_;
389 dpavlin 13 }
390    
391 dpavlin 536 =head2 first
392 dpavlin 13
393 dpavlin 536 Return first element
394 dpavlin 15
395 dpavlin 536 $v = first( @v );
396 dpavlin 13
397     =cut
398    
399 dpavlin 536 sub first {
400     my $r = shift;
401     return $r;
402 dpavlin 13 }
403    
404 dpavlin 536 =head2 lookup
405 dpavlin 13
406 dpavlin 536 Consult lookup hashes for some value
407 dpavlin 13
408 dpavlin 536 @v = lookup( $v );
409     @v = lookup( @v );
410 dpavlin 13
411     =cut
412    
413 dpavlin 536 sub lookup {
414     my $k = shift or return;
415     return unless (defined($lookup->{$k}));
416     if (ref($lookup->{$k}) eq 'ARRAY') {
417     return @{ $lookup->{$k} };
418     } else {
419     return $lookup->{$k};
420     }
421 dpavlin 13 }
422    
423 dpavlin 536 =head2 join_with
424 dpavlin 13
425 dpavlin 536 Joins walues with some delimiter
426 dpavlin 10
427 dpavlin 536 $v = join_with(", ", @v);
428 dpavlin 10
429 dpavlin 536 =cut
430 dpavlin 10
431 dpavlin 536 sub join_with {
432     my $d = shift;
433     return join($d, grep { defined($_) && $_ ne '' } @_);
434     }
435 dpavlin 10
436 dpavlin 536 # END
437     1;

  ViewVC Help
Powered by ViewVC 1.1.26