/[webpac2]/trunk/lib/WebPAC/Normalize.pm
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Annotation of /trunk/lib/WebPAC/Normalize.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 541 - (hide annotations)
Thu Jun 29 21:18:50 2006 UTC (16 years, 7 months ago) by dpavlin
File size: 6895 byte(s)
 r730@llin:  dpavlin | 2006-06-29 21:33:48 +0200
 use MARC::Record 2.0 to support utf-8 encoding in MARC
 http://marcpm.sourceforge.net/

1 dpavlin 10 package WebPAC::Normalize;
2 dpavlin 536 use Exporter 'import';
3     @EXPORT = qw/
4 dpavlin 538 _set_rec _set_lookup
5     _get_ds _clean_ds
6    
7 dpavlin 536 tag search display
8 dpavlin 540 marc21
9    
10 dpavlin 536 rec1 rec2 rec
11     regex prefix suffix surround
12     first lookup join_with
13     /;
14 dpavlin 10
15     use warnings;
16     use strict;
17 dpavlin 536
18     #use base qw/WebPAC::Common/;
19 dpavlin 13 use Data::Dumper;
20 dpavlin 541 use Encode qw/from_to/;
21 dpavlin 10
22     =head1 NAME
23    
24 dpavlin 536 WebPAC::Normalize - describe normalisaton rules using sets
25 dpavlin 10
26     =head1 VERSION
27    
28 dpavlin 541 Version 0.06
29 dpavlin 10
30     =cut
31    
32 dpavlin 541 our $VERSION = '0.06';
33 dpavlin 10
34     =head1 SYNOPSIS
35    
36 dpavlin 536 This module uses C<conf/normalize/*.pl> files to perform normalisation
37     from input records using perl functions which are specialized for set
38     processing.
39 dpavlin 10
40 dpavlin 536 Sets are implemented as arrays, and normalisation file is valid perl, which
41     means that you check it's validity before running WebPAC using
42     C<perl -c normalize.pl>.
43 dpavlin 15
44 dpavlin 536 Normalisation can generate multiple output normalized data. For now, supported output
45 dpavlin 540 types (on the left side of definition) are: C<tag>, C<display>, C<search> and
46     C<marc21>.
47 dpavlin 15
48 dpavlin 10 =head1 FUNCTIONS
49    
50 dpavlin 538 Functions which start with C<_> are private and used by WebPAC internally.
51     All other functions are available for use within normalisation rules.
52    
53 dpavlin 536 =head2 data_structure
54 dpavlin 10
55 dpavlin 536 Return data structure
56 dpavlin 13
57 dpavlin 538 my $ds = WebPAC::Normalize::data_structure(
58 dpavlin 536 lookup => $lookup->lookup_hash,
59     row => $row,
60     rules => $normalize_pl_config,
61 dpavlin 541 marc_encoding => 'utf-8',
62 dpavlin 13 );
63    
64 dpavlin 540 Options C<lookup>, C<row>, C<rules> and C<log> are mandatory while all
65     other are optional.
66    
67 dpavlin 536 This function will B<die> if normalizastion can't be evaled.
68 dpavlin 15
69 dpavlin 538 Since this function isn't exported you have to call it with
70     C<WebPAC::Normalize::data_structure>.
71    
72 dpavlin 10 =cut
73    
74 dpavlin 536 sub data_structure {
75     my $arg = {@_};
76 dpavlin 13
77 dpavlin 536 die "need row argument" unless ($arg->{row});
78     die "need normalisation argument" unless ($arg->{rules});
79 dpavlin 31
80 dpavlin 536 no strict 'subs';
81 dpavlin 538 _set_lookup( $arg->{lookup} );
82     _set_rec( $arg->{row} );
83 dpavlin 541 _clean_ds( %{ $arg } );
84 dpavlin 536 eval "$arg->{rules}";
85     die "error evaling $arg->{rules}: $@\n" if ($@);
86 dpavlin 540
87 dpavlin 538 return _get_ds();
88 dpavlin 10 }
89    
90 dpavlin 538 =head2 _set_rec
91 dpavlin 13
92 dpavlin 536 Set current record hash
93 dpavlin 433
94 dpavlin 538 _set_rec( $rec );
95 dpavlin 433
96     =cut
97    
98 dpavlin 536 my $rec;
99 dpavlin 433
100 dpavlin 538 sub _set_rec {
101 dpavlin 536 $rec = shift or die "no record hash";
102 dpavlin 433 }
103    
104 dpavlin 538 =head2 _get_ds
105    
106     Return hash formatted as data structure
107    
108     my $ds = _get_ds();
109    
110     =cut
111    
112     my $out;
113 dpavlin 540 my $marc21;
114 dpavlin 541 my $marc_encoding;
115 dpavlin 538
116     sub _get_ds {
117     return $out;
118     }
119    
120     =head2 _clean_ds
121    
122     Clean data structure hash for next record
123    
124     _clean_ds();
125    
126     =cut
127    
128     sub _clean_ds {
129 dpavlin 541 my $a = {@_};
130 dpavlin 538 $out = undef;
131 dpavlin 540 $marc21 = undef;
132 dpavlin 541 $marc_encoding = $a->{marc_encoding};
133 dpavlin 538 }
134    
135     =head2 _set_lookup
136    
137     Set current lookup hash
138    
139     _set_lookup( $lookup );
140    
141     =cut
142    
143     my $lookup;
144    
145     sub _set_lookup {
146     $lookup = shift;
147     }
148    
149 dpavlin 540 =head2 _get_marc21_fields
150    
151     Get all fields defined by calls to C<marc21>
152    
153     $marc->add_fields( WebPAC::Normalize:_get_marc21_fields() );
154    
155     =cut
156    
157     sub _get_marc21_fields {
158     return @{$marc21};
159     }
160    
161     =head1 Functions to create C<data_structure>
162    
163     Those functions generally have to first in your normalization file.
164    
165 dpavlin 536 =head2 tag
166 dpavlin 433
167 dpavlin 536 Define new tag for I<search> and I<display>.
168 dpavlin 433
169 dpavlin 536 tag('Title', rec('200','a') );
170 dpavlin 13
171    
172     =cut
173    
174 dpavlin 536 sub tag {
175     my $name = shift or die "tag needs name as first argument";
176     my @o = grep { defined($_) && $_ ne '' } @_;
177     return unless (@o);
178     $out->{$name}->{tag} = $name;
179     $out->{$name}->{search} = \@o;
180     $out->{$name}->{display} = \@o;
181     }
182 dpavlin 13
183 dpavlin 536 =head2 display
184 dpavlin 13
185 dpavlin 536 Define tag just for I<display>
186 dpavlin 125
187 dpavlin 536 @v = display('Title', rec('200','a') );
188 dpavlin 125
189 dpavlin 536 =cut
190 dpavlin 125
191 dpavlin 536 sub display {
192     my $name = shift or die "display needs name as first argument";
193     my @o = grep { defined($_) && $_ ne '' } @_;
194     return unless (@o);
195     $out->{$name}->{tag} = $name;
196     $out->{$name}->{display} = \@o;
197     }
198 dpavlin 13
199 dpavlin 536 =head2 search
200 dpavlin 13
201 dpavlin 536 Prepare values just for I<search>
202 dpavlin 13
203 dpavlin 536 @v = search('Title', rec('200','a') );
204 dpavlin 433
205 dpavlin 536 =cut
206 dpavlin 13
207 dpavlin 536 sub search {
208     my $name = shift or die "search needs name as first argument";
209     my @o = grep { defined($_) && $_ ne '' } @_;
210     return unless (@o);
211     $out->{$name}->{tag} = $name;
212     $out->{$name}->{search} = \@o;
213 dpavlin 13 }
214    
215 dpavlin 540 =head2 marc21
216    
217     Save value for MARC field
218    
219     marc21('900','a', rec('200','a') );
220    
221     =cut
222    
223     sub marc21 {
224     my $f = shift or die "marc21 needs field";
225     die "marc21 field must be numer" unless ($f =~ /^\d+$/);
226    
227     my $sf = shift or die "marc21 needs subfield";
228    
229 dpavlin 541 foreach (@_) {
230     my $v = $_; # make var read-write for Encode
231     next unless (defined($v) && $v !~ /^\s+$/);
232     from_to($v, 'iso-8859-2', $marc_encoding) if ($marc_encoding);
233 dpavlin 540 push @{ $marc21 }, [ $f, ' ', ' ', $sf => $v ];
234     }
235     }
236    
237     =head1 Functions to extract data from input
238    
239     This function should be used inside functions to create C<data_structure> described
240     above.
241    
242 dpavlin 536 =head2 rec1
243 dpavlin 371
244 dpavlin 536 Return all values in some field
245 dpavlin 371
246 dpavlin 536 @v = rec1('200')
247 dpavlin 15
248 dpavlin 536 TODO: order of values is probably same as in source data, need to investigate that
249 dpavlin 15
250 dpavlin 536 =cut
251 dpavlin 15
252 dpavlin 536 sub rec1 {
253     my $f = shift;
254     return unless (defined($rec) && defined($rec->{$f}));
255     if (ref($rec->{$f}) eq 'ARRAY') {
256     return map {
257     if (ref($_) eq 'HASH') {
258     values %{$_};
259 dpavlin 31 } else {
260 dpavlin 536 $_;
261 dpavlin 31 }
262 dpavlin 536 } @{ $rec->{$f} };
263     } elsif( defined($rec->{$f}) ) {
264     return $rec->{$f};
265 dpavlin 15 }
266     }
267    
268 dpavlin 536 =head2 rec2
269 dpavlin 15
270 dpavlin 536 Return all values in specific field and subfield
271 dpavlin 13
272 dpavlin 536 @v = rec2('200','a')
273 dpavlin 13
274     =cut
275    
276 dpavlin 536 sub rec2 {
277     my $f = shift;
278     return unless (defined($rec && $rec->{$f}));
279     my $sf = shift;
280     return map { $_->{$sf} } grep { ref($_) eq 'HASH' && $_->{$sf} } @{ $rec->{$f} };
281     }
282 dpavlin 13
283 dpavlin 536 =head2 rec
284 dpavlin 13
285 dpavlin 536 syntaxtic sugar for
286 dpavlin 13
287 dpavlin 536 @v = rec('200')
288     @v = rec('200','a')
289 dpavlin 13
290 dpavlin 536 =cut
291 dpavlin 373
292 dpavlin 536 sub rec {
293     if ($#_ == 0) {
294     return rec1(@_);
295     } elsif ($#_ == 1) {
296     return rec2(@_);
297 dpavlin 13 }
298     }
299    
300 dpavlin 536 =head2 regex
301 dpavlin 15
302 dpavlin 536 Apply regex to some or all values
303 dpavlin 15
304 dpavlin 536 @v = regex( 's/foo/bar/g', @v );
305 dpavlin 15
306     =cut
307    
308 dpavlin 536 sub regex {
309     my $r = shift;
310     my @out;
311     #warn "r: $r\n",Dumper(\@_);
312     foreach my $t (@_) {
313     next unless ($t);
314     eval "\$t =~ $r";
315     push @out, $t if ($t && $t ne '');
316 dpavlin 368 }
317 dpavlin 536 return @out;
318 dpavlin 15 }
319    
320 dpavlin 536 =head2 prefix
321 dpavlin 15
322 dpavlin 536 Prefix all values with a string
323 dpavlin 15
324 dpavlin 536 @v = prefix( 'my_', @v );
325 dpavlin 15
326     =cut
327    
328 dpavlin 536 sub prefix {
329     my $p = shift or die "prefix needs string as first argument";
330     return map { $p . $_ } grep { defined($_) } @_;
331     }
332 dpavlin 15
333 dpavlin 536 =head2 suffix
334 dpavlin 15
335 dpavlin 536 suffix all values with a string
336 dpavlin 15
337 dpavlin 536 @v = suffix( '_my', @v );
338 dpavlin 15
339 dpavlin 536 =cut
340 dpavlin 15
341 dpavlin 536 sub suffix {
342     my $s = shift or die "suffix needs string as first argument";
343     return map { $_ . $s } grep { defined($_) } @_;
344 dpavlin 15 }
345    
346 dpavlin 536 =head2 surround
347 dpavlin 13
348 dpavlin 536 surround all values with a two strings
349 dpavlin 13
350 dpavlin 536 @v = surround( 'prefix_', '_suffix', @v );
351 dpavlin 13
352     =cut
353    
354 dpavlin 536 sub surround {
355     my $p = shift or die "surround need prefix as first argument";
356     my $s = shift or die "surround needs suffix as second argument";
357     return map { $p . $_ . $s } grep { defined($_) } @_;
358 dpavlin 13 }
359    
360 dpavlin 536 =head2 first
361 dpavlin 13
362 dpavlin 536 Return first element
363 dpavlin 15
364 dpavlin 536 $v = first( @v );
365 dpavlin 13
366     =cut
367    
368 dpavlin 536 sub first {
369     my $r = shift;
370     return $r;
371 dpavlin 13 }
372    
373 dpavlin 536 =head2 lookup
374 dpavlin 13
375 dpavlin 536 Consult lookup hashes for some value
376 dpavlin 13
377 dpavlin 536 @v = lookup( $v );
378     @v = lookup( @v );
379 dpavlin 13
380     =cut
381    
382 dpavlin 536 sub lookup {
383     my $k = shift or return;
384     return unless (defined($lookup->{$k}));
385     if (ref($lookup->{$k}) eq 'ARRAY') {
386     return @{ $lookup->{$k} };
387     } else {
388     return $lookup->{$k};
389     }
390 dpavlin 13 }
391    
392 dpavlin 536 =head2 join_with
393 dpavlin 13
394 dpavlin 536 Joins walues with some delimiter
395 dpavlin 10
396 dpavlin 536 $v = join_with(", ", @v);
397 dpavlin 10
398 dpavlin 536 =cut
399 dpavlin 10
400 dpavlin 536 sub join_with {
401     my $d = shift;
402     return join($d, grep { defined($_) && $_ ne '' } @_);
403     }
404 dpavlin 10
405 dpavlin 536 # END
406     1;

  ViewVC Help
Powered by ViewVC 1.1.26