/[webpac2]/trunk/lib/WebPAC/Normalize.pm
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Contents of /trunk/lib/WebPAC/Normalize.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 543 - (show annotations)
Thu Jun 29 21:19:08 2006 UTC (16 years, 7 months ago) by dpavlin
File size: 7678 byte(s)
 r732@llin:  dpavlin | 2006-06-29 23:20:46 +0200
 document magic (that is how WebPAC detects repeatable fields) and fix it to actually work :-)

1 package WebPAC::Normalize;
2 use Exporter 'import';
3 @EXPORT = qw/
4 _set_rec _set_lookup
5 _get_ds _clean_ds
6
7 tag search display
8 marc21
9
10 rec1 rec2 rec
11 regex prefix suffix surround
12 first lookup join_with
13 /;
14
15 use warnings;
16 use strict;
17
18 #use base qw/WebPAC::Common/;
19 use Data::Dumper;
20 use Encode qw/from_to/;
21
22 =head1 NAME
23
24 WebPAC::Normalize - describe normalisaton rules using sets
25
26 =head1 VERSION
27
28 Version 0.06
29
30 =cut
31
32 our $VERSION = '0.06';
33
34 =head1 SYNOPSIS
35
36 This module uses C<conf/normalize/*.pl> files to perform normalisation
37 from input records using perl functions which are specialized for set
38 processing.
39
40 Sets are implemented as arrays, and normalisation file is valid perl, which
41 means that you check it's validity before running WebPAC using
42 C<perl -c normalize.pl>.
43
44 Normalisation can generate multiple output normalized data. For now, supported output
45 types (on the left side of definition) are: C<tag>, C<display>, C<search> and
46 C<marc21>.
47
48 =head1 FUNCTIONS
49
50 Functions which start with C<_> are private and used by WebPAC internally.
51 All other functions are available for use within normalisation rules.
52
53 =head2 data_structure
54
55 Return data structure
56
57 my $ds = WebPAC::Normalize::data_structure(
58 lookup => $lookup->lookup_hash,
59 row => $row,
60 rules => $normalize_pl_config,
61 marc_encoding => 'utf-8',
62 );
63
64 Options C<lookup>, C<row>, C<rules> and C<log> are mandatory while all
65 other are optional.
66
67 This function will B<die> if normalizastion can't be evaled.
68
69 Since this function isn't exported you have to call it with
70 C<WebPAC::Normalize::data_structure>.
71
72 =cut
73
74 sub data_structure {
75 my $arg = {@_};
76
77 die "need row argument" unless ($arg->{row});
78 die "need normalisation argument" unless ($arg->{rules});
79
80 no strict 'subs';
81 _set_lookup( $arg->{lookup} );
82 _set_rec( $arg->{row} );
83 _clean_ds( %{ $arg } );
84 eval "$arg->{rules}";
85 die "error evaling $arg->{rules}: $@\n" if ($@);
86
87 return _get_ds();
88 }
89
90 =head2 _set_rec
91
92 Set current record hash
93
94 _set_rec( $rec );
95
96 =cut
97
98 my $rec;
99
100 sub _set_rec {
101 $rec = shift or die "no record hash";
102 }
103
104 =head2 _get_ds
105
106 Return hash formatted as data structure
107
108 my $ds = _get_ds();
109
110 =cut
111
112 my $out;
113 my $marc21;
114 my $marc_encoding;
115
116 sub _get_ds {
117 return $out;
118 }
119
120 =head2 _clean_ds
121
122 Clean data structure hash for next record
123
124 _clean_ds();
125
126 =cut
127
128 sub _clean_ds {
129 my $a = {@_};
130 $out = undef;
131 $marc21 = undef;
132 $marc_encoding = $a->{marc_encoding};
133 }
134
135 =head2 _set_lookup
136
137 Set current lookup hash
138
139 _set_lookup( $lookup );
140
141 =cut
142
143 my $lookup;
144
145 sub _set_lookup {
146 $lookup = shift;
147 }
148
149 =head2 _get_marc21_fields
150
151 Get all fields defined by calls to C<marc21>
152
153 $marc->add_fields( WebPAC::Normalize:_get_marc21_fields() );
154
155
156
157 We are using I<magic> which detect repeatable fields only from
158 sequence of field/subfield data generated by normalization.
159
160 Repeatable field is created if there is second occurence of same subfield or
161 if any of indicators are different. This is sane for most cases except for
162 non-repeatable fields with repeatable subfields.
163
164 B<TODO>: implement exceptions to magic
165
166 =cut
167
168 sub _get_marc21_fields {
169 my @m;
170 my $last;
171 foreach my $row (@{ $marc21 }) {
172 if ($last &&
173 $last->[0] eq $row->[0] && # check if field is same
174 $last->[1] eq $row->[1] && # check for i1
175 $last->[2] eq $row->[2] # and for i2
176 ) {
177 push @$last, ( $row->[3] , $row->[4] );
178 warn "## ++ added $row->[0] ^$row->[3] to $last->[0]\n";
179 next;
180 } elsif ($last) {
181 push @m, $last;
182 }
183
184 $last = $row;
185 }
186
187 push @m, $last if ($last);
188
189 return @m;
190 }
191
192 =head1 Functions to create C<data_structure>
193
194 Those functions generally have to first in your normalization file.
195
196 =head2 tag
197
198 Define new tag for I<search> and I<display>.
199
200 tag('Title', rec('200','a') );
201
202
203 =cut
204
205 sub tag {
206 my $name = shift or die "tag needs name as first argument";
207 my @o = grep { defined($_) && $_ ne '' } @_;
208 return unless (@o);
209 $out->{$name}->{tag} = $name;
210 $out->{$name}->{search} = \@o;
211 $out->{$name}->{display} = \@o;
212 }
213
214 =head2 display
215
216 Define tag just for I<display>
217
218 @v = display('Title', rec('200','a') );
219
220 =cut
221
222 sub display {
223 my $name = shift or die "display needs name as first argument";
224 my @o = grep { defined($_) && $_ ne '' } @_;
225 return unless (@o);
226 $out->{$name}->{tag} = $name;
227 $out->{$name}->{display} = \@o;
228 }
229
230 =head2 search
231
232 Prepare values just for I<search>
233
234 @v = search('Title', rec('200','a') );
235
236 =cut
237
238 sub search {
239 my $name = shift or die "search needs name as first argument";
240 my @o = grep { defined($_) && $_ ne '' } @_;
241 return unless (@o);
242 $out->{$name}->{tag} = $name;
243 $out->{$name}->{search} = \@o;
244 }
245
246 =head2 marc21
247
248 Save value for MARC field
249
250 marc21('900','a', rec('200','a') );
251
252 =cut
253
254 sub marc21 {
255 my $f = shift or die "marc21 needs field";
256 die "marc21 field must be numer" unless ($f =~ /^\d+$/);
257
258 my $sf = shift or die "marc21 needs subfield";
259
260 foreach (@_) {
261 my $v = $_; # make var read-write for Encode
262 next unless (defined($v) && $v !~ /^\s*$/);
263 from_to($v, 'iso-8859-2', $marc_encoding) if ($marc_encoding);
264 push @{ $marc21 }, [ $f, ' ', ' ', $sf => $v ];
265 }
266 }
267
268 =head1 Functions to extract data from input
269
270 This function should be used inside functions to create C<data_structure> described
271 above.
272
273 =head2 rec1
274
275 Return all values in some field
276
277 @v = rec1('200')
278
279 TODO: order of values is probably same as in source data, need to investigate that
280
281 =cut
282
283 sub rec1 {
284 my $f = shift;
285 return unless (defined($rec) && defined($rec->{$f}));
286 if (ref($rec->{$f}) eq 'ARRAY') {
287 return map {
288 if (ref($_) eq 'HASH') {
289 values %{$_};
290 } else {
291 $_;
292 }
293 } @{ $rec->{$f} };
294 } elsif( defined($rec->{$f}) ) {
295 return $rec->{$f};
296 }
297 }
298
299 =head2 rec2
300
301 Return all values in specific field and subfield
302
303 @v = rec2('200','a')
304
305 =cut
306
307 sub rec2 {
308 my $f = shift;
309 return unless (defined($rec && $rec->{$f}));
310 my $sf = shift;
311 return map { $_->{$sf} } grep { ref($_) eq 'HASH' && $_->{$sf} } @{ $rec->{$f} };
312 }
313
314 =head2 rec
315
316 syntaxtic sugar for
317
318 @v = rec('200')
319 @v = rec('200','a')
320
321 =cut
322
323 sub rec {
324 if ($#_ == 0) {
325 return rec1(@_);
326 } elsif ($#_ == 1) {
327 return rec2(@_);
328 }
329 }
330
331 =head2 regex
332
333 Apply regex to some or all values
334
335 @v = regex( 's/foo/bar/g', @v );
336
337 =cut
338
339 sub regex {
340 my $r = shift;
341 my @out;
342 #warn "r: $r\n",Dumper(\@_);
343 foreach my $t (@_) {
344 next unless ($t);
345 eval "\$t =~ $r";
346 push @out, $t if ($t && $t ne '');
347 }
348 return @out;
349 }
350
351 =head2 prefix
352
353 Prefix all values with a string
354
355 @v = prefix( 'my_', @v );
356
357 =cut
358
359 sub prefix {
360 my $p = shift or die "prefix needs string as first argument";
361 return map { $p . $_ } grep { defined($_) } @_;
362 }
363
364 =head2 suffix
365
366 suffix all values with a string
367
368 @v = suffix( '_my', @v );
369
370 =cut
371
372 sub suffix {
373 my $s = shift or die "suffix needs string as first argument";
374 return map { $_ . $s } grep { defined($_) } @_;
375 }
376
377 =head2 surround
378
379 surround all values with a two strings
380
381 @v = surround( 'prefix_', '_suffix', @v );
382
383 =cut
384
385 sub surround {
386 my $p = shift or die "surround need prefix as first argument";
387 my $s = shift or die "surround needs suffix as second argument";
388 return map { $p . $_ . $s } grep { defined($_) } @_;
389 }
390
391 =head2 first
392
393 Return first element
394
395 $v = first( @v );
396
397 =cut
398
399 sub first {
400 my $r = shift;
401 return $r;
402 }
403
404 =head2 lookup
405
406 Consult lookup hashes for some value
407
408 @v = lookup( $v );
409 @v = lookup( @v );
410
411 =cut
412
413 sub lookup {
414 my $k = shift or return;
415 return unless (defined($lookup->{$k}));
416 if (ref($lookup->{$k}) eq 'ARRAY') {
417 return @{ $lookup->{$k} };
418 } else {
419 return $lookup->{$k};
420 }
421 }
422
423 =head2 join_with
424
425 Joins walues with some delimiter
426
427 $v = join_with(", ", @v);
428
429 =cut
430
431 sub join_with {
432 my $d = shift;
433 return join($d, grep { defined($_) && $_ ne '' } @_);
434 }
435
436 # END
437 1;

  ViewVC Help
Powered by ViewVC 1.1.26