/[webpac2]/trunk/lib/WebPAC/Normalize.pm
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Contents of /trunk/lib/WebPAC/Normalize.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 547 - (show annotations)
Thu Jun 29 23:19:26 2006 UTC (17 years, 9 months ago) by dpavlin
File size: 8948 byte(s)
 r742@llin:  dpavlin | 2006-06-30 01:21:24 +0200
 added marc_repetable_subfield and marc_indicators, renamed marc21 to marc [2.23]

1 package WebPAC::Normalize;
2 use Exporter 'import';
3 @EXPORT = qw/
4 _set_rec _set_lookup
5 _get_ds _clean_ds
6
7 tag search display
8 marc marc_indicators marc_repeatable_subfield
9
10 rec1 rec2 rec
11 regex prefix suffix surround
12 first lookup join_with
13 /;
14
15 use warnings;
16 use strict;
17
18 #use base qw/WebPAC::Common/;
19 use Data::Dumper;
20 use Encode qw/from_to/;
21
22 =head1 NAME
23
24 WebPAC::Normalize - describe normalisaton rules using sets
25
26 =head1 VERSION
27
28 Version 0.06
29
30 =cut
31
32 our $VERSION = '0.06';
33
34 =head1 SYNOPSIS
35
36 This module uses C<conf/normalize/*.pl> files to perform normalisation
37 from input records using perl functions which are specialized for set
38 processing.
39
40 Sets are implemented as arrays, and normalisation file is valid perl, which
41 means that you check it's validity before running WebPAC using
42 C<perl -c normalize.pl>.
43
44 Normalisation can generate multiple output normalized data. For now, supported output
45 types (on the left side of definition) are: C<tag>, C<display>, C<search> and
46 C<marc>.
47
48 =head1 FUNCTIONS
49
50 Functions which start with C<_> are private and used by WebPAC internally.
51 All other functions are available for use within normalisation rules.
52
53 =head2 data_structure
54
55 Return data structure
56
57 my $ds = WebPAC::Normalize::data_structure(
58 lookup => $lookup->lookup_hash,
59 row => $row,
60 rules => $normalize_pl_config,
61 marc_encoding => 'utf-8',
62 );
63
64 Options C<lookup>, C<row>, C<rules> and C<log> are mandatory while all
65 other are optional.
66
67 This function will B<die> if normalizastion can't be evaled.
68
69 Since this function isn't exported you have to call it with
70 C<WebPAC::Normalize::data_structure>.
71
72 =cut
73
74 sub data_structure {
75 my $arg = {@_};
76
77 die "need row argument" unless ($arg->{row});
78 die "need normalisation argument" unless ($arg->{rules});
79
80 no strict 'subs';
81 _set_lookup( $arg->{lookup} );
82 _set_rec( $arg->{row} );
83 _clean_ds( %{ $arg } );
84 eval "$arg->{rules}";
85 die "error evaling $arg->{rules}: $@\n" if ($@);
86
87 return _get_ds();
88 }
89
90 =head2 _set_rec
91
92 Set current record hash
93
94 _set_rec( $rec );
95
96 =cut
97
98 my $rec;
99
100 sub _set_rec {
101 $rec = shift or die "no record hash";
102 }
103
104 =head2 _get_ds
105
106 Return hash formatted as data structure
107
108 my $ds = _get_ds();
109
110 =cut
111
112 my ($out,$marc_record, $marc_encoding, $marc_repeatable_subfield, $marc_indicators);
113
114 sub _get_ds {
115 return $out;
116 }
117
118 =head2 _clean_ds
119
120 Clean data structure hash for next record
121
122 _clean_ds();
123
124 =cut
125
126 sub _clean_ds {
127 my $a = {@_};
128 ($out,$marc_record, $marc_encoding, $marc_repeatable_subfield, $marc_indicators) = (undef);
129 $marc_encoding = $a->{marc_encoding};
130 }
131
132 =head2 _set_lookup
133
134 Set current lookup hash
135
136 _set_lookup( $lookup );
137
138 =cut
139
140 my $lookup;
141
142 sub _set_lookup {
143 $lookup = shift;
144 }
145
146 =head2 _get_marc_fields
147
148 Get all fields defined by calls to C<marc>
149
150 $marc->add_fields( WebPAC::Normalize:_get_marc_fields() );
151
152
153
154 We are using I<magic> which detect repeatable fields only from
155 sequence of field/subfield data generated by normalization.
156
157 Repeatable field is created if there is second occurence of same subfield or
158 if any of indicators are different. This is sane for most cases except for
159 non-repeatable fields with repeatable subfields.
160
161 You can change behaviour of that using C<marc_repeatable_subfield>.
162
163 =cut
164
165 sub _get_marc_fields {
166 my @m;
167 my $last;
168 foreach my $row (@{ $marc_record }) {
169 if ($last &&
170 $last->[0] eq $row->[0] && # check if field is same
171 $last->[1] eq $row->[1] && # check for i1
172 $last->[2] eq $row->[2] && # and for i2
173 ( $last->[3] ne $row->[3] || # and subfield is different
174 $last->[3] eq $row->[3] && # or subfield is same,
175 $marc_repeatable_subfield->{ $row->[3] } # but is repeatable
176 )
177 ) {
178 push @$last, ( $row->[3] , $row->[4] );
179 warn "## ++ added $row->[0] ^$row->[3] to $last->[0]\n";
180 next;
181 } elsif ($last) {
182 push @m, $last;
183 }
184
185 $last = $row;
186 }
187
188 push @m, $last if ($last);
189
190 return @m;
191 }
192
193 =head1 Functions to create C<data_structure>
194
195 Those functions generally have to first in your normalization file.
196
197 =head2 tag
198
199 Define new tag for I<search> and I<display>.
200
201 tag('Title', rec('200','a') );
202
203
204 =cut
205
206 sub tag {
207 my $name = shift or die "tag needs name as first argument";
208 my @o = grep { defined($_) && $_ ne '' } @_;
209 return unless (@o);
210 $out->{$name}->{tag} = $name;
211 $out->{$name}->{search} = \@o;
212 $out->{$name}->{display} = \@o;
213 }
214
215 =head2 display
216
217 Define tag just for I<display>
218
219 @v = display('Title', rec('200','a') );
220
221 =cut
222
223 sub display {
224 my $name = shift or die "display needs name as first argument";
225 my @o = grep { defined($_) && $_ ne '' } @_;
226 return unless (@o);
227 $out->{$name}->{tag} = $name;
228 $out->{$name}->{display} = \@o;
229 }
230
231 =head2 search
232
233 Prepare values just for I<search>
234
235 @v = search('Title', rec('200','a') );
236
237 =cut
238
239 sub search {
240 my $name = shift or die "search needs name as first argument";
241 my @o = grep { defined($_) && $_ ne '' } @_;
242 return unless (@o);
243 $out->{$name}->{tag} = $name;
244 $out->{$name}->{search} = \@o;
245 }
246
247 =head2 marc
248
249 Save value for MARC field
250
251 marc('900','a', rec('200','a') );
252
253 =cut
254
255 sub marc {
256 my $f = shift or die "marc needs field";
257 die "marc field must be numer" unless ($f =~ /^\d+$/);
258
259 my $sf = shift or die "marc needs subfield";
260
261 foreach (@_) {
262 my $v = $_; # make var read-write for Encode
263 next unless (defined($v) && $v !~ /^\s*$/);
264 from_to($v, 'iso-8859-2', $marc_encoding) if ($marc_encoding);
265 push @{ $marc_record }, [
266 $f,
267 $marc_indicators->{$f}->{i1} || ' ',
268 $marc_indicators->{$f}->{i2} || ' ',
269 $sf => $v
270 ];
271 }
272 }
273
274 =head2 marc_repeatable_subfield
275
276 Save values for MARC repetable subfield
277
278 marc_repeatable_subfield('910', 'z', rec('909') );
279
280 =cut
281
282 sub marc_repeatable_subfield {
283 die "marc_repeatable_subfield need subfield!\n" unless (defined($_[1]));
284 $marc_repeatable_subfield->{ $_[1] }++;
285 marc(@_);
286 }
287
288 =head2 marc_indicators
289
290 Set both indicators for MARC field
291
292 marc_indicators('900', ' ', 1);
293
294 Any indicator value other than C<0-9> will be treated as undefined.
295
296 =cut
297
298 sub marc_indicators {
299 my $f = shift || die "marc_indicators need field!\n";
300 my ($i1,$i2) = @_;
301 die "marc_indicators($f, ...) need i1!\n" unless(defined($i1));
302 die "marc_indicators($f, $i1, ...) need i2!\n" unless(defined($i2));
303
304 $i1 = ' ' if ($i1 !~ /^\d$/);
305 $i2 = ' ' if ($i2 !~ /^\d$/);
306 $marc_indicators->{$f}->{i1} = $i1;
307 $marc_indicators->{$f}->{i2} = $i2;
308 }
309
310
311 =head1 Functions to extract data from input
312
313 This function should be used inside functions to create C<data_structure> described
314 above.
315
316 =head2 rec1
317
318 Return all values in some field
319
320 @v = rec1('200')
321
322 TODO: order of values is probably same as in source data, need to investigate that
323
324 =cut
325
326 sub rec1 {
327 my $f = shift;
328 return unless (defined($rec) && defined($rec->{$f}));
329 if (ref($rec->{$f}) eq 'ARRAY') {
330 return map {
331 if (ref($_) eq 'HASH') {
332 values %{$_};
333 } else {
334 $_;
335 }
336 } @{ $rec->{$f} };
337 } elsif( defined($rec->{$f}) ) {
338 return $rec->{$f};
339 }
340 }
341
342 =head2 rec2
343
344 Return all values in specific field and subfield
345
346 @v = rec2('200','a')
347
348 =cut
349
350 sub rec2 {
351 my $f = shift;
352 return unless (defined($rec && $rec->{$f}));
353 my $sf = shift;
354 return map { $_->{$sf} } grep { ref($_) eq 'HASH' && $_->{$sf} } @{ $rec->{$f} };
355 }
356
357 =head2 rec
358
359 syntaxtic sugar for
360
361 @v = rec('200')
362 @v = rec('200','a')
363
364 =cut
365
366 sub rec {
367 if ($#_ == 0) {
368 return rec1(@_);
369 } elsif ($#_ == 1) {
370 return rec2(@_);
371 }
372 }
373
374 =head2 regex
375
376 Apply regex to some or all values
377
378 @v = regex( 's/foo/bar/g', @v );
379
380 =cut
381
382 sub regex {
383 my $r = shift;
384 my @out;
385 #warn "r: $r\n",Dumper(\@_);
386 foreach my $t (@_) {
387 next unless ($t);
388 eval "\$t =~ $r";
389 push @out, $t if ($t && $t ne '');
390 }
391 return @out;
392 }
393
394 =head2 prefix
395
396 Prefix all values with a string
397
398 @v = prefix( 'my_', @v );
399
400 =cut
401
402 sub prefix {
403 my $p = shift or die "prefix needs string as first argument";
404 return map { $p . $_ } grep { defined($_) } @_;
405 }
406
407 =head2 suffix
408
409 suffix all values with a string
410
411 @v = suffix( '_my', @v );
412
413 =cut
414
415 sub suffix {
416 my $s = shift or die "suffix needs string as first argument";
417 return map { $_ . $s } grep { defined($_) } @_;
418 }
419
420 =head2 surround
421
422 surround all values with a two strings
423
424 @v = surround( 'prefix_', '_suffix', @v );
425
426 =cut
427
428 sub surround {
429 my $p = shift or die "surround need prefix as first argument";
430 my $s = shift or die "surround needs suffix as second argument";
431 return map { $p . $_ . $s } grep { defined($_) } @_;
432 }
433
434 =head2 first
435
436 Return first element
437
438 $v = first( @v );
439
440 =cut
441
442 sub first {
443 my $r = shift;
444 return $r;
445 }
446
447 =head2 lookup
448
449 Consult lookup hashes for some value
450
451 @v = lookup( $v );
452 @v = lookup( @v );
453
454 =cut
455
456 sub lookup {
457 my $k = shift or return;
458 return unless (defined($lookup->{$k}));
459 if (ref($lookup->{$k}) eq 'ARRAY') {
460 return @{ $lookup->{$k} };
461 } else {
462 return $lookup->{$k};
463 }
464 }
465
466 =head2 join_with
467
468 Joins walues with some delimiter
469
470 $v = join_with(", ", @v);
471
472 =cut
473
474 sub join_with {
475 my $d = shift;
476 return join($d, grep { defined($_) && $_ ne '' } @_);
477 }
478
479 # END
480 1;

  ViewVC Help
Powered by ViewVC 1.1.26