/[webpac2]/trunk/lib/WebPAC/Normalize.pm
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Contents of /trunk/lib/WebPAC/Normalize.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 548 - (show annotations)
Thu Jun 29 23:29:02 2006 UTC (17 years, 10 months ago) by dpavlin
File size: 8870 byte(s)
 r744@llin:  dpavlin | 2006-06-30 01:31:00 +0200
 don't chew indicators with 0 value, removed debugging warning

1 package WebPAC::Normalize;
2 use Exporter 'import';
3 @EXPORT = qw/
4 _set_rec _set_lookup
5 _get_ds _clean_ds
6
7 tag search display
8 marc marc_indicators marc_repeatable_subfield
9
10 rec1 rec2 rec
11 regex prefix suffix surround
12 first lookup join_with
13 /;
14
15 use warnings;
16 use strict;
17
18 #use base qw/WebPAC::Common/;
19 use Data::Dumper;
20 use Encode qw/from_to/;
21
22 =head1 NAME
23
24 WebPAC::Normalize - describe normalisaton rules using sets
25
26 =head1 VERSION
27
28 Version 0.06
29
30 =cut
31
32 our $VERSION = '0.06';
33
34 =head1 SYNOPSIS
35
36 This module uses C<conf/normalize/*.pl> files to perform normalisation
37 from input records using perl functions which are specialized for set
38 processing.
39
40 Sets are implemented as arrays, and normalisation file is valid perl, which
41 means that you check it's validity before running WebPAC using
42 C<perl -c normalize.pl>.
43
44 Normalisation can generate multiple output normalized data. For now, supported output
45 types (on the left side of definition) are: C<tag>, C<display>, C<search> and
46 C<marc>.
47
48 =head1 FUNCTIONS
49
50 Functions which start with C<_> are private and used by WebPAC internally.
51 All other functions are available for use within normalisation rules.
52
53 =head2 data_structure
54
55 Return data structure
56
57 my $ds = WebPAC::Normalize::data_structure(
58 lookup => $lookup->lookup_hash,
59 row => $row,
60 rules => $normalize_pl_config,
61 marc_encoding => 'utf-8',
62 );
63
64 Options C<lookup>, C<row>, C<rules> and C<log> are mandatory while all
65 other are optional.
66
67 This function will B<die> if normalizastion can't be evaled.
68
69 Since this function isn't exported you have to call it with
70 C<WebPAC::Normalize::data_structure>.
71
72 =cut
73
74 sub data_structure {
75 my $arg = {@_};
76
77 die "need row argument" unless ($arg->{row});
78 die "need normalisation argument" unless ($arg->{rules});
79
80 no strict 'subs';
81 _set_lookup( $arg->{lookup} );
82 _set_rec( $arg->{row} );
83 _clean_ds( %{ $arg } );
84 eval "$arg->{rules}";
85 die "error evaling $arg->{rules}: $@\n" if ($@);
86
87 return _get_ds();
88 }
89
90 =head2 _set_rec
91
92 Set current record hash
93
94 _set_rec( $rec );
95
96 =cut
97
98 my $rec;
99
100 sub _set_rec {
101 $rec = shift or die "no record hash";
102 }
103
104 =head2 _get_ds
105
106 Return hash formatted as data structure
107
108 my $ds = _get_ds();
109
110 =cut
111
112 my ($out,$marc_record, $marc_encoding, $marc_repeatable_subfield, $marc_indicators);
113
114 sub _get_ds {
115 return $out;
116 }
117
118 =head2 _clean_ds
119
120 Clean data structure hash for next record
121
122 _clean_ds();
123
124 =cut
125
126 sub _clean_ds {
127 my $a = {@_};
128 ($out,$marc_record, $marc_encoding, $marc_repeatable_subfield, $marc_indicators) = (undef);
129 $marc_encoding = $a->{marc_encoding};
130 }
131
132 =head2 _set_lookup
133
134 Set current lookup hash
135
136 _set_lookup( $lookup );
137
138 =cut
139
140 my $lookup;
141
142 sub _set_lookup {
143 $lookup = shift;
144 }
145
146 =head2 _get_marc_fields
147
148 Get all fields defined by calls to C<marc>
149
150 $marc->add_fields( WebPAC::Normalize:_get_marc_fields() );
151
152
153
154 We are using I<magic> which detect repeatable fields only from
155 sequence of field/subfield data generated by normalization.
156
157 Repeatable field is created if there is second occurence of same subfield or
158 if any of indicators are different. This is sane for most cases except for
159 non-repeatable fields with repeatable subfields.
160
161 You can change behaviour of that using C<marc_repeatable_subfield>.
162
163 =cut
164
165 sub _get_marc_fields {
166 my @m;
167 my $last;
168 foreach my $row (@{ $marc_record }) {
169 if ($last &&
170 $last->[0] eq $row->[0] && # check if field is same
171 $last->[1] eq $row->[1] && # check for i1
172 $last->[2] eq $row->[2] && # and for i2
173 ( $last->[3] ne $row->[3] || # and subfield is different
174 $last->[3] eq $row->[3] && # or subfield is same,
175 $marc_repeatable_subfield->{ $row->[3] } # but is repeatable
176 )
177 ) {
178 push @$last, ( $row->[3] , $row->[4] );
179 next;
180 } elsif ($last) {
181 push @m, $last;
182 }
183
184 $last = $row;
185 }
186
187 push @m, $last if ($last);
188
189 return @m;
190 }
191
192 =head1 Functions to create C<data_structure>
193
194 Those functions generally have to first in your normalization file.
195
196 =head2 tag
197
198 Define new tag for I<search> and I<display>.
199
200 tag('Title', rec('200','a') );
201
202
203 =cut
204
205 sub tag {
206 my $name = shift or die "tag needs name as first argument";
207 my @o = grep { defined($_) && $_ ne '' } @_;
208 return unless (@o);
209 $out->{$name}->{tag} = $name;
210 $out->{$name}->{search} = \@o;
211 $out->{$name}->{display} = \@o;
212 }
213
214 =head2 display
215
216 Define tag just for I<display>
217
218 @v = display('Title', rec('200','a') );
219
220 =cut
221
222 sub display {
223 my $name = shift or die "display needs name as first argument";
224 my @o = grep { defined($_) && $_ ne '' } @_;
225 return unless (@o);
226 $out->{$name}->{tag} = $name;
227 $out->{$name}->{display} = \@o;
228 }
229
230 =head2 search
231
232 Prepare values just for I<search>
233
234 @v = search('Title', rec('200','a') );
235
236 =cut
237
238 sub search {
239 my $name = shift or die "search needs name as first argument";
240 my @o = grep { defined($_) && $_ ne '' } @_;
241 return unless (@o);
242 $out->{$name}->{tag} = $name;
243 $out->{$name}->{search} = \@o;
244 }
245
246 =head2 marc
247
248 Save value for MARC field
249
250 marc('900','a', rec('200','a') );
251
252 =cut
253
254 sub marc {
255 my $f = shift or die "marc needs field";
256 die "marc field must be numer" unless ($f =~ /^\d+$/);
257
258 my $sf = shift or die "marc needs subfield";
259
260 foreach (@_) {
261 my $v = $_; # make var read-write for Encode
262 next unless (defined($v) && $v !~ /^\s*$/);
263 from_to($v, 'iso-8859-2', $marc_encoding) if ($marc_encoding);
264 my ($i1,$i2) = defined($marc_indicators->{$f}) ? @{ $marc_indicators->{$f} } : (' ',' ');
265 push @{ $marc_record }, [ $f, $i1, $i2, $sf => $v ];
266 }
267 }
268
269 =head2 marc_repeatable_subfield
270
271 Save values for MARC repetable subfield
272
273 marc_repeatable_subfield('910', 'z', rec('909') );
274
275 =cut
276
277 sub marc_repeatable_subfield {
278 die "marc_repeatable_subfield need subfield!\n" unless (defined($_[1]));
279 $marc_repeatable_subfield->{ $_[1] }++;
280 marc(@_);
281 }
282
283 =head2 marc_indicators
284
285 Set both indicators for MARC field
286
287 marc_indicators('900', ' ', 1);
288
289 Any indicator value other than C<0-9> will be treated as undefined.
290
291 =cut
292
293 sub marc_indicators {
294 my $f = shift || die "marc_indicators need field!\n";
295 my ($i1,$i2) = @_;
296 die "marc_indicators($f, ...) need i1!\n" unless(defined($i1));
297 die "marc_indicators($f, $i1, ...) need i2!\n" unless(defined($i2));
298
299 $i1 = ' ' if ($i1 !~ /^\d$/);
300 $i2 = ' ' if ($i2 !~ /^\d$/);
301 @{ $marc_indicators->{$f} } = ($i1,$i2);
302 }
303
304
305 =head1 Functions to extract data from input
306
307 This function should be used inside functions to create C<data_structure> described
308 above.
309
310 =head2 rec1
311
312 Return all values in some field
313
314 @v = rec1('200')
315
316 TODO: order of values is probably same as in source data, need to investigate that
317
318 =cut
319
320 sub rec1 {
321 my $f = shift;
322 return unless (defined($rec) && defined($rec->{$f}));
323 if (ref($rec->{$f}) eq 'ARRAY') {
324 return map {
325 if (ref($_) eq 'HASH') {
326 values %{$_};
327 } else {
328 $_;
329 }
330 } @{ $rec->{$f} };
331 } elsif( defined($rec->{$f}) ) {
332 return $rec->{$f};
333 }
334 }
335
336 =head2 rec2
337
338 Return all values in specific field and subfield
339
340 @v = rec2('200','a')
341
342 =cut
343
344 sub rec2 {
345 my $f = shift;
346 return unless (defined($rec && $rec->{$f}));
347 my $sf = shift;
348 return map { $_->{$sf} } grep { ref($_) eq 'HASH' && $_->{$sf} } @{ $rec->{$f} };
349 }
350
351 =head2 rec
352
353 syntaxtic sugar for
354
355 @v = rec('200')
356 @v = rec('200','a')
357
358 =cut
359
360 sub rec {
361 if ($#_ == 0) {
362 return rec1(@_);
363 } elsif ($#_ == 1) {
364 return rec2(@_);
365 }
366 }
367
368 =head2 regex
369
370 Apply regex to some or all values
371
372 @v = regex( 's/foo/bar/g', @v );
373
374 =cut
375
376 sub regex {
377 my $r = shift;
378 my @out;
379 #warn "r: $r\n",Dumper(\@_);
380 foreach my $t (@_) {
381 next unless ($t);
382 eval "\$t =~ $r";
383 push @out, $t if ($t && $t ne '');
384 }
385 return @out;
386 }
387
388 =head2 prefix
389
390 Prefix all values with a string
391
392 @v = prefix( 'my_', @v );
393
394 =cut
395
396 sub prefix {
397 my $p = shift or die "prefix needs string as first argument";
398 return map { $p . $_ } grep { defined($_) } @_;
399 }
400
401 =head2 suffix
402
403 suffix all values with a string
404
405 @v = suffix( '_my', @v );
406
407 =cut
408
409 sub suffix {
410 my $s = shift or die "suffix needs string as first argument";
411 return map { $_ . $s } grep { defined($_) } @_;
412 }
413
414 =head2 surround
415
416 surround all values with a two strings
417
418 @v = surround( 'prefix_', '_suffix', @v );
419
420 =cut
421
422 sub surround {
423 my $p = shift or die "surround need prefix as first argument";
424 my $s = shift or die "surround needs suffix as second argument";
425 return map { $p . $_ . $s } grep { defined($_) } @_;
426 }
427
428 =head2 first
429
430 Return first element
431
432 $v = first( @v );
433
434 =cut
435
436 sub first {
437 my $r = shift;
438 return $r;
439 }
440
441 =head2 lookup
442
443 Consult lookup hashes for some value
444
445 @v = lookup( $v );
446 @v = lookup( @v );
447
448 =cut
449
450 sub lookup {
451 my $k = shift or return;
452 return unless (defined($lookup->{$k}));
453 if (ref($lookup->{$k}) eq 'ARRAY') {
454 return @{ $lookup->{$k} };
455 } else {
456 return $lookup->{$k};
457 }
458 }
459
460 =head2 join_with
461
462 Joins walues with some delimiter
463
464 $v = join_with(", ", @v);
465
466 =cut
467
468 sub join_with {
469 my $d = shift;
470 return join($d, grep { defined($_) && $_ ne '' } @_);
471 }
472
473 # END
474 1;

  ViewVC Help
Powered by ViewVC 1.1.26