/[webpac2]/trunk/lib/WebPAC/Normalize.pm
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Contents of /trunk/lib/WebPAC/Normalize.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 554 - (show annotations)
Sat Jul 1 10:19:29 2006 UTC (16 years, 7 months ago) by dpavlin
File size: 11273 byte(s)
 r756@llin:  dpavlin | 2006-07-01 12:17:24 +0200
 pod improvements, added _debug

1 package WebPAC::Normalize;
2 use Exporter 'import';
3 @EXPORT = qw/
4 _set_rec _set_lookup
5 _get_ds _clean_ds
6 _debug
7
8 tag search display
9 marc marc_indicators marc_repeatable_subfield
10
11 rec1 rec2 rec
12 regex prefix suffix surround
13 first lookup join_with
14 /;
15
16 use warnings;
17 use strict;
18
19 #use base qw/WebPAC::Common/;
20 use Data::Dump qw/dump/;
21 use Encode qw/from_to/;
22
23 # debugging warn(s)
24 my $debug = 0;
25
26
27 =head1 NAME
28
29 WebPAC::Normalize - describe normalisaton rules using sets
30
31 =head1 VERSION
32
33 Version 0.07
34
35 =cut
36
37 our $VERSION = '0.07';
38
39 =head1 SYNOPSIS
40
41 This module uses C<conf/normalize/*.pl> files to perform normalisation
42 from input records using perl functions which are specialized for set
43 processing.
44
45 Sets are implemented as arrays, and normalisation file is valid perl, which
46 means that you check it's validity before running WebPAC using
47 C<perl -c normalize.pl>.
48
49 Normalisation can generate multiple output normalized data. For now, supported output
50 types (on the left side of definition) are: C<tag>, C<display>, C<search> and
51 C<marc>.
52
53 =head1 FUNCTIONS
54
55 Functions which start with C<_> are private and used by WebPAC internally.
56 All other functions are available for use within normalisation rules.
57
58 =head2 data_structure
59
60 Return data structure
61
62 my $ds = WebPAC::Normalize::data_structure(
63 lookup => $lookup->lookup_hash,
64 row => $row,
65 rules => $normalize_pl_config,
66 marc_encoding => 'utf-8',
67 );
68
69 Options C<lookup>, C<row>, C<rules> and C<log> are mandatory while all
70 other are optional.
71
72 This function will B<die> if normalizastion can't be evaled.
73
74 Since this function isn't exported you have to call it with
75 C<WebPAC::Normalize::data_structure>.
76
77 =cut
78
79 sub data_structure {
80 my $arg = {@_};
81
82 die "need row argument" unless ($arg->{row});
83 die "need normalisation argument" unless ($arg->{rules});
84
85 no strict 'subs';
86 _set_lookup( $arg->{lookup} );
87 _set_rec( $arg->{row} );
88 _clean_ds( %{ $arg } );
89 eval "$arg->{rules}";
90 die "error evaling $arg->{rules}: $@\n" if ($@);
91
92 return _get_ds();
93 }
94
95 =head2 _set_rec
96
97 Set current record hash
98
99 _set_rec( $rec );
100
101 =cut
102
103 my $rec;
104
105 sub _set_rec {
106 $rec = shift or die "no record hash";
107 }
108
109 =head2 _get_ds
110
111 Return hash formatted as data structure
112
113 my $ds = _get_ds();
114
115 =cut
116
117 my ($out,$marc_record, $marc_encoding, $marc_repeatable_subfield, $marc_indicators);
118
119 sub _get_ds {
120 return $out;
121 }
122
123 =head2 _clean_ds
124
125 Clean data structure hash for next record
126
127 _clean_ds();
128
129 =cut
130
131 sub _clean_ds {
132 my $a = {@_};
133 ($out,$marc_record, $marc_encoding, $marc_repeatable_subfield, $marc_indicators) = ();
134 $marc_encoding = $a->{marc_encoding};
135 }
136
137 =head2 _set_lookup
138
139 Set current lookup hash
140
141 _set_lookup( $lookup );
142
143 =cut
144
145 my $lookup;
146
147 sub _set_lookup {
148 $lookup = shift;
149 }
150
151 =head2 _get_marc_fields
152
153 Get all fields defined by calls to C<marc>
154
155 $marc->add_fields( WebPAC::Normalize:_get_marc_fields() );
156
157 We are using I<magic> which detect repeatable fields only from
158 sequence of field/subfield data generated by normalization.
159
160 Repeatable field is created when there is second occurence of same subfield or
161 if any of indicators are different.
162
163 This is sane for most cases. Something like:
164
165 900a-1 900b-1 900c-1
166 900a-2 900b-2
167 900a-3
168
169 will be created from any combination of:
170
171 900a-1 900a-2 900a-3 900b-1 900b-2 900c-1
172
173 and following rules:
174
175 marc('900','a', rec('200','a') );
176 marc('900','b', rec('200','b') );
177 marc('900','c', rec('200','c') );
178
179 which might not be what you have in mind. If you need repeatable subfield,
180 define it using C<marc_repeatable_subfield> like this:
181
182 ....
183
184 =cut
185
186 sub _get_marc_fields {
187
188 return if (! $marc_record || ref($marc_record) ne 'ARRAY' || $#{ $marc_record } < 0);
189
190 # first, sort all existing fields
191 # XXX might not be needed, but modern perl might randomize elements in hash
192 my @sorted_marc_record = sort {
193 $a->[0] . $a->[3] cmp $b->[0] . $b->[3]
194 } @{ $marc_record };
195
196 # output marc fields
197 my @m;
198
199 # count unique field-subfields (used for offset when walking to next subfield)
200 my $u;
201 map { $u->{ $_->[0] . $_->[3] }++ } @sorted_marc_record;
202
203 if ($debug) {
204 warn "## marc_repeatable_subfield ", dump( $marc_repeatable_subfield ), $/;
205 warn "## marc_record ", dump( $marc_record ), $/;
206 warn "## sorted_marc_record ", dump( \@sorted_marc_record ), $/;
207 warn "## subfield count ", dump( $u ), $/;
208 }
209
210 my $len = $#sorted_marc_record;
211 my $visited;
212 my $i = 0;
213 my $field;
214
215 foreach ( 0 .. $len ) {
216
217 # find next element which isn't visited
218 while ($visited->{$i}) {
219 $i = ($i + 1) % ($len + 1);
220 }
221
222 # mark it visited
223 $visited->{$i}++;
224
225 my $row = $sorted_marc_record[$i];
226
227 # field and subfield which is key for
228 # marc_repeatable_subfield and u
229 my $fsf = $row->[0] . $row->[3];
230
231 if ($debug > 1) {
232
233 print "### field so far [", $#$field, "] : ", dump( $field ), " ", $field ? 'T' : 'F', $/;
234 print "### this [$i]: ", dump( $row ),$/;
235 print "### sf: ", $row->[3], " vs ", $field->[3],
236 $marc_repeatable_subfield->{ $row->[0] . $row->[3] } ? ' (repeatable)' : '', $/,
237 if ($#$field >= 0);
238
239 }
240
241 # if field exists
242 if ( $#$field >= 0 ) {
243 if (
244 $row->[0] ne $field->[0] || # field
245 $row->[1] ne $field->[1] || # i1
246 $row->[2] ne $field->[2] # i2
247 ) {
248 push @m, $field;
249 warn "## saved/1 ", dump( $field ),$/ if ($debug);
250 $field = $row;
251
252 } elsif (
253 ( $row->[3] lt $field->[-2] ) # subfield which is not next (e.g. a after c)
254 ||
255 ( $row->[3] eq $field->[-2] && # same subfield, but not repeatable
256 ! $marc_repeatable_subfield->{ $fsf }
257 )
258 ) {
259 push @m, $field;
260 warn "## saved/2 ", dump( $field ),$/ if ($debug);
261 $field = $row;
262
263 } else {
264 # append new subfields to existing field
265 push @$field, ( $row->[3], $row->[4] );
266 }
267 } else {
268 # insert first field
269 $field = $row;
270 }
271
272 if (! $marc_repeatable_subfield->{ $fsf }) {
273 # make step to next subfield
274 $i = ($i + $u->{ $fsf } ) % ($len + 1);
275 }
276 }
277
278 if ($#$field >= 0) {
279 push @m, $field;
280 warn "## saved/3 ", dump( $field ),$/ if ($debug);
281 }
282
283 return @m;
284 }
285
286 =head2 _debug
287
288 Change level of debug warnings
289
290 _debug( 2 );
291
292 =cut
293
294 sub _debug {
295 my $l = shift;
296 return $debug unless defined($l);
297 $debug = $l;
298 }
299
300 =head1 Functions to create C<data_structure>
301
302 Those functions generally have to first in your normalization file.
303
304 =head2 tag
305
306 Define new tag for I<search> and I<display>.
307
308 tag('Title', rec('200','a') );
309
310
311 =cut
312
313 sub tag {
314 my $name = shift or die "tag needs name as first argument";
315 my @o = grep { defined($_) && $_ ne '' } @_;
316 return unless (@o);
317 $out->{$name}->{tag} = $name;
318 $out->{$name}->{search} = \@o;
319 $out->{$name}->{display} = \@o;
320 }
321
322 =head2 display
323
324 Define tag just for I<display>
325
326 @v = display('Title', rec('200','a') );
327
328 =cut
329
330 sub display {
331 my $name = shift or die "display needs name as first argument";
332 my @o = grep { defined($_) && $_ ne '' } @_;
333 return unless (@o);
334 $out->{$name}->{tag} = $name;
335 $out->{$name}->{display} = \@o;
336 }
337
338 =head2 search
339
340 Prepare values just for I<search>
341
342 @v = search('Title', rec('200','a') );
343
344 =cut
345
346 sub search {
347 my $name = shift or die "search needs name as first argument";
348 my @o = grep { defined($_) && $_ ne '' } @_;
349 return unless (@o);
350 $out->{$name}->{tag} = $name;
351 $out->{$name}->{search} = \@o;
352 }
353
354 =head2 marc
355
356 Save value for MARC field
357
358 marc('900','a', rec('200','a') );
359
360 =cut
361
362 sub marc {
363 my $f = shift or die "marc needs field";
364 die "marc field must be numer" unless ($f =~ /^\d+$/);
365
366 my $sf = shift or die "marc needs subfield";
367
368 foreach (@_) {
369 my $v = $_; # make var read-write for Encode
370 next unless (defined($v) && $v !~ /^\s*$/);
371 from_to($v, 'iso-8859-2', $marc_encoding) if ($marc_encoding);
372 my ($i1,$i2) = defined($marc_indicators->{$f}) ? @{ $marc_indicators->{$f} } : (' ',' ');
373 push @{ $marc_record }, [ $f, $i1, $i2, $sf => $v ];
374 }
375 }
376
377 =head2 marc_repeatable_subfield
378
379 Save values for MARC repetable subfield
380
381 marc_repeatable_subfield('910', 'z', rec('909') );
382
383 =cut
384
385 sub marc_repeatable_subfield {
386 my ($f,$sf) = @_;
387 die "marc_repeatable_subfield need field and subfield!\n" unless ($f && $sf);
388 $marc_repeatable_subfield->{ $f . $sf }++;
389 marc(@_);
390 }
391
392 =head2 marc_indicators
393
394 Set both indicators for MARC field
395
396 marc_indicators('900', ' ', 1);
397
398 Any indicator value other than C<0-9> will be treated as undefined.
399
400 =cut
401
402 sub marc_indicators {
403 my $f = shift || die "marc_indicators need field!\n";
404 my ($i1,$i2) = @_;
405 die "marc_indicators($f, ...) need i1!\n" unless(defined($i1));
406 die "marc_indicators($f, $i1, ...) need i2!\n" unless(defined($i2));
407
408 $i1 = ' ' if ($i1 !~ /^\d$/);
409 $i2 = ' ' if ($i2 !~ /^\d$/);
410 @{ $marc_indicators->{$f} } = ($i1,$i2);
411 }
412
413
414 =head1 Functions to extract data from input
415
416 This function should be used inside functions to create C<data_structure> described
417 above.
418
419 =head2 rec1
420
421 Return all values in some field
422
423 @v = rec1('200')
424
425 TODO: order of values is probably same as in source data, need to investigate that
426
427 =cut
428
429 sub rec1 {
430 my $f = shift;
431 return unless (defined($rec) && defined($rec->{$f}));
432 if (ref($rec->{$f}) eq 'ARRAY') {
433 return map {
434 if (ref($_) eq 'HASH') {
435 values %{$_};
436 } else {
437 $_;
438 }
439 } @{ $rec->{$f} };
440 } elsif( defined($rec->{$f}) ) {
441 return $rec->{$f};
442 }
443 }
444
445 =head2 rec2
446
447 Return all values in specific field and subfield
448
449 @v = rec2('200','a')
450
451 =cut
452
453 sub rec2 {
454 my $f = shift;
455 return unless (defined($rec && $rec->{$f}));
456 my $sf = shift;
457 return map { $_->{$sf} } grep { ref($_) eq 'HASH' && $_->{$sf} } @{ $rec->{$f} };
458 }
459
460 =head2 rec
461
462 syntaxtic sugar for
463
464 @v = rec('200')
465 @v = rec('200','a')
466
467 =cut
468
469 sub rec {
470 if ($#_ == 0) {
471 return rec1(@_);
472 } elsif ($#_ == 1) {
473 return rec2(@_);
474 }
475 }
476
477 =head2 regex
478
479 Apply regex to some or all values
480
481 @v = regex( 's/foo/bar/g', @v );
482
483 =cut
484
485 sub regex {
486 my $r = shift;
487 my @out;
488 #warn "r: $r\n", dump(\@_);
489 foreach my $t (@_) {
490 next unless ($t);
491 eval "\$t =~ $r";
492 push @out, $t if ($t && $t ne '');
493 }
494 return @out;
495 }
496
497 =head2 prefix
498
499 Prefix all values with a string
500
501 @v = prefix( 'my_', @v );
502
503 =cut
504
505 sub prefix {
506 my $p = shift or die "prefix needs string as first argument";
507 return map { $p . $_ } grep { defined($_) } @_;
508 }
509
510 =head2 suffix
511
512 suffix all values with a string
513
514 @v = suffix( '_my', @v );
515
516 =cut
517
518 sub suffix {
519 my $s = shift or die "suffix needs string as first argument";
520 return map { $_ . $s } grep { defined($_) } @_;
521 }
522
523 =head2 surround
524
525 surround all values with a two strings
526
527 @v = surround( 'prefix_', '_suffix', @v );
528
529 =cut
530
531 sub surround {
532 my $p = shift or die "surround need prefix as first argument";
533 my $s = shift or die "surround needs suffix as second argument";
534 return map { $p . $_ . $s } grep { defined($_) } @_;
535 }
536
537 =head2 first
538
539 Return first element
540
541 $v = first( @v );
542
543 =cut
544
545 sub first {
546 my $r = shift;
547 return $r;
548 }
549
550 =head2 lookup
551
552 Consult lookup hashes for some value
553
554 @v = lookup( $v );
555 @v = lookup( @v );
556
557 =cut
558
559 sub lookup {
560 my $k = shift or return;
561 return unless (defined($lookup->{$k}));
562 if (ref($lookup->{$k}) eq 'ARRAY') {
563 return @{ $lookup->{$k} };
564 } else {
565 return $lookup->{$k};
566 }
567 }
568
569 =head2 join_with
570
571 Joins walues with some delimiter
572
573 $v = join_with(", ", @v);
574
575 =cut
576
577 sub join_with {
578 my $d = shift;
579 return join($d, grep { defined($_) && $_ ne '' } @_);
580 }
581
582 # END
583 1;

  ViewVC Help
Powered by ViewVC 1.1.26