/[webpac2]/trunk/lib/WebPAC/Normalize.pm
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Contents of /trunk/lib/WebPAC/Normalize.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 550 - (show annotations)
Fri Jun 30 18:48:33 2006 UTC (17 years, 9 months ago) by dpavlin
File size: 10735 byte(s)
 r748@llin:  dpavlin | 2006-06-30 20:48:29 +0200
 re-implement magic again (so that it actually work in all cases consistant).
 Depend on Data::Dump to enable nice output.

1 package WebPAC::Normalize;
2 use Exporter 'import';
3 @EXPORT = qw/
4 _set_rec _set_lookup
5 _get_ds _clean_ds
6
7 tag search display
8 marc marc_indicators marc_repeatable_subfield
9
10 rec1 rec2 rec
11 regex prefix suffix surround
12 first lookup join_with
13 /;
14
15 use warnings;
16 use strict;
17
18 #use base qw/WebPAC::Common/;
19 use Data::Dump qw/dump/;
20 use Encode qw/from_to/;
21
22 # debugging warn(s)
23 my $debug = 0;
24
25
26 =head1 NAME
27
28 WebPAC::Normalize - describe normalisaton rules using sets
29
30 =head1 VERSION
31
32 Version 0.06
33
34 =cut
35
36 our $VERSION = '0.06';
37
38 =head1 SYNOPSIS
39
40 This module uses C<conf/normalize/*.pl> files to perform normalisation
41 from input records using perl functions which are specialized for set
42 processing.
43
44 Sets are implemented as arrays, and normalisation file is valid perl, which
45 means that you check it's validity before running WebPAC using
46 C<perl -c normalize.pl>.
47
48 Normalisation can generate multiple output normalized data. For now, supported output
49 types (on the left side of definition) are: C<tag>, C<display>, C<search> and
50 C<marc>.
51
52 =head1 FUNCTIONS
53
54 Functions which start with C<_> are private and used by WebPAC internally.
55 All other functions are available for use within normalisation rules.
56
57 =head2 data_structure
58
59 Return data structure
60
61 my $ds = WebPAC::Normalize::data_structure(
62 lookup => $lookup->lookup_hash,
63 row => $row,
64 rules => $normalize_pl_config,
65 marc_encoding => 'utf-8',
66 );
67
68 Options C<lookup>, C<row>, C<rules> and C<log> are mandatory while all
69 other are optional.
70
71 This function will B<die> if normalizastion can't be evaled.
72
73 Since this function isn't exported you have to call it with
74 C<WebPAC::Normalize::data_structure>.
75
76 =cut
77
78 sub data_structure {
79 my $arg = {@_};
80
81 die "need row argument" unless ($arg->{row});
82 die "need normalisation argument" unless ($arg->{rules});
83
84 no strict 'subs';
85 _set_lookup( $arg->{lookup} );
86 _set_rec( $arg->{row} );
87 _clean_ds( %{ $arg } );
88 eval "$arg->{rules}";
89 die "error evaling $arg->{rules}: $@\n" if ($@);
90
91 return _get_ds();
92 }
93
94 =head2 _set_rec
95
96 Set current record hash
97
98 _set_rec( $rec );
99
100 =cut
101
102 my $rec;
103
104 sub _set_rec {
105 $rec = shift or die "no record hash";
106 }
107
108 =head2 _get_ds
109
110 Return hash formatted as data structure
111
112 my $ds = _get_ds();
113
114 =cut
115
116 my ($out,$marc_record, $marc_encoding, $marc_repeatable_subfield, $marc_indicators);
117
118 sub _get_ds {
119 return $out;
120 }
121
122 =head2 _clean_ds
123
124 Clean data structure hash for next record
125
126 _clean_ds();
127
128 =cut
129
130 sub _clean_ds {
131 my $a = {@_};
132 ($out,$marc_record, $marc_encoding, $marc_repeatable_subfield, $marc_indicators) = ();
133 $marc_encoding = $a->{marc_encoding};
134 }
135
136 =head2 _set_lookup
137
138 Set current lookup hash
139
140 _set_lookup( $lookup );
141
142 =cut
143
144 my $lookup;
145
146 sub _set_lookup {
147 $lookup = shift;
148 }
149
150 =head2 _get_marc_fields
151
152 Get all fields defined by calls to C<marc>
153
154 $marc->add_fields( WebPAC::Normalize:_get_marc_fields() );
155
156
157
158 We are using I<magic> which detect repeatable fields only from
159 sequence of field/subfield data generated by normalization.
160
161 Repeatable field is created if there is second occurence of same subfield or
162 if any of indicators are different. This is sane for most cases except for
163 non-repeatable fields with repeatable subfields.
164
165 You can change behaviour of that using C<marc_repeatable_subfield>.
166
167 =cut
168
169 sub _get_marc_fields {
170
171
172 # first, sort all existing fields
173 # XXX might not be needed, but modern perl might randomize elements in hash
174 my @sorted_marc_record = sort {
175 $a->[0] . $a->[3] cmp $b->[0] . $b->[3]
176 } @{ $marc_record };
177
178 # output marc fields
179 my @m;
180
181 # count unique field-subfields (used for offset when walking to next subfield)
182 my $u;
183 map { $u->{ $_->[0] . $_->[3] }++ } @sorted_marc_record;
184
185 if ($debug) {
186 warn "## marc_repeatable_subfield ", dump( $marc_repeatable_subfield ), $/;
187 warn "## marc_record ", dump( $marc_record ), $/;
188 warn "## sorted_marc_record ", dump( \@sorted_marc_record ), $/;
189 warn "## subfield count ", dump( $u ), $/;
190 }
191
192 my $len = $#sorted_marc_record;
193 my $visited;
194 my $i = 0;
195 my $field;
196
197 foreach ( 0 .. $len ) {
198
199 # find next element which isn't visited
200 while ($visited->{$i}) {
201 $i = ($i + 1) % ($len + 1);
202 }
203
204 # mark it visited
205 $visited->{$i}++;
206
207 my $row = $sorted_marc_record[$i];
208
209 # field and subfield which is key for
210 # marc_repeatable_subfield and u
211 my $fsf = $row->[0] . $row->[3];
212
213 if ($debug > 1) {
214
215 print "### field so far [", $#$field, "] : ", dump( $field ), " ", $field ? 'T' : 'F', $/;
216 print "### this [$i]: ", dump( $row ),$/;
217 print "### sf: ", $row->[3], " vs ", $field->[3],
218 $marc_repeatable_subfield->{ $row->[0] . $row->[3] } ? ' (repeatable)' : '', $/,
219 if ($#$field >= 0);
220
221 }
222
223 # if field exists
224 if ( $#$field >= 0 ) {
225 if (
226 $row->[0] ne $field->[0] || # field
227 $row->[1] ne $field->[1] || # i1
228 $row->[2] ne $field->[2] # i2
229 ) {
230 push @m, $field;
231 warn "## saved/1 ", dump( $field ),$/ if ($debug);
232 $field = $row;
233
234 } elsif (
235 ( $row->[3] lt $field->[-2] ) # subfield which is not next (e.g. a after c)
236 ||
237 ( $row->[3] eq $field->[-2] && # same subfield, but not repeatable
238 ! $marc_repeatable_subfield->{ $fsf }
239 )
240 ) {
241 push @m, $field;
242 warn "## saved/2 ", dump( $field ),$/ if ($debug);
243 $field = $row;
244
245 } else {
246 # append new subfields to existing field
247 push @$field, ( $row->[3], $row->[4] );
248 }
249 } else {
250 # insert first field
251 $field = $row;
252 }
253
254 if (! $marc_repeatable_subfield->{ $fsf }) {
255 # make step to next subfield
256 $i = ($i + $u->{ $fsf } ) % ($len + 1);
257 }
258 }
259
260 if ($#$field >= 0) {
261 push @m, $field;
262 warn "## saved/3 ", dump( $field ),$/ if ($debug);
263 }
264
265 return @m;
266 }
267
268 =head1 Functions to create C<data_structure>
269
270 Those functions generally have to first in your normalization file.
271
272 =head2 tag
273
274 Define new tag for I<search> and I<display>.
275
276 tag('Title', rec('200','a') );
277
278
279 =cut
280
281 sub tag {
282 my $name = shift or die "tag needs name as first argument";
283 my @o = grep { defined($_) && $_ ne '' } @_;
284 return unless (@o);
285 $out->{$name}->{tag} = $name;
286 $out->{$name}->{search} = \@o;
287 $out->{$name}->{display} = \@o;
288 }
289
290 =head2 display
291
292 Define tag just for I<display>
293
294 @v = display('Title', rec('200','a') );
295
296 =cut
297
298 sub display {
299 my $name = shift or die "display needs name as first argument";
300 my @o = grep { defined($_) && $_ ne '' } @_;
301 return unless (@o);
302 $out->{$name}->{tag} = $name;
303 $out->{$name}->{display} = \@o;
304 }
305
306 =head2 search
307
308 Prepare values just for I<search>
309
310 @v = search('Title', rec('200','a') );
311
312 =cut
313
314 sub search {
315 my $name = shift or die "search needs name as first argument";
316 my @o = grep { defined($_) && $_ ne '' } @_;
317 return unless (@o);
318 $out->{$name}->{tag} = $name;
319 $out->{$name}->{search} = \@o;
320 }
321
322 =head2 marc
323
324 Save value for MARC field
325
326 marc('900','a', rec('200','a') );
327
328 =cut
329
330 sub marc {
331 my $f = shift or die "marc needs field";
332 die "marc field must be numer" unless ($f =~ /^\d+$/);
333
334 my $sf = shift or die "marc needs subfield";
335
336 foreach (@_) {
337 my $v = $_; # make var read-write for Encode
338 next unless (defined($v) && $v !~ /^\s*$/);
339 from_to($v, 'iso-8859-2', $marc_encoding) if ($marc_encoding);
340 my ($i1,$i2) = defined($marc_indicators->{$f}) ? @{ $marc_indicators->{$f} } : (' ',' ');
341 push @{ $marc_record }, [ $f, $i1, $i2, $sf => $v ];
342 }
343 }
344
345 =head2 marc_repeatable_subfield
346
347 Save values for MARC repetable subfield
348
349 marc_repeatable_subfield('910', 'z', rec('909') );
350
351 =cut
352
353 sub marc_repeatable_subfield {
354 my ($f,$sf) = @_;
355 die "marc_repeatable_subfield need field and subfield!\n" unless ($f && $sf);
356 $marc_repeatable_subfield->{ $f . $sf }++;
357 marc(@_);
358 }
359
360 =head2 marc_indicators
361
362 Set both indicators for MARC field
363
364 marc_indicators('900', ' ', 1);
365
366 Any indicator value other than C<0-9> will be treated as undefined.
367
368 =cut
369
370 sub marc_indicators {
371 my $f = shift || die "marc_indicators need field!\n";
372 my ($i1,$i2) = @_;
373 die "marc_indicators($f, ...) need i1!\n" unless(defined($i1));
374 die "marc_indicators($f, $i1, ...) need i2!\n" unless(defined($i2));
375
376 $i1 = ' ' if ($i1 !~ /^\d$/);
377 $i2 = ' ' if ($i2 !~ /^\d$/);
378 @{ $marc_indicators->{$f} } = ($i1,$i2);
379 }
380
381
382 =head1 Functions to extract data from input
383
384 This function should be used inside functions to create C<data_structure> described
385 above.
386
387 =head2 rec1
388
389 Return all values in some field
390
391 @v = rec1('200')
392
393 TODO: order of values is probably same as in source data, need to investigate that
394
395 =cut
396
397 sub rec1 {
398 my $f = shift;
399 return unless (defined($rec) && defined($rec->{$f}));
400 if (ref($rec->{$f}) eq 'ARRAY') {
401 return map {
402 if (ref($_) eq 'HASH') {
403 values %{$_};
404 } else {
405 $_;
406 }
407 } @{ $rec->{$f} };
408 } elsif( defined($rec->{$f}) ) {
409 return $rec->{$f};
410 }
411 }
412
413 =head2 rec2
414
415 Return all values in specific field and subfield
416
417 @v = rec2('200','a')
418
419 =cut
420
421 sub rec2 {
422 my $f = shift;
423 return unless (defined($rec && $rec->{$f}));
424 my $sf = shift;
425 return map { $_->{$sf} } grep { ref($_) eq 'HASH' && $_->{$sf} } @{ $rec->{$f} };
426 }
427
428 =head2 rec
429
430 syntaxtic sugar for
431
432 @v = rec('200')
433 @v = rec('200','a')
434
435 =cut
436
437 sub rec {
438 if ($#_ == 0) {
439 return rec1(@_);
440 } elsif ($#_ == 1) {
441 return rec2(@_);
442 }
443 }
444
445 =head2 regex
446
447 Apply regex to some or all values
448
449 @v = regex( 's/foo/bar/g', @v );
450
451 =cut
452
453 sub regex {
454 my $r = shift;
455 my @out;
456 #warn "r: $r\n", dump(\@_);
457 foreach my $t (@_) {
458 next unless ($t);
459 eval "\$t =~ $r";
460 push @out, $t if ($t && $t ne '');
461 }
462 return @out;
463 }
464
465 =head2 prefix
466
467 Prefix all values with a string
468
469 @v = prefix( 'my_', @v );
470
471 =cut
472
473 sub prefix {
474 my $p = shift or die "prefix needs string as first argument";
475 return map { $p . $_ } grep { defined($_) } @_;
476 }
477
478 =head2 suffix
479
480 suffix all values with a string
481
482 @v = suffix( '_my', @v );
483
484 =cut
485
486 sub suffix {
487 my $s = shift or die "suffix needs string as first argument";
488 return map { $_ . $s } grep { defined($_) } @_;
489 }
490
491 =head2 surround
492
493 surround all values with a two strings
494
495 @v = surround( 'prefix_', '_suffix', @v );
496
497 =cut
498
499 sub surround {
500 my $p = shift or die "surround need prefix as first argument";
501 my $s = shift or die "surround needs suffix as second argument";
502 return map { $p . $_ . $s } grep { defined($_) } @_;
503 }
504
505 =head2 first
506
507 Return first element
508
509 $v = first( @v );
510
511 =cut
512
513 sub first {
514 my $r = shift;
515 return $r;
516 }
517
518 =head2 lookup
519
520 Consult lookup hashes for some value
521
522 @v = lookup( $v );
523 @v = lookup( @v );
524
525 =cut
526
527 sub lookup {
528 my $k = shift or return;
529 return unless (defined($lookup->{$k}));
530 if (ref($lookup->{$k}) eq 'ARRAY') {
531 return @{ $lookup->{$k} };
532 } else {
533 return $lookup->{$k};
534 }
535 }
536
537 =head2 join_with
538
539 Joins walues with some delimiter
540
541 $v = join_with(", ", @v);
542
543 =cut
544
545 sub join_with {
546 my $d = shift;
547 return join($d, grep { defined($_) && $_ ne '' } @_);
548 }
549
550 # END
551 1;

  ViewVC Help
Powered by ViewVC 1.1.26