/[webpac2]/trunk/lib/WebPAC/Normalize.pm
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Contents of /trunk/lib/WebPAC/Normalize.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 551 - (show annotations)
Fri Jun 30 20:43:09 2006 UTC (17 years, 9 months ago) by dpavlin
File size: 10822 byte(s)
 r750@llin:  dpavlin | 2006-06-30 22:34:44 +0200
 check if marc_record has values

1 package WebPAC::Normalize;
2 use Exporter 'import';
3 @EXPORT = qw/
4 _set_rec _set_lookup
5 _get_ds _clean_ds
6
7 tag search display
8 marc marc_indicators marc_repeatable_subfield
9
10 rec1 rec2 rec
11 regex prefix suffix surround
12 first lookup join_with
13 /;
14
15 use warnings;
16 use strict;
17
18 #use base qw/WebPAC::Common/;
19 use Data::Dump qw/dump/;
20 use Encode qw/from_to/;
21
22 # debugging warn(s)
23 my $debug = 0;
24
25
26 =head1 NAME
27
28 WebPAC::Normalize - describe normalisaton rules using sets
29
30 =head1 VERSION
31
32 Version 0.06
33
34 =cut
35
36 our $VERSION = '0.06';
37
38 =head1 SYNOPSIS
39
40 This module uses C<conf/normalize/*.pl> files to perform normalisation
41 from input records using perl functions which are specialized for set
42 processing.
43
44 Sets are implemented as arrays, and normalisation file is valid perl, which
45 means that you check it's validity before running WebPAC using
46 C<perl -c normalize.pl>.
47
48 Normalisation can generate multiple output normalized data. For now, supported output
49 types (on the left side of definition) are: C<tag>, C<display>, C<search> and
50 C<marc>.
51
52 =head1 FUNCTIONS
53
54 Functions which start with C<_> are private and used by WebPAC internally.
55 All other functions are available for use within normalisation rules.
56
57 =head2 data_structure
58
59 Return data structure
60
61 my $ds = WebPAC::Normalize::data_structure(
62 lookup => $lookup->lookup_hash,
63 row => $row,
64 rules => $normalize_pl_config,
65 marc_encoding => 'utf-8',
66 );
67
68 Options C<lookup>, C<row>, C<rules> and C<log> are mandatory while all
69 other are optional.
70
71 This function will B<die> if normalizastion can't be evaled.
72
73 Since this function isn't exported you have to call it with
74 C<WebPAC::Normalize::data_structure>.
75
76 =cut
77
78 sub data_structure {
79 my $arg = {@_};
80
81 die "need row argument" unless ($arg->{row});
82 die "need normalisation argument" unless ($arg->{rules});
83
84 no strict 'subs';
85 _set_lookup( $arg->{lookup} );
86 _set_rec( $arg->{row} );
87 _clean_ds( %{ $arg } );
88 eval "$arg->{rules}";
89 die "error evaling $arg->{rules}: $@\n" if ($@);
90
91 return _get_ds();
92 }
93
94 =head2 _set_rec
95
96 Set current record hash
97
98 _set_rec( $rec );
99
100 =cut
101
102 my $rec;
103
104 sub _set_rec {
105 $rec = shift or die "no record hash";
106 }
107
108 =head2 _get_ds
109
110 Return hash formatted as data structure
111
112 my $ds = _get_ds();
113
114 =cut
115
116 my ($out,$marc_record, $marc_encoding, $marc_repeatable_subfield, $marc_indicators);
117
118 sub _get_ds {
119 return $out;
120 }
121
122 =head2 _clean_ds
123
124 Clean data structure hash for next record
125
126 _clean_ds();
127
128 =cut
129
130 sub _clean_ds {
131 my $a = {@_};
132 ($out,$marc_record, $marc_encoding, $marc_repeatable_subfield, $marc_indicators) = ();
133 $marc_encoding = $a->{marc_encoding};
134 }
135
136 =head2 _set_lookup
137
138 Set current lookup hash
139
140 _set_lookup( $lookup );
141
142 =cut
143
144 my $lookup;
145
146 sub _set_lookup {
147 $lookup = shift;
148 }
149
150 =head2 _get_marc_fields
151
152 Get all fields defined by calls to C<marc>
153
154 $marc->add_fields( WebPAC::Normalize:_get_marc_fields() );
155
156
157
158 We are using I<magic> which detect repeatable fields only from
159 sequence of field/subfield data generated by normalization.
160
161 Repeatable field is created if there is second occurence of same subfield or
162 if any of indicators are different. This is sane for most cases except for
163 non-repeatable fields with repeatable subfields.
164
165 You can change behaviour of that using C<marc_repeatable_subfield>.
166
167 =cut
168
169 sub _get_marc_fields {
170
171 return if (! $marc_record || ref($marc_record) ne 'ARRAY' || $#{ $marc_record } < 0);
172
173 # first, sort all existing fields
174 # XXX might not be needed, but modern perl might randomize elements in hash
175 my @sorted_marc_record = sort {
176 $a->[0] . $a->[3] cmp $b->[0] . $b->[3]
177 } @{ $marc_record };
178
179 # output marc fields
180 my @m;
181
182 # count unique field-subfields (used for offset when walking to next subfield)
183 my $u;
184 map { $u->{ $_->[0] . $_->[3] }++ } @sorted_marc_record;
185
186 if ($debug) {
187 warn "## marc_repeatable_subfield ", dump( $marc_repeatable_subfield ), $/;
188 warn "## marc_record ", dump( $marc_record ), $/;
189 warn "## sorted_marc_record ", dump( \@sorted_marc_record ), $/;
190 warn "## subfield count ", dump( $u ), $/;
191 }
192
193 my $len = $#sorted_marc_record;
194 my $visited;
195 my $i = 0;
196 my $field;
197
198 foreach ( 0 .. $len ) {
199
200 # find next element which isn't visited
201 while ($visited->{$i}) {
202 $i = ($i + 1) % ($len + 1);
203 }
204
205 # mark it visited
206 $visited->{$i}++;
207
208 my $row = $sorted_marc_record[$i];
209
210 # field and subfield which is key for
211 # marc_repeatable_subfield and u
212 my $fsf = $row->[0] . $row->[3];
213
214 if ($debug > 1) {
215
216 print "### field so far [", $#$field, "] : ", dump( $field ), " ", $field ? 'T' : 'F', $/;
217 print "### this [$i]: ", dump( $row ),$/;
218 print "### sf: ", $row->[3], " vs ", $field->[3],
219 $marc_repeatable_subfield->{ $row->[0] . $row->[3] } ? ' (repeatable)' : '', $/,
220 if ($#$field >= 0);
221
222 }
223
224 # if field exists
225 if ( $#$field >= 0 ) {
226 if (
227 $row->[0] ne $field->[0] || # field
228 $row->[1] ne $field->[1] || # i1
229 $row->[2] ne $field->[2] # i2
230 ) {
231 push @m, $field;
232 warn "## saved/1 ", dump( $field ),$/ if ($debug);
233 $field = $row;
234
235 } elsif (
236 ( $row->[3] lt $field->[-2] ) # subfield which is not next (e.g. a after c)
237 ||
238 ( $row->[3] eq $field->[-2] && # same subfield, but not repeatable
239 ! $marc_repeatable_subfield->{ $fsf }
240 )
241 ) {
242 push @m, $field;
243 warn "## saved/2 ", dump( $field ),$/ if ($debug);
244 $field = $row;
245
246 } else {
247 # append new subfields to existing field
248 push @$field, ( $row->[3], $row->[4] );
249 }
250 } else {
251 # insert first field
252 $field = $row;
253 }
254
255 if (! $marc_repeatable_subfield->{ $fsf }) {
256 # make step to next subfield
257 $i = ($i + $u->{ $fsf } ) % ($len + 1);
258 }
259 }
260
261 if ($#$field >= 0) {
262 push @m, $field;
263 warn "## saved/3 ", dump( $field ),$/ if ($debug);
264 }
265
266 return @m;
267 }
268
269 =head1 Functions to create C<data_structure>
270
271 Those functions generally have to first in your normalization file.
272
273 =head2 tag
274
275 Define new tag for I<search> and I<display>.
276
277 tag('Title', rec('200','a') );
278
279
280 =cut
281
282 sub tag {
283 my $name = shift or die "tag needs name as first argument";
284 my @o = grep { defined($_) && $_ ne '' } @_;
285 return unless (@o);
286 $out->{$name}->{tag} = $name;
287 $out->{$name}->{search} = \@o;
288 $out->{$name}->{display} = \@o;
289 }
290
291 =head2 display
292
293 Define tag just for I<display>
294
295 @v = display('Title', rec('200','a') );
296
297 =cut
298
299 sub display {
300 my $name = shift or die "display needs name as first argument";
301 my @o = grep { defined($_) && $_ ne '' } @_;
302 return unless (@o);
303 $out->{$name}->{tag} = $name;
304 $out->{$name}->{display} = \@o;
305 }
306
307 =head2 search
308
309 Prepare values just for I<search>
310
311 @v = search('Title', rec('200','a') );
312
313 =cut
314
315 sub search {
316 my $name = shift or die "search needs name as first argument";
317 my @o = grep { defined($_) && $_ ne '' } @_;
318 return unless (@o);
319 $out->{$name}->{tag} = $name;
320 $out->{$name}->{search} = \@o;
321 }
322
323 =head2 marc
324
325 Save value for MARC field
326
327 marc('900','a', rec('200','a') );
328
329 =cut
330
331 sub marc {
332 my $f = shift or die "marc needs field";
333 die "marc field must be numer" unless ($f =~ /^\d+$/);
334
335 my $sf = shift or die "marc needs subfield";
336
337 foreach (@_) {
338 my $v = $_; # make var read-write for Encode
339 next unless (defined($v) && $v !~ /^\s*$/);
340 from_to($v, 'iso-8859-2', $marc_encoding) if ($marc_encoding);
341 my ($i1,$i2) = defined($marc_indicators->{$f}) ? @{ $marc_indicators->{$f} } : (' ',' ');
342 push @{ $marc_record }, [ $f, $i1, $i2, $sf => $v ];
343 }
344 }
345
346 =head2 marc_repeatable_subfield
347
348 Save values for MARC repetable subfield
349
350 marc_repeatable_subfield('910', 'z', rec('909') );
351
352 =cut
353
354 sub marc_repeatable_subfield {
355 my ($f,$sf) = @_;
356 die "marc_repeatable_subfield need field and subfield!\n" unless ($f && $sf);
357 $marc_repeatable_subfield->{ $f . $sf }++;
358 marc(@_);
359 }
360
361 =head2 marc_indicators
362
363 Set both indicators for MARC field
364
365 marc_indicators('900', ' ', 1);
366
367 Any indicator value other than C<0-9> will be treated as undefined.
368
369 =cut
370
371 sub marc_indicators {
372 my $f = shift || die "marc_indicators need field!\n";
373 my ($i1,$i2) = @_;
374 die "marc_indicators($f, ...) need i1!\n" unless(defined($i1));
375 die "marc_indicators($f, $i1, ...) need i2!\n" unless(defined($i2));
376
377 $i1 = ' ' if ($i1 !~ /^\d$/);
378 $i2 = ' ' if ($i2 !~ /^\d$/);
379 @{ $marc_indicators->{$f} } = ($i1,$i2);
380 }
381
382
383 =head1 Functions to extract data from input
384
385 This function should be used inside functions to create C<data_structure> described
386 above.
387
388 =head2 rec1
389
390 Return all values in some field
391
392 @v = rec1('200')
393
394 TODO: order of values is probably same as in source data, need to investigate that
395
396 =cut
397
398 sub rec1 {
399 my $f = shift;
400 return unless (defined($rec) && defined($rec->{$f}));
401 if (ref($rec->{$f}) eq 'ARRAY') {
402 return map {
403 if (ref($_) eq 'HASH') {
404 values %{$_};
405 } else {
406 $_;
407 }
408 } @{ $rec->{$f} };
409 } elsif( defined($rec->{$f}) ) {
410 return $rec->{$f};
411 }
412 }
413
414 =head2 rec2
415
416 Return all values in specific field and subfield
417
418 @v = rec2('200','a')
419
420 =cut
421
422 sub rec2 {
423 my $f = shift;
424 return unless (defined($rec && $rec->{$f}));
425 my $sf = shift;
426 return map { $_->{$sf} } grep { ref($_) eq 'HASH' && $_->{$sf} } @{ $rec->{$f} };
427 }
428
429 =head2 rec
430
431 syntaxtic sugar for
432
433 @v = rec('200')
434 @v = rec('200','a')
435
436 =cut
437
438 sub rec {
439 if ($#_ == 0) {
440 return rec1(@_);
441 } elsif ($#_ == 1) {
442 return rec2(@_);
443 }
444 }
445
446 =head2 regex
447
448 Apply regex to some or all values
449
450 @v = regex( 's/foo/bar/g', @v );
451
452 =cut
453
454 sub regex {
455 my $r = shift;
456 my @out;
457 #warn "r: $r\n", dump(\@_);
458 foreach my $t (@_) {
459 next unless ($t);
460 eval "\$t =~ $r";
461 push @out, $t if ($t && $t ne '');
462 }
463 return @out;
464 }
465
466 =head2 prefix
467
468 Prefix all values with a string
469
470 @v = prefix( 'my_', @v );
471
472 =cut
473
474 sub prefix {
475 my $p = shift or die "prefix needs string as first argument";
476 return map { $p . $_ } grep { defined($_) } @_;
477 }
478
479 =head2 suffix
480
481 suffix all values with a string
482
483 @v = suffix( '_my', @v );
484
485 =cut
486
487 sub suffix {
488 my $s = shift or die "suffix needs string as first argument";
489 return map { $_ . $s } grep { defined($_) } @_;
490 }
491
492 =head2 surround
493
494 surround all values with a two strings
495
496 @v = surround( 'prefix_', '_suffix', @v );
497
498 =cut
499
500 sub surround {
501 my $p = shift or die "surround need prefix as first argument";
502 my $s = shift or die "surround needs suffix as second argument";
503 return map { $p . $_ . $s } grep { defined($_) } @_;
504 }
505
506 =head2 first
507
508 Return first element
509
510 $v = first( @v );
511
512 =cut
513
514 sub first {
515 my $r = shift;
516 return $r;
517 }
518
519 =head2 lookup
520
521 Consult lookup hashes for some value
522
523 @v = lookup( $v );
524 @v = lookup( @v );
525
526 =cut
527
528 sub lookup {
529 my $k = shift or return;
530 return unless (defined($lookup->{$k}));
531 if (ref($lookup->{$k}) eq 'ARRAY') {
532 return @{ $lookup->{$k} };
533 } else {
534 return $lookup->{$k};
535 }
536 }
537
538 =head2 join_with
539
540 Joins walues with some delimiter
541
542 $v = join_with(", ", @v);
543
544 =cut
545
546 sub join_with {
547 my $d = shift;
548 return join($d, grep { defined($_) && $_ ne '' } @_);
549 }
550
551 # END
552 1;

  ViewVC Help
Powered by ViewVC 1.1.26