/[webpac2]/trunk/lib/WebPAC/Normalize.pm
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Contents of /trunk/lib/WebPAC/Normalize.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 889 - (show annotations)
Thu Sep 6 19:12:15 2007 UTC (16 years, 7 months ago) by dpavlin
File size: 30494 byte(s)
make missing value in marc_fixed confess and fixed warning

1 package WebPAC::Normalize;
2 use Exporter 'import';
3 @EXPORT = qw/
4 _set_rec _set_lookup
5 _set_load_row
6 _get_ds _clean_ds
7 _debug
8 _pack_subfields_hash
9
10 tag search display
11 marc marc_indicators marc_repeatable_subfield
12 marc_compose marc_leader marc_fixed
13 marc_duplicate marc_remove marc_count
14 marc_original_order
15
16 rec1 rec2 rec
17 regex prefix suffix surround
18 first lookup join_with
19 save_into_lookup
20
21 split_rec_on
22
23 get set
24 count
25 /;
26
27 use warnings;
28 use strict;
29
30 #use base qw/WebPAC::Common/;
31 use Data::Dump qw/dump/;
32 use Storable qw/dclone/;
33 use Carp qw/confess/;
34
35 # debugging warn(s)
36 my $debug = 0;
37
38
39 =head1 NAME
40
41 WebPAC::Normalize - describe normalisaton rules using sets
42
43 =head1 VERSION
44
45 Version 0.29
46
47 =cut
48
49 our $VERSION = '0.29';
50
51 =head1 SYNOPSIS
52
53 This module uses C<conf/normalize/*.pl> files to perform normalisation
54 from input records using perl functions which are specialized for set
55 processing.
56
57 Sets are implemented as arrays, and normalisation file is valid perl, which
58 means that you check it's validity before running WebPAC using
59 C<perl -c normalize.pl>.
60
61 Normalisation can generate multiple output normalized data. For now, supported output
62 types (on the left side of definition) are: C<tag>, C<display>, C<search> and
63 C<marc>.
64
65 =head1 FUNCTIONS
66
67 Functions which start with C<_> are private and used by WebPAC internally.
68 All other functions are available for use within normalisation rules.
69
70 =head2 data_structure
71
72 Return data structure
73
74 my $ds = WebPAC::Normalize::data_structure(
75 lookup => $lookup_hash,
76 row => $row,
77 rules => $normalize_pl_config,
78 marc_encoding => 'utf-8',
79 config => $config,
80 load_row_coderef => sub {
81 my ($database,$input,$mfn) = shift;
82 $store->load_row( database => $database, input => $input, id => $mfn );
83 },
84 );
85
86 Options C<row>, C<rules> and C<log> are mandatory while all
87 other are optional.
88
89 C<load_row_coderef> is closure only used when executing lookups, so they will
90 die if it's not defined.
91
92 This function will B<die> if normalizastion can't be evaled.
93
94 Since this function isn't exported you have to call it with
95 C<WebPAC::Normalize::data_structure>.
96
97 =cut
98
99 my $load_row_coderef;
100
101 sub data_structure {
102 my $arg = {@_};
103
104 die "need row argument" unless ($arg->{row});
105 die "need normalisation argument" unless ($arg->{rules});
106
107 no strict 'subs';
108 _set_lookup( $arg->{lookup} ) if defined($arg->{lookup});
109 _set_rec( $arg->{row} );
110 _set_config( $arg->{config} ) if defined($arg->{config});
111 _clean_ds( %{ $arg } );
112 $load_row_coderef = $arg->{load_row_coderef};
113
114 eval "$arg->{rules}";
115 die "error evaling $arg->{rules}: $@\n" if ($@);
116
117 return _get_ds();
118 }
119
120 =head2 _set_rec
121
122 Set current record hash
123
124 _set_rec( $rec );
125
126 =cut
127
128 my $rec;
129
130 sub _set_rec {
131 $rec = shift or die "no record hash";
132 }
133
134 =head2 _set_config
135
136 Set current config hash
137
138 _set_config( $config );
139
140 Magic keys are:
141
142 =over 4
143
144 =item _
145
146 Code of current database
147
148 =item _mfn
149
150 Current MFN
151
152 =back
153
154 =cut
155
156 my $config;
157
158 sub _set_config {
159 $config = shift;
160 }
161
162 =head2 _get_ds
163
164 Return hash formatted as data structure
165
166 my $ds = _get_ds();
167
168 =cut
169
170 my ($out, $marc_record, $marc_encoding, $marc_repeatable_subfield, $marc_indicators, $marc_leader);
171 my ($marc_record_offset, $marc_fetch_offset) = (0, 0);
172
173 sub _get_ds {
174 return $out;
175 }
176
177 =head2 _clean_ds
178
179 Clean data structure hash for next record
180
181 _clean_ds();
182
183 =cut
184
185 sub _clean_ds {
186 my $a = {@_};
187 ($out,$marc_record, $marc_encoding, $marc_repeatable_subfield, $marc_indicators, $marc_leader) = ();
188 ($marc_record_offset, $marc_fetch_offset) = (0,0);
189 $marc_encoding = $a->{marc_encoding};
190 }
191
192 =head2 _set_lookup
193
194 Set current lookup hash
195
196 _set_lookup( $lookup );
197
198 =cut
199
200 my $lookup;
201
202 sub _set_lookup {
203 $lookup = shift;
204 }
205
206 =head2 _get_lookup
207
208 Get current lookup hash
209
210 my $lookup = _get_lookup();
211
212 =cut
213
214 sub _get_lookup {
215 return $lookup;
216 }
217
218 =head2 _set_load_row
219
220 Setup code reference which will return L<data_structure> from
221 L<WebPAC::Store>
222
223 _set_load_row(sub {
224 my ($database,$input,$mfn) = @_;
225 $store->load_row( database => $database, input => $input, id => $mfn );
226 });
227
228 =cut
229
230 sub _set_load_row {
231 my $coderef = shift;
232 confess "argument isn't CODE" unless ref($coderef) eq 'CODE';
233
234 $load_row_coderef = $coderef;
235 }
236
237 =head2 _get_marc_fields
238
239 Get all fields defined by calls to C<marc>
240
241 $marc->add_fields( WebPAC::Normalize:_get_marc_fields() );
242
243 We are using I<magic> which detect repeatable fields only from
244 sequence of field/subfield data generated by normalization.
245
246 Repeatable field is created when there is second occurence of same subfield or
247 if any of indicators are different.
248
249 This is sane for most cases. Something like:
250
251 900a-1 900b-1 900c-1
252 900a-2 900b-2
253 900a-3
254
255 will be created from any combination of:
256
257 900a-1 900a-2 900a-3 900b-1 900b-2 900c-1
258
259 and following rules:
260
261 marc('900','a', rec('200','a') );
262 marc('900','b', rec('200','b') );
263 marc('900','c', rec('200','c') );
264
265 which might not be what you have in mind. If you need repeatable subfield,
266 define it using C<marc_repeatable_subfield> like this:
267
268 marc_repeatable_subfield('900','a');
269 marc('900','a', rec('200','a') );
270 marc('900','b', rec('200','b') );
271 marc('900','c', rec('200','c') );
272
273 will create:
274
275 900a-1 900a-2 900a-3 900b-1 900c-1
276 900b-2
277
278 There is also support for returning next or specific using:
279
280 while (my $mf = WebPAC::Normalize:_get_marc_fields( fetch_next => 1 ) ) {
281 # do something with $mf
282 }
283
284 will always return fields from next MARC record or
285
286 my $mf = WebPAC::Normalize::_get_marc_fields( offset => 42 );
287
288 will return 42th copy record (if it exists).
289
290 =cut
291
292 my $fetch_pos;
293
294 sub _get_marc_fields {
295
296 my $arg = {@_};
297 warn "### _get_marc_fields arg: ", dump($arg), $/ if ($debug > 2);
298 $fetch_pos = $marc_fetch_offset;
299 if ($arg->{offset}) {
300 $fetch_pos = $arg->{offset};
301 } elsif($arg->{fetch_next}) {
302 $marc_fetch_offset++;
303 }
304
305 return if (! $marc_record || ref($marc_record) ne 'ARRAY');
306
307 warn "### full marc_record = ", dump( @{ $marc_record }), $/ if ($debug > 2);
308
309 my $marc_rec = $marc_record->[ $fetch_pos ];
310
311 warn "## _get_marc_fields (at offset: $fetch_pos) -- marc_record = ", dump( @$marc_rec ), $/ if ($debug > 1);
312
313 return if (! $marc_rec || ref($marc_rec) ne 'ARRAY' || $#{ $marc_rec } < 0);
314
315 # first, sort all existing fields
316 # XXX might not be needed, but modern perl might randomize elements in hash
317 my @sorted_marc_record = sort {
318 $a->[0] . ( $a->[3] || '' ) cmp $b->[0] . ( $b->[3] || '')
319 } @{ $marc_rec };
320
321 @sorted_marc_record = @{ $marc_rec }; ### FIXME disable sorting
322
323 # output marc fields
324 my @m;
325
326 # count unique field-subfields (used for offset when walking to next subfield)
327 my $u;
328 map { $u->{ $_->[0] . ( $_->[3] || '') }++ } @sorted_marc_record;
329
330 if ($debug) {
331 warn "## marc_repeatable_subfield = ", dump( $marc_repeatable_subfield ), $/ if ( $marc_repeatable_subfield );
332 warn "## marc_record[$fetch_pos] = ", dump( $marc_rec ), $/;
333 warn "## sorted_marc_record = ", dump( \@sorted_marc_record ), $/;
334 warn "## subfield count = ", dump( $u ), $/;
335 }
336
337 my $len = $#sorted_marc_record;
338 my $visited;
339 my $i = 0;
340 my $field;
341
342 foreach ( 0 .. $len ) {
343
344 # find next element which isn't visited
345 while ($visited->{$i}) {
346 $i = ($i + 1) % ($len + 1);
347 }
348
349 # mark it visited
350 $visited->{$i}++;
351
352 my $row = dclone( $sorted_marc_record[$i] );
353
354 # field and subfield which is key for
355 # marc_repeatable_subfield and u
356 my $fsf = $row->[0] . ( $row->[3] || '' );
357
358 if ($debug > 1) {
359
360 print "### field so far [", $#$field, "] : ", dump( $field ), " ", $field ? 'T' : 'F', $/;
361 print "### this [$i]: ", dump( $row ),$/;
362 print "### sf: ", $row->[3], " vs ", $field->[3],
363 $marc_repeatable_subfield->{ $row->[0] . $row->[3] } ? ' (repeatable)' : '', $/,
364 if ($#$field >= 0);
365
366 }
367
368 # if field exists
369 if ( $#$field >= 0 ) {
370 if (
371 $row->[0] ne $field->[0] || # field
372 $row->[1] ne $field->[1] || # i1
373 $row->[2] ne $field->[2] # i2
374 ) {
375 push @m, $field;
376 warn "## saved/1 ", dump( $field ),$/ if ($debug);
377 $field = $row;
378
379 } elsif (
380 ( $row->[3] lt $field->[-2] ) # subfield which is not next (e.g. a after c)
381 ||
382 ( $row->[3] eq $field->[-2] && # same subfield, but not repeatable
383 ! $marc_repeatable_subfield->{ $fsf }
384 )
385 ) {
386 push @m, $field;
387 warn "## saved/2 ", dump( $field ),$/ if ($debug);
388 $field = $row;
389
390 } else {
391 # append new subfields to existing field
392 push @$field, ( $row->[3], $row->[4] );
393 }
394 } else {
395 # insert first field
396 $field = $row;
397 }
398
399 if (! $marc_repeatable_subfield->{ $fsf }) {
400 # make step to next subfield
401 $i = ($i + $u->{ $fsf } ) % ($len + 1);
402 }
403 }
404
405 if ($#$field >= 0) {
406 push @m, $field;
407 warn "## saved/3 ", dump( $field ),$/ if ($debug);
408 }
409
410 return \@m;
411 }
412
413 =head2 _get_marc_leader
414
415 Return leader from currently fetched record by L</_get_marc_fields>
416
417 print WebPAC::Normalize::_get_marc_leader();
418
419 =cut
420
421 sub _get_marc_leader {
422 die "no fetch_pos, did you called _get_marc_fields first?" unless ( defined( $fetch_pos ) );
423 return $marc_leader->[ $fetch_pos ];
424 }
425
426 =head2 _debug
427
428 Change level of debug warnings
429
430 _debug( 2 );
431
432 =cut
433
434 sub _debug {
435 my $l = shift;
436 return $debug unless defined($l);
437 warn "debug level $l",$/ if ($l > 0);
438 $debug = $l;
439 }
440
441 =head1 Functions to create C<data_structure>
442
443 Those functions generally have to first in your normalization file.
444
445 =head2 tag
446
447 Define new tag for I<search> and I<display>.
448
449 tag('Title', rec('200','a') );
450
451
452 =cut
453
454 sub tag {
455 my $name = shift or die "tag needs name as first argument";
456 my @o = grep { defined($_) && $_ ne '' } @_;
457 return unless (@o);
458 $out->{$name}->{tag} = $name;
459 $out->{$name}->{search} = \@o;
460 $out->{$name}->{display} = \@o;
461 }
462
463 =head2 display
464
465 Define tag just for I<display>
466
467 @v = display('Title', rec('200','a') );
468
469 =cut
470
471 sub display {
472 my $name = shift or die "display needs name as first argument";
473 my @o = grep { defined($_) && $_ ne '' } @_;
474 return unless (@o);
475 $out->{$name}->{tag} = $name;
476 $out->{$name}->{display} = \@o;
477 }
478
479 =head2 search
480
481 Prepare values just for I<search>
482
483 @v = search('Title', rec('200','a') );
484
485 =cut
486
487 sub search {
488 my $name = shift or die "search needs name as first argument";
489 my @o = grep { defined($_) && $_ ne '' } @_;
490 return unless (@o);
491 $out->{$name}->{tag} = $name;
492 $out->{$name}->{search} = \@o;
493 }
494
495 =head2 marc_leader
496
497 Setup fields within MARC leader or get leader
498
499 marc_leader('05','c');
500 my $leader = marc_leader();
501
502 =cut
503
504 sub marc_leader {
505 my ($offset,$value) = @_;
506
507 if ($offset) {
508 $marc_leader->[ $marc_record_offset ]->{ $offset } = $value;
509 } else {
510
511 if (defined($marc_leader)) {
512 die "marc_leader not array = ", dump( $marc_leader ) unless (ref($marc_leader) eq 'ARRAY');
513 return $marc_leader->[ $marc_record_offset ];
514 } else {
515 return;
516 }
517 }
518 }
519
520 =head2 marc_fixed
521
522 Create control/indentifier fields with values in fixed positions
523
524 marc_fixed('008', 00, '070402');
525 marc_fixed('008', 39, '|');
526
527 Positions not specified will be filled with spaces (C<0x20>).
528
529 There will be no effort to extend last specified value to full length of
530 field in standard.
531
532 =cut
533
534 sub marc_fixed {
535 my ($f, $pos, $val) = @_;
536 die "need marc(field, position, value)" unless defined($f) && defined($pos);
537
538 confess "need val" unless defined $val;
539
540 my $update = 0;
541
542 map {
543 if ($_->[0] eq $f) {
544 my $old = $_->[1];
545 if (length($old) <= $pos) {
546 $_->[1] .= ' ' x ( $pos - length($old) ) . $val;
547 warn "## marc_fixed($f,$pos,'$val') append '$old' -> '$_->[1]'\n" if ($debug > 1);
548 } else {
549 $_->[1] = substr($old, 0, $pos) . $val . substr($old, $pos + length($val));
550 warn "## marc_fixed($f,$pos,'$val') update '$old' -> '$_->[1]'\n" if ($debug > 1);
551 }
552 $update++;
553 }
554 } @{ $marc_record->[ $marc_record_offset ] };
555
556 if (! $update) {
557 my $v = ' ' x $pos . $val;
558 push @{ $marc_record->[ $marc_record_offset ] }, [ $f, $v ];
559 warn "## marc_fixed($f,$pos,'val') created '$v'\n" if ($debug > 1);
560 }
561 }
562
563 =head2 marc
564
565 Save value for MARC field
566
567 marc('900','a', rec('200','a') );
568 marc('001', rec('000') );
569
570 =cut
571
572 sub marc {
573 my $f = shift or die "marc needs field";
574 die "marc field must be numer" unless ($f =~ /^\d+$/);
575
576 my $sf;
577 if ($f >= 10) {
578 $sf = shift or die "marc needs subfield";
579 }
580
581 foreach (@_) {
582 my $v = $_; # make var read-write for Encode
583 next unless (defined($v) && $v !~ /^\s*$/);
584 my ($i1,$i2) = defined($marc_indicators->{$f}) ? @{ $marc_indicators->{$f} } : (' ',' ');
585 if (defined $sf) {
586 push @{ $marc_record->[ $marc_record_offset ] }, [ $f, $i1, $i2, $sf => $v ];
587 } else {
588 push @{ $marc_record->[ $marc_record_offset ] }, [ $f, $v ];
589 }
590 }
591 }
592
593 =head2 marc_repeatable_subfield
594
595 Save values for MARC repetable subfield
596
597 marc_repeatable_subfield('910', 'z', rec('909') );
598
599 =cut
600
601 sub marc_repeatable_subfield {
602 my ($f,$sf) = @_;
603 die "marc_repeatable_subfield need field and subfield!\n" unless ($f && $sf);
604 $marc_repeatable_subfield->{ $f . $sf }++;
605 marc(@_);
606 }
607
608 =head2 marc_indicators
609
610 Set both indicators for MARC field
611
612 marc_indicators('900', ' ', 1);
613
614 Any indicator value other than C<0-9> will be treated as undefined.
615
616 =cut
617
618 sub marc_indicators {
619 my $f = shift || die "marc_indicators need field!\n";
620 my ($i1,$i2) = @_;
621 die "marc_indicators($f, ...) need i1!\n" unless(defined($i1));
622 die "marc_indicators($f, $i1, ...) need i2!\n" unless(defined($i2));
623
624 $i1 = ' ' if ($i1 !~ /^\d$/);
625 $i2 = ' ' if ($i2 !~ /^\d$/);
626 @{ $marc_indicators->{$f} } = ($i1,$i2);
627 }
628
629 =head2 marc_compose
630
631 Save values for each MARC subfield explicitly
632
633 marc_compose('900',
634 'a', rec('200','a')
635 'b', rec('201','a')
636 'a', rec('200','b')
637 'c', rec('200','c')
638 );
639
640 If you specify C<+> for subfield, value will be appended
641 to previous defined subfield.
642
643 =cut
644
645 sub marc_compose {
646 my $f = shift or die "marc_compose needs field";
647 die "marc_compose field must be numer" unless ($f =~ /^\d+$/);
648
649 my ($i1,$i2) = defined($marc_indicators->{$f}) ? @{ $marc_indicators->{$f} } : (' ',' ');
650 my $m = [ $f, $i1, $i2 ];
651
652 warn "### marc_compose input subfields = ", dump(@_),$/ if ($debug > 2);
653
654 if ($#_ % 2 != 1) {
655 die "ERROR: marc_compose",dump($f,@_)," not valid (must be even).\nDo you need to add first() or join() around some argument?\n";
656 }
657
658 while (@_) {
659 my $sf = shift;
660 my $v = shift;
661
662 next unless (defined($v) && $v !~ /^\s*$/);
663 warn "## ++ marc_compose($f,$sf,$v) ", dump( $m ),$/ if ($debug > 1);
664 if ($sf ne '+') {
665 push @$m, ( $sf, $v );
666 } else {
667 $m->[ $#$m ] .= $v;
668 }
669 }
670
671 warn "## marc_compose current marc = ", dump( $m ),$/ if ($debug > 1);
672
673 push @{ $marc_record->[ $marc_record_offset ] }, $m if ($#{$m} > 2);
674 }
675
676 =head2 marc_duplicate
677
678 Generate copy of current MARC record and continue working on copy
679
680 marc_duplicate();
681
682 Copies can be accessed using C<< _get_marc_fields( fetch_next => 1 ) >> or
683 C<< _get_marc_fields( offset => 42 ) >>.
684
685 =cut
686
687 sub marc_duplicate {
688 my $m = $marc_record->[ -1 ];
689 die "can't duplicate record which isn't defined" unless ($m);
690 push @{ $marc_record }, dclone( $m );
691 push @{ $marc_leader }, dclone( marc_leader() );
692 warn "## marc_duplicate = ", dump(@$marc_leader, @$marc_record), $/ if ($debug > 1);
693 $marc_record_offset = $#{ $marc_record };
694 warn "## marc_record_offset = $marc_record_offset", $/ if ($debug > 1);
695
696 }
697
698 =head2 marc_remove
699
700 Remove some field or subfield from MARC record.
701
702 marc_remove('200');
703 marc_remove('200','a');
704
705 This will erase field C<200> or C<200^a> from current MARC record.
706
707 marc_remove('*');
708
709 Will remove all fields in current MARC record.
710
711 This is useful after calling C<marc_duplicate> or on it's own (but, you
712 should probably just remove that subfield definition if you are not
713 using C<marc_duplicate>).
714
715 FIXME: support fields < 10.
716
717 =cut
718
719 sub marc_remove {
720 my ($f, $sf) = @_;
721
722 die "marc_remove needs record number" unless defined($f);
723
724 my $marc = $marc_record->[ $marc_record_offset ];
725
726 warn "### marc_remove before = ", dump( $marc ), $/ if ($debug > 2);
727
728 if ($f eq '*') {
729
730 delete( $marc_record->[ $marc_record_offset ] );
731 warn "## full marc_record = ", dump( @{ $marc_record }), $/ if ($debug > 1);
732
733 } else {
734
735 my $i = 0;
736 foreach ( 0 .. $#{ $marc } ) {
737 last unless (defined $marc->[$i]);
738 warn "#### working on ",dump( @{ $marc->[$i] }), $/ if ($debug > 3);
739 if ($marc->[$i]->[0] eq $f) {
740 if (! defined $sf) {
741 # remove whole field
742 splice @$marc, $i, 1;
743 warn "#### slice \@\$marc, $i, 1 = ",dump( @{ $marc }), $/ if ($debug > 3);
744 $i--;
745 } else {
746 foreach my $j ( 0 .. (( $#{ $marc->[$i] } - 3 ) / 2) ) {
747 my $o = ($j * 2) + 3;
748 if ($marc->[$i]->[$o] eq $sf) {
749 # remove subfield
750 splice @{$marc->[$i]}, $o, 2;
751 warn "#### slice \@{\$marc->[$i]}, $o, 2 = ", dump( @{ $marc }), $/ if ($debug > 3);
752 # is record now empty?
753 if ($#{ $marc->[$i] } == 2) {
754 splice @$marc, $i, 1;
755 warn "#### slice \@\$marc, $i, 1 = ", dump( @{ $marc }), $/ if ($debug > 3);
756 $i--;
757 };
758 }
759 }
760 }
761 }
762 $i++;
763 }
764
765 warn "### marc_remove($f", $sf ? ",$sf" : "", ") after = ", dump( $marc ), $/ if ($debug > 2);
766
767 $marc_record->[ $marc_record_offset ] = $marc;
768 }
769
770 warn "## full marc_record = ", dump( @{ $marc_record }), $/ if ($debug > 1);
771 }
772
773 =head2 marc_original_order
774
775 Copy all subfields preserving original order to marc field.
776
777 marc_original_order( marc_field_number, original_input_field_number );
778
779 Please note that field numbers are consistent with other commands (marc
780 field number first), but somewhat counter-intuitive (destination and then
781 source).
782
783 You might want to use this command if you are just renaming subfields or
784 using pre-processing modify_record in C<config.yml> and don't need any
785 post-processing or want to preserve order of original subfields.
786
787
788 =cut
789
790 sub marc_original_order {
791
792 my ($to, $from) = @_;
793 die "marc_original_order needs from and to fields\n" unless ($from && $to);
794
795 return unless defined($rec->{$from});
796
797 my $r = $rec->{$from};
798 die "record field $from isn't array\n" unless (ref($r) eq 'ARRAY');
799
800 my ($i1,$i2) = defined($marc_indicators->{$to}) ? @{ $marc_indicators->{$to} } : (' ',' ');
801 warn "## marc_original_order($to,$from) source = ", dump( $r ),$/ if ($debug > 1);
802
803 foreach my $d (@$r) {
804
805 if (! defined($d->{subfields}) && ref($d->{subfields}) ne 'ARRAY') {
806 warn "# marc_original_order($to,$from): field $from doesn't have subfields specification\n";
807 next;
808 }
809
810 my @sfs = @{ $d->{subfields} };
811
812 die "field $from doesn't have even number of subfields specifications\n" unless($#sfs % 2 == 1);
813
814 warn "#--> d: ",dump($d), "\n#--> sfs: ",dump(@sfs),$/ if ($debug > 2);
815
816 my $m = [ $to, $i1, $i2 ];
817
818 while (my $sf = shift @sfs) {
819
820 warn "#--> sf: ",dump($sf), $/ if ($debug > 2);
821 my $offset = shift @sfs;
822 die "corrupted sufields specification for field $from\n" unless defined($offset);
823
824 my $v;
825 if (ref($d->{$sf}) eq 'ARRAY') {
826 $v = $d->{$sf}->[$offset] if (defined($d->{$sf}->[$offset]));
827 } elsif ($offset == 0) {
828 $v = $d->{$sf};
829 } else {
830 die "field $from subfield '$sf' need occurence $offset which doesn't exist", dump($d->{$sf});
831 }
832 push @$m, ( $sf, $v ) if (defined($v));
833 }
834
835 if ($#{$m} > 2) {
836 push @{ $marc_record->[ $marc_record_offset ] }, $m;
837 }
838 }
839
840 warn "## marc_record = ", dump( $marc_record ),$/ if ($debug > 1);
841 }
842
843 =head2 marc_count
844
845 Return number of MARC records created using L</marc_duplicate>.
846
847 print "created ", marc_count(), " records";
848
849 =cut
850
851 sub marc_count {
852 return $#{ $marc_record };
853 }
854
855
856 =head1 Functions to extract data from input
857
858 This function should be used inside functions to create C<data_structure> described
859 above.
860
861 =head2 _pack_subfields_hash
862
863 @subfields = _pack_subfields_hash( $h );
864 $subfields = _pack_subfields_hash( $h, 1 );
865
866 Return each subfield value in array or pack them all together and return scalar
867 with subfields (denoted by C<^>) and values.
868
869 =cut
870
871 sub _pack_subfields_hash {
872
873 warn "## _pack_subfields_hash( ",dump(@_), " )\n" if ($debug > 1);
874
875 my ($h,$include_subfields) = @_;
876
877 # sanity and ease of use
878 return $h if (ref($h) ne 'HASH');
879
880 if ( defined($h->{subfields}) ) {
881 my $sfs = delete $h->{subfields} || die "no subfields?";
882 my @out;
883 while (@$sfs) {
884 my $sf = shift @$sfs;
885 push @out, '^' . $sf if ($include_subfields);
886 my $o = shift @$sfs;
887 if ($o == 0 && ref( $h->{$sf} ) ne 'ARRAY' ) {
888 # single element subfields are not arrays
889 #warn "====> $sf $o / $#$sfs ", dump( $sfs, $h->{$sf} ), "\n";
890
891 push @out, $h->{$sf};
892 } else {
893 #warn "====> $sf $o / $#$sfs ", dump( $sfs, $h->{$sf} ), "\n";
894 push @out, $h->{$sf}->[$o];
895 }
896 }
897 if ($include_subfields) {
898 return join('', @out);
899 } else {
900 return @out;
901 }
902 } else {
903 if ($include_subfields) {
904 my $out = '';
905 foreach my $sf (sort keys %$h) {
906 if (ref($h->{$sf}) eq 'ARRAY') {
907 $out .= '^' . $sf . join('^' . $sf, @{ $h->{$sf} });
908 } else {
909 $out .= '^' . $sf . $h->{$sf};
910 }
911 }
912 return $out;
913 } else {
914 # FIXME this should probably be in alphabetical order instead of hash order
915 values %{$h};
916 }
917 }
918 }
919
920 =head2 rec1
921
922 Return all values in some field
923
924 @v = rec1('200')
925
926 TODO: order of values is probably same as in source data, need to investigate that
927
928 =cut
929
930 sub rec1 {
931 my $f = shift;
932 warn "rec1($f) = ", dump( $rec->{$f} ), $/ if ($debug > 1);
933 return unless (defined($rec) && defined($rec->{$f}));
934 warn "rec1($f) = ", dump( $rec->{$f} ), $/ if ($debug > 1);
935 if (ref($rec->{$f}) eq 'ARRAY') {
936 my @out;
937 foreach my $h ( @{ $rec->{$f} } ) {
938 if (ref($h) eq 'HASH') {
939 push @out, ( _pack_subfields_hash( $h ) );
940 } else {
941 push @out, $h;
942 }
943 }
944 return @out;
945 } elsif( defined($rec->{$f}) ) {
946 return $rec->{$f};
947 }
948 }
949
950 =head2 rec2
951
952 Return all values in specific field and subfield
953
954 @v = rec2('200','a')
955
956 =cut
957
958 sub rec2 {
959 my $f = shift;
960 return unless (defined($rec && $rec->{$f}));
961 my $sf = shift;
962 warn "rec2($f,$sf) = ", dump( $rec->{$f} ), $/ if ($debug > 1);
963 return map {
964 if (ref($_->{$sf}) eq 'ARRAY') {
965 @{ $_->{$sf} };
966 } else {
967 $_->{$sf};
968 }
969 } grep { ref($_) eq 'HASH' && $_->{$sf} } @{ $rec->{$f} };
970 }
971
972 =head2 rec
973
974 syntaxtic sugar for
975
976 @v = rec('200')
977 @v = rec('200','a')
978
979 If rec() returns just single value, it will
980 return scalar, not array.
981
982 =cut
983
984 sub rec {
985 my @out;
986 if ($#_ == 0) {
987 @out = rec1(@_);
988 } elsif ($#_ == 1) {
989 @out = rec2(@_);
990 }
991 if ($#out == 0 && ! wantarray) {
992 return $out[0];
993 } elsif (@out) {
994 return @out;
995 } else {
996 return '';
997 }
998 }
999
1000 =head2 regex
1001
1002 Apply regex to some or all values
1003
1004 @v = regex( 's/foo/bar/g', @v );
1005
1006 =cut
1007
1008 sub regex {
1009 my $r = shift;
1010 my @out;
1011 #warn "r: $r\n", dump(\@_);
1012 foreach my $t (@_) {
1013 next unless ($t);
1014 eval "\$t =~ $r";
1015 push @out, $t if ($t && $t ne '');
1016 }
1017 return @out;
1018 }
1019
1020 =head2 prefix
1021
1022 Prefix all values with a string
1023
1024 @v = prefix( 'my_', @v );
1025
1026 =cut
1027
1028 sub prefix {
1029 my $p = shift;
1030 return @_ unless defined( $p );
1031 return map { $p . $_ } grep { defined($_) } @_;
1032 }
1033
1034 =head2 suffix
1035
1036 suffix all values with a string
1037
1038 @v = suffix( '_my', @v );
1039
1040 =cut
1041
1042 sub suffix {
1043 my $s = shift;
1044 return @_ unless defined( $s );
1045 return map { $_ . $s } grep { defined($_) } @_;
1046 }
1047
1048 =head2 surround
1049
1050 surround all values with a two strings
1051
1052 @v = surround( 'prefix_', '_suffix', @v );
1053
1054 =cut
1055
1056 sub surround {
1057 my $p = shift;
1058 my $s = shift;
1059 $p = '' unless defined( $p );
1060 $s = '' unless defined( $s );
1061 return map { $p . $_ . $s } grep { defined($_) } @_;
1062 }
1063
1064 =head2 first
1065
1066 Return first element
1067
1068 $v = first( @v );
1069
1070 =cut
1071
1072 sub first {
1073 my $r = shift;
1074 return $r;
1075 }
1076
1077 =head2 lookup
1078
1079 Consult lookup hashes for some value
1080
1081 @v = lookup(
1082 sub {
1083 'ffkk/peri/mfn'.rec('000')
1084 },
1085 'ffkk','peri','200-a-200-e',
1086 sub {
1087 first(rec(200,'a')).' '.first(rec('200','e'))
1088 }
1089 );
1090
1091 Code like above will be B<automatically generated> using L<WebPAC::Parse> from
1092 normal lookup definition in C<conf/lookup/something.pl> which looks like:
1093
1094 lookup(
1095 # which results to return from record recorded in lookup
1096 sub { 'ffkk/peri/mfn' . rec('000') },
1097 # from which database and input
1098 'ffkk','peri',
1099 # such that following values match
1100 sub { first(rec(200,'a')) . ' ' . first(rec('200','e')) },
1101 # if this part is missing, we will try to match same fields
1102 # from lookup record and current one, or you can override
1103 # which records to use from current record using
1104 sub { rec('900','x') . ' ' . rec('900','y') },
1105 )
1106
1107 You can think about this lookup as SQL (if that helps):
1108
1109 select
1110 sub { what }
1111 from
1112 database, input
1113 where
1114 sub { filter from lookuped record }
1115 having
1116 sub { optional filter on current record }
1117
1118 Easy as pie, right?
1119
1120 =cut
1121
1122 sub lookup {
1123 my ($what, $database, $input, $key, $having) = @_;
1124
1125 confess "lookup needs 5 arguments: what, database, input, key, having\n" unless ($#_ == 4);
1126
1127 warn "## lookup ($database, $input, $key)", $/ if ($debug > 1);
1128 return unless (defined($lookup->{$database}->{$input}->{$key}));
1129
1130 confess "lookup really need load_row_coderef added to data_structure\n" unless ($load_row_coderef);
1131
1132 my $mfns;
1133 my @having = $having->();
1134
1135 warn "## having = ", dump( @having ) if ($debug > 2);
1136
1137 foreach my $h ( @having ) {
1138 if (defined($lookup->{$database}->{$input}->{$key}->{$h})) {
1139 warn "lookup for $database/$input/$key/$h return ",dump($lookup->{$database}->{$input}->{$key}->{$h}),"\n" if ($debug);
1140 $mfns->{$_}++ foreach keys %{ $lookup->{$database}->{$input}->{$key}->{$h} };
1141 }
1142 }
1143
1144 return unless ($mfns);
1145
1146 my @mfns = sort keys %$mfns;
1147
1148 warn "# lookup loading $database/$input/$key mfn ", join(",",@mfns)," having ",dump(@having),"\n" if ($debug);
1149
1150 my $old_rec = $rec;
1151 my @out;
1152
1153 foreach my $mfn (@mfns) {
1154 $rec = $load_row_coderef->( $database, $input, $mfn );
1155
1156 warn "got $database/$input/$mfn = ", dump($rec), $/ if ($debug);
1157
1158 my @vals = $what->();
1159
1160 push @out, ( @vals );
1161
1162 warn "lookup for mfn $mfn returned ", dump(@vals), $/ if ($debug);
1163 }
1164
1165 # if (ref($lookup->{$k}) eq 'ARRAY') {
1166 # return @{ $lookup->{$k} };
1167 # } else {
1168 # return $lookup->{$k};
1169 # }
1170
1171 $rec = $old_rec;
1172
1173 warn "## lookup returns = ", dump(@out), $/ if ($debug);
1174
1175 if ($#out == 0) {
1176 return $out[0];
1177 } else {
1178 return @out;
1179 }
1180 }
1181
1182 =head2 save_into_lookup
1183
1184 Save value into lookup. It associates current database, input
1185 and specific keys with one or more values which will be
1186 associated over MFN.
1187
1188 MFN will be extracted from first occurence current of field 000
1189 in current record, or if it doesn't exist from L<_set_config> C<_mfn>.
1190
1191 my $nr = save_into_lookup($database,$input,$key,sub {
1192 # code which produce one or more values
1193 });
1194
1195 It returns number of items saved.
1196
1197 This function shouldn't be called directly, it's called from code created by
1198 L<WebPAC::Parser>.
1199
1200 =cut
1201
1202 sub save_into_lookup {
1203 my ($database,$input,$key,$coderef) = @_;
1204 die "save_into_lookup needs database" unless defined($database);
1205 die "save_into_lookup needs input" unless defined($input);
1206 die "save_into_lookup needs key" unless defined($key);
1207 die "save_into_lookup needs CODE" unless ( defined($coderef) && ref($coderef) eq 'CODE' );
1208
1209 warn "## save_into_lookup rec = ", dump($rec), " config = ", dump($config), $/ if ($debug > 2);
1210
1211 my $mfn =
1212 defined($rec->{'000'}->[0]) ? $rec->{'000'}->[0] :
1213 defined($config->{_mfn}) ? $config->{_mfn} :
1214 die "mfn not defined or zero";
1215
1216 my $nr = 0;
1217
1218 foreach my $v ( $coderef->() ) {
1219 $lookup->{$database}->{$input}->{$key}->{$v}->{$mfn}++;
1220 warn "# saved lookup $database/$input/$key [$v] $mfn\n" if ($debug > 1);
1221 $nr++;
1222 }
1223
1224 return $nr;
1225 }
1226
1227 =head2 config
1228
1229 Consult config values stored in C<config.yml>
1230
1231 # return database code (key under databases in yaml)
1232 $database_code = config(); # use _ from hash
1233 $database_name = config('name');
1234 $database_input_name = config('input name');
1235 $tag = config('input normalize tag');
1236
1237 Up to three levels are supported.
1238
1239 =cut
1240
1241 sub config {
1242 return unless ($config);
1243
1244 my $p = shift;
1245
1246 $p ||= '';
1247
1248 my $v;
1249
1250 warn "### getting config($p)\n" if ($debug > 1);
1251
1252 my @p = split(/\s+/,$p);
1253 if ($#p < 0) {
1254 $v = $config->{ '_' }; # special, database code
1255 } else {
1256
1257 my $c = dclone( $config );
1258
1259 foreach my $k (@p) {
1260 warn "### k: $k c = ",dump($c),$/ if ($debug > 1);
1261 if (ref($c) eq 'ARRAY') {
1262 $c = shift @$c;
1263 warn "config($p) taking first occurence of '$k', probably not what you wanted!\n";
1264 last;
1265 }
1266
1267 if (! defined($c->{$k}) ) {
1268 $c = undef;
1269 last;
1270 } else {
1271 $c = $c->{$k};
1272 }
1273 }
1274 $v = $c if ($c);
1275
1276 }
1277
1278 warn "## config( '$p' ) = ",dump( $v ),$/ if ($v && $debug);
1279 warn "config( '$p' ) is empty\n" if (! $v);
1280
1281 return $v;
1282 }
1283
1284 =head2 id
1285
1286 Returns unique id of this record
1287
1288 $id = id();
1289
1290 Returns C<42/2> for 2nd occurence of MFN 42.
1291
1292 =cut
1293
1294 sub id {
1295 my $mfn = $config->{_mfn} || die "no _mfn in config data";
1296 return $mfn . $#{$marc_record} ? $#{$marc_record} + 1 : '';
1297 }
1298
1299 =head2 join_with
1300
1301 Joins walues with some delimiter
1302
1303 $v = join_with(", ", @v);
1304
1305 =cut
1306
1307 sub join_with {
1308 my $d = shift;
1309 warn "### join_with('$d',",dump(@_),")\n" if ($debug > 2);
1310 my $v = join($d, grep { defined($_) && $_ ne '' } @_);
1311 return '' unless defined($v);
1312 return $v;
1313 }
1314
1315 =head2 split_rec_on
1316
1317 Split record subfield on some regex and take one of parts out
1318
1319 $a_before_semi_column =
1320 split_rec_on('200','a', /\s*;\s*/, $part);
1321
1322 C<$part> is optional number of element. First element is
1323 B<1>, not 0!
1324
1325 If there is no C<$part> parameter or C<$part> is 0, this function will
1326 return all values produced by splitting.
1327
1328 =cut
1329
1330 sub split_rec_on {
1331 die "split_rec_on need (fld,sf,regex[,part]" if ($#_ < 2);
1332
1333 my ($fld, $sf, $regex, $part) = @_;
1334 warn "### regex ", ref($regex), $regex, $/ if ($debug > 2);
1335
1336 my @r = rec( $fld, $sf );
1337 my $v = shift @r;
1338 warn "### first rec($fld,$sf) = ",dump($v),$/ if ($debug > 2);
1339
1340 return '' if ( ! defined($v) || $v =~ /^\s*$/);
1341
1342 my @s = split( $regex, $v );
1343 warn "## split_rec_on($fld,$sf,$regex,$part) = ",dump(@s),$/ if ($debug > 1);
1344 if ($part && $part > 0) {
1345 return $s[ $part - 1 ];
1346 } else {
1347 return @s;
1348 }
1349 }
1350
1351 my $hash;
1352
1353 =head2 set
1354
1355 set( key => 'value' );
1356
1357 =cut
1358
1359 sub set {
1360 my ($k,$v) = @_;
1361 warn "## set ( $k => ", dump($v), " )", $/ if ( $debug );
1362 $hash->{$k} = $v;
1363 };
1364
1365 =head2 get
1366
1367 get( 'key' );
1368
1369 =cut
1370
1371 sub get {
1372 my $k = shift || return;
1373 my $v = $hash->{$k};
1374 warn "## get $k = ", dump( $v ), $/ if ( $debug );
1375 return $v;
1376 }
1377
1378 =head2 count
1379
1380 if ( count( @result ) == 1 ) {
1381 # do something if only 1 result is there
1382 }
1383
1384 =cut
1385
1386 sub count {
1387 warn "## count ",dump(@_),$/ if ( $debug );
1388 return @_ . '';
1389 }
1390
1391 # END
1392 1;

  ViewVC Help
Powered by ViewVC 1.1.26