/[webpac2]/trunk/lib/WebPAC/Normalize.pm
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Contents of /trunk/lib/WebPAC/Normalize.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1012 - (show annotations)
Wed Nov 7 09:19:29 2007 UTC (16 years, 4 months ago) by dpavlin
File size: 30980 byte(s)
document frec

1 package WebPAC::Normalize;
2 use Exporter 'import';
3 our @EXPORT = qw/
4 _set_ds _set_lookup
5 _set_load_row
6 _get_ds _clean_ds
7 _debug
8 _pack_subfields_hash
9
10 search_display search display sorted
11
12 marc marc_indicators marc_repeatable_subfield
13 marc_compose marc_leader marc_fixed
14 marc_duplicate marc_remove marc_count
15 marc_original_order
16
17 rec1 rec2 rec
18 frec
19 regex prefix suffix surround
20 first lookup join_with
21 save_into_lookup
22
23 split_rec_on
24
25 get set
26 count
27
28 /;
29
30 use warnings;
31 use strict;
32
33 #use base qw/WebPAC::Common/;
34 use Data::Dump qw/dump/;
35 use Storable qw/dclone/;
36 use Carp qw/confess/;
37
38 # debugging warn(s)
39 my $debug = 0;
40
41 use WebPAC::Normalize::ISBN;
42 push @EXPORT, ( 'isbn_10', 'isbn_13' );
43
44 =head1 NAME
45
46 WebPAC::Normalize - describe normalisaton rules using sets
47
48 =cut
49
50 our $VERSION = '0.32';
51
52 =head1 SYNOPSIS
53
54 This module uses C<conf/normalize/*.pl> files to perform normalisation
55 from input records using perl functions which are specialized for set
56 processing.
57
58 Sets are implemented as arrays, and normalisation file is valid perl, which
59 means that you check it's validity before running WebPAC using
60 C<perl -c normalize.pl>.
61
62 Normalisation can generate multiple output normalized data. For now, supported output
63 types (on the left side of definition) are: C<search_display>, C<display>, C<search> and
64 C<marc>.
65
66 =head1 FUNCTIONS
67
68 Functions which start with C<_> are private and used by WebPAC internally.
69 All other functions are available for use within normalisation rules.
70
71 =head2 data_structure
72
73 Return data structure
74
75 my $ds = WebPAC::Normalize::data_structure(
76 lookup => $lookup_hash,
77 row => $row,
78 rules => $normalize_pl_config,
79 marc_encoding => 'utf-8',
80 config => $config,
81 load_row_coderef => sub {
82 my ($database,$input,$mfn) = @_;
83 $store->load_row( database => $database, input => $input, id => $mfn );
84 },
85 );
86
87 Options C<row>, C<rules> and C<log> are mandatory while all
88 other are optional.
89
90 C<load_row_coderef> is closure only used when executing lookups, so they will
91 die if it's not defined.
92
93 This function will B<die> if normalizastion can't be evaled.
94
95 Since this function isn't exported you have to call it with
96 C<WebPAC::Normalize::data_structure>.
97
98 =cut
99
100 my $load_row_coderef;
101
102 sub data_structure {
103 my $arg = {@_};
104
105 die "need row argument" unless ($arg->{row});
106 die "need normalisation argument" unless ($arg->{rules});
107
108 _set_lookup( $arg->{lookup} ) if defined($arg->{lookup});
109 _set_ds( $arg->{row} );
110 _set_config( $arg->{config} ) if defined($arg->{config});
111 _clean_ds( %{ $arg } );
112 $load_row_coderef = $arg->{load_row_coderef};
113
114 no strict 'subs';
115 no warnings 'redefine';
116 eval "$arg->{rules};";
117 die "error evaling $arg->{rules}: $@\n" if ($@);
118
119 return _get_ds();
120 }
121
122 =head2 _set_ds
123
124 Set current record hash
125
126 _set_ds( $rec );
127
128 =cut
129
130 my $rec;
131
132 sub _set_ds {
133 $rec = shift or die "no record hash";
134 }
135
136 =head2 _set_config
137
138 Set current config hash
139
140 _set_config( $config );
141
142 Magic keys are:
143
144 =over 4
145
146 =item _
147
148 Code of current database
149
150 =item _mfn
151
152 Current MFN
153
154 =back
155
156 =cut
157
158 my $config;
159
160 sub _set_config {
161 $config = shift;
162 }
163
164 =head2 _get_ds
165
166 Return hash formatted as data structure
167
168 my $ds = _get_ds();
169
170 =cut
171
172 my ($out, $marc_record, $marc_encoding, $marc_repeatable_subfield, $marc_indicators, $marc_leader);
173 my ($marc_record_offset, $marc_fetch_offset) = (0, 0);
174
175 sub _get_ds {
176 #warn "## out = ",dump($out);
177 return $out;
178 }
179
180 =head2 _clean_ds
181
182 Clean data structure hash for next record
183
184 _clean_ds();
185
186 =cut
187
188 sub _clean_ds {
189 my $a = {@_};
190 ($out,$marc_record, $marc_encoding, $marc_repeatable_subfield, $marc_indicators, $marc_leader) = ();
191 ($marc_record_offset, $marc_fetch_offset) = (0,0);
192 $marc_encoding = $a->{marc_encoding};
193 }
194
195 =head2 _set_lookup
196
197 Set current lookup hash
198
199 _set_lookup( $lookup );
200
201 =cut
202
203 my $lookup;
204
205 sub _set_lookup {
206 $lookup = shift;
207 }
208
209 =head2 _get_lookup
210
211 Get current lookup hash
212
213 my $lookup = _get_lookup();
214
215 =cut
216
217 sub _get_lookup {
218 return $lookup;
219 }
220
221 =head2 _set_load_row
222
223 Setup code reference which will return L<data_structure> from
224 L<WebPAC::Store>
225
226 _set_load_row(sub {
227 my ($database,$input,$mfn) = @_;
228 $store->load_row( database => $database, input => $input, id => $mfn );
229 });
230
231 =cut
232
233 sub _set_load_row {
234 my $coderef = shift;
235 confess "argument isn't CODE" unless ref($coderef) eq 'CODE';
236
237 $load_row_coderef = $coderef;
238 }
239
240 =head2 _get_marc_fields
241
242 Get all fields defined by calls to C<marc>
243
244 $marc->add_fields( WebPAC::Normalize:_get_marc_fields() );
245
246 We are using I<magic> which detect repeatable fields only from
247 sequence of field/subfield data generated by normalization.
248
249 Repeatable field is created when there is second occurence of same subfield or
250 if any of indicators are different.
251
252 This is sane for most cases. Something like:
253
254 900a-1 900b-1 900c-1
255 900a-2 900b-2
256 900a-3
257
258 will be created from any combination of:
259
260 900a-1 900a-2 900a-3 900b-1 900b-2 900c-1
261
262 and following rules:
263
264 marc('900','a', rec('200','a') );
265 marc('900','b', rec('200','b') );
266 marc('900','c', rec('200','c') );
267
268 which might not be what you have in mind. If you need repeatable subfield,
269 define it using C<marc_repeatable_subfield> like this:
270
271 marc_repeatable_subfield('900','a');
272 marc('900','a', rec('200','a') );
273 marc('900','b', rec('200','b') );
274 marc('900','c', rec('200','c') );
275
276 will create:
277
278 900a-1 900a-2 900a-3 900b-1 900c-1
279 900b-2
280
281 There is also support for returning next or specific using:
282
283 while (my $mf = WebPAC::Normalize:_get_marc_fields( fetch_next => 1 ) ) {
284 # do something with $mf
285 }
286
287 will always return fields from next MARC record or
288
289 my $mf = WebPAC::Normalize::_get_marc_fields( offset => 42 );
290
291 will return 42th copy record (if it exists).
292
293 =cut
294
295 my $fetch_pos;
296
297 sub _get_marc_fields {
298
299 my $arg = {@_};
300 warn "### _get_marc_fields arg: ", dump($arg), $/ if ($debug > 2);
301 $fetch_pos = $marc_fetch_offset;
302 if ($arg->{offset}) {
303 $fetch_pos = $arg->{offset};
304 } elsif($arg->{fetch_next}) {
305 $marc_fetch_offset++;
306 }
307
308 return if (! $marc_record || ref($marc_record) ne 'ARRAY');
309
310 warn "### full marc_record = ", dump( @{ $marc_record }), $/ if ($debug > 2);
311
312 my $marc_rec = $marc_record->[ $fetch_pos ];
313
314 warn "## _get_marc_fields (at offset: $fetch_pos) -- marc_record = ", dump( @$marc_rec ), $/ if ($debug > 1);
315
316 return if (! $marc_rec || ref($marc_rec) ne 'ARRAY' || $#{ $marc_rec } < 0);
317
318 # first, sort all existing fields
319 # XXX might not be needed, but modern perl might randomize elements in hash
320 my @sorted_marc_record = sort {
321 $a->[0] . ( $a->[3] || '' ) cmp $b->[0] . ( $b->[3] || '')
322 } @{ $marc_rec };
323
324 @sorted_marc_record = @{ $marc_rec }; ### FIXME disable sorting
325
326 # output marc fields
327 my @m;
328
329 # count unique field-subfields (used for offset when walking to next subfield)
330 my $u;
331 map { $u->{ $_->[0] . ( $_->[3] || '') }++ } @sorted_marc_record;
332
333 if ($debug) {
334 warn "## marc_repeatable_subfield = ", dump( $marc_repeatable_subfield ), $/ if ( $marc_repeatable_subfield );
335 warn "## marc_record[$fetch_pos] = ", dump( $marc_rec ), $/;
336 warn "## sorted_marc_record = ", dump( \@sorted_marc_record ), $/;
337 warn "## subfield count = ", dump( $u ), $/;
338 }
339
340 my $len = $#sorted_marc_record;
341 my $visited;
342 my $i = 0;
343 my $field;
344
345 foreach ( 0 .. $len ) {
346
347 # find next element which isn't visited
348 while ($visited->{$i}) {
349 $i = ($i + 1) % ($len + 1);
350 }
351
352 # mark it visited
353 $visited->{$i}++;
354
355 my $row = dclone( $sorted_marc_record[$i] );
356
357 # field and subfield which is key for
358 # marc_repeatable_subfield and u
359 my $fsf = $row->[0] . ( $row->[3] || '' );
360
361 if ($debug > 1) {
362
363 print "### field so far [", $#$field, "] : ", dump( $field ), " ", $field ? 'T' : 'F', $/;
364 print "### this [$i]: ", dump( $row ),$/;
365 print "### sf: ", $row->[3], " vs ", $field->[3],
366 $marc_repeatable_subfield->{ $row->[0] . $row->[3] } ? ' (repeatable)' : '', $/,
367 if ($#$field >= 0);
368
369 }
370
371 # if field exists
372 if ( $#$field >= 0 ) {
373 if (
374 $row->[0] ne $field->[0] || # field
375 $row->[1] ne $field->[1] || # i1
376 $row->[2] ne $field->[2] # i2
377 ) {
378 push @m, $field;
379 warn "## saved/1 ", dump( $field ),$/ if ($debug);
380 $field = $row;
381
382 } elsif (
383 ( $row->[3] lt $field->[-2] ) # subfield which is not next (e.g. a after c)
384 ||
385 ( $row->[3] eq $field->[-2] && # same subfield, but not repeatable
386 ! $marc_repeatable_subfield->{ $fsf }
387 )
388 ) {
389 push @m, $field;
390 warn "## saved/2 ", dump( $field ),$/ if ($debug);
391 $field = $row;
392
393 } else {
394 # append new subfields to existing field
395 push @$field, ( $row->[3], $row->[4] );
396 }
397 } else {
398 # insert first field
399 $field = $row;
400 }
401
402 if (! $marc_repeatable_subfield->{ $fsf }) {
403 # make step to next subfield
404 $i = ($i + $u->{ $fsf } ) % ($len + 1);
405 }
406 }
407
408 if ($#$field >= 0) {
409 push @m, $field;
410 warn "## saved/3 ", dump( $field ),$/ if ($debug);
411 }
412
413 return \@m;
414 }
415
416 =head2 _get_marc_leader
417
418 Return leader from currently fetched record by L</_get_marc_fields>
419
420 print WebPAC::Normalize::_get_marc_leader();
421
422 =cut
423
424 sub _get_marc_leader {
425 die "no fetch_pos, did you called _get_marc_fields first?" unless ( defined( $fetch_pos ) );
426 return $marc_leader->[ $fetch_pos ];
427 }
428
429 =head2 _debug
430
431 Change level of debug warnings
432
433 _debug( 2 );
434
435 =cut
436
437 sub _debug {
438 my $l = shift;
439 return $debug unless defined($l);
440 warn "debug level $l",$/ if ($l > 0);
441 $debug = $l;
442 }
443
444 =head1 Functions to create C<data_structure>
445
446 Those functions generally have to first in your normalization file.
447
448 =head2 search_display
449
450 Define output for L<search> and L<display> at the same time
451
452 search_display('Title', rec('200','a') );
453
454
455 =cut
456
457 sub search_display {
458 my $name = shift or die "search_display needs name as first argument";
459 my @o = grep { defined($_) && $_ ne '' } @_;
460 return unless (@o);
461 $out->{$name}->{search} = \@o;
462 $out->{$name}->{display} = \@o;
463 }
464
465 =head2 tag
466
467 Old name for L<search_display>, but supported
468
469 =cut
470
471 sub tag {
472 search_display( @_ );
473 }
474
475 =head2 display
476
477 Define output just for I<display>
478
479 @v = display('Title', rec('200','a') );
480
481 =cut
482
483 sub _field {
484 my $type = shift or confess "need type -- BUG?";
485 my $name = shift or confess "needs name as first argument";
486 my @o = grep { defined($_) && $_ ne '' } @_;
487 return unless (@o);
488 $out->{$name}->{$type} = \@o;
489 }
490
491 sub display { _field( 'display', @_ ) }
492
493 =head2 search
494
495 Prepare values just for I<search>
496
497 @v = search('Title', rec('200','a') );
498
499 =cut
500
501 sub search { _field( 'search', @_ ) }
502
503 =head2 sorted
504
505 Insert into lists which will be automatically sorted
506
507 sorted('Title', rec('200','a') );
508
509 =cut
510
511 sub sorted { _field( 'sorted', @_ ) }
512
513
514 =head2 marc_leader
515
516 Setup fields within MARC leader or get leader
517
518 marc_leader('05','c');
519 my $leader = marc_leader();
520
521 =cut
522
523 sub marc_leader {
524 my ($offset,$value) = @_;
525
526 if ($offset) {
527 $marc_leader->[ $marc_record_offset ]->{ $offset } = $value;
528 } else {
529
530 if (defined($marc_leader)) {
531 die "marc_leader not array = ", dump( $marc_leader ) unless (ref($marc_leader) eq 'ARRAY');
532 return $marc_leader->[ $marc_record_offset ];
533 } else {
534 return;
535 }
536 }
537 }
538
539 =head2 marc_fixed
540
541 Create control/indentifier fields with values in fixed positions
542
543 marc_fixed('008', 00, '070402');
544 marc_fixed('008', 39, '|');
545
546 Positions not specified will be filled with spaces (C<0x20>).
547
548 There will be no effort to extend last specified value to full length of
549 field in standard.
550
551 =cut
552
553 sub marc_fixed {
554 my ($f, $pos, $val) = @_;
555 die "need marc(field, position, value)" unless defined($f) && defined($pos);
556
557 confess "need val" unless defined $val;
558
559 my $update = 0;
560
561 map {
562 if ($_->[0] eq $f) {
563 my $old = $_->[1];
564 if (length($old) <= $pos) {
565 $_->[1] .= ' ' x ( $pos - length($old) ) . $val;
566 warn "## marc_fixed($f,$pos,'$val') append '$old' -> '$_->[1]'\n" if ($debug > 1);
567 } else {
568 $_->[1] = substr($old, 0, $pos) . $val . substr($old, $pos + length($val));
569 warn "## marc_fixed($f,$pos,'$val') update '$old' -> '$_->[1]'\n" if ($debug > 1);
570 }
571 $update++;
572 }
573 } @{ $marc_record->[ $marc_record_offset ] };
574
575 if (! $update) {
576 my $v = ' ' x $pos . $val;
577 push @{ $marc_record->[ $marc_record_offset ] }, [ $f, $v ];
578 warn "## marc_fixed($f,$pos,'val') created '$v'\n" if ($debug > 1);
579 }
580 }
581
582 =head2 marc
583
584 Save value for MARC field
585
586 marc('900','a', rec('200','a') );
587 marc('001', rec('000') );
588
589 =cut
590
591 sub marc {
592 my $f = shift or die "marc needs field";
593 die "marc field must be numer" unless ($f =~ /^\d+$/);
594
595 my $sf;
596 if ($f >= 10) {
597 $sf = shift or die "marc needs subfield";
598 }
599
600 foreach (@_) {
601 my $v = $_; # make var read-write for Encode
602 next unless (defined($v) && $v !~ /^\s*$/);
603 my ($i1,$i2) = defined($marc_indicators->{$f}) ? @{ $marc_indicators->{$f} } : (' ',' ');
604 if (defined $sf) {
605 push @{ $marc_record->[ $marc_record_offset ] }, [ $f, $i1, $i2, $sf => $v ];
606 } else {
607 push @{ $marc_record->[ $marc_record_offset ] }, [ $f, $v ];
608 }
609 }
610 }
611
612 =head2 marc_repeatable_subfield
613
614 Save values for MARC repetable subfield
615
616 marc_repeatable_subfield('910', 'z', rec('909') );
617
618 =cut
619
620 sub marc_repeatable_subfield {
621 my ($f,$sf) = @_;
622 die "marc_repeatable_subfield need field and subfield!\n" unless ($f && $sf);
623 $marc_repeatable_subfield->{ $f . $sf }++;
624 marc(@_);
625 }
626
627 =head2 marc_indicators
628
629 Set both indicators for MARC field
630
631 marc_indicators('900', ' ', 1);
632
633 Any indicator value other than C<0-9> will be treated as undefined.
634
635 =cut
636
637 sub marc_indicators {
638 my $f = shift || die "marc_indicators need field!\n";
639 my ($i1,$i2) = @_;
640 die "marc_indicators($f, ...) need i1!\n" unless(defined($i1));
641 die "marc_indicators($f, $i1, ...) need i2!\n" unless(defined($i2));
642
643 $i1 = ' ' if ($i1 !~ /^\d$/);
644 $i2 = ' ' if ($i2 !~ /^\d$/);
645 @{ $marc_indicators->{$f} } = ($i1,$i2);
646 }
647
648 =head2 marc_compose
649
650 Save values for each MARC subfield explicitly
651
652 marc_compose('900',
653 'a', rec('200','a')
654 'b', rec('201','a')
655 'a', rec('200','b')
656 'c', rec('200','c')
657 );
658
659 If you specify C<+> for subfield, value will be appended
660 to previous defined subfield.
661
662 =cut
663
664 sub marc_compose {
665 my $f = shift or die "marc_compose needs field";
666 die "marc_compose field must be numer" unless ($f =~ /^\d+$/);
667
668 my ($i1,$i2) = defined($marc_indicators->{$f}) ? @{ $marc_indicators->{$f} } : (' ',' ');
669 my $m = [ $f, $i1, $i2 ];
670
671 warn "### marc_compose input subfields = ", dump(@_),$/ if ($debug > 2);
672
673 if ($#_ % 2 != 1) {
674 die "ERROR: marc_compose",dump($f,@_)," not valid (must be even).\nDo you need to add first() or join() around some argument?\n";
675 }
676
677 while (@_) {
678 my $sf = shift;
679 my $v = shift;
680
681 next unless (defined($v) && $v !~ /^\s*$/);
682 warn "## ++ marc_compose($f,$sf,$v) ", dump( $m ),$/ if ($debug > 1);
683 if ($sf ne '+') {
684 push @$m, ( $sf, $v );
685 } else {
686 $m->[ $#$m ] .= $v;
687 }
688 }
689
690 warn "## marc_compose current marc = ", dump( $m ),$/ if ($debug > 1);
691
692 push @{ $marc_record->[ $marc_record_offset ] }, $m if ($#{$m} > 2);
693 }
694
695 =head2 marc_duplicate
696
697 Generate copy of current MARC record and continue working on copy
698
699 marc_duplicate();
700
701 Copies can be accessed using C<< _get_marc_fields( fetch_next => 1 ) >> or
702 C<< _get_marc_fields( offset => 42 ) >>.
703
704 =cut
705
706 sub marc_duplicate {
707 my $m = $marc_record->[ -1 ];
708 die "can't duplicate record which isn't defined" unless ($m);
709 push @{ $marc_record }, dclone( $m );
710 push @{ $marc_leader }, dclone( marc_leader() );
711 warn "## marc_duplicate = ", dump(@$marc_leader, @$marc_record), $/ if ($debug > 1);
712 $marc_record_offset = $#{ $marc_record };
713 warn "## marc_record_offset = $marc_record_offset", $/ if ($debug > 1);
714
715 }
716
717 =head2 marc_remove
718
719 Remove some field or subfield from MARC record.
720
721 marc_remove('200');
722 marc_remove('200','a');
723
724 This will erase field C<200> or C<200^a> from current MARC record.
725
726 marc_remove('*');
727
728 Will remove all fields in current MARC record.
729
730 This is useful after calling C<marc_duplicate> or on it's own (but, you
731 should probably just remove that subfield definition if you are not
732 using C<marc_duplicate>).
733
734 FIXME: support fields < 10.
735
736 =cut
737
738 sub marc_remove {
739 my ($f, $sf) = @_;
740
741 die "marc_remove needs record number" unless defined($f);
742
743 my $marc = $marc_record->[ $marc_record_offset ];
744
745 warn "### marc_remove before = ", dump( $marc ), $/ if ($debug > 2);
746
747 if ($f eq '*') {
748
749 delete( $marc_record->[ $marc_record_offset ] );
750 warn "## full marc_record = ", dump( @{ $marc_record }), $/ if ($debug > 1);
751
752 } else {
753
754 my $i = 0;
755 foreach ( 0 .. $#{ $marc } ) {
756 last unless (defined $marc->[$i]);
757 warn "#### working on ",dump( @{ $marc->[$i] }), $/ if ($debug > 3);
758 if ($marc->[$i]->[0] eq $f) {
759 if (! defined $sf) {
760 # remove whole field
761 splice @$marc, $i, 1;
762 warn "#### slice \@\$marc, $i, 1 = ",dump( @{ $marc }), $/ if ($debug > 3);
763 $i--;
764 } else {
765 foreach my $j ( 0 .. (( $#{ $marc->[$i] } - 3 ) / 2) ) {
766 my $o = ($j * 2) + 3;
767 if ($marc->[$i]->[$o] eq $sf) {
768 # remove subfield
769 splice @{$marc->[$i]}, $o, 2;
770 warn "#### slice \@{\$marc->[$i]}, $o, 2 = ", dump( @{ $marc }), $/ if ($debug > 3);
771 # is record now empty?
772 if ($#{ $marc->[$i] } == 2) {
773 splice @$marc, $i, 1;
774 warn "#### slice \@\$marc, $i, 1 = ", dump( @{ $marc }), $/ if ($debug > 3);
775 $i--;
776 };
777 }
778 }
779 }
780 }
781 $i++;
782 }
783
784 warn "### marc_remove($f", $sf ? ",$sf" : "", ") after = ", dump( $marc ), $/ if ($debug > 2);
785
786 $marc_record->[ $marc_record_offset ] = $marc;
787 }
788
789 warn "## full marc_record = ", dump( @{ $marc_record }), $/ if ($debug > 1);
790 }
791
792 =head2 marc_original_order
793
794 Copy all subfields preserving original order to marc field.
795
796 marc_original_order( marc_field_number, original_input_field_number );
797
798 Please note that field numbers are consistent with other commands (marc
799 field number first), but somewhat counter-intuitive (destination and then
800 source).
801
802 You might want to use this command if you are just renaming subfields or
803 using pre-processing modify_record in C<config.yml> and don't need any
804 post-processing or want to preserve order of original subfields.
805
806
807 =cut
808
809 sub marc_original_order {
810
811 my ($to, $from) = @_;
812 die "marc_original_order needs from and to fields\n" unless ($from && $to);
813
814 return unless defined($rec->{$from});
815
816 my $r = $rec->{$from};
817 die "record field $from isn't array\n" unless (ref($r) eq 'ARRAY');
818
819 my ($i1,$i2) = defined($marc_indicators->{$to}) ? @{ $marc_indicators->{$to} } : (' ',' ');
820 warn "## marc_original_order($to,$from) source = ", dump( $r ),$/ if ($debug > 1);
821
822 foreach my $d (@$r) {
823
824 if (! defined($d->{subfields}) && ref($d->{subfields}) ne 'ARRAY') {
825 warn "# marc_original_order($to,$from): field $from doesn't have subfields specification\n";
826 next;
827 }
828
829 my @sfs = @{ $d->{subfields} };
830
831 die "field $from doesn't have even number of subfields specifications\n" unless($#sfs % 2 == 1);
832
833 warn "#--> d: ",dump($d), "\n#--> sfs: ",dump(@sfs),$/ if ($debug > 2);
834
835 my $m = [ $to, $i1, $i2 ];
836
837 while (my $sf = shift @sfs) {
838
839 warn "#--> sf: ",dump($sf), $/ if ($debug > 2);
840 my $offset = shift @sfs;
841 die "corrupted sufields specification for field $from\n" unless defined($offset);
842
843 my $v;
844 if (ref($d->{$sf}) eq 'ARRAY') {
845 $v = $d->{$sf}->[$offset] if (defined($d->{$sf}->[$offset]));
846 } elsif ($offset == 0) {
847 $v = $d->{$sf};
848 } else {
849 die "field $from subfield '$sf' need occurence $offset which doesn't exist", dump($d->{$sf});
850 }
851 push @$m, ( $sf, $v ) if (defined($v));
852 }
853
854 if ($#{$m} > 2) {
855 push @{ $marc_record->[ $marc_record_offset ] }, $m;
856 }
857 }
858
859 warn "## marc_record = ", dump( $marc_record ),$/ if ($debug > 1);
860 }
861
862 =head2 marc_count
863
864 Return number of MARC records created using L</marc_duplicate>.
865
866 print "created ", marc_count(), " records";
867
868 =cut
869
870 sub marc_count {
871 return $#{ $marc_record };
872 }
873
874
875 =head1 Functions to extract data from input
876
877 This function should be used inside functions to create C<data_structure> described
878 above.
879
880 =head2 _pack_subfields_hash
881
882 @subfields = _pack_subfields_hash( $h );
883 $subfields = _pack_subfields_hash( $h, 1 );
884
885 Return each subfield value in array or pack them all together and return scalar
886 with subfields (denoted by C<^>) and values.
887
888 =cut
889
890 sub _pack_subfields_hash {
891
892 warn "## _pack_subfields_hash( ",dump(@_), " )\n" if ($debug > 1);
893
894 my ($h,$include_subfields) = @_;
895
896 # sanity and ease of use
897 return $h if (ref($h) ne 'HASH');
898
899 if ( defined($h->{subfields}) ) {
900 my $sfs = delete $h->{subfields} || die "no subfields?";
901 my @out;
902 while (@$sfs) {
903 my $sf = shift @$sfs;
904 push @out, '^' . $sf if ($include_subfields);
905 my $o = shift @$sfs;
906 if ($o == 0 && ref( $h->{$sf} ) ne 'ARRAY' ) {
907 # single element subfields are not arrays
908 #warn "====> $sf $o / $#$sfs ", dump( $sfs, $h->{$sf} ), "\n";
909
910 push @out, $h->{$sf};
911 } else {
912 #warn "====> $sf $o / $#$sfs ", dump( $sfs, $h->{$sf} ), "\n";
913 push @out, $h->{$sf}->[$o];
914 }
915 }
916 if ($include_subfields) {
917 return join('', @out);
918 } else {
919 return @out;
920 }
921 } else {
922 if ($include_subfields) {
923 my $out = '';
924 foreach my $sf (sort keys %$h) {
925 if (ref($h->{$sf}) eq 'ARRAY') {
926 $out .= '^' . $sf . join('^' . $sf, @{ $h->{$sf} });
927 } else {
928 $out .= '^' . $sf . $h->{$sf};
929 }
930 }
931 return $out;
932 } else {
933 # FIXME this should probably be in alphabetical order instead of hash order
934 values %{$h};
935 }
936 }
937 }
938
939 =head2 rec1
940
941 Return all values in some field
942
943 @v = rec1('200')
944
945 TODO: order of values is probably same as in source data, need to investigate that
946
947 =cut
948
949 sub rec1 {
950 my $f = shift;
951 warn "rec1($f) = ", dump( $rec->{$f} ), $/ if ($debug > 1);
952 return unless (defined($rec) && defined($rec->{$f}));
953 warn "rec1($f) = ", dump( $rec->{$f} ), $/ if ($debug > 1);
954 if (ref($rec->{$f}) eq 'ARRAY') {
955 my @out;
956 foreach my $h ( @{ $rec->{$f} } ) {
957 if (ref($h) eq 'HASH') {
958 push @out, ( _pack_subfields_hash( $h ) );
959 } else {
960 push @out, $h;
961 }
962 }
963 return @out;
964 } elsif( defined($rec->{$f}) ) {
965 return $rec->{$f};
966 }
967 }
968
969 =head2 rec2
970
971 Return all values in specific field and subfield
972
973 @v = rec2('200','a')
974
975 =cut
976
977 sub rec2 {
978 my $f = shift;
979 return unless (defined($rec && $rec->{$f}));
980 my $sf = shift;
981 warn "rec2($f,$sf) = ", dump( $rec->{$f} ), $/ if ($debug > 1);
982 return map {
983 if (ref($_->{$sf}) eq 'ARRAY') {
984 @{ $_->{$sf} };
985 } else {
986 $_->{$sf};
987 }
988 } grep { ref($_) eq 'HASH' && $_->{$sf} } @{ $rec->{$f} };
989 }
990
991 =head2 rec
992
993 syntaxtic sugar for
994
995 @v = rec('200')
996 @v = rec('200','a')
997
998 If rec() returns just single value, it will
999 return scalar, not array.
1000
1001 =cut
1002
1003 sub rec {
1004 my @out;
1005 if ($#_ == 0) {
1006 @out = rec1(@_);
1007 } elsif ($#_ == 1) {
1008 @out = rec2(@_);
1009 }
1010 if ($#out == 0 && ! wantarray) {
1011 return $out[0];
1012 } elsif (@out) {
1013 return @out;
1014 } else {
1015 return '';
1016 }
1017 }
1018
1019 =head2 frec
1020
1021 Returns first value from field
1022
1023 $v = frec('200');
1024 $v = frec('200','a');
1025
1026 =cut
1027
1028 sub frec {
1029 my @out = rec(@_);
1030 warn "rec(",dump(@_),") has more than one return value, ignoring\n" if $#out > 0;
1031 return shift @out;
1032 }
1033
1034 =head2 regex
1035
1036 Apply regex to some or all values
1037
1038 @v = regex( 's/foo/bar/g', @v );
1039
1040 =cut
1041
1042 sub regex {
1043 my $r = shift;
1044 my @out;
1045 #warn "r: $r\n", dump(\@_);
1046 foreach my $t (@_) {
1047 next unless ($t);
1048 eval "\$t =~ $r";
1049 push @out, $t if ($t && $t ne '');
1050 }
1051 return @out;
1052 }
1053
1054 =head2 prefix
1055
1056 Prefix all values with a string
1057
1058 @v = prefix( 'my_', @v );
1059
1060 =cut
1061
1062 sub prefix {
1063 my $p = shift;
1064 return @_ unless defined( $p );
1065 return map { $p . $_ } grep { defined($_) } @_;
1066 }
1067
1068 =head2 suffix
1069
1070 suffix all values with a string
1071
1072 @v = suffix( '_my', @v );
1073
1074 =cut
1075
1076 sub suffix {
1077 my $s = shift;
1078 return @_ unless defined( $s );
1079 return map { $_ . $s } grep { defined($_) } @_;
1080 }
1081
1082 =head2 surround
1083
1084 surround all values with a two strings
1085
1086 @v = surround( 'prefix_', '_suffix', @v );
1087
1088 =cut
1089
1090 sub surround {
1091 my $p = shift;
1092 my $s = shift;
1093 $p = '' unless defined( $p );
1094 $s = '' unless defined( $s );
1095 return map { $p . $_ . $s } grep { defined($_) } @_;
1096 }
1097
1098 =head2 first
1099
1100 Return first element
1101
1102 $v = first( @v );
1103
1104 =cut
1105
1106 sub first {
1107 my $r = shift;
1108 return $r;
1109 }
1110
1111 =head2 lookup
1112
1113 Consult lookup hashes for some value
1114
1115 @v = lookup(
1116 sub {
1117 'ffkk/peri/mfn'.rec('000')
1118 },
1119 'ffkk','peri','200-a-200-e',
1120 sub {
1121 first(rec(200,'a')).' '.first(rec('200','e'))
1122 }
1123 );
1124
1125 Code like above will be B<automatically generated> using L<WebPAC::Parse> from
1126 normal lookup definition in C<conf/lookup/something.pl> which looks like:
1127
1128 lookup(
1129 # which results to return from record recorded in lookup
1130 sub { 'ffkk/peri/mfn' . rec('000') },
1131 # from which database and input
1132 'ffkk','peri',
1133 # such that following values match
1134 sub { first(rec(200,'a')) . ' ' . first(rec('200','e')) },
1135 # if this part is missing, we will try to match same fields
1136 # from lookup record and current one, or you can override
1137 # which records to use from current record using
1138 sub { rec('900','x') . ' ' . rec('900','y') },
1139 )
1140
1141 You can think about this lookup as SQL (if that helps):
1142
1143 select
1144 sub { what }
1145 from
1146 database, input
1147 where
1148 sub { filter from lookuped record }
1149 having
1150 sub { optional filter on current record }
1151
1152 Easy as pie, right?
1153
1154 =cut
1155
1156 sub lookup {
1157 my ($what, $database, $input, $key, $having) = @_;
1158
1159 confess "lookup needs 5 arguments: what, database, input, key, having\n" unless ($#_ == 4);
1160
1161 warn "## lookup ($database, $input, $key)", $/ if ($debug > 1);
1162 return unless (defined($lookup->{$database}->{$input}->{$key}));
1163
1164 confess "lookup really need load_row_coderef added to data_structure\n" unless ($load_row_coderef);
1165
1166 my $mfns;
1167 my @having = $having->();
1168
1169 warn "## having = ", dump( @having ) if ($debug > 2);
1170
1171 foreach my $h ( @having ) {
1172 if (defined($lookup->{$database}->{$input}->{$key}->{$h})) {
1173 warn "lookup for $database/$input/$key/$h return ",dump($lookup->{$database}->{$input}->{$key}->{$h}),"\n" if ($debug);
1174 $mfns->{$_}++ foreach keys %{ $lookup->{$database}->{$input}->{$key}->{$h} };
1175 }
1176 }
1177
1178 return unless ($mfns);
1179
1180 my @mfns = sort keys %$mfns;
1181
1182 warn "# lookup loading $database/$input/$key mfn ", join(",",@mfns)," having ",dump(@having),"\n" if ($debug);
1183
1184 my $old_rec = $rec;
1185 my @out;
1186
1187 foreach my $mfn (@mfns) {
1188 $rec = $load_row_coderef->( $database, $input, $mfn );
1189
1190 warn "got $database/$input/$mfn = ", dump($rec), $/ if ($debug);
1191
1192 my @vals = $what->();
1193
1194 push @out, ( @vals );
1195
1196 warn "lookup for mfn $mfn returned ", dump(@vals), $/ if ($debug);
1197 }
1198
1199 # if (ref($lookup->{$k}) eq 'ARRAY') {
1200 # return @{ $lookup->{$k} };
1201 # } else {
1202 # return $lookup->{$k};
1203 # }
1204
1205 $rec = $old_rec;
1206
1207 warn "## lookup returns = ", dump(@out), $/ if ($debug);
1208
1209 if ($#out == 0) {
1210 return $out[0];
1211 } else {
1212 return @out;
1213 }
1214 }
1215
1216 =head2 save_into_lookup
1217
1218 Save value into lookup. It associates current database, input
1219 and specific keys with one or more values which will be
1220 associated over MFN.
1221
1222 MFN will be extracted from first occurence current of field 000
1223 in current record, or if it doesn't exist from L<_set_config> C<_mfn>.
1224
1225 my $nr = save_into_lookup($database,$input,$key,sub {
1226 # code which produce one or more values
1227 });
1228
1229 It returns number of items saved.
1230
1231 This function shouldn't be called directly, it's called from code created by
1232 L<WebPAC::Parser>.
1233
1234 =cut
1235
1236 sub save_into_lookup {
1237 my ($database,$input,$key,$coderef) = @_;
1238 die "save_into_lookup needs database" unless defined($database);
1239 die "save_into_lookup needs input" unless defined($input);
1240 die "save_into_lookup needs key" unless defined($key);
1241 die "save_into_lookup needs CODE" unless ( defined($coderef) && ref($coderef) eq 'CODE' );
1242
1243 warn "## save_into_lookup rec = ", dump($rec), " config = ", dump($config), $/ if ($debug > 2);
1244
1245 my $mfn =
1246 defined($rec->{'000'}->[0]) ? $rec->{'000'}->[0] :
1247 defined($config->{_mfn}) ? $config->{_mfn} :
1248 die "mfn not defined or zero";
1249
1250 my $nr = 0;
1251
1252 foreach my $v ( $coderef->() ) {
1253 $lookup->{$database}->{$input}->{$key}->{$v}->{$mfn}++;
1254 warn "# saved lookup $database/$input/$key [$v] $mfn\n" if ($debug > 1);
1255 $nr++;
1256 }
1257
1258 return $nr;
1259 }
1260
1261 =head2 config
1262
1263 Consult config values stored in C<config.yml>
1264
1265 # return database code (key under databases in yaml)
1266 $database_code = config(); # use _ from hash
1267 $database_name = config('name');
1268 $database_input_name = config('input name');
1269
1270 Up to three levels are supported.
1271
1272 =cut
1273
1274 sub config {
1275 return unless ($config);
1276
1277 my $p = shift;
1278
1279 $p ||= '';
1280
1281 my $v;
1282
1283 warn "### getting config($p)\n" if ($debug > 1);
1284
1285 my @p = split(/\s+/,$p);
1286 if ($#p < 0) {
1287 $v = $config->{ '_' }; # special, database code
1288 } else {
1289
1290 my $c = dclone( $config );
1291
1292 foreach my $k (@p) {
1293 warn "### k: $k c = ",dump($c),$/ if ($debug > 1);
1294 if (ref($c) eq 'ARRAY') {
1295 $c = shift @$c;
1296 warn "config($p) taking first occurence of '$k', probably not what you wanted!\n";
1297 last;
1298 }
1299
1300 if (! defined($c->{$k}) ) {
1301 $c = undef;
1302 last;
1303 } else {
1304 $c = $c->{$k};
1305 }
1306 }
1307 $v = $c if ($c);
1308
1309 }
1310
1311 warn "## config( '$p' ) = ",dump( $v ),$/ if ($v && $debug);
1312 warn "config( '$p' ) is empty\n" if (! $v);
1313
1314 return $v;
1315 }
1316
1317 =head2 id
1318
1319 Returns unique id of this record
1320
1321 $id = id();
1322
1323 Returns C<42/2> for 2nd occurence of MFN 42.
1324
1325 =cut
1326
1327 sub id {
1328 my $mfn = $config->{_mfn} || die "no _mfn in config data";
1329 return $mfn . $#{$marc_record} ? $#{$marc_record} + 1 : '';
1330 }
1331
1332 =head2 join_with
1333
1334 Joins walues with some delimiter
1335
1336 $v = join_with(", ", @v);
1337
1338 =cut
1339
1340 sub join_with {
1341 my $d = shift;
1342 warn "### join_with('$d',",dump(@_),")\n" if ($debug > 2);
1343 my $v = join($d, grep { defined($_) && $_ ne '' } @_);
1344 return '' unless defined($v);
1345 return $v;
1346 }
1347
1348 =head2 split_rec_on
1349
1350 Split record subfield on some regex and take one of parts out
1351
1352 $a_before_semi_column =
1353 split_rec_on('200','a', /\s*;\s*/, $part);
1354
1355 C<$part> is optional number of element. First element is
1356 B<1>, not 0!
1357
1358 If there is no C<$part> parameter or C<$part> is 0, this function will
1359 return all values produced by splitting.
1360
1361 =cut
1362
1363 sub split_rec_on {
1364 die "split_rec_on need (fld,sf,regex[,part]" if ($#_ < 2);
1365
1366 my ($fld, $sf, $regex, $part) = @_;
1367 warn "### regex ", ref($regex), $regex, $/ if ($debug > 2);
1368
1369 my @r = rec( $fld, $sf );
1370 my $v = shift @r;
1371 warn "### first rec($fld,$sf) = ",dump($v),$/ if ($debug > 2);
1372
1373 return '' if ( ! defined($v) || $v =~ /^\s*$/);
1374
1375 my @s = split( $regex, $v );
1376 warn "## split_rec_on($fld,$sf,$regex,$part) = ",dump(@s),$/ if ($debug > 1);
1377 if ($part && $part > 0) {
1378 return $s[ $part - 1 ];
1379 } else {
1380 return @s;
1381 }
1382 }
1383
1384 my $hash;
1385
1386 =head2 set
1387
1388 set( key => 'value' );
1389
1390 =cut
1391
1392 sub set {
1393 my ($k,$v) = @_;
1394 warn "## set ( $k => ", dump($v), " )", $/ if ( $debug );
1395 $hash->{$k} = $v;
1396 };
1397
1398 =head2 get
1399
1400 get( 'key' );
1401
1402 =cut
1403
1404 sub get {
1405 my $k = shift || return;
1406 my $v = $hash->{$k};
1407 warn "## get $k = ", dump( $v ), $/ if ( $debug );
1408 return $v;
1409 }
1410
1411 =head2 count
1412
1413 if ( count( @result ) == 1 ) {
1414 # do something if only 1 result is there
1415 }
1416
1417 =cut
1418
1419 sub count {
1420 warn "## count ",dump(@_),$/ if ( $debug );
1421 return @_ . '';
1422 }
1423
1424 # END
1425 1;

  ViewVC Help
Powered by ViewVC 1.1.26