/[webpac2]/trunk/lib/WebPAC/Normalize.pm
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Contents of /trunk/lib/WebPAC/Normalize.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1011 - (show annotations)
Tue Nov 6 20:26:31 2007 UTC (16 years, 5 months ago) by dpavlin
File size: 30884 byte(s)
added no warnings 'redefine'; so we can define subs in normalization perl
without annoying warnings.

1 package WebPAC::Normalize;
2 use Exporter 'import';
3 our @EXPORT = qw/
4 _set_ds _set_lookup
5 _set_load_row
6 _get_ds _clean_ds
7 _debug
8 _pack_subfields_hash
9
10 search_display search display sorted
11
12 marc marc_indicators marc_repeatable_subfield
13 marc_compose marc_leader marc_fixed
14 marc_duplicate marc_remove marc_count
15 marc_original_order
16
17 rec1 rec2 rec
18 frec
19 regex prefix suffix surround
20 first lookup join_with
21 save_into_lookup
22
23 split_rec_on
24
25 get set
26 count
27
28 /;
29
30 use warnings;
31 use strict;
32
33 #use base qw/WebPAC::Common/;
34 use Data::Dump qw/dump/;
35 use Storable qw/dclone/;
36 use Carp qw/confess/;
37
38 # debugging warn(s)
39 my $debug = 0;
40
41 use WebPAC::Normalize::ISBN;
42 push @EXPORT, ( 'isbn_10', 'isbn_13' );
43
44 =head1 NAME
45
46 WebPAC::Normalize - describe normalisaton rules using sets
47
48 =cut
49
50 our $VERSION = '0.32';
51
52 =head1 SYNOPSIS
53
54 This module uses C<conf/normalize/*.pl> files to perform normalisation
55 from input records using perl functions which are specialized for set
56 processing.
57
58 Sets are implemented as arrays, and normalisation file is valid perl, which
59 means that you check it's validity before running WebPAC using
60 C<perl -c normalize.pl>.
61
62 Normalisation can generate multiple output normalized data. For now, supported output
63 types (on the left side of definition) are: C<search_display>, C<display>, C<search> and
64 C<marc>.
65
66 =head1 FUNCTIONS
67
68 Functions which start with C<_> are private and used by WebPAC internally.
69 All other functions are available for use within normalisation rules.
70
71 =head2 data_structure
72
73 Return data structure
74
75 my $ds = WebPAC::Normalize::data_structure(
76 lookup => $lookup_hash,
77 row => $row,
78 rules => $normalize_pl_config,
79 marc_encoding => 'utf-8',
80 config => $config,
81 load_row_coderef => sub {
82 my ($database,$input,$mfn) = @_;
83 $store->load_row( database => $database, input => $input, id => $mfn );
84 },
85 );
86
87 Options C<row>, C<rules> and C<log> are mandatory while all
88 other are optional.
89
90 C<load_row_coderef> is closure only used when executing lookups, so they will
91 die if it's not defined.
92
93 This function will B<die> if normalizastion can't be evaled.
94
95 Since this function isn't exported you have to call it with
96 C<WebPAC::Normalize::data_structure>.
97
98 =cut
99
100 my $load_row_coderef;
101
102 sub data_structure {
103 my $arg = {@_};
104
105 die "need row argument" unless ($arg->{row});
106 die "need normalisation argument" unless ($arg->{rules});
107
108 _set_lookup( $arg->{lookup} ) if defined($arg->{lookup});
109 _set_ds( $arg->{row} );
110 _set_config( $arg->{config} ) if defined($arg->{config});
111 _clean_ds( %{ $arg } );
112 $load_row_coderef = $arg->{load_row_coderef};
113
114 no strict 'subs';
115 no warnings 'redefine';
116 eval "$arg->{rules};";
117 die "error evaling $arg->{rules}: $@\n" if ($@);
118
119 return _get_ds();
120 }
121
122 =head2 _set_ds
123
124 Set current record hash
125
126 _set_ds( $rec );
127
128 =cut
129
130 my $rec;
131
132 sub _set_ds {
133 $rec = shift or die "no record hash";
134 }
135
136 =head2 _set_config
137
138 Set current config hash
139
140 _set_config( $config );
141
142 Magic keys are:
143
144 =over 4
145
146 =item _
147
148 Code of current database
149
150 =item _mfn
151
152 Current MFN
153
154 =back
155
156 =cut
157
158 my $config;
159
160 sub _set_config {
161 $config = shift;
162 }
163
164 =head2 _get_ds
165
166 Return hash formatted as data structure
167
168 my $ds = _get_ds();
169
170 =cut
171
172 my ($out, $marc_record, $marc_encoding, $marc_repeatable_subfield, $marc_indicators, $marc_leader);
173 my ($marc_record_offset, $marc_fetch_offset) = (0, 0);
174
175 sub _get_ds {
176 #warn "## out = ",dump($out);
177 return $out;
178 }
179
180 =head2 _clean_ds
181
182 Clean data structure hash for next record
183
184 _clean_ds();
185
186 =cut
187
188 sub _clean_ds {
189 my $a = {@_};
190 ($out,$marc_record, $marc_encoding, $marc_repeatable_subfield, $marc_indicators, $marc_leader) = ();
191 ($marc_record_offset, $marc_fetch_offset) = (0,0);
192 $marc_encoding = $a->{marc_encoding};
193 }
194
195 =head2 _set_lookup
196
197 Set current lookup hash
198
199 _set_lookup( $lookup );
200
201 =cut
202
203 my $lookup;
204
205 sub _set_lookup {
206 $lookup = shift;
207 }
208
209 =head2 _get_lookup
210
211 Get current lookup hash
212
213 my $lookup = _get_lookup();
214
215 =cut
216
217 sub _get_lookup {
218 return $lookup;
219 }
220
221 =head2 _set_load_row
222
223 Setup code reference which will return L<data_structure> from
224 L<WebPAC::Store>
225
226 _set_load_row(sub {
227 my ($database,$input,$mfn) = @_;
228 $store->load_row( database => $database, input => $input, id => $mfn );
229 });
230
231 =cut
232
233 sub _set_load_row {
234 my $coderef = shift;
235 confess "argument isn't CODE" unless ref($coderef) eq 'CODE';
236
237 $load_row_coderef = $coderef;
238 }
239
240 =head2 _get_marc_fields
241
242 Get all fields defined by calls to C<marc>
243
244 $marc->add_fields( WebPAC::Normalize:_get_marc_fields() );
245
246 We are using I<magic> which detect repeatable fields only from
247 sequence of field/subfield data generated by normalization.
248
249 Repeatable field is created when there is second occurence of same subfield or
250 if any of indicators are different.
251
252 This is sane for most cases. Something like:
253
254 900a-1 900b-1 900c-1
255 900a-2 900b-2
256 900a-3
257
258 will be created from any combination of:
259
260 900a-1 900a-2 900a-3 900b-1 900b-2 900c-1
261
262 and following rules:
263
264 marc('900','a', rec('200','a') );
265 marc('900','b', rec('200','b') );
266 marc('900','c', rec('200','c') );
267
268 which might not be what you have in mind. If you need repeatable subfield,
269 define it using C<marc_repeatable_subfield> like this:
270
271 marc_repeatable_subfield('900','a');
272 marc('900','a', rec('200','a') );
273 marc('900','b', rec('200','b') );
274 marc('900','c', rec('200','c') );
275
276 will create:
277
278 900a-1 900a-2 900a-3 900b-1 900c-1
279 900b-2
280
281 There is also support for returning next or specific using:
282
283 while (my $mf = WebPAC::Normalize:_get_marc_fields( fetch_next => 1 ) ) {
284 # do something with $mf
285 }
286
287 will always return fields from next MARC record or
288
289 my $mf = WebPAC::Normalize::_get_marc_fields( offset => 42 );
290
291 will return 42th copy record (if it exists).
292
293 =cut
294
295 my $fetch_pos;
296
297 sub _get_marc_fields {
298
299 my $arg = {@_};
300 warn "### _get_marc_fields arg: ", dump($arg), $/ if ($debug > 2);
301 $fetch_pos = $marc_fetch_offset;
302 if ($arg->{offset}) {
303 $fetch_pos = $arg->{offset};
304 } elsif($arg->{fetch_next}) {
305 $marc_fetch_offset++;
306 }
307
308 return if (! $marc_record || ref($marc_record) ne 'ARRAY');
309
310 warn "### full marc_record = ", dump( @{ $marc_record }), $/ if ($debug > 2);
311
312 my $marc_rec = $marc_record->[ $fetch_pos ];
313
314 warn "## _get_marc_fields (at offset: $fetch_pos) -- marc_record = ", dump( @$marc_rec ), $/ if ($debug > 1);
315
316 return if (! $marc_rec || ref($marc_rec) ne 'ARRAY' || $#{ $marc_rec } < 0);
317
318 # first, sort all existing fields
319 # XXX might not be needed, but modern perl might randomize elements in hash
320 my @sorted_marc_record = sort {
321 $a->[0] . ( $a->[3] || '' ) cmp $b->[0] . ( $b->[3] || '')
322 } @{ $marc_rec };
323
324 @sorted_marc_record = @{ $marc_rec }; ### FIXME disable sorting
325
326 # output marc fields
327 my @m;
328
329 # count unique field-subfields (used for offset when walking to next subfield)
330 my $u;
331 map { $u->{ $_->[0] . ( $_->[3] || '') }++ } @sorted_marc_record;
332
333 if ($debug) {
334 warn "## marc_repeatable_subfield = ", dump( $marc_repeatable_subfield ), $/ if ( $marc_repeatable_subfield );
335 warn "## marc_record[$fetch_pos] = ", dump( $marc_rec ), $/;
336 warn "## sorted_marc_record = ", dump( \@sorted_marc_record ), $/;
337 warn "## subfield count = ", dump( $u ), $/;
338 }
339
340 my $len = $#sorted_marc_record;
341 my $visited;
342 my $i = 0;
343 my $field;
344
345 foreach ( 0 .. $len ) {
346
347 # find next element which isn't visited
348 while ($visited->{$i}) {
349 $i = ($i + 1) % ($len + 1);
350 }
351
352 # mark it visited
353 $visited->{$i}++;
354
355 my $row = dclone( $sorted_marc_record[$i] );
356
357 # field and subfield which is key for
358 # marc_repeatable_subfield and u
359 my $fsf = $row->[0] . ( $row->[3] || '' );
360
361 if ($debug > 1) {
362
363 print "### field so far [", $#$field, "] : ", dump( $field ), " ", $field ? 'T' : 'F', $/;
364 print "### this [$i]: ", dump( $row ),$/;
365 print "### sf: ", $row->[3], " vs ", $field->[3],
366 $marc_repeatable_subfield->{ $row->[0] . $row->[3] } ? ' (repeatable)' : '', $/,
367 if ($#$field >= 0);
368
369 }
370
371 # if field exists
372 if ( $#$field >= 0 ) {
373 if (
374 $row->[0] ne $field->[0] || # field
375 $row->[1] ne $field->[1] || # i1
376 $row->[2] ne $field->[2] # i2
377 ) {
378 push @m, $field;
379 warn "## saved/1 ", dump( $field ),$/ if ($debug);
380 $field = $row;
381
382 } elsif (
383 ( $row->[3] lt $field->[-2] ) # subfield which is not next (e.g. a after c)
384 ||
385 ( $row->[3] eq $field->[-2] && # same subfield, but not repeatable
386 ! $marc_repeatable_subfield->{ $fsf }
387 )
388 ) {
389 push @m, $field;
390 warn "## saved/2 ", dump( $field ),$/ if ($debug);
391 $field = $row;
392
393 } else {
394 # append new subfields to existing field
395 push @$field, ( $row->[3], $row->[4] );
396 }
397 } else {
398 # insert first field
399 $field = $row;
400 }
401
402 if (! $marc_repeatable_subfield->{ $fsf }) {
403 # make step to next subfield
404 $i = ($i + $u->{ $fsf } ) % ($len + 1);
405 }
406 }
407
408 if ($#$field >= 0) {
409 push @m, $field;
410 warn "## saved/3 ", dump( $field ),$/ if ($debug);
411 }
412
413 return \@m;
414 }
415
416 =head2 _get_marc_leader
417
418 Return leader from currently fetched record by L</_get_marc_fields>
419
420 print WebPAC::Normalize::_get_marc_leader();
421
422 =cut
423
424 sub _get_marc_leader {
425 die "no fetch_pos, did you called _get_marc_fields first?" unless ( defined( $fetch_pos ) );
426 return $marc_leader->[ $fetch_pos ];
427 }
428
429 =head2 _debug
430
431 Change level of debug warnings
432
433 _debug( 2 );
434
435 =cut
436
437 sub _debug {
438 my $l = shift;
439 return $debug unless defined($l);
440 warn "debug level $l",$/ if ($l > 0);
441 $debug = $l;
442 }
443
444 =head1 Functions to create C<data_structure>
445
446 Those functions generally have to first in your normalization file.
447
448 =head2 search_display
449
450 Define output for L<search> and L<display> at the same time
451
452 search_display('Title', rec('200','a') );
453
454
455 =cut
456
457 sub search_display {
458 my $name = shift or die "search_display needs name as first argument";
459 my @o = grep { defined($_) && $_ ne '' } @_;
460 return unless (@o);
461 $out->{$name}->{search} = \@o;
462 $out->{$name}->{display} = \@o;
463 }
464
465 =head2 tag
466
467 Old name for L<search_display>, but supported
468
469 =cut
470
471 sub tag {
472 search_display( @_ );
473 }
474
475 =head2 display
476
477 Define output just for I<display>
478
479 @v = display('Title', rec('200','a') );
480
481 =cut
482
483 sub _field {
484 my $type = shift or confess "need type -- BUG?";
485 my $name = shift or confess "needs name as first argument";
486 my @o = grep { defined($_) && $_ ne '' } @_;
487 return unless (@o);
488 $out->{$name}->{$type} = \@o;
489 }
490
491 sub display { _field( 'display', @_ ) }
492
493 =head2 search
494
495 Prepare values just for I<search>
496
497 @v = search('Title', rec('200','a') );
498
499 =cut
500
501 sub search { _field( 'search', @_ ) }
502
503 =head2 sorted
504
505 Insert into lists which will be automatically sorted
506
507 sorted('Title', rec('200','a') );
508
509 =cut
510
511 sub sorted { _field( 'sorted', @_ ) }
512
513
514 =head2 marc_leader
515
516 Setup fields within MARC leader or get leader
517
518 marc_leader('05','c');
519 my $leader = marc_leader();
520
521 =cut
522
523 sub marc_leader {
524 my ($offset,$value) = @_;
525
526 if ($offset) {
527 $marc_leader->[ $marc_record_offset ]->{ $offset } = $value;
528 } else {
529
530 if (defined($marc_leader)) {
531 die "marc_leader not array = ", dump( $marc_leader ) unless (ref($marc_leader) eq 'ARRAY');
532 return $marc_leader->[ $marc_record_offset ];
533 } else {
534 return;
535 }
536 }
537 }
538
539 =head2 marc_fixed
540
541 Create control/indentifier fields with values in fixed positions
542
543 marc_fixed('008', 00, '070402');
544 marc_fixed('008', 39, '|');
545
546 Positions not specified will be filled with spaces (C<0x20>).
547
548 There will be no effort to extend last specified value to full length of
549 field in standard.
550
551 =cut
552
553 sub marc_fixed {
554 my ($f, $pos, $val) = @_;
555 die "need marc(field, position, value)" unless defined($f) && defined($pos);
556
557 confess "need val" unless defined $val;
558
559 my $update = 0;
560
561 map {
562 if ($_->[0] eq $f) {
563 my $old = $_->[1];
564 if (length($old) <= $pos) {
565 $_->[1] .= ' ' x ( $pos - length($old) ) . $val;
566 warn "## marc_fixed($f,$pos,'$val') append '$old' -> '$_->[1]'\n" if ($debug > 1);
567 } else {
568 $_->[1] = substr($old, 0, $pos) . $val . substr($old, $pos + length($val));
569 warn "## marc_fixed($f,$pos,'$val') update '$old' -> '$_->[1]'\n" if ($debug > 1);
570 }
571 $update++;
572 }
573 } @{ $marc_record->[ $marc_record_offset ] };
574
575 if (! $update) {
576 my $v = ' ' x $pos . $val;
577 push @{ $marc_record->[ $marc_record_offset ] }, [ $f, $v ];
578 warn "## marc_fixed($f,$pos,'val') created '$v'\n" if ($debug > 1);
579 }
580 }
581
582 =head2 marc
583
584 Save value for MARC field
585
586 marc('900','a', rec('200','a') );
587 marc('001', rec('000') );
588
589 =cut
590
591 sub marc {
592 my $f = shift or die "marc needs field";
593 die "marc field must be numer" unless ($f =~ /^\d+$/);
594
595 my $sf;
596 if ($f >= 10) {
597 $sf = shift or die "marc needs subfield";
598 }
599
600 foreach (@_) {
601 my $v = $_; # make var read-write for Encode
602 next unless (defined($v) && $v !~ /^\s*$/);
603 my ($i1,$i2) = defined($marc_indicators->{$f}) ? @{ $marc_indicators->{$f} } : (' ',' ');
604 if (defined $sf) {
605 push @{ $marc_record->[ $marc_record_offset ] }, [ $f, $i1, $i2, $sf => $v ];
606 } else {
607 push @{ $marc_record->[ $marc_record_offset ] }, [ $f, $v ];
608 }
609 }
610 }
611
612 =head2 marc_repeatable_subfield
613
614 Save values for MARC repetable subfield
615
616 marc_repeatable_subfield('910', 'z', rec('909') );
617
618 =cut
619
620 sub marc_repeatable_subfield {
621 my ($f,$sf) = @_;
622 die "marc_repeatable_subfield need field and subfield!\n" unless ($f && $sf);
623 $marc_repeatable_subfield->{ $f . $sf }++;
624 marc(@_);
625 }
626
627 =head2 marc_indicators
628
629 Set both indicators for MARC field
630
631 marc_indicators('900', ' ', 1);
632
633 Any indicator value other than C<0-9> will be treated as undefined.
634
635 =cut
636
637 sub marc_indicators {
638 my $f = shift || die "marc_indicators need field!\n";
639 my ($i1,$i2) = @_;
640 die "marc_indicators($f, ...) need i1!\n" unless(defined($i1));
641 die "marc_indicators($f, $i1, ...) need i2!\n" unless(defined($i2));
642
643 $i1 = ' ' if ($i1 !~ /^\d$/);
644 $i2 = ' ' if ($i2 !~ /^\d$/);
645 @{ $marc_indicators->{$f} } = ($i1,$i2);
646 }
647
648 =head2 marc_compose
649
650 Save values for each MARC subfield explicitly
651
652 marc_compose('900',
653 'a', rec('200','a')
654 'b', rec('201','a')
655 'a', rec('200','b')
656 'c', rec('200','c')
657 );
658
659 If you specify C<+> for subfield, value will be appended
660 to previous defined subfield.
661
662 =cut
663
664 sub marc_compose {
665 my $f = shift or die "marc_compose needs field";
666 die "marc_compose field must be numer" unless ($f =~ /^\d+$/);
667
668 my ($i1,$i2) = defined($marc_indicators->{$f}) ? @{ $marc_indicators->{$f} } : (' ',' ');
669 my $m = [ $f, $i1, $i2 ];
670
671 warn "### marc_compose input subfields = ", dump(@_),$/ if ($debug > 2);
672
673 if ($#_ % 2 != 1) {
674 die "ERROR: marc_compose",dump($f,@_)," not valid (must be even).\nDo you need to add first() or join() around some argument?\n";
675 }
676
677 while (@_) {
678 my $sf = shift;
679 my $v = shift;
680
681 next unless (defined($v) && $v !~ /^\s*$/);
682 warn "## ++ marc_compose($f,$sf,$v) ", dump( $m ),$/ if ($debug > 1);
683 if ($sf ne '+') {
684 push @$m, ( $sf, $v );
685 } else {
686 $m->[ $#$m ] .= $v;
687 }
688 }
689
690 warn "## marc_compose current marc = ", dump( $m ),$/ if ($debug > 1);
691
692 push @{ $marc_record->[ $marc_record_offset ] }, $m if ($#{$m} > 2);
693 }
694
695 =head2 marc_duplicate
696
697 Generate copy of current MARC record and continue working on copy
698
699 marc_duplicate();
700
701 Copies can be accessed using C<< _get_marc_fields( fetch_next => 1 ) >> or
702 C<< _get_marc_fields( offset => 42 ) >>.
703
704 =cut
705
706 sub marc_duplicate {
707 my $m = $marc_record->[ -1 ];
708 die "can't duplicate record which isn't defined" unless ($m);
709 push @{ $marc_record }, dclone( $m );
710 push @{ $marc_leader }, dclone( marc_leader() );
711 warn "## marc_duplicate = ", dump(@$marc_leader, @$marc_record), $/ if ($debug > 1);
712 $marc_record_offset = $#{ $marc_record };
713 warn "## marc_record_offset = $marc_record_offset", $/ if ($debug > 1);
714
715 }
716
717 =head2 marc_remove
718
719 Remove some field or subfield from MARC record.
720
721 marc_remove('200');
722 marc_remove('200','a');
723
724 This will erase field C<200> or C<200^a> from current MARC record.
725
726 marc_remove('*');
727
728 Will remove all fields in current MARC record.
729
730 This is useful after calling C<marc_duplicate> or on it's own (but, you
731 should probably just remove that subfield definition if you are not
732 using C<marc_duplicate>).
733
734 FIXME: support fields < 10.
735
736 =cut
737
738 sub marc_remove {
739 my ($f, $sf) = @_;
740
741 die "marc_remove needs record number" unless defined($f);
742
743 my $marc = $marc_record->[ $marc_record_offset ];
744
745 warn "### marc_remove before = ", dump( $marc ), $/ if ($debug > 2);
746
747 if ($f eq '*') {
748
749 delete( $marc_record->[ $marc_record_offset ] );
750 warn "## full marc_record = ", dump( @{ $marc_record }), $/ if ($debug > 1);
751
752 } else {
753
754 my $i = 0;
755 foreach ( 0 .. $#{ $marc } ) {
756 last unless (defined $marc->[$i]);
757 warn "#### working on ",dump( @{ $marc->[$i] }), $/ if ($debug > 3);
758 if ($marc->[$i]->[0] eq $f) {
759 if (! defined $sf) {
760 # remove whole field
761 splice @$marc, $i, 1;
762 warn "#### slice \@\$marc, $i, 1 = ",dump( @{ $marc }), $/ if ($debug > 3);
763 $i--;
764 } else {
765 foreach my $j ( 0 .. (( $#{ $marc->[$i] } - 3 ) / 2) ) {
766 my $o = ($j * 2) + 3;
767 if ($marc->[$i]->[$o] eq $sf) {
768 # remove subfield
769 splice @{$marc->[$i]}, $o, 2;
770 warn "#### slice \@{\$marc->[$i]}, $o, 2 = ", dump( @{ $marc }), $/ if ($debug > 3);
771 # is record now empty?
772 if ($#{ $marc->[$i] } == 2) {
773 splice @$marc, $i, 1;
774 warn "#### slice \@\$marc, $i, 1 = ", dump( @{ $marc }), $/ if ($debug > 3);
775 $i--;
776 };
777 }
778 }
779 }
780 }
781 $i++;
782 }
783
784 warn "### marc_remove($f", $sf ? ",$sf" : "", ") after = ", dump( $marc ), $/ if ($debug > 2);
785
786 $marc_record->[ $marc_record_offset ] = $marc;
787 }
788
789 warn "## full marc_record = ", dump( @{ $marc_record }), $/ if ($debug > 1);
790 }
791
792 =head2 marc_original_order
793
794 Copy all subfields preserving original order to marc field.
795
796 marc_original_order( marc_field_number, original_input_field_number );
797
798 Please note that field numbers are consistent with other commands (marc
799 field number first), but somewhat counter-intuitive (destination and then
800 source).
801
802 You might want to use this command if you are just renaming subfields or
803 using pre-processing modify_record in C<config.yml> and don't need any
804 post-processing or want to preserve order of original subfields.
805
806
807 =cut
808
809 sub marc_original_order {
810
811 my ($to, $from) = @_;
812 die "marc_original_order needs from and to fields\n" unless ($from && $to);
813
814 return unless defined($rec->{$from});
815
816 my $r = $rec->{$from};
817 die "record field $from isn't array\n" unless (ref($r) eq 'ARRAY');
818
819 my ($i1,$i2) = defined($marc_indicators->{$to}) ? @{ $marc_indicators->{$to} } : (' ',' ');
820 warn "## marc_original_order($to,$from) source = ", dump( $r ),$/ if ($debug > 1);
821
822 foreach my $d (@$r) {
823
824 if (! defined($d->{subfields}) && ref($d->{subfields}) ne 'ARRAY') {
825 warn "# marc_original_order($to,$from): field $from doesn't have subfields specification\n";
826 next;
827 }
828
829 my @sfs = @{ $d->{subfields} };
830
831 die "field $from doesn't have even number of subfields specifications\n" unless($#sfs % 2 == 1);
832
833 warn "#--> d: ",dump($d), "\n#--> sfs: ",dump(@sfs),$/ if ($debug > 2);
834
835 my $m = [ $to, $i1, $i2 ];
836
837 while (my $sf = shift @sfs) {
838
839 warn "#--> sf: ",dump($sf), $/ if ($debug > 2);
840 my $offset = shift @sfs;
841 die "corrupted sufields specification for field $from\n" unless defined($offset);
842
843 my $v;
844 if (ref($d->{$sf}) eq 'ARRAY') {
845 $v = $d->{$sf}->[$offset] if (defined($d->{$sf}->[$offset]));
846 } elsif ($offset == 0) {
847 $v = $d->{$sf};
848 } else {
849 die "field $from subfield '$sf' need occurence $offset which doesn't exist", dump($d->{$sf});
850 }
851 push @$m, ( $sf, $v ) if (defined($v));
852 }
853
854 if ($#{$m} > 2) {
855 push @{ $marc_record->[ $marc_record_offset ] }, $m;
856 }
857 }
858
859 warn "## marc_record = ", dump( $marc_record ),$/ if ($debug > 1);
860 }
861
862 =head2 marc_count
863
864 Return number of MARC records created using L</marc_duplicate>.
865
866 print "created ", marc_count(), " records";
867
868 =cut
869
870 sub marc_count {
871 return $#{ $marc_record };
872 }
873
874
875 =head1 Functions to extract data from input
876
877 This function should be used inside functions to create C<data_structure> described
878 above.
879
880 =head2 _pack_subfields_hash
881
882 @subfields = _pack_subfields_hash( $h );
883 $subfields = _pack_subfields_hash( $h, 1 );
884
885 Return each subfield value in array or pack them all together and return scalar
886 with subfields (denoted by C<^>) and values.
887
888 =cut
889
890 sub _pack_subfields_hash {
891
892 warn "## _pack_subfields_hash( ",dump(@_), " )\n" if ($debug > 1);
893
894 my ($h,$include_subfields) = @_;
895
896 # sanity and ease of use
897 return $h if (ref($h) ne 'HASH');
898
899 if ( defined($h->{subfields}) ) {
900 my $sfs = delete $h->{subfields} || die "no subfields?";
901 my @out;
902 while (@$sfs) {
903 my $sf = shift @$sfs;
904 push @out, '^' . $sf if ($include_subfields);
905 my $o = shift @$sfs;
906 if ($o == 0 && ref( $h->{$sf} ) ne 'ARRAY' ) {
907 # single element subfields are not arrays
908 #warn "====> $sf $o / $#$sfs ", dump( $sfs, $h->{$sf} ), "\n";
909
910 push @out, $h->{$sf};
911 } else {
912 #warn "====> $sf $o / $#$sfs ", dump( $sfs, $h->{$sf} ), "\n";
913 push @out, $h->{$sf}->[$o];
914 }
915 }
916 if ($include_subfields) {
917 return join('', @out);
918 } else {
919 return @out;
920 }
921 } else {
922 if ($include_subfields) {
923 my $out = '';
924 foreach my $sf (sort keys %$h) {
925 if (ref($h->{$sf}) eq 'ARRAY') {
926 $out .= '^' . $sf . join('^' . $sf, @{ $h->{$sf} });
927 } else {
928 $out .= '^' . $sf . $h->{$sf};
929 }
930 }
931 return $out;
932 } else {
933 # FIXME this should probably be in alphabetical order instead of hash order
934 values %{$h};
935 }
936 }
937 }
938
939 =head2 rec1
940
941 Return all values in some field
942
943 @v = rec1('200')
944
945 TODO: order of values is probably same as in source data, need to investigate that
946
947 =cut
948
949 sub rec1 {
950 my $f = shift;
951 warn "rec1($f) = ", dump( $rec->{$f} ), $/ if ($debug > 1);
952 return unless (defined($rec) && defined($rec->{$f}));
953 warn "rec1($f) = ", dump( $rec->{$f} ), $/ if ($debug > 1);
954 if (ref($rec->{$f}) eq 'ARRAY') {
955 my @out;
956 foreach my $h ( @{ $rec->{$f} } ) {
957 if (ref($h) eq 'HASH') {
958 push @out, ( _pack_subfields_hash( $h ) );
959 } else {
960 push @out, $h;
961 }
962 }
963 return @out;
964 } elsif( defined($rec->{$f}) ) {
965 return $rec->{$f};
966 }
967 }
968
969 =head2 rec2
970
971 Return all values in specific field and subfield
972
973 @v = rec2('200','a')
974
975 =cut
976
977 sub rec2 {
978 my $f = shift;
979 return unless (defined($rec && $rec->{$f}));
980 my $sf = shift;
981 warn "rec2($f,$sf) = ", dump( $rec->{$f} ), $/ if ($debug > 1);
982 return map {
983 if (ref($_->{$sf}) eq 'ARRAY') {
984 @{ $_->{$sf} };
985 } else {
986 $_->{$sf};
987 }
988 } grep { ref($_) eq 'HASH' && $_->{$sf} } @{ $rec->{$f} };
989 }
990
991 =head2 rec
992
993 syntaxtic sugar for
994
995 @v = rec('200')
996 @v = rec('200','a')
997
998 If rec() returns just single value, it will
999 return scalar, not array.
1000
1001 =cut
1002
1003 sub frec {
1004 my @out = rec(@_);
1005 warn "rec(",dump(@_),") has more than one return value, ignoring\n" if $#out > 0;
1006 return shift @out;
1007 }
1008
1009 sub rec {
1010 my @out;
1011 if ($#_ == 0) {
1012 @out = rec1(@_);
1013 } elsif ($#_ == 1) {
1014 @out = rec2(@_);
1015 }
1016 if ($#out == 0 && ! wantarray) {
1017 return $out[0];
1018 } elsif (@out) {
1019 return @out;
1020 } else {
1021 return '';
1022 }
1023 }
1024
1025 =head2 regex
1026
1027 Apply regex to some or all values
1028
1029 @v = regex( 's/foo/bar/g', @v );
1030
1031 =cut
1032
1033 sub regex {
1034 my $r = shift;
1035 my @out;
1036 #warn "r: $r\n", dump(\@_);
1037 foreach my $t (@_) {
1038 next unless ($t);
1039 eval "\$t =~ $r";
1040 push @out, $t if ($t && $t ne '');
1041 }
1042 return @out;
1043 }
1044
1045 =head2 prefix
1046
1047 Prefix all values with a string
1048
1049 @v = prefix( 'my_', @v );
1050
1051 =cut
1052
1053 sub prefix {
1054 my $p = shift;
1055 return @_ unless defined( $p );
1056 return map { $p . $_ } grep { defined($_) } @_;
1057 }
1058
1059 =head2 suffix
1060
1061 suffix all values with a string
1062
1063 @v = suffix( '_my', @v );
1064
1065 =cut
1066
1067 sub suffix {
1068 my $s = shift;
1069 return @_ unless defined( $s );
1070 return map { $_ . $s } grep { defined($_) } @_;
1071 }
1072
1073 =head2 surround
1074
1075 surround all values with a two strings
1076
1077 @v = surround( 'prefix_', '_suffix', @v );
1078
1079 =cut
1080
1081 sub surround {
1082 my $p = shift;
1083 my $s = shift;
1084 $p = '' unless defined( $p );
1085 $s = '' unless defined( $s );
1086 return map { $p . $_ . $s } grep { defined($_) } @_;
1087 }
1088
1089 =head2 first
1090
1091 Return first element
1092
1093 $v = first( @v );
1094
1095 =cut
1096
1097 sub first {
1098 my $r = shift;
1099 return $r;
1100 }
1101
1102 =head2 lookup
1103
1104 Consult lookup hashes for some value
1105
1106 @v = lookup(
1107 sub {
1108 'ffkk/peri/mfn'.rec('000')
1109 },
1110 'ffkk','peri','200-a-200-e',
1111 sub {
1112 first(rec(200,'a')).' '.first(rec('200','e'))
1113 }
1114 );
1115
1116 Code like above will be B<automatically generated> using L<WebPAC::Parse> from
1117 normal lookup definition in C<conf/lookup/something.pl> which looks like:
1118
1119 lookup(
1120 # which results to return from record recorded in lookup
1121 sub { 'ffkk/peri/mfn' . rec('000') },
1122 # from which database and input
1123 'ffkk','peri',
1124 # such that following values match
1125 sub { first(rec(200,'a')) . ' ' . first(rec('200','e')) },
1126 # if this part is missing, we will try to match same fields
1127 # from lookup record and current one, or you can override
1128 # which records to use from current record using
1129 sub { rec('900','x') . ' ' . rec('900','y') },
1130 )
1131
1132 You can think about this lookup as SQL (if that helps):
1133
1134 select
1135 sub { what }
1136 from
1137 database, input
1138 where
1139 sub { filter from lookuped record }
1140 having
1141 sub { optional filter on current record }
1142
1143 Easy as pie, right?
1144
1145 =cut
1146
1147 sub lookup {
1148 my ($what, $database, $input, $key, $having) = @_;
1149
1150 confess "lookup needs 5 arguments: what, database, input, key, having\n" unless ($#_ == 4);
1151
1152 warn "## lookup ($database, $input, $key)", $/ if ($debug > 1);
1153 return unless (defined($lookup->{$database}->{$input}->{$key}));
1154
1155 confess "lookup really need load_row_coderef added to data_structure\n" unless ($load_row_coderef);
1156
1157 my $mfns;
1158 my @having = $having->();
1159
1160 warn "## having = ", dump( @having ) if ($debug > 2);
1161
1162 foreach my $h ( @having ) {
1163 if (defined($lookup->{$database}->{$input}->{$key}->{$h})) {
1164 warn "lookup for $database/$input/$key/$h return ",dump($lookup->{$database}->{$input}->{$key}->{$h}),"\n" if ($debug);
1165 $mfns->{$_}++ foreach keys %{ $lookup->{$database}->{$input}->{$key}->{$h} };
1166 }
1167 }
1168
1169 return unless ($mfns);
1170
1171 my @mfns = sort keys %$mfns;
1172
1173 warn "# lookup loading $database/$input/$key mfn ", join(",",@mfns)," having ",dump(@having),"\n" if ($debug);
1174
1175 my $old_rec = $rec;
1176 my @out;
1177
1178 foreach my $mfn (@mfns) {
1179 $rec = $load_row_coderef->( $database, $input, $mfn );
1180
1181 warn "got $database/$input/$mfn = ", dump($rec), $/ if ($debug);
1182
1183 my @vals = $what->();
1184
1185 push @out, ( @vals );
1186
1187 warn "lookup for mfn $mfn returned ", dump(@vals), $/ if ($debug);
1188 }
1189
1190 # if (ref($lookup->{$k}) eq 'ARRAY') {
1191 # return @{ $lookup->{$k} };
1192 # } else {
1193 # return $lookup->{$k};
1194 # }
1195
1196 $rec = $old_rec;
1197
1198 warn "## lookup returns = ", dump(@out), $/ if ($debug);
1199
1200 if ($#out == 0) {
1201 return $out[0];
1202 } else {
1203 return @out;
1204 }
1205 }
1206
1207 =head2 save_into_lookup
1208
1209 Save value into lookup. It associates current database, input
1210 and specific keys with one or more values which will be
1211 associated over MFN.
1212
1213 MFN will be extracted from first occurence current of field 000
1214 in current record, or if it doesn't exist from L<_set_config> C<_mfn>.
1215
1216 my $nr = save_into_lookup($database,$input,$key,sub {
1217 # code which produce one or more values
1218 });
1219
1220 It returns number of items saved.
1221
1222 This function shouldn't be called directly, it's called from code created by
1223 L<WebPAC::Parser>.
1224
1225 =cut
1226
1227 sub save_into_lookup {
1228 my ($database,$input,$key,$coderef) = @_;
1229 die "save_into_lookup needs database" unless defined($database);
1230 die "save_into_lookup needs input" unless defined($input);
1231 die "save_into_lookup needs key" unless defined($key);
1232 die "save_into_lookup needs CODE" unless ( defined($coderef) && ref($coderef) eq 'CODE' );
1233
1234 warn "## save_into_lookup rec = ", dump($rec), " config = ", dump($config), $/ if ($debug > 2);
1235
1236 my $mfn =
1237 defined($rec->{'000'}->[0]) ? $rec->{'000'}->[0] :
1238 defined($config->{_mfn}) ? $config->{_mfn} :
1239 die "mfn not defined or zero";
1240
1241 my $nr = 0;
1242
1243 foreach my $v ( $coderef->() ) {
1244 $lookup->{$database}->{$input}->{$key}->{$v}->{$mfn}++;
1245 warn "# saved lookup $database/$input/$key [$v] $mfn\n" if ($debug > 1);
1246 $nr++;
1247 }
1248
1249 return $nr;
1250 }
1251
1252 =head2 config
1253
1254 Consult config values stored in C<config.yml>
1255
1256 # return database code (key under databases in yaml)
1257 $database_code = config(); # use _ from hash
1258 $database_name = config('name');
1259 $database_input_name = config('input name');
1260
1261 Up to three levels are supported.
1262
1263 =cut
1264
1265 sub config {
1266 return unless ($config);
1267
1268 my $p = shift;
1269
1270 $p ||= '';
1271
1272 my $v;
1273
1274 warn "### getting config($p)\n" if ($debug > 1);
1275
1276 my @p = split(/\s+/,$p);
1277 if ($#p < 0) {
1278 $v = $config->{ '_' }; # special, database code
1279 } else {
1280
1281 my $c = dclone( $config );
1282
1283 foreach my $k (@p) {
1284 warn "### k: $k c = ",dump($c),$/ if ($debug > 1);
1285 if (ref($c) eq 'ARRAY') {
1286 $c = shift @$c;
1287 warn "config($p) taking first occurence of '$k', probably not what you wanted!\n";
1288 last;
1289 }
1290
1291 if (! defined($c->{$k}) ) {
1292 $c = undef;
1293 last;
1294 } else {
1295 $c = $c->{$k};
1296 }
1297 }
1298 $v = $c if ($c);
1299
1300 }
1301
1302 warn "## config( '$p' ) = ",dump( $v ),$/ if ($v && $debug);
1303 warn "config( '$p' ) is empty\n" if (! $v);
1304
1305 return $v;
1306 }
1307
1308 =head2 id
1309
1310 Returns unique id of this record
1311
1312 $id = id();
1313
1314 Returns C<42/2> for 2nd occurence of MFN 42.
1315
1316 =cut
1317
1318 sub id {
1319 my $mfn = $config->{_mfn} || die "no _mfn in config data";
1320 return $mfn . $#{$marc_record} ? $#{$marc_record} + 1 : '';
1321 }
1322
1323 =head2 join_with
1324
1325 Joins walues with some delimiter
1326
1327 $v = join_with(", ", @v);
1328
1329 =cut
1330
1331 sub join_with {
1332 my $d = shift;
1333 warn "### join_with('$d',",dump(@_),")\n" if ($debug > 2);
1334 my $v = join($d, grep { defined($_) && $_ ne '' } @_);
1335 return '' unless defined($v);
1336 return $v;
1337 }
1338
1339 =head2 split_rec_on
1340
1341 Split record subfield on some regex and take one of parts out
1342
1343 $a_before_semi_column =
1344 split_rec_on('200','a', /\s*;\s*/, $part);
1345
1346 C<$part> is optional number of element. First element is
1347 B<1>, not 0!
1348
1349 If there is no C<$part> parameter or C<$part> is 0, this function will
1350 return all values produced by splitting.
1351
1352 =cut
1353
1354 sub split_rec_on {
1355 die "split_rec_on need (fld,sf,regex[,part]" if ($#_ < 2);
1356
1357 my ($fld, $sf, $regex, $part) = @_;
1358 warn "### regex ", ref($regex), $regex, $/ if ($debug > 2);
1359
1360 my @r = rec( $fld, $sf );
1361 my $v = shift @r;
1362 warn "### first rec($fld,$sf) = ",dump($v),$/ if ($debug > 2);
1363
1364 return '' if ( ! defined($v) || $v =~ /^\s*$/);
1365
1366 my @s = split( $regex, $v );
1367 warn "## split_rec_on($fld,$sf,$regex,$part) = ",dump(@s),$/ if ($debug > 1);
1368 if ($part && $part > 0) {
1369 return $s[ $part - 1 ];
1370 } else {
1371 return @s;
1372 }
1373 }
1374
1375 my $hash;
1376
1377 =head2 set
1378
1379 set( key => 'value' );
1380
1381 =cut
1382
1383 sub set {
1384 my ($k,$v) = @_;
1385 warn "## set ( $k => ", dump($v), " )", $/ if ( $debug );
1386 $hash->{$k} = $v;
1387 };
1388
1389 =head2 get
1390
1391 get( 'key' );
1392
1393 =cut
1394
1395 sub get {
1396 my $k = shift || return;
1397 my $v = $hash->{$k};
1398 warn "## get $k = ", dump( $v ), $/ if ( $debug );
1399 return $v;
1400 }
1401
1402 =head2 count
1403
1404 if ( count( @result ) == 1 ) {
1405 # do something if only 1 result is there
1406 }
1407
1408 =cut
1409
1410 sub count {
1411 warn "## count ",dump(@_),$/ if ( $debug );
1412 return @_ . '';
1413 }
1414
1415 # END
1416 1;

  ViewVC Help
Powered by ViewVC 1.1.26