/[webpac2]/trunk/lib/WebPAC/Normalize.pm
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Contents of /trunk/lib/WebPAC/Normalize.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 982 - (show annotations)
Sat Nov 3 13:35:03 2007 UTC (16 years, 4 months ago) by dpavlin
File size: 30863 byte(s)
remove debug

1 package WebPAC::Normalize;
2 use Exporter 'import';
3 our @EXPORT = qw/
4 _set_rec _set_lookup
5 _set_load_row
6 _get_ds _clean_ds
7 _debug
8 _pack_subfields_hash
9
10 search_display search display sorted
11
12 marc marc_indicators marc_repeatable_subfield
13 marc_compose marc_leader marc_fixed
14 marc_duplicate marc_remove marc_count
15 marc_original_order
16
17 rec1 rec2 rec
18 frec
19 regex prefix suffix surround
20 first lookup join_with
21 save_into_lookup
22
23 split_rec_on
24
25 get set
26 count
27
28 /;
29
30 use warnings;
31 use strict;
32
33 #use base qw/WebPAC::Common/;
34 use Data::Dump qw/dump/;
35 use Storable qw/dclone/;
36 use Carp qw/confess/;
37
38 # debugging warn(s)
39 my $debug = 0;
40
41 use WebPAC::Normalize::ISBN;
42 push @EXPORT, ( 'isbn_10', 'isbn_13' );
43
44 =head1 NAME
45
46 WebPAC::Normalize - describe normalisaton rules using sets
47
48 =cut
49
50 our $VERSION = '0.31';
51
52 =head1 SYNOPSIS
53
54 This module uses C<conf/normalize/*.pl> files to perform normalisation
55 from input records using perl functions which are specialized for set
56 processing.
57
58 Sets are implemented as arrays, and normalisation file is valid perl, which
59 means that you check it's validity before running WebPAC using
60 C<perl -c normalize.pl>.
61
62 Normalisation can generate multiple output normalized data. For now, supported output
63 types (on the left side of definition) are: C<search_display>, C<display>, C<search> and
64 C<marc>.
65
66 =head1 FUNCTIONS
67
68 Functions which start with C<_> are private and used by WebPAC internally.
69 All other functions are available for use within normalisation rules.
70
71 =head2 data_structure
72
73 Return data structure
74
75 my $ds = WebPAC::Normalize::data_structure(
76 lookup => $lookup_hash,
77 row => $row,
78 rules => $normalize_pl_config,
79 marc_encoding => 'utf-8',
80 config => $config,
81 load_row_coderef => sub {
82 my ($database,$input,$mfn) = @_;
83 $store->load_row( database => $database, input => $input, id => $mfn );
84 },
85 );
86
87 Options C<row>, C<rules> and C<log> are mandatory while all
88 other are optional.
89
90 C<load_row_coderef> is closure only used when executing lookups, so they will
91 die if it's not defined.
92
93 This function will B<die> if normalizastion can't be evaled.
94
95 Since this function isn't exported you have to call it with
96 C<WebPAC::Normalize::data_structure>.
97
98 =cut
99
100 my $load_row_coderef;
101
102 sub data_structure {
103 my $arg = {@_};
104
105 die "need row argument" unless ($arg->{row});
106 die "need normalisation argument" unless ($arg->{rules});
107
108 no strict 'subs';
109 _set_lookup( $arg->{lookup} ) if defined($arg->{lookup});
110 _set_rec( $arg->{row} );
111 _set_config( $arg->{config} ) if defined($arg->{config});
112 _clean_ds( %{ $arg } );
113 $load_row_coderef = $arg->{load_row_coderef};
114
115 eval "$arg->{rules}";
116 die "error evaling $arg->{rules}: $@\n" if ($@);
117
118 return _get_ds();
119 }
120
121 =head2 _set_rec
122
123 Set current record hash
124
125 _set_rec( $rec );
126
127 =cut
128
129 my $rec;
130
131 sub _set_rec {
132 $rec = shift or die "no record hash";
133 }
134
135 =head2 _set_config
136
137 Set current config hash
138
139 _set_config( $config );
140
141 Magic keys are:
142
143 =over 4
144
145 =item _
146
147 Code of current database
148
149 =item _mfn
150
151 Current MFN
152
153 =back
154
155 =cut
156
157 my $config;
158
159 sub _set_config {
160 $config = shift;
161 }
162
163 =head2 _get_ds
164
165 Return hash formatted as data structure
166
167 my $ds = _get_ds();
168
169 =cut
170
171 my ($out, $marc_record, $marc_encoding, $marc_repeatable_subfield, $marc_indicators, $marc_leader);
172 my ($marc_record_offset, $marc_fetch_offset) = (0, 0);
173
174 sub _get_ds {
175 #warn "## out = ",dump($out);
176 return $out;
177 }
178
179 =head2 _clean_ds
180
181 Clean data structure hash for next record
182
183 _clean_ds();
184
185 =cut
186
187 sub _clean_ds {
188 my $a = {@_};
189 ($out,$marc_record, $marc_encoding, $marc_repeatable_subfield, $marc_indicators, $marc_leader) = ();
190 ($marc_record_offset, $marc_fetch_offset) = (0,0);
191 $marc_encoding = $a->{marc_encoding};
192 }
193
194 =head2 _set_lookup
195
196 Set current lookup hash
197
198 _set_lookup( $lookup );
199
200 =cut
201
202 my $lookup;
203
204 sub _set_lookup {
205 $lookup = shift;
206 }
207
208 =head2 _get_lookup
209
210 Get current lookup hash
211
212 my $lookup = _get_lookup();
213
214 =cut
215
216 sub _get_lookup {
217 return $lookup;
218 }
219
220 =head2 _set_load_row
221
222 Setup code reference which will return L<data_structure> from
223 L<WebPAC::Store>
224
225 _set_load_row(sub {
226 my ($database,$input,$mfn) = @_;
227 $store->load_row( database => $database, input => $input, id => $mfn );
228 });
229
230 =cut
231
232 sub _set_load_row {
233 my $coderef = shift;
234 confess "argument isn't CODE" unless ref($coderef) eq 'CODE';
235
236 $load_row_coderef = $coderef;
237 }
238
239 =head2 _get_marc_fields
240
241 Get all fields defined by calls to C<marc>
242
243 $marc->add_fields( WebPAC::Normalize:_get_marc_fields() );
244
245 We are using I<magic> which detect repeatable fields only from
246 sequence of field/subfield data generated by normalization.
247
248 Repeatable field is created when there is second occurence of same subfield or
249 if any of indicators are different.
250
251 This is sane for most cases. Something like:
252
253 900a-1 900b-1 900c-1
254 900a-2 900b-2
255 900a-3
256
257 will be created from any combination of:
258
259 900a-1 900a-2 900a-3 900b-1 900b-2 900c-1
260
261 and following rules:
262
263 marc('900','a', rec('200','a') );
264 marc('900','b', rec('200','b') );
265 marc('900','c', rec('200','c') );
266
267 which might not be what you have in mind. If you need repeatable subfield,
268 define it using C<marc_repeatable_subfield> like this:
269
270 marc_repeatable_subfield('900','a');
271 marc('900','a', rec('200','a') );
272 marc('900','b', rec('200','b') );
273 marc('900','c', rec('200','c') );
274
275 will create:
276
277 900a-1 900a-2 900a-3 900b-1 900c-1
278 900b-2
279
280 There is also support for returning next or specific using:
281
282 while (my $mf = WebPAC::Normalize:_get_marc_fields( fetch_next => 1 ) ) {
283 # do something with $mf
284 }
285
286 will always return fields from next MARC record or
287
288 my $mf = WebPAC::Normalize::_get_marc_fields( offset => 42 );
289
290 will return 42th copy record (if it exists).
291
292 =cut
293
294 my $fetch_pos;
295
296 sub _get_marc_fields {
297
298 my $arg = {@_};
299 warn "### _get_marc_fields arg: ", dump($arg), $/ if ($debug > 2);
300 $fetch_pos = $marc_fetch_offset;
301 if ($arg->{offset}) {
302 $fetch_pos = $arg->{offset};
303 } elsif($arg->{fetch_next}) {
304 $marc_fetch_offset++;
305 }
306
307 return if (! $marc_record || ref($marc_record) ne 'ARRAY');
308
309 warn "### full marc_record = ", dump( @{ $marc_record }), $/ if ($debug > 2);
310
311 my $marc_rec = $marc_record->[ $fetch_pos ];
312
313 warn "## _get_marc_fields (at offset: $fetch_pos) -- marc_record = ", dump( @$marc_rec ), $/ if ($debug > 1);
314
315 return if (! $marc_rec || ref($marc_rec) ne 'ARRAY' || $#{ $marc_rec } < 0);
316
317 # first, sort all existing fields
318 # XXX might not be needed, but modern perl might randomize elements in hash
319 my @sorted_marc_record = sort {
320 $a->[0] . ( $a->[3] || '' ) cmp $b->[0] . ( $b->[3] || '')
321 } @{ $marc_rec };
322
323 @sorted_marc_record = @{ $marc_rec }; ### FIXME disable sorting
324
325 # output marc fields
326 my @m;
327
328 # count unique field-subfields (used for offset when walking to next subfield)
329 my $u;
330 map { $u->{ $_->[0] . ( $_->[3] || '') }++ } @sorted_marc_record;
331
332 if ($debug) {
333 warn "## marc_repeatable_subfield = ", dump( $marc_repeatable_subfield ), $/ if ( $marc_repeatable_subfield );
334 warn "## marc_record[$fetch_pos] = ", dump( $marc_rec ), $/;
335 warn "## sorted_marc_record = ", dump( \@sorted_marc_record ), $/;
336 warn "## subfield count = ", dump( $u ), $/;
337 }
338
339 my $len = $#sorted_marc_record;
340 my $visited;
341 my $i = 0;
342 my $field;
343
344 foreach ( 0 .. $len ) {
345
346 # find next element which isn't visited
347 while ($visited->{$i}) {
348 $i = ($i + 1) % ($len + 1);
349 }
350
351 # mark it visited
352 $visited->{$i}++;
353
354 my $row = dclone( $sorted_marc_record[$i] );
355
356 # field and subfield which is key for
357 # marc_repeatable_subfield and u
358 my $fsf = $row->[0] . ( $row->[3] || '' );
359
360 if ($debug > 1) {
361
362 print "### field so far [", $#$field, "] : ", dump( $field ), " ", $field ? 'T' : 'F', $/;
363 print "### this [$i]: ", dump( $row ),$/;
364 print "### sf: ", $row->[3], " vs ", $field->[3],
365 $marc_repeatable_subfield->{ $row->[0] . $row->[3] } ? ' (repeatable)' : '', $/,
366 if ($#$field >= 0);
367
368 }
369
370 # if field exists
371 if ( $#$field >= 0 ) {
372 if (
373 $row->[0] ne $field->[0] || # field
374 $row->[1] ne $field->[1] || # i1
375 $row->[2] ne $field->[2] # i2
376 ) {
377 push @m, $field;
378 warn "## saved/1 ", dump( $field ),$/ if ($debug);
379 $field = $row;
380
381 } elsif (
382 ( $row->[3] lt $field->[-2] ) # subfield which is not next (e.g. a after c)
383 ||
384 ( $row->[3] eq $field->[-2] && # same subfield, but not repeatable
385 ! $marc_repeatable_subfield->{ $fsf }
386 )
387 ) {
388 push @m, $field;
389 warn "## saved/2 ", dump( $field ),$/ if ($debug);
390 $field = $row;
391
392 } else {
393 # append new subfields to existing field
394 push @$field, ( $row->[3], $row->[4] );
395 }
396 } else {
397 # insert first field
398 $field = $row;
399 }
400
401 if (! $marc_repeatable_subfield->{ $fsf }) {
402 # make step to next subfield
403 $i = ($i + $u->{ $fsf } ) % ($len + 1);
404 }
405 }
406
407 if ($#$field >= 0) {
408 push @m, $field;
409 warn "## saved/3 ", dump( $field ),$/ if ($debug);
410 }
411
412 return \@m;
413 }
414
415 =head2 _get_marc_leader
416
417 Return leader from currently fetched record by L</_get_marc_fields>
418
419 print WebPAC::Normalize::_get_marc_leader();
420
421 =cut
422
423 sub _get_marc_leader {
424 die "no fetch_pos, did you called _get_marc_fields first?" unless ( defined( $fetch_pos ) );
425 return $marc_leader->[ $fetch_pos ];
426 }
427
428 =head2 _debug
429
430 Change level of debug warnings
431
432 _debug( 2 );
433
434 =cut
435
436 sub _debug {
437 my $l = shift;
438 return $debug unless defined($l);
439 warn "debug level $l",$/ if ($l > 0);
440 $debug = $l;
441 }
442
443 =head1 Functions to create C<data_structure>
444
445 Those functions generally have to first in your normalization file.
446
447 =head2 search_display
448
449 Define output for L<search> and L<display> at the same time
450
451 search_display('Title', rec('200','a') );
452
453
454 =cut
455
456 sub search_display {
457 my $name = shift or die "search_display needs name as first argument";
458 my @o = grep { defined($_) && $_ ne '' } @_;
459 return unless (@o);
460 $out->{$name}->{search} = \@o;
461 $out->{$name}->{display} = \@o;
462 }
463
464 =head2 tag
465
466 Old name for L<search_display>, but supported
467
468 =cut
469
470 sub tag {
471 search_display( @_ );
472 }
473
474 =head2 display
475
476 Define output just for I<display>
477
478 @v = display('Title', rec('200','a') );
479
480 =cut
481
482 sub _field {
483 my $type = shift or confess "need type -- BUG?";
484 my $name = shift or confess "needs name as first argument";
485 my @o = grep { defined($_) && $_ ne '' } @_;
486 return unless (@o);
487 $out->{$name}->{$type} = \@o;
488 }
489
490 sub display { _field( 'display', @_ ) }
491
492 =head2 search
493
494 Prepare values just for I<search>
495
496 @v = search('Title', rec('200','a') );
497
498 =cut
499
500 sub search { _field( 'search', @_ ) }
501
502 =head2 sorted
503
504 Insert into lists which will be automatically sorted
505
506 sorted('Title', rec('200','a') );
507
508 =cut
509
510 sub sorted { _field( 'sorted', @_ ) }
511
512
513 =head2 marc_leader
514
515 Setup fields within MARC leader or get leader
516
517 marc_leader('05','c');
518 my $leader = marc_leader();
519
520 =cut
521
522 sub marc_leader {
523 my ($offset,$value) = @_;
524
525 if ($offset) {
526 $marc_leader->[ $marc_record_offset ]->{ $offset } = $value;
527 } else {
528
529 if (defined($marc_leader)) {
530 die "marc_leader not array = ", dump( $marc_leader ) unless (ref($marc_leader) eq 'ARRAY');
531 return $marc_leader->[ $marc_record_offset ];
532 } else {
533 return;
534 }
535 }
536 }
537
538 =head2 marc_fixed
539
540 Create control/indentifier fields with values in fixed positions
541
542 marc_fixed('008', 00, '070402');
543 marc_fixed('008', 39, '|');
544
545 Positions not specified will be filled with spaces (C<0x20>).
546
547 There will be no effort to extend last specified value to full length of
548 field in standard.
549
550 =cut
551
552 sub marc_fixed {
553 my ($f, $pos, $val) = @_;
554 die "need marc(field, position, value)" unless defined($f) && defined($pos);
555
556 confess "need val" unless defined $val;
557
558 my $update = 0;
559
560 map {
561 if ($_->[0] eq $f) {
562 my $old = $_->[1];
563 if (length($old) <= $pos) {
564 $_->[1] .= ' ' x ( $pos - length($old) ) . $val;
565 warn "## marc_fixed($f,$pos,'$val') append '$old' -> '$_->[1]'\n" if ($debug > 1);
566 } else {
567 $_->[1] = substr($old, 0, $pos) . $val . substr($old, $pos + length($val));
568 warn "## marc_fixed($f,$pos,'$val') update '$old' -> '$_->[1]'\n" if ($debug > 1);
569 }
570 $update++;
571 }
572 } @{ $marc_record->[ $marc_record_offset ] };
573
574 if (! $update) {
575 my $v = ' ' x $pos . $val;
576 push @{ $marc_record->[ $marc_record_offset ] }, [ $f, $v ];
577 warn "## marc_fixed($f,$pos,'val') created '$v'\n" if ($debug > 1);
578 }
579 }
580
581 =head2 marc
582
583 Save value for MARC field
584
585 marc('900','a', rec('200','a') );
586 marc('001', rec('000') );
587
588 =cut
589
590 sub marc {
591 my $f = shift or die "marc needs field";
592 die "marc field must be numer" unless ($f =~ /^\d+$/);
593
594 my $sf;
595 if ($f >= 10) {
596 $sf = shift or die "marc needs subfield";
597 }
598
599 foreach (@_) {
600 my $v = $_; # make var read-write for Encode
601 next unless (defined($v) && $v !~ /^\s*$/);
602 my ($i1,$i2) = defined($marc_indicators->{$f}) ? @{ $marc_indicators->{$f} } : (' ',' ');
603 if (defined $sf) {
604 push @{ $marc_record->[ $marc_record_offset ] }, [ $f, $i1, $i2, $sf => $v ];
605 } else {
606 push @{ $marc_record->[ $marc_record_offset ] }, [ $f, $v ];
607 }
608 }
609 }
610
611 =head2 marc_repeatable_subfield
612
613 Save values for MARC repetable subfield
614
615 marc_repeatable_subfield('910', 'z', rec('909') );
616
617 =cut
618
619 sub marc_repeatable_subfield {
620 my ($f,$sf) = @_;
621 die "marc_repeatable_subfield need field and subfield!\n" unless ($f && $sf);
622 $marc_repeatable_subfield->{ $f . $sf }++;
623 marc(@_);
624 }
625
626 =head2 marc_indicators
627
628 Set both indicators for MARC field
629
630 marc_indicators('900', ' ', 1);
631
632 Any indicator value other than C<0-9> will be treated as undefined.
633
634 =cut
635
636 sub marc_indicators {
637 my $f = shift || die "marc_indicators need field!\n";
638 my ($i1,$i2) = @_;
639 die "marc_indicators($f, ...) need i1!\n" unless(defined($i1));
640 die "marc_indicators($f, $i1, ...) need i2!\n" unless(defined($i2));
641
642 $i1 = ' ' if ($i1 !~ /^\d$/);
643 $i2 = ' ' if ($i2 !~ /^\d$/);
644 @{ $marc_indicators->{$f} } = ($i1,$i2);
645 }
646
647 =head2 marc_compose
648
649 Save values for each MARC subfield explicitly
650
651 marc_compose('900',
652 'a', rec('200','a')
653 'b', rec('201','a')
654 'a', rec('200','b')
655 'c', rec('200','c')
656 );
657
658 If you specify C<+> for subfield, value will be appended
659 to previous defined subfield.
660
661 =cut
662
663 sub marc_compose {
664 my $f = shift or die "marc_compose needs field";
665 die "marc_compose field must be numer" unless ($f =~ /^\d+$/);
666
667 my ($i1,$i2) = defined($marc_indicators->{$f}) ? @{ $marc_indicators->{$f} } : (' ',' ');
668 my $m = [ $f, $i1, $i2 ];
669
670 warn "### marc_compose input subfields = ", dump(@_),$/ if ($debug > 2);
671
672 if ($#_ % 2 != 1) {
673 die "ERROR: marc_compose",dump($f,@_)," not valid (must be even).\nDo you need to add first() or join() around some argument?\n";
674 }
675
676 while (@_) {
677 my $sf = shift;
678 my $v = shift;
679
680 next unless (defined($v) && $v !~ /^\s*$/);
681 warn "## ++ marc_compose($f,$sf,$v) ", dump( $m ),$/ if ($debug > 1);
682 if ($sf ne '+') {
683 push @$m, ( $sf, $v );
684 } else {
685 $m->[ $#$m ] .= $v;
686 }
687 }
688
689 warn "## marc_compose current marc = ", dump( $m ),$/ if ($debug > 1);
690
691 push @{ $marc_record->[ $marc_record_offset ] }, $m if ($#{$m} > 2);
692 }
693
694 =head2 marc_duplicate
695
696 Generate copy of current MARC record and continue working on copy
697
698 marc_duplicate();
699
700 Copies can be accessed using C<< _get_marc_fields( fetch_next => 1 ) >> or
701 C<< _get_marc_fields( offset => 42 ) >>.
702
703 =cut
704
705 sub marc_duplicate {
706 my $m = $marc_record->[ -1 ];
707 die "can't duplicate record which isn't defined" unless ($m);
708 push @{ $marc_record }, dclone( $m );
709 push @{ $marc_leader }, dclone( marc_leader() );
710 warn "## marc_duplicate = ", dump(@$marc_leader, @$marc_record), $/ if ($debug > 1);
711 $marc_record_offset = $#{ $marc_record };
712 warn "## marc_record_offset = $marc_record_offset", $/ if ($debug > 1);
713
714 }
715
716 =head2 marc_remove
717
718 Remove some field or subfield from MARC record.
719
720 marc_remove('200');
721 marc_remove('200','a');
722
723 This will erase field C<200> or C<200^a> from current MARC record.
724
725 marc_remove('*');
726
727 Will remove all fields in current MARC record.
728
729 This is useful after calling C<marc_duplicate> or on it's own (but, you
730 should probably just remove that subfield definition if you are not
731 using C<marc_duplicate>).
732
733 FIXME: support fields < 10.
734
735 =cut
736
737 sub marc_remove {
738 my ($f, $sf) = @_;
739
740 die "marc_remove needs record number" unless defined($f);
741
742 my $marc = $marc_record->[ $marc_record_offset ];
743
744 warn "### marc_remove before = ", dump( $marc ), $/ if ($debug > 2);
745
746 if ($f eq '*') {
747
748 delete( $marc_record->[ $marc_record_offset ] );
749 warn "## full marc_record = ", dump( @{ $marc_record }), $/ if ($debug > 1);
750
751 } else {
752
753 my $i = 0;
754 foreach ( 0 .. $#{ $marc } ) {
755 last unless (defined $marc->[$i]);
756 warn "#### working on ",dump( @{ $marc->[$i] }), $/ if ($debug > 3);
757 if ($marc->[$i]->[0] eq $f) {
758 if (! defined $sf) {
759 # remove whole field
760 splice @$marc, $i, 1;
761 warn "#### slice \@\$marc, $i, 1 = ",dump( @{ $marc }), $/ if ($debug > 3);
762 $i--;
763 } else {
764 foreach my $j ( 0 .. (( $#{ $marc->[$i] } - 3 ) / 2) ) {
765 my $o = ($j * 2) + 3;
766 if ($marc->[$i]->[$o] eq $sf) {
767 # remove subfield
768 splice @{$marc->[$i]}, $o, 2;
769 warn "#### slice \@{\$marc->[$i]}, $o, 2 = ", dump( @{ $marc }), $/ if ($debug > 3);
770 # is record now empty?
771 if ($#{ $marc->[$i] } == 2) {
772 splice @$marc, $i, 1;
773 warn "#### slice \@\$marc, $i, 1 = ", dump( @{ $marc }), $/ if ($debug > 3);
774 $i--;
775 };
776 }
777 }
778 }
779 }
780 $i++;
781 }
782
783 warn "### marc_remove($f", $sf ? ",$sf" : "", ") after = ", dump( $marc ), $/ if ($debug > 2);
784
785 $marc_record->[ $marc_record_offset ] = $marc;
786 }
787
788 warn "## full marc_record = ", dump( @{ $marc_record }), $/ if ($debug > 1);
789 }
790
791 =head2 marc_original_order
792
793 Copy all subfields preserving original order to marc field.
794
795 marc_original_order( marc_field_number, original_input_field_number );
796
797 Please note that field numbers are consistent with other commands (marc
798 field number first), but somewhat counter-intuitive (destination and then
799 source).
800
801 You might want to use this command if you are just renaming subfields or
802 using pre-processing modify_record in C<config.yml> and don't need any
803 post-processing or want to preserve order of original subfields.
804
805
806 =cut
807
808 sub marc_original_order {
809
810 my ($to, $from) = @_;
811 die "marc_original_order needs from and to fields\n" unless ($from && $to);
812
813 return unless defined($rec->{$from});
814
815 my $r = $rec->{$from};
816 die "record field $from isn't array\n" unless (ref($r) eq 'ARRAY');
817
818 my ($i1,$i2) = defined($marc_indicators->{$to}) ? @{ $marc_indicators->{$to} } : (' ',' ');
819 warn "## marc_original_order($to,$from) source = ", dump( $r ),$/ if ($debug > 1);
820
821 foreach my $d (@$r) {
822
823 if (! defined($d->{subfields}) && ref($d->{subfields}) ne 'ARRAY') {
824 warn "# marc_original_order($to,$from): field $from doesn't have subfields specification\n";
825 next;
826 }
827
828 my @sfs = @{ $d->{subfields} };
829
830 die "field $from doesn't have even number of subfields specifications\n" unless($#sfs % 2 == 1);
831
832 warn "#--> d: ",dump($d), "\n#--> sfs: ",dump(@sfs),$/ if ($debug > 2);
833
834 my $m = [ $to, $i1, $i2 ];
835
836 while (my $sf = shift @sfs) {
837
838 warn "#--> sf: ",dump($sf), $/ if ($debug > 2);
839 my $offset = shift @sfs;
840 die "corrupted sufields specification for field $from\n" unless defined($offset);
841
842 my $v;
843 if (ref($d->{$sf}) eq 'ARRAY') {
844 $v = $d->{$sf}->[$offset] if (defined($d->{$sf}->[$offset]));
845 } elsif ($offset == 0) {
846 $v = $d->{$sf};
847 } else {
848 die "field $from subfield '$sf' need occurence $offset which doesn't exist", dump($d->{$sf});
849 }
850 push @$m, ( $sf, $v ) if (defined($v));
851 }
852
853 if ($#{$m} > 2) {
854 push @{ $marc_record->[ $marc_record_offset ] }, $m;
855 }
856 }
857
858 warn "## marc_record = ", dump( $marc_record ),$/ if ($debug > 1);
859 }
860
861 =head2 marc_count
862
863 Return number of MARC records created using L</marc_duplicate>.
864
865 print "created ", marc_count(), " records";
866
867 =cut
868
869 sub marc_count {
870 return $#{ $marc_record };
871 }
872
873
874 =head1 Functions to extract data from input
875
876 This function should be used inside functions to create C<data_structure> described
877 above.
878
879 =head2 _pack_subfields_hash
880
881 @subfields = _pack_subfields_hash( $h );
882 $subfields = _pack_subfields_hash( $h, 1 );
883
884 Return each subfield value in array or pack them all together and return scalar
885 with subfields (denoted by C<^>) and values.
886
887 =cut
888
889 sub _pack_subfields_hash {
890
891 warn "## _pack_subfields_hash( ",dump(@_), " )\n" if ($debug > 1);
892
893 my ($h,$include_subfields) = @_;
894
895 # sanity and ease of use
896 return $h if (ref($h) ne 'HASH');
897
898 if ( defined($h->{subfields}) ) {
899 my $sfs = delete $h->{subfields} || die "no subfields?";
900 my @out;
901 while (@$sfs) {
902 my $sf = shift @$sfs;
903 push @out, '^' . $sf if ($include_subfields);
904 my $o = shift @$sfs;
905 if ($o == 0 && ref( $h->{$sf} ) ne 'ARRAY' ) {
906 # single element subfields are not arrays
907 #warn "====> $sf $o / $#$sfs ", dump( $sfs, $h->{$sf} ), "\n";
908
909 push @out, $h->{$sf};
910 } else {
911 #warn "====> $sf $o / $#$sfs ", dump( $sfs, $h->{$sf} ), "\n";
912 push @out, $h->{$sf}->[$o];
913 }
914 }
915 if ($include_subfields) {
916 return join('', @out);
917 } else {
918 return @out;
919 }
920 } else {
921 if ($include_subfields) {
922 my $out = '';
923 foreach my $sf (sort keys %$h) {
924 if (ref($h->{$sf}) eq 'ARRAY') {
925 $out .= '^' . $sf . join('^' . $sf, @{ $h->{$sf} });
926 } else {
927 $out .= '^' . $sf . $h->{$sf};
928 }
929 }
930 return $out;
931 } else {
932 # FIXME this should probably be in alphabetical order instead of hash order
933 values %{$h};
934 }
935 }
936 }
937
938 =head2 rec1
939
940 Return all values in some field
941
942 @v = rec1('200')
943
944 TODO: order of values is probably same as in source data, need to investigate that
945
946 =cut
947
948 sub rec1 {
949 my $f = shift;
950 warn "rec1($f) = ", dump( $rec->{$f} ), $/ if ($debug > 1);
951 return unless (defined($rec) && defined($rec->{$f}));
952 warn "rec1($f) = ", dump( $rec->{$f} ), $/ if ($debug > 1);
953 if (ref($rec->{$f}) eq 'ARRAY') {
954 my @out;
955 foreach my $h ( @{ $rec->{$f} } ) {
956 if (ref($h) eq 'HASH') {
957 push @out, ( _pack_subfields_hash( $h ) );
958 } else {
959 push @out, $h;
960 }
961 }
962 return @out;
963 } elsif( defined($rec->{$f}) ) {
964 return $rec->{$f};
965 }
966 }
967
968 =head2 rec2
969
970 Return all values in specific field and subfield
971
972 @v = rec2('200','a')
973
974 =cut
975
976 sub rec2 {
977 my $f = shift;
978 return unless (defined($rec && $rec->{$f}));
979 my $sf = shift;
980 warn "rec2($f,$sf) = ", dump( $rec->{$f} ), $/ if ($debug > 1);
981 return map {
982 if (ref($_->{$sf}) eq 'ARRAY') {
983 @{ $_->{$sf} };
984 } else {
985 $_->{$sf};
986 }
987 } grep { ref($_) eq 'HASH' && $_->{$sf} } @{ $rec->{$f} };
988 }
989
990 =head2 rec
991
992 syntaxtic sugar for
993
994 @v = rec('200')
995 @v = rec('200','a')
996
997 If rec() returns just single value, it will
998 return scalar, not array.
999
1000 =cut
1001
1002 sub frec {
1003 my @out = rec(@_);
1004 warn "rec(",dump(@_),") has more than one return value, ignoring\n" if $#out > 0;
1005 return shift @out;
1006 }
1007
1008 sub rec {
1009 my @out;
1010 if ($#_ == 0) {
1011 @out = rec1(@_);
1012 } elsif ($#_ == 1) {
1013 @out = rec2(@_);
1014 }
1015 if ($#out == 0 && ! wantarray) {
1016 return $out[0];
1017 } elsif (@out) {
1018 return @out;
1019 } else {
1020 return '';
1021 }
1022 }
1023
1024 =head2 regex
1025
1026 Apply regex to some or all values
1027
1028 @v = regex( 's/foo/bar/g', @v );
1029
1030 =cut
1031
1032 sub regex {
1033 my $r = shift;
1034 my @out;
1035 #warn "r: $r\n", dump(\@_);
1036 foreach my $t (@_) {
1037 next unless ($t);
1038 eval "\$t =~ $r";
1039 push @out, $t if ($t && $t ne '');
1040 }
1041 return @out;
1042 }
1043
1044 =head2 prefix
1045
1046 Prefix all values with a string
1047
1048 @v = prefix( 'my_', @v );
1049
1050 =cut
1051
1052 sub prefix {
1053 my $p = shift;
1054 return @_ unless defined( $p );
1055 return map { $p . $_ } grep { defined($_) } @_;
1056 }
1057
1058 =head2 suffix
1059
1060 suffix all values with a string
1061
1062 @v = suffix( '_my', @v );
1063
1064 =cut
1065
1066 sub suffix {
1067 my $s = shift;
1068 return @_ unless defined( $s );
1069 return map { $_ . $s } grep { defined($_) } @_;
1070 }
1071
1072 =head2 surround
1073
1074 surround all values with a two strings
1075
1076 @v = surround( 'prefix_', '_suffix', @v );
1077
1078 =cut
1079
1080 sub surround {
1081 my $p = shift;
1082 my $s = shift;
1083 $p = '' unless defined( $p );
1084 $s = '' unless defined( $s );
1085 return map { $p . $_ . $s } grep { defined($_) } @_;
1086 }
1087
1088 =head2 first
1089
1090 Return first element
1091
1092 $v = first( @v );
1093
1094 =cut
1095
1096 sub first {
1097 my $r = shift;
1098 return $r;
1099 }
1100
1101 =head2 lookup
1102
1103 Consult lookup hashes for some value
1104
1105 @v = lookup(
1106 sub {
1107 'ffkk/peri/mfn'.rec('000')
1108 },
1109 'ffkk','peri','200-a-200-e',
1110 sub {
1111 first(rec(200,'a')).' '.first(rec('200','e'))
1112 }
1113 );
1114
1115 Code like above will be B<automatically generated> using L<WebPAC::Parse> from
1116 normal lookup definition in C<conf/lookup/something.pl> which looks like:
1117
1118 lookup(
1119 # which results to return from record recorded in lookup
1120 sub { 'ffkk/peri/mfn' . rec('000') },
1121 # from which database and input
1122 'ffkk','peri',
1123 # such that following values match
1124 sub { first(rec(200,'a')) . ' ' . first(rec('200','e')) },
1125 # if this part is missing, we will try to match same fields
1126 # from lookup record and current one, or you can override
1127 # which records to use from current record using
1128 sub { rec('900','x') . ' ' . rec('900','y') },
1129 )
1130
1131 You can think about this lookup as SQL (if that helps):
1132
1133 select
1134 sub { what }
1135 from
1136 database, input
1137 where
1138 sub { filter from lookuped record }
1139 having
1140 sub { optional filter on current record }
1141
1142 Easy as pie, right?
1143
1144 =cut
1145
1146 sub lookup {
1147 my ($what, $database, $input, $key, $having) = @_;
1148
1149 confess "lookup needs 5 arguments: what, database, input, key, having\n" unless ($#_ == 4);
1150
1151 warn "## lookup ($database, $input, $key)", $/ if ($debug > 1);
1152 return unless (defined($lookup->{$database}->{$input}->{$key}));
1153
1154 confess "lookup really need load_row_coderef added to data_structure\n" unless ($load_row_coderef);
1155
1156 my $mfns;
1157 my @having = $having->();
1158
1159 warn "## having = ", dump( @having ) if ($debug > 2);
1160
1161 foreach my $h ( @having ) {
1162 if (defined($lookup->{$database}->{$input}->{$key}->{$h})) {
1163 warn "lookup for $database/$input/$key/$h return ",dump($lookup->{$database}->{$input}->{$key}->{$h}),"\n" if ($debug);
1164 $mfns->{$_}++ foreach keys %{ $lookup->{$database}->{$input}->{$key}->{$h} };
1165 }
1166 }
1167
1168 return unless ($mfns);
1169
1170 my @mfns = sort keys %$mfns;
1171
1172 warn "# lookup loading $database/$input/$key mfn ", join(",",@mfns)," having ",dump(@having),"\n" if ($debug);
1173
1174 my $old_rec = $rec;
1175 my @out;
1176
1177 foreach my $mfn (@mfns) {
1178 $rec = $load_row_coderef->( $database, $input, $mfn );
1179
1180 warn "got $database/$input/$mfn = ", dump($rec), $/ if ($debug);
1181
1182 my @vals = $what->();
1183
1184 push @out, ( @vals );
1185
1186 warn "lookup for mfn $mfn returned ", dump(@vals), $/ if ($debug);
1187 }
1188
1189 # if (ref($lookup->{$k}) eq 'ARRAY') {
1190 # return @{ $lookup->{$k} };
1191 # } else {
1192 # return $lookup->{$k};
1193 # }
1194
1195 $rec = $old_rec;
1196
1197 warn "## lookup returns = ", dump(@out), $/ if ($debug);
1198
1199 if ($#out == 0) {
1200 return $out[0];
1201 } else {
1202 return @out;
1203 }
1204 }
1205
1206 =head2 save_into_lookup
1207
1208 Save value into lookup. It associates current database, input
1209 and specific keys with one or more values which will be
1210 associated over MFN.
1211
1212 MFN will be extracted from first occurence current of field 000
1213 in current record, or if it doesn't exist from L<_set_config> C<_mfn>.
1214
1215 my $nr = save_into_lookup($database,$input,$key,sub {
1216 # code which produce one or more values
1217 });
1218
1219 It returns number of items saved.
1220
1221 This function shouldn't be called directly, it's called from code created by
1222 L<WebPAC::Parser>.
1223
1224 =cut
1225
1226 sub save_into_lookup {
1227 my ($database,$input,$key,$coderef) = @_;
1228 die "save_into_lookup needs database" unless defined($database);
1229 die "save_into_lookup needs input" unless defined($input);
1230 die "save_into_lookup needs key" unless defined($key);
1231 die "save_into_lookup needs CODE" unless ( defined($coderef) && ref($coderef) eq 'CODE' );
1232
1233 warn "## save_into_lookup rec = ", dump($rec), " config = ", dump($config), $/ if ($debug > 2);
1234
1235 my $mfn =
1236 defined($rec->{'000'}->[0]) ? $rec->{'000'}->[0] :
1237 defined($config->{_mfn}) ? $config->{_mfn} :
1238 die "mfn not defined or zero";
1239
1240 my $nr = 0;
1241
1242 foreach my $v ( $coderef->() ) {
1243 $lookup->{$database}->{$input}->{$key}->{$v}->{$mfn}++;
1244 warn "# saved lookup $database/$input/$key [$v] $mfn\n" if ($debug > 1);
1245 $nr++;
1246 }
1247
1248 return $nr;
1249 }
1250
1251 =head2 config
1252
1253 Consult config values stored in C<config.yml>
1254
1255 # return database code (key under databases in yaml)
1256 $database_code = config(); # use _ from hash
1257 $database_name = config('name');
1258 $database_input_name = config('input name');
1259
1260 Up to three levels are supported.
1261
1262 =cut
1263
1264 sub config {
1265 return unless ($config);
1266
1267 my $p = shift;
1268
1269 $p ||= '';
1270
1271 my $v;
1272
1273 warn "### getting config($p)\n" if ($debug > 1);
1274
1275 my @p = split(/\s+/,$p);
1276 if ($#p < 0) {
1277 $v = $config->{ '_' }; # special, database code
1278 } else {
1279
1280 my $c = dclone( $config );
1281
1282 foreach my $k (@p) {
1283 warn "### k: $k c = ",dump($c),$/ if ($debug > 1);
1284 if (ref($c) eq 'ARRAY') {
1285 $c = shift @$c;
1286 warn "config($p) taking first occurence of '$k', probably not what you wanted!\n";
1287 last;
1288 }
1289
1290 if (! defined($c->{$k}) ) {
1291 $c = undef;
1292 last;
1293 } else {
1294 $c = $c->{$k};
1295 }
1296 }
1297 $v = $c if ($c);
1298
1299 }
1300
1301 warn "## config( '$p' ) = ",dump( $v ),$/ if ($v && $debug);
1302 warn "config( '$p' ) is empty\n" if (! $v);
1303
1304 return $v;
1305 }
1306
1307 =head2 id
1308
1309 Returns unique id of this record
1310
1311 $id = id();
1312
1313 Returns C<42/2> for 2nd occurence of MFN 42.
1314
1315 =cut
1316
1317 sub id {
1318 my $mfn = $config->{_mfn} || die "no _mfn in config data";
1319 return $mfn . $#{$marc_record} ? $#{$marc_record} + 1 : '';
1320 }
1321
1322 =head2 join_with
1323
1324 Joins walues with some delimiter
1325
1326 $v = join_with(", ", @v);
1327
1328 =cut
1329
1330 sub join_with {
1331 my $d = shift;
1332 warn "### join_with('$d',",dump(@_),")\n" if ($debug > 2);
1333 my $v = join($d, grep { defined($_) && $_ ne '' } @_);
1334 return '' unless defined($v);
1335 return $v;
1336 }
1337
1338 =head2 split_rec_on
1339
1340 Split record subfield on some regex and take one of parts out
1341
1342 $a_before_semi_column =
1343 split_rec_on('200','a', /\s*;\s*/, $part);
1344
1345 C<$part> is optional number of element. First element is
1346 B<1>, not 0!
1347
1348 If there is no C<$part> parameter or C<$part> is 0, this function will
1349 return all values produced by splitting.
1350
1351 =cut
1352
1353 sub split_rec_on {
1354 die "split_rec_on need (fld,sf,regex[,part]" if ($#_ < 2);
1355
1356 my ($fld, $sf, $regex, $part) = @_;
1357 warn "### regex ", ref($regex), $regex, $/ if ($debug > 2);
1358
1359 my @r = rec( $fld, $sf );
1360 my $v = shift @r;
1361 warn "### first rec($fld,$sf) = ",dump($v),$/ if ($debug > 2);
1362
1363 return '' if ( ! defined($v) || $v =~ /^\s*$/);
1364
1365 my @s = split( $regex, $v );
1366 warn "## split_rec_on($fld,$sf,$regex,$part) = ",dump(@s),$/ if ($debug > 1);
1367 if ($part && $part > 0) {
1368 return $s[ $part - 1 ];
1369 } else {
1370 return @s;
1371 }
1372 }
1373
1374 my $hash;
1375
1376 =head2 set
1377
1378 set( key => 'value' );
1379
1380 =cut
1381
1382 sub set {
1383 my ($k,$v) = @_;
1384 warn "## set ( $k => ", dump($v), " )", $/ if ( $debug );
1385 $hash->{$k} = $v;
1386 };
1387
1388 =head2 get
1389
1390 get( 'key' );
1391
1392 =cut
1393
1394 sub get {
1395 my $k = shift || return;
1396 my $v = $hash->{$k};
1397 warn "## get $k = ", dump( $v ), $/ if ( $debug );
1398 return $v;
1399 }
1400
1401 =head2 count
1402
1403 if ( count( @result ) == 1 ) {
1404 # do something if only 1 result is there
1405 }
1406
1407 =cut
1408
1409 sub count {
1410 warn "## count ",dump(@_),$/ if ( $debug );
1411 return @_ . '';
1412 }
1413
1414 # END
1415 1;

  ViewVC Help
Powered by ViewVC 1.1.26