/[webpac2]/trunk/lib/WebPAC/Normalize.pm
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Contents of /trunk/lib/WebPAC/Normalize.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 983 - (show annotations)
Sun Nov 4 11:12:38 2007 UTC (15 years, 3 months ago) by dpavlin
File size: 30983 byte(s)
 r1505@llin:  dpavlin | 2007-11-04 12:12:20 +0100
 renamed _set_rec to _set_ds (because it's a data_structure actually)
 and added symetric public get_ds to get whole data_structure as
 hash to manually traverse in normalization

1 package WebPAC::Normalize;
2 use Exporter 'import';
3 our @EXPORT = qw/
4 _set_ds _set_lookup
5 get_ds
6 _set_load_row
7 _get_ds _clean_ds
8 _debug
9 _pack_subfields_hash
10
11 search_display search display sorted
12
13 marc marc_indicators marc_repeatable_subfield
14 marc_compose marc_leader marc_fixed
15 marc_duplicate marc_remove marc_count
16 marc_original_order
17
18 rec1 rec2 rec
19 frec
20 regex prefix suffix surround
21 first lookup join_with
22 save_into_lookup
23
24 split_rec_on
25
26 get set
27 count
28
29 /;
30
31 use warnings;
32 use strict;
33
34 #use base qw/WebPAC::Common/;
35 use Data::Dump qw/dump/;
36 use Storable qw/dclone/;
37 use Carp qw/confess/;
38
39 # debugging warn(s)
40 my $debug = 0;
41
42 use WebPAC::Normalize::ISBN;
43 push @EXPORT, ( 'isbn_10', 'isbn_13' );
44
45 =head1 NAME
46
47 WebPAC::Normalize - describe normalisaton rules using sets
48
49 =cut
50
51 our $VERSION = '0.32';
52
53 =head1 SYNOPSIS
54
55 This module uses C<conf/normalize/*.pl> files to perform normalisation
56 from input records using perl functions which are specialized for set
57 processing.
58
59 Sets are implemented as arrays, and normalisation file is valid perl, which
60 means that you check it's validity before running WebPAC using
61 C<perl -c normalize.pl>.
62
63 Normalisation can generate multiple output normalized data. For now, supported output
64 types (on the left side of definition) are: C<search_display>, C<display>, C<search> and
65 C<marc>.
66
67 =head1 FUNCTIONS
68
69 Functions which start with C<_> are private and used by WebPAC internally.
70 All other functions are available for use within normalisation rules.
71
72 =head2 data_structure
73
74 Return data structure
75
76 my $ds = WebPAC::Normalize::data_structure(
77 lookup => $lookup_hash,
78 row => $row,
79 rules => $normalize_pl_config,
80 marc_encoding => 'utf-8',
81 config => $config,
82 load_row_coderef => sub {
83 my ($database,$input,$mfn) = @_;
84 $store->load_row( database => $database, input => $input, id => $mfn );
85 },
86 );
87
88 Options C<row>, C<rules> and C<log> are mandatory while all
89 other are optional.
90
91 C<load_row_coderef> is closure only used when executing lookups, so they will
92 die if it's not defined.
93
94 This function will B<die> if normalizastion can't be evaled.
95
96 Since this function isn't exported you have to call it with
97 C<WebPAC::Normalize::data_structure>.
98
99 =cut
100
101 my $load_row_coderef;
102
103 sub data_structure {
104 my $arg = {@_};
105
106 die "need row argument" unless ($arg->{row});
107 die "need normalisation argument" unless ($arg->{rules});
108
109 no strict 'subs';
110 _set_lookup( $arg->{lookup} ) if defined($arg->{lookup});
111 _set_ds( $arg->{row} );
112 _set_config( $arg->{config} ) if defined($arg->{config});
113 _clean_ds( %{ $arg } );
114 $load_row_coderef = $arg->{load_row_coderef};
115
116 eval "$arg->{rules}";
117 die "error evaling $arg->{rules}: $@\n" if ($@);
118
119 return _get_ds();
120 }
121
122 =head2 _set_ds
123
124 Set current record hash
125
126 _set_ds( $rec );
127
128 =cut
129
130 my $rec;
131
132 sub _set_ds {
133 $rec = shift or die "no record hash";
134 }
135
136 =head2 get_ds
137
138 Access to original record from input module
139
140 my $ds = get_rec;
141
142 =cut
143
144 sub get_ds {
145 return $rec;
146 }
147
148 =head2 _set_config
149
150 Set current config hash
151
152 _set_config( $config );
153
154 Magic keys are:
155
156 =over 4
157
158 =item _
159
160 Code of current database
161
162 =item _mfn
163
164 Current MFN
165
166 =back
167
168 =cut
169
170 my $config;
171
172 sub _set_config {
173 $config = shift;
174 }
175
176 =head2 _get_ds
177
178 Return hash formatted as data structure
179
180 my $ds = _get_ds();
181
182 =cut
183
184 my ($out, $marc_record, $marc_encoding, $marc_repeatable_subfield, $marc_indicators, $marc_leader);
185 my ($marc_record_offset, $marc_fetch_offset) = (0, 0);
186
187 sub _get_ds {
188 #warn "## out = ",dump($out);
189 return $out;
190 }
191
192 =head2 _clean_ds
193
194 Clean data structure hash for next record
195
196 _clean_ds();
197
198 =cut
199
200 sub _clean_ds {
201 my $a = {@_};
202 ($out,$marc_record, $marc_encoding, $marc_repeatable_subfield, $marc_indicators, $marc_leader) = ();
203 ($marc_record_offset, $marc_fetch_offset) = (0,0);
204 $marc_encoding = $a->{marc_encoding};
205 }
206
207 =head2 _set_lookup
208
209 Set current lookup hash
210
211 _set_lookup( $lookup );
212
213 =cut
214
215 my $lookup;
216
217 sub _set_lookup {
218 $lookup = shift;
219 }
220
221 =head2 _get_lookup
222
223 Get current lookup hash
224
225 my $lookup = _get_lookup();
226
227 =cut
228
229 sub _get_lookup {
230 return $lookup;
231 }
232
233 =head2 _set_load_row
234
235 Setup code reference which will return L<data_structure> from
236 L<WebPAC::Store>
237
238 _set_load_row(sub {
239 my ($database,$input,$mfn) = @_;
240 $store->load_row( database => $database, input => $input, id => $mfn );
241 });
242
243 =cut
244
245 sub _set_load_row {
246 my $coderef = shift;
247 confess "argument isn't CODE" unless ref($coderef) eq 'CODE';
248
249 $load_row_coderef = $coderef;
250 }
251
252 =head2 _get_marc_fields
253
254 Get all fields defined by calls to C<marc>
255
256 $marc->add_fields( WebPAC::Normalize:_get_marc_fields() );
257
258 We are using I<magic> which detect repeatable fields only from
259 sequence of field/subfield data generated by normalization.
260
261 Repeatable field is created when there is second occurence of same subfield or
262 if any of indicators are different.
263
264 This is sane for most cases. Something like:
265
266 900a-1 900b-1 900c-1
267 900a-2 900b-2
268 900a-3
269
270 will be created from any combination of:
271
272 900a-1 900a-2 900a-3 900b-1 900b-2 900c-1
273
274 and following rules:
275
276 marc('900','a', rec('200','a') );
277 marc('900','b', rec('200','b') );
278 marc('900','c', rec('200','c') );
279
280 which might not be what you have in mind. If you need repeatable subfield,
281 define it using C<marc_repeatable_subfield> like this:
282
283 marc_repeatable_subfield('900','a');
284 marc('900','a', rec('200','a') );
285 marc('900','b', rec('200','b') );
286 marc('900','c', rec('200','c') );
287
288 will create:
289
290 900a-1 900a-2 900a-3 900b-1 900c-1
291 900b-2
292
293 There is also support for returning next or specific using:
294
295 while (my $mf = WebPAC::Normalize:_get_marc_fields( fetch_next => 1 ) ) {
296 # do something with $mf
297 }
298
299 will always return fields from next MARC record or
300
301 my $mf = WebPAC::Normalize::_get_marc_fields( offset => 42 );
302
303 will return 42th copy record (if it exists).
304
305 =cut
306
307 my $fetch_pos;
308
309 sub _get_marc_fields {
310
311 my $arg = {@_};
312 warn "### _get_marc_fields arg: ", dump($arg), $/ if ($debug > 2);
313 $fetch_pos = $marc_fetch_offset;
314 if ($arg->{offset}) {
315 $fetch_pos = $arg->{offset};
316 } elsif($arg->{fetch_next}) {
317 $marc_fetch_offset++;
318 }
319
320 return if (! $marc_record || ref($marc_record) ne 'ARRAY');
321
322 warn "### full marc_record = ", dump( @{ $marc_record }), $/ if ($debug > 2);
323
324 my $marc_rec = $marc_record->[ $fetch_pos ];
325
326 warn "## _get_marc_fields (at offset: $fetch_pos) -- marc_record = ", dump( @$marc_rec ), $/ if ($debug > 1);
327
328 return if (! $marc_rec || ref($marc_rec) ne 'ARRAY' || $#{ $marc_rec } < 0);
329
330 # first, sort all existing fields
331 # XXX might not be needed, but modern perl might randomize elements in hash
332 my @sorted_marc_record = sort {
333 $a->[0] . ( $a->[3] || '' ) cmp $b->[0] . ( $b->[3] || '')
334 } @{ $marc_rec };
335
336 @sorted_marc_record = @{ $marc_rec }; ### FIXME disable sorting
337
338 # output marc fields
339 my @m;
340
341 # count unique field-subfields (used for offset when walking to next subfield)
342 my $u;
343 map { $u->{ $_->[0] . ( $_->[3] || '') }++ } @sorted_marc_record;
344
345 if ($debug) {
346 warn "## marc_repeatable_subfield = ", dump( $marc_repeatable_subfield ), $/ if ( $marc_repeatable_subfield );
347 warn "## marc_record[$fetch_pos] = ", dump( $marc_rec ), $/;
348 warn "## sorted_marc_record = ", dump( \@sorted_marc_record ), $/;
349 warn "## subfield count = ", dump( $u ), $/;
350 }
351
352 my $len = $#sorted_marc_record;
353 my $visited;
354 my $i = 0;
355 my $field;
356
357 foreach ( 0 .. $len ) {
358
359 # find next element which isn't visited
360 while ($visited->{$i}) {
361 $i = ($i + 1) % ($len + 1);
362 }
363
364 # mark it visited
365 $visited->{$i}++;
366
367 my $row = dclone( $sorted_marc_record[$i] );
368
369 # field and subfield which is key for
370 # marc_repeatable_subfield and u
371 my $fsf = $row->[0] . ( $row->[3] || '' );
372
373 if ($debug > 1) {
374
375 print "### field so far [", $#$field, "] : ", dump( $field ), " ", $field ? 'T' : 'F', $/;
376 print "### this [$i]: ", dump( $row ),$/;
377 print "### sf: ", $row->[3], " vs ", $field->[3],
378 $marc_repeatable_subfield->{ $row->[0] . $row->[3] } ? ' (repeatable)' : '', $/,
379 if ($#$field >= 0);
380
381 }
382
383 # if field exists
384 if ( $#$field >= 0 ) {
385 if (
386 $row->[0] ne $field->[0] || # field
387 $row->[1] ne $field->[1] || # i1
388 $row->[2] ne $field->[2] # i2
389 ) {
390 push @m, $field;
391 warn "## saved/1 ", dump( $field ),$/ if ($debug);
392 $field = $row;
393
394 } elsif (
395 ( $row->[3] lt $field->[-2] ) # subfield which is not next (e.g. a after c)
396 ||
397 ( $row->[3] eq $field->[-2] && # same subfield, but not repeatable
398 ! $marc_repeatable_subfield->{ $fsf }
399 )
400 ) {
401 push @m, $field;
402 warn "## saved/2 ", dump( $field ),$/ if ($debug);
403 $field = $row;
404
405 } else {
406 # append new subfields to existing field
407 push @$field, ( $row->[3], $row->[4] );
408 }
409 } else {
410 # insert first field
411 $field = $row;
412 }
413
414 if (! $marc_repeatable_subfield->{ $fsf }) {
415 # make step to next subfield
416 $i = ($i + $u->{ $fsf } ) % ($len + 1);
417 }
418 }
419
420 if ($#$field >= 0) {
421 push @m, $field;
422 warn "## saved/3 ", dump( $field ),$/ if ($debug);
423 }
424
425 return \@m;
426 }
427
428 =head2 _get_marc_leader
429
430 Return leader from currently fetched record by L</_get_marc_fields>
431
432 print WebPAC::Normalize::_get_marc_leader();
433
434 =cut
435
436 sub _get_marc_leader {
437 die "no fetch_pos, did you called _get_marc_fields first?" unless ( defined( $fetch_pos ) );
438 return $marc_leader->[ $fetch_pos ];
439 }
440
441 =head2 _debug
442
443 Change level of debug warnings
444
445 _debug( 2 );
446
447 =cut
448
449 sub _debug {
450 my $l = shift;
451 return $debug unless defined($l);
452 warn "debug level $l",$/ if ($l > 0);
453 $debug = $l;
454 }
455
456 =head1 Functions to create C<data_structure>
457
458 Those functions generally have to first in your normalization file.
459
460 =head2 search_display
461
462 Define output for L<search> and L<display> at the same time
463
464 search_display('Title', rec('200','a') );
465
466
467 =cut
468
469 sub search_display {
470 my $name = shift or die "search_display needs name as first argument";
471 my @o = grep { defined($_) && $_ ne '' } @_;
472 return unless (@o);
473 $out->{$name}->{search} = \@o;
474 $out->{$name}->{display} = \@o;
475 }
476
477 =head2 tag
478
479 Old name for L<search_display>, but supported
480
481 =cut
482
483 sub tag {
484 search_display( @_ );
485 }
486
487 =head2 display
488
489 Define output just for I<display>
490
491 @v = display('Title', rec('200','a') );
492
493 =cut
494
495 sub _field {
496 my $type = shift or confess "need type -- BUG?";
497 my $name = shift or confess "needs name as first argument";
498 my @o = grep { defined($_) && $_ ne '' } @_;
499 return unless (@o);
500 $out->{$name}->{$type} = \@o;
501 }
502
503 sub display { _field( 'display', @_ ) }
504
505 =head2 search
506
507 Prepare values just for I<search>
508
509 @v = search('Title', rec('200','a') );
510
511 =cut
512
513 sub search { _field( 'search', @_ ) }
514
515 =head2 sorted
516
517 Insert into lists which will be automatically sorted
518
519 sorted('Title', rec('200','a') );
520
521 =cut
522
523 sub sorted { _field( 'sorted', @_ ) }
524
525
526 =head2 marc_leader
527
528 Setup fields within MARC leader or get leader
529
530 marc_leader('05','c');
531 my $leader = marc_leader();
532
533 =cut
534
535 sub marc_leader {
536 my ($offset,$value) = @_;
537
538 if ($offset) {
539 $marc_leader->[ $marc_record_offset ]->{ $offset } = $value;
540 } else {
541
542 if (defined($marc_leader)) {
543 die "marc_leader not array = ", dump( $marc_leader ) unless (ref($marc_leader) eq 'ARRAY');
544 return $marc_leader->[ $marc_record_offset ];
545 } else {
546 return;
547 }
548 }
549 }
550
551 =head2 marc_fixed
552
553 Create control/indentifier fields with values in fixed positions
554
555 marc_fixed('008', 00, '070402');
556 marc_fixed('008', 39, '|');
557
558 Positions not specified will be filled with spaces (C<0x20>).
559
560 There will be no effort to extend last specified value to full length of
561 field in standard.
562
563 =cut
564
565 sub marc_fixed {
566 my ($f, $pos, $val) = @_;
567 die "need marc(field, position, value)" unless defined($f) && defined($pos);
568
569 confess "need val" unless defined $val;
570
571 my $update = 0;
572
573 map {
574 if ($_->[0] eq $f) {
575 my $old = $_->[1];
576 if (length($old) <= $pos) {
577 $_->[1] .= ' ' x ( $pos - length($old) ) . $val;
578 warn "## marc_fixed($f,$pos,'$val') append '$old' -> '$_->[1]'\n" if ($debug > 1);
579 } else {
580 $_->[1] = substr($old, 0, $pos) . $val . substr($old, $pos + length($val));
581 warn "## marc_fixed($f,$pos,'$val') update '$old' -> '$_->[1]'\n" if ($debug > 1);
582 }
583 $update++;
584 }
585 } @{ $marc_record->[ $marc_record_offset ] };
586
587 if (! $update) {
588 my $v = ' ' x $pos . $val;
589 push @{ $marc_record->[ $marc_record_offset ] }, [ $f, $v ];
590 warn "## marc_fixed($f,$pos,'val') created '$v'\n" if ($debug > 1);
591 }
592 }
593
594 =head2 marc
595
596 Save value for MARC field
597
598 marc('900','a', rec('200','a') );
599 marc('001', rec('000') );
600
601 =cut
602
603 sub marc {
604 my $f = shift or die "marc needs field";
605 die "marc field must be numer" unless ($f =~ /^\d+$/);
606
607 my $sf;
608 if ($f >= 10) {
609 $sf = shift or die "marc needs subfield";
610 }
611
612 foreach (@_) {
613 my $v = $_; # make var read-write for Encode
614 next unless (defined($v) && $v !~ /^\s*$/);
615 my ($i1,$i2) = defined($marc_indicators->{$f}) ? @{ $marc_indicators->{$f} } : (' ',' ');
616 if (defined $sf) {
617 push @{ $marc_record->[ $marc_record_offset ] }, [ $f, $i1, $i2, $sf => $v ];
618 } else {
619 push @{ $marc_record->[ $marc_record_offset ] }, [ $f, $v ];
620 }
621 }
622 }
623
624 =head2 marc_repeatable_subfield
625
626 Save values for MARC repetable subfield
627
628 marc_repeatable_subfield('910', 'z', rec('909') );
629
630 =cut
631
632 sub marc_repeatable_subfield {
633 my ($f,$sf) = @_;
634 die "marc_repeatable_subfield need field and subfield!\n" unless ($f && $sf);
635 $marc_repeatable_subfield->{ $f . $sf }++;
636 marc(@_);
637 }
638
639 =head2 marc_indicators
640
641 Set both indicators for MARC field
642
643 marc_indicators('900', ' ', 1);
644
645 Any indicator value other than C<0-9> will be treated as undefined.
646
647 =cut
648
649 sub marc_indicators {
650 my $f = shift || die "marc_indicators need field!\n";
651 my ($i1,$i2) = @_;
652 die "marc_indicators($f, ...) need i1!\n" unless(defined($i1));
653 die "marc_indicators($f, $i1, ...) need i2!\n" unless(defined($i2));
654
655 $i1 = ' ' if ($i1 !~ /^\d$/);
656 $i2 = ' ' if ($i2 !~ /^\d$/);
657 @{ $marc_indicators->{$f} } = ($i1,$i2);
658 }
659
660 =head2 marc_compose
661
662 Save values for each MARC subfield explicitly
663
664 marc_compose('900',
665 'a', rec('200','a')
666 'b', rec('201','a')
667 'a', rec('200','b')
668 'c', rec('200','c')
669 );
670
671 If you specify C<+> for subfield, value will be appended
672 to previous defined subfield.
673
674 =cut
675
676 sub marc_compose {
677 my $f = shift or die "marc_compose needs field";
678 die "marc_compose field must be numer" unless ($f =~ /^\d+$/);
679
680 my ($i1,$i2) = defined($marc_indicators->{$f}) ? @{ $marc_indicators->{$f} } : (' ',' ');
681 my $m = [ $f, $i1, $i2 ];
682
683 warn "### marc_compose input subfields = ", dump(@_),$/ if ($debug > 2);
684
685 if ($#_ % 2 != 1) {
686 die "ERROR: marc_compose",dump($f,@_)," not valid (must be even).\nDo you need to add first() or join() around some argument?\n";
687 }
688
689 while (@_) {
690 my $sf = shift;
691 my $v = shift;
692
693 next unless (defined($v) && $v !~ /^\s*$/);
694 warn "## ++ marc_compose($f,$sf,$v) ", dump( $m ),$/ if ($debug > 1);
695 if ($sf ne '+') {
696 push @$m, ( $sf, $v );
697 } else {
698 $m->[ $#$m ] .= $v;
699 }
700 }
701
702 warn "## marc_compose current marc = ", dump( $m ),$/ if ($debug > 1);
703
704 push @{ $marc_record->[ $marc_record_offset ] }, $m if ($#{$m} > 2);
705 }
706
707 =head2 marc_duplicate
708
709 Generate copy of current MARC record and continue working on copy
710
711 marc_duplicate();
712
713 Copies can be accessed using C<< _get_marc_fields( fetch_next => 1 ) >> or
714 C<< _get_marc_fields( offset => 42 ) >>.
715
716 =cut
717
718 sub marc_duplicate {
719 my $m = $marc_record->[ -1 ];
720 die "can't duplicate record which isn't defined" unless ($m);
721 push @{ $marc_record }, dclone( $m );
722 push @{ $marc_leader }, dclone( marc_leader() );
723 warn "## marc_duplicate = ", dump(@$marc_leader, @$marc_record), $/ if ($debug > 1);
724 $marc_record_offset = $#{ $marc_record };
725 warn "## marc_record_offset = $marc_record_offset", $/ if ($debug > 1);
726
727 }
728
729 =head2 marc_remove
730
731 Remove some field or subfield from MARC record.
732
733 marc_remove('200');
734 marc_remove('200','a');
735
736 This will erase field C<200> or C<200^a> from current MARC record.
737
738 marc_remove('*');
739
740 Will remove all fields in current MARC record.
741
742 This is useful after calling C<marc_duplicate> or on it's own (but, you
743 should probably just remove that subfield definition if you are not
744 using C<marc_duplicate>).
745
746 FIXME: support fields < 10.
747
748 =cut
749
750 sub marc_remove {
751 my ($f, $sf) = @_;
752
753 die "marc_remove needs record number" unless defined($f);
754
755 my $marc = $marc_record->[ $marc_record_offset ];
756
757 warn "### marc_remove before = ", dump( $marc ), $/ if ($debug > 2);
758
759 if ($f eq '*') {
760
761 delete( $marc_record->[ $marc_record_offset ] );
762 warn "## full marc_record = ", dump( @{ $marc_record }), $/ if ($debug > 1);
763
764 } else {
765
766 my $i = 0;
767 foreach ( 0 .. $#{ $marc } ) {
768 last unless (defined $marc->[$i]);
769 warn "#### working on ",dump( @{ $marc->[$i] }), $/ if ($debug > 3);
770 if ($marc->[$i]->[0] eq $f) {
771 if (! defined $sf) {
772 # remove whole field
773 splice @$marc, $i, 1;
774 warn "#### slice \@\$marc, $i, 1 = ",dump( @{ $marc }), $/ if ($debug > 3);
775 $i--;
776 } else {
777 foreach my $j ( 0 .. (( $#{ $marc->[$i] } - 3 ) / 2) ) {
778 my $o = ($j * 2) + 3;
779 if ($marc->[$i]->[$o] eq $sf) {
780 # remove subfield
781 splice @{$marc->[$i]}, $o, 2;
782 warn "#### slice \@{\$marc->[$i]}, $o, 2 = ", dump( @{ $marc }), $/ if ($debug > 3);
783 # is record now empty?
784 if ($#{ $marc->[$i] } == 2) {
785 splice @$marc, $i, 1;
786 warn "#### slice \@\$marc, $i, 1 = ", dump( @{ $marc }), $/ if ($debug > 3);
787 $i--;
788 };
789 }
790 }
791 }
792 }
793 $i++;
794 }
795
796 warn "### marc_remove($f", $sf ? ",$sf" : "", ") after = ", dump( $marc ), $/ if ($debug > 2);
797
798 $marc_record->[ $marc_record_offset ] = $marc;
799 }
800
801 warn "## full marc_record = ", dump( @{ $marc_record }), $/ if ($debug > 1);
802 }
803
804 =head2 marc_original_order
805
806 Copy all subfields preserving original order to marc field.
807
808 marc_original_order( marc_field_number, original_input_field_number );
809
810 Please note that field numbers are consistent with other commands (marc
811 field number first), but somewhat counter-intuitive (destination and then
812 source).
813
814 You might want to use this command if you are just renaming subfields or
815 using pre-processing modify_record in C<config.yml> and don't need any
816 post-processing or want to preserve order of original subfields.
817
818
819 =cut
820
821 sub marc_original_order {
822
823 my ($to, $from) = @_;
824 die "marc_original_order needs from and to fields\n" unless ($from && $to);
825
826 return unless defined($rec->{$from});
827
828 my $r = $rec->{$from};
829 die "record field $from isn't array\n" unless (ref($r) eq 'ARRAY');
830
831 my ($i1,$i2) = defined($marc_indicators->{$to}) ? @{ $marc_indicators->{$to} } : (' ',' ');
832 warn "## marc_original_order($to,$from) source = ", dump( $r ),$/ if ($debug > 1);
833
834 foreach my $d (@$r) {
835
836 if (! defined($d->{subfields}) && ref($d->{subfields}) ne 'ARRAY') {
837 warn "# marc_original_order($to,$from): field $from doesn't have subfields specification\n";
838 next;
839 }
840
841 my @sfs = @{ $d->{subfields} };
842
843 die "field $from doesn't have even number of subfields specifications\n" unless($#sfs % 2 == 1);
844
845 warn "#--> d: ",dump($d), "\n#--> sfs: ",dump(@sfs),$/ if ($debug > 2);
846
847 my $m = [ $to, $i1, $i2 ];
848
849 while (my $sf = shift @sfs) {
850
851 warn "#--> sf: ",dump($sf), $/ if ($debug > 2);
852 my $offset = shift @sfs;
853 die "corrupted sufields specification for field $from\n" unless defined($offset);
854
855 my $v;
856 if (ref($d->{$sf}) eq 'ARRAY') {
857 $v = $d->{$sf}->[$offset] if (defined($d->{$sf}->[$offset]));
858 } elsif ($offset == 0) {
859 $v = $d->{$sf};
860 } else {
861 die "field $from subfield '$sf' need occurence $offset which doesn't exist", dump($d->{$sf});
862 }
863 push @$m, ( $sf, $v ) if (defined($v));
864 }
865
866 if ($#{$m} > 2) {
867 push @{ $marc_record->[ $marc_record_offset ] }, $m;
868 }
869 }
870
871 warn "## marc_record = ", dump( $marc_record ),$/ if ($debug > 1);
872 }
873
874 =head2 marc_count
875
876 Return number of MARC records created using L</marc_duplicate>.
877
878 print "created ", marc_count(), " records";
879
880 =cut
881
882 sub marc_count {
883 return $#{ $marc_record };
884 }
885
886
887 =head1 Functions to extract data from input
888
889 This function should be used inside functions to create C<data_structure> described
890 above.
891
892 =head2 _pack_subfields_hash
893
894 @subfields = _pack_subfields_hash( $h );
895 $subfields = _pack_subfields_hash( $h, 1 );
896
897 Return each subfield value in array or pack them all together and return scalar
898 with subfields (denoted by C<^>) and values.
899
900 =cut
901
902 sub _pack_subfields_hash {
903
904 warn "## _pack_subfields_hash( ",dump(@_), " )\n" if ($debug > 1);
905
906 my ($h,$include_subfields) = @_;
907
908 # sanity and ease of use
909 return $h if (ref($h) ne 'HASH');
910
911 if ( defined($h->{subfields}) ) {
912 my $sfs = delete $h->{subfields} || die "no subfields?";
913 my @out;
914 while (@$sfs) {
915 my $sf = shift @$sfs;
916 push @out, '^' . $sf if ($include_subfields);
917 my $o = shift @$sfs;
918 if ($o == 0 && ref( $h->{$sf} ) ne 'ARRAY' ) {
919 # single element subfields are not arrays
920 #warn "====> $sf $o / $#$sfs ", dump( $sfs, $h->{$sf} ), "\n";
921
922 push @out, $h->{$sf};
923 } else {
924 #warn "====> $sf $o / $#$sfs ", dump( $sfs, $h->{$sf} ), "\n";
925 push @out, $h->{$sf}->[$o];
926 }
927 }
928 if ($include_subfields) {
929 return join('', @out);
930 } else {
931 return @out;
932 }
933 } else {
934 if ($include_subfields) {
935 my $out = '';
936 foreach my $sf (sort keys %$h) {
937 if (ref($h->{$sf}) eq 'ARRAY') {
938 $out .= '^' . $sf . join('^' . $sf, @{ $h->{$sf} });
939 } else {
940 $out .= '^' . $sf . $h->{$sf};
941 }
942 }
943 return $out;
944 } else {
945 # FIXME this should probably be in alphabetical order instead of hash order
946 values %{$h};
947 }
948 }
949 }
950
951 =head2 rec1
952
953 Return all values in some field
954
955 @v = rec1('200')
956
957 TODO: order of values is probably same as in source data, need to investigate that
958
959 =cut
960
961 sub rec1 {
962 my $f = shift;
963 warn "rec1($f) = ", dump( $rec->{$f} ), $/ if ($debug > 1);
964 return unless (defined($rec) && defined($rec->{$f}));
965 warn "rec1($f) = ", dump( $rec->{$f} ), $/ if ($debug > 1);
966 if (ref($rec->{$f}) eq 'ARRAY') {
967 my @out;
968 foreach my $h ( @{ $rec->{$f} } ) {
969 if (ref($h) eq 'HASH') {
970 push @out, ( _pack_subfields_hash( $h ) );
971 } else {
972 push @out, $h;
973 }
974 }
975 return @out;
976 } elsif( defined($rec->{$f}) ) {
977 return $rec->{$f};
978 }
979 }
980
981 =head2 rec2
982
983 Return all values in specific field and subfield
984
985 @v = rec2('200','a')
986
987 =cut
988
989 sub rec2 {
990 my $f = shift;
991 return unless (defined($rec && $rec->{$f}));
992 my $sf = shift;
993 warn "rec2($f,$sf) = ", dump( $rec->{$f} ), $/ if ($debug > 1);
994 return map {
995 if (ref($_->{$sf}) eq 'ARRAY') {
996 @{ $_->{$sf} };
997 } else {
998 $_->{$sf};
999 }
1000 } grep { ref($_) eq 'HASH' && $_->{$sf} } @{ $rec->{$f} };
1001 }
1002
1003 =head2 rec
1004
1005 syntaxtic sugar for
1006
1007 @v = rec('200')
1008 @v = rec('200','a')
1009
1010 If rec() returns just single value, it will
1011 return scalar, not array.
1012
1013 =cut
1014
1015 sub frec {
1016 my @out = rec(@_);
1017 warn "rec(",dump(@_),") has more than one return value, ignoring\n" if $#out > 0;
1018 return shift @out;
1019 }
1020
1021 sub rec {
1022 my @out;
1023 if ($#_ == 0) {
1024 @out = rec1(@_);
1025 } elsif ($#_ == 1) {
1026 @out = rec2(@_);
1027 }
1028 if ($#out == 0 && ! wantarray) {
1029 return $out[0];
1030 } elsif (@out) {
1031 return @out;
1032 } else {
1033 return '';
1034 }
1035 }
1036
1037 =head2 regex
1038
1039 Apply regex to some or all values
1040
1041 @v = regex( 's/foo/bar/g', @v );
1042
1043 =cut
1044
1045 sub regex {
1046 my $r = shift;
1047 my @out;
1048 #warn "r: $r\n", dump(\@_);
1049 foreach my $t (@_) {
1050 next unless ($t);
1051 eval "\$t =~ $r";
1052 push @out, $t if ($t && $t ne '');
1053 }
1054 return @out;
1055 }
1056
1057 =head2 prefix
1058
1059 Prefix all values with a string
1060
1061 @v = prefix( 'my_', @v );
1062
1063 =cut
1064
1065 sub prefix {
1066 my $p = shift;
1067 return @_ unless defined( $p );
1068 return map { $p . $_ } grep { defined($_) } @_;
1069 }
1070
1071 =head2 suffix
1072
1073 suffix all values with a string
1074
1075 @v = suffix( '_my', @v );
1076
1077 =cut
1078
1079 sub suffix {
1080 my $s = shift;
1081 return @_ unless defined( $s );
1082 return map { $_ . $s } grep { defined($_) } @_;
1083 }
1084
1085 =head2 surround
1086
1087 surround all values with a two strings
1088
1089 @v = surround( 'prefix_', '_suffix', @v );
1090
1091 =cut
1092
1093 sub surround {
1094 my $p = shift;
1095 my $s = shift;
1096 $p = '' unless defined( $p );
1097 $s = '' unless defined( $s );
1098 return map { $p . $_ . $s } grep { defined($_) } @_;
1099 }
1100
1101 =head2 first
1102
1103 Return first element
1104
1105 $v = first( @v );
1106
1107 =cut
1108
1109 sub first {
1110 my $r = shift;
1111 return $r;
1112 }
1113
1114 =head2 lookup
1115
1116 Consult lookup hashes for some value
1117
1118 @v = lookup(
1119 sub {
1120 'ffkk/peri/mfn'.rec('000')
1121 },
1122 'ffkk','peri','200-a-200-e',
1123 sub {
1124 first(rec(200,'a')).' '.first(rec('200','e'))
1125 }
1126 );
1127
1128 Code like above will be B<automatically generated> using L<WebPAC::Parse> from
1129 normal lookup definition in C<conf/lookup/something.pl> which looks like:
1130
1131 lookup(
1132 # which results to return from record recorded in lookup
1133 sub { 'ffkk/peri/mfn' . rec('000') },
1134 # from which database and input
1135 'ffkk','peri',
1136 # such that following values match
1137 sub { first(rec(200,'a')) . ' ' . first(rec('200','e')) },
1138 # if this part is missing, we will try to match same fields
1139 # from lookup record and current one, or you can override
1140 # which records to use from current record using
1141 sub { rec('900','x') . ' ' . rec('900','y') },
1142 )
1143
1144 You can think about this lookup as SQL (if that helps):
1145
1146 select
1147 sub { what }
1148 from
1149 database, input
1150 where
1151 sub { filter from lookuped record }
1152 having
1153 sub { optional filter on current record }
1154
1155 Easy as pie, right?
1156
1157 =cut
1158
1159 sub lookup {
1160 my ($what, $database, $input, $key, $having) = @_;
1161
1162 confess "lookup needs 5 arguments: what, database, input, key, having\n" unless ($#_ == 4);
1163
1164 warn "## lookup ($database, $input, $key)", $/ if ($debug > 1);
1165 return unless (defined($lookup->{$database}->{$input}->{$key}));
1166
1167 confess "lookup really need load_row_coderef added to data_structure\n" unless ($load_row_coderef);
1168
1169 my $mfns;
1170 my @having = $having->();
1171
1172 warn "## having = ", dump( @having ) if ($debug > 2);
1173
1174 foreach my $h ( @having ) {
1175 if (defined($lookup->{$database}->{$input}->{$key}->{$h})) {
1176 warn "lookup for $database/$input/$key/$h return ",dump($lookup->{$database}->{$input}->{$key}->{$h}),"\n" if ($debug);
1177 $mfns->{$_}++ foreach keys %{ $lookup->{$database}->{$input}->{$key}->{$h} };
1178 }
1179 }
1180
1181 return unless ($mfns);
1182
1183 my @mfns = sort keys %$mfns;
1184
1185 warn "# lookup loading $database/$input/$key mfn ", join(",",@mfns)," having ",dump(@having),"\n" if ($debug);
1186
1187 my $old_rec = $rec;
1188 my @out;
1189
1190 foreach my $mfn (@mfns) {
1191 $rec = $load_row_coderef->( $database, $input, $mfn );
1192
1193 warn "got $database/$input/$mfn = ", dump($rec), $/ if ($debug);
1194
1195 my @vals = $what->();
1196
1197 push @out, ( @vals );
1198
1199 warn "lookup for mfn $mfn returned ", dump(@vals), $/ if ($debug);
1200 }
1201
1202 # if (ref($lookup->{$k}) eq 'ARRAY') {
1203 # return @{ $lookup->{$k} };
1204 # } else {
1205 # return $lookup->{$k};
1206 # }
1207
1208 $rec = $old_rec;
1209
1210 warn "## lookup returns = ", dump(@out), $/ if ($debug);
1211
1212 if ($#out == 0) {
1213 return $out[0];
1214 } else {
1215 return @out;
1216 }
1217 }
1218
1219 =head2 save_into_lookup
1220
1221 Save value into lookup. It associates current database, input
1222 and specific keys with one or more values which will be
1223 associated over MFN.
1224
1225 MFN will be extracted from first occurence current of field 000
1226 in current record, or if it doesn't exist from L<_set_config> C<_mfn>.
1227
1228 my $nr = save_into_lookup($database,$input,$key,sub {
1229 # code which produce one or more values
1230 });
1231
1232 It returns number of items saved.
1233
1234 This function shouldn't be called directly, it's called from code created by
1235 L<WebPAC::Parser>.
1236
1237 =cut
1238
1239 sub save_into_lookup {
1240 my ($database,$input,$key,$coderef) = @_;
1241 die "save_into_lookup needs database" unless defined($database);
1242 die "save_into_lookup needs input" unless defined($input);
1243 die "save_into_lookup needs key" unless defined($key);
1244 die "save_into_lookup needs CODE" unless ( defined($coderef) && ref($coderef) eq 'CODE' );
1245
1246 warn "## save_into_lookup rec = ", dump($rec), " config = ", dump($config), $/ if ($debug > 2);
1247
1248 my $mfn =
1249 defined($rec->{'000'}->[0]) ? $rec->{'000'}->[0] :
1250 defined($config->{_mfn}) ? $config->{_mfn} :
1251 die "mfn not defined or zero";
1252
1253 my $nr = 0;
1254
1255 foreach my $v ( $coderef->() ) {
1256 $lookup->{$database}->{$input}->{$key}->{$v}->{$mfn}++;
1257 warn "# saved lookup $database/$input/$key [$v] $mfn\n" if ($debug > 1);
1258 $nr++;
1259 }
1260
1261 return $nr;
1262 }
1263
1264 =head2 config
1265
1266 Consult config values stored in C<config.yml>
1267
1268 # return database code (key under databases in yaml)
1269 $database_code = config(); # use _ from hash
1270 $database_name = config('name');
1271 $database_input_name = config('input name');
1272
1273 Up to three levels are supported.
1274
1275 =cut
1276
1277 sub config {
1278 return unless ($config);
1279
1280 my $p = shift;
1281
1282 $p ||= '';
1283
1284 my $v;
1285
1286 warn "### getting config($p)\n" if ($debug > 1);
1287
1288 my @p = split(/\s+/,$p);
1289 if ($#p < 0) {
1290 $v = $config->{ '_' }; # special, database code
1291 } else {
1292
1293 my $c = dclone( $config );
1294
1295 foreach my $k (@p) {
1296 warn "### k: $k c = ",dump($c),$/ if ($debug > 1);
1297 if (ref($c) eq 'ARRAY') {
1298 $c = shift @$c;
1299 warn "config($p) taking first occurence of '$k', probably not what you wanted!\n";
1300 last;
1301 }
1302
1303 if (! defined($c->{$k}) ) {
1304 $c = undef;
1305 last;
1306 } else {
1307 $c = $c->{$k};
1308 }
1309 }
1310 $v = $c if ($c);
1311
1312 }
1313
1314 warn "## config( '$p' ) = ",dump( $v ),$/ if ($v && $debug);
1315 warn "config( '$p' ) is empty\n" if (! $v);
1316
1317 return $v;
1318 }
1319
1320 =head2 id
1321
1322 Returns unique id of this record
1323
1324 $id = id();
1325
1326 Returns C<42/2> for 2nd occurence of MFN 42.
1327
1328 =cut
1329
1330 sub id {
1331 my $mfn = $config->{_mfn} || die "no _mfn in config data";
1332 return $mfn . $#{$marc_record} ? $#{$marc_record} + 1 : '';
1333 }
1334
1335 =head2 join_with
1336
1337 Joins walues with some delimiter
1338
1339 $v = join_with(", ", @v);
1340
1341 =cut
1342
1343 sub join_with {
1344 my $d = shift;
1345 warn "### join_with('$d',",dump(@_),")\n" if ($debug > 2);
1346 my $v = join($d, grep { defined($_) && $_ ne '' } @_);
1347 return '' unless defined($v);
1348 return $v;
1349 }
1350
1351 =head2 split_rec_on
1352
1353 Split record subfield on some regex and take one of parts out
1354
1355 $a_before_semi_column =
1356 split_rec_on('200','a', /\s*;\s*/, $part);
1357
1358 C<$part> is optional number of element. First element is
1359 B<1>, not 0!
1360
1361 If there is no C<$part> parameter or C<$part> is 0, this function will
1362 return all values produced by splitting.
1363
1364 =cut
1365
1366 sub split_rec_on {
1367 die "split_rec_on need (fld,sf,regex[,part]" if ($#_ < 2);
1368
1369 my ($fld, $sf, $regex, $part) = @_;
1370 warn "### regex ", ref($regex), $regex, $/ if ($debug > 2);
1371
1372 my @r = rec( $fld, $sf );
1373 my $v = shift @r;
1374 warn "### first rec($fld,$sf) = ",dump($v),$/ if ($debug > 2);
1375
1376 return '' if ( ! defined($v) || $v =~ /^\s*$/);
1377
1378 my @s = split( $regex, $v );
1379 warn "## split_rec_on($fld,$sf,$regex,$part) = ",dump(@s),$/ if ($debug > 1);
1380 if ($part && $part > 0) {
1381 return $s[ $part - 1 ];
1382 } else {
1383 return @s;
1384 }
1385 }
1386
1387 my $hash;
1388
1389 =head2 set
1390
1391 set( key => 'value' );
1392
1393 =cut
1394
1395 sub set {
1396 my ($k,$v) = @_;
1397 warn "## set ( $k => ", dump($v), " )", $/ if ( $debug );
1398 $hash->{$k} = $v;
1399 };
1400
1401 =head2 get
1402
1403 get( 'key' );
1404
1405 =cut
1406
1407 sub get {
1408 my $k = shift || return;
1409 my $v = $hash->{$k};
1410 warn "## get $k = ", dump( $v ), $/ if ( $debug );
1411 return $v;
1412 }
1413
1414 =head2 count
1415
1416 if ( count( @result ) == 1 ) {
1417 # do something if only 1 result is there
1418 }
1419
1420 =cut
1421
1422 sub count {
1423 warn "## count ",dump(@_),$/ if ( $debug );
1424 return @_ . '';
1425 }
1426
1427 # END
1428 1;

  ViewVC Help
Powered by ViewVC 1.1.26