/[webpac2]/trunk/lib/WebPAC/Normalize.pm
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Contents of /trunk/lib/WebPAC/Normalize.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1013 - (show annotations)
Wed Nov 7 11:54:34 2007 UTC (16 years, 5 months ago) by dpavlin
File size: 35138 byte(s)
 r1567@llin:  dpavlin | 2007-11-07 12:54:20 +0100
 huge dump of marc_template, a new way to specify mappings
 which is basically WebPAC v1 in a sub :-)

1 package WebPAC::Normalize;
2 use Exporter 'import';
3 our @EXPORT = qw/
4 _set_ds _set_lookup
5 _set_load_row
6 _get_ds _clean_ds
7 _debug
8 _pack_subfields_hash
9
10 search_display search display sorted
11
12 marc marc_indicators marc_repeatable_subfield
13 marc_compose marc_leader marc_fixed
14 marc_duplicate marc_remove marc_count
15 marc_original_order
16 marc_template
17
18 rec1 rec2 rec
19 frec
20 regex prefix suffix surround
21 first lookup join_with
22 save_into_lookup
23
24 split_rec_on
25
26 get set
27 count
28
29 /;
30
31 use warnings;
32 use strict;
33
34 #use base qw/WebPAC::Common/;
35 use Data::Dump qw/dump/;
36 use Storable qw/dclone/;
37 use Carp qw/confess/;
38
39 # debugging warn(s)
40 my $debug = 0;
41
42 use WebPAC::Normalize::ISBN;
43 push @EXPORT, ( 'isbn_10', 'isbn_13' );
44
45 =head1 NAME
46
47 WebPAC::Normalize - describe normalisaton rules using sets
48
49 =cut
50
51 our $VERSION = '0.33';
52
53 =head1 SYNOPSIS
54
55 This module uses C<conf/normalize/*.pl> files to perform normalisation
56 from input records using perl functions which are specialized for set
57 processing.
58
59 Sets are implemented as arrays, and normalisation file is valid perl, which
60 means that you check it's validity before running WebPAC using
61 C<perl -c normalize.pl>.
62
63 Normalisation can generate multiple output normalized data. For now, supported output
64 types (on the left side of definition) are: C<search_display>, C<display>, C<search> and
65 C<marc>.
66
67 =head1 FUNCTIONS
68
69 Functions which start with C<_> are private and used by WebPAC internally.
70 All other functions are available for use within normalisation rules.
71
72 =head2 data_structure
73
74 Return data structure
75
76 my $ds = WebPAC::Normalize::data_structure(
77 lookup => $lookup_hash,
78 row => $row,
79 rules => $normalize_pl_config,
80 marc_encoding => 'utf-8',
81 config => $config,
82 load_row_coderef => sub {
83 my ($database,$input,$mfn) = @_;
84 $store->load_row( database => $database, input => $input, id => $mfn );
85 },
86 );
87
88 Options C<row>, C<rules> and C<log> are mandatory while all
89 other are optional.
90
91 C<load_row_coderef> is closure only used when executing lookups, so they will
92 die if it's not defined.
93
94 This function will B<die> if normalizastion can't be evaled.
95
96 Since this function isn't exported you have to call it with
97 C<WebPAC::Normalize::data_structure>.
98
99 =cut
100
101 my $load_row_coderef;
102
103 sub data_structure {
104 my $arg = {@_};
105
106 die "need row argument" unless ($arg->{row});
107 die "need normalisation argument" unless ($arg->{rules});
108
109 _set_lookup( $arg->{lookup} ) if defined($arg->{lookup});
110 _set_ds( $arg->{row} );
111 _set_config( $arg->{config} ) if defined($arg->{config});
112 _clean_ds( %{ $arg } );
113 $load_row_coderef = $arg->{load_row_coderef};
114
115 no strict 'subs';
116 no warnings 'redefine';
117 eval "$arg->{rules};";
118 die "error evaling $arg->{rules}: $@\n" if ($@);
119
120 return _get_ds();
121 }
122
123 =head2 _set_ds
124
125 Set current record hash
126
127 _set_ds( $rec );
128
129 =cut
130
131 my $rec;
132
133 sub _set_ds {
134 $rec = shift or die "no record hash";
135 }
136
137 =head2 _set_config
138
139 Set current config hash
140
141 _set_config( $config );
142
143 Magic keys are:
144
145 =over 4
146
147 =item _
148
149 Code of current database
150
151 =item _mfn
152
153 Current MFN
154
155 =back
156
157 =cut
158
159 my $config;
160
161 sub _set_config {
162 $config = shift;
163 }
164
165 =head2 _get_ds
166
167 Return hash formatted as data structure
168
169 my $ds = _get_ds();
170
171 =cut
172
173 my ($out, $marc_record, $marc_encoding, $marc_repeatable_subfield, $marc_indicators, $marc_leader);
174 my ($marc_record_offset, $marc_fetch_offset) = (0, 0);
175
176 sub _get_ds {
177 #warn "## out = ",dump($out);
178 return $out;
179 }
180
181 =head2 _clean_ds
182
183 Clean data structure hash for next record
184
185 _clean_ds();
186
187 =cut
188
189 sub _clean_ds {
190 my $a = {@_};
191 ($out,$marc_record, $marc_encoding, $marc_repeatable_subfield, $marc_indicators, $marc_leader) = ();
192 ($marc_record_offset, $marc_fetch_offset) = (0,0);
193 $marc_encoding = $a->{marc_encoding};
194 }
195
196 =head2 _set_lookup
197
198 Set current lookup hash
199
200 _set_lookup( $lookup );
201
202 =cut
203
204 my $lookup;
205
206 sub _set_lookup {
207 $lookup = shift;
208 }
209
210 =head2 _get_lookup
211
212 Get current lookup hash
213
214 my $lookup = _get_lookup();
215
216 =cut
217
218 sub _get_lookup {
219 return $lookup;
220 }
221
222 =head2 _set_load_row
223
224 Setup code reference which will return L<data_structure> from
225 L<WebPAC::Store>
226
227 _set_load_row(sub {
228 my ($database,$input,$mfn) = @_;
229 $store->load_row( database => $database, input => $input, id => $mfn );
230 });
231
232 =cut
233
234 sub _set_load_row {
235 my $coderef = shift;
236 confess "argument isn't CODE" unless ref($coderef) eq 'CODE';
237
238 $load_row_coderef = $coderef;
239 }
240
241 =head2 _get_marc_fields
242
243 Get all fields defined by calls to C<marc>
244
245 $marc->add_fields( WebPAC::Normalize:_get_marc_fields() );
246
247 We are using I<magic> which detect repeatable fields only from
248 sequence of field/subfield data generated by normalization.
249
250 Repeatable field is created when there is second occurence of same subfield or
251 if any of indicators are different.
252
253 This is sane for most cases. Something like:
254
255 900a-1 900b-1 900c-1
256 900a-2 900b-2
257 900a-3
258
259 will be created from any combination of:
260
261 900a-1 900a-2 900a-3 900b-1 900b-2 900c-1
262
263 and following rules:
264
265 marc('900','a', rec('200','a') );
266 marc('900','b', rec('200','b') );
267 marc('900','c', rec('200','c') );
268
269 which might not be what you have in mind. If you need repeatable subfield,
270 define it using C<marc_repeatable_subfield> like this:
271
272 marc_repeatable_subfield('900','a');
273 marc('900','a', rec('200','a') );
274 marc('900','b', rec('200','b') );
275 marc('900','c', rec('200','c') );
276
277 will create:
278
279 900a-1 900a-2 900a-3 900b-1 900c-1
280 900b-2
281
282 There is also support for returning next or specific using:
283
284 while (my $mf = WebPAC::Normalize:_get_marc_fields( fetch_next => 1 ) ) {
285 # do something with $mf
286 }
287
288 will always return fields from next MARC record or
289
290 my $mf = WebPAC::Normalize::_get_marc_fields( offset => 42 );
291
292 will return 42th copy record (if it exists).
293
294 =cut
295
296 my $fetch_pos;
297
298 sub _get_marc_fields {
299
300 my $arg = {@_};
301 warn "### _get_marc_fields arg: ", dump($arg), $/ if ($debug > 2);
302 $fetch_pos = $marc_fetch_offset;
303 if ($arg->{offset}) {
304 $fetch_pos = $arg->{offset};
305 } elsif($arg->{fetch_next}) {
306 $marc_fetch_offset++;
307 }
308
309 return if (! $marc_record || ref($marc_record) ne 'ARRAY');
310
311 warn "### full marc_record = ", dump( @{ $marc_record }), $/ if ($debug > 2);
312
313 my $marc_rec = $marc_record->[ $fetch_pos ];
314
315 warn "## _get_marc_fields (at offset: $fetch_pos) -- marc_record = ", dump( @$marc_rec ), $/ if ($debug > 1);
316
317 return if (! $marc_rec || ref($marc_rec) ne 'ARRAY' || $#{ $marc_rec } < 0);
318
319 # first, sort all existing fields
320 # XXX might not be needed, but modern perl might randomize elements in hash
321 my @sorted_marc_record = sort {
322 $a->[0] . ( $a->[3] || '' ) cmp $b->[0] . ( $b->[3] || '')
323 } @{ $marc_rec };
324
325 @sorted_marc_record = @{ $marc_rec }; ### FIXME disable sorting
326
327 # output marc fields
328 my @m;
329
330 # count unique field-subfields (used for offset when walking to next subfield)
331 my $u;
332 map { $u->{ $_->[0] . ( $_->[3] || '') }++ } @sorted_marc_record;
333
334 if ($debug) {
335 warn "## marc_repeatable_subfield = ", dump( $marc_repeatable_subfield ), $/ if ( $marc_repeatable_subfield );
336 warn "## marc_record[$fetch_pos] = ", dump( $marc_rec ), $/;
337 warn "## sorted_marc_record = ", dump( \@sorted_marc_record ), $/;
338 warn "## subfield count = ", dump( $u ), $/;
339 }
340
341 my $len = $#sorted_marc_record;
342 my $visited;
343 my $i = 0;
344 my $field;
345
346 foreach ( 0 .. $len ) {
347
348 # find next element which isn't visited
349 while ($visited->{$i}) {
350 $i = ($i + 1) % ($len + 1);
351 }
352
353 # mark it visited
354 $visited->{$i}++;
355
356 my $row = dclone( $sorted_marc_record[$i] );
357
358 # field and subfield which is key for
359 # marc_repeatable_subfield and u
360 my $fsf = $row->[0] . ( $row->[3] || '' );
361
362 if ($debug > 1) {
363
364 print "### field so far [", $#$field, "] : ", dump( $field ), " ", $field ? 'T' : 'F', $/;
365 print "### this [$i]: ", dump( $row ),$/;
366 print "### sf: ", $row->[3], " vs ", $field->[3],
367 $marc_repeatable_subfield->{ $row->[0] . $row->[3] } ? ' (repeatable)' : '', $/,
368 if ($#$field >= 0);
369
370 }
371
372 # if field exists
373 if ( $#$field >= 0 ) {
374 if (
375 $row->[0] ne $field->[0] || # field
376 $row->[1] ne $field->[1] || # i1
377 $row->[2] ne $field->[2] # i2
378 ) {
379 push @m, $field;
380 warn "## saved/1 ", dump( $field ),$/ if ($debug);
381 $field = $row;
382
383 } elsif (
384 ( $row->[3] lt $field->[-2] ) # subfield which is not next (e.g. a after c)
385 ||
386 ( $row->[3] eq $field->[-2] && # same subfield, but not repeatable
387 ! $marc_repeatable_subfield->{ $fsf }
388 )
389 ) {
390 push @m, $field;
391 warn "## saved/2 ", dump( $field ),$/ if ($debug);
392 $field = $row;
393
394 } else {
395 # append new subfields to existing field
396 push @$field, ( $row->[3], $row->[4] );
397 }
398 } else {
399 # insert first field
400 $field = $row;
401 }
402
403 if (! $marc_repeatable_subfield->{ $fsf }) {
404 # make step to next subfield
405 $i = ($i + $u->{ $fsf } ) % ($len + 1);
406 }
407 }
408
409 if ($#$field >= 0) {
410 push @m, $field;
411 warn "## saved/3 ", dump( $field ),$/ if ($debug);
412 }
413
414 return \@m;
415 }
416
417 =head2 _get_marc_leader
418
419 Return leader from currently fetched record by L</_get_marc_fields>
420
421 print WebPAC::Normalize::_get_marc_leader();
422
423 =cut
424
425 sub _get_marc_leader {
426 die "no fetch_pos, did you called _get_marc_fields first?" unless ( defined( $fetch_pos ) );
427 return $marc_leader->[ $fetch_pos ];
428 }
429
430 =head2 _debug
431
432 Change level of debug warnings
433
434 _debug( 2 );
435
436 =cut
437
438 sub _debug {
439 my $l = shift;
440 return $debug unless defined($l);
441 warn "debug level $l",$/ if ($l > 0);
442 $debug = $l;
443 }
444
445 =head1 Functions to create C<data_structure>
446
447 Those functions generally have to first in your normalization file.
448
449 =head2 search_display
450
451 Define output for L<search> and L<display> at the same time
452
453 search_display('Title', rec('200','a') );
454
455
456 =cut
457
458 sub search_display {
459 my $name = shift or die "search_display needs name as first argument";
460 my @o = grep { defined($_) && $_ ne '' } @_;
461 return unless (@o);
462 $out->{$name}->{search} = \@o;
463 $out->{$name}->{display} = \@o;
464 }
465
466 =head2 tag
467
468 Old name for L<search_display>, but supported
469
470 =cut
471
472 sub tag {
473 search_display( @_ );
474 }
475
476 =head2 display
477
478 Define output just for I<display>
479
480 @v = display('Title', rec('200','a') );
481
482 =cut
483
484 sub _field {
485 my $type = shift or confess "need type -- BUG?";
486 my $name = shift or confess "needs name as first argument";
487 my @o = grep { defined($_) && $_ ne '' } @_;
488 return unless (@o);
489 $out->{$name}->{$type} = \@o;
490 }
491
492 sub display { _field( 'display', @_ ) }
493
494 =head2 search
495
496 Prepare values just for I<search>
497
498 @v = search('Title', rec('200','a') );
499
500 =cut
501
502 sub search { _field( 'search', @_ ) }
503
504 =head2 sorted
505
506 Insert into lists which will be automatically sorted
507
508 sorted('Title', rec('200','a') );
509
510 =cut
511
512 sub sorted { _field( 'sorted', @_ ) }
513
514
515 =head2 marc_leader
516
517 Setup fields within MARC leader or get leader
518
519 marc_leader('05','c');
520 my $leader = marc_leader();
521
522 =cut
523
524 sub marc_leader {
525 my ($offset,$value) = @_;
526
527 if ($offset) {
528 $marc_leader->[ $marc_record_offset ]->{ $offset } = $value;
529 } else {
530
531 if (defined($marc_leader)) {
532 die "marc_leader not array = ", dump( $marc_leader ) unless (ref($marc_leader) eq 'ARRAY');
533 return $marc_leader->[ $marc_record_offset ];
534 } else {
535 return;
536 }
537 }
538 }
539
540 =head2 marc_fixed
541
542 Create control/indentifier fields with values in fixed positions
543
544 marc_fixed('008', 00, '070402');
545 marc_fixed('008', 39, '|');
546
547 Positions not specified will be filled with spaces (C<0x20>).
548
549 There will be no effort to extend last specified value to full length of
550 field in standard.
551
552 =cut
553
554 sub marc_fixed {
555 my ($f, $pos, $val) = @_;
556 die "need marc(field, position, value)" unless defined($f) && defined($pos);
557
558 confess "need val" unless defined $val;
559
560 my $update = 0;
561
562 map {
563 if ($_->[0] eq $f) {
564 my $old = $_->[1];
565 if (length($old) <= $pos) {
566 $_->[1] .= ' ' x ( $pos - length($old) ) . $val;
567 warn "## marc_fixed($f,$pos,'$val') append '$old' -> '$_->[1]'\n" if ($debug > 1);
568 } else {
569 $_->[1] = substr($old, 0, $pos) . $val . substr($old, $pos + length($val));
570 warn "## marc_fixed($f,$pos,'$val') update '$old' -> '$_->[1]'\n" if ($debug > 1);
571 }
572 $update++;
573 }
574 } @{ $marc_record->[ $marc_record_offset ] };
575
576 if (! $update) {
577 my $v = ' ' x $pos . $val;
578 push @{ $marc_record->[ $marc_record_offset ] }, [ $f, $v ];
579 warn "## marc_fixed($f,$pos,'val') created '$v'\n" if ($debug > 1);
580 }
581 }
582
583 =head2 marc
584
585 Save value for MARC field
586
587 marc('900','a', rec('200','a') );
588 marc('001', rec('000') );
589
590 =cut
591
592 sub marc {
593 my $f = shift or die "marc needs field";
594 die "marc field must be numer" unless ($f =~ /^\d+$/);
595
596 my $sf;
597 if ($f >= 10) {
598 $sf = shift or die "marc needs subfield";
599 }
600
601 foreach (@_) {
602 my $v = $_; # make var read-write for Encode
603 next unless (defined($v) && $v !~ /^\s*$/);
604 my ($i1,$i2) = defined($marc_indicators->{$f}) ? @{ $marc_indicators->{$f} } : (' ',' ');
605 if (defined $sf) {
606 push @{ $marc_record->[ $marc_record_offset ] }, [ $f, $i1, $i2, $sf => $v ];
607 } else {
608 push @{ $marc_record->[ $marc_record_offset ] }, [ $f, $v ];
609 }
610 }
611 }
612
613 =head2 marc_repeatable_subfield
614
615 Save values for MARC repetable subfield
616
617 marc_repeatable_subfield('910', 'z', rec('909') );
618
619 =cut
620
621 sub marc_repeatable_subfield {
622 my ($f,$sf) = @_;
623 die "marc_repeatable_subfield need field and subfield!\n" unless ($f && $sf);
624 $marc_repeatable_subfield->{ $f . $sf }++;
625 marc(@_);
626 }
627
628 =head2 marc_indicators
629
630 Set both indicators for MARC field
631
632 marc_indicators('900', ' ', 1);
633
634 Any indicator value other than C<0-9> will be treated as undefined.
635
636 =cut
637
638 sub marc_indicators {
639 my $f = shift || die "marc_indicators need field!\n";
640 my ($i1,$i2) = @_;
641 die "marc_indicators($f, ...) need i1!\n" unless(defined($i1));
642 die "marc_indicators($f, $i1, ...) need i2!\n" unless(defined($i2));
643
644 $i1 = ' ' if ($i1 !~ /^\d$/);
645 $i2 = ' ' if ($i2 !~ /^\d$/);
646 @{ $marc_indicators->{$f} } = ($i1,$i2);
647 }
648
649 =head2 marc_compose
650
651 Save values for each MARC subfield explicitly
652
653 marc_compose('900',
654 'a', rec('200','a')
655 'b', rec('201','a')
656 'a', rec('200','b')
657 'c', rec('200','c')
658 );
659
660 If you specify C<+> for subfield, value will be appended
661 to previous defined subfield.
662
663 =cut
664
665 sub marc_compose {
666 my $f = shift or die "marc_compose needs field";
667 die "marc_compose field must be numer" unless ($f =~ /^\d+$/);
668
669 my ($i1,$i2) = defined($marc_indicators->{$f}) ? @{ $marc_indicators->{$f} } : (' ',' ');
670 my $m = [ $f, $i1, $i2 ];
671
672 warn "### marc_compose input subfields = ", dump(@_),$/ if ($debug > 2);
673
674 if ($#_ % 2 != 1) {
675 die "ERROR: marc_compose",dump($f,@_)," not valid (must be even).\nDo you need to add first() or join() around some argument?\n";
676 }
677
678 while (@_) {
679 my $sf = shift;
680 my $v = shift;
681
682 next unless (defined($v) && $v !~ /^\s*$/);
683 warn "## ++ marc_compose($f,$sf,$v) ", dump( $m ),$/ if ($debug > 1);
684 if ($sf ne '+') {
685 push @$m, ( $sf, $v );
686 } else {
687 $m->[ $#$m ] .= $v;
688 }
689 }
690
691 warn "## marc_compose current marc = ", dump( $m ),$/ if ($debug > 1);
692
693 push @{ $marc_record->[ $marc_record_offset ] }, $m if ($#{$m} > 2);
694 }
695
696 =head2 marc_duplicate
697
698 Generate copy of current MARC record and continue working on copy
699
700 marc_duplicate();
701
702 Copies can be accessed using C<< _get_marc_fields( fetch_next => 1 ) >> or
703 C<< _get_marc_fields( offset => 42 ) >>.
704
705 =cut
706
707 sub marc_duplicate {
708 my $m = $marc_record->[ -1 ];
709 die "can't duplicate record which isn't defined" unless ($m);
710 push @{ $marc_record }, dclone( $m );
711 push @{ $marc_leader }, dclone( marc_leader() );
712 warn "## marc_duplicate = ", dump(@$marc_leader, @$marc_record), $/ if ($debug > 1);
713 $marc_record_offset = $#{ $marc_record };
714 warn "## marc_record_offset = $marc_record_offset", $/ if ($debug > 1);
715
716 }
717
718 =head2 marc_remove
719
720 Remove some field or subfield from MARC record.
721
722 marc_remove('200');
723 marc_remove('200','a');
724
725 This will erase field C<200> or C<200^a> from current MARC record.
726
727 marc_remove('*');
728
729 Will remove all fields in current MARC record.
730
731 This is useful after calling C<marc_duplicate> or on it's own (but, you
732 should probably just remove that subfield definition if you are not
733 using C<marc_duplicate>).
734
735 FIXME: support fields < 10.
736
737 =cut
738
739 sub marc_remove {
740 my ($f, $sf) = @_;
741
742 die "marc_remove needs record number" unless defined($f);
743
744 my $marc = $marc_record->[ $marc_record_offset ];
745
746 warn "### marc_remove before = ", dump( $marc ), $/ if ($debug > 2);
747
748 if ($f eq '*') {
749
750 delete( $marc_record->[ $marc_record_offset ] );
751 warn "## full marc_record = ", dump( @{ $marc_record }), $/ if ($debug > 1);
752
753 } else {
754
755 my $i = 0;
756 foreach ( 0 .. $#{ $marc } ) {
757 last unless (defined $marc->[$i]);
758 warn "#### working on ",dump( @{ $marc->[$i] }), $/ if ($debug > 3);
759 if ($marc->[$i]->[0] eq $f) {
760 if (! defined $sf) {
761 # remove whole field
762 splice @$marc, $i, 1;
763 warn "#### slice \@\$marc, $i, 1 = ",dump( @{ $marc }), $/ if ($debug > 3);
764 $i--;
765 } else {
766 foreach my $j ( 0 .. (( $#{ $marc->[$i] } - 3 ) / 2) ) {
767 my $o = ($j * 2) + 3;
768 if ($marc->[$i]->[$o] eq $sf) {
769 # remove subfield
770 splice @{$marc->[$i]}, $o, 2;
771 warn "#### slice \@{\$marc->[$i]}, $o, 2 = ", dump( @{ $marc }), $/ if ($debug > 3);
772 # is record now empty?
773 if ($#{ $marc->[$i] } == 2) {
774 splice @$marc, $i, 1;
775 warn "#### slice \@\$marc, $i, 1 = ", dump( @{ $marc }), $/ if ($debug > 3);
776 $i--;
777 };
778 }
779 }
780 }
781 }
782 $i++;
783 }
784
785 warn "### marc_remove($f", $sf ? ",$sf" : "", ") after = ", dump( $marc ), $/ if ($debug > 2);
786
787 $marc_record->[ $marc_record_offset ] = $marc;
788 }
789
790 warn "## full marc_record = ", dump( @{ $marc_record }), $/ if ($debug > 1);
791 }
792
793 =head2 marc_original_order
794
795 Copy all subfields preserving original order to marc field.
796
797 marc_original_order( marc_field_number, original_input_field_number );
798
799 Please note that field numbers are consistent with other commands (marc
800 field number first), but somewhat counter-intuitive (destination and then
801 source).
802
803 You might want to use this command if you are just renaming subfields or
804 using pre-processing modify_record in C<config.yml> and don't need any
805 post-processing or want to preserve order of original subfields.
806
807
808 =cut
809
810 sub marc_original_order {
811
812 my ($to, $from) = @_;
813 die "marc_original_order needs from and to fields\n" unless ($from && $to);
814
815 return unless defined($rec->{$from});
816
817 my $r = $rec->{$from};
818 die "record field $from isn't array\n" unless (ref($r) eq 'ARRAY');
819
820 my ($i1,$i2) = defined($marc_indicators->{$to}) ? @{ $marc_indicators->{$to} } : (' ',' ');
821 warn "## marc_original_order($to,$from) source = ", dump( $r ),$/ if ($debug > 1);
822
823 foreach my $d (@$r) {
824
825 if (! defined($d->{subfields}) && ref($d->{subfields}) ne 'ARRAY') {
826 warn "# marc_original_order($to,$from): field $from doesn't have subfields specification\n";
827 next;
828 }
829
830 my @sfs = @{ $d->{subfields} };
831
832 die "field $from doesn't have even number of subfields specifications\n" unless($#sfs % 2 == 1);
833
834 warn "#--> d: ",dump($d), "\n#--> sfs: ",dump(@sfs),$/ if ($debug > 2);
835
836 my $m = [ $to, $i1, $i2 ];
837
838 while (my $sf = shift @sfs) {
839
840 warn "#--> sf: ",dump($sf), $/ if ($debug > 2);
841 my $offset = shift @sfs;
842 die "corrupted sufields specification for field $from\n" unless defined($offset);
843
844 my $v;
845 if (ref($d->{$sf}) eq 'ARRAY') {
846 $v = $d->{$sf}->[$offset] if (defined($d->{$sf}->[$offset]));
847 } elsif ($offset == 0) {
848 $v = $d->{$sf};
849 } else {
850 die "field $from subfield '$sf' need occurence $offset which doesn't exist", dump($d->{$sf});
851 }
852 push @$m, ( $sf, $v ) if (defined($v));
853 }
854
855 if ($#{$m} > 2) {
856 push @{ $marc_record->[ $marc_record_offset ] }, $m;
857 }
858 }
859
860 warn "## marc_record = ", dump( $marc_record ),$/ if ($debug > 1);
861 }
862
863 =head2 marc_template
864
865 =cut
866
867 sub marc_template {
868 my $args = {@_};
869 warn "## marc_template(",dump($args),")";
870
871 foreach ( qw/subfields_rename marc_template/ ) {
872 # warn "ref($_) = ",ref($args->{$_});
873 die "$_ not ARRAY" if ref($args->{$_}) ne 'ARRAY';
874 }
875
876 my $r = $rec->{ $args->{from} }; # || return;
877 die "record field ", $args->{from}, " isn't array\n" unless (ref($r) eq 'ARRAY');
878
879 my @subfields_rename = @{ $args->{subfields_rename} };
880 # warn "### subfields_rename [$#subfields_rename] = ",dump( @subfields_rename );
881
882 confess "need mapping in pairs for subfields_rename"
883 if $#subfields_rename % 2 != 1;
884
885 my ( $subfields_rename, $from_subfields, $to_subfields );
886 while ( my ( $from, $to ) = splice(@subfields_rename, 0, 2) ) {
887 my ( $f, $t ) = (
888 $from_subfields->{ $from }++,
889 $to_subfields->{ $to }++
890 );
891 $subfields_rename->{ $from }->[ $f ] = [ $to => $t ];
892 }
893 warn "### subfields_rename = ",dump( $subfields_rename ),$/;
894 warn "### from_subfields = ", dump( $from_subfields ),$/;
895 warn "### to_subfields = ", dump( $to_subfields ),$/;
896
897 my $fields_re = join('|', keys %$to_subfields );
898
899 my $pos_templates;
900 my $count;
901 my @marc_order;
902 my $marc_template_order;
903 my $fill_in;
904 my @marc_out;
905
906 foreach my $template ( @{ $args->{marc_template} } ) {
907 $count = {};
908 @marc_order = ();
909 sub my_count {
910 my $sf = shift;
911 my $nr = $count->{$sf}++;
912 push @marc_order, [ $sf, $nr ];
913 return $sf . $nr;
914 }
915 my $pos_template = $template;
916 $pos_template =~ s/($fields_re)/my_count($1)/ge;
917 my $count_key = dump( $count );
918 warn "### template: |$template| -> |$pos_template| count = $count_key marc_order = ",dump( @marc_order ),$/;
919 $pos_templates->{ $count_key } = $pos_template;
920 $marc_template_order->{ $pos_template } = [ @marc_order ];
921 }
922 warn "### from ",dump( $args->{marc_template} ), " created ", dump( $pos_templates ), " and ", dump( $marc_template_order );
923
924 my $m;
925
926 foreach my $r ( @{ $rec->{ $args->{from} } } ) {
927
928 my $i1 = $r->{i1} || ' ';
929 my $i2 = $r->{i2} || ' ';
930 $m = [ $args->{to}, $i1, $i2 ];
931
932 warn "### r = ",dump( $r );
933
934 my ( $new_r, $from_count, $to_count );
935 foreach my $sf ( keys %{$r} ) {
936 my $nr = $from_count->{$sf}++;
937 my $rename_to = $subfields_rename->{ $sf }; # ||
938 # die "can't find subfield rename for $sf/$nr in ", dump( $subfields_rename );
939 warn "### rename $sf/$nr to ", dump( $rename_to->[$nr] ), $/;
940 my ( $to_sf, $to_nr ) = @{ $rename_to->[$nr] };
941 $new_r->{ $to_sf }->[ $to_nr ] = [ $sf => $nr ];
942
943 $to_count->{ $to_sf }++;
944 }
945
946 warn "### new_r = ",dump( $new_r );
947
948 my $from_count_key = dump( $to_count );
949
950 warn "### from_count = ",dump( $from_count ), $/;
951 warn "### to_count = ",dump( $to_count ), $/;
952
953 my $template = $pos_templates->{ $from_count_key } ||
954 die "I don't have template for:\n$from_count_key\n## available templates\n", dump( $pos_templates );
955
956 warn "### selected template: |$template|\n";
957
958 $fill_in = {};
959
960 foreach my $sf ( split(/\|/, $template ) ) {
961 sub fill_in {
962 my ( $r, $sf, $nr ) = @_;
963 my ( $from_sf, $from_nr ) = @{ $new_r->{ $sf }->[ $nr ] };
964 my $v = $r->{ $from_sf }; # || die "no $from_sf/$from_nr";
965 warn "#### fill_in( $sf, $nr ) = $from_sf/$from_nr >>>> ",dump( $v ), $/;
966 if ( ref( $v ) eq 'ARRAY' ) {
967 $fill_in->{$sf}->[$nr] = $v->[$from_nr];
968 return $v->[$from_nr];
969 } elsif ( $from_nr == 0 ) {
970 $fill_in->{$sf}->[$nr] = $v;
971 return $v;
972 } else {
973 die "requested subfield $from_sf/$from_nr but it's ",dump( $v );
974 }
975 }
976 warn "#### $sf <<<< $fields_re\n";
977 $sf =~ s/($fields_re)(\d+)/fill_in($r,$1,$2)/ge;
978 warn "#### >>>> $sf with fill_in = ",dump( $fill_in ),$/;
979 }
980
981 warn "## template: |$template|\n## marc_template_order = ",dump( $marc_template_order );
982
983 foreach my $sf ( @{ $marc_template_order->{$template} } ) {
984 my ( $sf, $nr ) = @$sf;
985 my $v = $fill_in->{$sf}->[$nr] || die "can't find fill_in $sf/$nr";
986 warn "++ $sf/$nr |$v|\n";
987 push @$m, [ $sf, $v ];
988 }
989
990 warn "#### >>>> created marc: ", dump( $m );
991
992 push @marc_out, $m;
993 }
994
995 warn "### marc_template produced: ",dump( @marc_out );
996 return @marc_out;
997 }
998
999 =head2 marc_count
1000
1001 Return number of MARC records created using L</marc_duplicate>.
1002
1003 print "created ", marc_count(), " records";
1004
1005 =cut
1006
1007 sub marc_count {
1008 return $#{ $marc_record };
1009 }
1010
1011
1012 =head1 Functions to extract data from input
1013
1014 This function should be used inside functions to create C<data_structure> described
1015 above.
1016
1017 =head2 _pack_subfields_hash
1018
1019 @subfields = _pack_subfields_hash( $h );
1020 $subfields = _pack_subfields_hash( $h, 1 );
1021
1022 Return each subfield value in array or pack them all together and return scalar
1023 with subfields (denoted by C<^>) and values.
1024
1025 =cut
1026
1027 sub _pack_subfields_hash {
1028
1029 warn "## _pack_subfields_hash( ",dump(@_), " )\n" if ($debug > 1);
1030
1031 my ($h,$include_subfields) = @_;
1032
1033 # sanity and ease of use
1034 return $h if (ref($h) ne 'HASH');
1035
1036 if ( defined($h->{subfields}) ) {
1037 my $sfs = delete $h->{subfields} || die "no subfields?";
1038 my @out;
1039 while (@$sfs) {
1040 my $sf = shift @$sfs;
1041 push @out, '^' . $sf if ($include_subfields);
1042 my $o = shift @$sfs;
1043 if ($o == 0 && ref( $h->{$sf} ) ne 'ARRAY' ) {
1044 # single element subfields are not arrays
1045 #warn "====> $sf $o / $#$sfs ", dump( $sfs, $h->{$sf} ), "\n";
1046
1047 push @out, $h->{$sf};
1048 } else {
1049 #warn "====> $sf $o / $#$sfs ", dump( $sfs, $h->{$sf} ), "\n";
1050 push @out, $h->{$sf}->[$o];
1051 }
1052 }
1053 if ($include_subfields) {
1054 return join('', @out);
1055 } else {
1056 return @out;
1057 }
1058 } else {
1059 if ($include_subfields) {
1060 my $out = '';
1061 foreach my $sf (sort keys %$h) {
1062 if (ref($h->{$sf}) eq 'ARRAY') {
1063 $out .= '^' . $sf . join('^' . $sf, @{ $h->{$sf} });
1064 } else {
1065 $out .= '^' . $sf . $h->{$sf};
1066 }
1067 }
1068 return $out;
1069 } else {
1070 # FIXME this should probably be in alphabetical order instead of hash order
1071 values %{$h};
1072 }
1073 }
1074 }
1075
1076 =head2 rec1
1077
1078 Return all values in some field
1079
1080 @v = rec1('200')
1081
1082 TODO: order of values is probably same as in source data, need to investigate that
1083
1084 =cut
1085
1086 sub rec1 {
1087 my $f = shift;
1088 warn "rec1($f) = ", dump( $rec->{$f} ), $/ if ($debug > 1);
1089 return unless (defined($rec) && defined($rec->{$f}));
1090 warn "rec1($f) = ", dump( $rec->{$f} ), $/ if ($debug > 1);
1091 if (ref($rec->{$f}) eq 'ARRAY') {
1092 my @out;
1093 foreach my $h ( @{ $rec->{$f} } ) {
1094 if (ref($h) eq 'HASH') {
1095 push @out, ( _pack_subfields_hash( $h ) );
1096 } else {
1097 push @out, $h;
1098 }
1099 }
1100 return @out;
1101 } elsif( defined($rec->{$f}) ) {
1102 return $rec->{$f};
1103 }
1104 }
1105
1106 =head2 rec2
1107
1108 Return all values in specific field and subfield
1109
1110 @v = rec2('200','a')
1111
1112 =cut
1113
1114 sub rec2 {
1115 my $f = shift;
1116 return unless (defined($rec && $rec->{$f}));
1117 my $sf = shift;
1118 warn "rec2($f,$sf) = ", dump( $rec->{$f} ), $/ if ($debug > 1);
1119 return map {
1120 if (ref($_->{$sf}) eq 'ARRAY') {
1121 @{ $_->{$sf} };
1122 } else {
1123 $_->{$sf};
1124 }
1125 } grep { ref($_) eq 'HASH' && $_->{$sf} } @{ $rec->{$f} };
1126 }
1127
1128 =head2 rec
1129
1130 syntaxtic sugar for
1131
1132 @v = rec('200')
1133 @v = rec('200','a')
1134
1135 If rec() returns just single value, it will
1136 return scalar, not array.
1137
1138 =cut
1139
1140 sub rec {
1141 my @out;
1142 if ($#_ == 0) {
1143 @out = rec1(@_);
1144 } elsif ($#_ == 1) {
1145 @out = rec2(@_);
1146 }
1147 if ($#out == 0 && ! wantarray) {
1148 return $out[0];
1149 } elsif (@out) {
1150 return @out;
1151 } else {
1152 return '';
1153 }
1154 }
1155
1156 =head2 frec
1157
1158 Returns first value from field
1159
1160 $v = frec('200');
1161 $v = frec('200','a');
1162
1163 =cut
1164
1165 sub frec {
1166 my @out = rec(@_);
1167 warn "rec(",dump(@_),") has more than one return value, ignoring\n" if $#out > 0;
1168 return shift @out;
1169 }
1170
1171 =head2 regex
1172
1173 Apply regex to some or all values
1174
1175 @v = regex( 's/foo/bar/g', @v );
1176
1177 =cut
1178
1179 sub regex {
1180 my $r = shift;
1181 my @out;
1182 #warn "r: $r\n", dump(\@_);
1183 foreach my $t (@_) {
1184 next unless ($t);
1185 eval "\$t =~ $r";
1186 push @out, $t if ($t && $t ne '');
1187 }
1188 return @out;
1189 }
1190
1191 =head2 prefix
1192
1193 Prefix all values with a string
1194
1195 @v = prefix( 'my_', @v );
1196
1197 =cut
1198
1199 sub prefix {
1200 my $p = shift;
1201 return @_ unless defined( $p );
1202 return map { $p . $_ } grep { defined($_) } @_;
1203 }
1204
1205 =head2 suffix
1206
1207 suffix all values with a string
1208
1209 @v = suffix( '_my', @v );
1210
1211 =cut
1212
1213 sub suffix {
1214 my $s = shift;
1215 return @_ unless defined( $s );
1216 return map { $_ . $s } grep { defined($_) } @_;
1217 }
1218
1219 =head2 surround
1220
1221 surround all values with a two strings
1222
1223 @v = surround( 'prefix_', '_suffix', @v );
1224
1225 =cut
1226
1227 sub surround {
1228 my $p = shift;
1229 my $s = shift;
1230 $p = '' unless defined( $p );
1231 $s = '' unless defined( $s );
1232 return map { $p . $_ . $s } grep { defined($_) } @_;
1233 }
1234
1235 =head2 first
1236
1237 Return first element
1238
1239 $v = first( @v );
1240
1241 =cut
1242
1243 sub first {
1244 my $r = shift;
1245 return $r;
1246 }
1247
1248 =head2 lookup
1249
1250 Consult lookup hashes for some value
1251
1252 @v = lookup(
1253 sub {
1254 'ffkk/peri/mfn'.rec('000')
1255 },
1256 'ffkk','peri','200-a-200-e',
1257 sub {
1258 first(rec(200,'a')).' '.first(rec('200','e'))
1259 }
1260 );
1261
1262 Code like above will be B<automatically generated> using L<WebPAC::Parse> from
1263 normal lookup definition in C<conf/lookup/something.pl> which looks like:
1264
1265 lookup(
1266 # which results to return from record recorded in lookup
1267 sub { 'ffkk/peri/mfn' . rec('000') },
1268 # from which database and input
1269 'ffkk','peri',
1270 # such that following values match
1271 sub { first(rec(200,'a')) . ' ' . first(rec('200','e')) },
1272 # if this part is missing, we will try to match same fields
1273 # from lookup record and current one, or you can override
1274 # which records to use from current record using
1275 sub { rec('900','x') . ' ' . rec('900','y') },
1276 )
1277
1278 You can think about this lookup as SQL (if that helps):
1279
1280 select
1281 sub { what }
1282 from
1283 database, input
1284 where
1285 sub { filter from lookuped record }
1286 having
1287 sub { optional filter on current record }
1288
1289 Easy as pie, right?
1290
1291 =cut
1292
1293 sub lookup {
1294 my ($what, $database, $input, $key, $having) = @_;
1295
1296 confess "lookup needs 5 arguments: what, database, input, key, having\n" unless ($#_ == 4);
1297
1298 warn "## lookup ($database, $input, $key)", $/ if ($debug > 1);
1299 return unless (defined($lookup->{$database}->{$input}->{$key}));
1300
1301 confess "lookup really need load_row_coderef added to data_structure\n" unless ($load_row_coderef);
1302
1303 my $mfns;
1304 my @having = $having->();
1305
1306 warn "## having = ", dump( @having ) if ($debug > 2);
1307
1308 foreach my $h ( @having ) {
1309 if (defined($lookup->{$database}->{$input}->{$key}->{$h})) {
1310 warn "lookup for $database/$input/$key/$h return ",dump($lookup->{$database}->{$input}->{$key}->{$h}),"\n" if ($debug);
1311 $mfns->{$_}++ foreach keys %{ $lookup->{$database}->{$input}->{$key}->{$h} };
1312 }
1313 }
1314
1315 return unless ($mfns);
1316
1317 my @mfns = sort keys %$mfns;
1318
1319 warn "# lookup loading $database/$input/$key mfn ", join(",",@mfns)," having ",dump(@having),"\n" if ($debug);
1320
1321 my $old_rec = $rec;
1322 my @out;
1323
1324 foreach my $mfn (@mfns) {
1325 $rec = $load_row_coderef->( $database, $input, $mfn );
1326
1327 warn "got $database/$input/$mfn = ", dump($rec), $/ if ($debug);
1328
1329 my @vals = $what->();
1330
1331 push @out, ( @vals );
1332
1333 warn "lookup for mfn $mfn returned ", dump(@vals), $/ if ($debug);
1334 }
1335
1336 # if (ref($lookup->{$k}) eq 'ARRAY') {
1337 # return @{ $lookup->{$k} };
1338 # } else {
1339 # return $lookup->{$k};
1340 # }
1341
1342 $rec = $old_rec;
1343
1344 warn "## lookup returns = ", dump(@out), $/ if ($debug);
1345
1346 if ($#out == 0) {
1347 return $out[0];
1348 } else {
1349 return @out;
1350 }
1351 }
1352
1353 =head2 save_into_lookup
1354
1355 Save value into lookup. It associates current database, input
1356 and specific keys with one or more values which will be
1357 associated over MFN.
1358
1359 MFN will be extracted from first occurence current of field 000
1360 in current record, or if it doesn't exist from L<_set_config> C<_mfn>.
1361
1362 my $nr = save_into_lookup($database,$input,$key,sub {
1363 # code which produce one or more values
1364 });
1365
1366 It returns number of items saved.
1367
1368 This function shouldn't be called directly, it's called from code created by
1369 L<WebPAC::Parser>.
1370
1371 =cut
1372
1373 sub save_into_lookup {
1374 my ($database,$input,$key,$coderef) = @_;
1375 die "save_into_lookup needs database" unless defined($database);
1376 die "save_into_lookup needs input" unless defined($input);
1377 die "save_into_lookup needs key" unless defined($key);
1378 die "save_into_lookup needs CODE" unless ( defined($coderef) && ref($coderef) eq 'CODE' );
1379
1380 warn "## save_into_lookup rec = ", dump($rec), " config = ", dump($config), $/ if ($debug > 2);
1381
1382 my $mfn =
1383 defined($rec->{'000'}->[0]) ? $rec->{'000'}->[0] :
1384 defined($config->{_mfn}) ? $config->{_mfn} :
1385 die "mfn not defined or zero";
1386
1387 my $nr = 0;
1388
1389 foreach my $v ( $coderef->() ) {
1390 $lookup->{$database}->{$input}->{$key}->{$v}->{$mfn}++;
1391 warn "# saved lookup $database/$input/$key [$v] $mfn\n" if ($debug > 1);
1392 $nr++;
1393 }
1394
1395 return $nr;
1396 }
1397
1398 =head2 config
1399
1400 Consult config values stored in C<config.yml>
1401
1402 # return database code (key under databases in yaml)
1403 $database_code = config(); # use _ from hash
1404 $database_name = config('name');
1405 $database_input_name = config('input name');
1406
1407 Up to three levels are supported.
1408
1409 =cut
1410
1411 sub config {
1412 return unless ($config);
1413
1414 my $p = shift;
1415
1416 $p ||= '';
1417
1418 my $v;
1419
1420 warn "### getting config($p)\n" if ($debug > 1);
1421
1422 my @p = split(/\s+/,$p);
1423 if ($#p < 0) {
1424 $v = $config->{ '_' }; # special, database code
1425 } else {
1426
1427 my $c = dclone( $config );
1428
1429 foreach my $k (@p) {
1430 warn "### k: $k c = ",dump($c),$/ if ($debug > 1);
1431 if (ref($c) eq 'ARRAY') {
1432 $c = shift @$c;
1433 warn "config($p) taking first occurence of '$k', probably not what you wanted!\n";
1434 last;
1435 }
1436
1437 if (! defined($c->{$k}) ) {
1438 $c = undef;
1439 last;
1440 } else {
1441 $c = $c->{$k};
1442 }
1443 }
1444 $v = $c if ($c);
1445
1446 }
1447
1448 warn "## config( '$p' ) = ",dump( $v ),$/ if ($v && $debug);
1449 warn "config( '$p' ) is empty\n" if (! $v);
1450
1451 return $v;
1452 }
1453
1454 =head2 id
1455
1456 Returns unique id of this record
1457
1458 $id = id();
1459
1460 Returns C<42/2> for 2nd occurence of MFN 42.
1461
1462 =cut
1463
1464 sub id {
1465 my $mfn = $config->{_mfn} || die "no _mfn in config data";
1466 return $mfn . $#{$marc_record} ? $#{$marc_record} + 1 : '';
1467 }
1468
1469 =head2 join_with
1470
1471 Joins walues with some delimiter
1472
1473 $v = join_with(", ", @v);
1474
1475 =cut
1476
1477 sub join_with {
1478 my $d = shift;
1479 warn "### join_with('$d',",dump(@_),")\n" if ($debug > 2);
1480 my $v = join($d, grep { defined($_) && $_ ne '' } @_);
1481 return '' unless defined($v);
1482 return $v;
1483 }
1484
1485 =head2 split_rec_on
1486
1487 Split record subfield on some regex and take one of parts out
1488
1489 $a_before_semi_column =
1490 split_rec_on('200','a', /\s*;\s*/, $part);
1491
1492 C<$part> is optional number of element. First element is
1493 B<1>, not 0!
1494
1495 If there is no C<$part> parameter or C<$part> is 0, this function will
1496 return all values produced by splitting.
1497
1498 =cut
1499
1500 sub split_rec_on {
1501 die "split_rec_on need (fld,sf,regex[,part]" if ($#_ < 2);
1502
1503 my ($fld, $sf, $regex, $part) = @_;
1504 warn "### regex ", ref($regex), $regex, $/ if ($debug > 2);
1505
1506 my @r = rec( $fld, $sf );
1507 my $v = shift @r;
1508 warn "### first rec($fld,$sf) = ",dump($v),$/ if ($debug > 2);
1509
1510 return '' if ( ! defined($v) || $v =~ /^\s*$/);
1511
1512 my @s = split( $regex, $v );
1513 warn "## split_rec_on($fld,$sf,$regex,$part) = ",dump(@s),$/ if ($debug > 1);
1514 if ($part && $part > 0) {
1515 return $s[ $part - 1 ];
1516 } else {
1517 return @s;
1518 }
1519 }
1520
1521 my $hash;
1522
1523 =head2 set
1524
1525 set( key => 'value' );
1526
1527 =cut
1528
1529 sub set {
1530 my ($k,$v) = @_;
1531 warn "## set ( $k => ", dump($v), " )", $/ if ( $debug );
1532 $hash->{$k} = $v;
1533 };
1534
1535 =head2 get
1536
1537 get( 'key' );
1538
1539 =cut
1540
1541 sub get {
1542 my $k = shift || return;
1543 my $v = $hash->{$k};
1544 warn "## get $k = ", dump( $v ), $/ if ( $debug );
1545 return $v;
1546 }
1547
1548 =head2 count
1549
1550 if ( count( @result ) == 1 ) {
1551 # do something if only 1 result is there
1552 }
1553
1554 =cut
1555
1556 sub count {
1557 warn "## count ",dump(@_),$/ if ( $debug );
1558 return @_ . '';
1559 }
1560
1561 # END
1562 1;

  ViewVC Help
Powered by ViewVC 1.1.26