/[webpac2]/trunk/lib/WebPAC/Normalize.pm
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Contents of /trunk/lib/WebPAC/Normalize.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 915 - (show annotations)
Tue Oct 30 20:27:20 2007 UTC (16 years, 5 months ago) by dpavlin
File size: 30547 byte(s)
 r1376@llin:  dpavlin | 2007-10-30 21:27:19 +0100
 Added back tag for backwards compatiblity

1 package WebPAC::Normalize;
2 use Exporter 'import';
3 @EXPORT = qw/
4 _set_rec _set_lookup
5 _set_load_row
6 _get_ds _clean_ds
7 _debug
8 _pack_subfields_hash
9
10 search_display search display
11
12 marc marc_indicators marc_repeatable_subfield
13 marc_compose marc_leader marc_fixed
14 marc_duplicate marc_remove marc_count
15 marc_original_order
16
17 rec1 rec2 rec
18 regex prefix suffix surround
19 first lookup join_with
20 save_into_lookup
21
22 split_rec_on
23
24 get set
25 count
26 /;
27
28 use warnings;
29 use strict;
30
31 #use base qw/WebPAC::Common/;
32 use Data::Dump qw/dump/;
33 use Storable qw/dclone/;
34 use Carp qw/confess/;
35
36 # debugging warn(s)
37 my $debug = 0;
38
39
40 =head1 NAME
41
42 WebPAC::Normalize - describe normalisaton rules using sets
43
44 =head1 VERSION
45
46 Version 0.30
47
48 =cut
49
50 our $VERSION = '0.30';
51
52 =head1 SYNOPSIS
53
54 This module uses C<conf/normalize/*.pl> files to perform normalisation
55 from input records using perl functions which are specialized for set
56 processing.
57
58 Sets are implemented as arrays, and normalisation file is valid perl, which
59 means that you check it's validity before running WebPAC using
60 C<perl -c normalize.pl>.
61
62 Normalisation can generate multiple output normalized data. For now, supported output
63 types (on the left side of definition) are: C<search_display>, C<display>, C<search> and
64 C<marc>.
65
66 =head1 FUNCTIONS
67
68 Functions which start with C<_> are private and used by WebPAC internally.
69 All other functions are available for use within normalisation rules.
70
71 =head2 data_structure
72
73 Return data structure
74
75 my $ds = WebPAC::Normalize::data_structure(
76 lookup => $lookup_hash,
77 row => $row,
78 rules => $normalize_pl_config,
79 marc_encoding => 'utf-8',
80 config => $config,
81 load_row_coderef => sub {
82 my ($database,$input,$mfn) = shift;
83 $store->load_row( database => $database, input => $input, id => $mfn );
84 },
85 );
86
87 Options C<row>, C<rules> and C<log> are mandatory while all
88 other are optional.
89
90 C<load_row_coderef> is closure only used when executing lookups, so they will
91 die if it's not defined.
92
93 This function will B<die> if normalizastion can't be evaled.
94
95 Since this function isn't exported you have to call it with
96 C<WebPAC::Normalize::data_structure>.
97
98 =cut
99
100 my $load_row_coderef;
101
102 sub data_structure {
103 my $arg = {@_};
104
105 die "need row argument" unless ($arg->{row});
106 die "need normalisation argument" unless ($arg->{rules});
107
108 no strict 'subs';
109 _set_lookup( $arg->{lookup} ) if defined($arg->{lookup});
110 _set_rec( $arg->{row} );
111 _set_config( $arg->{config} ) if defined($arg->{config});
112 _clean_ds( %{ $arg } );
113 $load_row_coderef = $arg->{load_row_coderef};
114
115 eval "$arg->{rules}";
116 die "error evaling $arg->{rules}: $@\n" if ($@);
117
118 return _get_ds();
119 }
120
121 =head2 _set_rec
122
123 Set current record hash
124
125 _set_rec( $rec );
126
127 =cut
128
129 my $rec;
130
131 sub _set_rec {
132 $rec = shift or die "no record hash";
133 }
134
135 =head2 _set_config
136
137 Set current config hash
138
139 _set_config( $config );
140
141 Magic keys are:
142
143 =over 4
144
145 =item _
146
147 Code of current database
148
149 =item _mfn
150
151 Current MFN
152
153 =back
154
155 =cut
156
157 my $config;
158
159 sub _set_config {
160 $config = shift;
161 }
162
163 =head2 _get_ds
164
165 Return hash formatted as data structure
166
167 my $ds = _get_ds();
168
169 =cut
170
171 my ($out, $marc_record, $marc_encoding, $marc_repeatable_subfield, $marc_indicators, $marc_leader);
172 my ($marc_record_offset, $marc_fetch_offset) = (0, 0);
173
174 sub _get_ds {
175 return $out;
176 }
177
178 =head2 _clean_ds
179
180 Clean data structure hash for next record
181
182 _clean_ds();
183
184 =cut
185
186 sub _clean_ds {
187 my $a = {@_};
188 ($out,$marc_record, $marc_encoding, $marc_repeatable_subfield, $marc_indicators, $marc_leader) = ();
189 ($marc_record_offset, $marc_fetch_offset) = (0,0);
190 $marc_encoding = $a->{marc_encoding};
191 }
192
193 =head2 _set_lookup
194
195 Set current lookup hash
196
197 _set_lookup( $lookup );
198
199 =cut
200
201 my $lookup;
202
203 sub _set_lookup {
204 $lookup = shift;
205 }
206
207 =head2 _get_lookup
208
209 Get current lookup hash
210
211 my $lookup = _get_lookup();
212
213 =cut
214
215 sub _get_lookup {
216 return $lookup;
217 }
218
219 =head2 _set_load_row
220
221 Setup code reference which will return L<data_structure> from
222 L<WebPAC::Store>
223
224 _set_load_row(sub {
225 my ($database,$input,$mfn) = @_;
226 $store->load_row( database => $database, input => $input, id => $mfn );
227 });
228
229 =cut
230
231 sub _set_load_row {
232 my $coderef = shift;
233 confess "argument isn't CODE" unless ref($coderef) eq 'CODE';
234
235 $load_row_coderef = $coderef;
236 }
237
238 =head2 _get_marc_fields
239
240 Get all fields defined by calls to C<marc>
241
242 $marc->add_fields( WebPAC::Normalize:_get_marc_fields() );
243
244 We are using I<magic> which detect repeatable fields only from
245 sequence of field/subfield data generated by normalization.
246
247 Repeatable field is created when there is second occurence of same subfield or
248 if any of indicators are different.
249
250 This is sane for most cases. Something like:
251
252 900a-1 900b-1 900c-1
253 900a-2 900b-2
254 900a-3
255
256 will be created from any combination of:
257
258 900a-1 900a-2 900a-3 900b-1 900b-2 900c-1
259
260 and following rules:
261
262 marc('900','a', rec('200','a') );
263 marc('900','b', rec('200','b') );
264 marc('900','c', rec('200','c') );
265
266 which might not be what you have in mind. If you need repeatable subfield,
267 define it using C<marc_repeatable_subfield> like this:
268
269 marc_repeatable_subfield('900','a');
270 marc('900','a', rec('200','a') );
271 marc('900','b', rec('200','b') );
272 marc('900','c', rec('200','c') );
273
274 will create:
275
276 900a-1 900a-2 900a-3 900b-1 900c-1
277 900b-2
278
279 There is also support for returning next or specific using:
280
281 while (my $mf = WebPAC::Normalize:_get_marc_fields( fetch_next => 1 ) ) {
282 # do something with $mf
283 }
284
285 will always return fields from next MARC record or
286
287 my $mf = WebPAC::Normalize::_get_marc_fields( offset => 42 );
288
289 will return 42th copy record (if it exists).
290
291 =cut
292
293 my $fetch_pos;
294
295 sub _get_marc_fields {
296
297 my $arg = {@_};
298 warn "### _get_marc_fields arg: ", dump($arg), $/ if ($debug > 2);
299 $fetch_pos = $marc_fetch_offset;
300 if ($arg->{offset}) {
301 $fetch_pos = $arg->{offset};
302 } elsif($arg->{fetch_next}) {
303 $marc_fetch_offset++;
304 }
305
306 return if (! $marc_record || ref($marc_record) ne 'ARRAY');
307
308 warn "### full marc_record = ", dump( @{ $marc_record }), $/ if ($debug > 2);
309
310 my $marc_rec = $marc_record->[ $fetch_pos ];
311
312 warn "## _get_marc_fields (at offset: $fetch_pos) -- marc_record = ", dump( @$marc_rec ), $/ if ($debug > 1);
313
314 return if (! $marc_rec || ref($marc_rec) ne 'ARRAY' || $#{ $marc_rec } < 0);
315
316 # first, sort all existing fields
317 # XXX might not be needed, but modern perl might randomize elements in hash
318 my @sorted_marc_record = sort {
319 $a->[0] . ( $a->[3] || '' ) cmp $b->[0] . ( $b->[3] || '')
320 } @{ $marc_rec };
321
322 @sorted_marc_record = @{ $marc_rec }; ### FIXME disable sorting
323
324 # output marc fields
325 my @m;
326
327 # count unique field-subfields (used for offset when walking to next subfield)
328 my $u;
329 map { $u->{ $_->[0] . ( $_->[3] || '') }++ } @sorted_marc_record;
330
331 if ($debug) {
332 warn "## marc_repeatable_subfield = ", dump( $marc_repeatable_subfield ), $/ if ( $marc_repeatable_subfield );
333 warn "## marc_record[$fetch_pos] = ", dump( $marc_rec ), $/;
334 warn "## sorted_marc_record = ", dump( \@sorted_marc_record ), $/;
335 warn "## subfield count = ", dump( $u ), $/;
336 }
337
338 my $len = $#sorted_marc_record;
339 my $visited;
340 my $i = 0;
341 my $field;
342
343 foreach ( 0 .. $len ) {
344
345 # find next element which isn't visited
346 while ($visited->{$i}) {
347 $i = ($i + 1) % ($len + 1);
348 }
349
350 # mark it visited
351 $visited->{$i}++;
352
353 my $row = dclone( $sorted_marc_record[$i] );
354
355 # field and subfield which is key for
356 # marc_repeatable_subfield and u
357 my $fsf = $row->[0] . ( $row->[3] || '' );
358
359 if ($debug > 1) {
360
361 print "### field so far [", $#$field, "] : ", dump( $field ), " ", $field ? 'T' : 'F', $/;
362 print "### this [$i]: ", dump( $row ),$/;
363 print "### sf: ", $row->[3], " vs ", $field->[3],
364 $marc_repeatable_subfield->{ $row->[0] . $row->[3] } ? ' (repeatable)' : '', $/,
365 if ($#$field >= 0);
366
367 }
368
369 # if field exists
370 if ( $#$field >= 0 ) {
371 if (
372 $row->[0] ne $field->[0] || # field
373 $row->[1] ne $field->[1] || # i1
374 $row->[2] ne $field->[2] # i2
375 ) {
376 push @m, $field;
377 warn "## saved/1 ", dump( $field ),$/ if ($debug);
378 $field = $row;
379
380 } elsif (
381 ( $row->[3] lt $field->[-2] ) # subfield which is not next (e.g. a after c)
382 ||
383 ( $row->[3] eq $field->[-2] && # same subfield, but not repeatable
384 ! $marc_repeatable_subfield->{ $fsf }
385 )
386 ) {
387 push @m, $field;
388 warn "## saved/2 ", dump( $field ),$/ if ($debug);
389 $field = $row;
390
391 } else {
392 # append new subfields to existing field
393 push @$field, ( $row->[3], $row->[4] );
394 }
395 } else {
396 # insert first field
397 $field = $row;
398 }
399
400 if (! $marc_repeatable_subfield->{ $fsf }) {
401 # make step to next subfield
402 $i = ($i + $u->{ $fsf } ) % ($len + 1);
403 }
404 }
405
406 if ($#$field >= 0) {
407 push @m, $field;
408 warn "## saved/3 ", dump( $field ),$/ if ($debug);
409 }
410
411 return \@m;
412 }
413
414 =head2 _get_marc_leader
415
416 Return leader from currently fetched record by L</_get_marc_fields>
417
418 print WebPAC::Normalize::_get_marc_leader();
419
420 =cut
421
422 sub _get_marc_leader {
423 die "no fetch_pos, did you called _get_marc_fields first?" unless ( defined( $fetch_pos ) );
424 return $marc_leader->[ $fetch_pos ];
425 }
426
427 =head2 _debug
428
429 Change level of debug warnings
430
431 _debug( 2 );
432
433 =cut
434
435 sub _debug {
436 my $l = shift;
437 return $debug unless defined($l);
438 warn "debug level $l",$/ if ($l > 0);
439 $debug = $l;
440 }
441
442 =head1 Functions to create C<data_structure>
443
444 Those functions generally have to first in your normalization file.
445
446 =head2 search_display
447
448 Define output for L<search> and L<display> at the same time
449
450 search_display('Title', rec('200','a') );
451
452
453 =cut
454
455 sub search_display {
456 my $name = shift or die "search_display needs name as first argument";
457 my @o = grep { defined($_) && $_ ne '' } @_;
458 return unless (@o);
459 $out->{$name}->{search} = \@o;
460 $out->{$name}->{display} = \@o;
461 }
462
463 =head2 tag
464
465 Old name for L<search_display>, but supported
466
467 =cut
468
469 sub tag {
470 search_display( @_ );
471 }
472
473 =head2 display
474
475 Define output just for I<display>
476
477 @v = display('Title', rec('200','a') );
478
479 =cut
480
481 sub display {
482 my $name = shift or die "display needs name as first argument";
483 my @o = grep { defined($_) && $_ ne '' } @_;
484 return unless (@o);
485 $out->{$name}->{display} = \@o;
486 }
487
488 =head2 search
489
490 Prepare values just for I<search>
491
492 @v = search('Title', rec('200','a') );
493
494 =cut
495
496 sub search {
497 my $name = shift or die "search needs name as first argument";
498 my @o = grep { defined($_) && $_ ne '' } @_;
499 return unless (@o);
500 $out->{$name}->{search} = \@o;
501 }
502
503 =head2 marc_leader
504
505 Setup fields within MARC leader or get leader
506
507 marc_leader('05','c');
508 my $leader = marc_leader();
509
510 =cut
511
512 sub marc_leader {
513 my ($offset,$value) = @_;
514
515 if ($offset) {
516 $marc_leader->[ $marc_record_offset ]->{ $offset } = $value;
517 } else {
518
519 if (defined($marc_leader)) {
520 die "marc_leader not array = ", dump( $marc_leader ) unless (ref($marc_leader) eq 'ARRAY');
521 return $marc_leader->[ $marc_record_offset ];
522 } else {
523 return;
524 }
525 }
526 }
527
528 =head2 marc_fixed
529
530 Create control/indentifier fields with values in fixed positions
531
532 marc_fixed('008', 00, '070402');
533 marc_fixed('008', 39, '|');
534
535 Positions not specified will be filled with spaces (C<0x20>).
536
537 There will be no effort to extend last specified value to full length of
538 field in standard.
539
540 =cut
541
542 sub marc_fixed {
543 my ($f, $pos, $val) = @_;
544 die "need marc(field, position, value)" unless defined($f) && defined($pos);
545
546 confess "need val" unless defined $val;
547
548 my $update = 0;
549
550 map {
551 if ($_->[0] eq $f) {
552 my $old = $_->[1];
553 if (length($old) <= $pos) {
554 $_->[1] .= ' ' x ( $pos - length($old) ) . $val;
555 warn "## marc_fixed($f,$pos,'$val') append '$old' -> '$_->[1]'\n" if ($debug > 1);
556 } else {
557 $_->[1] = substr($old, 0, $pos) . $val . substr($old, $pos + length($val));
558 warn "## marc_fixed($f,$pos,'$val') update '$old' -> '$_->[1]'\n" if ($debug > 1);
559 }
560 $update++;
561 }
562 } @{ $marc_record->[ $marc_record_offset ] };
563
564 if (! $update) {
565 my $v = ' ' x $pos . $val;
566 push @{ $marc_record->[ $marc_record_offset ] }, [ $f, $v ];
567 warn "## marc_fixed($f,$pos,'val') created '$v'\n" if ($debug > 1);
568 }
569 }
570
571 =head2 marc
572
573 Save value for MARC field
574
575 marc('900','a', rec('200','a') );
576 marc('001', rec('000') );
577
578 =cut
579
580 sub marc {
581 my $f = shift or die "marc needs field";
582 die "marc field must be numer" unless ($f =~ /^\d+$/);
583
584 my $sf;
585 if ($f >= 10) {
586 $sf = shift or die "marc needs subfield";
587 }
588
589 foreach (@_) {
590 my $v = $_; # make var read-write for Encode
591 next unless (defined($v) && $v !~ /^\s*$/);
592 my ($i1,$i2) = defined($marc_indicators->{$f}) ? @{ $marc_indicators->{$f} } : (' ',' ');
593 if (defined $sf) {
594 push @{ $marc_record->[ $marc_record_offset ] }, [ $f, $i1, $i2, $sf => $v ];
595 } else {
596 push @{ $marc_record->[ $marc_record_offset ] }, [ $f, $v ];
597 }
598 }
599 }
600
601 =head2 marc_repeatable_subfield
602
603 Save values for MARC repetable subfield
604
605 marc_repeatable_subfield('910', 'z', rec('909') );
606
607 =cut
608
609 sub marc_repeatable_subfield {
610 my ($f,$sf) = @_;
611 die "marc_repeatable_subfield need field and subfield!\n" unless ($f && $sf);
612 $marc_repeatable_subfield->{ $f . $sf }++;
613 marc(@_);
614 }
615
616 =head2 marc_indicators
617
618 Set both indicators for MARC field
619
620 marc_indicators('900', ' ', 1);
621
622 Any indicator value other than C<0-9> will be treated as undefined.
623
624 =cut
625
626 sub marc_indicators {
627 my $f = shift || die "marc_indicators need field!\n";
628 my ($i1,$i2) = @_;
629 die "marc_indicators($f, ...) need i1!\n" unless(defined($i1));
630 die "marc_indicators($f, $i1, ...) need i2!\n" unless(defined($i2));
631
632 $i1 = ' ' if ($i1 !~ /^\d$/);
633 $i2 = ' ' if ($i2 !~ /^\d$/);
634 @{ $marc_indicators->{$f} } = ($i1,$i2);
635 }
636
637 =head2 marc_compose
638
639 Save values for each MARC subfield explicitly
640
641 marc_compose('900',
642 'a', rec('200','a')
643 'b', rec('201','a')
644 'a', rec('200','b')
645 'c', rec('200','c')
646 );
647
648 If you specify C<+> for subfield, value will be appended
649 to previous defined subfield.
650
651 =cut
652
653 sub marc_compose {
654 my $f = shift or die "marc_compose needs field";
655 die "marc_compose field must be numer" unless ($f =~ /^\d+$/);
656
657 my ($i1,$i2) = defined($marc_indicators->{$f}) ? @{ $marc_indicators->{$f} } : (' ',' ');
658 my $m = [ $f, $i1, $i2 ];
659
660 warn "### marc_compose input subfields = ", dump(@_),$/ if ($debug > 2);
661
662 if ($#_ % 2 != 1) {
663 die "ERROR: marc_compose",dump($f,@_)," not valid (must be even).\nDo you need to add first() or join() around some argument?\n";
664 }
665
666 while (@_) {
667 my $sf = shift;
668 my $v = shift;
669
670 next unless (defined($v) && $v !~ /^\s*$/);
671 warn "## ++ marc_compose($f,$sf,$v) ", dump( $m ),$/ if ($debug > 1);
672 if ($sf ne '+') {
673 push @$m, ( $sf, $v );
674 } else {
675 $m->[ $#$m ] .= $v;
676 }
677 }
678
679 warn "## marc_compose current marc = ", dump( $m ),$/ if ($debug > 1);
680
681 push @{ $marc_record->[ $marc_record_offset ] }, $m if ($#{$m} > 2);
682 }
683
684 =head2 marc_duplicate
685
686 Generate copy of current MARC record and continue working on copy
687
688 marc_duplicate();
689
690 Copies can be accessed using C<< _get_marc_fields( fetch_next => 1 ) >> or
691 C<< _get_marc_fields( offset => 42 ) >>.
692
693 =cut
694
695 sub marc_duplicate {
696 my $m = $marc_record->[ -1 ];
697 die "can't duplicate record which isn't defined" unless ($m);
698 push @{ $marc_record }, dclone( $m );
699 push @{ $marc_leader }, dclone( marc_leader() );
700 warn "## marc_duplicate = ", dump(@$marc_leader, @$marc_record), $/ if ($debug > 1);
701 $marc_record_offset = $#{ $marc_record };
702 warn "## marc_record_offset = $marc_record_offset", $/ if ($debug > 1);
703
704 }
705
706 =head2 marc_remove
707
708 Remove some field or subfield from MARC record.
709
710 marc_remove('200');
711 marc_remove('200','a');
712
713 This will erase field C<200> or C<200^a> from current MARC record.
714
715 marc_remove('*');
716
717 Will remove all fields in current MARC record.
718
719 This is useful after calling C<marc_duplicate> or on it's own (but, you
720 should probably just remove that subfield definition if you are not
721 using C<marc_duplicate>).
722
723 FIXME: support fields < 10.
724
725 =cut
726
727 sub marc_remove {
728 my ($f, $sf) = @_;
729
730 die "marc_remove needs record number" unless defined($f);
731
732 my $marc = $marc_record->[ $marc_record_offset ];
733
734 warn "### marc_remove before = ", dump( $marc ), $/ if ($debug > 2);
735
736 if ($f eq '*') {
737
738 delete( $marc_record->[ $marc_record_offset ] );
739 warn "## full marc_record = ", dump( @{ $marc_record }), $/ if ($debug > 1);
740
741 } else {
742
743 my $i = 0;
744 foreach ( 0 .. $#{ $marc } ) {
745 last unless (defined $marc->[$i]);
746 warn "#### working on ",dump( @{ $marc->[$i] }), $/ if ($debug > 3);
747 if ($marc->[$i]->[0] eq $f) {
748 if (! defined $sf) {
749 # remove whole field
750 splice @$marc, $i, 1;
751 warn "#### slice \@\$marc, $i, 1 = ",dump( @{ $marc }), $/ if ($debug > 3);
752 $i--;
753 } else {
754 foreach my $j ( 0 .. (( $#{ $marc->[$i] } - 3 ) / 2) ) {
755 my $o = ($j * 2) + 3;
756 if ($marc->[$i]->[$o] eq $sf) {
757 # remove subfield
758 splice @{$marc->[$i]}, $o, 2;
759 warn "#### slice \@{\$marc->[$i]}, $o, 2 = ", dump( @{ $marc }), $/ if ($debug > 3);
760 # is record now empty?
761 if ($#{ $marc->[$i] } == 2) {
762 splice @$marc, $i, 1;
763 warn "#### slice \@\$marc, $i, 1 = ", dump( @{ $marc }), $/ if ($debug > 3);
764 $i--;
765 };
766 }
767 }
768 }
769 }
770 $i++;
771 }
772
773 warn "### marc_remove($f", $sf ? ",$sf" : "", ") after = ", dump( $marc ), $/ if ($debug > 2);
774
775 $marc_record->[ $marc_record_offset ] = $marc;
776 }
777
778 warn "## full marc_record = ", dump( @{ $marc_record }), $/ if ($debug > 1);
779 }
780
781 =head2 marc_original_order
782
783 Copy all subfields preserving original order to marc field.
784
785 marc_original_order( marc_field_number, original_input_field_number );
786
787 Please note that field numbers are consistent with other commands (marc
788 field number first), but somewhat counter-intuitive (destination and then
789 source).
790
791 You might want to use this command if you are just renaming subfields or
792 using pre-processing modify_record in C<config.yml> and don't need any
793 post-processing or want to preserve order of original subfields.
794
795
796 =cut
797
798 sub marc_original_order {
799
800 my ($to, $from) = @_;
801 die "marc_original_order needs from and to fields\n" unless ($from && $to);
802
803 return unless defined($rec->{$from});
804
805 my $r = $rec->{$from};
806 die "record field $from isn't array\n" unless (ref($r) eq 'ARRAY');
807
808 my ($i1,$i2) = defined($marc_indicators->{$to}) ? @{ $marc_indicators->{$to} } : (' ',' ');
809 warn "## marc_original_order($to,$from) source = ", dump( $r ),$/ if ($debug > 1);
810
811 foreach my $d (@$r) {
812
813 if (! defined($d->{subfields}) && ref($d->{subfields}) ne 'ARRAY') {
814 warn "# marc_original_order($to,$from): field $from doesn't have subfields specification\n";
815 next;
816 }
817
818 my @sfs = @{ $d->{subfields} };
819
820 die "field $from doesn't have even number of subfields specifications\n" unless($#sfs % 2 == 1);
821
822 warn "#--> d: ",dump($d), "\n#--> sfs: ",dump(@sfs),$/ if ($debug > 2);
823
824 my $m = [ $to, $i1, $i2 ];
825
826 while (my $sf = shift @sfs) {
827
828 warn "#--> sf: ",dump($sf), $/ if ($debug > 2);
829 my $offset = shift @sfs;
830 die "corrupted sufields specification for field $from\n" unless defined($offset);
831
832 my $v;
833 if (ref($d->{$sf}) eq 'ARRAY') {
834 $v = $d->{$sf}->[$offset] if (defined($d->{$sf}->[$offset]));
835 } elsif ($offset == 0) {
836 $v = $d->{$sf};
837 } else {
838 die "field $from subfield '$sf' need occurence $offset which doesn't exist", dump($d->{$sf});
839 }
840 push @$m, ( $sf, $v ) if (defined($v));
841 }
842
843 if ($#{$m} > 2) {
844 push @{ $marc_record->[ $marc_record_offset ] }, $m;
845 }
846 }
847
848 warn "## marc_record = ", dump( $marc_record ),$/ if ($debug > 1);
849 }
850
851 =head2 marc_count
852
853 Return number of MARC records created using L</marc_duplicate>.
854
855 print "created ", marc_count(), " records";
856
857 =cut
858
859 sub marc_count {
860 return $#{ $marc_record };
861 }
862
863
864 =head1 Functions to extract data from input
865
866 This function should be used inside functions to create C<data_structure> described
867 above.
868
869 =head2 _pack_subfields_hash
870
871 @subfields = _pack_subfields_hash( $h );
872 $subfields = _pack_subfields_hash( $h, 1 );
873
874 Return each subfield value in array or pack them all together and return scalar
875 with subfields (denoted by C<^>) and values.
876
877 =cut
878
879 sub _pack_subfields_hash {
880
881 warn "## _pack_subfields_hash( ",dump(@_), " )\n" if ($debug > 1);
882
883 my ($h,$include_subfields) = @_;
884
885 # sanity and ease of use
886 return $h if (ref($h) ne 'HASH');
887
888 if ( defined($h->{subfields}) ) {
889 my $sfs = delete $h->{subfields} || die "no subfields?";
890 my @out;
891 while (@$sfs) {
892 my $sf = shift @$sfs;
893 push @out, '^' . $sf if ($include_subfields);
894 my $o = shift @$sfs;
895 if ($o == 0 && ref( $h->{$sf} ) ne 'ARRAY' ) {
896 # single element subfields are not arrays
897 #warn "====> $sf $o / $#$sfs ", dump( $sfs, $h->{$sf} ), "\n";
898
899 push @out, $h->{$sf};
900 } else {
901 #warn "====> $sf $o / $#$sfs ", dump( $sfs, $h->{$sf} ), "\n";
902 push @out, $h->{$sf}->[$o];
903 }
904 }
905 if ($include_subfields) {
906 return join('', @out);
907 } else {
908 return @out;
909 }
910 } else {
911 if ($include_subfields) {
912 my $out = '';
913 foreach my $sf (sort keys %$h) {
914 if (ref($h->{$sf}) eq 'ARRAY') {
915 $out .= '^' . $sf . join('^' . $sf, @{ $h->{$sf} });
916 } else {
917 $out .= '^' . $sf . $h->{$sf};
918 }
919 }
920 return $out;
921 } else {
922 # FIXME this should probably be in alphabetical order instead of hash order
923 values %{$h};
924 }
925 }
926 }
927
928 =head2 rec1
929
930 Return all values in some field
931
932 @v = rec1('200')
933
934 TODO: order of values is probably same as in source data, need to investigate that
935
936 =cut
937
938 sub rec1 {
939 my $f = shift;
940 warn "rec1($f) = ", dump( $rec->{$f} ), $/ if ($debug > 1);
941 return unless (defined($rec) && defined($rec->{$f}));
942 warn "rec1($f) = ", dump( $rec->{$f} ), $/ if ($debug > 1);
943 if (ref($rec->{$f}) eq 'ARRAY') {
944 my @out;
945 foreach my $h ( @{ $rec->{$f} } ) {
946 if (ref($h) eq 'HASH') {
947 push @out, ( _pack_subfields_hash( $h ) );
948 } else {
949 push @out, $h;
950 }
951 }
952 return @out;
953 } elsif( defined($rec->{$f}) ) {
954 return $rec->{$f};
955 }
956 }
957
958 =head2 rec2
959
960 Return all values in specific field and subfield
961
962 @v = rec2('200','a')
963
964 =cut
965
966 sub rec2 {
967 my $f = shift;
968 return unless (defined($rec && $rec->{$f}));
969 my $sf = shift;
970 warn "rec2($f,$sf) = ", dump( $rec->{$f} ), $/ if ($debug > 1);
971 return map {
972 if (ref($_->{$sf}) eq 'ARRAY') {
973 @{ $_->{$sf} };
974 } else {
975 $_->{$sf};
976 }
977 } grep { ref($_) eq 'HASH' && $_->{$sf} } @{ $rec->{$f} };
978 }
979
980 =head2 rec
981
982 syntaxtic sugar for
983
984 @v = rec('200')
985 @v = rec('200','a')
986
987 If rec() returns just single value, it will
988 return scalar, not array.
989
990 =cut
991
992 sub rec {
993 my @out;
994 if ($#_ == 0) {
995 @out = rec1(@_);
996 } elsif ($#_ == 1) {
997 @out = rec2(@_);
998 }
999 if ($#out == 0 && ! wantarray) {
1000 return $out[0];
1001 } elsif (@out) {
1002 return @out;
1003 } else {
1004 return '';
1005 }
1006 }
1007
1008 =head2 regex
1009
1010 Apply regex to some or all values
1011
1012 @v = regex( 's/foo/bar/g', @v );
1013
1014 =cut
1015
1016 sub regex {
1017 my $r = shift;
1018 my @out;
1019 #warn "r: $r\n", dump(\@_);
1020 foreach my $t (@_) {
1021 next unless ($t);
1022 eval "\$t =~ $r";
1023 push @out, $t if ($t && $t ne '');
1024 }
1025 return @out;
1026 }
1027
1028 =head2 prefix
1029
1030 Prefix all values with a string
1031
1032 @v = prefix( 'my_', @v );
1033
1034 =cut
1035
1036 sub prefix {
1037 my $p = shift;
1038 return @_ unless defined( $p );
1039 return map { $p . $_ } grep { defined($_) } @_;
1040 }
1041
1042 =head2 suffix
1043
1044 suffix all values with a string
1045
1046 @v = suffix( '_my', @v );
1047
1048 =cut
1049
1050 sub suffix {
1051 my $s = shift;
1052 return @_ unless defined( $s );
1053 return map { $_ . $s } grep { defined($_) } @_;
1054 }
1055
1056 =head2 surround
1057
1058 surround all values with a two strings
1059
1060 @v = surround( 'prefix_', '_suffix', @v );
1061
1062 =cut
1063
1064 sub surround {
1065 my $p = shift;
1066 my $s = shift;
1067 $p = '' unless defined( $p );
1068 $s = '' unless defined( $s );
1069 return map { $p . $_ . $s } grep { defined($_) } @_;
1070 }
1071
1072 =head2 first
1073
1074 Return first element
1075
1076 $v = first( @v );
1077
1078 =cut
1079
1080 sub first {
1081 my $r = shift;
1082 return $r;
1083 }
1084
1085 =head2 lookup
1086
1087 Consult lookup hashes for some value
1088
1089 @v = lookup(
1090 sub {
1091 'ffkk/peri/mfn'.rec('000')
1092 },
1093 'ffkk','peri','200-a-200-e',
1094 sub {
1095 first(rec(200,'a')).' '.first(rec('200','e'))
1096 }
1097 );
1098
1099 Code like above will be B<automatically generated> using L<WebPAC::Parse> from
1100 normal lookup definition in C<conf/lookup/something.pl> which looks like:
1101
1102 lookup(
1103 # which results to return from record recorded in lookup
1104 sub { 'ffkk/peri/mfn' . rec('000') },
1105 # from which database and input
1106 'ffkk','peri',
1107 # such that following values match
1108 sub { first(rec(200,'a')) . ' ' . first(rec('200','e')) },
1109 # if this part is missing, we will try to match same fields
1110 # from lookup record and current one, or you can override
1111 # which records to use from current record using
1112 sub { rec('900','x') . ' ' . rec('900','y') },
1113 )
1114
1115 You can think about this lookup as SQL (if that helps):
1116
1117 select
1118 sub { what }
1119 from
1120 database, input
1121 where
1122 sub { filter from lookuped record }
1123 having
1124 sub { optional filter on current record }
1125
1126 Easy as pie, right?
1127
1128 =cut
1129
1130 sub lookup {
1131 my ($what, $database, $input, $key, $having) = @_;
1132
1133 confess "lookup needs 5 arguments: what, database, input, key, having\n" unless ($#_ == 4);
1134
1135 warn "## lookup ($database, $input, $key)", $/ if ($debug > 1);
1136 return unless (defined($lookup->{$database}->{$input}->{$key}));
1137
1138 confess "lookup really need load_row_coderef added to data_structure\n" unless ($load_row_coderef);
1139
1140 my $mfns;
1141 my @having = $having->();
1142
1143 warn "## having = ", dump( @having ) if ($debug > 2);
1144
1145 foreach my $h ( @having ) {
1146 if (defined($lookup->{$database}->{$input}->{$key}->{$h})) {
1147 warn "lookup for $database/$input/$key/$h return ",dump($lookup->{$database}->{$input}->{$key}->{$h}),"\n" if ($debug);
1148 $mfns->{$_}++ foreach keys %{ $lookup->{$database}->{$input}->{$key}->{$h} };
1149 }
1150 }
1151
1152 return unless ($mfns);
1153
1154 my @mfns = sort keys %$mfns;
1155
1156 warn "# lookup loading $database/$input/$key mfn ", join(",",@mfns)," having ",dump(@having),"\n" if ($debug);
1157
1158 my $old_rec = $rec;
1159 my @out;
1160
1161 foreach my $mfn (@mfns) {
1162 $rec = $load_row_coderef->( $database, $input, $mfn );
1163
1164 warn "got $database/$input/$mfn = ", dump($rec), $/ if ($debug);
1165
1166 my @vals = $what->();
1167
1168 push @out, ( @vals );
1169
1170 warn "lookup for mfn $mfn returned ", dump(@vals), $/ if ($debug);
1171 }
1172
1173 # if (ref($lookup->{$k}) eq 'ARRAY') {
1174 # return @{ $lookup->{$k} };
1175 # } else {
1176 # return $lookup->{$k};
1177 # }
1178
1179 $rec = $old_rec;
1180
1181 warn "## lookup returns = ", dump(@out), $/ if ($debug);
1182
1183 if ($#out == 0) {
1184 return $out[0];
1185 } else {
1186 return @out;
1187 }
1188 }
1189
1190 =head2 save_into_lookup
1191
1192 Save value into lookup. It associates current database, input
1193 and specific keys with one or more values which will be
1194 associated over MFN.
1195
1196 MFN will be extracted from first occurence current of field 000
1197 in current record, or if it doesn't exist from L<_set_config> C<_mfn>.
1198
1199 my $nr = save_into_lookup($database,$input,$key,sub {
1200 # code which produce one or more values
1201 });
1202
1203 It returns number of items saved.
1204
1205 This function shouldn't be called directly, it's called from code created by
1206 L<WebPAC::Parser>.
1207
1208 =cut
1209
1210 sub save_into_lookup {
1211 my ($database,$input,$key,$coderef) = @_;
1212 die "save_into_lookup needs database" unless defined($database);
1213 die "save_into_lookup needs input" unless defined($input);
1214 die "save_into_lookup needs key" unless defined($key);
1215 die "save_into_lookup needs CODE" unless ( defined($coderef) && ref($coderef) eq 'CODE' );
1216
1217 warn "## save_into_lookup rec = ", dump($rec), " config = ", dump($config), $/ if ($debug > 2);
1218
1219 my $mfn =
1220 defined($rec->{'000'}->[0]) ? $rec->{'000'}->[0] :
1221 defined($config->{_mfn}) ? $config->{_mfn} :
1222 die "mfn not defined or zero";
1223
1224 my $nr = 0;
1225
1226 foreach my $v ( $coderef->() ) {
1227 $lookup->{$database}->{$input}->{$key}->{$v}->{$mfn}++;
1228 warn "# saved lookup $database/$input/$key [$v] $mfn\n" if ($debug > 1);
1229 $nr++;
1230 }
1231
1232 return $nr;
1233 }
1234
1235 =head2 config
1236
1237 Consult config values stored in C<config.yml>
1238
1239 # return database code (key under databases in yaml)
1240 $database_code = config(); # use _ from hash
1241 $database_name = config('name');
1242 $database_input_name = config('input name');
1243
1244 Up to three levels are supported.
1245
1246 =cut
1247
1248 sub config {
1249 return unless ($config);
1250
1251 my $p = shift;
1252
1253 $p ||= '';
1254
1255 my $v;
1256
1257 warn "### getting config($p)\n" if ($debug > 1);
1258
1259 my @p = split(/\s+/,$p);
1260 if ($#p < 0) {
1261 $v = $config->{ '_' }; # special, database code
1262 } else {
1263
1264 my $c = dclone( $config );
1265
1266 foreach my $k (@p) {
1267 warn "### k: $k c = ",dump($c),$/ if ($debug > 1);
1268 if (ref($c) eq 'ARRAY') {
1269 $c = shift @$c;
1270 warn "config($p) taking first occurence of '$k', probably not what you wanted!\n";
1271 last;
1272 }
1273
1274 if (! defined($c->{$k}) ) {
1275 $c = undef;
1276 last;
1277 } else {
1278 $c = $c->{$k};
1279 }
1280 }
1281 $v = $c if ($c);
1282
1283 }
1284
1285 warn "## config( '$p' ) = ",dump( $v ),$/ if ($v && $debug);
1286 warn "config( '$p' ) is empty\n" if (! $v);
1287
1288 return $v;
1289 }
1290
1291 =head2 id
1292
1293 Returns unique id of this record
1294
1295 $id = id();
1296
1297 Returns C<42/2> for 2nd occurence of MFN 42.
1298
1299 =cut
1300
1301 sub id {
1302 my $mfn = $config->{_mfn} || die "no _mfn in config data";
1303 return $mfn . $#{$marc_record} ? $#{$marc_record} + 1 : '';
1304 }
1305
1306 =head2 join_with
1307
1308 Joins walues with some delimiter
1309
1310 $v = join_with(", ", @v);
1311
1312 =cut
1313
1314 sub join_with {
1315 my $d = shift;
1316 warn "### join_with('$d',",dump(@_),")\n" if ($debug > 2);
1317 my $v = join($d, grep { defined($_) && $_ ne '' } @_);
1318 return '' unless defined($v);
1319 return $v;
1320 }
1321
1322 =head2 split_rec_on
1323
1324 Split record subfield on some regex and take one of parts out
1325
1326 $a_before_semi_column =
1327 split_rec_on('200','a', /\s*;\s*/, $part);
1328
1329 C<$part> is optional number of element. First element is
1330 B<1>, not 0!
1331
1332 If there is no C<$part> parameter or C<$part> is 0, this function will
1333 return all values produced by splitting.
1334
1335 =cut
1336
1337 sub split_rec_on {
1338 die "split_rec_on need (fld,sf,regex[,part]" if ($#_ < 2);
1339
1340 my ($fld, $sf, $regex, $part) = @_;
1341 warn "### regex ", ref($regex), $regex, $/ if ($debug > 2);
1342
1343 my @r = rec( $fld, $sf );
1344 my $v = shift @r;
1345 warn "### first rec($fld,$sf) = ",dump($v),$/ if ($debug > 2);
1346
1347 return '' if ( ! defined($v) || $v =~ /^\s*$/);
1348
1349 my @s = split( $regex, $v );
1350 warn "## split_rec_on($fld,$sf,$regex,$part) = ",dump(@s),$/ if ($debug > 1);
1351 if ($part && $part > 0) {
1352 return $s[ $part - 1 ];
1353 } else {
1354 return @s;
1355 }
1356 }
1357
1358 my $hash;
1359
1360 =head2 set
1361
1362 set( key => 'value' );
1363
1364 =cut
1365
1366 sub set {
1367 my ($k,$v) = @_;
1368 warn "## set ( $k => ", dump($v), " )", $/ if ( $debug );
1369 $hash->{$k} = $v;
1370 };
1371
1372 =head2 get
1373
1374 get( 'key' );
1375
1376 =cut
1377
1378 sub get {
1379 my $k = shift || return;
1380 my $v = $hash->{$k};
1381 warn "## get $k = ", dump( $v ), $/ if ( $debug );
1382 return $v;
1383 }
1384
1385 =head2 count
1386
1387 if ( count( @result ) == 1 ) {
1388 # do something if only 1 result is there
1389 }
1390
1391 =cut
1392
1393 sub count {
1394 warn "## count ",dump(@_),$/ if ( $debug );
1395 return @_ . '';
1396 }
1397
1398 # END
1399 1;

  ViewVC Help
Powered by ViewVC 1.1.26