/[webpac2]/trunk/lib/WebPAC/Normalize.pm
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Contents of /trunk/lib/WebPAC/Normalize.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 912 - (show annotations)
Tue Oct 30 17:40:13 2007 UTC (16 years, 5 months ago) by dpavlin
File size: 30446 byte(s)
 r1370@llin:  dpavlin | 2007-10-30 18:40:02 +0100
 rename tag to search_display which makes it more clean what does it do

1 package WebPAC::Normalize;
2 use Exporter 'import';
3 @EXPORT = qw/
4 _set_rec _set_lookup
5 _set_load_row
6 _get_ds _clean_ds
7 _debug
8 _pack_subfields_hash
9
10 search_display search display
11
12 marc marc_indicators marc_repeatable_subfield
13 marc_compose marc_leader marc_fixed
14 marc_duplicate marc_remove marc_count
15 marc_original_order
16
17 rec1 rec2 rec
18 regex prefix suffix surround
19 first lookup join_with
20 save_into_lookup
21
22 split_rec_on
23
24 get set
25 count
26 /;
27
28 use warnings;
29 use strict;
30
31 #use base qw/WebPAC::Common/;
32 use Data::Dump qw/dump/;
33 use Storable qw/dclone/;
34 use Carp qw/confess/;
35
36 # debugging warn(s)
37 my $debug = 0;
38
39
40 =head1 NAME
41
42 WebPAC::Normalize - describe normalisaton rules using sets
43
44 =head1 VERSION
45
46 Version 0.30
47
48 =cut
49
50 our $VERSION = '0.30';
51
52 =head1 SYNOPSIS
53
54 This module uses C<conf/normalize/*.pl> files to perform normalisation
55 from input records using perl functions which are specialized for set
56 processing.
57
58 Sets are implemented as arrays, and normalisation file is valid perl, which
59 means that you check it's validity before running WebPAC using
60 C<perl -c normalize.pl>.
61
62 Normalisation can generate multiple output normalized data. For now, supported output
63 types (on the left side of definition) are: C<search_display>, C<display>, C<search> and
64 C<marc>.
65
66 =head1 FUNCTIONS
67
68 Functions which start with C<_> are private and used by WebPAC internally.
69 All other functions are available for use within normalisation rules.
70
71 =head2 data_structure
72
73 Return data structure
74
75 my $ds = WebPAC::Normalize::data_structure(
76 lookup => $lookup_hash,
77 row => $row,
78 rules => $normalize_pl_config,
79 marc_encoding => 'utf-8',
80 config => $config,
81 load_row_coderef => sub {
82 my ($database,$input,$mfn) = shift;
83 $store->load_row( database => $database, input => $input, id => $mfn );
84 },
85 );
86
87 Options C<row>, C<rules> and C<log> are mandatory while all
88 other are optional.
89
90 C<load_row_coderef> is closure only used when executing lookups, so they will
91 die if it's not defined.
92
93 This function will B<die> if normalizastion can't be evaled.
94
95 Since this function isn't exported you have to call it with
96 C<WebPAC::Normalize::data_structure>.
97
98 =cut
99
100 my $load_row_coderef;
101
102 sub data_structure {
103 my $arg = {@_};
104
105 die "need row argument" unless ($arg->{row});
106 die "need normalisation argument" unless ($arg->{rules});
107
108 no strict 'subs';
109 _set_lookup( $arg->{lookup} ) if defined($arg->{lookup});
110 _set_rec( $arg->{row} );
111 _set_config( $arg->{config} ) if defined($arg->{config});
112 _clean_ds( %{ $arg } );
113 $load_row_coderef = $arg->{load_row_coderef};
114
115 eval "$arg->{rules}";
116 die "error evaling $arg->{rules}: $@\n" if ($@);
117
118 return _get_ds();
119 }
120
121 =head2 _set_rec
122
123 Set current record hash
124
125 _set_rec( $rec );
126
127 =cut
128
129 my $rec;
130
131 sub _set_rec {
132 $rec = shift or die "no record hash";
133 }
134
135 =head2 _set_config
136
137 Set current config hash
138
139 _set_config( $config );
140
141 Magic keys are:
142
143 =over 4
144
145 =item _
146
147 Code of current database
148
149 =item _mfn
150
151 Current MFN
152
153 =back
154
155 =cut
156
157 my $config;
158
159 sub _set_config {
160 $config = shift;
161 }
162
163 =head2 _get_ds
164
165 Return hash formatted as data structure
166
167 my $ds = _get_ds();
168
169 =cut
170
171 my ($out, $marc_record, $marc_encoding, $marc_repeatable_subfield, $marc_indicators, $marc_leader);
172 my ($marc_record_offset, $marc_fetch_offset) = (0, 0);
173
174 sub _get_ds {
175 return $out;
176 }
177
178 =head2 _clean_ds
179
180 Clean data structure hash for next record
181
182 _clean_ds();
183
184 =cut
185
186 sub _clean_ds {
187 my $a = {@_};
188 ($out,$marc_record, $marc_encoding, $marc_repeatable_subfield, $marc_indicators, $marc_leader) = ();
189 ($marc_record_offset, $marc_fetch_offset) = (0,0);
190 $marc_encoding = $a->{marc_encoding};
191 }
192
193 =head2 _set_lookup
194
195 Set current lookup hash
196
197 _set_lookup( $lookup );
198
199 =cut
200
201 my $lookup;
202
203 sub _set_lookup {
204 $lookup = shift;
205 }
206
207 =head2 _get_lookup
208
209 Get current lookup hash
210
211 my $lookup = _get_lookup();
212
213 =cut
214
215 sub _get_lookup {
216 return $lookup;
217 }
218
219 =head2 _set_load_row
220
221 Setup code reference which will return L<data_structure> from
222 L<WebPAC::Store>
223
224 _set_load_row(sub {
225 my ($database,$input,$mfn) = @_;
226 $store->load_row( database => $database, input => $input, id => $mfn );
227 });
228
229 =cut
230
231 sub _set_load_row {
232 my $coderef = shift;
233 confess "argument isn't CODE" unless ref($coderef) eq 'CODE';
234
235 $load_row_coderef = $coderef;
236 }
237
238 =head2 _get_marc_fields
239
240 Get all fields defined by calls to C<marc>
241
242 $marc->add_fields( WebPAC::Normalize:_get_marc_fields() );
243
244 We are using I<magic> which detect repeatable fields only from
245 sequence of field/subfield data generated by normalization.
246
247 Repeatable field is created when there is second occurence of same subfield or
248 if any of indicators are different.
249
250 This is sane for most cases. Something like:
251
252 900a-1 900b-1 900c-1
253 900a-2 900b-2
254 900a-3
255
256 will be created from any combination of:
257
258 900a-1 900a-2 900a-3 900b-1 900b-2 900c-1
259
260 and following rules:
261
262 marc('900','a', rec('200','a') );
263 marc('900','b', rec('200','b') );
264 marc('900','c', rec('200','c') );
265
266 which might not be what you have in mind. If you need repeatable subfield,
267 define it using C<marc_repeatable_subfield> like this:
268
269 marc_repeatable_subfield('900','a');
270 marc('900','a', rec('200','a') );
271 marc('900','b', rec('200','b') );
272 marc('900','c', rec('200','c') );
273
274 will create:
275
276 900a-1 900a-2 900a-3 900b-1 900c-1
277 900b-2
278
279 There is also support for returning next or specific using:
280
281 while (my $mf = WebPAC::Normalize:_get_marc_fields( fetch_next => 1 ) ) {
282 # do something with $mf
283 }
284
285 will always return fields from next MARC record or
286
287 my $mf = WebPAC::Normalize::_get_marc_fields( offset => 42 );
288
289 will return 42th copy record (if it exists).
290
291 =cut
292
293 my $fetch_pos;
294
295 sub _get_marc_fields {
296
297 my $arg = {@_};
298 warn "### _get_marc_fields arg: ", dump($arg), $/ if ($debug > 2);
299 $fetch_pos = $marc_fetch_offset;
300 if ($arg->{offset}) {
301 $fetch_pos = $arg->{offset};
302 } elsif($arg->{fetch_next}) {
303 $marc_fetch_offset++;
304 }
305
306 return if (! $marc_record || ref($marc_record) ne 'ARRAY');
307
308 warn "### full marc_record = ", dump( @{ $marc_record }), $/ if ($debug > 2);
309
310 my $marc_rec = $marc_record->[ $fetch_pos ];
311
312 warn "## _get_marc_fields (at offset: $fetch_pos) -- marc_record = ", dump( @$marc_rec ), $/ if ($debug > 1);
313
314 return if (! $marc_rec || ref($marc_rec) ne 'ARRAY' || $#{ $marc_rec } < 0);
315
316 # first, sort all existing fields
317 # XXX might not be needed, but modern perl might randomize elements in hash
318 my @sorted_marc_record = sort {
319 $a->[0] . ( $a->[3] || '' ) cmp $b->[0] . ( $b->[3] || '')
320 } @{ $marc_rec };
321
322 @sorted_marc_record = @{ $marc_rec }; ### FIXME disable sorting
323
324 # output marc fields
325 my @m;
326
327 # count unique field-subfields (used for offset when walking to next subfield)
328 my $u;
329 map { $u->{ $_->[0] . ( $_->[3] || '') }++ } @sorted_marc_record;
330
331 if ($debug) {
332 warn "## marc_repeatable_subfield = ", dump( $marc_repeatable_subfield ), $/ if ( $marc_repeatable_subfield );
333 warn "## marc_record[$fetch_pos] = ", dump( $marc_rec ), $/;
334 warn "## sorted_marc_record = ", dump( \@sorted_marc_record ), $/;
335 warn "## subfield count = ", dump( $u ), $/;
336 }
337
338 my $len = $#sorted_marc_record;
339 my $visited;
340 my $i = 0;
341 my $field;
342
343 foreach ( 0 .. $len ) {
344
345 # find next element which isn't visited
346 while ($visited->{$i}) {
347 $i = ($i + 1) % ($len + 1);
348 }
349
350 # mark it visited
351 $visited->{$i}++;
352
353 my $row = dclone( $sorted_marc_record[$i] );
354
355 # field and subfield which is key for
356 # marc_repeatable_subfield and u
357 my $fsf = $row->[0] . ( $row->[3] || '' );
358
359 if ($debug > 1) {
360
361 print "### field so far [", $#$field, "] : ", dump( $field ), " ", $field ? 'T' : 'F', $/;
362 print "### this [$i]: ", dump( $row ),$/;
363 print "### sf: ", $row->[3], " vs ", $field->[3],
364 $marc_repeatable_subfield->{ $row->[0] . $row->[3] } ? ' (repeatable)' : '', $/,
365 if ($#$field >= 0);
366
367 }
368
369 # if field exists
370 if ( $#$field >= 0 ) {
371 if (
372 $row->[0] ne $field->[0] || # field
373 $row->[1] ne $field->[1] || # i1
374 $row->[2] ne $field->[2] # i2
375 ) {
376 push @m, $field;
377 warn "## saved/1 ", dump( $field ),$/ if ($debug);
378 $field = $row;
379
380 } elsif (
381 ( $row->[3] lt $field->[-2] ) # subfield which is not next (e.g. a after c)
382 ||
383 ( $row->[3] eq $field->[-2] && # same subfield, but not repeatable
384 ! $marc_repeatable_subfield->{ $fsf }
385 )
386 ) {
387 push @m, $field;
388 warn "## saved/2 ", dump( $field ),$/ if ($debug);
389 $field = $row;
390
391 } else {
392 # append new subfields to existing field
393 push @$field, ( $row->[3], $row->[4] );
394 }
395 } else {
396 # insert first field
397 $field = $row;
398 }
399
400 if (! $marc_repeatable_subfield->{ $fsf }) {
401 # make step to next subfield
402 $i = ($i + $u->{ $fsf } ) % ($len + 1);
403 }
404 }
405
406 if ($#$field >= 0) {
407 push @m, $field;
408 warn "## saved/3 ", dump( $field ),$/ if ($debug);
409 }
410
411 return \@m;
412 }
413
414 =head2 _get_marc_leader
415
416 Return leader from currently fetched record by L</_get_marc_fields>
417
418 print WebPAC::Normalize::_get_marc_leader();
419
420 =cut
421
422 sub _get_marc_leader {
423 die "no fetch_pos, did you called _get_marc_fields first?" unless ( defined( $fetch_pos ) );
424 return $marc_leader->[ $fetch_pos ];
425 }
426
427 =head2 _debug
428
429 Change level of debug warnings
430
431 _debug( 2 );
432
433 =cut
434
435 sub _debug {
436 my $l = shift;
437 return $debug unless defined($l);
438 warn "debug level $l",$/ if ($l > 0);
439 $debug = $l;
440 }
441
442 =head1 Functions to create C<data_structure>
443
444 Those functions generally have to first in your normalization file.
445
446 =head2 search_display
447
448 Define output for L<search> and L<display> at the same time
449
450 search_display('Title', rec('200','a') );
451
452
453 =cut
454
455 sub search_display {
456 my $name = shift or die "search_display needs name as first argument";
457 my @o = grep { defined($_) && $_ ne '' } @_;
458 return unless (@o);
459 $out->{$name}->{search} = \@o;
460 $out->{$name}->{display} = \@o;
461 }
462
463 =head2 display
464
465 Define output just for I<display>
466
467 @v = display('Title', rec('200','a') );
468
469 =cut
470
471 sub display {
472 my $name = shift or die "display needs name as first argument";
473 my @o = grep { defined($_) && $_ ne '' } @_;
474 return unless (@o);
475 $out->{$name}->{display} = \@o;
476 }
477
478 =head2 search
479
480 Prepare values just for I<search>
481
482 @v = search('Title', rec('200','a') );
483
484 =cut
485
486 sub search {
487 my $name = shift or die "search needs name as first argument";
488 my @o = grep { defined($_) && $_ ne '' } @_;
489 return unless (@o);
490 $out->{$name}->{search} = \@o;
491 }
492
493 =head2 marc_leader
494
495 Setup fields within MARC leader or get leader
496
497 marc_leader('05','c');
498 my $leader = marc_leader();
499
500 =cut
501
502 sub marc_leader {
503 my ($offset,$value) = @_;
504
505 if ($offset) {
506 $marc_leader->[ $marc_record_offset ]->{ $offset } = $value;
507 } else {
508
509 if (defined($marc_leader)) {
510 die "marc_leader not array = ", dump( $marc_leader ) unless (ref($marc_leader) eq 'ARRAY');
511 return $marc_leader->[ $marc_record_offset ];
512 } else {
513 return;
514 }
515 }
516 }
517
518 =head2 marc_fixed
519
520 Create control/indentifier fields with values in fixed positions
521
522 marc_fixed('008', 00, '070402');
523 marc_fixed('008', 39, '|');
524
525 Positions not specified will be filled with spaces (C<0x20>).
526
527 There will be no effort to extend last specified value to full length of
528 field in standard.
529
530 =cut
531
532 sub marc_fixed {
533 my ($f, $pos, $val) = @_;
534 die "need marc(field, position, value)" unless defined($f) && defined($pos);
535
536 confess "need val" unless defined $val;
537
538 my $update = 0;
539
540 map {
541 if ($_->[0] eq $f) {
542 my $old = $_->[1];
543 if (length($old) <= $pos) {
544 $_->[1] .= ' ' x ( $pos - length($old) ) . $val;
545 warn "## marc_fixed($f,$pos,'$val') append '$old' -> '$_->[1]'\n" if ($debug > 1);
546 } else {
547 $_->[1] = substr($old, 0, $pos) . $val . substr($old, $pos + length($val));
548 warn "## marc_fixed($f,$pos,'$val') update '$old' -> '$_->[1]'\n" if ($debug > 1);
549 }
550 $update++;
551 }
552 } @{ $marc_record->[ $marc_record_offset ] };
553
554 if (! $update) {
555 my $v = ' ' x $pos . $val;
556 push @{ $marc_record->[ $marc_record_offset ] }, [ $f, $v ];
557 warn "## marc_fixed($f,$pos,'val') created '$v'\n" if ($debug > 1);
558 }
559 }
560
561 =head2 marc
562
563 Save value for MARC field
564
565 marc('900','a', rec('200','a') );
566 marc('001', rec('000') );
567
568 =cut
569
570 sub marc {
571 my $f = shift or die "marc needs field";
572 die "marc field must be numer" unless ($f =~ /^\d+$/);
573
574 my $sf;
575 if ($f >= 10) {
576 $sf = shift or die "marc needs subfield";
577 }
578
579 foreach (@_) {
580 my $v = $_; # make var read-write for Encode
581 next unless (defined($v) && $v !~ /^\s*$/);
582 my ($i1,$i2) = defined($marc_indicators->{$f}) ? @{ $marc_indicators->{$f} } : (' ',' ');
583 if (defined $sf) {
584 push @{ $marc_record->[ $marc_record_offset ] }, [ $f, $i1, $i2, $sf => $v ];
585 } else {
586 push @{ $marc_record->[ $marc_record_offset ] }, [ $f, $v ];
587 }
588 }
589 }
590
591 =head2 marc_repeatable_subfield
592
593 Save values for MARC repetable subfield
594
595 marc_repeatable_subfield('910', 'z', rec('909') );
596
597 =cut
598
599 sub marc_repeatable_subfield {
600 my ($f,$sf) = @_;
601 die "marc_repeatable_subfield need field and subfield!\n" unless ($f && $sf);
602 $marc_repeatable_subfield->{ $f . $sf }++;
603 marc(@_);
604 }
605
606 =head2 marc_indicators
607
608 Set both indicators for MARC field
609
610 marc_indicators('900', ' ', 1);
611
612 Any indicator value other than C<0-9> will be treated as undefined.
613
614 =cut
615
616 sub marc_indicators {
617 my $f = shift || die "marc_indicators need field!\n";
618 my ($i1,$i2) = @_;
619 die "marc_indicators($f, ...) need i1!\n" unless(defined($i1));
620 die "marc_indicators($f, $i1, ...) need i2!\n" unless(defined($i2));
621
622 $i1 = ' ' if ($i1 !~ /^\d$/);
623 $i2 = ' ' if ($i2 !~ /^\d$/);
624 @{ $marc_indicators->{$f} } = ($i1,$i2);
625 }
626
627 =head2 marc_compose
628
629 Save values for each MARC subfield explicitly
630
631 marc_compose('900',
632 'a', rec('200','a')
633 'b', rec('201','a')
634 'a', rec('200','b')
635 'c', rec('200','c')
636 );
637
638 If you specify C<+> for subfield, value will be appended
639 to previous defined subfield.
640
641 =cut
642
643 sub marc_compose {
644 my $f = shift or die "marc_compose needs field";
645 die "marc_compose field must be numer" unless ($f =~ /^\d+$/);
646
647 my ($i1,$i2) = defined($marc_indicators->{$f}) ? @{ $marc_indicators->{$f} } : (' ',' ');
648 my $m = [ $f, $i1, $i2 ];
649
650 warn "### marc_compose input subfields = ", dump(@_),$/ if ($debug > 2);
651
652 if ($#_ % 2 != 1) {
653 die "ERROR: marc_compose",dump($f,@_)," not valid (must be even).\nDo you need to add first() or join() around some argument?\n";
654 }
655
656 while (@_) {
657 my $sf = shift;
658 my $v = shift;
659
660 next unless (defined($v) && $v !~ /^\s*$/);
661 warn "## ++ marc_compose($f,$sf,$v) ", dump( $m ),$/ if ($debug > 1);
662 if ($sf ne '+') {
663 push @$m, ( $sf, $v );
664 } else {
665 $m->[ $#$m ] .= $v;
666 }
667 }
668
669 warn "## marc_compose current marc = ", dump( $m ),$/ if ($debug > 1);
670
671 push @{ $marc_record->[ $marc_record_offset ] }, $m if ($#{$m} > 2);
672 }
673
674 =head2 marc_duplicate
675
676 Generate copy of current MARC record and continue working on copy
677
678 marc_duplicate();
679
680 Copies can be accessed using C<< _get_marc_fields( fetch_next => 1 ) >> or
681 C<< _get_marc_fields( offset => 42 ) >>.
682
683 =cut
684
685 sub marc_duplicate {
686 my $m = $marc_record->[ -1 ];
687 die "can't duplicate record which isn't defined" unless ($m);
688 push @{ $marc_record }, dclone( $m );
689 push @{ $marc_leader }, dclone( marc_leader() );
690 warn "## marc_duplicate = ", dump(@$marc_leader, @$marc_record), $/ if ($debug > 1);
691 $marc_record_offset = $#{ $marc_record };
692 warn "## marc_record_offset = $marc_record_offset", $/ if ($debug > 1);
693
694 }
695
696 =head2 marc_remove
697
698 Remove some field or subfield from MARC record.
699
700 marc_remove('200');
701 marc_remove('200','a');
702
703 This will erase field C<200> or C<200^a> from current MARC record.
704
705 marc_remove('*');
706
707 Will remove all fields in current MARC record.
708
709 This is useful after calling C<marc_duplicate> or on it's own (but, you
710 should probably just remove that subfield definition if you are not
711 using C<marc_duplicate>).
712
713 FIXME: support fields < 10.
714
715 =cut
716
717 sub marc_remove {
718 my ($f, $sf) = @_;
719
720 die "marc_remove needs record number" unless defined($f);
721
722 my $marc = $marc_record->[ $marc_record_offset ];
723
724 warn "### marc_remove before = ", dump( $marc ), $/ if ($debug > 2);
725
726 if ($f eq '*') {
727
728 delete( $marc_record->[ $marc_record_offset ] );
729 warn "## full marc_record = ", dump( @{ $marc_record }), $/ if ($debug > 1);
730
731 } else {
732
733 my $i = 0;
734 foreach ( 0 .. $#{ $marc } ) {
735 last unless (defined $marc->[$i]);
736 warn "#### working on ",dump( @{ $marc->[$i] }), $/ if ($debug > 3);
737 if ($marc->[$i]->[0] eq $f) {
738 if (! defined $sf) {
739 # remove whole field
740 splice @$marc, $i, 1;
741 warn "#### slice \@\$marc, $i, 1 = ",dump( @{ $marc }), $/ if ($debug > 3);
742 $i--;
743 } else {
744 foreach my $j ( 0 .. (( $#{ $marc->[$i] } - 3 ) / 2) ) {
745 my $o = ($j * 2) + 3;
746 if ($marc->[$i]->[$o] eq $sf) {
747 # remove subfield
748 splice @{$marc->[$i]}, $o, 2;
749 warn "#### slice \@{\$marc->[$i]}, $o, 2 = ", dump( @{ $marc }), $/ if ($debug > 3);
750 # is record now empty?
751 if ($#{ $marc->[$i] } == 2) {
752 splice @$marc, $i, 1;
753 warn "#### slice \@\$marc, $i, 1 = ", dump( @{ $marc }), $/ if ($debug > 3);
754 $i--;
755 };
756 }
757 }
758 }
759 }
760 $i++;
761 }
762
763 warn "### marc_remove($f", $sf ? ",$sf" : "", ") after = ", dump( $marc ), $/ if ($debug > 2);
764
765 $marc_record->[ $marc_record_offset ] = $marc;
766 }
767
768 warn "## full marc_record = ", dump( @{ $marc_record }), $/ if ($debug > 1);
769 }
770
771 =head2 marc_original_order
772
773 Copy all subfields preserving original order to marc field.
774
775 marc_original_order( marc_field_number, original_input_field_number );
776
777 Please note that field numbers are consistent with other commands (marc
778 field number first), but somewhat counter-intuitive (destination and then
779 source).
780
781 You might want to use this command if you are just renaming subfields or
782 using pre-processing modify_record in C<config.yml> and don't need any
783 post-processing or want to preserve order of original subfields.
784
785
786 =cut
787
788 sub marc_original_order {
789
790 my ($to, $from) = @_;
791 die "marc_original_order needs from and to fields\n" unless ($from && $to);
792
793 return unless defined($rec->{$from});
794
795 my $r = $rec->{$from};
796 die "record field $from isn't array\n" unless (ref($r) eq 'ARRAY');
797
798 my ($i1,$i2) = defined($marc_indicators->{$to}) ? @{ $marc_indicators->{$to} } : (' ',' ');
799 warn "## marc_original_order($to,$from) source = ", dump( $r ),$/ if ($debug > 1);
800
801 foreach my $d (@$r) {
802
803 if (! defined($d->{subfields}) && ref($d->{subfields}) ne 'ARRAY') {
804 warn "# marc_original_order($to,$from): field $from doesn't have subfields specification\n";
805 next;
806 }
807
808 my @sfs = @{ $d->{subfields} };
809
810 die "field $from doesn't have even number of subfields specifications\n" unless($#sfs % 2 == 1);
811
812 warn "#--> d: ",dump($d), "\n#--> sfs: ",dump(@sfs),$/ if ($debug > 2);
813
814 my $m = [ $to, $i1, $i2 ];
815
816 while (my $sf = shift @sfs) {
817
818 warn "#--> sf: ",dump($sf), $/ if ($debug > 2);
819 my $offset = shift @sfs;
820 die "corrupted sufields specification for field $from\n" unless defined($offset);
821
822 my $v;
823 if (ref($d->{$sf}) eq 'ARRAY') {
824 $v = $d->{$sf}->[$offset] if (defined($d->{$sf}->[$offset]));
825 } elsif ($offset == 0) {
826 $v = $d->{$sf};
827 } else {
828 die "field $from subfield '$sf' need occurence $offset which doesn't exist", dump($d->{$sf});
829 }
830 push @$m, ( $sf, $v ) if (defined($v));
831 }
832
833 if ($#{$m} > 2) {
834 push @{ $marc_record->[ $marc_record_offset ] }, $m;
835 }
836 }
837
838 warn "## marc_record = ", dump( $marc_record ),$/ if ($debug > 1);
839 }
840
841 =head2 marc_count
842
843 Return number of MARC records created using L</marc_duplicate>.
844
845 print "created ", marc_count(), " records";
846
847 =cut
848
849 sub marc_count {
850 return $#{ $marc_record };
851 }
852
853
854 =head1 Functions to extract data from input
855
856 This function should be used inside functions to create C<data_structure> described
857 above.
858
859 =head2 _pack_subfields_hash
860
861 @subfields = _pack_subfields_hash( $h );
862 $subfields = _pack_subfields_hash( $h, 1 );
863
864 Return each subfield value in array or pack them all together and return scalar
865 with subfields (denoted by C<^>) and values.
866
867 =cut
868
869 sub _pack_subfields_hash {
870
871 warn "## _pack_subfields_hash( ",dump(@_), " )\n" if ($debug > 1);
872
873 my ($h,$include_subfields) = @_;
874
875 # sanity and ease of use
876 return $h if (ref($h) ne 'HASH');
877
878 if ( defined($h->{subfields}) ) {
879 my $sfs = delete $h->{subfields} || die "no subfields?";
880 my @out;
881 while (@$sfs) {
882 my $sf = shift @$sfs;
883 push @out, '^' . $sf if ($include_subfields);
884 my $o = shift @$sfs;
885 if ($o == 0 && ref( $h->{$sf} ) ne 'ARRAY' ) {
886 # single element subfields are not arrays
887 #warn "====> $sf $o / $#$sfs ", dump( $sfs, $h->{$sf} ), "\n";
888
889 push @out, $h->{$sf};
890 } else {
891 #warn "====> $sf $o / $#$sfs ", dump( $sfs, $h->{$sf} ), "\n";
892 push @out, $h->{$sf}->[$o];
893 }
894 }
895 if ($include_subfields) {
896 return join('', @out);
897 } else {
898 return @out;
899 }
900 } else {
901 if ($include_subfields) {
902 my $out = '';
903 foreach my $sf (sort keys %$h) {
904 if (ref($h->{$sf}) eq 'ARRAY') {
905 $out .= '^' . $sf . join('^' . $sf, @{ $h->{$sf} });
906 } else {
907 $out .= '^' . $sf . $h->{$sf};
908 }
909 }
910 return $out;
911 } else {
912 # FIXME this should probably be in alphabetical order instead of hash order
913 values %{$h};
914 }
915 }
916 }
917
918 =head2 rec1
919
920 Return all values in some field
921
922 @v = rec1('200')
923
924 TODO: order of values is probably same as in source data, need to investigate that
925
926 =cut
927
928 sub rec1 {
929 my $f = shift;
930 warn "rec1($f) = ", dump( $rec->{$f} ), $/ if ($debug > 1);
931 return unless (defined($rec) && defined($rec->{$f}));
932 warn "rec1($f) = ", dump( $rec->{$f} ), $/ if ($debug > 1);
933 if (ref($rec->{$f}) eq 'ARRAY') {
934 my @out;
935 foreach my $h ( @{ $rec->{$f} } ) {
936 if (ref($h) eq 'HASH') {
937 push @out, ( _pack_subfields_hash( $h ) );
938 } else {
939 push @out, $h;
940 }
941 }
942 return @out;
943 } elsif( defined($rec->{$f}) ) {
944 return $rec->{$f};
945 }
946 }
947
948 =head2 rec2
949
950 Return all values in specific field and subfield
951
952 @v = rec2('200','a')
953
954 =cut
955
956 sub rec2 {
957 my $f = shift;
958 return unless (defined($rec && $rec->{$f}));
959 my $sf = shift;
960 warn "rec2($f,$sf) = ", dump( $rec->{$f} ), $/ if ($debug > 1);
961 return map {
962 if (ref($_->{$sf}) eq 'ARRAY') {
963 @{ $_->{$sf} };
964 } else {
965 $_->{$sf};
966 }
967 } grep { ref($_) eq 'HASH' && $_->{$sf} } @{ $rec->{$f} };
968 }
969
970 =head2 rec
971
972 syntaxtic sugar for
973
974 @v = rec('200')
975 @v = rec('200','a')
976
977 If rec() returns just single value, it will
978 return scalar, not array.
979
980 =cut
981
982 sub rec {
983 my @out;
984 if ($#_ == 0) {
985 @out = rec1(@_);
986 } elsif ($#_ == 1) {
987 @out = rec2(@_);
988 }
989 if ($#out == 0 && ! wantarray) {
990 return $out[0];
991 } elsif (@out) {
992 return @out;
993 } else {
994 return '';
995 }
996 }
997
998 =head2 regex
999
1000 Apply regex to some or all values
1001
1002 @v = regex( 's/foo/bar/g', @v );
1003
1004 =cut
1005
1006 sub regex {
1007 my $r = shift;
1008 my @out;
1009 #warn "r: $r\n", dump(\@_);
1010 foreach my $t (@_) {
1011 next unless ($t);
1012 eval "\$t =~ $r";
1013 push @out, $t if ($t && $t ne '');
1014 }
1015 return @out;
1016 }
1017
1018 =head2 prefix
1019
1020 Prefix all values with a string
1021
1022 @v = prefix( 'my_', @v );
1023
1024 =cut
1025
1026 sub prefix {
1027 my $p = shift;
1028 return @_ unless defined( $p );
1029 return map { $p . $_ } grep { defined($_) } @_;
1030 }
1031
1032 =head2 suffix
1033
1034 suffix all values with a string
1035
1036 @v = suffix( '_my', @v );
1037
1038 =cut
1039
1040 sub suffix {
1041 my $s = shift;
1042 return @_ unless defined( $s );
1043 return map { $_ . $s } grep { defined($_) } @_;
1044 }
1045
1046 =head2 surround
1047
1048 surround all values with a two strings
1049
1050 @v = surround( 'prefix_', '_suffix', @v );
1051
1052 =cut
1053
1054 sub surround {
1055 my $p = shift;
1056 my $s = shift;
1057 $p = '' unless defined( $p );
1058 $s = '' unless defined( $s );
1059 return map { $p . $_ . $s } grep { defined($_) } @_;
1060 }
1061
1062 =head2 first
1063
1064 Return first element
1065
1066 $v = first( @v );
1067
1068 =cut
1069
1070 sub first {
1071 my $r = shift;
1072 return $r;
1073 }
1074
1075 =head2 lookup
1076
1077 Consult lookup hashes for some value
1078
1079 @v = lookup(
1080 sub {
1081 'ffkk/peri/mfn'.rec('000')
1082 },
1083 'ffkk','peri','200-a-200-e',
1084 sub {
1085 first(rec(200,'a')).' '.first(rec('200','e'))
1086 }
1087 );
1088
1089 Code like above will be B<automatically generated> using L<WebPAC::Parse> from
1090 normal lookup definition in C<conf/lookup/something.pl> which looks like:
1091
1092 lookup(
1093 # which results to return from record recorded in lookup
1094 sub { 'ffkk/peri/mfn' . rec('000') },
1095 # from which database and input
1096 'ffkk','peri',
1097 # such that following values match
1098 sub { first(rec(200,'a')) . ' ' . first(rec('200','e')) },
1099 # if this part is missing, we will try to match same fields
1100 # from lookup record and current one, or you can override
1101 # which records to use from current record using
1102 sub { rec('900','x') . ' ' . rec('900','y') },
1103 )
1104
1105 You can think about this lookup as SQL (if that helps):
1106
1107 select
1108 sub { what }
1109 from
1110 database, input
1111 where
1112 sub { filter from lookuped record }
1113 having
1114 sub { optional filter on current record }
1115
1116 Easy as pie, right?
1117
1118 =cut
1119
1120 sub lookup {
1121 my ($what, $database, $input, $key, $having) = @_;
1122
1123 confess "lookup needs 5 arguments: what, database, input, key, having\n" unless ($#_ == 4);
1124
1125 warn "## lookup ($database, $input, $key)", $/ if ($debug > 1);
1126 return unless (defined($lookup->{$database}->{$input}->{$key}));
1127
1128 confess "lookup really need load_row_coderef added to data_structure\n" unless ($load_row_coderef);
1129
1130 my $mfns;
1131 my @having = $having->();
1132
1133 warn "## having = ", dump( @having ) if ($debug > 2);
1134
1135 foreach my $h ( @having ) {
1136 if (defined($lookup->{$database}->{$input}->{$key}->{$h})) {
1137 warn "lookup for $database/$input/$key/$h return ",dump($lookup->{$database}->{$input}->{$key}->{$h}),"\n" if ($debug);
1138 $mfns->{$_}++ foreach keys %{ $lookup->{$database}->{$input}->{$key}->{$h} };
1139 }
1140 }
1141
1142 return unless ($mfns);
1143
1144 my @mfns = sort keys %$mfns;
1145
1146 warn "# lookup loading $database/$input/$key mfn ", join(",",@mfns)," having ",dump(@having),"\n" if ($debug);
1147
1148 my $old_rec = $rec;
1149 my @out;
1150
1151 foreach my $mfn (@mfns) {
1152 $rec = $load_row_coderef->( $database, $input, $mfn );
1153
1154 warn "got $database/$input/$mfn = ", dump($rec), $/ if ($debug);
1155
1156 my @vals = $what->();
1157
1158 push @out, ( @vals );
1159
1160 warn "lookup for mfn $mfn returned ", dump(@vals), $/ if ($debug);
1161 }
1162
1163 # if (ref($lookup->{$k}) eq 'ARRAY') {
1164 # return @{ $lookup->{$k} };
1165 # } else {
1166 # return $lookup->{$k};
1167 # }
1168
1169 $rec = $old_rec;
1170
1171 warn "## lookup returns = ", dump(@out), $/ if ($debug);
1172
1173 if ($#out == 0) {
1174 return $out[0];
1175 } else {
1176 return @out;
1177 }
1178 }
1179
1180 =head2 save_into_lookup
1181
1182 Save value into lookup. It associates current database, input
1183 and specific keys with one or more values which will be
1184 associated over MFN.
1185
1186 MFN will be extracted from first occurence current of field 000
1187 in current record, or if it doesn't exist from L<_set_config> C<_mfn>.
1188
1189 my $nr = save_into_lookup($database,$input,$key,sub {
1190 # code which produce one or more values
1191 });
1192
1193 It returns number of items saved.
1194
1195 This function shouldn't be called directly, it's called from code created by
1196 L<WebPAC::Parser>.
1197
1198 =cut
1199
1200 sub save_into_lookup {
1201 my ($database,$input,$key,$coderef) = @_;
1202 die "save_into_lookup needs database" unless defined($database);
1203 die "save_into_lookup needs input" unless defined($input);
1204 die "save_into_lookup needs key" unless defined($key);
1205 die "save_into_lookup needs CODE" unless ( defined($coderef) && ref($coderef) eq 'CODE' );
1206
1207 warn "## save_into_lookup rec = ", dump($rec), " config = ", dump($config), $/ if ($debug > 2);
1208
1209 my $mfn =
1210 defined($rec->{'000'}->[0]) ? $rec->{'000'}->[0] :
1211 defined($config->{_mfn}) ? $config->{_mfn} :
1212 die "mfn not defined or zero";
1213
1214 my $nr = 0;
1215
1216 foreach my $v ( $coderef->() ) {
1217 $lookup->{$database}->{$input}->{$key}->{$v}->{$mfn}++;
1218 warn "# saved lookup $database/$input/$key [$v] $mfn\n" if ($debug > 1);
1219 $nr++;
1220 }
1221
1222 return $nr;
1223 }
1224
1225 =head2 config
1226
1227 Consult config values stored in C<config.yml>
1228
1229 # return database code (key under databases in yaml)
1230 $database_code = config(); # use _ from hash
1231 $database_name = config('name');
1232 $database_input_name = config('input name');
1233
1234 Up to three levels are supported.
1235
1236 =cut
1237
1238 sub config {
1239 return unless ($config);
1240
1241 my $p = shift;
1242
1243 $p ||= '';
1244
1245 my $v;
1246
1247 warn "### getting config($p)\n" if ($debug > 1);
1248
1249 my @p = split(/\s+/,$p);
1250 if ($#p < 0) {
1251 $v = $config->{ '_' }; # special, database code
1252 } else {
1253
1254 my $c = dclone( $config );
1255
1256 foreach my $k (@p) {
1257 warn "### k: $k c = ",dump($c),$/ if ($debug > 1);
1258 if (ref($c) eq 'ARRAY') {
1259 $c = shift @$c;
1260 warn "config($p) taking first occurence of '$k', probably not what you wanted!\n";
1261 last;
1262 }
1263
1264 if (! defined($c->{$k}) ) {
1265 $c = undef;
1266 last;
1267 } else {
1268 $c = $c->{$k};
1269 }
1270 }
1271 $v = $c if ($c);
1272
1273 }
1274
1275 warn "## config( '$p' ) = ",dump( $v ),$/ if ($v && $debug);
1276 warn "config( '$p' ) is empty\n" if (! $v);
1277
1278 return $v;
1279 }
1280
1281 =head2 id
1282
1283 Returns unique id of this record
1284
1285 $id = id();
1286
1287 Returns C<42/2> for 2nd occurence of MFN 42.
1288
1289 =cut
1290
1291 sub id {
1292 my $mfn = $config->{_mfn} || die "no _mfn in config data";
1293 return $mfn . $#{$marc_record} ? $#{$marc_record} + 1 : '';
1294 }
1295
1296 =head2 join_with
1297
1298 Joins walues with some delimiter
1299
1300 $v = join_with(", ", @v);
1301
1302 =cut
1303
1304 sub join_with {
1305 my $d = shift;
1306 warn "### join_with('$d',",dump(@_),")\n" if ($debug > 2);
1307 my $v = join($d, grep { defined($_) && $_ ne '' } @_);
1308 return '' unless defined($v);
1309 return $v;
1310 }
1311
1312 =head2 split_rec_on
1313
1314 Split record subfield on some regex and take one of parts out
1315
1316 $a_before_semi_column =
1317 split_rec_on('200','a', /\s*;\s*/, $part);
1318
1319 C<$part> is optional number of element. First element is
1320 B<1>, not 0!
1321
1322 If there is no C<$part> parameter or C<$part> is 0, this function will
1323 return all values produced by splitting.
1324
1325 =cut
1326
1327 sub split_rec_on {
1328 die "split_rec_on need (fld,sf,regex[,part]" if ($#_ < 2);
1329
1330 my ($fld, $sf, $regex, $part) = @_;
1331 warn "### regex ", ref($regex), $regex, $/ if ($debug > 2);
1332
1333 my @r = rec( $fld, $sf );
1334 my $v = shift @r;
1335 warn "### first rec($fld,$sf) = ",dump($v),$/ if ($debug > 2);
1336
1337 return '' if ( ! defined($v) || $v =~ /^\s*$/);
1338
1339 my @s = split( $regex, $v );
1340 warn "## split_rec_on($fld,$sf,$regex,$part) = ",dump(@s),$/ if ($debug > 1);
1341 if ($part && $part > 0) {
1342 return $s[ $part - 1 ];
1343 } else {
1344 return @s;
1345 }
1346 }
1347
1348 my $hash;
1349
1350 =head2 set
1351
1352 set( key => 'value' );
1353
1354 =cut
1355
1356 sub set {
1357 my ($k,$v) = @_;
1358 warn "## set ( $k => ", dump($v), " )", $/ if ( $debug );
1359 $hash->{$k} = $v;
1360 };
1361
1362 =head2 get
1363
1364 get( 'key' );
1365
1366 =cut
1367
1368 sub get {
1369 my $k = shift || return;
1370 my $v = $hash->{$k};
1371 warn "## get $k = ", dump( $v ), $/ if ( $debug );
1372 return $v;
1373 }
1374
1375 =head2 count
1376
1377 if ( count( @result ) == 1 ) {
1378 # do something if only 1 result is there
1379 }
1380
1381 =cut
1382
1383 sub count {
1384 warn "## count ",dump(@_),$/ if ( $debug );
1385 return @_ . '';
1386 }
1387
1388 # END
1389 1;

  ViewVC Help
Powered by ViewVC 1.1.26