/[webpac2]/trunk/lib/WebPAC/Normalize.pm
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Contents of /trunk/lib/WebPAC/Normalize.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1021 - (show annotations)
Sat Nov 10 11:11:16 2007 UTC (16 years, 5 months ago) by dpavlin
File size: 31988 byte(s)
 r1579@llin:  dpavlin | 2007-11-10 11:59:27 +0100
 Begin extraction of MARC functionality from WebPAC::Normalize to
 WebPAC::Normalize::MARC

1 package WebPAC::Normalize;
2 use Exporter 'import';
3 our @EXPORT = qw/
4 _set_ds _set_lookup
5 _set_load_row
6 _get_ds _clean_ds
7 _debug
8 _pack_subfields_hash
9
10 search_display search display sorted
11
12 marc marc_indicators marc_repeatable_subfield
13 marc_compose marc_leader marc_fixed
14 marc_duplicate marc_remove marc_count
15 marc_original_order
16 marc_template
17
18 rec1 rec2 rec
19 frec frec_eq frec_ne
20 regex prefix suffix surround
21 first lookup join_with
22 save_into_lookup
23
24 split_rec_on
25
26 get set
27 count
28
29 /;
30
31 use warnings;
32 use strict;
33
34 #use base qw/WebPAC::Common/;
35 use Data::Dump qw/dump/;
36 use Storable qw/dclone/;
37 use Carp qw/confess/;
38
39 # debugging warn(s)
40 my $debug = 0;
41
42 # FIXME
43 use WebPAC::Normalize::ISBN;
44 push @EXPORT, ( 'isbn_10', 'isbn_13' );
45
46 use WebPAC::Normalize::MARC;
47 push @EXPORT, ( 'marc_template' );
48
49 =head1 NAME
50
51 WebPAC::Normalize - describe normalisaton rules using sets
52
53 =cut
54
55 our $VERSION = '0.35';
56
57 =head1 SYNOPSIS
58
59 This module uses C<conf/normalize/*.pl> files to perform normalisation
60 from input records using perl functions which are specialized for set
61 processing.
62
63 Sets are implemented as arrays, and normalisation file is valid perl, which
64 means that you check it's validity before running WebPAC using
65 C<perl -c normalize.pl>.
66
67 Normalisation can generate multiple output normalized data. For now, supported output
68 types (on the left side of definition) are: C<search_display>, C<display>, C<search> and
69 C<marc>.
70
71 =head1 FUNCTIONS
72
73 Functions which start with C<_> are private and used by WebPAC internally.
74 All other functions are available for use within normalisation rules.
75
76 =head2 data_structure
77
78 Return data structure
79
80 my $ds = WebPAC::Normalize::data_structure(
81 lookup => $lookup_hash,
82 row => $row,
83 rules => $normalize_pl_config,
84 marc_encoding => 'utf-8',
85 config => $config,
86 load_row_coderef => sub {
87 my ($database,$input,$mfn) = @_;
88 $store->load_row( database => $database, input => $input, id => $mfn );
89 },
90 );
91
92 Options C<row>, C<rules> and C<log> are mandatory while all
93 other are optional.
94
95 C<load_row_coderef> is closure only used when executing lookups, so they will
96 die if it's not defined.
97
98 This function will B<die> if normalizastion can't be evaled.
99
100 Since this function isn't exported you have to call it with
101 C<WebPAC::Normalize::data_structure>.
102
103 =cut
104
105 my $load_row_coderef;
106
107 sub data_structure {
108 my $arg = {@_};
109
110 die "need row argument" unless ($arg->{row});
111 die "need normalisation argument" unless ($arg->{rules});
112
113 _set_lookup( $arg->{lookup} ) if defined($arg->{lookup});
114 _set_ds( $arg->{row} );
115 _set_config( $arg->{config} ) if defined($arg->{config});
116 _clean_ds( %{ $arg } );
117 $load_row_coderef = $arg->{load_row_coderef};
118
119 no strict 'subs';
120 no warnings 'redefine';
121 eval "$arg->{rules};";
122 die "error evaling $arg->{rules}: $@\n" if ($@);
123
124 return _get_ds();
125 }
126
127 =head2 _set_ds
128
129 Set current record hash
130
131 _set_ds( $rec );
132
133 =cut
134
135 my $rec;
136
137 sub _set_ds {
138 $rec = shift or die "no record hash";
139 }
140
141 =head2
142
143 my $rec = _get_rec();
144
145 =cut
146
147 sub _get_rec { $rec };
148
149 =head2 _set_config
150
151 Set current config hash
152
153 _set_config( $config );
154
155 Magic keys are:
156
157 =over 4
158
159 =item _
160
161 Code of current database
162
163 =item _mfn
164
165 Current MFN
166
167 =back
168
169 =cut
170
171 my $config;
172
173 sub _set_config {
174 $config = shift;
175 }
176
177 =head2 _get_ds
178
179 Return hash formatted as data structure
180
181 my $ds = _get_ds();
182
183 =cut
184
185 my ($out, $marc_record, $marc_encoding, $marc_repeatable_subfield, $marc_indicators, $marc_leader);
186 my ($marc_record_offset, $marc_fetch_offset) = (0, 0);
187
188 sub _get_ds {
189 #warn "## out = ",dump($out);
190 return $out;
191 }
192
193 =head2 _clean_ds
194
195 Clean data structure hash for next record
196
197 _clean_ds();
198
199 =cut
200
201 sub _clean_ds {
202 my $a = {@_};
203 ($out,$marc_record, $marc_encoding, $marc_repeatable_subfield, $marc_indicators, $marc_leader) = ();
204 ($marc_record_offset, $marc_fetch_offset) = (0,0);
205 $marc_encoding = $a->{marc_encoding};
206 }
207
208 =head2 _set_lookup
209
210 Set current lookup hash
211
212 _set_lookup( $lookup );
213
214 =cut
215
216 my $lookup;
217
218 sub _set_lookup {
219 $lookup = shift;
220 }
221
222 =head2 _get_lookup
223
224 Get current lookup hash
225
226 my $lookup = _get_lookup();
227
228 =cut
229
230 sub _get_lookup {
231 return $lookup;
232 }
233
234 =head2 _set_load_row
235
236 Setup code reference which will return L<data_structure> from
237 L<WebPAC::Store>
238
239 _set_load_row(sub {
240 my ($database,$input,$mfn) = @_;
241 $store->load_row( database => $database, input => $input, id => $mfn );
242 });
243
244 =cut
245
246 sub _set_load_row {
247 my $coderef = shift;
248 confess "argument isn't CODE" unless ref($coderef) eq 'CODE';
249
250 $load_row_coderef = $coderef;
251 }
252
253 =head2 _get_marc_fields
254
255 Get all fields defined by calls to C<marc>
256
257 $marc->add_fields( WebPAC::Normalize:_get_marc_fields() );
258
259 We are using I<magic> which detect repeatable fields only from
260 sequence of field/subfield data generated by normalization.
261
262 Repeatable field is created when there is second occurence of same subfield or
263 if any of indicators are different.
264
265 This is sane for most cases. Something like:
266
267 900a-1 900b-1 900c-1
268 900a-2 900b-2
269 900a-3
270
271 will be created from any combination of:
272
273 900a-1 900a-2 900a-3 900b-1 900b-2 900c-1
274
275 and following rules:
276
277 marc('900','a', rec('200','a') );
278 marc('900','b', rec('200','b') );
279 marc('900','c', rec('200','c') );
280
281 which might not be what you have in mind. If you need repeatable subfield,
282 define it using C<marc_repeatable_subfield> like this:
283
284 marc_repeatable_subfield('900','a');
285 marc('900','a', rec('200','a') );
286 marc('900','b', rec('200','b') );
287 marc('900','c', rec('200','c') );
288
289 will create:
290
291 900a-1 900a-2 900a-3 900b-1 900c-1
292 900b-2
293
294 There is also support for returning next or specific using:
295
296 while (my $mf = WebPAC::Normalize:_get_marc_fields( fetch_next => 1 ) ) {
297 # do something with $mf
298 }
299
300 will always return fields from next MARC record or
301
302 my $mf = WebPAC::Normalize::_get_marc_fields( offset => 42 );
303
304 will return 42th copy record (if it exists).
305
306 =cut
307
308 my $fetch_pos;
309
310 sub _get_marc_fields {
311
312 my $arg = {@_};
313 warn "### _get_marc_fields arg: ", dump($arg), $/ if ($debug > 2);
314 $fetch_pos = $marc_fetch_offset;
315 if ($arg->{offset}) {
316 $fetch_pos = $arg->{offset};
317 } elsif($arg->{fetch_next}) {
318 $marc_fetch_offset++;
319 }
320
321 return if (! $marc_record || ref($marc_record) ne 'ARRAY');
322
323 warn "### full marc_record = ", dump( @{ $marc_record }), $/ if ($debug > 2);
324
325 my $marc_rec = $marc_record->[ $fetch_pos ];
326
327 warn "## _get_marc_fields (at offset: $fetch_pos) -- marc_record = ", dump( @$marc_rec ), $/ if ($debug > 1);
328
329 return if (! $marc_rec || ref($marc_rec) ne 'ARRAY' || $#{ $marc_rec } < 0);
330
331 # first, sort all existing fields
332 # XXX might not be needed, but modern perl might randomize elements in hash
333 my @sorted_marc_record = sort {
334 $a->[0] . ( $a->[3] || '' ) cmp $b->[0] . ( $b->[3] || '')
335 } @{ $marc_rec };
336
337 @sorted_marc_record = @{ $marc_rec }; ### FIXME disable sorting
338
339 # output marc fields
340 my @m;
341
342 # count unique field-subfields (used for offset when walking to next subfield)
343 my $u;
344 map { $u->{ $_->[0] . ( $_->[3] || '') }++ } @sorted_marc_record;
345
346 if ($debug) {
347 warn "## marc_repeatable_subfield = ", dump( $marc_repeatable_subfield ), $/ if ( $marc_repeatable_subfield );
348 warn "## marc_record[$fetch_pos] = ", dump( $marc_rec ), $/;
349 warn "## sorted_marc_record = ", dump( \@sorted_marc_record ), $/;
350 warn "## subfield count = ", dump( $u ), $/;
351 }
352
353 my $len = $#sorted_marc_record;
354 my $visited;
355 my $i = 0;
356 my $field;
357
358 foreach ( 0 .. $len ) {
359
360 # find next element which isn't visited
361 while ($visited->{$i}) {
362 $i = ($i + 1) % ($len + 1);
363 }
364
365 # mark it visited
366 $visited->{$i}++;
367
368 my $row = dclone( $sorted_marc_record[$i] );
369
370 # field and subfield which is key for
371 # marc_repeatable_subfield and u
372 my $fsf = $row->[0] . ( $row->[3] || '' );
373
374 if ($debug > 1) {
375
376 print "### field so far [", $#$field, "] : ", dump( $field ), " ", $field ? 'T' : 'F', $/;
377 print "### this [$i]: ", dump( $row ),$/;
378 print "### sf: ", $row->[3], " vs ", $field->[3],
379 $marc_repeatable_subfield->{ $row->[0] . $row->[3] } ? ' (repeatable)' : '', $/,
380 if ($#$field >= 0);
381
382 }
383
384 # if field exists
385 if ( $#$field >= 0 ) {
386 if (
387 $row->[0] ne $field->[0] || # field
388 $row->[1] ne $field->[1] || # i1
389 $row->[2] ne $field->[2] # i2
390 ) {
391 push @m, $field;
392 warn "## saved/1 ", dump( $field ),$/ if ($debug);
393 $field = $row;
394
395 } elsif (
396 ( $row->[3] lt $field->[-2] ) # subfield which is not next (e.g. a after c)
397 ||
398 ( $row->[3] eq $field->[-2] && # same subfield, but not repeatable
399 ! $marc_repeatable_subfield->{ $fsf }
400 )
401 ) {
402 push @m, $field;
403 warn "## saved/2 ", dump( $field ),$/ if ($debug);
404 $field = $row;
405
406 } else {
407 # append new subfields to existing field
408 push @$field, ( $row->[3], $row->[4] );
409 }
410 } else {
411 # insert first field
412 $field = $row;
413 }
414
415 if (! $marc_repeatable_subfield->{ $fsf }) {
416 # make step to next subfield
417 $i = ($i + $u->{ $fsf } ) % ($len + 1);
418 }
419 }
420
421 if ($#$field >= 0) {
422 push @m, $field;
423 warn "## saved/3 ", dump( $field ),$/ if ($debug);
424 }
425
426 return \@m;
427 }
428
429 =head2 _get_marc_leader
430
431 Return leader from currently fetched record by L</_get_marc_fields>
432
433 print WebPAC::Normalize::_get_marc_leader();
434
435 =cut
436
437 sub _get_marc_leader {
438 die "no fetch_pos, did you called _get_marc_fields first?" unless ( defined( $fetch_pos ) );
439 return $marc_leader->[ $fetch_pos ];
440 }
441
442 =head2 _debug
443
444 Change level of debug warnings
445
446 _debug( 2 );
447
448 =cut
449
450 sub _debug {
451 my $l = shift;
452 return $debug unless defined($l);
453 warn "debug level $l",$/ if ($l > 0);
454 $debug = $l;
455 }
456
457 =head1 Functions to create C<data_structure>
458
459 Those functions generally have to first in your normalization file.
460
461 =head2 search_display
462
463 Define output for L<search> and L<display> at the same time
464
465 search_display('Title', rec('200','a') );
466
467
468 =cut
469
470 sub search_display {
471 my $name = shift or die "search_display needs name as first argument";
472 my @o = grep { defined($_) && $_ ne '' } @_;
473 return unless (@o);
474 $out->{$name}->{search} = \@o;
475 $out->{$name}->{display} = \@o;
476 }
477
478 =head2 tag
479
480 Old name for L<search_display>, but supported
481
482 =cut
483
484 sub tag {
485 search_display( @_ );
486 }
487
488 =head2 display
489
490 Define output just for I<display>
491
492 @v = display('Title', rec('200','a') );
493
494 =cut
495
496 sub _field {
497 my $type = shift or confess "need type -- BUG?";
498 my $name = shift or confess "needs name as first argument";
499 my @o = grep { defined($_) && $_ ne '' } @_;
500 return unless (@o);
501 $out->{$name}->{$type} = \@o;
502 }
503
504 sub display { _field( 'display', @_ ) }
505
506 =head2 search
507
508 Prepare values just for I<search>
509
510 @v = search('Title', rec('200','a') );
511
512 =cut
513
514 sub search { _field( 'search', @_ ) }
515
516 =head2 sorted
517
518 Insert into lists which will be automatically sorted
519
520 sorted('Title', rec('200','a') );
521
522 =cut
523
524 sub sorted { _field( 'sorted', @_ ) }
525
526
527 =head2 marc_leader
528
529 Setup fields within MARC leader or get leader
530
531 marc_leader('05','c');
532 my $leader = marc_leader();
533
534 =cut
535
536 sub marc_leader {
537 my ($offset,$value) = @_;
538
539 if ($offset) {
540 $marc_leader->[ $marc_record_offset ]->{ $offset } = $value;
541 } else {
542
543 if (defined($marc_leader)) {
544 die "marc_leader not array = ", dump( $marc_leader ) unless (ref($marc_leader) eq 'ARRAY');
545 return $marc_leader->[ $marc_record_offset ];
546 } else {
547 return;
548 }
549 }
550 }
551
552 =head2 marc_fixed
553
554 Create control/indentifier fields with values in fixed positions
555
556 marc_fixed('008', 00, '070402');
557 marc_fixed('008', 39, '|');
558
559 Positions not specified will be filled with spaces (C<0x20>).
560
561 There will be no effort to extend last specified value to full length of
562 field in standard.
563
564 =cut
565
566 sub marc_fixed {
567 my ($f, $pos, $val) = @_;
568 die "need marc(field, position, value)" unless defined($f) && defined($pos);
569
570 confess "need val" unless defined $val;
571
572 my $update = 0;
573
574 map {
575 if ($_->[0] eq $f) {
576 my $old = $_->[1];
577 if (length($old) <= $pos) {
578 $_->[1] .= ' ' x ( $pos - length($old) ) . $val;
579 warn "## marc_fixed($f,$pos,'$val') append '$old' -> '$_->[1]'\n" if ($debug > 1);
580 } else {
581 $_->[1] = substr($old, 0, $pos) . $val . substr($old, $pos + length($val));
582 warn "## marc_fixed($f,$pos,'$val') update '$old' -> '$_->[1]'\n" if ($debug > 1);
583 }
584 $update++;
585 }
586 } @{ $marc_record->[ $marc_record_offset ] };
587
588 if (! $update) {
589 my $v = ' ' x $pos . $val;
590 push @{ $marc_record->[ $marc_record_offset ] }, [ $f, $v ];
591 warn "## marc_fixed($f,$pos,'val') created '$v'\n" if ($debug > 1);
592 }
593 }
594
595 =head2 marc
596
597 Save value for MARC field
598
599 marc('900','a', rec('200','a') );
600 marc('001', rec('000') );
601
602 =cut
603
604 sub marc {
605 my $f = shift or die "marc needs field";
606 die "marc field must be numer" unless ($f =~ /^\d+$/);
607
608 my $sf;
609 if ($f >= 10) {
610 $sf = shift or die "marc needs subfield";
611 }
612
613 foreach (@_) {
614 my $v = $_; # make var read-write for Encode
615 next unless (defined($v) && $v !~ /^\s*$/);
616 my ($i1,$i2) = defined($marc_indicators->{$f}) ? @{ $marc_indicators->{$f} } : (' ',' ');
617 if (defined $sf) {
618 push @{ $marc_record->[ $marc_record_offset ] }, [ $f, $i1, $i2, $sf => $v ];
619 } else {
620 push @{ $marc_record->[ $marc_record_offset ] }, [ $f, $v ];
621 }
622 }
623 }
624
625 =head2 marc_repeatable_subfield
626
627 Save values for MARC repetable subfield
628
629 marc_repeatable_subfield('910', 'z', rec('909') );
630
631 =cut
632
633 sub marc_repeatable_subfield {
634 my ($f,$sf) = @_;
635 die "marc_repeatable_subfield need field and subfield!\n" unless ($f && $sf);
636 $marc_repeatable_subfield->{ $f . $sf }++;
637 marc(@_);
638 }
639
640 =head2 marc_indicators
641
642 Set both indicators for MARC field
643
644 marc_indicators('900', ' ', 1);
645
646 Any indicator value other than C<0-9> will be treated as undefined.
647
648 =cut
649
650 sub marc_indicators {
651 my $f = shift || die "marc_indicators need field!\n";
652 my ($i1,$i2) = @_;
653 die "marc_indicators($f, ...) need i1!\n" unless(defined($i1));
654 die "marc_indicators($f, $i1, ...) need i2!\n" unless(defined($i2));
655
656 $i1 = ' ' if ($i1 !~ /^\d$/);
657 $i2 = ' ' if ($i2 !~ /^\d$/);
658 @{ $marc_indicators->{$f} } = ($i1,$i2);
659 }
660
661 =head2 marc_compose
662
663 Save values for each MARC subfield explicitly
664
665 marc_compose('900',
666 'a', rec('200','a')
667 'b', rec('201','a')
668 'a', rec('200','b')
669 'c', rec('200','c')
670 );
671
672 If you specify C<+> for subfield, value will be appended
673 to previous defined subfield.
674
675 =cut
676
677 sub marc_compose {
678 my $f = shift or die "marc_compose needs field";
679 die "marc_compose field must be numer" unless ($f =~ /^\d+$/);
680
681 my ($i1,$i2) = defined($marc_indicators->{$f}) ? @{ $marc_indicators->{$f} } : (' ',' ');
682 my $m = [ $f, $i1, $i2 ];
683
684 warn "### marc_compose input subfields = ", dump(@_),$/ if ($debug > 2);
685
686 if ($#_ % 2 != 1) {
687 die "ERROR: marc_compose",dump($f,@_)," not valid (must be even).\nDo you need to add first() or join() around some argument?\n";
688 }
689
690 while (@_) {
691 my $sf = shift;
692 my $v = shift;
693
694 next unless (defined($v) && $v !~ /^\s*$/);
695 warn "## ++ marc_compose($f,$sf,$v) ", dump( $m ),$/ if ($debug > 1);
696 if ($sf ne '+') {
697 push @$m, ( $sf, $v );
698 } else {
699 $m->[ $#$m ] .= $v;
700 }
701 }
702
703 warn "## marc_compose current marc = ", dump( $m ),$/ if ($debug > 1);
704
705 push @{ $marc_record->[ $marc_record_offset ] }, $m if ($#{$m} > 2);
706 }
707
708 =head2 marc_duplicate
709
710 Generate copy of current MARC record and continue working on copy
711
712 marc_duplicate();
713
714 Copies can be accessed using C<< _get_marc_fields( fetch_next => 1 ) >> or
715 C<< _get_marc_fields( offset => 42 ) >>.
716
717 =cut
718
719 sub marc_duplicate {
720 my $m = $marc_record->[ -1 ];
721 die "can't duplicate record which isn't defined" unless ($m);
722 push @{ $marc_record }, dclone( $m );
723 push @{ $marc_leader }, dclone( marc_leader() );
724 warn "## marc_duplicate = ", dump(@$marc_leader, @$marc_record), $/ if ($debug > 1);
725 $marc_record_offset = $#{ $marc_record };
726 warn "## marc_record_offset = $marc_record_offset", $/ if ($debug > 1);
727
728 }
729
730 =head2 marc_remove
731
732 Remove some field or subfield from MARC record.
733
734 marc_remove('200');
735 marc_remove('200','a');
736
737 This will erase field C<200> or C<200^a> from current MARC record.
738
739 marc_remove('*');
740
741 Will remove all fields in current MARC record.
742
743 This is useful after calling C<marc_duplicate> or on it's own (but, you
744 should probably just remove that subfield definition if you are not
745 using C<marc_duplicate>).
746
747 FIXME: support fields < 10.
748
749 =cut
750
751 sub marc_remove {
752 my ($f, $sf) = @_;
753
754 die "marc_remove needs record number" unless defined($f);
755
756 my $marc = $marc_record->[ $marc_record_offset ];
757
758 warn "### marc_remove before = ", dump( $marc ), $/ if ($debug > 2);
759
760 if ($f eq '*') {
761
762 delete( $marc_record->[ $marc_record_offset ] );
763 warn "## full marc_record = ", dump( @{ $marc_record }), $/ if ($debug > 1);
764
765 } else {
766
767 my $i = 0;
768 foreach ( 0 .. $#{ $marc } ) {
769 last unless (defined $marc->[$i]);
770 warn "#### working on ",dump( @{ $marc->[$i] }), $/ if ($debug > 3);
771 if ($marc->[$i]->[0] eq $f) {
772 if (! defined $sf) {
773 # remove whole field
774 splice @$marc, $i, 1;
775 warn "#### slice \@\$marc, $i, 1 = ",dump( @{ $marc }), $/ if ($debug > 3);
776 $i--;
777 } else {
778 foreach my $j ( 0 .. (( $#{ $marc->[$i] } - 3 ) / 2) ) {
779 my $o = ($j * 2) + 3;
780 if ($marc->[$i]->[$o] eq $sf) {
781 # remove subfield
782 splice @{$marc->[$i]}, $o, 2;
783 warn "#### slice \@{\$marc->[$i]}, $o, 2 = ", dump( @{ $marc }), $/ if ($debug > 3);
784 # is record now empty?
785 if ($#{ $marc->[$i] } == 2) {
786 splice @$marc, $i, 1;
787 warn "#### slice \@\$marc, $i, 1 = ", dump( @{ $marc }), $/ if ($debug > 3);
788 $i--;
789 };
790 }
791 }
792 }
793 }
794 $i++;
795 }
796
797 warn "### marc_remove($f", $sf ? ",$sf" : "", ") after = ", dump( $marc ), $/ if ($debug > 2);
798
799 $marc_record->[ $marc_record_offset ] = $marc;
800 }
801
802 warn "## full marc_record = ", dump( @{ $marc_record }), $/ if ($debug > 1);
803 }
804
805 =head2 marc_original_order
806
807 Copy all subfields preserving original order to marc field.
808
809 marc_original_order( marc_field_number, original_input_field_number );
810
811 Please note that field numbers are consistent with other commands (marc
812 field number first), but somewhat counter-intuitive (destination and then
813 source).
814
815 You might want to use this command if you are just renaming subfields or
816 using pre-processing modify_record in C<config.yml> and don't need any
817 post-processing or want to preserve order of original subfields.
818
819
820 =cut
821
822 sub marc_original_order {
823
824 my ($to, $from) = @_;
825 die "marc_original_order needs from and to fields\n" unless ($from && $to);
826
827 return unless defined($rec->{$from});
828
829 my $r = $rec->{$from};
830 die "record field $from isn't array ",dump( $rec ) unless (ref($r) eq 'ARRAY');
831
832 my ($i1,$i2) = defined($marc_indicators->{$to}) ? @{ $marc_indicators->{$to} } : (' ',' ');
833 warn "## marc_original_order($to,$from) source = ", dump( $r ),$/ if ($debug > 1);
834
835 foreach my $d (@$r) {
836
837 if (! defined($d->{subfields}) && ref($d->{subfields}) ne 'ARRAY') {
838 warn "# marc_original_order($to,$from): field $from doesn't have subfields specification\n";
839 next;
840 }
841
842 my @sfs = @{ $d->{subfields} };
843
844 die "field $from doesn't have even number of subfields specifications\n" unless($#sfs % 2 == 1);
845
846 warn "#--> d: ",dump($d), "\n#--> sfs: ",dump(@sfs),$/ if ($debug > 2);
847
848 my $m = [ $to, $i1, $i2 ];
849
850 while (my $sf = shift @sfs) {
851
852 warn "#--> sf: ",dump($sf), $/ if ($debug > 2);
853 my $offset = shift @sfs;
854 die "corrupted sufields specification for field $from\n" unless defined($offset);
855
856 my $v;
857 if (ref($d->{$sf}) eq 'ARRAY') {
858 $v = $d->{$sf}->[$offset] if (defined($d->{$sf}->[$offset]));
859 } elsif ($offset == 0) {
860 $v = $d->{$sf};
861 } else {
862 die "field $from subfield '$sf' need occurence $offset which doesn't exist", dump($d->{$sf});
863 }
864 push @$m, ( $sf, $v ) if (defined($v));
865 }
866
867 if ($#{$m} > 2) {
868 push @{ $marc_record->[ $marc_record_offset ] }, $m;
869 }
870 }
871
872 warn "## marc_record = ", dump( $marc_record ),$/ if ($debug > 1);
873 }
874
875
876 =head2 marc_count
877
878 Return number of MARC records created using L</marc_duplicate>.
879
880 print "created ", marc_count(), " records";
881
882 =cut
883
884 sub marc_count {
885 return $#{ $marc_record };
886 }
887
888 =head2 _marc_push
889
890 _marc_push( $marc );
891
892 =cut
893
894 sub _marc_push {
895 my $marc = shift || die "no marc?";
896 push @{ $marc_record->[ $marc_record_offset ] }, $marc;
897 }
898
899
900 =head1 Functions to extract data from input
901
902 This function should be used inside functions to create C<data_structure> described
903 above.
904
905 =head2 _pack_subfields_hash
906
907 @subfields = _pack_subfields_hash( $h );
908 $subfields = _pack_subfields_hash( $h, 1 );
909
910 Return each subfield value in array or pack them all together and return scalar
911 with subfields (denoted by C<^>) and values.
912
913 =cut
914
915 sub _pack_subfields_hash {
916
917 warn "## _pack_subfields_hash( ",dump(@_), " )\n" if ($debug > 1);
918
919 my ($h,$include_subfields) = @_;
920
921 # sanity and ease of use
922 return $h if (ref($h) ne 'HASH');
923
924 if ( defined($h->{subfields}) ) {
925 my $sfs = delete $h->{subfields} || die "no subfields?";
926 my @out;
927 while (@$sfs) {
928 my $sf = shift @$sfs;
929 push @out, '^' . $sf if ($include_subfields);
930 my $o = shift @$sfs;
931 if ($o == 0 && ref( $h->{$sf} ) ne 'ARRAY' ) {
932 # single element subfields are not arrays
933 #warn "====> $sf $o / $#$sfs ", dump( $sfs, $h->{$sf} ), "\n";
934
935 push @out, $h->{$sf};
936 } else {
937 #warn "====> $sf $o / $#$sfs ", dump( $sfs, $h->{$sf} ), "\n";
938 push @out, $h->{$sf}->[$o];
939 }
940 }
941 if ($include_subfields) {
942 return join('', @out);
943 } else {
944 return @out;
945 }
946 } else {
947 if ($include_subfields) {
948 my $out = '';
949 foreach my $sf (sort keys %$h) {
950 if (ref($h->{$sf}) eq 'ARRAY') {
951 $out .= '^' . $sf . join('^' . $sf, @{ $h->{$sf} });
952 } else {
953 $out .= '^' . $sf . $h->{$sf};
954 }
955 }
956 return $out;
957 } else {
958 # FIXME this should probably be in alphabetical order instead of hash order
959 values %{$h};
960 }
961 }
962 }
963
964 =head2 rec1
965
966 Return all values in some field
967
968 @v = rec1('200')
969
970 TODO: order of values is probably same as in source data, need to investigate that
971
972 =cut
973
974 sub rec1 {
975 my $f = shift;
976 warn "rec1($f) = ", dump( $rec->{$f} ), $/ if ($debug > 1);
977 return unless (defined($rec) && defined($rec->{$f}));
978 warn "rec1($f) = ", dump( $rec->{$f} ), $/ if ($debug > 1);
979 if (ref($rec->{$f}) eq 'ARRAY') {
980 my @out;
981 foreach my $h ( @{ $rec->{$f} } ) {
982 if (ref($h) eq 'HASH') {
983 push @out, ( _pack_subfields_hash( $h ) );
984 } else {
985 push @out, $h;
986 }
987 }
988 return @out;
989 } elsif( defined($rec->{$f}) ) {
990 return $rec->{$f};
991 }
992 }
993
994 =head2 rec2
995
996 Return all values in specific field and subfield
997
998 @v = rec2('200','a')
999
1000 =cut
1001
1002 sub rec2 {
1003 my $f = shift;
1004 return unless (defined($rec && $rec->{$f}));
1005 my $sf = shift;
1006 warn "rec2($f,$sf) = ", dump( $rec->{$f} ), $/ if ($debug > 1);
1007 return map {
1008 if (ref($_->{$sf}) eq 'ARRAY') {
1009 @{ $_->{$sf} };
1010 } else {
1011 $_->{$sf};
1012 }
1013 } grep { ref($_) eq 'HASH' && $_->{$sf} } @{ $rec->{$f} };
1014 }
1015
1016 =head2 rec
1017
1018 syntaxtic sugar for
1019
1020 @v = rec('200')
1021 @v = rec('200','a')
1022
1023 If rec() returns just single value, it will
1024 return scalar, not array.
1025
1026 =cut
1027
1028 sub rec {
1029 my @out;
1030 if ($#_ == 0) {
1031 @out = rec1(@_);
1032 } elsif ($#_ == 1) {
1033 @out = rec2(@_);
1034 }
1035 if ($#out == 0 && ! wantarray) {
1036 return $out[0];
1037 } elsif (@out) {
1038 return @out;
1039 } else {
1040 return '';
1041 }
1042 }
1043
1044 =head2 frec
1045
1046 Returns first value from field
1047
1048 $v = frec('200');
1049 $v = frec('200','a');
1050
1051 =cut
1052
1053 sub frec {
1054 my @out = rec(@_);
1055 warn "rec(",dump(@_),") has more than one return value, ignoring\n" if $#out > 0;
1056 return shift @out;
1057 }
1058
1059 =head2 frec_eq
1060
1061 =head2 frec_ne
1062
1063 Check if first values from two fields are same or different
1064
1065 if ( frec_eq( 900 => 'a', 910 => 'c' ) ) {
1066 # values are same
1067 } else {
1068 # values are different
1069 }
1070
1071 Strictly speaking C<frec_eq> and C<frec_ne> wouldn't be needed if you
1072 could write something like:
1073
1074 if ( frec( '900','a' ) eq frec( '910','c' ) ) {
1075 # yada tada
1076 }
1077
1078 but you can't since our parser L<WebPAC::Parser> will remove all whitespaces
1079 in order to parse text and create invalid function C<eqfrec>.
1080
1081 =cut
1082
1083 sub frec_eq {
1084 my ( $f1,$sf1, $f2, $sf2 ) = @_;
1085 return (rec( $f1, $sf1 ))[0] eq (rec( $f2, $sf2 ))[0];
1086 }
1087
1088 sub frec_ne {
1089 return ! frec_eq( @_ );
1090 }
1091
1092 =head2 regex
1093
1094 Apply regex to some or all values
1095
1096 @v = regex( 's/foo/bar/g', @v );
1097
1098 =cut
1099
1100 sub regex {
1101 my $r = shift;
1102 my @out;
1103 #warn "r: $r\n", dump(\@_);
1104 foreach my $t (@_) {
1105 next unless ($t);
1106 eval "\$t =~ $r";
1107 push @out, $t if ($t && $t ne '');
1108 }
1109 return @out;
1110 }
1111
1112 =head2 prefix
1113
1114 Prefix all values with a string
1115
1116 @v = prefix( 'my_', @v );
1117
1118 =cut
1119
1120 sub prefix {
1121 my $p = shift;
1122 return @_ unless defined( $p );
1123 return map { $p . $_ } grep { defined($_) } @_;
1124 }
1125
1126 =head2 suffix
1127
1128 suffix all values with a string
1129
1130 @v = suffix( '_my', @v );
1131
1132 =cut
1133
1134 sub suffix {
1135 my $s = shift;
1136 return @_ unless defined( $s );
1137 return map { $_ . $s } grep { defined($_) } @_;
1138 }
1139
1140 =head2 surround
1141
1142 surround all values with a two strings
1143
1144 @v = surround( 'prefix_', '_suffix', @v );
1145
1146 =cut
1147
1148 sub surround {
1149 my $p = shift;
1150 my $s = shift;
1151 $p = '' unless defined( $p );
1152 $s = '' unless defined( $s );
1153 return map { $p . $_ . $s } grep { defined($_) } @_;
1154 }
1155
1156 =head2 first
1157
1158 Return first element
1159
1160 $v = first( @v );
1161
1162 =cut
1163
1164 sub first {
1165 my $r = shift;
1166 return $r;
1167 }
1168
1169 =head2 lookup
1170
1171 Consult lookup hashes for some value
1172
1173 @v = lookup(
1174 sub {
1175 'ffkk/peri/mfn'.rec('000')
1176 },
1177 'ffkk','peri','200-a-200-e',
1178 sub {
1179 first(rec(200,'a')).' '.first(rec('200','e'))
1180 }
1181 );
1182
1183 Code like above will be B<automatically generated> using L<WebPAC::Parse> from
1184 normal lookup definition in C<conf/lookup/something.pl> which looks like:
1185
1186 lookup(
1187 # which results to return from record recorded in lookup
1188 sub { 'ffkk/peri/mfn' . rec('000') },
1189 # from which database and input
1190 'ffkk','peri',
1191 # such that following values match
1192 sub { first(rec(200,'a')) . ' ' . first(rec('200','e')) },
1193 # if this part is missing, we will try to match same fields
1194 # from lookup record and current one, or you can override
1195 # which records to use from current record using
1196 sub { rec('900','x') . ' ' . rec('900','y') },
1197 )
1198
1199 You can think about this lookup as SQL (if that helps):
1200
1201 select
1202 sub { what }
1203 from
1204 database, input
1205 where
1206 sub { filter from lookuped record }
1207 having
1208 sub { optional filter on current record }
1209
1210 Easy as pie, right?
1211
1212 =cut
1213
1214 sub lookup {
1215 my ($what, $database, $input, $key, $having) = @_;
1216
1217 confess "lookup needs 5 arguments: what, database, input, key, having\n" unless ($#_ == 4);
1218
1219 warn "## lookup ($database, $input, $key)", $/ if ($debug > 1);
1220 return unless (defined($lookup->{$database}->{$input}->{$key}));
1221
1222 confess "lookup really need load_row_coderef added to data_structure\n" unless ($load_row_coderef);
1223
1224 my $mfns;
1225 my @having = $having->();
1226
1227 warn "## having = ", dump( @having ) if ($debug > 2);
1228
1229 foreach my $h ( @having ) {
1230 if (defined($lookup->{$database}->{$input}->{$key}->{$h})) {
1231 warn "lookup for $database/$input/$key/$h return ",dump($lookup->{$database}->{$input}->{$key}->{$h}),"\n" if ($debug);
1232 $mfns->{$_}++ foreach keys %{ $lookup->{$database}->{$input}->{$key}->{$h} };
1233 }
1234 }
1235
1236 return unless ($mfns);
1237
1238 my @mfns = sort keys %$mfns;
1239
1240 warn "# lookup loading $database/$input/$key mfn ", join(",",@mfns)," having ",dump(@having),"\n" if ($debug);
1241
1242 my $old_rec = $rec;
1243 my @out;
1244
1245 foreach my $mfn (@mfns) {
1246 $rec = $load_row_coderef->( $database, $input, $mfn );
1247
1248 warn "got $database/$input/$mfn = ", dump($rec), $/ if ($debug);
1249
1250 my @vals = $what->();
1251
1252 push @out, ( @vals );
1253
1254 warn "lookup for mfn $mfn returned ", dump(@vals), $/ if ($debug);
1255 }
1256
1257 # if (ref($lookup->{$k}) eq 'ARRAY') {
1258 # return @{ $lookup->{$k} };
1259 # } else {
1260 # return $lookup->{$k};
1261 # }
1262
1263 $rec = $old_rec;
1264
1265 warn "## lookup returns = ", dump(@out), $/ if ($debug);
1266
1267 if ($#out == 0) {
1268 return $out[0];
1269 } else {
1270 return @out;
1271 }
1272 }
1273
1274 =head2 save_into_lookup
1275
1276 Save value into lookup. It associates current database, input
1277 and specific keys with one or more values which will be
1278 associated over MFN.
1279
1280 MFN will be extracted from first occurence current of field 000
1281 in current record, or if it doesn't exist from L<_set_config> C<_mfn>.
1282
1283 my $nr = save_into_lookup($database,$input,$key,sub {
1284 # code which produce one or more values
1285 });
1286
1287 It returns number of items saved.
1288
1289 This function shouldn't be called directly, it's called from code created by
1290 L<WebPAC::Parser>.
1291
1292 =cut
1293
1294 sub save_into_lookup {
1295 my ($database,$input,$key,$coderef) = @_;
1296 die "save_into_lookup needs database" unless defined($database);
1297 die "save_into_lookup needs input" unless defined($input);
1298 die "save_into_lookup needs key" unless defined($key);
1299 die "save_into_lookup needs CODE" unless ( defined($coderef) && ref($coderef) eq 'CODE' );
1300
1301 warn "## save_into_lookup rec = ", dump($rec), " config = ", dump($config), $/ if ($debug > 2);
1302
1303 my $mfn =
1304 defined($rec->{'000'}->[0]) ? $rec->{'000'}->[0] :
1305 defined($config->{_mfn}) ? $config->{_mfn} :
1306 die "mfn not defined or zero";
1307
1308 my $nr = 0;
1309
1310 foreach my $v ( $coderef->() ) {
1311 $lookup->{$database}->{$input}->{$key}->{$v}->{$mfn}++;
1312 warn "# saved lookup $database/$input/$key [$v] $mfn\n" if ($debug > 1);
1313 $nr++;
1314 }
1315
1316 return $nr;
1317 }
1318
1319 =head2 config
1320
1321 Consult config values stored in C<config.yml>
1322
1323 # return database code (key under databases in yaml)
1324 $database_code = config(); # use _ from hash
1325 $database_name = config('name');
1326 $database_input_name = config('input name');
1327
1328 Up to three levels are supported.
1329
1330 =cut
1331
1332 sub config {
1333 return unless ($config);
1334
1335 my $p = shift;
1336
1337 $p ||= '';
1338
1339 my $v;
1340
1341 warn "### getting config($p)\n" if ($debug > 1);
1342
1343 my @p = split(/\s+/,$p);
1344 if ($#p < 0) {
1345 $v = $config->{ '_' }; # special, database code
1346 } else {
1347
1348 my $c = dclone( $config );
1349
1350 foreach my $k (@p) {
1351 warn "### k: $k c = ",dump($c),$/ if ($debug > 1);
1352 if (ref($c) eq 'ARRAY') {
1353 $c = shift @$c;
1354 warn "config($p) taking first occurence of '$k', probably not what you wanted!\n";
1355 last;
1356 }
1357
1358 if (! defined($c->{$k}) ) {
1359 $c = undef;
1360 last;
1361 } else {
1362 $c = $c->{$k};
1363 }
1364 }
1365 $v = $c if ($c);
1366
1367 }
1368
1369 warn "## config( '$p' ) = ",dump( $v ),$/ if ($v && $debug);
1370 warn "config( '$p' ) is empty\n" if (! $v);
1371
1372 return $v;
1373 }
1374
1375 =head2 id
1376
1377 Returns unique id of this record
1378
1379 $id = id();
1380
1381 Returns C<42/2> for 2nd occurence of MFN 42.
1382
1383 =cut
1384
1385 sub id {
1386 my $mfn = $config->{_mfn} || die "no _mfn in config data";
1387 return $mfn . $#{$marc_record} ? $#{$marc_record} + 1 : '';
1388 }
1389
1390 =head2 join_with
1391
1392 Joins walues with some delimiter
1393
1394 $v = join_with(", ", @v);
1395
1396 =cut
1397
1398 sub join_with {
1399 my $d = shift;
1400 warn "### join_with('$d',",dump(@_),")\n" if ($debug > 2);
1401 my $v = join($d, grep { defined($_) && $_ ne '' } @_);
1402 return '' unless defined($v);
1403 return $v;
1404 }
1405
1406 =head2 split_rec_on
1407
1408 Split record subfield on some regex and take one of parts out
1409
1410 $a_before_semi_column =
1411 split_rec_on('200','a', /\s*;\s*/, $part);
1412
1413 C<$part> is optional number of element. First element is
1414 B<1>, not 0!
1415
1416 If there is no C<$part> parameter or C<$part> is 0, this function will
1417 return all values produced by splitting.
1418
1419 =cut
1420
1421 sub split_rec_on {
1422 die "split_rec_on need (fld,sf,regex[,part]" if ($#_ < 2);
1423
1424 my ($fld, $sf, $regex, $part) = @_;
1425 warn "### regex ", ref($regex), $regex, $/ if ($debug > 2);
1426
1427 my @r = rec( $fld, $sf );
1428 my $v = shift @r;
1429 warn "### first rec($fld,$sf) = ",dump($v),$/ if ($debug > 2);
1430
1431 return '' if ( ! defined($v) || $v =~ /^\s*$/);
1432
1433 my @s = split( $regex, $v );
1434 warn "## split_rec_on($fld,$sf,$regex,$part) = ",dump(@s),$/ if ($debug > 1);
1435 if ($part && $part > 0) {
1436 return $s[ $part - 1 ];
1437 } else {
1438 return @s;
1439 }
1440 }
1441
1442 my $hash;
1443
1444 =head2 set
1445
1446 set( key => 'value' );
1447
1448 =cut
1449
1450 sub set {
1451 my ($k,$v) = @_;
1452 warn "## set ( $k => ", dump($v), " )", $/ if ( $debug );
1453 $hash->{$k} = $v;
1454 };
1455
1456 =head2 get
1457
1458 get( 'key' );
1459
1460 =cut
1461
1462 sub get {
1463 my $k = shift || return;
1464 my $v = $hash->{$k};
1465 warn "## get $k = ", dump( $v ), $/ if ( $debug );
1466 return $v;
1467 }
1468
1469 =head2 count
1470
1471 if ( count( @result ) == 1 ) {
1472 # do something if only 1 result is there
1473 }
1474
1475 =cut
1476
1477 sub count {
1478 warn "## count ",dump(@_),$/ if ( $debug );
1479 return @_ . '';
1480 }
1481
1482 # END
1483 1;

  ViewVC Help
Powered by ViewVC 1.1.26