/[webpac2]/trunk/lib/WebPAC/Normalize.pm
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Contents of /trunk/lib/WebPAC/Normalize.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 979 - (show annotations)
Sat Nov 3 12:37:43 2007 UTC (16 years, 4 months ago) by dpavlin
File size: 30855 byte(s)
added frec which returns first occurence of record
as scalar (for eq/ne comparison and things like that)

1 package WebPAC::Normalize;
2 use Exporter 'import';
3 @EXPORT = qw/
4 _set_rec _set_lookup
5 _set_load_row
6 _get_ds _clean_ds
7 _debug
8 _pack_subfields_hash
9
10 search_display search display sorted
11
12 marc marc_indicators marc_repeatable_subfield
13 marc_compose marc_leader marc_fixed
14 marc_duplicate marc_remove marc_count
15 marc_original_order
16
17 rec1 rec2 rec
18 frec
19 regex prefix suffix surround
20 first lookup join_with
21 save_into_lookup
22
23 split_rec_on
24
25 get set
26 count
27 /;
28
29 use warnings;
30 use strict;
31
32 #use base qw/WebPAC::Common/;
33 use Data::Dump qw/dump/;
34 use Storable qw/dclone/;
35 use Carp qw/confess/;
36
37 # debugging warn(s)
38 my $debug = 0;
39
40
41 =head1 NAME
42
43 WebPAC::Normalize - describe normalisaton rules using sets
44
45 =cut
46
47 our $VERSION = '0.31';
48
49 =head1 SYNOPSIS
50
51 This module uses C<conf/normalize/*.pl> files to perform normalisation
52 from input records using perl functions which are specialized for set
53 processing.
54
55 Sets are implemented as arrays, and normalisation file is valid perl, which
56 means that you check it's validity before running WebPAC using
57 C<perl -c normalize.pl>.
58
59 Normalisation can generate multiple output normalized data. For now, supported output
60 types (on the left side of definition) are: C<search_display>, C<display>, C<search> and
61 C<marc>.
62
63 =head1 FUNCTIONS
64
65 Functions which start with C<_> are private and used by WebPAC internally.
66 All other functions are available for use within normalisation rules.
67
68 =head2 data_structure
69
70 Return data structure
71
72 my $ds = WebPAC::Normalize::data_structure(
73 lookup => $lookup_hash,
74 row => $row,
75 rules => $normalize_pl_config,
76 marc_encoding => 'utf-8',
77 config => $config,
78 load_row_coderef => sub {
79 my ($database,$input,$mfn) = @_;
80 $store->load_row( database => $database, input => $input, id => $mfn );
81 },
82 );
83
84 Options C<row>, C<rules> and C<log> are mandatory while all
85 other are optional.
86
87 C<load_row_coderef> is closure only used when executing lookups, so they will
88 die if it's not defined.
89
90 This function will B<die> if normalizastion can't be evaled.
91
92 Since this function isn't exported you have to call it with
93 C<WebPAC::Normalize::data_structure>.
94
95 =cut
96
97 my $load_row_coderef;
98
99 sub data_structure {
100 my $arg = {@_};
101
102 die "need row argument" unless ($arg->{row});
103 die "need normalisation argument" unless ($arg->{rules});
104
105 # FIXME load this conditionally
106 # use WebPAC::Normalize::ISBN;
107
108 no strict 'subs';
109 _set_lookup( $arg->{lookup} ) if defined($arg->{lookup});
110 _set_rec( $arg->{row} );
111 _set_config( $arg->{config} ) if defined($arg->{config});
112 _clean_ds( %{ $arg } );
113 $load_row_coderef = $arg->{load_row_coderef};
114
115 eval "$arg->{rules}";
116 die "error evaling $arg->{rules}: $@\n" if ($@);
117
118 return _get_ds();
119 }
120
121 =head2 _set_rec
122
123 Set current record hash
124
125 _set_rec( $rec );
126
127 =cut
128
129 my $rec;
130
131 sub _set_rec {
132 $rec = shift or die "no record hash";
133 }
134
135 =head2 _set_config
136
137 Set current config hash
138
139 _set_config( $config );
140
141 Magic keys are:
142
143 =over 4
144
145 =item _
146
147 Code of current database
148
149 =item _mfn
150
151 Current MFN
152
153 =back
154
155 =cut
156
157 my $config;
158
159 sub _set_config {
160 $config = shift;
161 }
162
163 =head2 _get_ds
164
165 Return hash formatted as data structure
166
167 my $ds = _get_ds();
168
169 =cut
170
171 my ($out, $marc_record, $marc_encoding, $marc_repeatable_subfield, $marc_indicators, $marc_leader);
172 my ($marc_record_offset, $marc_fetch_offset) = (0, 0);
173
174 sub _get_ds {
175
176 warn "## out = ",dump($out);
177
178 return $out;
179 }
180
181 =head2 _clean_ds
182
183 Clean data structure hash for next record
184
185 _clean_ds();
186
187 =cut
188
189 sub _clean_ds {
190 my $a = {@_};
191 ($out,$marc_record, $marc_encoding, $marc_repeatable_subfield, $marc_indicators, $marc_leader) = ();
192 ($marc_record_offset, $marc_fetch_offset) = (0,0);
193 $marc_encoding = $a->{marc_encoding};
194 }
195
196 =head2 _set_lookup
197
198 Set current lookup hash
199
200 _set_lookup( $lookup );
201
202 =cut
203
204 my $lookup;
205
206 sub _set_lookup {
207 $lookup = shift;
208 }
209
210 =head2 _get_lookup
211
212 Get current lookup hash
213
214 my $lookup = _get_lookup();
215
216 =cut
217
218 sub _get_lookup {
219 return $lookup;
220 }
221
222 =head2 _set_load_row
223
224 Setup code reference which will return L<data_structure> from
225 L<WebPAC::Store>
226
227 _set_load_row(sub {
228 my ($database,$input,$mfn) = @_;
229 $store->load_row( database => $database, input => $input, id => $mfn );
230 });
231
232 =cut
233
234 sub _set_load_row {
235 my $coderef = shift;
236 confess "argument isn't CODE" unless ref($coderef) eq 'CODE';
237
238 $load_row_coderef = $coderef;
239 }
240
241 =head2 _get_marc_fields
242
243 Get all fields defined by calls to C<marc>
244
245 $marc->add_fields( WebPAC::Normalize:_get_marc_fields() );
246
247 We are using I<magic> which detect repeatable fields only from
248 sequence of field/subfield data generated by normalization.
249
250 Repeatable field is created when there is second occurence of same subfield or
251 if any of indicators are different.
252
253 This is sane for most cases. Something like:
254
255 900a-1 900b-1 900c-1
256 900a-2 900b-2
257 900a-3
258
259 will be created from any combination of:
260
261 900a-1 900a-2 900a-3 900b-1 900b-2 900c-1
262
263 and following rules:
264
265 marc('900','a', rec('200','a') );
266 marc('900','b', rec('200','b') );
267 marc('900','c', rec('200','c') );
268
269 which might not be what you have in mind. If you need repeatable subfield,
270 define it using C<marc_repeatable_subfield> like this:
271
272 marc_repeatable_subfield('900','a');
273 marc('900','a', rec('200','a') );
274 marc('900','b', rec('200','b') );
275 marc('900','c', rec('200','c') );
276
277 will create:
278
279 900a-1 900a-2 900a-3 900b-1 900c-1
280 900b-2
281
282 There is also support for returning next or specific using:
283
284 while (my $mf = WebPAC::Normalize:_get_marc_fields( fetch_next => 1 ) ) {
285 # do something with $mf
286 }
287
288 will always return fields from next MARC record or
289
290 my $mf = WebPAC::Normalize::_get_marc_fields( offset => 42 );
291
292 will return 42th copy record (if it exists).
293
294 =cut
295
296 my $fetch_pos;
297
298 sub _get_marc_fields {
299
300 my $arg = {@_};
301 warn "### _get_marc_fields arg: ", dump($arg), $/ if ($debug > 2);
302 $fetch_pos = $marc_fetch_offset;
303 if ($arg->{offset}) {
304 $fetch_pos = $arg->{offset};
305 } elsif($arg->{fetch_next}) {
306 $marc_fetch_offset++;
307 }
308
309 return if (! $marc_record || ref($marc_record) ne 'ARRAY');
310
311 warn "### full marc_record = ", dump( @{ $marc_record }), $/ if ($debug > 2);
312
313 my $marc_rec = $marc_record->[ $fetch_pos ];
314
315 warn "## _get_marc_fields (at offset: $fetch_pos) -- marc_record = ", dump( @$marc_rec ), $/ if ($debug > 1);
316
317 return if (! $marc_rec || ref($marc_rec) ne 'ARRAY' || $#{ $marc_rec } < 0);
318
319 # first, sort all existing fields
320 # XXX might not be needed, but modern perl might randomize elements in hash
321 my @sorted_marc_record = sort {
322 $a->[0] . ( $a->[3] || '' ) cmp $b->[0] . ( $b->[3] || '')
323 } @{ $marc_rec };
324
325 @sorted_marc_record = @{ $marc_rec }; ### FIXME disable sorting
326
327 # output marc fields
328 my @m;
329
330 # count unique field-subfields (used for offset when walking to next subfield)
331 my $u;
332 map { $u->{ $_->[0] . ( $_->[3] || '') }++ } @sorted_marc_record;
333
334 if ($debug) {
335 warn "## marc_repeatable_subfield = ", dump( $marc_repeatable_subfield ), $/ if ( $marc_repeatable_subfield );
336 warn "## marc_record[$fetch_pos] = ", dump( $marc_rec ), $/;
337 warn "## sorted_marc_record = ", dump( \@sorted_marc_record ), $/;
338 warn "## subfield count = ", dump( $u ), $/;
339 }
340
341 my $len = $#sorted_marc_record;
342 my $visited;
343 my $i = 0;
344 my $field;
345
346 foreach ( 0 .. $len ) {
347
348 # find next element which isn't visited
349 while ($visited->{$i}) {
350 $i = ($i + 1) % ($len + 1);
351 }
352
353 # mark it visited
354 $visited->{$i}++;
355
356 my $row = dclone( $sorted_marc_record[$i] );
357
358 # field and subfield which is key for
359 # marc_repeatable_subfield and u
360 my $fsf = $row->[0] . ( $row->[3] || '' );
361
362 if ($debug > 1) {
363
364 print "### field so far [", $#$field, "] : ", dump( $field ), " ", $field ? 'T' : 'F', $/;
365 print "### this [$i]: ", dump( $row ),$/;
366 print "### sf: ", $row->[3], " vs ", $field->[3],
367 $marc_repeatable_subfield->{ $row->[0] . $row->[3] } ? ' (repeatable)' : '', $/,
368 if ($#$field >= 0);
369
370 }
371
372 # if field exists
373 if ( $#$field >= 0 ) {
374 if (
375 $row->[0] ne $field->[0] || # field
376 $row->[1] ne $field->[1] || # i1
377 $row->[2] ne $field->[2] # i2
378 ) {
379 push @m, $field;
380 warn "## saved/1 ", dump( $field ),$/ if ($debug);
381 $field = $row;
382
383 } elsif (
384 ( $row->[3] lt $field->[-2] ) # subfield which is not next (e.g. a after c)
385 ||
386 ( $row->[3] eq $field->[-2] && # same subfield, but not repeatable
387 ! $marc_repeatable_subfield->{ $fsf }
388 )
389 ) {
390 push @m, $field;
391 warn "## saved/2 ", dump( $field ),$/ if ($debug);
392 $field = $row;
393
394 } else {
395 # append new subfields to existing field
396 push @$field, ( $row->[3], $row->[4] );
397 }
398 } else {
399 # insert first field
400 $field = $row;
401 }
402
403 if (! $marc_repeatable_subfield->{ $fsf }) {
404 # make step to next subfield
405 $i = ($i + $u->{ $fsf } ) % ($len + 1);
406 }
407 }
408
409 if ($#$field >= 0) {
410 push @m, $field;
411 warn "## saved/3 ", dump( $field ),$/ if ($debug);
412 }
413
414 return \@m;
415 }
416
417 =head2 _get_marc_leader
418
419 Return leader from currently fetched record by L</_get_marc_fields>
420
421 print WebPAC::Normalize::_get_marc_leader();
422
423 =cut
424
425 sub _get_marc_leader {
426 die "no fetch_pos, did you called _get_marc_fields first?" unless ( defined( $fetch_pos ) );
427 return $marc_leader->[ $fetch_pos ];
428 }
429
430 =head2 _debug
431
432 Change level of debug warnings
433
434 _debug( 2 );
435
436 =cut
437
438 sub _debug {
439 my $l = shift;
440 return $debug unless defined($l);
441 warn "debug level $l",$/ if ($l > 0);
442 $debug = $l;
443 }
444
445 =head1 Functions to create C<data_structure>
446
447 Those functions generally have to first in your normalization file.
448
449 =head2 search_display
450
451 Define output for L<search> and L<display> at the same time
452
453 search_display('Title', rec('200','a') );
454
455
456 =cut
457
458 sub search_display {
459 my $name = shift or die "search_display needs name as first argument";
460 my @o = grep { defined($_) && $_ ne '' } @_;
461 return unless (@o);
462 $out->{$name}->{search} = \@o;
463 $out->{$name}->{display} = \@o;
464 }
465
466 =head2 tag
467
468 Old name for L<search_display>, but supported
469
470 =cut
471
472 sub tag {
473 search_display( @_ );
474 }
475
476 =head2 display
477
478 Define output just for I<display>
479
480 @v = display('Title', rec('200','a') );
481
482 =cut
483
484 sub _field {
485 my $type = shift or confess "need type -- BUG?";
486 my $name = shift or confess "needs name as first argument";
487 my @o = grep { defined($_) && $_ ne '' } @_;
488 return unless (@o);
489 $out->{$name}->{$type} = \@o;
490 }
491
492 sub display { _field( 'display', @_ ) }
493
494 =head2 search
495
496 Prepare values just for I<search>
497
498 @v = search('Title', rec('200','a') );
499
500 =cut
501
502 sub search { _field( 'search', @_ ) }
503
504 =head2 sorted
505
506 Insert into lists which will be automatically sorted
507
508 sorted('Title', rec('200','a') );
509
510 =cut
511
512 sub sorted { _field( 'sorted', @_ ) }
513
514
515 =head2 marc_leader
516
517 Setup fields within MARC leader or get leader
518
519 marc_leader('05','c');
520 my $leader = marc_leader();
521
522 =cut
523
524 sub marc_leader {
525 my ($offset,$value) = @_;
526
527 if ($offset) {
528 $marc_leader->[ $marc_record_offset ]->{ $offset } = $value;
529 } else {
530
531 if (defined($marc_leader)) {
532 die "marc_leader not array = ", dump( $marc_leader ) unless (ref($marc_leader) eq 'ARRAY');
533 return $marc_leader->[ $marc_record_offset ];
534 } else {
535 return;
536 }
537 }
538 }
539
540 =head2 marc_fixed
541
542 Create control/indentifier fields with values in fixed positions
543
544 marc_fixed('008', 00, '070402');
545 marc_fixed('008', 39, '|');
546
547 Positions not specified will be filled with spaces (C<0x20>).
548
549 There will be no effort to extend last specified value to full length of
550 field in standard.
551
552 =cut
553
554 sub marc_fixed {
555 my ($f, $pos, $val) = @_;
556 die "need marc(field, position, value)" unless defined($f) && defined($pos);
557
558 confess "need val" unless defined $val;
559
560 my $update = 0;
561
562 map {
563 if ($_->[0] eq $f) {
564 my $old = $_->[1];
565 if (length($old) <= $pos) {
566 $_->[1] .= ' ' x ( $pos - length($old) ) . $val;
567 warn "## marc_fixed($f,$pos,'$val') append '$old' -> '$_->[1]'\n" if ($debug > 1);
568 } else {
569 $_->[1] = substr($old, 0, $pos) . $val . substr($old, $pos + length($val));
570 warn "## marc_fixed($f,$pos,'$val') update '$old' -> '$_->[1]'\n" if ($debug > 1);
571 }
572 $update++;
573 }
574 } @{ $marc_record->[ $marc_record_offset ] };
575
576 if (! $update) {
577 my $v = ' ' x $pos . $val;
578 push @{ $marc_record->[ $marc_record_offset ] }, [ $f, $v ];
579 warn "## marc_fixed($f,$pos,'val') created '$v'\n" if ($debug > 1);
580 }
581 }
582
583 =head2 marc
584
585 Save value for MARC field
586
587 marc('900','a', rec('200','a') );
588 marc('001', rec('000') );
589
590 =cut
591
592 sub marc {
593 my $f = shift or die "marc needs field";
594 die "marc field must be numer" unless ($f =~ /^\d+$/);
595
596 my $sf;
597 if ($f >= 10) {
598 $sf = shift or die "marc needs subfield";
599 }
600
601 foreach (@_) {
602 my $v = $_; # make var read-write for Encode
603 next unless (defined($v) && $v !~ /^\s*$/);
604 my ($i1,$i2) = defined($marc_indicators->{$f}) ? @{ $marc_indicators->{$f} } : (' ',' ');
605 if (defined $sf) {
606 push @{ $marc_record->[ $marc_record_offset ] }, [ $f, $i1, $i2, $sf => $v ];
607 } else {
608 push @{ $marc_record->[ $marc_record_offset ] }, [ $f, $v ];
609 }
610 }
611 }
612
613 =head2 marc_repeatable_subfield
614
615 Save values for MARC repetable subfield
616
617 marc_repeatable_subfield('910', 'z', rec('909') );
618
619 =cut
620
621 sub marc_repeatable_subfield {
622 my ($f,$sf) = @_;
623 die "marc_repeatable_subfield need field and subfield!\n" unless ($f && $sf);
624 $marc_repeatable_subfield->{ $f . $sf }++;
625 marc(@_);
626 }
627
628 =head2 marc_indicators
629
630 Set both indicators for MARC field
631
632 marc_indicators('900', ' ', 1);
633
634 Any indicator value other than C<0-9> will be treated as undefined.
635
636 =cut
637
638 sub marc_indicators {
639 my $f = shift || die "marc_indicators need field!\n";
640 my ($i1,$i2) = @_;
641 die "marc_indicators($f, ...) need i1!\n" unless(defined($i1));
642 die "marc_indicators($f, $i1, ...) need i2!\n" unless(defined($i2));
643
644 $i1 = ' ' if ($i1 !~ /^\d$/);
645 $i2 = ' ' if ($i2 !~ /^\d$/);
646 @{ $marc_indicators->{$f} } = ($i1,$i2);
647 }
648
649 =head2 marc_compose
650
651 Save values for each MARC subfield explicitly
652
653 marc_compose('900',
654 'a', rec('200','a')
655 'b', rec('201','a')
656 'a', rec('200','b')
657 'c', rec('200','c')
658 );
659
660 If you specify C<+> for subfield, value will be appended
661 to previous defined subfield.
662
663 =cut
664
665 sub marc_compose {
666 my $f = shift or die "marc_compose needs field";
667 die "marc_compose field must be numer" unless ($f =~ /^\d+$/);
668
669 my ($i1,$i2) = defined($marc_indicators->{$f}) ? @{ $marc_indicators->{$f} } : (' ',' ');
670 my $m = [ $f, $i1, $i2 ];
671
672 warn "### marc_compose input subfields = ", dump(@_),$/ if ($debug > 2);
673
674 if ($#_ % 2 != 1) {
675 die "ERROR: marc_compose",dump($f,@_)," not valid (must be even).\nDo you need to add first() or join() around some argument?\n";
676 }
677
678 while (@_) {
679 my $sf = shift;
680 my $v = shift;
681
682 next unless (defined($v) && $v !~ /^\s*$/);
683 warn "## ++ marc_compose($f,$sf,$v) ", dump( $m ),$/ if ($debug > 1);
684 if ($sf ne '+') {
685 push @$m, ( $sf, $v );
686 } else {
687 $m->[ $#$m ] .= $v;
688 }
689 }
690
691 warn "## marc_compose current marc = ", dump( $m ),$/ if ($debug > 1);
692
693 push @{ $marc_record->[ $marc_record_offset ] }, $m if ($#{$m} > 2);
694 }
695
696 =head2 marc_duplicate
697
698 Generate copy of current MARC record and continue working on copy
699
700 marc_duplicate();
701
702 Copies can be accessed using C<< _get_marc_fields( fetch_next => 1 ) >> or
703 C<< _get_marc_fields( offset => 42 ) >>.
704
705 =cut
706
707 sub marc_duplicate {
708 my $m = $marc_record->[ -1 ];
709 die "can't duplicate record which isn't defined" unless ($m);
710 push @{ $marc_record }, dclone( $m );
711 push @{ $marc_leader }, dclone( marc_leader() );
712 warn "## marc_duplicate = ", dump(@$marc_leader, @$marc_record), $/ if ($debug > 1);
713 $marc_record_offset = $#{ $marc_record };
714 warn "## marc_record_offset = $marc_record_offset", $/ if ($debug > 1);
715
716 }
717
718 =head2 marc_remove
719
720 Remove some field or subfield from MARC record.
721
722 marc_remove('200');
723 marc_remove('200','a');
724
725 This will erase field C<200> or C<200^a> from current MARC record.
726
727 marc_remove('*');
728
729 Will remove all fields in current MARC record.
730
731 This is useful after calling C<marc_duplicate> or on it's own (but, you
732 should probably just remove that subfield definition if you are not
733 using C<marc_duplicate>).
734
735 FIXME: support fields < 10.
736
737 =cut
738
739 sub marc_remove {
740 my ($f, $sf) = @_;
741
742 die "marc_remove needs record number" unless defined($f);
743
744 my $marc = $marc_record->[ $marc_record_offset ];
745
746 warn "### marc_remove before = ", dump( $marc ), $/ if ($debug > 2);
747
748 if ($f eq '*') {
749
750 delete( $marc_record->[ $marc_record_offset ] );
751 warn "## full marc_record = ", dump( @{ $marc_record }), $/ if ($debug > 1);
752
753 } else {
754
755 my $i = 0;
756 foreach ( 0 .. $#{ $marc } ) {
757 last unless (defined $marc->[$i]);
758 warn "#### working on ",dump( @{ $marc->[$i] }), $/ if ($debug > 3);
759 if ($marc->[$i]->[0] eq $f) {
760 if (! defined $sf) {
761 # remove whole field
762 splice @$marc, $i, 1;
763 warn "#### slice \@\$marc, $i, 1 = ",dump( @{ $marc }), $/ if ($debug > 3);
764 $i--;
765 } else {
766 foreach my $j ( 0 .. (( $#{ $marc->[$i] } - 3 ) / 2) ) {
767 my $o = ($j * 2) + 3;
768 if ($marc->[$i]->[$o] eq $sf) {
769 # remove subfield
770 splice @{$marc->[$i]}, $o, 2;
771 warn "#### slice \@{\$marc->[$i]}, $o, 2 = ", dump( @{ $marc }), $/ if ($debug > 3);
772 # is record now empty?
773 if ($#{ $marc->[$i] } == 2) {
774 splice @$marc, $i, 1;
775 warn "#### slice \@\$marc, $i, 1 = ", dump( @{ $marc }), $/ if ($debug > 3);
776 $i--;
777 };
778 }
779 }
780 }
781 }
782 $i++;
783 }
784
785 warn "### marc_remove($f", $sf ? ",$sf" : "", ") after = ", dump( $marc ), $/ if ($debug > 2);
786
787 $marc_record->[ $marc_record_offset ] = $marc;
788 }
789
790 warn "## full marc_record = ", dump( @{ $marc_record }), $/ if ($debug > 1);
791 }
792
793 =head2 marc_original_order
794
795 Copy all subfields preserving original order to marc field.
796
797 marc_original_order( marc_field_number, original_input_field_number );
798
799 Please note that field numbers are consistent with other commands (marc
800 field number first), but somewhat counter-intuitive (destination and then
801 source).
802
803 You might want to use this command if you are just renaming subfields or
804 using pre-processing modify_record in C<config.yml> and don't need any
805 post-processing or want to preserve order of original subfields.
806
807
808 =cut
809
810 sub marc_original_order {
811
812 my ($to, $from) = @_;
813 die "marc_original_order needs from and to fields\n" unless ($from && $to);
814
815 return unless defined($rec->{$from});
816
817 my $r = $rec->{$from};
818 die "record field $from isn't array\n" unless (ref($r) eq 'ARRAY');
819
820 my ($i1,$i2) = defined($marc_indicators->{$to}) ? @{ $marc_indicators->{$to} } : (' ',' ');
821 warn "## marc_original_order($to,$from) source = ", dump( $r ),$/ if ($debug > 1);
822
823 foreach my $d (@$r) {
824
825 if (! defined($d->{subfields}) && ref($d->{subfields}) ne 'ARRAY') {
826 warn "# marc_original_order($to,$from): field $from doesn't have subfields specification\n";
827 next;
828 }
829
830 my @sfs = @{ $d->{subfields} };
831
832 die "field $from doesn't have even number of subfields specifications\n" unless($#sfs % 2 == 1);
833
834 warn "#--> d: ",dump($d), "\n#--> sfs: ",dump(@sfs),$/ if ($debug > 2);
835
836 my $m = [ $to, $i1, $i2 ];
837
838 while (my $sf = shift @sfs) {
839
840 warn "#--> sf: ",dump($sf), $/ if ($debug > 2);
841 my $offset = shift @sfs;
842 die "corrupted sufields specification for field $from\n" unless defined($offset);
843
844 my $v;
845 if (ref($d->{$sf}) eq 'ARRAY') {
846 $v = $d->{$sf}->[$offset] if (defined($d->{$sf}->[$offset]));
847 } elsif ($offset == 0) {
848 $v = $d->{$sf};
849 } else {
850 die "field $from subfield '$sf' need occurence $offset which doesn't exist", dump($d->{$sf});
851 }
852 push @$m, ( $sf, $v ) if (defined($v));
853 }
854
855 if ($#{$m} > 2) {
856 push @{ $marc_record->[ $marc_record_offset ] }, $m;
857 }
858 }
859
860 warn "## marc_record = ", dump( $marc_record ),$/ if ($debug > 1);
861 }
862
863 =head2 marc_count
864
865 Return number of MARC records created using L</marc_duplicate>.
866
867 print "created ", marc_count(), " records";
868
869 =cut
870
871 sub marc_count {
872 return $#{ $marc_record };
873 }
874
875
876 =head1 Functions to extract data from input
877
878 This function should be used inside functions to create C<data_structure> described
879 above.
880
881 =head2 _pack_subfields_hash
882
883 @subfields = _pack_subfields_hash( $h );
884 $subfields = _pack_subfields_hash( $h, 1 );
885
886 Return each subfield value in array or pack them all together and return scalar
887 with subfields (denoted by C<^>) and values.
888
889 =cut
890
891 sub _pack_subfields_hash {
892
893 warn "## _pack_subfields_hash( ",dump(@_), " )\n" if ($debug > 1);
894
895 my ($h,$include_subfields) = @_;
896
897 # sanity and ease of use
898 return $h if (ref($h) ne 'HASH');
899
900 if ( defined($h->{subfields}) ) {
901 my $sfs = delete $h->{subfields} || die "no subfields?";
902 my @out;
903 while (@$sfs) {
904 my $sf = shift @$sfs;
905 push @out, '^' . $sf if ($include_subfields);
906 my $o = shift @$sfs;
907 if ($o == 0 && ref( $h->{$sf} ) ne 'ARRAY' ) {
908 # single element subfields are not arrays
909 #warn "====> $sf $o / $#$sfs ", dump( $sfs, $h->{$sf} ), "\n";
910
911 push @out, $h->{$sf};
912 } else {
913 #warn "====> $sf $o / $#$sfs ", dump( $sfs, $h->{$sf} ), "\n";
914 push @out, $h->{$sf}->[$o];
915 }
916 }
917 if ($include_subfields) {
918 return join('', @out);
919 } else {
920 return @out;
921 }
922 } else {
923 if ($include_subfields) {
924 my $out = '';
925 foreach my $sf (sort keys %$h) {
926 if (ref($h->{$sf}) eq 'ARRAY') {
927 $out .= '^' . $sf . join('^' . $sf, @{ $h->{$sf} });
928 } else {
929 $out .= '^' . $sf . $h->{$sf};
930 }
931 }
932 return $out;
933 } else {
934 # FIXME this should probably be in alphabetical order instead of hash order
935 values %{$h};
936 }
937 }
938 }
939
940 =head2 rec1
941
942 Return all values in some field
943
944 @v = rec1('200')
945
946 TODO: order of values is probably same as in source data, need to investigate that
947
948 =cut
949
950 sub rec1 {
951 my $f = shift;
952 warn "rec1($f) = ", dump( $rec->{$f} ), $/ if ($debug > 1);
953 return unless (defined($rec) && defined($rec->{$f}));
954 warn "rec1($f) = ", dump( $rec->{$f} ), $/ if ($debug > 1);
955 if (ref($rec->{$f}) eq 'ARRAY') {
956 my @out;
957 foreach my $h ( @{ $rec->{$f} } ) {
958 if (ref($h) eq 'HASH') {
959 push @out, ( _pack_subfields_hash( $h ) );
960 } else {
961 push @out, $h;
962 }
963 }
964 return @out;
965 } elsif( defined($rec->{$f}) ) {
966 return $rec->{$f};
967 }
968 }
969
970 =head2 rec2
971
972 Return all values in specific field and subfield
973
974 @v = rec2('200','a')
975
976 =cut
977
978 sub rec2 {
979 my $f = shift;
980 return unless (defined($rec && $rec->{$f}));
981 my $sf = shift;
982 warn "rec2($f,$sf) = ", dump( $rec->{$f} ), $/ if ($debug > 1);
983 return map {
984 if (ref($_->{$sf}) eq 'ARRAY') {
985 @{ $_->{$sf} };
986 } else {
987 $_->{$sf};
988 }
989 } grep { ref($_) eq 'HASH' && $_->{$sf} } @{ $rec->{$f} };
990 }
991
992 =head2 rec
993
994 syntaxtic sugar for
995
996 @v = rec('200')
997 @v = rec('200','a')
998
999 If rec() returns just single value, it will
1000 return scalar, not array.
1001
1002 =cut
1003
1004 sub frec {
1005 my @out = rec(@_);
1006 warn "rec(",dump(@_),") has more than one return value, ignoring\n" if $#out > 0;
1007 return shift @out;
1008 }
1009
1010 sub rec {
1011 my @out;
1012 if ($#_ == 0) {
1013 @out = rec1(@_);
1014 } elsif ($#_ == 1) {
1015 @out = rec2(@_);
1016 }
1017 if ($#out == 0 && ! wantarray) {
1018 return $out[0];
1019 } elsif (@out) {
1020 return @out;
1021 } else {
1022 return '';
1023 }
1024 }
1025
1026 =head2 regex
1027
1028 Apply regex to some or all values
1029
1030 @v = regex( 's/foo/bar/g', @v );
1031
1032 =cut
1033
1034 sub regex {
1035 my $r = shift;
1036 my @out;
1037 #warn "r: $r\n", dump(\@_);
1038 foreach my $t (@_) {
1039 next unless ($t);
1040 eval "\$t =~ $r";
1041 push @out, $t if ($t && $t ne '');
1042 }
1043 return @out;
1044 }
1045
1046 =head2 prefix
1047
1048 Prefix all values with a string
1049
1050 @v = prefix( 'my_', @v );
1051
1052 =cut
1053
1054 sub prefix {
1055 my $p = shift;
1056 return @_ unless defined( $p );
1057 return map { $p . $_ } grep { defined($_) } @_;
1058 }
1059
1060 =head2 suffix
1061
1062 suffix all values with a string
1063
1064 @v = suffix( '_my', @v );
1065
1066 =cut
1067
1068 sub suffix {
1069 my $s = shift;
1070 return @_ unless defined( $s );
1071 return map { $_ . $s } grep { defined($_) } @_;
1072 }
1073
1074 =head2 surround
1075
1076 surround all values with a two strings
1077
1078 @v = surround( 'prefix_', '_suffix', @v );
1079
1080 =cut
1081
1082 sub surround {
1083 my $p = shift;
1084 my $s = shift;
1085 $p = '' unless defined( $p );
1086 $s = '' unless defined( $s );
1087 return map { $p . $_ . $s } grep { defined($_) } @_;
1088 }
1089
1090 =head2 first
1091
1092 Return first element
1093
1094 $v = first( @v );
1095
1096 =cut
1097
1098 sub first {
1099 my $r = shift;
1100 return $r;
1101 }
1102
1103 =head2 lookup
1104
1105 Consult lookup hashes for some value
1106
1107 @v = lookup(
1108 sub {
1109 'ffkk/peri/mfn'.rec('000')
1110 },
1111 'ffkk','peri','200-a-200-e',
1112 sub {
1113 first(rec(200,'a')).' '.first(rec('200','e'))
1114 }
1115 );
1116
1117 Code like above will be B<automatically generated> using L<WebPAC::Parse> from
1118 normal lookup definition in C<conf/lookup/something.pl> which looks like:
1119
1120 lookup(
1121 # which results to return from record recorded in lookup
1122 sub { 'ffkk/peri/mfn' . rec('000') },
1123 # from which database and input
1124 'ffkk','peri',
1125 # such that following values match
1126 sub { first(rec(200,'a')) . ' ' . first(rec('200','e')) },
1127 # if this part is missing, we will try to match same fields
1128 # from lookup record and current one, or you can override
1129 # which records to use from current record using
1130 sub { rec('900','x') . ' ' . rec('900','y') },
1131 )
1132
1133 You can think about this lookup as SQL (if that helps):
1134
1135 select
1136 sub { what }
1137 from
1138 database, input
1139 where
1140 sub { filter from lookuped record }
1141 having
1142 sub { optional filter on current record }
1143
1144 Easy as pie, right?
1145
1146 =cut
1147
1148 sub lookup {
1149 my ($what, $database, $input, $key, $having) = @_;
1150
1151 confess "lookup needs 5 arguments: what, database, input, key, having\n" unless ($#_ == 4);
1152
1153 warn "## lookup ($database, $input, $key)", $/ if ($debug > 1);
1154 return unless (defined($lookup->{$database}->{$input}->{$key}));
1155
1156 confess "lookup really need load_row_coderef added to data_structure\n" unless ($load_row_coderef);
1157
1158 my $mfns;
1159 my @having = $having->();
1160
1161 warn "## having = ", dump( @having ) if ($debug > 2);
1162
1163 foreach my $h ( @having ) {
1164 if (defined($lookup->{$database}->{$input}->{$key}->{$h})) {
1165 warn "lookup for $database/$input/$key/$h return ",dump($lookup->{$database}->{$input}->{$key}->{$h}),"\n" if ($debug);
1166 $mfns->{$_}++ foreach keys %{ $lookup->{$database}->{$input}->{$key}->{$h} };
1167 }
1168 }
1169
1170 return unless ($mfns);
1171
1172 my @mfns = sort keys %$mfns;
1173
1174 warn "# lookup loading $database/$input/$key mfn ", join(",",@mfns)," having ",dump(@having),"\n" if ($debug);
1175
1176 my $old_rec = $rec;
1177 my @out;
1178
1179 foreach my $mfn (@mfns) {
1180 $rec = $load_row_coderef->( $database, $input, $mfn );
1181
1182 warn "got $database/$input/$mfn = ", dump($rec), $/ if ($debug);
1183
1184 my @vals = $what->();
1185
1186 push @out, ( @vals );
1187
1188 warn "lookup for mfn $mfn returned ", dump(@vals), $/ if ($debug);
1189 }
1190
1191 # if (ref($lookup->{$k}) eq 'ARRAY') {
1192 # return @{ $lookup->{$k} };
1193 # } else {
1194 # return $lookup->{$k};
1195 # }
1196
1197 $rec = $old_rec;
1198
1199 warn "## lookup returns = ", dump(@out), $/ if ($debug);
1200
1201 if ($#out == 0) {
1202 return $out[0];
1203 } else {
1204 return @out;
1205 }
1206 }
1207
1208 =head2 save_into_lookup
1209
1210 Save value into lookup. It associates current database, input
1211 and specific keys with one or more values which will be
1212 associated over MFN.
1213
1214 MFN will be extracted from first occurence current of field 000
1215 in current record, or if it doesn't exist from L<_set_config> C<_mfn>.
1216
1217 my $nr = save_into_lookup($database,$input,$key,sub {
1218 # code which produce one or more values
1219 });
1220
1221 It returns number of items saved.
1222
1223 This function shouldn't be called directly, it's called from code created by
1224 L<WebPAC::Parser>.
1225
1226 =cut
1227
1228 sub save_into_lookup {
1229 my ($database,$input,$key,$coderef) = @_;
1230 die "save_into_lookup needs database" unless defined($database);
1231 die "save_into_lookup needs input" unless defined($input);
1232 die "save_into_lookup needs key" unless defined($key);
1233 die "save_into_lookup needs CODE" unless ( defined($coderef) && ref($coderef) eq 'CODE' );
1234
1235 warn "## save_into_lookup rec = ", dump($rec), " config = ", dump($config), $/ if ($debug > 2);
1236
1237 my $mfn =
1238 defined($rec->{'000'}->[0]) ? $rec->{'000'}->[0] :
1239 defined($config->{_mfn}) ? $config->{_mfn} :
1240 die "mfn not defined or zero";
1241
1242 my $nr = 0;
1243
1244 foreach my $v ( $coderef->() ) {
1245 $lookup->{$database}->{$input}->{$key}->{$v}->{$mfn}++;
1246 warn "# saved lookup $database/$input/$key [$v] $mfn\n" if ($debug > 1);
1247 $nr++;
1248 }
1249
1250 return $nr;
1251 }
1252
1253 =head2 config
1254
1255 Consult config values stored in C<config.yml>
1256
1257 # return database code (key under databases in yaml)
1258 $database_code = config(); # use _ from hash
1259 $database_name = config('name');
1260 $database_input_name = config('input name');
1261
1262 Up to three levels are supported.
1263
1264 =cut
1265
1266 sub config {
1267 return unless ($config);
1268
1269 my $p = shift;
1270
1271 $p ||= '';
1272
1273 my $v;
1274
1275 warn "### getting config($p)\n" if ($debug > 1);
1276
1277 my @p = split(/\s+/,$p);
1278 if ($#p < 0) {
1279 $v = $config->{ '_' }; # special, database code
1280 } else {
1281
1282 my $c = dclone( $config );
1283
1284 foreach my $k (@p) {
1285 warn "### k: $k c = ",dump($c),$/ if ($debug > 1);
1286 if (ref($c) eq 'ARRAY') {
1287 $c = shift @$c;
1288 warn "config($p) taking first occurence of '$k', probably not what you wanted!\n";
1289 last;
1290 }
1291
1292 if (! defined($c->{$k}) ) {
1293 $c = undef;
1294 last;
1295 } else {
1296 $c = $c->{$k};
1297 }
1298 }
1299 $v = $c if ($c);
1300
1301 }
1302
1303 warn "## config( '$p' ) = ",dump( $v ),$/ if ($v && $debug);
1304 warn "config( '$p' ) is empty\n" if (! $v);
1305
1306 return $v;
1307 }
1308
1309 =head2 id
1310
1311 Returns unique id of this record
1312
1313 $id = id();
1314
1315 Returns C<42/2> for 2nd occurence of MFN 42.
1316
1317 =cut
1318
1319 sub id {
1320 my $mfn = $config->{_mfn} || die "no _mfn in config data";
1321 return $mfn . $#{$marc_record} ? $#{$marc_record} + 1 : '';
1322 }
1323
1324 =head2 join_with
1325
1326 Joins walues with some delimiter
1327
1328 $v = join_with(", ", @v);
1329
1330 =cut
1331
1332 sub join_with {
1333 my $d = shift;
1334 warn "### join_with('$d',",dump(@_),")\n" if ($debug > 2);
1335 my $v = join($d, grep { defined($_) && $_ ne '' } @_);
1336 return '' unless defined($v);
1337 return $v;
1338 }
1339
1340 =head2 split_rec_on
1341
1342 Split record subfield on some regex and take one of parts out
1343
1344 $a_before_semi_column =
1345 split_rec_on('200','a', /\s*;\s*/, $part);
1346
1347 C<$part> is optional number of element. First element is
1348 B<1>, not 0!
1349
1350 If there is no C<$part> parameter or C<$part> is 0, this function will
1351 return all values produced by splitting.
1352
1353 =cut
1354
1355 sub split_rec_on {
1356 die "split_rec_on need (fld,sf,regex[,part]" if ($#_ < 2);
1357
1358 my ($fld, $sf, $regex, $part) = @_;
1359 warn "### regex ", ref($regex), $regex, $/ if ($debug > 2);
1360
1361 my @r = rec( $fld, $sf );
1362 my $v = shift @r;
1363 warn "### first rec($fld,$sf) = ",dump($v),$/ if ($debug > 2);
1364
1365 return '' if ( ! defined($v) || $v =~ /^\s*$/);
1366
1367 my @s = split( $regex, $v );
1368 warn "## split_rec_on($fld,$sf,$regex,$part) = ",dump(@s),$/ if ($debug > 1);
1369 if ($part && $part > 0) {
1370 return $s[ $part - 1 ];
1371 } else {
1372 return @s;
1373 }
1374 }
1375
1376 my $hash;
1377
1378 =head2 set
1379
1380 set( key => 'value' );
1381
1382 =cut
1383
1384 sub set {
1385 my ($k,$v) = @_;
1386 warn "## set ( $k => ", dump($v), " )", $/ if ( $debug );
1387 $hash->{$k} = $v;
1388 };
1389
1390 =head2 get
1391
1392 get( 'key' );
1393
1394 =cut
1395
1396 sub get {
1397 my $k = shift || return;
1398 my $v = $hash->{$k};
1399 warn "## get $k = ", dump( $v ), $/ if ( $debug );
1400 return $v;
1401 }
1402
1403 =head2 count
1404
1405 if ( count( @result ) == 1 ) {
1406 # do something if only 1 result is there
1407 }
1408
1409 =cut
1410
1411 sub count {
1412 warn "## count ",dump(@_),$/ if ( $debug );
1413 return @_ . '';
1414 }
1415
1416 # END
1417 1;

  ViewVC Help
Powered by ViewVC 1.1.26