/[webpac2]/trunk/lib/WebPAC/Normalize.pm
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Contents of /trunk/lib/WebPAC/Normalize.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 973 - (show annotations)
Fri Nov 2 14:59:12 2007 UTC (15 years, 2 months ago) by dpavlin
File size: 30683 byte(s)
 r1489@llin:  dpavlin | 2007-11-02 15:59:07 +0100
 load WebPAC::Normalize::ISBN at right place

1 package WebPAC::Normalize;
2 use Exporter 'import';
3 @EXPORT = qw/
4 _set_rec _set_lookup
5 _set_load_row
6 _get_ds _clean_ds
7 _debug
8 _pack_subfields_hash
9
10 search_display search display sorted
11
12 marc marc_indicators marc_repeatable_subfield
13 marc_compose marc_leader marc_fixed
14 marc_duplicate marc_remove marc_count
15 marc_original_order
16
17 rec1 rec2 rec
18 regex prefix suffix surround
19 first lookup join_with
20 save_into_lookup
21
22 split_rec_on
23
24 get set
25 count
26 /;
27
28 use warnings;
29 use strict;
30
31 #use base qw/WebPAC::Common/;
32 use Data::Dump qw/dump/;
33 use Storable qw/dclone/;
34 use Carp qw/confess/;
35
36 # debugging warn(s)
37 my $debug = 0;
38
39
40 =head1 NAME
41
42 WebPAC::Normalize - describe normalisaton rules using sets
43
44 =cut
45
46 our $VERSION = '0.31';
47
48 =head1 SYNOPSIS
49
50 This module uses C<conf/normalize/*.pl> files to perform normalisation
51 from input records using perl functions which are specialized for set
52 processing.
53
54 Sets are implemented as arrays, and normalisation file is valid perl, which
55 means that you check it's validity before running WebPAC using
56 C<perl -c normalize.pl>.
57
58 Normalisation can generate multiple output normalized data. For now, supported output
59 types (on the left side of definition) are: C<search_display>, C<display>, C<search> and
60 C<marc>.
61
62 =head1 FUNCTIONS
63
64 Functions which start with C<_> are private and used by WebPAC internally.
65 All other functions are available for use within normalisation rules.
66
67 =head2 data_structure
68
69 Return data structure
70
71 my $ds = WebPAC::Normalize::data_structure(
72 lookup => $lookup_hash,
73 row => $row,
74 rules => $normalize_pl_config,
75 marc_encoding => 'utf-8',
76 config => $config,
77 load_row_coderef => sub {
78 my ($database,$input,$mfn) = shift;
79 $store->load_row( database => $database, input => $input, id => $mfn );
80 },
81 );
82
83 Options C<row>, C<rules> and C<log> are mandatory while all
84 other are optional.
85
86 C<load_row_coderef> is closure only used when executing lookups, so they will
87 die if it's not defined.
88
89 This function will B<die> if normalizastion can't be evaled.
90
91 Since this function isn't exported you have to call it with
92 C<WebPAC::Normalize::data_structure>.
93
94 =cut
95
96 my $load_row_coderef;
97
98 sub data_structure {
99 my $arg = {@_};
100
101 die "need row argument" unless ($arg->{row});
102 die "need normalisation argument" unless ($arg->{rules});
103
104 no strict 'subs';
105 _set_lookup( $arg->{lookup} ) if defined($arg->{lookup});
106 _set_rec( $arg->{row} );
107 _set_config( $arg->{config} ) if defined($arg->{config});
108 _clean_ds( %{ $arg } );
109 $load_row_coderef = $arg->{load_row_coderef};
110
111 # FIXME load this conditionally
112 use WebPAC::Normalize::ISBN;
113
114 eval "$arg->{rules}";
115 die "error evaling $arg->{rules}: $@\n" if ($@);
116
117 return _get_ds();
118 }
119
120 =head2 _set_rec
121
122 Set current record hash
123
124 _set_rec( $rec );
125
126 =cut
127
128 my $rec;
129
130 sub _set_rec {
131 $rec = shift or die "no record hash";
132 }
133
134 =head2 _set_config
135
136 Set current config hash
137
138 _set_config( $config );
139
140 Magic keys are:
141
142 =over 4
143
144 =item _
145
146 Code of current database
147
148 =item _mfn
149
150 Current MFN
151
152 =back
153
154 =cut
155
156 my $config;
157
158 sub _set_config {
159 $config = shift;
160 }
161
162 =head2 _get_ds
163
164 Return hash formatted as data structure
165
166 my $ds = _get_ds();
167
168 =cut
169
170 my ($out, $marc_record, $marc_encoding, $marc_repeatable_subfield, $marc_indicators, $marc_leader);
171 my ($marc_record_offset, $marc_fetch_offset) = (0, 0);
172
173 sub _get_ds {
174 return $out;
175 }
176
177 =head2 _clean_ds
178
179 Clean data structure hash for next record
180
181 _clean_ds();
182
183 =cut
184
185 sub _clean_ds {
186 my $a = {@_};
187 ($out,$marc_record, $marc_encoding, $marc_repeatable_subfield, $marc_indicators, $marc_leader) = ();
188 ($marc_record_offset, $marc_fetch_offset) = (0,0);
189 $marc_encoding = $a->{marc_encoding};
190 }
191
192 =head2 _set_lookup
193
194 Set current lookup hash
195
196 _set_lookup( $lookup );
197
198 =cut
199
200 my $lookup;
201
202 sub _set_lookup {
203 $lookup = shift;
204 }
205
206 =head2 _get_lookup
207
208 Get current lookup hash
209
210 my $lookup = _get_lookup();
211
212 =cut
213
214 sub _get_lookup {
215 return $lookup;
216 }
217
218 =head2 _set_load_row
219
220 Setup code reference which will return L<data_structure> from
221 L<WebPAC::Store>
222
223 _set_load_row(sub {
224 my ($database,$input,$mfn) = @_;
225 $store->load_row( database => $database, input => $input, id => $mfn );
226 });
227
228 =cut
229
230 sub _set_load_row {
231 my $coderef = shift;
232 confess "argument isn't CODE" unless ref($coderef) eq 'CODE';
233
234 $load_row_coderef = $coderef;
235 }
236
237 =head2 _get_marc_fields
238
239 Get all fields defined by calls to C<marc>
240
241 $marc->add_fields( WebPAC::Normalize:_get_marc_fields() );
242
243 We are using I<magic> which detect repeatable fields only from
244 sequence of field/subfield data generated by normalization.
245
246 Repeatable field is created when there is second occurence of same subfield or
247 if any of indicators are different.
248
249 This is sane for most cases. Something like:
250
251 900a-1 900b-1 900c-1
252 900a-2 900b-2
253 900a-3
254
255 will be created from any combination of:
256
257 900a-1 900a-2 900a-3 900b-1 900b-2 900c-1
258
259 and following rules:
260
261 marc('900','a', rec('200','a') );
262 marc('900','b', rec('200','b') );
263 marc('900','c', rec('200','c') );
264
265 which might not be what you have in mind. If you need repeatable subfield,
266 define it using C<marc_repeatable_subfield> like this:
267
268 marc_repeatable_subfield('900','a');
269 marc('900','a', rec('200','a') );
270 marc('900','b', rec('200','b') );
271 marc('900','c', rec('200','c') );
272
273 will create:
274
275 900a-1 900a-2 900a-3 900b-1 900c-1
276 900b-2
277
278 There is also support for returning next or specific using:
279
280 while (my $mf = WebPAC::Normalize:_get_marc_fields( fetch_next => 1 ) ) {
281 # do something with $mf
282 }
283
284 will always return fields from next MARC record or
285
286 my $mf = WebPAC::Normalize::_get_marc_fields( offset => 42 );
287
288 will return 42th copy record (if it exists).
289
290 =cut
291
292 my $fetch_pos;
293
294 sub _get_marc_fields {
295
296 my $arg = {@_};
297 warn "### _get_marc_fields arg: ", dump($arg), $/ if ($debug > 2);
298 $fetch_pos = $marc_fetch_offset;
299 if ($arg->{offset}) {
300 $fetch_pos = $arg->{offset};
301 } elsif($arg->{fetch_next}) {
302 $marc_fetch_offset++;
303 }
304
305 return if (! $marc_record || ref($marc_record) ne 'ARRAY');
306
307 warn "### full marc_record = ", dump( @{ $marc_record }), $/ if ($debug > 2);
308
309 my $marc_rec = $marc_record->[ $fetch_pos ];
310
311 warn "## _get_marc_fields (at offset: $fetch_pos) -- marc_record = ", dump( @$marc_rec ), $/ if ($debug > 1);
312
313 return if (! $marc_rec || ref($marc_rec) ne 'ARRAY' || $#{ $marc_rec } < 0);
314
315 # first, sort all existing fields
316 # XXX might not be needed, but modern perl might randomize elements in hash
317 my @sorted_marc_record = sort {
318 $a->[0] . ( $a->[3] || '' ) cmp $b->[0] . ( $b->[3] || '')
319 } @{ $marc_rec };
320
321 @sorted_marc_record = @{ $marc_rec }; ### FIXME disable sorting
322
323 # output marc fields
324 my @m;
325
326 # count unique field-subfields (used for offset when walking to next subfield)
327 my $u;
328 map { $u->{ $_->[0] . ( $_->[3] || '') }++ } @sorted_marc_record;
329
330 if ($debug) {
331 warn "## marc_repeatable_subfield = ", dump( $marc_repeatable_subfield ), $/ if ( $marc_repeatable_subfield );
332 warn "## marc_record[$fetch_pos] = ", dump( $marc_rec ), $/;
333 warn "## sorted_marc_record = ", dump( \@sorted_marc_record ), $/;
334 warn "## subfield count = ", dump( $u ), $/;
335 }
336
337 my $len = $#sorted_marc_record;
338 my $visited;
339 my $i = 0;
340 my $field;
341
342 foreach ( 0 .. $len ) {
343
344 # find next element which isn't visited
345 while ($visited->{$i}) {
346 $i = ($i + 1) % ($len + 1);
347 }
348
349 # mark it visited
350 $visited->{$i}++;
351
352 my $row = dclone( $sorted_marc_record[$i] );
353
354 # field and subfield which is key for
355 # marc_repeatable_subfield and u
356 my $fsf = $row->[0] . ( $row->[3] || '' );
357
358 if ($debug > 1) {
359
360 print "### field so far [", $#$field, "] : ", dump( $field ), " ", $field ? 'T' : 'F', $/;
361 print "### this [$i]: ", dump( $row ),$/;
362 print "### sf: ", $row->[3], " vs ", $field->[3],
363 $marc_repeatable_subfield->{ $row->[0] . $row->[3] } ? ' (repeatable)' : '', $/,
364 if ($#$field >= 0);
365
366 }
367
368 # if field exists
369 if ( $#$field >= 0 ) {
370 if (
371 $row->[0] ne $field->[0] || # field
372 $row->[1] ne $field->[1] || # i1
373 $row->[2] ne $field->[2] # i2
374 ) {
375 push @m, $field;
376 warn "## saved/1 ", dump( $field ),$/ if ($debug);
377 $field = $row;
378
379 } elsif (
380 ( $row->[3] lt $field->[-2] ) # subfield which is not next (e.g. a after c)
381 ||
382 ( $row->[3] eq $field->[-2] && # same subfield, but not repeatable
383 ! $marc_repeatable_subfield->{ $fsf }
384 )
385 ) {
386 push @m, $field;
387 warn "## saved/2 ", dump( $field ),$/ if ($debug);
388 $field = $row;
389
390 } else {
391 # append new subfields to existing field
392 push @$field, ( $row->[3], $row->[4] );
393 }
394 } else {
395 # insert first field
396 $field = $row;
397 }
398
399 if (! $marc_repeatable_subfield->{ $fsf }) {
400 # make step to next subfield
401 $i = ($i + $u->{ $fsf } ) % ($len + 1);
402 }
403 }
404
405 if ($#$field >= 0) {
406 push @m, $field;
407 warn "## saved/3 ", dump( $field ),$/ if ($debug);
408 }
409
410 return \@m;
411 }
412
413 =head2 _get_marc_leader
414
415 Return leader from currently fetched record by L</_get_marc_fields>
416
417 print WebPAC::Normalize::_get_marc_leader();
418
419 =cut
420
421 sub _get_marc_leader {
422 die "no fetch_pos, did you called _get_marc_fields first?" unless ( defined( $fetch_pos ) );
423 return $marc_leader->[ $fetch_pos ];
424 }
425
426 =head2 _debug
427
428 Change level of debug warnings
429
430 _debug( 2 );
431
432 =cut
433
434 sub _debug {
435 my $l = shift;
436 return $debug unless defined($l);
437 warn "debug level $l",$/ if ($l > 0);
438 $debug = $l;
439 }
440
441 =head1 Functions to create C<data_structure>
442
443 Those functions generally have to first in your normalization file.
444
445 =head2 search_display
446
447 Define output for L<search> and L<display> at the same time
448
449 search_display('Title', rec('200','a') );
450
451
452 =cut
453
454 sub search_display {
455 my $name = shift or die "search_display needs name as first argument";
456 my @o = grep { defined($_) && $_ ne '' } @_;
457 return unless (@o);
458 $out->{$name}->{search} = \@o;
459 $out->{$name}->{display} = \@o;
460 }
461
462 =head2 tag
463
464 Old name for L<search_display>, but supported
465
466 =cut
467
468 sub tag {
469 search_display( @_ );
470 }
471
472 =head2 display
473
474 Define output just for I<display>
475
476 @v = display('Title', rec('200','a') );
477
478 =cut
479
480 sub _field {
481 my $type = shift or confess "need type -- BUG?";
482 my $name = shift or confess "needs name as first argument";
483 my @o = grep { defined($_) && $_ ne '' } @_;
484 return unless (@o);
485 $out->{$name}->{$type} = \@o;
486 }
487
488 sub display { _field( 'display', @_ ) }
489
490 =head2 search
491
492 Prepare values just for I<search>
493
494 @v = search('Title', rec('200','a') );
495
496 =cut
497
498 sub search { _field( 'search', @_ ) }
499
500 =head2 sorted
501
502 Insert into lists which will be automatically sorted
503
504 sorted('Title', rec('200','a') );
505
506 =cut
507
508 sub sorted { _field( 'sorted', @_ ) }
509
510
511 =head2 marc_leader
512
513 Setup fields within MARC leader or get leader
514
515 marc_leader('05','c');
516 my $leader = marc_leader();
517
518 =cut
519
520 sub marc_leader {
521 my ($offset,$value) = @_;
522
523 if ($offset) {
524 $marc_leader->[ $marc_record_offset ]->{ $offset } = $value;
525 } else {
526
527 if (defined($marc_leader)) {
528 die "marc_leader not array = ", dump( $marc_leader ) unless (ref($marc_leader) eq 'ARRAY');
529 return $marc_leader->[ $marc_record_offset ];
530 } else {
531 return;
532 }
533 }
534 }
535
536 =head2 marc_fixed
537
538 Create control/indentifier fields with values in fixed positions
539
540 marc_fixed('008', 00, '070402');
541 marc_fixed('008', 39, '|');
542
543 Positions not specified will be filled with spaces (C<0x20>).
544
545 There will be no effort to extend last specified value to full length of
546 field in standard.
547
548 =cut
549
550 sub marc_fixed {
551 my ($f, $pos, $val) = @_;
552 die "need marc(field, position, value)" unless defined($f) && defined($pos);
553
554 confess "need val" unless defined $val;
555
556 my $update = 0;
557
558 map {
559 if ($_->[0] eq $f) {
560 my $old = $_->[1];
561 if (length($old) <= $pos) {
562 $_->[1] .= ' ' x ( $pos - length($old) ) . $val;
563 warn "## marc_fixed($f,$pos,'$val') append '$old' -> '$_->[1]'\n" if ($debug > 1);
564 } else {
565 $_->[1] = substr($old, 0, $pos) . $val . substr($old, $pos + length($val));
566 warn "## marc_fixed($f,$pos,'$val') update '$old' -> '$_->[1]'\n" if ($debug > 1);
567 }
568 $update++;
569 }
570 } @{ $marc_record->[ $marc_record_offset ] };
571
572 if (! $update) {
573 my $v = ' ' x $pos . $val;
574 push @{ $marc_record->[ $marc_record_offset ] }, [ $f, $v ];
575 warn "## marc_fixed($f,$pos,'val') created '$v'\n" if ($debug > 1);
576 }
577 }
578
579 =head2 marc
580
581 Save value for MARC field
582
583 marc('900','a', rec('200','a') );
584 marc('001', rec('000') );
585
586 =cut
587
588 sub marc {
589 my $f = shift or die "marc needs field";
590 die "marc field must be numer" unless ($f =~ /^\d+$/);
591
592 my $sf;
593 if ($f >= 10) {
594 $sf = shift or die "marc needs subfield";
595 }
596
597 foreach (@_) {
598 my $v = $_; # make var read-write for Encode
599 next unless (defined($v) && $v !~ /^\s*$/);
600 my ($i1,$i2) = defined($marc_indicators->{$f}) ? @{ $marc_indicators->{$f} } : (' ',' ');
601 if (defined $sf) {
602 push @{ $marc_record->[ $marc_record_offset ] }, [ $f, $i1, $i2, $sf => $v ];
603 } else {
604 push @{ $marc_record->[ $marc_record_offset ] }, [ $f, $v ];
605 }
606 }
607 }
608
609 =head2 marc_repeatable_subfield
610
611 Save values for MARC repetable subfield
612
613 marc_repeatable_subfield('910', 'z', rec('909') );
614
615 =cut
616
617 sub marc_repeatable_subfield {
618 my ($f,$sf) = @_;
619 die "marc_repeatable_subfield need field and subfield!\n" unless ($f && $sf);
620 $marc_repeatable_subfield->{ $f . $sf }++;
621 marc(@_);
622 }
623
624 =head2 marc_indicators
625
626 Set both indicators for MARC field
627
628 marc_indicators('900', ' ', 1);
629
630 Any indicator value other than C<0-9> will be treated as undefined.
631
632 =cut
633
634 sub marc_indicators {
635 my $f = shift || die "marc_indicators need field!\n";
636 my ($i1,$i2) = @_;
637 die "marc_indicators($f, ...) need i1!\n" unless(defined($i1));
638 die "marc_indicators($f, $i1, ...) need i2!\n" unless(defined($i2));
639
640 $i1 = ' ' if ($i1 !~ /^\d$/);
641 $i2 = ' ' if ($i2 !~ /^\d$/);
642 @{ $marc_indicators->{$f} } = ($i1,$i2);
643 }
644
645 =head2 marc_compose
646
647 Save values for each MARC subfield explicitly
648
649 marc_compose('900',
650 'a', rec('200','a')
651 'b', rec('201','a')
652 'a', rec('200','b')
653 'c', rec('200','c')
654 );
655
656 If you specify C<+> for subfield, value will be appended
657 to previous defined subfield.
658
659 =cut
660
661 sub marc_compose {
662 my $f = shift or die "marc_compose needs field";
663 die "marc_compose field must be numer" unless ($f =~ /^\d+$/);
664
665 my ($i1,$i2) = defined($marc_indicators->{$f}) ? @{ $marc_indicators->{$f} } : (' ',' ');
666 my $m = [ $f, $i1, $i2 ];
667
668 warn "### marc_compose input subfields = ", dump(@_),$/ if ($debug > 2);
669
670 if ($#_ % 2 != 1) {
671 die "ERROR: marc_compose",dump($f,@_)," not valid (must be even).\nDo you need to add first() or join() around some argument?\n";
672 }
673
674 while (@_) {
675 my $sf = shift;
676 my $v = shift;
677
678 next unless (defined($v) && $v !~ /^\s*$/);
679 warn "## ++ marc_compose($f,$sf,$v) ", dump( $m ),$/ if ($debug > 1);
680 if ($sf ne '+') {
681 push @$m, ( $sf, $v );
682 } else {
683 $m->[ $#$m ] .= $v;
684 }
685 }
686
687 warn "## marc_compose current marc = ", dump( $m ),$/ if ($debug > 1);
688
689 push @{ $marc_record->[ $marc_record_offset ] }, $m if ($#{$m} > 2);
690 }
691
692 =head2 marc_duplicate
693
694 Generate copy of current MARC record and continue working on copy
695
696 marc_duplicate();
697
698 Copies can be accessed using C<< _get_marc_fields( fetch_next => 1 ) >> or
699 C<< _get_marc_fields( offset => 42 ) >>.
700
701 =cut
702
703 sub marc_duplicate {
704 my $m = $marc_record->[ -1 ];
705 die "can't duplicate record which isn't defined" unless ($m);
706 push @{ $marc_record }, dclone( $m );
707 push @{ $marc_leader }, dclone( marc_leader() );
708 warn "## marc_duplicate = ", dump(@$marc_leader, @$marc_record), $/ if ($debug > 1);
709 $marc_record_offset = $#{ $marc_record };
710 warn "## marc_record_offset = $marc_record_offset", $/ if ($debug > 1);
711
712 }
713
714 =head2 marc_remove
715
716 Remove some field or subfield from MARC record.
717
718 marc_remove('200');
719 marc_remove('200','a');
720
721 This will erase field C<200> or C<200^a> from current MARC record.
722
723 marc_remove('*');
724
725 Will remove all fields in current MARC record.
726
727 This is useful after calling C<marc_duplicate> or on it's own (but, you
728 should probably just remove that subfield definition if you are not
729 using C<marc_duplicate>).
730
731 FIXME: support fields < 10.
732
733 =cut
734
735 sub marc_remove {
736 my ($f, $sf) = @_;
737
738 die "marc_remove needs record number" unless defined($f);
739
740 my $marc = $marc_record->[ $marc_record_offset ];
741
742 warn "### marc_remove before = ", dump( $marc ), $/ if ($debug > 2);
743
744 if ($f eq '*') {
745
746 delete( $marc_record->[ $marc_record_offset ] );
747 warn "## full marc_record = ", dump( @{ $marc_record }), $/ if ($debug > 1);
748
749 } else {
750
751 my $i = 0;
752 foreach ( 0 .. $#{ $marc } ) {
753 last unless (defined $marc->[$i]);
754 warn "#### working on ",dump( @{ $marc->[$i] }), $/ if ($debug > 3);
755 if ($marc->[$i]->[0] eq $f) {
756 if (! defined $sf) {
757 # remove whole field
758 splice @$marc, $i, 1;
759 warn "#### slice \@\$marc, $i, 1 = ",dump( @{ $marc }), $/ if ($debug > 3);
760 $i--;
761 } else {
762 foreach my $j ( 0 .. (( $#{ $marc->[$i] } - 3 ) / 2) ) {
763 my $o = ($j * 2) + 3;
764 if ($marc->[$i]->[$o] eq $sf) {
765 # remove subfield
766 splice @{$marc->[$i]}, $o, 2;
767 warn "#### slice \@{\$marc->[$i]}, $o, 2 = ", dump( @{ $marc }), $/ if ($debug > 3);
768 # is record now empty?
769 if ($#{ $marc->[$i] } == 2) {
770 splice @$marc, $i, 1;
771 warn "#### slice \@\$marc, $i, 1 = ", dump( @{ $marc }), $/ if ($debug > 3);
772 $i--;
773 };
774 }
775 }
776 }
777 }
778 $i++;
779 }
780
781 warn "### marc_remove($f", $sf ? ",$sf" : "", ") after = ", dump( $marc ), $/ if ($debug > 2);
782
783 $marc_record->[ $marc_record_offset ] = $marc;
784 }
785
786 warn "## full marc_record = ", dump( @{ $marc_record }), $/ if ($debug > 1);
787 }
788
789 =head2 marc_original_order
790
791 Copy all subfields preserving original order to marc field.
792
793 marc_original_order( marc_field_number, original_input_field_number );
794
795 Please note that field numbers are consistent with other commands (marc
796 field number first), but somewhat counter-intuitive (destination and then
797 source).
798
799 You might want to use this command if you are just renaming subfields or
800 using pre-processing modify_record in C<config.yml> and don't need any
801 post-processing or want to preserve order of original subfields.
802
803
804 =cut
805
806 sub marc_original_order {
807
808 my ($to, $from) = @_;
809 die "marc_original_order needs from and to fields\n" unless ($from && $to);
810
811 return unless defined($rec->{$from});
812
813 my $r = $rec->{$from};
814 die "record field $from isn't array\n" unless (ref($r) eq 'ARRAY');
815
816 my ($i1,$i2) = defined($marc_indicators->{$to}) ? @{ $marc_indicators->{$to} } : (' ',' ');
817 warn "## marc_original_order($to,$from) source = ", dump( $r ),$/ if ($debug > 1);
818
819 foreach my $d (@$r) {
820
821 if (! defined($d->{subfields}) && ref($d->{subfields}) ne 'ARRAY') {
822 warn "# marc_original_order($to,$from): field $from doesn't have subfields specification\n";
823 next;
824 }
825
826 my @sfs = @{ $d->{subfields} };
827
828 die "field $from doesn't have even number of subfields specifications\n" unless($#sfs % 2 == 1);
829
830 warn "#--> d: ",dump($d), "\n#--> sfs: ",dump(@sfs),$/ if ($debug > 2);
831
832 my $m = [ $to, $i1, $i2 ];
833
834 while (my $sf = shift @sfs) {
835
836 warn "#--> sf: ",dump($sf), $/ if ($debug > 2);
837 my $offset = shift @sfs;
838 die "corrupted sufields specification for field $from\n" unless defined($offset);
839
840 my $v;
841 if (ref($d->{$sf}) eq 'ARRAY') {
842 $v = $d->{$sf}->[$offset] if (defined($d->{$sf}->[$offset]));
843 } elsif ($offset == 0) {
844 $v = $d->{$sf};
845 } else {
846 die "field $from subfield '$sf' need occurence $offset which doesn't exist", dump($d->{$sf});
847 }
848 push @$m, ( $sf, $v ) if (defined($v));
849 }
850
851 if ($#{$m} > 2) {
852 push @{ $marc_record->[ $marc_record_offset ] }, $m;
853 }
854 }
855
856 warn "## marc_record = ", dump( $marc_record ),$/ if ($debug > 1);
857 }
858
859 =head2 marc_count
860
861 Return number of MARC records created using L</marc_duplicate>.
862
863 print "created ", marc_count(), " records";
864
865 =cut
866
867 sub marc_count {
868 return $#{ $marc_record };
869 }
870
871
872 =head1 Functions to extract data from input
873
874 This function should be used inside functions to create C<data_structure> described
875 above.
876
877 =head2 _pack_subfields_hash
878
879 @subfields = _pack_subfields_hash( $h );
880 $subfields = _pack_subfields_hash( $h, 1 );
881
882 Return each subfield value in array or pack them all together and return scalar
883 with subfields (denoted by C<^>) and values.
884
885 =cut
886
887 sub _pack_subfields_hash {
888
889 warn "## _pack_subfields_hash( ",dump(@_), " )\n" if ($debug > 1);
890
891 my ($h,$include_subfields) = @_;
892
893 # sanity and ease of use
894 return $h if (ref($h) ne 'HASH');
895
896 if ( defined($h->{subfields}) ) {
897 my $sfs = delete $h->{subfields} || die "no subfields?";
898 my @out;
899 while (@$sfs) {
900 my $sf = shift @$sfs;
901 push @out, '^' . $sf if ($include_subfields);
902 my $o = shift @$sfs;
903 if ($o == 0 && ref( $h->{$sf} ) ne 'ARRAY' ) {
904 # single element subfields are not arrays
905 #warn "====> $sf $o / $#$sfs ", dump( $sfs, $h->{$sf} ), "\n";
906
907 push @out, $h->{$sf};
908 } else {
909 #warn "====> $sf $o / $#$sfs ", dump( $sfs, $h->{$sf} ), "\n";
910 push @out, $h->{$sf}->[$o];
911 }
912 }
913 if ($include_subfields) {
914 return join('', @out);
915 } else {
916 return @out;
917 }
918 } else {
919 if ($include_subfields) {
920 my $out = '';
921 foreach my $sf (sort keys %$h) {
922 if (ref($h->{$sf}) eq 'ARRAY') {
923 $out .= '^' . $sf . join('^' . $sf, @{ $h->{$sf} });
924 } else {
925 $out .= '^' . $sf . $h->{$sf};
926 }
927 }
928 return $out;
929 } else {
930 # FIXME this should probably be in alphabetical order instead of hash order
931 values %{$h};
932 }
933 }
934 }
935
936 =head2 rec1
937
938 Return all values in some field
939
940 @v = rec1('200')
941
942 TODO: order of values is probably same as in source data, need to investigate that
943
944 =cut
945
946 sub rec1 {
947 my $f = shift;
948 warn "rec1($f) = ", dump( $rec->{$f} ), $/ if ($debug > 1);
949 return unless (defined($rec) && defined($rec->{$f}));
950 warn "rec1($f) = ", dump( $rec->{$f} ), $/ if ($debug > 1);
951 if (ref($rec->{$f}) eq 'ARRAY') {
952 my @out;
953 foreach my $h ( @{ $rec->{$f} } ) {
954 if (ref($h) eq 'HASH') {
955 push @out, ( _pack_subfields_hash( $h ) );
956 } else {
957 push @out, $h;
958 }
959 }
960 return @out;
961 } elsif( defined($rec->{$f}) ) {
962 return $rec->{$f};
963 }
964 }
965
966 =head2 rec2
967
968 Return all values in specific field and subfield
969
970 @v = rec2('200','a')
971
972 =cut
973
974 sub rec2 {
975 my $f = shift;
976 return unless (defined($rec && $rec->{$f}));
977 my $sf = shift;
978 warn "rec2($f,$sf) = ", dump( $rec->{$f} ), $/ if ($debug > 1);
979 return map {
980 if (ref($_->{$sf}) eq 'ARRAY') {
981 @{ $_->{$sf} };
982 } else {
983 $_->{$sf};
984 }
985 } grep { ref($_) eq 'HASH' && $_->{$sf} } @{ $rec->{$f} };
986 }
987
988 =head2 rec
989
990 syntaxtic sugar for
991
992 @v = rec('200')
993 @v = rec('200','a')
994
995 If rec() returns just single value, it will
996 return scalar, not array.
997
998 =cut
999
1000 sub rec {
1001 my @out;
1002 if ($#_ == 0) {
1003 @out = rec1(@_);
1004 } elsif ($#_ == 1) {
1005 @out = rec2(@_);
1006 }
1007 if ($#out == 0 && ! wantarray) {
1008 return $out[0];
1009 } elsif (@out) {
1010 return @out;
1011 } else {
1012 return '';
1013 }
1014 }
1015
1016 =head2 regex
1017
1018 Apply regex to some or all values
1019
1020 @v = regex( 's/foo/bar/g', @v );
1021
1022 =cut
1023
1024 sub regex {
1025 my $r = shift;
1026 my @out;
1027 #warn "r: $r\n", dump(\@_);
1028 foreach my $t (@_) {
1029 next unless ($t);
1030 eval "\$t =~ $r";
1031 push @out, $t if ($t && $t ne '');
1032 }
1033 return @out;
1034 }
1035
1036 =head2 prefix
1037
1038 Prefix all values with a string
1039
1040 @v = prefix( 'my_', @v );
1041
1042 =cut
1043
1044 sub prefix {
1045 my $p = shift;
1046 return @_ unless defined( $p );
1047 return map { $p . $_ } grep { defined($_) } @_;
1048 }
1049
1050 =head2 suffix
1051
1052 suffix all values with a string
1053
1054 @v = suffix( '_my', @v );
1055
1056 =cut
1057
1058 sub suffix {
1059 my $s = shift;
1060 return @_ unless defined( $s );
1061 return map { $_ . $s } grep { defined($_) } @_;
1062 }
1063
1064 =head2 surround
1065
1066 surround all values with a two strings
1067
1068 @v = surround( 'prefix_', '_suffix', @v );
1069
1070 =cut
1071
1072 sub surround {
1073 my $p = shift;
1074 my $s = shift;
1075 $p = '' unless defined( $p );
1076 $s = '' unless defined( $s );
1077 return map { $p . $_ . $s } grep { defined($_) } @_;
1078 }
1079
1080 =head2 first
1081
1082 Return first element
1083
1084 $v = first( @v );
1085
1086 =cut
1087
1088 sub first {
1089 my $r = shift;
1090 return $r;
1091 }
1092
1093 =head2 lookup
1094
1095 Consult lookup hashes for some value
1096
1097 @v = lookup(
1098 sub {
1099 'ffkk/peri/mfn'.rec('000')
1100 },
1101 'ffkk','peri','200-a-200-e',
1102 sub {
1103 first(rec(200,'a')).' '.first(rec('200','e'))
1104 }
1105 );
1106
1107 Code like above will be B<automatically generated> using L<WebPAC::Parse> from
1108 normal lookup definition in C<conf/lookup/something.pl> which looks like:
1109
1110 lookup(
1111 # which results to return from record recorded in lookup
1112 sub { 'ffkk/peri/mfn' . rec('000') },
1113 # from which database and input
1114 'ffkk','peri',
1115 # such that following values match
1116 sub { first(rec(200,'a')) . ' ' . first(rec('200','e')) },
1117 # if this part is missing, we will try to match same fields
1118 # from lookup record and current one, or you can override
1119 # which records to use from current record using
1120 sub { rec('900','x') . ' ' . rec('900','y') },
1121 )
1122
1123 You can think about this lookup as SQL (if that helps):
1124
1125 select
1126 sub { what }
1127 from
1128 database, input
1129 where
1130 sub { filter from lookuped record }
1131 having
1132 sub { optional filter on current record }
1133
1134 Easy as pie, right?
1135
1136 =cut
1137
1138 sub lookup {
1139 my ($what, $database, $input, $key, $having) = @_;
1140
1141 confess "lookup needs 5 arguments: what, database, input, key, having\n" unless ($#_ == 4);
1142
1143 warn "## lookup ($database, $input, $key)", $/ if ($debug > 1);
1144 return unless (defined($lookup->{$database}->{$input}->{$key}));
1145
1146 confess "lookup really need load_row_coderef added to data_structure\n" unless ($load_row_coderef);
1147
1148 my $mfns;
1149 my @having = $having->();
1150
1151 warn "## having = ", dump( @having ) if ($debug > 2);
1152
1153 foreach my $h ( @having ) {
1154 if (defined($lookup->{$database}->{$input}->{$key}->{$h})) {
1155 warn "lookup for $database/$input/$key/$h return ",dump($lookup->{$database}->{$input}->{$key}->{$h}),"\n" if ($debug);
1156 $mfns->{$_}++ foreach keys %{ $lookup->{$database}->{$input}->{$key}->{$h} };
1157 }
1158 }
1159
1160 return unless ($mfns);
1161
1162 my @mfns = sort keys %$mfns;
1163
1164 warn "# lookup loading $database/$input/$key mfn ", join(",",@mfns)," having ",dump(@having),"\n" if ($debug);
1165
1166 my $old_rec = $rec;
1167 my @out;
1168
1169 foreach my $mfn (@mfns) {
1170 $rec = $load_row_coderef->( $database, $input, $mfn );
1171
1172 warn "got $database/$input/$mfn = ", dump($rec), $/ if ($debug);
1173
1174 my @vals = $what->();
1175
1176 push @out, ( @vals );
1177
1178 warn "lookup for mfn $mfn returned ", dump(@vals), $/ if ($debug);
1179 }
1180
1181 # if (ref($lookup->{$k}) eq 'ARRAY') {
1182 # return @{ $lookup->{$k} };
1183 # } else {
1184 # return $lookup->{$k};
1185 # }
1186
1187 $rec = $old_rec;
1188
1189 warn "## lookup returns = ", dump(@out), $/ if ($debug);
1190
1191 if ($#out == 0) {
1192 return $out[0];
1193 } else {
1194 return @out;
1195 }
1196 }
1197
1198 =head2 save_into_lookup
1199
1200 Save value into lookup. It associates current database, input
1201 and specific keys with one or more values which will be
1202 associated over MFN.
1203
1204 MFN will be extracted from first occurence current of field 000
1205 in current record, or if it doesn't exist from L<_set_config> C<_mfn>.
1206
1207 my $nr = save_into_lookup($database,$input,$key,sub {
1208 # code which produce one or more values
1209 });
1210
1211 It returns number of items saved.
1212
1213 This function shouldn't be called directly, it's called from code created by
1214 L<WebPAC::Parser>.
1215
1216 =cut
1217
1218 sub save_into_lookup {
1219 my ($database,$input,$key,$coderef) = @_;
1220 die "save_into_lookup needs database" unless defined($database);
1221 die "save_into_lookup needs input" unless defined($input);
1222 die "save_into_lookup needs key" unless defined($key);
1223 die "save_into_lookup needs CODE" unless ( defined($coderef) && ref($coderef) eq 'CODE' );
1224
1225 warn "## save_into_lookup rec = ", dump($rec), " config = ", dump($config), $/ if ($debug > 2);
1226
1227 my $mfn =
1228 defined($rec->{'000'}->[0]) ? $rec->{'000'}->[0] :
1229 defined($config->{_mfn}) ? $config->{_mfn} :
1230 die "mfn not defined or zero";
1231
1232 my $nr = 0;
1233
1234 foreach my $v ( $coderef->() ) {
1235 $lookup->{$database}->{$input}->{$key}->{$v}->{$mfn}++;
1236 warn "# saved lookup $database/$input/$key [$v] $mfn\n" if ($debug > 1);
1237 $nr++;
1238 }
1239
1240 return $nr;
1241 }
1242
1243 =head2 config
1244
1245 Consult config values stored in C<config.yml>
1246
1247 # return database code (key under databases in yaml)
1248 $database_code = config(); # use _ from hash
1249 $database_name = config('name');
1250 $database_input_name = config('input name');
1251
1252 Up to three levels are supported.
1253
1254 =cut
1255
1256 sub config {
1257 return unless ($config);
1258
1259 my $p = shift;
1260
1261 $p ||= '';
1262
1263 my $v;
1264
1265 warn "### getting config($p)\n" if ($debug > 1);
1266
1267 my @p = split(/\s+/,$p);
1268 if ($#p < 0) {
1269 $v = $config->{ '_' }; # special, database code
1270 } else {
1271
1272 my $c = dclone( $config );
1273
1274 foreach my $k (@p) {
1275 warn "### k: $k c = ",dump($c),$/ if ($debug > 1);
1276 if (ref($c) eq 'ARRAY') {
1277 $c = shift @$c;
1278 warn "config($p) taking first occurence of '$k', probably not what you wanted!\n";
1279 last;
1280 }
1281
1282 if (! defined($c->{$k}) ) {
1283 $c = undef;
1284 last;
1285 } else {
1286 $c = $c->{$k};
1287 }
1288 }
1289 $v = $c if ($c);
1290
1291 }
1292
1293 warn "## config( '$p' ) = ",dump( $v ),$/ if ($v && $debug);
1294 warn "config( '$p' ) is empty\n" if (! $v);
1295
1296 return $v;
1297 }
1298
1299 =head2 id
1300
1301 Returns unique id of this record
1302
1303 $id = id();
1304
1305 Returns C<42/2> for 2nd occurence of MFN 42.
1306
1307 =cut
1308
1309 sub id {
1310 my $mfn = $config->{_mfn} || die "no _mfn in config data";
1311 return $mfn . $#{$marc_record} ? $#{$marc_record} + 1 : '';
1312 }
1313
1314 =head2 join_with
1315
1316 Joins walues with some delimiter
1317
1318 $v = join_with(", ", @v);
1319
1320 =cut
1321
1322 sub join_with {
1323 my $d = shift;
1324 warn "### join_with('$d',",dump(@_),")\n" if ($debug > 2);
1325 my $v = join($d, grep { defined($_) && $_ ne '' } @_);
1326 return '' unless defined($v);
1327 return $v;
1328 }
1329
1330 =head2 split_rec_on
1331
1332 Split record subfield on some regex and take one of parts out
1333
1334 $a_before_semi_column =
1335 split_rec_on('200','a', /\s*;\s*/, $part);
1336
1337 C<$part> is optional number of element. First element is
1338 B<1>, not 0!
1339
1340 If there is no C<$part> parameter or C<$part> is 0, this function will
1341 return all values produced by splitting.
1342
1343 =cut
1344
1345 sub split_rec_on {
1346 die "split_rec_on need (fld,sf,regex[,part]" if ($#_ < 2);
1347
1348 my ($fld, $sf, $regex, $part) = @_;
1349 warn "### regex ", ref($regex), $regex, $/ if ($debug > 2);
1350
1351 my @r = rec( $fld, $sf );
1352 my $v = shift @r;
1353 warn "### first rec($fld,$sf) = ",dump($v),$/ if ($debug > 2);
1354
1355 return '' if ( ! defined($v) || $v =~ /^\s*$/);
1356
1357 my @s = split( $regex, $v );
1358 warn "## split_rec_on($fld,$sf,$regex,$part) = ",dump(@s),$/ if ($debug > 1);
1359 if ($part && $part > 0) {
1360 return $s[ $part - 1 ];
1361 } else {
1362 return @s;
1363 }
1364 }
1365
1366 my $hash;
1367
1368 =head2 set
1369
1370 set( key => 'value' );
1371
1372 =cut
1373
1374 sub set {
1375 my ($k,$v) = @_;
1376 warn "## set ( $k => ", dump($v), " )", $/ if ( $debug );
1377 $hash->{$k} = $v;
1378 };
1379
1380 =head2 get
1381
1382 get( 'key' );
1383
1384 =cut
1385
1386 sub get {
1387 my $k = shift || return;
1388 my $v = $hash->{$k};
1389 warn "## get $k = ", dump( $v ), $/ if ( $debug );
1390 return $v;
1391 }
1392
1393 =head2 count
1394
1395 if ( count( @result ) == 1 ) {
1396 # do something if only 1 result is there
1397 }
1398
1399 =cut
1400
1401 sub count {
1402 warn "## count ",dump(@_),$/ if ( $debug );
1403 return @_ . '';
1404 }
1405
1406 # END
1407 1;

  ViewVC Help
Powered by ViewVC 1.1.26