/[webpac2]/trunk/lib/WebPAC/Normalize.pm
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Contents of /trunk/lib/WebPAC/Normalize.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 923 - (show annotations)
Wed Oct 31 00:26:43 2007 UTC (16 years, 5 months ago) by dpavlin
File size: 30619 byte(s)
 r1393@llin:  dpavlin | 2007-10-31 00:52:52 +0100
 added sorted to WebPAC::Normalize to define values which should go
 into sorted lists

1 package WebPAC::Normalize;
2 use Exporter 'import';
3 @EXPORT = qw/
4 _set_rec _set_lookup
5 _set_load_row
6 _get_ds _clean_ds
7 _debug
8 _pack_subfields_hash
9
10 search_display search display sorted
11
12 marc marc_indicators marc_repeatable_subfield
13 marc_compose marc_leader marc_fixed
14 marc_duplicate marc_remove marc_count
15 marc_original_order
16
17 rec1 rec2 rec
18 regex prefix suffix surround
19 first lookup join_with
20 save_into_lookup
21
22 split_rec_on
23
24 get set
25 count
26 /;
27
28 use warnings;
29 use strict;
30
31 #use base qw/WebPAC::Common/;
32 use Data::Dump qw/dump/;
33 use Storable qw/dclone/;
34 use Carp qw/confess/;
35
36 # debugging warn(s)
37 my $debug = 0;
38
39
40 =head1 NAME
41
42 WebPAC::Normalize - describe normalisaton rules using sets
43
44 =cut
45
46 our $VERSION = '0.31';
47
48 =head1 SYNOPSIS
49
50 This module uses C<conf/normalize/*.pl> files to perform normalisation
51 from input records using perl functions which are specialized for set
52 processing.
53
54 Sets are implemented as arrays, and normalisation file is valid perl, which
55 means that you check it's validity before running WebPAC using
56 C<perl -c normalize.pl>.
57
58 Normalisation can generate multiple output normalized data. For now, supported output
59 types (on the left side of definition) are: C<search_display>, C<display>, C<search> and
60 C<marc>.
61
62 =head1 FUNCTIONS
63
64 Functions which start with C<_> are private and used by WebPAC internally.
65 All other functions are available for use within normalisation rules.
66
67 =head2 data_structure
68
69 Return data structure
70
71 my $ds = WebPAC::Normalize::data_structure(
72 lookup => $lookup_hash,
73 row => $row,
74 rules => $normalize_pl_config,
75 marc_encoding => 'utf-8',
76 config => $config,
77 load_row_coderef => sub {
78 my ($database,$input,$mfn) = shift;
79 $store->load_row( database => $database, input => $input, id => $mfn );
80 },
81 );
82
83 Options C<row>, C<rules> and C<log> are mandatory while all
84 other are optional.
85
86 C<load_row_coderef> is closure only used when executing lookups, so they will
87 die if it's not defined.
88
89 This function will B<die> if normalizastion can't be evaled.
90
91 Since this function isn't exported you have to call it with
92 C<WebPAC::Normalize::data_structure>.
93
94 =cut
95
96 my $load_row_coderef;
97
98 sub data_structure {
99 my $arg = {@_};
100
101 die "need row argument" unless ($arg->{row});
102 die "need normalisation argument" unless ($arg->{rules});
103
104 no strict 'subs';
105 _set_lookup( $arg->{lookup} ) if defined($arg->{lookup});
106 _set_rec( $arg->{row} );
107 _set_config( $arg->{config} ) if defined($arg->{config});
108 _clean_ds( %{ $arg } );
109 $load_row_coderef = $arg->{load_row_coderef};
110
111 eval "$arg->{rules}";
112 die "error evaling $arg->{rules}: $@\n" if ($@);
113
114 return _get_ds();
115 }
116
117 =head2 _set_rec
118
119 Set current record hash
120
121 _set_rec( $rec );
122
123 =cut
124
125 my $rec;
126
127 sub _set_rec {
128 $rec = shift or die "no record hash";
129 }
130
131 =head2 _set_config
132
133 Set current config hash
134
135 _set_config( $config );
136
137 Magic keys are:
138
139 =over 4
140
141 =item _
142
143 Code of current database
144
145 =item _mfn
146
147 Current MFN
148
149 =back
150
151 =cut
152
153 my $config;
154
155 sub _set_config {
156 $config = shift;
157 }
158
159 =head2 _get_ds
160
161 Return hash formatted as data structure
162
163 my $ds = _get_ds();
164
165 =cut
166
167 my ($out, $marc_record, $marc_encoding, $marc_repeatable_subfield, $marc_indicators, $marc_leader);
168 my ($marc_record_offset, $marc_fetch_offset) = (0, 0);
169
170 sub _get_ds {
171 return $out;
172 }
173
174 =head2 _clean_ds
175
176 Clean data structure hash for next record
177
178 _clean_ds();
179
180 =cut
181
182 sub _clean_ds {
183 my $a = {@_};
184 ($out,$marc_record, $marc_encoding, $marc_repeatable_subfield, $marc_indicators, $marc_leader) = ();
185 ($marc_record_offset, $marc_fetch_offset) = (0,0);
186 $marc_encoding = $a->{marc_encoding};
187 }
188
189 =head2 _set_lookup
190
191 Set current lookup hash
192
193 _set_lookup( $lookup );
194
195 =cut
196
197 my $lookup;
198
199 sub _set_lookup {
200 $lookup = shift;
201 }
202
203 =head2 _get_lookup
204
205 Get current lookup hash
206
207 my $lookup = _get_lookup();
208
209 =cut
210
211 sub _get_lookup {
212 return $lookup;
213 }
214
215 =head2 _set_load_row
216
217 Setup code reference which will return L<data_structure> from
218 L<WebPAC::Store>
219
220 _set_load_row(sub {
221 my ($database,$input,$mfn) = @_;
222 $store->load_row( database => $database, input => $input, id => $mfn );
223 });
224
225 =cut
226
227 sub _set_load_row {
228 my $coderef = shift;
229 confess "argument isn't CODE" unless ref($coderef) eq 'CODE';
230
231 $load_row_coderef = $coderef;
232 }
233
234 =head2 _get_marc_fields
235
236 Get all fields defined by calls to C<marc>
237
238 $marc->add_fields( WebPAC::Normalize:_get_marc_fields() );
239
240 We are using I<magic> which detect repeatable fields only from
241 sequence of field/subfield data generated by normalization.
242
243 Repeatable field is created when there is second occurence of same subfield or
244 if any of indicators are different.
245
246 This is sane for most cases. Something like:
247
248 900a-1 900b-1 900c-1
249 900a-2 900b-2
250 900a-3
251
252 will be created from any combination of:
253
254 900a-1 900a-2 900a-3 900b-1 900b-2 900c-1
255
256 and following rules:
257
258 marc('900','a', rec('200','a') );
259 marc('900','b', rec('200','b') );
260 marc('900','c', rec('200','c') );
261
262 which might not be what you have in mind. If you need repeatable subfield,
263 define it using C<marc_repeatable_subfield> like this:
264
265 marc_repeatable_subfield('900','a');
266 marc('900','a', rec('200','a') );
267 marc('900','b', rec('200','b') );
268 marc('900','c', rec('200','c') );
269
270 will create:
271
272 900a-1 900a-2 900a-3 900b-1 900c-1
273 900b-2
274
275 There is also support for returning next or specific using:
276
277 while (my $mf = WebPAC::Normalize:_get_marc_fields( fetch_next => 1 ) ) {
278 # do something with $mf
279 }
280
281 will always return fields from next MARC record or
282
283 my $mf = WebPAC::Normalize::_get_marc_fields( offset => 42 );
284
285 will return 42th copy record (if it exists).
286
287 =cut
288
289 my $fetch_pos;
290
291 sub _get_marc_fields {
292
293 my $arg = {@_};
294 warn "### _get_marc_fields arg: ", dump($arg), $/ if ($debug > 2);
295 $fetch_pos = $marc_fetch_offset;
296 if ($arg->{offset}) {
297 $fetch_pos = $arg->{offset};
298 } elsif($arg->{fetch_next}) {
299 $marc_fetch_offset++;
300 }
301
302 return if (! $marc_record || ref($marc_record) ne 'ARRAY');
303
304 warn "### full marc_record = ", dump( @{ $marc_record }), $/ if ($debug > 2);
305
306 my $marc_rec = $marc_record->[ $fetch_pos ];
307
308 warn "## _get_marc_fields (at offset: $fetch_pos) -- marc_record = ", dump( @$marc_rec ), $/ if ($debug > 1);
309
310 return if (! $marc_rec || ref($marc_rec) ne 'ARRAY' || $#{ $marc_rec } < 0);
311
312 # first, sort all existing fields
313 # XXX might not be needed, but modern perl might randomize elements in hash
314 my @sorted_marc_record = sort {
315 $a->[0] . ( $a->[3] || '' ) cmp $b->[0] . ( $b->[3] || '')
316 } @{ $marc_rec };
317
318 @sorted_marc_record = @{ $marc_rec }; ### FIXME disable sorting
319
320 # output marc fields
321 my @m;
322
323 # count unique field-subfields (used for offset when walking to next subfield)
324 my $u;
325 map { $u->{ $_->[0] . ( $_->[3] || '') }++ } @sorted_marc_record;
326
327 if ($debug) {
328 warn "## marc_repeatable_subfield = ", dump( $marc_repeatable_subfield ), $/ if ( $marc_repeatable_subfield );
329 warn "## marc_record[$fetch_pos] = ", dump( $marc_rec ), $/;
330 warn "## sorted_marc_record = ", dump( \@sorted_marc_record ), $/;
331 warn "## subfield count = ", dump( $u ), $/;
332 }
333
334 my $len = $#sorted_marc_record;
335 my $visited;
336 my $i = 0;
337 my $field;
338
339 foreach ( 0 .. $len ) {
340
341 # find next element which isn't visited
342 while ($visited->{$i}) {
343 $i = ($i + 1) % ($len + 1);
344 }
345
346 # mark it visited
347 $visited->{$i}++;
348
349 my $row = dclone( $sorted_marc_record[$i] );
350
351 # field and subfield which is key for
352 # marc_repeatable_subfield and u
353 my $fsf = $row->[0] . ( $row->[3] || '' );
354
355 if ($debug > 1) {
356
357 print "### field so far [", $#$field, "] : ", dump( $field ), " ", $field ? 'T' : 'F', $/;
358 print "### this [$i]: ", dump( $row ),$/;
359 print "### sf: ", $row->[3], " vs ", $field->[3],
360 $marc_repeatable_subfield->{ $row->[0] . $row->[3] } ? ' (repeatable)' : '', $/,
361 if ($#$field >= 0);
362
363 }
364
365 # if field exists
366 if ( $#$field >= 0 ) {
367 if (
368 $row->[0] ne $field->[0] || # field
369 $row->[1] ne $field->[1] || # i1
370 $row->[2] ne $field->[2] # i2
371 ) {
372 push @m, $field;
373 warn "## saved/1 ", dump( $field ),$/ if ($debug);
374 $field = $row;
375
376 } elsif (
377 ( $row->[3] lt $field->[-2] ) # subfield which is not next (e.g. a after c)
378 ||
379 ( $row->[3] eq $field->[-2] && # same subfield, but not repeatable
380 ! $marc_repeatable_subfield->{ $fsf }
381 )
382 ) {
383 push @m, $field;
384 warn "## saved/2 ", dump( $field ),$/ if ($debug);
385 $field = $row;
386
387 } else {
388 # append new subfields to existing field
389 push @$field, ( $row->[3], $row->[4] );
390 }
391 } else {
392 # insert first field
393 $field = $row;
394 }
395
396 if (! $marc_repeatable_subfield->{ $fsf }) {
397 # make step to next subfield
398 $i = ($i + $u->{ $fsf } ) % ($len + 1);
399 }
400 }
401
402 if ($#$field >= 0) {
403 push @m, $field;
404 warn "## saved/3 ", dump( $field ),$/ if ($debug);
405 }
406
407 return \@m;
408 }
409
410 =head2 _get_marc_leader
411
412 Return leader from currently fetched record by L</_get_marc_fields>
413
414 print WebPAC::Normalize::_get_marc_leader();
415
416 =cut
417
418 sub _get_marc_leader {
419 die "no fetch_pos, did you called _get_marc_fields first?" unless ( defined( $fetch_pos ) );
420 return $marc_leader->[ $fetch_pos ];
421 }
422
423 =head2 _debug
424
425 Change level of debug warnings
426
427 _debug( 2 );
428
429 =cut
430
431 sub _debug {
432 my $l = shift;
433 return $debug unless defined($l);
434 warn "debug level $l",$/ if ($l > 0);
435 $debug = $l;
436 }
437
438 =head1 Functions to create C<data_structure>
439
440 Those functions generally have to first in your normalization file.
441
442 =head2 search_display
443
444 Define output for L<search> and L<display> at the same time
445
446 search_display('Title', rec('200','a') );
447
448
449 =cut
450
451 sub search_display {
452 my $name = shift or die "search_display needs name as first argument";
453 my @o = grep { defined($_) && $_ ne '' } @_;
454 return unless (@o);
455 $out->{$name}->{search} = \@o;
456 $out->{$name}->{display} = \@o;
457 }
458
459 =head2 tag
460
461 Old name for L<search_display>, but supported
462
463 =cut
464
465 sub tag {
466 search_display( @_ );
467 }
468
469 =head2 display
470
471 Define output just for I<display>
472
473 @v = display('Title', rec('200','a') );
474
475 =cut
476
477 sub _field {
478 my $type = shift or confess "need type -- BUG?";
479 my $name = shift or confess "needs name as first argument";
480 my @o = grep { defined($_) && $_ ne '' } @_;
481 return unless (@o);
482 $out->{$name}->{$type} = \@o;
483 }
484
485 sub display { _field( 'display', @_ ) }
486
487 =head2 search
488
489 Prepare values just for I<search>
490
491 @v = search('Title', rec('200','a') );
492
493 =cut
494
495 sub search { _field( 'search', @_ ) }
496
497 =head2 sorted
498
499 Insert into lists which will be automatically sorted
500
501 sorted('Title', rec('200','a') );
502
503 =cut
504
505 sub sorted { _field( 'sorted', @_ ) }
506
507
508 =head2 marc_leader
509
510 Setup fields within MARC leader or get leader
511
512 marc_leader('05','c');
513 my $leader = marc_leader();
514
515 =cut
516
517 sub marc_leader {
518 my ($offset,$value) = @_;
519
520 if ($offset) {
521 $marc_leader->[ $marc_record_offset ]->{ $offset } = $value;
522 } else {
523
524 if (defined($marc_leader)) {
525 die "marc_leader not array = ", dump( $marc_leader ) unless (ref($marc_leader) eq 'ARRAY');
526 return $marc_leader->[ $marc_record_offset ];
527 } else {
528 return;
529 }
530 }
531 }
532
533 =head2 marc_fixed
534
535 Create control/indentifier fields with values in fixed positions
536
537 marc_fixed('008', 00, '070402');
538 marc_fixed('008', 39, '|');
539
540 Positions not specified will be filled with spaces (C<0x20>).
541
542 There will be no effort to extend last specified value to full length of
543 field in standard.
544
545 =cut
546
547 sub marc_fixed {
548 my ($f, $pos, $val) = @_;
549 die "need marc(field, position, value)" unless defined($f) && defined($pos);
550
551 confess "need val" unless defined $val;
552
553 my $update = 0;
554
555 map {
556 if ($_->[0] eq $f) {
557 my $old = $_->[1];
558 if (length($old) <= $pos) {
559 $_->[1] .= ' ' x ( $pos - length($old) ) . $val;
560 warn "## marc_fixed($f,$pos,'$val') append '$old' -> '$_->[1]'\n" if ($debug > 1);
561 } else {
562 $_->[1] = substr($old, 0, $pos) . $val . substr($old, $pos + length($val));
563 warn "## marc_fixed($f,$pos,'$val') update '$old' -> '$_->[1]'\n" if ($debug > 1);
564 }
565 $update++;
566 }
567 } @{ $marc_record->[ $marc_record_offset ] };
568
569 if (! $update) {
570 my $v = ' ' x $pos . $val;
571 push @{ $marc_record->[ $marc_record_offset ] }, [ $f, $v ];
572 warn "## marc_fixed($f,$pos,'val') created '$v'\n" if ($debug > 1);
573 }
574 }
575
576 =head2 marc
577
578 Save value for MARC field
579
580 marc('900','a', rec('200','a') );
581 marc('001', rec('000') );
582
583 =cut
584
585 sub marc {
586 my $f = shift or die "marc needs field";
587 die "marc field must be numer" unless ($f =~ /^\d+$/);
588
589 my $sf;
590 if ($f >= 10) {
591 $sf = shift or die "marc needs subfield";
592 }
593
594 foreach (@_) {
595 my $v = $_; # make var read-write for Encode
596 next unless (defined($v) && $v !~ /^\s*$/);
597 my ($i1,$i2) = defined($marc_indicators->{$f}) ? @{ $marc_indicators->{$f} } : (' ',' ');
598 if (defined $sf) {
599 push @{ $marc_record->[ $marc_record_offset ] }, [ $f, $i1, $i2, $sf => $v ];
600 } else {
601 push @{ $marc_record->[ $marc_record_offset ] }, [ $f, $v ];
602 }
603 }
604 }
605
606 =head2 marc_repeatable_subfield
607
608 Save values for MARC repetable subfield
609
610 marc_repeatable_subfield('910', 'z', rec('909') );
611
612 =cut
613
614 sub marc_repeatable_subfield {
615 my ($f,$sf) = @_;
616 die "marc_repeatable_subfield need field and subfield!\n" unless ($f && $sf);
617 $marc_repeatable_subfield->{ $f . $sf }++;
618 marc(@_);
619 }
620
621 =head2 marc_indicators
622
623 Set both indicators for MARC field
624
625 marc_indicators('900', ' ', 1);
626
627 Any indicator value other than C<0-9> will be treated as undefined.
628
629 =cut
630
631 sub marc_indicators {
632 my $f = shift || die "marc_indicators need field!\n";
633 my ($i1,$i2) = @_;
634 die "marc_indicators($f, ...) need i1!\n" unless(defined($i1));
635 die "marc_indicators($f, $i1, ...) need i2!\n" unless(defined($i2));
636
637 $i1 = ' ' if ($i1 !~ /^\d$/);
638 $i2 = ' ' if ($i2 !~ /^\d$/);
639 @{ $marc_indicators->{$f} } = ($i1,$i2);
640 }
641
642 =head2 marc_compose
643
644 Save values for each MARC subfield explicitly
645
646 marc_compose('900',
647 'a', rec('200','a')
648 'b', rec('201','a')
649 'a', rec('200','b')
650 'c', rec('200','c')
651 );
652
653 If you specify C<+> for subfield, value will be appended
654 to previous defined subfield.
655
656 =cut
657
658 sub marc_compose {
659 my $f = shift or die "marc_compose needs field";
660 die "marc_compose field must be numer" unless ($f =~ /^\d+$/);
661
662 my ($i1,$i2) = defined($marc_indicators->{$f}) ? @{ $marc_indicators->{$f} } : (' ',' ');
663 my $m = [ $f, $i1, $i2 ];
664
665 warn "### marc_compose input subfields = ", dump(@_),$/ if ($debug > 2);
666
667 if ($#_ % 2 != 1) {
668 die "ERROR: marc_compose",dump($f,@_)," not valid (must be even).\nDo you need to add first() or join() around some argument?\n";
669 }
670
671 while (@_) {
672 my $sf = shift;
673 my $v = shift;
674
675 next unless (defined($v) && $v !~ /^\s*$/);
676 warn "## ++ marc_compose($f,$sf,$v) ", dump( $m ),$/ if ($debug > 1);
677 if ($sf ne '+') {
678 push @$m, ( $sf, $v );
679 } else {
680 $m->[ $#$m ] .= $v;
681 }
682 }
683
684 warn "## marc_compose current marc = ", dump( $m ),$/ if ($debug > 1);
685
686 push @{ $marc_record->[ $marc_record_offset ] }, $m if ($#{$m} > 2);
687 }
688
689 =head2 marc_duplicate
690
691 Generate copy of current MARC record and continue working on copy
692
693 marc_duplicate();
694
695 Copies can be accessed using C<< _get_marc_fields( fetch_next => 1 ) >> or
696 C<< _get_marc_fields( offset => 42 ) >>.
697
698 =cut
699
700 sub marc_duplicate {
701 my $m = $marc_record->[ -1 ];
702 die "can't duplicate record which isn't defined" unless ($m);
703 push @{ $marc_record }, dclone( $m );
704 push @{ $marc_leader }, dclone( marc_leader() );
705 warn "## marc_duplicate = ", dump(@$marc_leader, @$marc_record), $/ if ($debug > 1);
706 $marc_record_offset = $#{ $marc_record };
707 warn "## marc_record_offset = $marc_record_offset", $/ if ($debug > 1);
708
709 }
710
711 =head2 marc_remove
712
713 Remove some field or subfield from MARC record.
714
715 marc_remove('200');
716 marc_remove('200','a');
717
718 This will erase field C<200> or C<200^a> from current MARC record.
719
720 marc_remove('*');
721
722 Will remove all fields in current MARC record.
723
724 This is useful after calling C<marc_duplicate> or on it's own (but, you
725 should probably just remove that subfield definition if you are not
726 using C<marc_duplicate>).
727
728 FIXME: support fields < 10.
729
730 =cut
731
732 sub marc_remove {
733 my ($f, $sf) = @_;
734
735 die "marc_remove needs record number" unless defined($f);
736
737 my $marc = $marc_record->[ $marc_record_offset ];
738
739 warn "### marc_remove before = ", dump( $marc ), $/ if ($debug > 2);
740
741 if ($f eq '*') {
742
743 delete( $marc_record->[ $marc_record_offset ] );
744 warn "## full marc_record = ", dump( @{ $marc_record }), $/ if ($debug > 1);
745
746 } else {
747
748 my $i = 0;
749 foreach ( 0 .. $#{ $marc } ) {
750 last unless (defined $marc->[$i]);
751 warn "#### working on ",dump( @{ $marc->[$i] }), $/ if ($debug > 3);
752 if ($marc->[$i]->[0] eq $f) {
753 if (! defined $sf) {
754 # remove whole field
755 splice @$marc, $i, 1;
756 warn "#### slice \@\$marc, $i, 1 = ",dump( @{ $marc }), $/ if ($debug > 3);
757 $i--;
758 } else {
759 foreach my $j ( 0 .. (( $#{ $marc->[$i] } - 3 ) / 2) ) {
760 my $o = ($j * 2) + 3;
761 if ($marc->[$i]->[$o] eq $sf) {
762 # remove subfield
763 splice @{$marc->[$i]}, $o, 2;
764 warn "#### slice \@{\$marc->[$i]}, $o, 2 = ", dump( @{ $marc }), $/ if ($debug > 3);
765 # is record now empty?
766 if ($#{ $marc->[$i] } == 2) {
767 splice @$marc, $i, 1;
768 warn "#### slice \@\$marc, $i, 1 = ", dump( @{ $marc }), $/ if ($debug > 3);
769 $i--;
770 };
771 }
772 }
773 }
774 }
775 $i++;
776 }
777
778 warn "### marc_remove($f", $sf ? ",$sf" : "", ") after = ", dump( $marc ), $/ if ($debug > 2);
779
780 $marc_record->[ $marc_record_offset ] = $marc;
781 }
782
783 warn "## full marc_record = ", dump( @{ $marc_record }), $/ if ($debug > 1);
784 }
785
786 =head2 marc_original_order
787
788 Copy all subfields preserving original order to marc field.
789
790 marc_original_order( marc_field_number, original_input_field_number );
791
792 Please note that field numbers are consistent with other commands (marc
793 field number first), but somewhat counter-intuitive (destination and then
794 source).
795
796 You might want to use this command if you are just renaming subfields or
797 using pre-processing modify_record in C<config.yml> and don't need any
798 post-processing or want to preserve order of original subfields.
799
800
801 =cut
802
803 sub marc_original_order {
804
805 my ($to, $from) = @_;
806 die "marc_original_order needs from and to fields\n" unless ($from && $to);
807
808 return unless defined($rec->{$from});
809
810 my $r = $rec->{$from};
811 die "record field $from isn't array\n" unless (ref($r) eq 'ARRAY');
812
813 my ($i1,$i2) = defined($marc_indicators->{$to}) ? @{ $marc_indicators->{$to} } : (' ',' ');
814 warn "## marc_original_order($to,$from) source = ", dump( $r ),$/ if ($debug > 1);
815
816 foreach my $d (@$r) {
817
818 if (! defined($d->{subfields}) && ref($d->{subfields}) ne 'ARRAY') {
819 warn "# marc_original_order($to,$from): field $from doesn't have subfields specification\n";
820 next;
821 }
822
823 my @sfs = @{ $d->{subfields} };
824
825 die "field $from doesn't have even number of subfields specifications\n" unless($#sfs % 2 == 1);
826
827 warn "#--> d: ",dump($d), "\n#--> sfs: ",dump(@sfs),$/ if ($debug > 2);
828
829 my $m = [ $to, $i1, $i2 ];
830
831 while (my $sf = shift @sfs) {
832
833 warn "#--> sf: ",dump($sf), $/ if ($debug > 2);
834 my $offset = shift @sfs;
835 die "corrupted sufields specification for field $from\n" unless defined($offset);
836
837 my $v;
838 if (ref($d->{$sf}) eq 'ARRAY') {
839 $v = $d->{$sf}->[$offset] if (defined($d->{$sf}->[$offset]));
840 } elsif ($offset == 0) {
841 $v = $d->{$sf};
842 } else {
843 die "field $from subfield '$sf' need occurence $offset which doesn't exist", dump($d->{$sf});
844 }
845 push @$m, ( $sf, $v ) if (defined($v));
846 }
847
848 if ($#{$m} > 2) {
849 push @{ $marc_record->[ $marc_record_offset ] }, $m;
850 }
851 }
852
853 warn "## marc_record = ", dump( $marc_record ),$/ if ($debug > 1);
854 }
855
856 =head2 marc_count
857
858 Return number of MARC records created using L</marc_duplicate>.
859
860 print "created ", marc_count(), " records";
861
862 =cut
863
864 sub marc_count {
865 return $#{ $marc_record };
866 }
867
868
869 =head1 Functions to extract data from input
870
871 This function should be used inside functions to create C<data_structure> described
872 above.
873
874 =head2 _pack_subfields_hash
875
876 @subfields = _pack_subfields_hash( $h );
877 $subfields = _pack_subfields_hash( $h, 1 );
878
879 Return each subfield value in array or pack them all together and return scalar
880 with subfields (denoted by C<^>) and values.
881
882 =cut
883
884 sub _pack_subfields_hash {
885
886 warn "## _pack_subfields_hash( ",dump(@_), " )\n" if ($debug > 1);
887
888 my ($h,$include_subfields) = @_;
889
890 # sanity and ease of use
891 return $h if (ref($h) ne 'HASH');
892
893 if ( defined($h->{subfields}) ) {
894 my $sfs = delete $h->{subfields} || die "no subfields?";
895 my @out;
896 while (@$sfs) {
897 my $sf = shift @$sfs;
898 push @out, '^' . $sf if ($include_subfields);
899 my $o = shift @$sfs;
900 if ($o == 0 && ref( $h->{$sf} ) ne 'ARRAY' ) {
901 # single element subfields are not arrays
902 #warn "====> $sf $o / $#$sfs ", dump( $sfs, $h->{$sf} ), "\n";
903
904 push @out, $h->{$sf};
905 } else {
906 #warn "====> $sf $o / $#$sfs ", dump( $sfs, $h->{$sf} ), "\n";
907 push @out, $h->{$sf}->[$o];
908 }
909 }
910 if ($include_subfields) {
911 return join('', @out);
912 } else {
913 return @out;
914 }
915 } else {
916 if ($include_subfields) {
917 my $out = '';
918 foreach my $sf (sort keys %$h) {
919 if (ref($h->{$sf}) eq 'ARRAY') {
920 $out .= '^' . $sf . join('^' . $sf, @{ $h->{$sf} });
921 } else {
922 $out .= '^' . $sf . $h->{$sf};
923 }
924 }
925 return $out;
926 } else {
927 # FIXME this should probably be in alphabetical order instead of hash order
928 values %{$h};
929 }
930 }
931 }
932
933 =head2 rec1
934
935 Return all values in some field
936
937 @v = rec1('200')
938
939 TODO: order of values is probably same as in source data, need to investigate that
940
941 =cut
942
943 sub rec1 {
944 my $f = shift;
945 warn "rec1($f) = ", dump( $rec->{$f} ), $/ if ($debug > 1);
946 return unless (defined($rec) && defined($rec->{$f}));
947 warn "rec1($f) = ", dump( $rec->{$f} ), $/ if ($debug > 1);
948 if (ref($rec->{$f}) eq 'ARRAY') {
949 my @out;
950 foreach my $h ( @{ $rec->{$f} } ) {
951 if (ref($h) eq 'HASH') {
952 push @out, ( _pack_subfields_hash( $h ) );
953 } else {
954 push @out, $h;
955 }
956 }
957 return @out;
958 } elsif( defined($rec->{$f}) ) {
959 return $rec->{$f};
960 }
961 }
962
963 =head2 rec2
964
965 Return all values in specific field and subfield
966
967 @v = rec2('200','a')
968
969 =cut
970
971 sub rec2 {
972 my $f = shift;
973 return unless (defined($rec && $rec->{$f}));
974 my $sf = shift;
975 warn "rec2($f,$sf) = ", dump( $rec->{$f} ), $/ if ($debug > 1);
976 return map {
977 if (ref($_->{$sf}) eq 'ARRAY') {
978 @{ $_->{$sf} };
979 } else {
980 $_->{$sf};
981 }
982 } grep { ref($_) eq 'HASH' && $_->{$sf} } @{ $rec->{$f} };
983 }
984
985 =head2 rec
986
987 syntaxtic sugar for
988
989 @v = rec('200')
990 @v = rec('200','a')
991
992 If rec() returns just single value, it will
993 return scalar, not array.
994
995 =cut
996
997 sub rec {
998 my @out;
999 if ($#_ == 0) {
1000 @out = rec1(@_);
1001 } elsif ($#_ == 1) {
1002 @out = rec2(@_);
1003 }
1004 if ($#out == 0 && ! wantarray) {
1005 return $out[0];
1006 } elsif (@out) {
1007 return @out;
1008 } else {
1009 return '';
1010 }
1011 }
1012
1013 =head2 regex
1014
1015 Apply regex to some or all values
1016
1017 @v = regex( 's/foo/bar/g', @v );
1018
1019 =cut
1020
1021 sub regex {
1022 my $r = shift;
1023 my @out;
1024 #warn "r: $r\n", dump(\@_);
1025 foreach my $t (@_) {
1026 next unless ($t);
1027 eval "\$t =~ $r";
1028 push @out, $t if ($t && $t ne '');
1029 }
1030 return @out;
1031 }
1032
1033 =head2 prefix
1034
1035 Prefix all values with a string
1036
1037 @v = prefix( 'my_', @v );
1038
1039 =cut
1040
1041 sub prefix {
1042 my $p = shift;
1043 return @_ unless defined( $p );
1044 return map { $p . $_ } grep { defined($_) } @_;
1045 }
1046
1047 =head2 suffix
1048
1049 suffix all values with a string
1050
1051 @v = suffix( '_my', @v );
1052
1053 =cut
1054
1055 sub suffix {
1056 my $s = shift;
1057 return @_ unless defined( $s );
1058 return map { $_ . $s } grep { defined($_) } @_;
1059 }
1060
1061 =head2 surround
1062
1063 surround all values with a two strings
1064
1065 @v = surround( 'prefix_', '_suffix', @v );
1066
1067 =cut
1068
1069 sub surround {
1070 my $p = shift;
1071 my $s = shift;
1072 $p = '' unless defined( $p );
1073 $s = '' unless defined( $s );
1074 return map { $p . $_ . $s } grep { defined($_) } @_;
1075 }
1076
1077 =head2 first
1078
1079 Return first element
1080
1081 $v = first( @v );
1082
1083 =cut
1084
1085 sub first {
1086 my $r = shift;
1087 return $r;
1088 }
1089
1090 =head2 lookup
1091
1092 Consult lookup hashes for some value
1093
1094 @v = lookup(
1095 sub {
1096 'ffkk/peri/mfn'.rec('000')
1097 },
1098 'ffkk','peri','200-a-200-e',
1099 sub {
1100 first(rec(200,'a')).' '.first(rec('200','e'))
1101 }
1102 );
1103
1104 Code like above will be B<automatically generated> using L<WebPAC::Parse> from
1105 normal lookup definition in C<conf/lookup/something.pl> which looks like:
1106
1107 lookup(
1108 # which results to return from record recorded in lookup
1109 sub { 'ffkk/peri/mfn' . rec('000') },
1110 # from which database and input
1111 'ffkk','peri',
1112 # such that following values match
1113 sub { first(rec(200,'a')) . ' ' . first(rec('200','e')) },
1114 # if this part is missing, we will try to match same fields
1115 # from lookup record and current one, or you can override
1116 # which records to use from current record using
1117 sub { rec('900','x') . ' ' . rec('900','y') },
1118 )
1119
1120 You can think about this lookup as SQL (if that helps):
1121
1122 select
1123 sub { what }
1124 from
1125 database, input
1126 where
1127 sub { filter from lookuped record }
1128 having
1129 sub { optional filter on current record }
1130
1131 Easy as pie, right?
1132
1133 =cut
1134
1135 sub lookup {
1136 my ($what, $database, $input, $key, $having) = @_;
1137
1138 confess "lookup needs 5 arguments: what, database, input, key, having\n" unless ($#_ == 4);
1139
1140 warn "## lookup ($database, $input, $key)", $/ if ($debug > 1);
1141 return unless (defined($lookup->{$database}->{$input}->{$key}));
1142
1143 confess "lookup really need load_row_coderef added to data_structure\n" unless ($load_row_coderef);
1144
1145 my $mfns;
1146 my @having = $having->();
1147
1148 warn "## having = ", dump( @having ) if ($debug > 2);
1149
1150 foreach my $h ( @having ) {
1151 if (defined($lookup->{$database}->{$input}->{$key}->{$h})) {
1152 warn "lookup for $database/$input/$key/$h return ",dump($lookup->{$database}->{$input}->{$key}->{$h}),"\n" if ($debug);
1153 $mfns->{$_}++ foreach keys %{ $lookup->{$database}->{$input}->{$key}->{$h} };
1154 }
1155 }
1156
1157 return unless ($mfns);
1158
1159 my @mfns = sort keys %$mfns;
1160
1161 warn "# lookup loading $database/$input/$key mfn ", join(",",@mfns)," having ",dump(@having),"\n" if ($debug);
1162
1163 my $old_rec = $rec;
1164 my @out;
1165
1166 foreach my $mfn (@mfns) {
1167 $rec = $load_row_coderef->( $database, $input, $mfn );
1168
1169 warn "got $database/$input/$mfn = ", dump($rec), $/ if ($debug);
1170
1171 my @vals = $what->();
1172
1173 push @out, ( @vals );
1174
1175 warn "lookup for mfn $mfn returned ", dump(@vals), $/ if ($debug);
1176 }
1177
1178 # if (ref($lookup->{$k}) eq 'ARRAY') {
1179 # return @{ $lookup->{$k} };
1180 # } else {
1181 # return $lookup->{$k};
1182 # }
1183
1184 $rec = $old_rec;
1185
1186 warn "## lookup returns = ", dump(@out), $/ if ($debug);
1187
1188 if ($#out == 0) {
1189 return $out[0];
1190 } else {
1191 return @out;
1192 }
1193 }
1194
1195 =head2 save_into_lookup
1196
1197 Save value into lookup. It associates current database, input
1198 and specific keys with one or more values which will be
1199 associated over MFN.
1200
1201 MFN will be extracted from first occurence current of field 000
1202 in current record, or if it doesn't exist from L<_set_config> C<_mfn>.
1203
1204 my $nr = save_into_lookup($database,$input,$key,sub {
1205 # code which produce one or more values
1206 });
1207
1208 It returns number of items saved.
1209
1210 This function shouldn't be called directly, it's called from code created by
1211 L<WebPAC::Parser>.
1212
1213 =cut
1214
1215 sub save_into_lookup {
1216 my ($database,$input,$key,$coderef) = @_;
1217 die "save_into_lookup needs database" unless defined($database);
1218 die "save_into_lookup needs input" unless defined($input);
1219 die "save_into_lookup needs key" unless defined($key);
1220 die "save_into_lookup needs CODE" unless ( defined($coderef) && ref($coderef) eq 'CODE' );
1221
1222 warn "## save_into_lookup rec = ", dump($rec), " config = ", dump($config), $/ if ($debug > 2);
1223
1224 my $mfn =
1225 defined($rec->{'000'}->[0]) ? $rec->{'000'}->[0] :
1226 defined($config->{_mfn}) ? $config->{_mfn} :
1227 die "mfn not defined or zero";
1228
1229 my $nr = 0;
1230
1231 foreach my $v ( $coderef->() ) {
1232 $lookup->{$database}->{$input}->{$key}->{$v}->{$mfn}++;
1233 warn "# saved lookup $database/$input/$key [$v] $mfn\n" if ($debug > 1);
1234 $nr++;
1235 }
1236
1237 return $nr;
1238 }
1239
1240 =head2 config
1241
1242 Consult config values stored in C<config.yml>
1243
1244 # return database code (key under databases in yaml)
1245 $database_code = config(); # use _ from hash
1246 $database_name = config('name');
1247 $database_input_name = config('input name');
1248
1249 Up to three levels are supported.
1250
1251 =cut
1252
1253 sub config {
1254 return unless ($config);
1255
1256 my $p = shift;
1257
1258 $p ||= '';
1259
1260 my $v;
1261
1262 warn "### getting config($p)\n" if ($debug > 1);
1263
1264 my @p = split(/\s+/,$p);
1265 if ($#p < 0) {
1266 $v = $config->{ '_' }; # special, database code
1267 } else {
1268
1269 my $c = dclone( $config );
1270
1271 foreach my $k (@p) {
1272 warn "### k: $k c = ",dump($c),$/ if ($debug > 1);
1273 if (ref($c) eq 'ARRAY') {
1274 $c = shift @$c;
1275 warn "config($p) taking first occurence of '$k', probably not what you wanted!\n";
1276 last;
1277 }
1278
1279 if (! defined($c->{$k}) ) {
1280 $c = undef;
1281 last;
1282 } else {
1283 $c = $c->{$k};
1284 }
1285 }
1286 $v = $c if ($c);
1287
1288 }
1289
1290 warn "## config( '$p' ) = ",dump( $v ),$/ if ($v && $debug);
1291 warn "config( '$p' ) is empty\n" if (! $v);
1292
1293 return $v;
1294 }
1295
1296 =head2 id
1297
1298 Returns unique id of this record
1299
1300 $id = id();
1301
1302 Returns C<42/2> for 2nd occurence of MFN 42.
1303
1304 =cut
1305
1306 sub id {
1307 my $mfn = $config->{_mfn} || die "no _mfn in config data";
1308 return $mfn . $#{$marc_record} ? $#{$marc_record} + 1 : '';
1309 }
1310
1311 =head2 join_with
1312
1313 Joins walues with some delimiter
1314
1315 $v = join_with(", ", @v);
1316
1317 =cut
1318
1319 sub join_with {
1320 my $d = shift;
1321 warn "### join_with('$d',",dump(@_),")\n" if ($debug > 2);
1322 my $v = join($d, grep { defined($_) && $_ ne '' } @_);
1323 return '' unless defined($v);
1324 return $v;
1325 }
1326
1327 =head2 split_rec_on
1328
1329 Split record subfield on some regex and take one of parts out
1330
1331 $a_before_semi_column =
1332 split_rec_on('200','a', /\s*;\s*/, $part);
1333
1334 C<$part> is optional number of element. First element is
1335 B<1>, not 0!
1336
1337 If there is no C<$part> parameter or C<$part> is 0, this function will
1338 return all values produced by splitting.
1339
1340 =cut
1341
1342 sub split_rec_on {
1343 die "split_rec_on need (fld,sf,regex[,part]" if ($#_ < 2);
1344
1345 my ($fld, $sf, $regex, $part) = @_;
1346 warn "### regex ", ref($regex), $regex, $/ if ($debug > 2);
1347
1348 my @r = rec( $fld, $sf );
1349 my $v = shift @r;
1350 warn "### first rec($fld,$sf) = ",dump($v),$/ if ($debug > 2);
1351
1352 return '' if ( ! defined($v) || $v =~ /^\s*$/);
1353
1354 my @s = split( $regex, $v );
1355 warn "## split_rec_on($fld,$sf,$regex,$part) = ",dump(@s),$/ if ($debug > 1);
1356 if ($part && $part > 0) {
1357 return $s[ $part - 1 ];
1358 } else {
1359 return @s;
1360 }
1361 }
1362
1363 my $hash;
1364
1365 =head2 set
1366
1367 set( key => 'value' );
1368
1369 =cut
1370
1371 sub set {
1372 my ($k,$v) = @_;
1373 warn "## set ( $k => ", dump($v), " )", $/ if ( $debug );
1374 $hash->{$k} = $v;
1375 };
1376
1377 =head2 get
1378
1379 get( 'key' );
1380
1381 =cut
1382
1383 sub get {
1384 my $k = shift || return;
1385 my $v = $hash->{$k};
1386 warn "## get $k = ", dump( $v ), $/ if ( $debug );
1387 return $v;
1388 }
1389
1390 =head2 count
1391
1392 if ( count( @result ) == 1 ) {
1393 # do something if only 1 result is there
1394 }
1395
1396 =cut
1397
1398 sub count {
1399 warn "## count ",dump(@_),$/ if ( $debug );
1400 return @_ . '';
1401 }
1402
1403 # END
1404 1;

  ViewVC Help
Powered by ViewVC 1.1.26