/[webpac2]/trunk/lib/WebPAC/Normalize.pm
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Contents of /trunk/lib/WebPAC/Normalize.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 752 - (show annotations)
Sun Oct 8 18:21:26 2006 UTC (17 years, 5 months ago) by dpavlin
File size: 27683 byte(s)
move warns to debug level

1 package WebPAC::Normalize;
2 use Exporter 'import';
3 @EXPORT = qw/
4 _set_rec _set_lookup
5 _set_load_row
6 _get_ds _clean_ds
7 _debug
8 _pack_subfields_hash
9
10 tag search display
11 marc marc_indicators marc_repeatable_subfield
12 marc_compose marc_leader
13 marc_duplicate marc_remove
14 marc_original_order
15
16 rec1 rec2 rec
17 regex prefix suffix surround
18 first lookup join_with
19 save_into_lookup
20
21 split_rec_on
22 /;
23
24 use warnings;
25 use strict;
26
27 #use base qw/WebPAC::Common/;
28 use Data::Dump qw/dump/;
29 use Storable qw/dclone/;
30 use Carp qw/confess/;
31
32 # debugging warn(s)
33 my $debug = 0;
34
35
36 =head1 NAME
37
38 WebPAC::Normalize - describe normalisaton rules using sets
39
40 =head1 VERSION
41
42 Version 0.23
43
44 =cut
45
46 our $VERSION = '0.23';
47
48 =head1 SYNOPSIS
49
50 This module uses C<conf/normalize/*.pl> files to perform normalisation
51 from input records using perl functions which are specialized for set
52 processing.
53
54 Sets are implemented as arrays, and normalisation file is valid perl, which
55 means that you check it's validity before running WebPAC using
56 C<perl -c normalize.pl>.
57
58 Normalisation can generate multiple output normalized data. For now, supported output
59 types (on the left side of definition) are: C<tag>, C<display>, C<search> and
60 C<marc>.
61
62 =head1 FUNCTIONS
63
64 Functions which start with C<_> are private and used by WebPAC internally.
65 All other functions are available for use within normalisation rules.
66
67 =head2 data_structure
68
69 Return data structure
70
71 my $ds = WebPAC::Normalize::data_structure(
72 lookup => $lookup_hash,
73 row => $row,
74 rules => $normalize_pl_config,
75 marc_encoding => 'utf-8',
76 config => $config,
77 load_row_coderef => sub {
78 my ($database,$input,$mfn) = shift;
79 $store->load_row( database => $database, input => $input, id => $mfn );
80 },
81 );
82
83 Options C<row>, C<rules> and C<log> are mandatory while all
84 other are optional.
85
86 C<load_row_coderef> is closure only used when executing lookups, so they will
87 die if it's not defined.
88
89 This function will B<die> if normalizastion can't be evaled.
90
91 Since this function isn't exported you have to call it with
92 C<WebPAC::Normalize::data_structure>.
93
94 =cut
95
96 my $load_row_coderef;
97
98 sub data_structure {
99 my $arg = {@_};
100
101 die "need row argument" unless ($arg->{row});
102 die "need normalisation argument" unless ($arg->{rules});
103
104 no strict 'subs';
105 _set_lookup( $arg->{lookup} ) if defined($arg->{lookup});
106 _set_rec( $arg->{row} );
107 _set_config( $arg->{config} ) if defined($arg->{config});
108 _clean_ds( %{ $arg } );
109 $load_row_coderef = $arg->{load_row_coderef};
110
111 eval "$arg->{rules}";
112 die "error evaling $arg->{rules}: $@\n" if ($@);
113
114 return _get_ds();
115 }
116
117 =head2 _set_rec
118
119 Set current record hash
120
121 _set_rec( $rec );
122
123 =cut
124
125 my $rec;
126
127 sub _set_rec {
128 $rec = shift or die "no record hash";
129 }
130
131 =head2 _set_config
132
133 Set current config hash
134
135 _set_config( $config );
136
137 Magic keys are:
138
139 =over 4
140
141 =item _
142
143 Code of current database
144
145 =item _mfn
146
147 Current MFN
148
149 =back
150
151 =cut
152
153 my $config;
154
155 sub _set_config {
156 $config = shift;
157 }
158
159 =head2 _get_ds
160
161 Return hash formatted as data structure
162
163 my $ds = _get_ds();
164
165 =cut
166
167 my ($out, $marc_record, $marc_encoding, $marc_repeatable_subfield, $marc_indicators, $leader);
168 my ($marc_record_offset, $marc_fetch_offset) = (0, 0);
169
170 sub _get_ds {
171 return $out;
172 }
173
174 =head2 _clean_ds
175
176 Clean data structure hash for next record
177
178 _clean_ds();
179
180 =cut
181
182 sub _clean_ds {
183 my $a = {@_};
184 ($out,$marc_record, $marc_encoding, $marc_repeatable_subfield, $marc_indicators, $leader) = ();
185 ($marc_record_offset, $marc_fetch_offset) = (0,0);
186 $marc_encoding = $a->{marc_encoding};
187 }
188
189 =head2 _set_lookup
190
191 Set current lookup hash
192
193 _set_lookup( $lookup );
194
195 =cut
196
197 my $lookup;
198
199 sub _set_lookup {
200 $lookup = shift;
201 }
202
203 =head2 _get_lookup
204
205 Get current lookup hash
206
207 my $lookup = _get_lookup();
208
209 =cut
210
211 sub _get_lookup {
212 return $lookup;
213 }
214
215 =head2 _set_load_row
216
217 Setup code reference which will return L<data_structure> from
218 L<WebPAC::Store>
219
220 _set_load_row(sub {
221 my ($database,$input,$mfn) = @_;
222 $store->load_row( database => $database, input => $input, id => $mfn );
223 });
224
225 =cut
226
227 sub _set_load_row {
228 my $coderef = shift;
229 confess "argument isn't CODE" unless ref($coderef) eq 'CODE';
230
231 $load_row_coderef = $coderef;
232 }
233
234 =head2 _get_marc_fields
235
236 Get all fields defined by calls to C<marc>
237
238 $marc->add_fields( WebPAC::Normalize:_get_marc_fields() );
239
240 We are using I<magic> which detect repeatable fields only from
241 sequence of field/subfield data generated by normalization.
242
243 Repeatable field is created when there is second occurence of same subfield or
244 if any of indicators are different.
245
246 This is sane for most cases. Something like:
247
248 900a-1 900b-1 900c-1
249 900a-2 900b-2
250 900a-3
251
252 will be created from any combination of:
253
254 900a-1 900a-2 900a-3 900b-1 900b-2 900c-1
255
256 and following rules:
257
258 marc('900','a', rec('200','a') );
259 marc('900','b', rec('200','b') );
260 marc('900','c', rec('200','c') );
261
262 which might not be what you have in mind. If you need repeatable subfield,
263 define it using C<marc_repeatable_subfield> like this:
264
265 marc_repeatable_subfield('900','a');
266 marc('900','a', rec('200','a') );
267 marc('900','b', rec('200','b') );
268 marc('900','c', rec('200','c') );
269
270 will create:
271
272 900a-1 900a-2 900a-3 900b-1 900c-1
273 900b-2
274
275 There is also support for returning next or specific using:
276
277 while (my $mf = WebPAC::Normalize:_get_marc_fields( fetch_next => 1 ) ) {
278 # do something with $mf
279 }
280
281 will always return fields from next MARC record or
282
283 my $mf = WebPAC::Normalize::_get_marc_fields( offset => 42 );
284
285 will return 42th copy record (if it exists).
286
287 =cut
288
289 sub _get_marc_fields {
290
291 my $arg = {@_};
292 warn "### _get_marc_fields arg: ", dump($arg), $/ if ($debug > 2);
293 my $offset = $marc_fetch_offset;
294 if ($arg->{offset}) {
295 $offset = $arg->{offset};
296 } elsif($arg->{fetch_next}) {
297 $marc_fetch_offset++;
298 }
299
300 return if (! $marc_record || ref($marc_record) ne 'ARRAY');
301
302 warn "### full marc_record = ", dump( @{ $marc_record }), $/ if ($debug > 2);
303
304 my $marc_rec = $marc_record->[ $offset ];
305
306 warn "## _get_marc_fields (at offset: $offset) -- marc_record = ", dump( @$marc_rec ), $/ if ($debug > 1);
307
308 return if (! $marc_rec || ref($marc_rec) ne 'ARRAY' || $#{ $marc_rec } < 0);
309
310 # first, sort all existing fields
311 # XXX might not be needed, but modern perl might randomize elements in hash
312 my @sorted_marc_record = sort {
313 $a->[0] . ( $a->[3] || '' ) cmp $b->[0] . ( $b->[3] || '')
314 } @{ $marc_rec };
315
316 @sorted_marc_record = @{ $marc_rec }; ### FIXME disable sorting
317
318 # output marc fields
319 my @m;
320
321 # count unique field-subfields (used for offset when walking to next subfield)
322 my $u;
323 map { $u->{ $_->[0] . ( $_->[3] || '') }++ } @sorted_marc_record;
324
325 if ($debug) {
326 warn "## marc_repeatable_subfield = ", dump( $marc_repeatable_subfield ), $/ if ( $marc_repeatable_subfield );
327 warn "## marc_record[$offset] = ", dump( $marc_rec ), $/;
328 warn "## sorted_marc_record = ", dump( \@sorted_marc_record ), $/;
329 warn "## subfield count = ", dump( $u ), $/;
330 }
331
332 my $len = $#sorted_marc_record;
333 my $visited;
334 my $i = 0;
335 my $field;
336
337 foreach ( 0 .. $len ) {
338
339 # find next element which isn't visited
340 while ($visited->{$i}) {
341 $i = ($i + 1) % ($len + 1);
342 }
343
344 # mark it visited
345 $visited->{$i}++;
346
347 my $row = dclone( $sorted_marc_record[$i] );
348
349 # field and subfield which is key for
350 # marc_repeatable_subfield and u
351 my $fsf = $row->[0] . ( $row->[3] || '' );
352
353 if ($debug > 1) {
354
355 print "### field so far [", $#$field, "] : ", dump( $field ), " ", $field ? 'T' : 'F', $/;
356 print "### this [$i]: ", dump( $row ),$/;
357 print "### sf: ", $row->[3], " vs ", $field->[3],
358 $marc_repeatable_subfield->{ $row->[0] . $row->[3] } ? ' (repeatable)' : '', $/,
359 if ($#$field >= 0);
360
361 }
362
363 # if field exists
364 if ( $#$field >= 0 ) {
365 if (
366 $row->[0] ne $field->[0] || # field
367 $row->[1] ne $field->[1] || # i1
368 $row->[2] ne $field->[2] # i2
369 ) {
370 push @m, $field;
371 warn "## saved/1 ", dump( $field ),$/ if ($debug);
372 $field = $row;
373
374 } elsif (
375 ( $row->[3] lt $field->[-2] ) # subfield which is not next (e.g. a after c)
376 ||
377 ( $row->[3] eq $field->[-2] && # same subfield, but not repeatable
378 ! $marc_repeatable_subfield->{ $fsf }
379 )
380 ) {
381 push @m, $field;
382 warn "## saved/2 ", dump( $field ),$/ if ($debug);
383 $field = $row;
384
385 } else {
386 # append new subfields to existing field
387 push @$field, ( $row->[3], $row->[4] );
388 }
389 } else {
390 # insert first field
391 $field = $row;
392 }
393
394 if (! $marc_repeatable_subfield->{ $fsf }) {
395 # make step to next subfield
396 $i = ($i + $u->{ $fsf } ) % ($len + 1);
397 }
398 }
399
400 if ($#$field >= 0) {
401 push @m, $field;
402 warn "## saved/3 ", dump( $field ),$/ if ($debug);
403 }
404
405 return \@m;
406 }
407
408 =head2 _debug
409
410 Change level of debug warnings
411
412 _debug( 2 );
413
414 =cut
415
416 sub _debug {
417 my $l = shift;
418 return $debug unless defined($l);
419 warn "debug level $l",$/ if ($l > 0);
420 $debug = $l;
421 }
422
423 =head1 Functions to create C<data_structure>
424
425 Those functions generally have to first in your normalization file.
426
427 =head2 tag
428
429 Define new tag for I<search> and I<display>.
430
431 tag('Title', rec('200','a') );
432
433
434 =cut
435
436 sub tag {
437 my $name = shift or die "tag needs name as first argument";
438 my @o = grep { defined($_) && $_ ne '' } @_;
439 return unless (@o);
440 $out->{$name}->{tag} = $name;
441 $out->{$name}->{search} = \@o;
442 $out->{$name}->{display} = \@o;
443 }
444
445 =head2 display
446
447 Define tag just for I<display>
448
449 @v = display('Title', rec('200','a') );
450
451 =cut
452
453 sub display {
454 my $name = shift or die "display needs name as first argument";
455 my @o = grep { defined($_) && $_ ne '' } @_;
456 return unless (@o);
457 $out->{$name}->{tag} = $name;
458 $out->{$name}->{display} = \@o;
459 }
460
461 =head2 search
462
463 Prepare values just for I<search>
464
465 @v = search('Title', rec('200','a') );
466
467 =cut
468
469 sub search {
470 my $name = shift or die "search needs name as first argument";
471 my @o = grep { defined($_) && $_ ne '' } @_;
472 return unless (@o);
473 $out->{$name}->{tag} = $name;
474 $out->{$name}->{search} = \@o;
475 }
476
477 =head2 marc_leader
478
479 Setup fields within MARC leader or get leader
480
481 marc_leader('05','c');
482 my $leader = marc_leader();
483
484 =cut
485
486 sub marc_leader {
487 my ($offset,$value) = @_;
488
489 if ($offset) {
490 $leader->{ $offset } = $value;
491 } else {
492 return $leader;
493 }
494 }
495
496 =head2 marc
497
498 Save value for MARC field
499
500 marc('900','a', rec('200','a') );
501 marc('001', rec('000') );
502
503 =cut
504
505 sub marc {
506 my $f = shift or die "marc needs field";
507 die "marc field must be numer" unless ($f =~ /^\d+$/);
508
509 my $sf;
510 if ($f >= 10) {
511 $sf = shift or die "marc needs subfield";
512 }
513
514 foreach (@_) {
515 my $v = $_; # make var read-write for Encode
516 next unless (defined($v) && $v !~ /^\s*$/);
517 my ($i1,$i2) = defined($marc_indicators->{$f}) ? @{ $marc_indicators->{$f} } : (' ',' ');
518 if (defined $sf) {
519 push @{ $marc_record->[ $marc_record_offset ] }, [ $f, $i1, $i2, $sf => $v ];
520 } else {
521 push @{ $marc_record->[ $marc_record_offset ] }, [ $f, $v ];
522 }
523 }
524 }
525
526 =head2 marc_repeatable_subfield
527
528 Save values for MARC repetable subfield
529
530 marc_repeatable_subfield('910', 'z', rec('909') );
531
532 =cut
533
534 sub marc_repeatable_subfield {
535 my ($f,$sf) = @_;
536 die "marc_repeatable_subfield need field and subfield!\n" unless ($f && $sf);
537 $marc_repeatable_subfield->{ $f . $sf }++;
538 marc(@_);
539 }
540
541 =head2 marc_indicators
542
543 Set both indicators for MARC field
544
545 marc_indicators('900', ' ', 1);
546
547 Any indicator value other than C<0-9> will be treated as undefined.
548
549 =cut
550
551 sub marc_indicators {
552 my $f = shift || die "marc_indicators need field!\n";
553 my ($i1,$i2) = @_;
554 die "marc_indicators($f, ...) need i1!\n" unless(defined($i1));
555 die "marc_indicators($f, $i1, ...) need i2!\n" unless(defined($i2));
556
557 $i1 = ' ' if ($i1 !~ /^\d$/);
558 $i2 = ' ' if ($i2 !~ /^\d$/);
559 @{ $marc_indicators->{$f} } = ($i1,$i2);
560 }
561
562 =head2 marc_compose
563
564 Save values for each MARC subfield explicitly
565
566 marc_compose('900',
567 'a', rec('200','a')
568 'b', rec('201','a')
569 'a', rec('200','b')
570 'c', rec('200','c')
571 );
572
573 If you specify C<+> for subfield, value will be appended
574 to previous defined subfield.
575
576 =cut
577
578 sub marc_compose {
579 my $f = shift or die "marc_compose needs field";
580 die "marc_compose field must be numer" unless ($f =~ /^\d+$/);
581
582 my ($i1,$i2) = defined($marc_indicators->{$f}) ? @{ $marc_indicators->{$f} } : (' ',' ');
583 my $m = [ $f, $i1, $i2 ];
584
585 warn "### marc_compose input subfields = ", dump(@_),$/ if ($debug > 2);
586
587 if ($#_ % 2 != 1) {
588 die "ERROR: marc_compose",dump($f,@_)," not valid (must be even).\nDo you need to add first() or join() around some argument?\n";
589 }
590
591 while (@_) {
592 my $sf = shift;
593 my $v = shift;
594
595 next unless (defined($v) && $v !~ /^\s*$/);
596 warn "## ++ marc_compose($f,$sf,$v) ", dump( $m ),$/ if ($debug > 1);
597 if ($sf ne '+') {
598 push @$m, ( $sf, $v );
599 } else {
600 $m->[ $#$m ] .= $v;
601 }
602 }
603
604 warn "## marc_compose current marc = ", dump( $m ),$/ if ($debug > 1);
605
606 push @{ $marc_record->[ $marc_record_offset ] }, $m if ($#{$m} > 2);
607 }
608
609 =head2 marc_duplicate
610
611 Generate copy of current MARC record and continue working on copy
612
613 marc_duplicate();
614
615 Copies can be accessed using C<< _get_marc_fields( fetch_next => 1 ) >> or
616 C<< _get_marc_fields( offset => 42 ) >>.
617
618 =cut
619
620 sub marc_duplicate {
621 my $m = $marc_record->[ -1 ];
622 die "can't duplicate record which isn't defined" unless ($m);
623 push @{ $marc_record }, dclone( $m );
624 warn "## marc_duplicate = ", dump(@$marc_record), $/ if ($debug > 1);
625 $marc_record_offset = $#{ $marc_record };
626 warn "## marc_record_offset = $marc_record_offset", $/ if ($debug > 1);
627 }
628
629 =head2 marc_remove
630
631 Remove some field or subfield from MARC record.
632
633 marc_remove('200');
634 marc_remove('200','a');
635
636 This will erase field C<200> or C<200^a> from current MARC record.
637
638 This is useful after calling C<marc_duplicate> or on it's own (but, you
639 should probably just remove that subfield definition if you are not
640 using C<marc_duplicate>).
641
642 FIXME: support fields < 10.
643
644 =cut
645
646 sub marc_remove {
647 my ($f, $sf) = @_;
648
649 die "marc_remove needs record number" unless defined($f);
650
651 my $marc = $marc_record->[ $marc_record_offset ];
652
653 warn "### marc_remove before = ", dump( $marc ), $/ if ($debug > 2);
654
655 my $i = 0;
656 foreach ( 0 .. $#{ $marc } ) {
657 last unless (defined $marc->[$i]);
658 warn "#### working on ",dump( @{ $marc->[$i] }), $/ if ($debug > 3);
659 if ($marc->[$i]->[0] eq $f) {
660 if (! defined $sf) {
661 # remove whole field
662 splice @$marc, $i, 1;
663 warn "#### slice \@\$marc, $i, 1 = ",dump( @{ $marc }), $/ if ($debug > 3);
664 $i--;
665 } else {
666 foreach my $j ( 0 .. (( $#{ $marc->[$i] } - 3 ) / 2) ) {
667 my $o = ($j * 2) + 3;
668 if ($marc->[$i]->[$o] eq $sf) {
669 # remove subfield
670 splice @{$marc->[$i]}, $o, 2;
671 warn "#### slice \@{\$marc->[$i]}, $o, 2 = ", dump( @{ $marc }), $/ if ($debug > 3);
672 # is record now empty?
673 if ($#{ $marc->[$i] } == 2) {
674 splice @$marc, $i, 1;
675 warn "#### slice \@\$marc, $i, 1 = ", dump( @{ $marc }), $/ if ($debug > 3);
676 $i--;
677 };
678 }
679 }
680 }
681 }
682 $i++;
683 }
684
685 warn "### marc_remove($f", $sf ? ",$sf" : "", ") after = ", dump( $marc ), $/ if ($debug > 2);
686
687 $marc_record->[ $marc_record_offset ] = $marc;
688
689 warn "## full marc_record = ", dump( @{ $marc_record }), $/ if ($debug > 1);
690 }
691
692 =head2 marc_original_order
693
694 Copy all subfields preserving original order to marc field.
695
696 marc_original_order( marc_field_number, original_input_field_number );
697
698 Please note that field numbers are consistent with other commands (marc
699 field number first), but somewhat counter-intuitive (destination and then
700 source).
701
702 You might want to use this command if you are just renaming subfields or
703 using pre-processing modify_record in C<config.yml> and don't need any
704 post-processing or want to preserve order of original subfields.
705
706
707 =cut
708
709 sub marc_original_order {
710
711 my ($to, $from) = @_;
712 die "marc_original_order needs from and to fields\n" unless ($from && $to);
713
714 return unless defined($rec->{$from});
715
716 my $r = $rec->{$from};
717 die "record field $from isn't array\n" unless (ref($r) eq 'ARRAY');
718
719 my ($i1,$i2) = defined($marc_indicators->{$to}) ? @{ $marc_indicators->{$to} } : (' ',' ');
720 warn "## marc_original_order($to,$from) source = ", dump( $r ),$/ if ($debug > 1);
721
722 foreach my $d (@$r) {
723
724 if (! defined($d->{subfields}) && ref($d->{subfields}) ne 'ARRAY') {
725 warn "# marc_original_order($to,$from): field $from doesn't have subfields specification\n";
726 next;
727 }
728
729 my @sfs = @{ $d->{subfields} };
730
731 die "field $from doesn't have even number of subfields specifications\n" unless($#sfs % 2 == 1);
732
733 warn "#--> d: ",dump($d), "\n#--> sfs: ",dump(@sfs),$/ if ($debug > 2);
734
735 my $m = [ $to, $i1, $i2 ];
736
737 while (my $sf = shift @sfs) {
738
739 warn "#--> sf: ",dump($sf), $/ if ($debug > 2);
740 my $offset = shift @sfs;
741 die "corrupted sufields specification for field $from\n" unless defined($offset);
742
743 my $v;
744 if (ref($d->{$sf}) eq 'ARRAY') {
745 $v = $d->{$sf}->[$offset] if (defined($d->{$sf}->[$offset]));
746 } elsif ($offset == 0) {
747 $v = $d->{$sf};
748 } else {
749 die "field $from subfield '$sf' need occurence $offset which doesn't exist", dump($d->{$sf});
750 }
751 push @$m, ( $sf, $v ) if (defined($v));
752 }
753
754 if ($#{$m} > 2) {
755 push @{ $marc_record->[ $marc_record_offset ] }, $m;
756 }
757 }
758
759 warn "## marc_record = ", dump( $marc_record ),$/ if ($debug > 1);
760 }
761
762
763 =head1 Functions to extract data from input
764
765 This function should be used inside functions to create C<data_structure> described
766 above.
767
768 =head2 _pack_subfields_hash
769
770 @subfields = _pack_subfields_hash( $h );
771 $subfields = _pack_subfields_hash( $h, 1 );
772
773 Return each subfield value in array or pack them all together and return scalar
774 with subfields (denoted by C<^>) and values.
775
776 =cut
777
778 sub _pack_subfields_hash {
779
780 warn "## _pack_subfields_hash( ",dump(@_), " )\n" if ($debug > 1);
781
782 my ($h,$include_subfields) = @_;
783
784 if ( defined($h->{subfields}) ) {
785 my $sfs = delete $h->{subfields} || die "no subfields?";
786 my @out;
787 while (@$sfs) {
788 my $sf = shift @$sfs;
789 push @out, '^' . $sf if ($include_subfields);
790 my $o = shift @$sfs;
791 if ($o == 0 && ref( $h->{$sf} ) ne 'ARRAY' ) {
792 # single element subfields are not arrays
793 #warn "====> $sf $o / $#$sfs ", dump( $sfs, $h->{$sf} ), "\n";
794
795 push @out, $h->{$sf};
796 } else {
797 #warn "====> $sf $o / $#$sfs ", dump( $sfs, $h->{$sf} ), "\n";
798 push @out, $h->{$sf}->[$o];
799 }
800 }
801 if ($include_subfields) {
802 return join('', @out);
803 } else {
804 return @out;
805 }
806 } else {
807 if ($include_subfields) {
808 my $out = '';
809 foreach my $sf (sort keys %$h) {
810 if (ref($h->{$sf}) eq 'ARRAY') {
811 $out .= '^' . $sf . join('^' . $sf, @{ $h->{$sf} });
812 } else {
813 $out .= '^' . $sf . $h->{$sf};
814 }
815 }
816 return $out;
817 } else {
818 # FIXME this should probably be in alphabetical order instead of hash order
819 values %{$h};
820 }
821 }
822 }
823
824 =head2 rec1
825
826 Return all values in some field
827
828 @v = rec1('200')
829
830 TODO: order of values is probably same as in source data, need to investigate that
831
832 =cut
833
834 sub rec1 {
835 my $f = shift;
836 warn "rec1($f) = ", dump( $rec->{$f} ), $/ if ($debug > 1);
837 return unless (defined($rec) && defined($rec->{$f}));
838 warn "rec1($f) = ", dump( $rec->{$f} ), $/ if ($debug > 1);
839 if (ref($rec->{$f}) eq 'ARRAY') {
840 my @out;
841 foreach my $h ( @{ $rec->{$f} } ) {
842 if (ref($h) eq 'HASH') {
843 push @out, ( _pack_subfields_hash( $h ) );
844 } else {
845 push @out, $h;
846 }
847 }
848 return @out;
849 } elsif( defined($rec->{$f}) ) {
850 return $rec->{$f};
851 }
852 }
853
854 =head2 rec2
855
856 Return all values in specific field and subfield
857
858 @v = rec2('200','a')
859
860 =cut
861
862 sub rec2 {
863 my $f = shift;
864 return unless (defined($rec && $rec->{$f}));
865 my $sf = shift;
866 warn "rec2($f,$sf) = ", dump( $rec->{$f} ), $/ if ($debug > 1);
867 return map {
868 if (ref($_->{$sf}) eq 'ARRAY') {
869 @{ $_->{$sf} };
870 } else {
871 $_->{$sf};
872 }
873 } grep { ref($_) eq 'HASH' && $_->{$sf} } @{ $rec->{$f} };
874 }
875
876 =head2 rec
877
878 syntaxtic sugar for
879
880 @v = rec('200')
881 @v = rec('200','a')
882
883 If rec() returns just single value, it will
884 return scalar, not array.
885
886 =cut
887
888 sub rec {
889 my @out;
890 if ($#_ == 0) {
891 @out = rec1(@_);
892 } elsif ($#_ == 1) {
893 @out = rec2(@_);
894 }
895 if ($#out == 0 && ! wantarray) {
896 return $out[0];
897 } elsif (@out) {
898 return @out;
899 } else {
900 return '';
901 }
902 }
903
904 =head2 regex
905
906 Apply regex to some or all values
907
908 @v = regex( 's/foo/bar/g', @v );
909
910 =cut
911
912 sub regex {
913 my $r = shift;
914 my @out;
915 #warn "r: $r\n", dump(\@_);
916 foreach my $t (@_) {
917 next unless ($t);
918 eval "\$t =~ $r";
919 push @out, $t if ($t && $t ne '');
920 }
921 return @out;
922 }
923
924 =head2 prefix
925
926 Prefix all values with a string
927
928 @v = prefix( 'my_', @v );
929
930 =cut
931
932 sub prefix {
933 my $p = shift or return;
934 return map { $p . $_ } grep { defined($_) } @_;
935 }
936
937 =head2 suffix
938
939 suffix all values with a string
940
941 @v = suffix( '_my', @v );
942
943 =cut
944
945 sub suffix {
946 my $s = shift or die "suffix needs string as first argument";
947 return map { $_ . $s } grep { defined($_) } @_;
948 }
949
950 =head2 surround
951
952 surround all values with a two strings
953
954 @v = surround( 'prefix_', '_suffix', @v );
955
956 =cut
957
958 sub surround {
959 my $p = shift or die "surround need prefix as first argument";
960 my $s = shift or die "surround needs suffix as second argument";
961 return map { $p . $_ . $s } grep { defined($_) } @_;
962 }
963
964 =head2 first
965
966 Return first element
967
968 $v = first( @v );
969
970 =cut
971
972 sub first {
973 my $r = shift;
974 return $r;
975 }
976
977 =head2 lookup
978
979 Consult lookup hashes for some value
980
981 @v = lookup(
982 sub {
983 'ffkk/peri/mfn'.rec('000')
984 },
985 'ffkk','peri','200-a-200-e',
986 sub {
987 first(rec(200,'a')).' '.first(rec('200','e'))
988 }
989 );
990
991 Code like above will be B<automatically generated> using L<WebPAC::Parse> from
992 normal lookup definition in C<conf/lookup/something.pl> which looks like:
993
994 lookup(
995 # which results to return from record recorded in lookup
996 sub { 'ffkk/peri/mfn' . rec('000') },
997 # from which database and input
998 'ffkk','peri',
999 # such that following values match
1000 sub { first(rec(200,'a')) . ' ' . first(rec('200','e')) },
1001 # if this part is missing, we will try to match same fields
1002 # from lookup record and current one, or you can override
1003 # which records to use from current record using
1004 sub { rec('900','x') . ' ' . rec('900','y') },
1005 )
1006
1007 You can think about this lookup as SQL (if that helps):
1008
1009 select
1010 sub { what }
1011 from
1012 database, input
1013 where
1014 sub { filter from lookuped record }
1015 having
1016 sub { optional filter on current record }
1017
1018 Easy as pie, right?
1019
1020 =cut
1021
1022 sub lookup {
1023 my ($what, $database, $input, $key, $having) = @_;
1024
1025 confess "lookup needs 5 arguments: what, database, input, key, having" unless ($#_ == 4);
1026
1027 warn "## lookup ($database, $input, $key)", $/ if ($debug > 1);
1028 return unless (defined($lookup->{$database}->{$input}->{$key}));
1029
1030 confess "lookup really need load_row_coderef added to data_structure\n" unless ($load_row_coderef);
1031
1032 my $mfns;
1033 my @having = $having->();
1034
1035 warn "## having = ", dump( @having ) if ($debug > 2);
1036
1037 foreach my $h ( @having ) {
1038 if (defined($lookup->{$database}->{$input}->{$key}->{$h})) {
1039 warn "lookup for $database/$input/$key/$h return ",dump($lookup->{$database}->{$input}->{$key}->{$h}),"\n" if ($debug);
1040 $mfns->{$_}++ foreach keys %{ $lookup->{$database}->{$input}->{$key}->{$h} };
1041 }
1042 }
1043
1044 return unless ($mfns);
1045
1046 my @mfns = sort keys %$mfns;
1047
1048 warn "# lookup loading $database/$input/$key mfn ", join(",",@mfns)," having ",dump(@having),"\n" if ($debug);
1049
1050 my $old_rec = $rec;
1051 my @out;
1052
1053 foreach my $mfn (@mfns) {
1054 $rec = $load_row_coderef->( $database, $input, $mfn );
1055
1056 warn "got $database/$input/$mfn = ", dump($rec), $/ if ($debug);
1057
1058 my @vals = $what->();
1059
1060 push @out, ( @vals );
1061
1062 warn "lookup for mfn $mfn returned ", dump(@vals), $/ if ($debug);
1063 }
1064
1065 # if (ref($lookup->{$k}) eq 'ARRAY') {
1066 # return @{ $lookup->{$k} };
1067 # } else {
1068 # return $lookup->{$k};
1069 # }
1070
1071 $rec = $old_rec;
1072
1073 warn "## lookup returns = ", dump(@out), $/ if ($debug);
1074
1075 if ($#out == 0) {
1076 return $out[0];
1077 } else {
1078 return @out;
1079 }
1080 }
1081
1082 =head2 save_into_lookup
1083
1084 Save value into lookup. It associates current database, input
1085 and specific keys with one or more values which will be
1086 associated over MFN.
1087
1088 MFN will be extracted from first occurence current of field 000
1089 in current record, or if it doesn't exist from L<_set_config> C<_mfn>.
1090
1091 my $nr = save_into_lookup($database,$input,$key,sub {
1092 # code which produce one or more values
1093 });
1094
1095 It returns number of items saved.
1096
1097 This function shouldn't be called directly, it's called from code created by
1098 L<WebPAC::Parser>.
1099
1100 =cut
1101
1102 sub save_into_lookup {
1103 my ($database,$input,$key,$coderef) = @_;
1104 die "save_into_lookup needs database" unless defined($database);
1105 die "save_into_lookup needs input" unless defined($input);
1106 die "save_into_lookup needs key" unless defined($key);
1107 die "save_into_lookup needs CODE" unless ( defined($coderef) && ref($coderef) eq 'CODE' );
1108
1109 warn "## save_into_lookup rec = ", dump($rec), " config = ", dump($config), $/ if ($debug > 2);
1110
1111 my $mfn =
1112 defined($rec->{'000'}->[0]) ? $rec->{'000'}->[0] :
1113 defined($config->{_mfn}) ? $config->{_mfn} :
1114 die "mfn not defined or zero";
1115
1116 my $nr = 0;
1117
1118 foreach my $v ( $coderef->() ) {
1119 $lookup->{$database}->{$input}->{$key}->{$v}->{$mfn}++;
1120 warn "# saved lookup $database/$input/$key [$v] $mfn\n" if ($debug > 1);
1121 $nr++;
1122 }
1123
1124 return $nr;
1125 }
1126
1127 =head2 config
1128
1129 Consult config values stored in C<config.yml>
1130
1131 # return database code (key under databases in yaml)
1132 $database_code = config(); # use _ from hash
1133 $database_name = config('name');
1134 $database_input_name = config('input name');
1135 $tag = config('input normalize tag');
1136
1137 Up to three levels are supported.
1138
1139 =cut
1140
1141 sub config {
1142 return unless ($config);
1143
1144 my $p = shift;
1145
1146 $p ||= '';
1147
1148 my $v;
1149
1150 warn "### getting config($p)\n" if ($debug > 1);
1151
1152 my @p = split(/\s+/,$p);
1153 if ($#p < 0) {
1154 $v = $config->{ '_' }; # special, database code
1155 } else {
1156
1157 my $c = dclone( $config );
1158
1159 foreach my $k (@p) {
1160 warn "### k: $k c = ",dump($c),$/ if ($debug > 1);
1161 if (ref($c) eq 'ARRAY') {
1162 $c = shift @$c;
1163 warn "config($p) taking first occurence of '$k', probably not what you wanted!\n";
1164 last;
1165 }
1166
1167 if (! defined($c->{$k}) ) {
1168 $c = undef;
1169 last;
1170 } else {
1171 $c = $c->{$k};
1172 }
1173 }
1174 $v = $c if ($c);
1175
1176 }
1177
1178 warn "## config( '$p' ) = ",dump( $v ),$/ if ($v && $debug);
1179 warn "config( '$p' ) is empty\n" if (! $v);
1180
1181 return $v;
1182 }
1183
1184 =head2 id
1185
1186 Returns unique id of this record
1187
1188 $id = id();
1189
1190 Returns C<42/2> for 2nd occurence of MFN 42.
1191
1192 =cut
1193
1194 sub id {
1195 my $mfn = $config->{_mfn} || die "no _mfn in config data";
1196 return $mfn . $#{$marc_record} ? $#{$marc_record} + 1 : '';
1197 }
1198
1199 =head2 join_with
1200
1201 Joins walues with some delimiter
1202
1203 $v = join_with(", ", @v);
1204
1205 =cut
1206
1207 sub join_with {
1208 my $d = shift;
1209 warn "### join_with('$d',",dump(@_),")\n" if ($debug > 2);
1210 my $v = join($d, grep { defined($_) && $_ ne '' } @_);
1211 return '' unless defined($v);
1212 return $v;
1213 }
1214
1215 =head2 split_rec_on
1216
1217 Split record subfield on some regex and take one of parts out
1218
1219 $a_before_semi_column =
1220 split_rec_on('200','a', /\s*;\s*/, $part);
1221
1222 C<$part> is optional number of element. First element is
1223 B<1>, not 0!
1224
1225 If there is no C<$part> parameter or C<$part> is 0, this function will
1226 return all values produced by splitting.
1227
1228 =cut
1229
1230 sub split_rec_on {
1231 die "split_rec_on need (fld,sf,regex[,part]" if ($#_ < 2);
1232
1233 my ($fld, $sf, $regex, $part) = @_;
1234 warn "### regex ", ref($regex), $regex, $/ if ($debug > 2);
1235
1236 my @r = rec( $fld, $sf );
1237 my $v = shift @r;
1238 warn "### first rec($fld,$sf) = ",dump($v),$/ if ($debug > 2);
1239
1240 return '' if ( ! defined($v) || $v =~ /^\s*$/);
1241
1242 my @s = split( $regex, $v );
1243 warn "## split_rec_on($fld,$sf,$regex,$part) = ",dump(@s),$/ if ($debug > 1);
1244 if ($part && $part > 0) {
1245 return $s[ $part - 1 ];
1246 } else {
1247 return @s;
1248 }
1249 }
1250
1251 # END
1252 1;

  ViewVC Help
Powered by ViewVC 1.1.26