/[webpac2]/trunk/lib/WebPAC/Normalize.pm
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Contents of /trunk/lib/WebPAC/Normalize.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 786 - (show annotations)
Sun Dec 10 12:45:11 2006 UTC (17 years, 3 months ago) by dpavlin
File size: 28179 byte(s)
added support for marc_remove('*') to clean whole
marc record after marc_dumplicate

1 package WebPAC::Normalize;
2 use Exporter 'import';
3 @EXPORT = qw/
4 _set_rec _set_lookup
5 _set_load_row
6 _get_ds _clean_ds
7 _debug
8 _pack_subfields_hash
9
10 tag search display
11 marc marc_indicators marc_repeatable_subfield
12 marc_compose marc_leader
13 marc_duplicate marc_remove
14 marc_original_order
15
16 rec1 rec2 rec
17 regex prefix suffix surround
18 first lookup join_with
19 save_into_lookup
20
21 split_rec_on
22
23 get set
24 /;
25
26 use warnings;
27 use strict;
28
29 #use base qw/WebPAC::Common/;
30 use Data::Dump qw/dump/;
31 use Storable qw/dclone/;
32 use Carp qw/confess/;
33
34 # debugging warn(s)
35 my $debug = 0;
36
37
38 =head1 NAME
39
40 WebPAC::Normalize - describe normalisaton rules using sets
41
42 =head1 VERSION
43
44 Version 0.25
45
46 =cut
47
48 our $VERSION = '0.25';
49
50 =head1 SYNOPSIS
51
52 This module uses C<conf/normalize/*.pl> files to perform normalisation
53 from input records using perl functions which are specialized for set
54 processing.
55
56 Sets are implemented as arrays, and normalisation file is valid perl, which
57 means that you check it's validity before running WebPAC using
58 C<perl -c normalize.pl>.
59
60 Normalisation can generate multiple output normalized data. For now, supported output
61 types (on the left side of definition) are: C<tag>, C<display>, C<search> and
62 C<marc>.
63
64 =head1 FUNCTIONS
65
66 Functions which start with C<_> are private and used by WebPAC internally.
67 All other functions are available for use within normalisation rules.
68
69 =head2 data_structure
70
71 Return data structure
72
73 my $ds = WebPAC::Normalize::data_structure(
74 lookup => $lookup_hash,
75 row => $row,
76 rules => $normalize_pl_config,
77 marc_encoding => 'utf-8',
78 config => $config,
79 load_row_coderef => sub {
80 my ($database,$input,$mfn) = shift;
81 $store->load_row( database => $database, input => $input, id => $mfn );
82 },
83 );
84
85 Options C<row>, C<rules> and C<log> are mandatory while all
86 other are optional.
87
88 C<load_row_coderef> is closure only used when executing lookups, so they will
89 die if it's not defined.
90
91 This function will B<die> if normalizastion can't be evaled.
92
93 Since this function isn't exported you have to call it with
94 C<WebPAC::Normalize::data_structure>.
95
96 =cut
97
98 my $load_row_coderef;
99
100 sub data_structure {
101 my $arg = {@_};
102
103 die "need row argument" unless ($arg->{row});
104 die "need normalisation argument" unless ($arg->{rules});
105
106 no strict 'subs';
107 _set_lookup( $arg->{lookup} ) if defined($arg->{lookup});
108 _set_rec( $arg->{row} );
109 _set_config( $arg->{config} ) if defined($arg->{config});
110 _clean_ds( %{ $arg } );
111 $load_row_coderef = $arg->{load_row_coderef};
112
113 eval "$arg->{rules}";
114 die "error evaling $arg->{rules}: $@\n" if ($@);
115
116 return _get_ds();
117 }
118
119 =head2 _set_rec
120
121 Set current record hash
122
123 _set_rec( $rec );
124
125 =cut
126
127 my $rec;
128
129 sub _set_rec {
130 $rec = shift or die "no record hash";
131 }
132
133 =head2 _set_config
134
135 Set current config hash
136
137 _set_config( $config );
138
139 Magic keys are:
140
141 =over 4
142
143 =item _
144
145 Code of current database
146
147 =item _mfn
148
149 Current MFN
150
151 =back
152
153 =cut
154
155 my $config;
156
157 sub _set_config {
158 $config = shift;
159 }
160
161 =head2 _get_ds
162
163 Return hash formatted as data structure
164
165 my $ds = _get_ds();
166
167 =cut
168
169 my ($out, $marc_record, $marc_encoding, $marc_repeatable_subfield, $marc_indicators, $leader);
170 my ($marc_record_offset, $marc_fetch_offset) = (0, 0);
171
172 sub _get_ds {
173 return $out;
174 }
175
176 =head2 _clean_ds
177
178 Clean data structure hash for next record
179
180 _clean_ds();
181
182 =cut
183
184 sub _clean_ds {
185 my $a = {@_};
186 ($out,$marc_record, $marc_encoding, $marc_repeatable_subfield, $marc_indicators, $leader) = ();
187 ($marc_record_offset, $marc_fetch_offset) = (0,0);
188 $marc_encoding = $a->{marc_encoding};
189 }
190
191 =head2 _set_lookup
192
193 Set current lookup hash
194
195 _set_lookup( $lookup );
196
197 =cut
198
199 my $lookup;
200
201 sub _set_lookup {
202 $lookup = shift;
203 }
204
205 =head2 _get_lookup
206
207 Get current lookup hash
208
209 my $lookup = _get_lookup();
210
211 =cut
212
213 sub _get_lookup {
214 return $lookup;
215 }
216
217 =head2 _set_load_row
218
219 Setup code reference which will return L<data_structure> from
220 L<WebPAC::Store>
221
222 _set_load_row(sub {
223 my ($database,$input,$mfn) = @_;
224 $store->load_row( database => $database, input => $input, id => $mfn );
225 });
226
227 =cut
228
229 sub _set_load_row {
230 my $coderef = shift;
231 confess "argument isn't CODE" unless ref($coderef) eq 'CODE';
232
233 $load_row_coderef = $coderef;
234 }
235
236 =head2 _get_marc_fields
237
238 Get all fields defined by calls to C<marc>
239
240 $marc->add_fields( WebPAC::Normalize:_get_marc_fields() );
241
242 We are using I<magic> which detect repeatable fields only from
243 sequence of field/subfield data generated by normalization.
244
245 Repeatable field is created when there is second occurence of same subfield or
246 if any of indicators are different.
247
248 This is sane for most cases. Something like:
249
250 900a-1 900b-1 900c-1
251 900a-2 900b-2
252 900a-3
253
254 will be created from any combination of:
255
256 900a-1 900a-2 900a-3 900b-1 900b-2 900c-1
257
258 and following rules:
259
260 marc('900','a', rec('200','a') );
261 marc('900','b', rec('200','b') );
262 marc('900','c', rec('200','c') );
263
264 which might not be what you have in mind. If you need repeatable subfield,
265 define it using C<marc_repeatable_subfield> like this:
266
267 marc_repeatable_subfield('900','a');
268 marc('900','a', rec('200','a') );
269 marc('900','b', rec('200','b') );
270 marc('900','c', rec('200','c') );
271
272 will create:
273
274 900a-1 900a-2 900a-3 900b-1 900c-1
275 900b-2
276
277 There is also support for returning next or specific using:
278
279 while (my $mf = WebPAC::Normalize:_get_marc_fields( fetch_next => 1 ) ) {
280 # do something with $mf
281 }
282
283 will always return fields from next MARC record or
284
285 my $mf = WebPAC::Normalize::_get_marc_fields( offset => 42 );
286
287 will return 42th copy record (if it exists).
288
289 =cut
290
291 sub _get_marc_fields {
292
293 my $arg = {@_};
294 warn "### _get_marc_fields arg: ", dump($arg), $/ if ($debug > 2);
295 my $offset = $marc_fetch_offset;
296 if ($arg->{offset}) {
297 $offset = $arg->{offset};
298 } elsif($arg->{fetch_next}) {
299 $marc_fetch_offset++;
300 }
301
302 return if (! $marc_record || ref($marc_record) ne 'ARRAY');
303
304 warn "### full marc_record = ", dump( @{ $marc_record }), $/ if ($debug > 2);
305
306 my $marc_rec = $marc_record->[ $offset ];
307
308 warn "## _get_marc_fields (at offset: $offset) -- marc_record = ", dump( @$marc_rec ), $/ if ($debug > 1);
309
310 return if (! $marc_rec || ref($marc_rec) ne 'ARRAY' || $#{ $marc_rec } < 0);
311
312 # first, sort all existing fields
313 # XXX might not be needed, but modern perl might randomize elements in hash
314 my @sorted_marc_record = sort {
315 $a->[0] . ( $a->[3] || '' ) cmp $b->[0] . ( $b->[3] || '')
316 } @{ $marc_rec };
317
318 @sorted_marc_record = @{ $marc_rec }; ### FIXME disable sorting
319
320 # output marc fields
321 my @m;
322
323 # count unique field-subfields (used for offset when walking to next subfield)
324 my $u;
325 map { $u->{ $_->[0] . ( $_->[3] || '') }++ } @sorted_marc_record;
326
327 if ($debug) {
328 warn "## marc_repeatable_subfield = ", dump( $marc_repeatable_subfield ), $/ if ( $marc_repeatable_subfield );
329 warn "## marc_record[$offset] = ", dump( $marc_rec ), $/;
330 warn "## sorted_marc_record = ", dump( \@sorted_marc_record ), $/;
331 warn "## subfield count = ", dump( $u ), $/;
332 }
333
334 my $len = $#sorted_marc_record;
335 my $visited;
336 my $i = 0;
337 my $field;
338
339 foreach ( 0 .. $len ) {
340
341 # find next element which isn't visited
342 while ($visited->{$i}) {
343 $i = ($i + 1) % ($len + 1);
344 }
345
346 # mark it visited
347 $visited->{$i}++;
348
349 my $row = dclone( $sorted_marc_record[$i] );
350
351 # field and subfield which is key for
352 # marc_repeatable_subfield and u
353 my $fsf = $row->[0] . ( $row->[3] || '' );
354
355 if ($debug > 1) {
356
357 print "### field so far [", $#$field, "] : ", dump( $field ), " ", $field ? 'T' : 'F', $/;
358 print "### this [$i]: ", dump( $row ),$/;
359 print "### sf: ", $row->[3], " vs ", $field->[3],
360 $marc_repeatable_subfield->{ $row->[0] . $row->[3] } ? ' (repeatable)' : '', $/,
361 if ($#$field >= 0);
362
363 }
364
365 # if field exists
366 if ( $#$field >= 0 ) {
367 if (
368 $row->[0] ne $field->[0] || # field
369 $row->[1] ne $field->[1] || # i1
370 $row->[2] ne $field->[2] # i2
371 ) {
372 push @m, $field;
373 warn "## saved/1 ", dump( $field ),$/ if ($debug);
374 $field = $row;
375
376 } elsif (
377 ( $row->[3] lt $field->[-2] ) # subfield which is not next (e.g. a after c)
378 ||
379 ( $row->[3] eq $field->[-2] && # same subfield, but not repeatable
380 ! $marc_repeatable_subfield->{ $fsf }
381 )
382 ) {
383 push @m, $field;
384 warn "## saved/2 ", dump( $field ),$/ if ($debug);
385 $field = $row;
386
387 } else {
388 # append new subfields to existing field
389 push @$field, ( $row->[3], $row->[4] );
390 }
391 } else {
392 # insert first field
393 $field = $row;
394 }
395
396 if (! $marc_repeatable_subfield->{ $fsf }) {
397 # make step to next subfield
398 $i = ($i + $u->{ $fsf } ) % ($len + 1);
399 }
400 }
401
402 if ($#$field >= 0) {
403 push @m, $field;
404 warn "## saved/3 ", dump( $field ),$/ if ($debug);
405 }
406
407 return \@m;
408 }
409
410 =head2 _debug
411
412 Change level of debug warnings
413
414 _debug( 2 );
415
416 =cut
417
418 sub _debug {
419 my $l = shift;
420 return $debug unless defined($l);
421 warn "debug level $l",$/ if ($l > 0);
422 $debug = $l;
423 }
424
425 =head1 Functions to create C<data_structure>
426
427 Those functions generally have to first in your normalization file.
428
429 =head2 tag
430
431 Define new tag for I<search> and I<display>.
432
433 tag('Title', rec('200','a') );
434
435
436 =cut
437
438 sub tag {
439 my $name = shift or die "tag needs name as first argument";
440 my @o = grep { defined($_) && $_ ne '' } @_;
441 return unless (@o);
442 $out->{$name}->{tag} = $name;
443 $out->{$name}->{search} = \@o;
444 $out->{$name}->{display} = \@o;
445 }
446
447 =head2 display
448
449 Define tag just for I<display>
450
451 @v = display('Title', rec('200','a') );
452
453 =cut
454
455 sub display {
456 my $name = shift or die "display needs name as first argument";
457 my @o = grep { defined($_) && $_ ne '' } @_;
458 return unless (@o);
459 $out->{$name}->{tag} = $name;
460 $out->{$name}->{display} = \@o;
461 }
462
463 =head2 search
464
465 Prepare values just for I<search>
466
467 @v = search('Title', rec('200','a') );
468
469 =cut
470
471 sub search {
472 my $name = shift or die "search needs name as first argument";
473 my @o = grep { defined($_) && $_ ne '' } @_;
474 return unless (@o);
475 $out->{$name}->{tag} = $name;
476 $out->{$name}->{search} = \@o;
477 }
478
479 =head2 marc_leader
480
481 Setup fields within MARC leader or get leader
482
483 marc_leader('05','c');
484 my $leader = marc_leader();
485
486 =cut
487
488 sub marc_leader {
489 my ($offset,$value) = @_;
490
491 if ($offset) {
492 $leader->{ $offset } = $value;
493 } else {
494 return $leader;
495 }
496 }
497
498 =head2 marc
499
500 Save value for MARC field
501
502 marc('900','a', rec('200','a') );
503 marc('001', rec('000') );
504
505 =cut
506
507 sub marc {
508 my $f = shift or die "marc needs field";
509 die "marc field must be numer" unless ($f =~ /^\d+$/);
510
511 my $sf;
512 if ($f >= 10) {
513 $sf = shift or die "marc needs subfield";
514 }
515
516 foreach (@_) {
517 my $v = $_; # make var read-write for Encode
518 next unless (defined($v) && $v !~ /^\s*$/);
519 my ($i1,$i2) = defined($marc_indicators->{$f}) ? @{ $marc_indicators->{$f} } : (' ',' ');
520 if (defined $sf) {
521 push @{ $marc_record->[ $marc_record_offset ] }, [ $f, $i1, $i2, $sf => $v ];
522 } else {
523 push @{ $marc_record->[ $marc_record_offset ] }, [ $f, $v ];
524 }
525 }
526 }
527
528 =head2 marc_repeatable_subfield
529
530 Save values for MARC repetable subfield
531
532 marc_repeatable_subfield('910', 'z', rec('909') );
533
534 =cut
535
536 sub marc_repeatable_subfield {
537 my ($f,$sf) = @_;
538 die "marc_repeatable_subfield need field and subfield!\n" unless ($f && $sf);
539 $marc_repeatable_subfield->{ $f . $sf }++;
540 marc(@_);
541 }
542
543 =head2 marc_indicators
544
545 Set both indicators for MARC field
546
547 marc_indicators('900', ' ', 1);
548
549 Any indicator value other than C<0-9> will be treated as undefined.
550
551 =cut
552
553 sub marc_indicators {
554 my $f = shift || die "marc_indicators need field!\n";
555 my ($i1,$i2) = @_;
556 die "marc_indicators($f, ...) need i1!\n" unless(defined($i1));
557 die "marc_indicators($f, $i1, ...) need i2!\n" unless(defined($i2));
558
559 $i1 = ' ' if ($i1 !~ /^\d$/);
560 $i2 = ' ' if ($i2 !~ /^\d$/);
561 @{ $marc_indicators->{$f} } = ($i1,$i2);
562 }
563
564 =head2 marc_compose
565
566 Save values for each MARC subfield explicitly
567
568 marc_compose('900',
569 'a', rec('200','a')
570 'b', rec('201','a')
571 'a', rec('200','b')
572 'c', rec('200','c')
573 );
574
575 If you specify C<+> for subfield, value will be appended
576 to previous defined subfield.
577
578 =cut
579
580 sub marc_compose {
581 my $f = shift or die "marc_compose needs field";
582 die "marc_compose field must be numer" unless ($f =~ /^\d+$/);
583
584 my ($i1,$i2) = defined($marc_indicators->{$f}) ? @{ $marc_indicators->{$f} } : (' ',' ');
585 my $m = [ $f, $i1, $i2 ];
586
587 warn "### marc_compose input subfields = ", dump(@_),$/ if ($debug > 2);
588
589 if ($#_ % 2 != 1) {
590 die "ERROR: marc_compose",dump($f,@_)," not valid (must be even).\nDo you need to add first() or join() around some argument?\n";
591 }
592
593 while (@_) {
594 my $sf = shift;
595 my $v = shift;
596
597 next unless (defined($v) && $v !~ /^\s*$/);
598 warn "## ++ marc_compose($f,$sf,$v) ", dump( $m ),$/ if ($debug > 1);
599 if ($sf ne '+') {
600 push @$m, ( $sf, $v );
601 } else {
602 $m->[ $#$m ] .= $v;
603 }
604 }
605
606 warn "## marc_compose current marc = ", dump( $m ),$/ if ($debug > 1);
607
608 push @{ $marc_record->[ $marc_record_offset ] }, $m if ($#{$m} > 2);
609 }
610
611 =head2 marc_duplicate
612
613 Generate copy of current MARC record and continue working on copy
614
615 marc_duplicate();
616
617 Copies can be accessed using C<< _get_marc_fields( fetch_next => 1 ) >> or
618 C<< _get_marc_fields( offset => 42 ) >>.
619
620 =cut
621
622 sub marc_duplicate {
623 my $m = $marc_record->[ -1 ];
624 die "can't duplicate record which isn't defined" unless ($m);
625 push @{ $marc_record }, dclone( $m );
626 warn "## marc_duplicate = ", dump(@$marc_record), $/ if ($debug > 1);
627 $marc_record_offset = $#{ $marc_record };
628 warn "## marc_record_offset = $marc_record_offset", $/ if ($debug > 1);
629 }
630
631 =head2 marc_remove
632
633 Remove some field or subfield from MARC record.
634
635 marc_remove('200');
636 marc_remove('200','a');
637
638 This will erase field C<200> or C<200^a> from current MARC record.
639
640 marc_remove('*');
641
642 Will remove all fields in current MARC record.
643
644 This is useful after calling C<marc_duplicate> or on it's own (but, you
645 should probably just remove that subfield definition if you are not
646 using C<marc_duplicate>).
647
648 FIXME: support fields < 10.
649
650 =cut
651
652 sub marc_remove {
653 my ($f, $sf) = @_;
654
655 die "marc_remove needs record number" unless defined($f);
656
657 my $marc = $marc_record->[ $marc_record_offset ];
658
659 warn "### marc_remove before = ", dump( $marc ), $/ if ($debug > 2);
660
661 if ($f eq '*') {
662
663 delete( $marc_record->[ $marc_record_offset ] );
664
665 } else {
666
667 my $i = 0;
668 foreach ( 0 .. $#{ $marc } ) {
669 last unless (defined $marc->[$i]);
670 warn "#### working on ",dump( @{ $marc->[$i] }), $/ if ($debug > 3);
671 if ($marc->[$i]->[0] eq $f) {
672 if (! defined $sf) {
673 # remove whole field
674 splice @$marc, $i, 1;
675 warn "#### slice \@\$marc, $i, 1 = ",dump( @{ $marc }), $/ if ($debug > 3);
676 $i--;
677 } else {
678 foreach my $j ( 0 .. (( $#{ $marc->[$i] } - 3 ) / 2) ) {
679 my $o = ($j * 2) + 3;
680 if ($marc->[$i]->[$o] eq $sf) {
681 # remove subfield
682 splice @{$marc->[$i]}, $o, 2;
683 warn "#### slice \@{\$marc->[$i]}, $o, 2 = ", dump( @{ $marc }), $/ if ($debug > 3);
684 # is record now empty?
685 if ($#{ $marc->[$i] } == 2) {
686 splice @$marc, $i, 1;
687 warn "#### slice \@\$marc, $i, 1 = ", dump( @{ $marc }), $/ if ($debug > 3);
688 $i--;
689 };
690 }
691 }
692 }
693 }
694 $i++;
695 }
696
697 warn "### marc_remove($f", $sf ? ",$sf" : "", ") after = ", dump( $marc ), $/ if ($debug > 2);
698
699 $marc_record->[ $marc_record_offset ] = $marc;
700 }
701
702
703 warn "## full marc_record = ", dump( @{ $marc_record }), $/ if ($debug > 1);
704 }
705
706 =head2 marc_original_order
707
708 Copy all subfields preserving original order to marc field.
709
710 marc_original_order( marc_field_number, original_input_field_number );
711
712 Please note that field numbers are consistent with other commands (marc
713 field number first), but somewhat counter-intuitive (destination and then
714 source).
715
716 You might want to use this command if you are just renaming subfields or
717 using pre-processing modify_record in C<config.yml> and don't need any
718 post-processing or want to preserve order of original subfields.
719
720
721 =cut
722
723 sub marc_original_order {
724
725 my ($to, $from) = @_;
726 die "marc_original_order needs from and to fields\n" unless ($from && $to);
727
728 return unless defined($rec->{$from});
729
730 my $r = $rec->{$from};
731 die "record field $from isn't array\n" unless (ref($r) eq 'ARRAY');
732
733 my ($i1,$i2) = defined($marc_indicators->{$to}) ? @{ $marc_indicators->{$to} } : (' ',' ');
734 warn "## marc_original_order($to,$from) source = ", dump( $r ),$/ if ($debug > 1);
735
736 foreach my $d (@$r) {
737
738 if (! defined($d->{subfields}) && ref($d->{subfields}) ne 'ARRAY') {
739 warn "# marc_original_order($to,$from): field $from doesn't have subfields specification\n";
740 next;
741 }
742
743 my @sfs = @{ $d->{subfields} };
744
745 die "field $from doesn't have even number of subfields specifications\n" unless($#sfs % 2 == 1);
746
747 warn "#--> d: ",dump($d), "\n#--> sfs: ",dump(@sfs),$/ if ($debug > 2);
748
749 my $m = [ $to, $i1, $i2 ];
750
751 while (my $sf = shift @sfs) {
752
753 warn "#--> sf: ",dump($sf), $/ if ($debug > 2);
754 my $offset = shift @sfs;
755 die "corrupted sufields specification for field $from\n" unless defined($offset);
756
757 my $v;
758 if (ref($d->{$sf}) eq 'ARRAY') {
759 $v = $d->{$sf}->[$offset] if (defined($d->{$sf}->[$offset]));
760 } elsif ($offset == 0) {
761 $v = $d->{$sf};
762 } else {
763 die "field $from subfield '$sf' need occurence $offset which doesn't exist", dump($d->{$sf});
764 }
765 push @$m, ( $sf, $v ) if (defined($v));
766 }
767
768 if ($#{$m} > 2) {
769 push @{ $marc_record->[ $marc_record_offset ] }, $m;
770 }
771 }
772
773 warn "## marc_record = ", dump( $marc_record ),$/ if ($debug > 1);
774 }
775
776
777 =head1 Functions to extract data from input
778
779 This function should be used inside functions to create C<data_structure> described
780 above.
781
782 =head2 _pack_subfields_hash
783
784 @subfields = _pack_subfields_hash( $h );
785 $subfields = _pack_subfields_hash( $h, 1 );
786
787 Return each subfield value in array or pack them all together and return scalar
788 with subfields (denoted by C<^>) and values.
789
790 =cut
791
792 sub _pack_subfields_hash {
793
794 warn "## _pack_subfields_hash( ",dump(@_), " )\n" if ($debug > 1);
795
796 my ($h,$include_subfields) = @_;
797
798 if ( defined($h->{subfields}) ) {
799 my $sfs = delete $h->{subfields} || die "no subfields?";
800 my @out;
801 while (@$sfs) {
802 my $sf = shift @$sfs;
803 push @out, '^' . $sf if ($include_subfields);
804 my $o = shift @$sfs;
805 if ($o == 0 && ref( $h->{$sf} ) ne 'ARRAY' ) {
806 # single element subfields are not arrays
807 #warn "====> $sf $o / $#$sfs ", dump( $sfs, $h->{$sf} ), "\n";
808
809 push @out, $h->{$sf};
810 } else {
811 #warn "====> $sf $o / $#$sfs ", dump( $sfs, $h->{$sf} ), "\n";
812 push @out, $h->{$sf}->[$o];
813 }
814 }
815 if ($include_subfields) {
816 return join('', @out);
817 } else {
818 return @out;
819 }
820 } else {
821 if ($include_subfields) {
822 my $out = '';
823 foreach my $sf (sort keys %$h) {
824 if (ref($h->{$sf}) eq 'ARRAY') {
825 $out .= '^' . $sf . join('^' . $sf, @{ $h->{$sf} });
826 } else {
827 $out .= '^' . $sf . $h->{$sf};
828 }
829 }
830 return $out;
831 } else {
832 # FIXME this should probably be in alphabetical order instead of hash order
833 values %{$h};
834 }
835 }
836 }
837
838 =head2 rec1
839
840 Return all values in some field
841
842 @v = rec1('200')
843
844 TODO: order of values is probably same as in source data, need to investigate that
845
846 =cut
847
848 sub rec1 {
849 my $f = shift;
850 warn "rec1($f) = ", dump( $rec->{$f} ), $/ if ($debug > 1);
851 return unless (defined($rec) && defined($rec->{$f}));
852 warn "rec1($f) = ", dump( $rec->{$f} ), $/ if ($debug > 1);
853 if (ref($rec->{$f}) eq 'ARRAY') {
854 my @out;
855 foreach my $h ( @{ $rec->{$f} } ) {
856 if (ref($h) eq 'HASH') {
857 push @out, ( _pack_subfields_hash( $h ) );
858 } else {
859 push @out, $h;
860 }
861 }
862 return @out;
863 } elsif( defined($rec->{$f}) ) {
864 return $rec->{$f};
865 }
866 }
867
868 =head2 rec2
869
870 Return all values in specific field and subfield
871
872 @v = rec2('200','a')
873
874 =cut
875
876 sub rec2 {
877 my $f = shift;
878 return unless (defined($rec && $rec->{$f}));
879 my $sf = shift;
880 warn "rec2($f,$sf) = ", dump( $rec->{$f} ), $/ if ($debug > 1);
881 return map {
882 if (ref($_->{$sf}) eq 'ARRAY') {
883 @{ $_->{$sf} };
884 } else {
885 $_->{$sf};
886 }
887 } grep { ref($_) eq 'HASH' && $_->{$sf} } @{ $rec->{$f} };
888 }
889
890 =head2 rec
891
892 syntaxtic sugar for
893
894 @v = rec('200')
895 @v = rec('200','a')
896
897 If rec() returns just single value, it will
898 return scalar, not array.
899
900 =cut
901
902 sub rec {
903 my @out;
904 if ($#_ == 0) {
905 @out = rec1(@_);
906 } elsif ($#_ == 1) {
907 @out = rec2(@_);
908 }
909 if ($#out == 0 && ! wantarray) {
910 return $out[0];
911 } elsif (@out) {
912 return @out;
913 } else {
914 return '';
915 }
916 }
917
918 =head2 regex
919
920 Apply regex to some or all values
921
922 @v = regex( 's/foo/bar/g', @v );
923
924 =cut
925
926 sub regex {
927 my $r = shift;
928 my @out;
929 #warn "r: $r\n", dump(\@_);
930 foreach my $t (@_) {
931 next unless ($t);
932 eval "\$t =~ $r";
933 push @out, $t if ($t && $t ne '');
934 }
935 return @out;
936 }
937
938 =head2 prefix
939
940 Prefix all values with a string
941
942 @v = prefix( 'my_', @v );
943
944 =cut
945
946 sub prefix {
947 my $p = shift or return;
948 return map { $p . $_ } grep { defined($_) } @_;
949 }
950
951 =head2 suffix
952
953 suffix all values with a string
954
955 @v = suffix( '_my', @v );
956
957 =cut
958
959 sub suffix {
960 my $s = shift or die "suffix needs string as first argument";
961 return map { $_ . $s } grep { defined($_) } @_;
962 }
963
964 =head2 surround
965
966 surround all values with a two strings
967
968 @v = surround( 'prefix_', '_suffix', @v );
969
970 =cut
971
972 sub surround {
973 my $p = shift or die "surround need prefix as first argument";
974 my $s = shift or die "surround needs suffix as second argument";
975 return map { $p . $_ . $s } grep { defined($_) } @_;
976 }
977
978 =head2 first
979
980 Return first element
981
982 $v = first( @v );
983
984 =cut
985
986 sub first {
987 my $r = shift;
988 return $r;
989 }
990
991 =head2 lookup
992
993 Consult lookup hashes for some value
994
995 @v = lookup(
996 sub {
997 'ffkk/peri/mfn'.rec('000')
998 },
999 'ffkk','peri','200-a-200-e',
1000 sub {
1001 first(rec(200,'a')).' '.first(rec('200','e'))
1002 }
1003 );
1004
1005 Code like above will be B<automatically generated> using L<WebPAC::Parse> from
1006 normal lookup definition in C<conf/lookup/something.pl> which looks like:
1007
1008 lookup(
1009 # which results to return from record recorded in lookup
1010 sub { 'ffkk/peri/mfn' . rec('000') },
1011 # from which database and input
1012 'ffkk','peri',
1013 # such that following values match
1014 sub { first(rec(200,'a')) . ' ' . first(rec('200','e')) },
1015 # if this part is missing, we will try to match same fields
1016 # from lookup record and current one, or you can override
1017 # which records to use from current record using
1018 sub { rec('900','x') . ' ' . rec('900','y') },
1019 )
1020
1021 You can think about this lookup as SQL (if that helps):
1022
1023 select
1024 sub { what }
1025 from
1026 database, input
1027 where
1028 sub { filter from lookuped record }
1029 having
1030 sub { optional filter on current record }
1031
1032 Easy as pie, right?
1033
1034 =cut
1035
1036 sub lookup {
1037 my ($what, $database, $input, $key, $having) = @_;
1038
1039 confess "lookup needs 5 arguments: what, database, input, key, having\n" unless ($#_ == 4);
1040
1041 warn "## lookup ($database, $input, $key)", $/ if ($debug > 1);
1042 return unless (defined($lookup->{$database}->{$input}->{$key}));
1043
1044 confess "lookup really need load_row_coderef added to data_structure\n" unless ($load_row_coderef);
1045
1046 my $mfns;
1047 my @having = $having->();
1048
1049 warn "## having = ", dump( @having ) if ($debug > 2);
1050
1051 foreach my $h ( @having ) {
1052 if (defined($lookup->{$database}->{$input}->{$key}->{$h})) {
1053 warn "lookup for $database/$input/$key/$h return ",dump($lookup->{$database}->{$input}->{$key}->{$h}),"\n" if ($debug);
1054 $mfns->{$_}++ foreach keys %{ $lookup->{$database}->{$input}->{$key}->{$h} };
1055 }
1056 }
1057
1058 return unless ($mfns);
1059
1060 my @mfns = sort keys %$mfns;
1061
1062 warn "# lookup loading $database/$input/$key mfn ", join(",",@mfns)," having ",dump(@having),"\n" if ($debug);
1063
1064 my $old_rec = $rec;
1065 my @out;
1066
1067 foreach my $mfn (@mfns) {
1068 $rec = $load_row_coderef->( $database, $input, $mfn );
1069
1070 warn "got $database/$input/$mfn = ", dump($rec), $/ if ($debug);
1071
1072 my @vals = $what->();
1073
1074 push @out, ( @vals );
1075
1076 warn "lookup for mfn $mfn returned ", dump(@vals), $/ if ($debug);
1077 }
1078
1079 # if (ref($lookup->{$k}) eq 'ARRAY') {
1080 # return @{ $lookup->{$k} };
1081 # } else {
1082 # return $lookup->{$k};
1083 # }
1084
1085 $rec = $old_rec;
1086
1087 warn "## lookup returns = ", dump(@out), $/ if ($debug);
1088
1089 if ($#out == 0) {
1090 return $out[0];
1091 } else {
1092 return @out;
1093 }
1094 }
1095
1096 =head2 save_into_lookup
1097
1098 Save value into lookup. It associates current database, input
1099 and specific keys with one or more values which will be
1100 associated over MFN.
1101
1102 MFN will be extracted from first occurence current of field 000
1103 in current record, or if it doesn't exist from L<_set_config> C<_mfn>.
1104
1105 my $nr = save_into_lookup($database,$input,$key,sub {
1106 # code which produce one or more values
1107 });
1108
1109 It returns number of items saved.
1110
1111 This function shouldn't be called directly, it's called from code created by
1112 L<WebPAC::Parser>.
1113
1114 =cut
1115
1116 sub save_into_lookup {
1117 my ($database,$input,$key,$coderef) = @_;
1118 die "save_into_lookup needs database" unless defined($database);
1119 die "save_into_lookup needs input" unless defined($input);
1120 die "save_into_lookup needs key" unless defined($key);
1121 die "save_into_lookup needs CODE" unless ( defined($coderef) && ref($coderef) eq 'CODE' );
1122
1123 warn "## save_into_lookup rec = ", dump($rec), " config = ", dump($config), $/ if ($debug > 2);
1124
1125 my $mfn =
1126 defined($rec->{'000'}->[0]) ? $rec->{'000'}->[0] :
1127 defined($config->{_mfn}) ? $config->{_mfn} :
1128 die "mfn not defined or zero";
1129
1130 my $nr = 0;
1131
1132 foreach my $v ( $coderef->() ) {
1133 $lookup->{$database}->{$input}->{$key}->{$v}->{$mfn}++;
1134 warn "# saved lookup $database/$input/$key [$v] $mfn\n" if ($debug > 1);
1135 $nr++;
1136 }
1137
1138 return $nr;
1139 }
1140
1141 =head2 config
1142
1143 Consult config values stored in C<config.yml>
1144
1145 # return database code (key under databases in yaml)
1146 $database_code = config(); # use _ from hash
1147 $database_name = config('name');
1148 $database_input_name = config('input name');
1149 $tag = config('input normalize tag');
1150
1151 Up to three levels are supported.
1152
1153 =cut
1154
1155 sub config {
1156 return unless ($config);
1157
1158 my $p = shift;
1159
1160 $p ||= '';
1161
1162 my $v;
1163
1164 warn "### getting config($p)\n" if ($debug > 1);
1165
1166 my @p = split(/\s+/,$p);
1167 if ($#p < 0) {
1168 $v = $config->{ '_' }; # special, database code
1169 } else {
1170
1171 my $c = dclone( $config );
1172
1173 foreach my $k (@p) {
1174 warn "### k: $k c = ",dump($c),$/ if ($debug > 1);
1175 if (ref($c) eq 'ARRAY') {
1176 $c = shift @$c;
1177 warn "config($p) taking first occurence of '$k', probably not what you wanted!\n";
1178 last;
1179 }
1180
1181 if (! defined($c->{$k}) ) {
1182 $c = undef;
1183 last;
1184 } else {
1185 $c = $c->{$k};
1186 }
1187 }
1188 $v = $c if ($c);
1189
1190 }
1191
1192 warn "## config( '$p' ) = ",dump( $v ),$/ if ($v && $debug);
1193 warn "config( '$p' ) is empty\n" if (! $v);
1194
1195 return $v;
1196 }
1197
1198 =head2 id
1199
1200 Returns unique id of this record
1201
1202 $id = id();
1203
1204 Returns C<42/2> for 2nd occurence of MFN 42.
1205
1206 =cut
1207
1208 sub id {
1209 my $mfn = $config->{_mfn} || die "no _mfn in config data";
1210 return $mfn . $#{$marc_record} ? $#{$marc_record} + 1 : '';
1211 }
1212
1213 =head2 join_with
1214
1215 Joins walues with some delimiter
1216
1217 $v = join_with(", ", @v);
1218
1219 =cut
1220
1221 sub join_with {
1222 my $d = shift;
1223 warn "### join_with('$d',",dump(@_),")\n" if ($debug > 2);
1224 my $v = join($d, grep { defined($_) && $_ ne '' } @_);
1225 return '' unless defined($v);
1226 return $v;
1227 }
1228
1229 =head2 split_rec_on
1230
1231 Split record subfield on some regex and take one of parts out
1232
1233 $a_before_semi_column =
1234 split_rec_on('200','a', /\s*;\s*/, $part);
1235
1236 C<$part> is optional number of element. First element is
1237 B<1>, not 0!
1238
1239 If there is no C<$part> parameter or C<$part> is 0, this function will
1240 return all values produced by splitting.
1241
1242 =cut
1243
1244 sub split_rec_on {
1245 die "split_rec_on need (fld,sf,regex[,part]" if ($#_ < 2);
1246
1247 my ($fld, $sf, $regex, $part) = @_;
1248 warn "### regex ", ref($regex), $regex, $/ if ($debug > 2);
1249
1250 my @r = rec( $fld, $sf );
1251 my $v = shift @r;
1252 warn "### first rec($fld,$sf) = ",dump($v),$/ if ($debug > 2);
1253
1254 return '' if ( ! defined($v) || $v =~ /^\s*$/);
1255
1256 my @s = split( $regex, $v );
1257 warn "## split_rec_on($fld,$sf,$regex,$part) = ",dump(@s),$/ if ($debug > 1);
1258 if ($part && $part > 0) {
1259 return $s[ $part - 1 ];
1260 } else {
1261 return @s;
1262 }
1263 }
1264
1265 my $hash;
1266
1267 =head2 set
1268
1269 set( key => 'value' );
1270
1271 =cut
1272
1273 sub set {
1274 my ($k,$v) = @_;
1275 warn "## set ( $k => ", dump($v), " )", $/;
1276 $hash->{$k} = $v;
1277 };
1278
1279 =head2 get
1280
1281 get( 'key' );
1282
1283 =cut
1284
1285 sub get {
1286 my $k = shift || return;
1287 my $v = $hash->{$k};
1288 warn "## get $k = ", dump( $v ), $/;
1289 return $v;
1290 }
1291
1292
1293 # END
1294 1;

  ViewVC Help
Powered by ViewVC 1.1.26