/[webpac2]/trunk/lib/WebPAC/Normalize.pm
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Contents of /trunk/lib/WebPAC/Normalize.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 730 - (show annotations)
Fri Sep 29 20:18:34 2006 UTC (17 years, 6 months ago) by dpavlin
File size: 27431 byte(s)
 r1052@llin:  dpavlin | 2006-09-29 22:15:12 +0200
 don't reset lookup and config on each data_structure call

1 package WebPAC::Normalize;
2 use Exporter 'import';
3 @EXPORT = qw/
4 _set_rec _set_lookup
5 _set_load_ds
6 _get_ds _clean_ds
7 _debug
8 _pack_subfields_hash
9
10 tag search display
11 marc marc_indicators marc_repeatable_subfield
12 marc_compose marc_leader
13 marc_duplicate marc_remove
14 marc_original_order
15
16 rec1 rec2 rec
17 regex prefix suffix surround
18 first lookup join_with
19 save_into_lookup
20
21 split_rec_on
22 /;
23
24 use warnings;
25 use strict;
26
27 #use base qw/WebPAC::Common/;
28 use Data::Dump qw/dump/;
29 use Storable qw/dclone/;
30 use Carp qw/confess/;
31
32 # debugging warn(s)
33 my $debug = 0;
34
35
36 =head1 NAME
37
38 WebPAC::Normalize - describe normalisaton rules using sets
39
40 =head1 VERSION
41
42 Version 0.21
43
44 =cut
45
46 our $VERSION = '0.21';
47
48 =head1 SYNOPSIS
49
50 This module uses C<conf/normalize/*.pl> files to perform normalisation
51 from input records using perl functions which are specialized for set
52 processing.
53
54 Sets are implemented as arrays, and normalisation file is valid perl, which
55 means that you check it's validity before running WebPAC using
56 C<perl -c normalize.pl>.
57
58 Normalisation can generate multiple output normalized data. For now, supported output
59 types (on the left side of definition) are: C<tag>, C<display>, C<search> and
60 C<marc>.
61
62 =head1 FUNCTIONS
63
64 Functions which start with C<_> are private and used by WebPAC internally.
65 All other functions are available for use within normalisation rules.
66
67 =head2 data_structure
68
69 Return data structure
70
71 my $ds = WebPAC::Normalize::data_structure(
72 lookup => $lookup_hash,
73 row => $row,
74 rules => $normalize_pl_config,
75 marc_encoding => 'utf-8',
76 config => $config,
77 load_ds_coderef => sub {
78 my ($database,$input,$mfn) = shift;
79 $store->load_ds( database => $database, input => $input, id => $mfn );
80 },
81 );
82
83 Options C<row>, C<rules> and C<log> are mandatory while all
84 other are optional.
85
86 C<load_ds_coderef> is closure only used when executing lookups, so they will
87 die if it's not defined.
88
89 This function will B<die> if normalizastion can't be evaled.
90
91 Since this function isn't exported you have to call it with
92 C<WebPAC::Normalize::data_structure>.
93
94 =cut
95
96 my $load_ds_coderef;
97
98 sub data_structure {
99 my $arg = {@_};
100
101 die "need row argument" unless ($arg->{row});
102 die "need normalisation argument" unless ($arg->{rules});
103
104 no strict 'subs';
105 _set_lookup( $arg->{lookup} ) if defined($arg->{lookup});
106 _set_rec( $arg->{row} );
107 _set_config( $arg->{config} ) if defined($arg->{config});
108 _clean_ds( %{ $arg } );
109 $load_ds_coderef = $arg->{load_ds_coderef};
110
111 eval "$arg->{rules}";
112 die "error evaling $arg->{rules}: $@\n" if ($@);
113
114 return _get_ds();
115 }
116
117 =head2 _set_rec
118
119 Set current record hash
120
121 _set_rec( $rec );
122
123 =cut
124
125 my $rec;
126
127 sub _set_rec {
128 $rec = shift or die "no record hash";
129 }
130
131 =head2 _set_config
132
133 Set current config hash
134
135 _set_config( $config );
136
137 Magic keys are:
138
139 =over 4
140
141 =item _
142
143 Code of current database
144
145 =item _mfn
146
147 Current MFN
148
149 =back
150
151 =cut
152
153 my $config;
154
155 sub _set_config {
156 $config = shift;
157 }
158
159 =head2 _get_ds
160
161 Return hash formatted as data structure
162
163 my $ds = _get_ds();
164
165 =cut
166
167 my ($out, $marc_record, $marc_encoding, $marc_repeatable_subfield, $marc_indicators);
168 my ($marc_record_offset, $marc_fetch_offset) = (0, 0);
169
170 sub _get_ds {
171 return $out;
172 }
173
174 =head2 _clean_ds
175
176 Clean data structure hash for next record
177
178 _clean_ds();
179
180 =cut
181
182 sub _clean_ds {
183 my $a = {@_};
184 ($out,$marc_record, $marc_encoding, $marc_repeatable_subfield, $marc_indicators) = ();
185 ($marc_record_offset, $marc_fetch_offset) = (0,0);
186 $marc_encoding = $a->{marc_encoding};
187 }
188
189 =head2 _set_lookup
190
191 Set current lookup hash
192
193 _set_lookup( $lookup );
194
195 =cut
196
197 my $lookup;
198
199 sub _set_lookup {
200 $lookup = shift;
201 }
202
203 =head2 _get_lookup
204
205 Get current lookup hash
206
207 my $lookup = _get_lookup();
208
209 =cut
210
211 sub _get_lookup {
212 return $lookup;
213 }
214
215 =head2 _set_load_ds
216
217 Setup code reference which will return L<data_structure> from
218 L<WebPAC::Store>
219
220 _set_load_ds(sub {
221 my ($database,$input,$mfn) = @_;
222 $store->load_ds( database => $database, input => $input, id => $mfn );
223 });
224
225 =cut
226
227 sub _set_load_ds {
228 my $coderef = shift;
229 confess "argument isn't CODE" unless ref($coderef) eq 'CODE';
230
231 $load_ds_coderef = $coderef;
232 }
233
234 =head2 _get_marc_fields
235
236 Get all fields defined by calls to C<marc>
237
238 $marc->add_fields( WebPAC::Normalize:_get_marc_fields() );
239
240 We are using I<magic> which detect repeatable fields only from
241 sequence of field/subfield data generated by normalization.
242
243 Repeatable field is created when there is second occurence of same subfield or
244 if any of indicators are different.
245
246 This is sane for most cases. Something like:
247
248 900a-1 900b-1 900c-1
249 900a-2 900b-2
250 900a-3
251
252 will be created from any combination of:
253
254 900a-1 900a-2 900a-3 900b-1 900b-2 900c-1
255
256 and following rules:
257
258 marc('900','a', rec('200','a') );
259 marc('900','b', rec('200','b') );
260 marc('900','c', rec('200','c') );
261
262 which might not be what you have in mind. If you need repeatable subfield,
263 define it using C<marc_repeatable_subfield> like this:
264
265 marc_repeatable_subfield('900','a');
266 marc('900','a', rec('200','a') );
267 marc('900','b', rec('200','b') );
268 marc('900','c', rec('200','c') );
269
270 will create:
271
272 900a-1 900a-2 900a-3 900b-1 900c-1
273 900b-2
274
275 There is also support for returning next or specific using:
276
277 while (my $mf = WebPAC::Normalize:_get_marc_fields( fetch_next => 1 ) ) {
278 # do something with $mf
279 }
280
281 will always return fields from next MARC record or
282
283 my $mf = WebPAC::Normalize::_get_marc_fields( offset => 42 );
284
285 will return 42th copy record (if it exists).
286
287 =cut
288
289 sub _get_marc_fields {
290
291 my $arg = {@_};
292 warn "### _get_marc_fields arg: ", dump($arg), $/ if ($debug > 2);
293 my $offset = $marc_fetch_offset;
294 if ($arg->{offset}) {
295 $offset = $arg->{offset};
296 } elsif($arg->{fetch_next}) {
297 $marc_fetch_offset++;
298 }
299
300 return if (! $marc_record || ref($marc_record) ne 'ARRAY');
301
302 warn "### full marc_record = ", dump( @{ $marc_record }), $/ if ($debug > 2);
303
304 my $marc_rec = $marc_record->[ $offset ];
305
306 warn "## _get_marc_fields (at offset: $offset) -- marc_record = ", dump( @$marc_rec ), $/ if ($debug > 1);
307
308 return if (! $marc_rec || ref($marc_rec) ne 'ARRAY' || $#{ $marc_rec } < 0);
309
310 # first, sort all existing fields
311 # XXX might not be needed, but modern perl might randomize elements in hash
312 my @sorted_marc_record = sort {
313 $a->[0] . ( $a->[3] || '' ) cmp $b->[0] . ( $b->[3] || '')
314 } @{ $marc_rec };
315
316 @sorted_marc_record = @{ $marc_rec }; ### FIXME disable sorting
317
318 # output marc fields
319 my @m;
320
321 # count unique field-subfields (used for offset when walking to next subfield)
322 my $u;
323 map { $u->{ $_->[0] . ( $_->[3] || '') }++ } @sorted_marc_record;
324
325 if ($debug) {
326 warn "## marc_repeatable_subfield = ", dump( $marc_repeatable_subfield ), $/ if ( $marc_repeatable_subfield );
327 warn "## marc_record[$offset] = ", dump( $marc_rec ), $/;
328 warn "## sorted_marc_record = ", dump( \@sorted_marc_record ), $/;
329 warn "## subfield count = ", dump( $u ), $/;
330 }
331
332 my $len = $#sorted_marc_record;
333 my $visited;
334 my $i = 0;
335 my $field;
336
337 foreach ( 0 .. $len ) {
338
339 # find next element which isn't visited
340 while ($visited->{$i}) {
341 $i = ($i + 1) % ($len + 1);
342 }
343
344 # mark it visited
345 $visited->{$i}++;
346
347 my $row = dclone( $sorted_marc_record[$i] );
348
349 # field and subfield which is key for
350 # marc_repeatable_subfield and u
351 my $fsf = $row->[0] . ( $row->[3] || '' );
352
353 if ($debug > 1) {
354
355 print "### field so far [", $#$field, "] : ", dump( $field ), " ", $field ? 'T' : 'F', $/;
356 print "### this [$i]: ", dump( $row ),$/;
357 print "### sf: ", $row->[3], " vs ", $field->[3],
358 $marc_repeatable_subfield->{ $row->[0] . $row->[3] } ? ' (repeatable)' : '', $/,
359 if ($#$field >= 0);
360
361 }
362
363 # if field exists
364 if ( $#$field >= 0 ) {
365 if (
366 $row->[0] ne $field->[0] || # field
367 $row->[1] ne $field->[1] || # i1
368 $row->[2] ne $field->[2] # i2
369 ) {
370 push @m, $field;
371 warn "## saved/1 ", dump( $field ),$/ if ($debug);
372 $field = $row;
373
374 } elsif (
375 ( $row->[3] lt $field->[-2] ) # subfield which is not next (e.g. a after c)
376 ||
377 ( $row->[3] eq $field->[-2] && # same subfield, but not repeatable
378 ! $marc_repeatable_subfield->{ $fsf }
379 )
380 ) {
381 push @m, $field;
382 warn "## saved/2 ", dump( $field ),$/ if ($debug);
383 $field = $row;
384
385 } else {
386 # append new subfields to existing field
387 push @$field, ( $row->[3], $row->[4] );
388 }
389 } else {
390 # insert first field
391 $field = $row;
392 }
393
394 if (! $marc_repeatable_subfield->{ $fsf }) {
395 # make step to next subfield
396 $i = ($i + $u->{ $fsf } ) % ($len + 1);
397 }
398 }
399
400 if ($#$field >= 0) {
401 push @m, $field;
402 warn "## saved/3 ", dump( $field ),$/ if ($debug);
403 }
404
405 return \@m;
406 }
407
408 =head2 _debug
409
410 Change level of debug warnings
411
412 _debug( 2 );
413
414 =cut
415
416 sub _debug {
417 my $l = shift;
418 return $debug unless defined($l);
419 warn "debug level $l",$/ if ($l > 0);
420 $debug = $l;
421 }
422
423 =head1 Functions to create C<data_structure>
424
425 Those functions generally have to first in your normalization file.
426
427 =head2 tag
428
429 Define new tag for I<search> and I<display>.
430
431 tag('Title', rec('200','a') );
432
433
434 =cut
435
436 sub tag {
437 my $name = shift or die "tag needs name as first argument";
438 my @o = grep { defined($_) && $_ ne '' } @_;
439 return unless (@o);
440 $out->{$name}->{tag} = $name;
441 $out->{$name}->{search} = \@o;
442 $out->{$name}->{display} = \@o;
443 }
444
445 =head2 display
446
447 Define tag just for I<display>
448
449 @v = display('Title', rec('200','a') );
450
451 =cut
452
453 sub display {
454 my $name = shift or die "display needs name as first argument";
455 my @o = grep { defined($_) && $_ ne '' } @_;
456 return unless (@o);
457 $out->{$name}->{tag} = $name;
458 $out->{$name}->{display} = \@o;
459 }
460
461 =head2 search
462
463 Prepare values just for I<search>
464
465 @v = search('Title', rec('200','a') );
466
467 =cut
468
469 sub search {
470 my $name = shift or die "search needs name as first argument";
471 my @o = grep { defined($_) && $_ ne '' } @_;
472 return unless (@o);
473 $out->{$name}->{tag} = $name;
474 $out->{$name}->{search} = \@o;
475 }
476
477 =head2 marc_leader
478
479 Setup fields within MARC leader or get leader
480
481 marc_leader('05','c');
482 my $leader = marc_leader();
483
484 =cut
485
486 sub marc_leader {
487 my ($offset,$value) = @_;
488
489 if ($offset) {
490 $out->{' leader'}->{ $offset } = $value;
491 } else {
492 return $out->{' leader'};
493 }
494 }
495
496 =head2 marc
497
498 Save value for MARC field
499
500 marc('900','a', rec('200','a') );
501 marc('001', rec('000') );
502
503 =cut
504
505 sub marc {
506 my $f = shift or die "marc needs field";
507 die "marc field must be numer" unless ($f =~ /^\d+$/);
508
509 my $sf;
510 if ($f >= 10) {
511 $sf = shift or die "marc needs subfield";
512 }
513
514 foreach (@_) {
515 my $v = $_; # make var read-write for Encode
516 next unless (defined($v) && $v !~ /^\s*$/);
517 my ($i1,$i2) = defined($marc_indicators->{$f}) ? @{ $marc_indicators->{$f} } : (' ',' ');
518 if (defined $sf) {
519 push @{ $marc_record->[ $marc_record_offset ] }, [ $f, $i1, $i2, $sf => $v ];
520 } else {
521 push @{ $marc_record->[ $marc_record_offset ] }, [ $f, $v ];
522 }
523 }
524 }
525
526 =head2 marc_repeatable_subfield
527
528 Save values for MARC repetable subfield
529
530 marc_repeatable_subfield('910', 'z', rec('909') );
531
532 =cut
533
534 sub marc_repeatable_subfield {
535 my ($f,$sf) = @_;
536 die "marc_repeatable_subfield need field and subfield!\n" unless ($f && $sf);
537 $marc_repeatable_subfield->{ $f . $sf }++;
538 marc(@_);
539 }
540
541 =head2 marc_indicators
542
543 Set both indicators for MARC field
544
545 marc_indicators('900', ' ', 1);
546
547 Any indicator value other than C<0-9> will be treated as undefined.
548
549 =cut
550
551 sub marc_indicators {
552 my $f = shift || die "marc_indicators need field!\n";
553 my ($i1,$i2) = @_;
554 die "marc_indicators($f, ...) need i1!\n" unless(defined($i1));
555 die "marc_indicators($f, $i1, ...) need i2!\n" unless(defined($i2));
556
557 $i1 = ' ' if ($i1 !~ /^\d$/);
558 $i2 = ' ' if ($i2 !~ /^\d$/);
559 @{ $marc_indicators->{$f} } = ($i1,$i2);
560 }
561
562 =head2 marc_compose
563
564 Save values for each MARC subfield explicitly
565
566 marc_compose('900',
567 'a', rec('200','a')
568 'b', rec('201','a')
569 'a', rec('200','b')
570 'c', rec('200','c')
571 );
572
573 If you specify C<+> for subfield, value will be appended
574 to previous defined subfield.
575
576 =cut
577
578 sub marc_compose {
579 my $f = shift or die "marc_compose needs field";
580 die "marc_compose field must be numer" unless ($f =~ /^\d+$/);
581
582 my ($i1,$i2) = defined($marc_indicators->{$f}) ? @{ $marc_indicators->{$f} } : (' ',' ');
583 my $m = [ $f, $i1, $i2 ];
584
585 warn "### marc_compose input subfields = ", dump(@_),$/ if ($debug > 2);
586
587 if ($#_ % 2 != 1) {
588 die "ERROR: marc_compose",dump($f,@_)," not valid (must be even).\nDo you need to add first() or join() around some argument?\n";
589 }
590
591 while (@_) {
592 my $sf = shift;
593 my $v = shift;
594
595 next unless (defined($v) && $v !~ /^\s*$/);
596 warn "## ++ marc_compose($f,$sf,$v) ", dump( $m ),$/ if ($debug > 1);
597 if ($sf ne '+') {
598 push @$m, ( $sf, $v );
599 } else {
600 $m->[ $#$m ] .= $v;
601 }
602 }
603
604 warn "## marc_compose current marc = ", dump( $m ),$/ if ($debug > 1);
605
606 push @{ $marc_record->[ $marc_record_offset ] }, $m if ($#{$m} > 2);
607 }
608
609 =head2 marc_duplicate
610
611 Generate copy of current MARC record and continue working on copy
612
613 marc_duplicate();
614
615 Copies can be accessed using C<< _get_marc_fields( fetch_next => 1 ) >> or
616 C<< _get_marc_fields( offset => 42 ) >>.
617
618 =cut
619
620 sub marc_duplicate {
621 my $m = $marc_record->[ -1 ];
622 die "can't duplicate record which isn't defined" unless ($m);
623 push @{ $marc_record }, dclone( $m );
624 warn "## marc_duplicate = ", dump(@$marc_record), $/ if ($debug > 1);
625 $marc_record_offset = $#{ $marc_record };
626 warn "## marc_record_offset = $marc_record_offset", $/ if ($debug > 1);
627 }
628
629 =head2 marc_remove
630
631 Remove some field or subfield from MARC record.
632
633 marc_remove('200');
634 marc_remove('200','a');
635
636 This will erase field C<200> or C<200^a> from current MARC record.
637
638 This is useful after calling C<marc_duplicate> or on it's own (but, you
639 should probably just remove that subfield definition if you are not
640 using C<marc_duplicate>).
641
642 FIXME: support fields < 10.
643
644 =cut
645
646 sub marc_remove {
647 my ($f, $sf) = @_;
648
649 die "marc_remove needs record number" unless defined($f);
650
651 my $marc = $marc_record->[ $marc_record_offset ];
652
653 warn "### marc_remove before = ", dump( $marc ), $/ if ($debug > 2);
654
655 my $i = 0;
656 foreach ( 0 .. $#{ $marc } ) {
657 last unless (defined $marc->[$i]);
658 warn "#### working on ",dump( @{ $marc->[$i] }), $/ if ($debug > 3);
659 if ($marc->[$i]->[0] eq $f) {
660 if (! defined $sf) {
661 # remove whole field
662 splice @$marc, $i, 1;
663 warn "#### slice \@\$marc, $i, 1 = ",dump( @{ $marc }), $/ if ($debug > 3);
664 $i--;
665 } else {
666 foreach my $j ( 0 .. (( $#{ $marc->[$i] } - 3 ) / 2) ) {
667 my $o = ($j * 2) + 3;
668 if ($marc->[$i]->[$o] eq $sf) {
669 # remove subfield
670 splice @{$marc->[$i]}, $o, 2;
671 warn "#### slice \@{\$marc->[$i]}, $o, 2 = ", dump( @{ $marc }), $/ if ($debug > 3);
672 # is record now empty?
673 if ($#{ $marc->[$i] } == 2) {
674 splice @$marc, $i, 1;
675 warn "#### slice \@\$marc, $i, 1 = ", dump( @{ $marc }), $/ if ($debug > 3);
676 $i--;
677 };
678 }
679 }
680 }
681 }
682 $i++;
683 }
684
685 warn "### marc_remove($f", $sf ? ",$sf" : "", ") after = ", dump( $marc ), $/ if ($debug > 2);
686
687 $marc_record->[ $marc_record_offset ] = $marc;
688
689 warn "## full marc_record = ", dump( @{ $marc_record }), $/ if ($debug > 1);
690 }
691
692 =head2 marc_original_order
693
694 Copy all subfields preserving original order to marc field.
695
696 marc_original_order( marc_field_number, original_input_field_number );
697
698 Please note that field numbers are consistent with other commands (marc
699 field number first), but somewhat counter-intuitive (destination and then
700 source).
701
702 You might want to use this command if you are just renaming subfields or
703 using pre-processing modify_record in C<config.yml> and don't need any
704 post-processing or want to preserve order of original subfields.
705
706
707 =cut
708
709 sub marc_original_order {
710
711 my ($to, $from) = @_;
712 die "marc_original_order needs from and to fields\n" unless ($from && $to);
713
714 return unless defined($rec->{$from});
715
716 my $r = $rec->{$from};
717 die "record field $from isn't array\n" unless (ref($r) eq 'ARRAY');
718
719 my ($i1,$i2) = defined($marc_indicators->{$to}) ? @{ $marc_indicators->{$to} } : (' ',' ');
720 warn "## marc_original_order($to,$from) source = ", dump( $r ),$/ if ($debug > 1);
721
722 foreach my $d (@$r) {
723
724 if (! defined($d->{subfields}) && ref($d->{subfields}) ne 'ARRAY') {
725 warn "# marc_original_order($to,$from): field $from doesn't have subfields specification\n";
726 next;
727 }
728
729 my @sfs = @{ $d->{subfields} };
730
731 die "field $from doesn't have even number of subfields specifications\n" unless($#sfs % 2 == 1);
732
733 warn "#--> d: ",dump($d), "\n#--> sfs: ",dump(@sfs),$/ if ($debug > 2);
734
735 my $m = [ $to, $i1, $i2 ];
736
737 while (my $sf = shift @sfs) {
738
739 warn "#--> sf: ",dump($sf), $/ if ($debug > 2);
740 my $offset = shift @sfs;
741 die "corrupted sufields specification for field $from\n" unless defined($offset);
742
743 my $v;
744 if (ref($d->{$sf}) eq 'ARRAY') {
745 $v = $d->{$sf}->[$offset] if (defined($d->{$sf}->[$offset]));
746 } elsif ($offset == 0) {
747 $v = $d->{$sf};
748 } else {
749 die "field $from subfield '$sf' need occurence $offset which doesn't exist", dump($d->{$sf});
750 }
751 push @$m, ( $sf, $v ) if (defined($v));
752 }
753
754 if ($#{$m} > 2) {
755 push @{ $marc_record->[ $marc_record_offset ] }, $m;
756 }
757 }
758
759 warn "## marc_record = ", dump( $marc_record ),$/ if ($debug > 1);
760 }
761
762
763 =head1 Functions to extract data from input
764
765 This function should be used inside functions to create C<data_structure> described
766 above.
767
768 =head2 _pack_subfields_hash
769
770 @subfields = _pack_subfields_hash( $h );
771 $subfields = _pack_subfields_hash( $h, 1 );
772
773 Return each subfield value in array or pack them all together and return scalar
774 with subfields (denoted by C<^>) and values.
775
776 =cut
777
778 sub _pack_subfields_hash {
779
780 warn "## _pack_subfields_hash( ",dump(@_), " )\n" if ($debug > 1);
781
782 my ($h,$include_subfields) = @_;
783
784 if ( defined($h->{subfields}) ) {
785 my $sfs = delete $h->{subfields} || die "no subfields?";
786 my @out;
787 while (@$sfs) {
788 my $sf = shift @$sfs;
789 push @out, '^' . $sf if ($include_subfields);
790 my $o = shift @$sfs;
791 if ($o == 0 && ref( $h->{$sf} ) ne 'ARRAY' ) {
792 # single element subfields are not arrays
793 #warn "====> $sf $o / $#$sfs ", dump( $sfs, $h->{$sf} ), "\n";
794
795 push @out, $h->{$sf};
796 } else {
797 #warn "====> $sf $o / $#$sfs ", dump( $sfs, $h->{$sf} ), "\n";
798 push @out, $h->{$sf}->[$o];
799 }
800 }
801 if ($include_subfields) {
802 return join('', @out);
803 } else {
804 return @out;
805 }
806 } else {
807 if ($include_subfields) {
808 my $out = '';
809 foreach my $sf (sort keys %$h) {
810 if (ref($h->{$sf}) eq 'ARRAY') {
811 $out .= '^' . $sf . join('^' . $sf, @{ $h->{$sf} });
812 } else {
813 $out .= '^' . $sf . $h->{$sf};
814 }
815 }
816 return $out;
817 } else {
818 # FIXME this should probably be in alphabetical order instead of hash order
819 values %{$h};
820 }
821 }
822 }
823
824 =head2 rec1
825
826 Return all values in some field
827
828 @v = rec1('200')
829
830 TODO: order of values is probably same as in source data, need to investigate that
831
832 =cut
833
834 sub rec1 {
835 my $f = shift;
836 warn "rec1($f) = ", dump( $rec->{$f} ), $/ if ($debug > 1);
837 return unless (defined($rec) && defined($rec->{$f}));
838 warn "rec1($f) = ", dump( $rec->{$f} ), $/ if ($debug > 1);
839 if (ref($rec->{$f}) eq 'ARRAY') {
840 my @out;
841 foreach my $h ( @{ $rec->{$f} } ) {
842 if (ref($h) eq 'HASH') {
843 push @out, ( _pack_subfields_hash( $h ) );
844 } else {
845 push @out, $h;
846 }
847 }
848 return @out;
849 } elsif( defined($rec->{$f}) ) {
850 return $rec->{$f};
851 }
852 }
853
854 =head2 rec2
855
856 Return all values in specific field and subfield
857
858 @v = rec2('200','a')
859
860 =cut
861
862 sub rec2 {
863 my $f = shift;
864 return unless (defined($rec && $rec->{$f}));
865 my $sf = shift;
866 warn "rec2($f,$sf) = ", dump( $rec->{$f} ), $/ if ($debug > 1);
867 return map {
868 if (ref($_->{$sf}) eq 'ARRAY') {
869 @{ $_->{$sf} };
870 } else {
871 $_->{$sf};
872 }
873 } grep { ref($_) eq 'HASH' && $_->{$sf} } @{ $rec->{$f} };
874 }
875
876 =head2 rec
877
878 syntaxtic sugar for
879
880 @v = rec('200')
881 @v = rec('200','a')
882
883 =cut
884
885 sub rec {
886 my @out;
887 if ($#_ == 0) {
888 @out = rec1(@_);
889 } elsif ($#_ == 1) {
890 @out = rec2(@_);
891 }
892 if (@out) {
893 return @out;
894 } else {
895 return '';
896 }
897 }
898
899 =head2 regex
900
901 Apply regex to some or all values
902
903 @v = regex( 's/foo/bar/g', @v );
904
905 =cut
906
907 sub regex {
908 my $r = shift;
909 my @out;
910 #warn "r: $r\n", dump(\@_);
911 foreach my $t (@_) {
912 next unless ($t);
913 eval "\$t =~ $r";
914 push @out, $t if ($t && $t ne '');
915 }
916 return @out;
917 }
918
919 =head2 prefix
920
921 Prefix all values with a string
922
923 @v = prefix( 'my_', @v );
924
925 =cut
926
927 sub prefix {
928 my $p = shift or return;
929 return map { $p . $_ } grep { defined($_) } @_;
930 }
931
932 =head2 suffix
933
934 suffix all values with a string
935
936 @v = suffix( '_my', @v );
937
938 =cut
939
940 sub suffix {
941 my $s = shift or die "suffix needs string as first argument";
942 return map { $_ . $s } grep { defined($_) } @_;
943 }
944
945 =head2 surround
946
947 surround all values with a two strings
948
949 @v = surround( 'prefix_', '_suffix', @v );
950
951 =cut
952
953 sub surround {
954 my $p = shift or die "surround need prefix as first argument";
955 my $s = shift or die "surround needs suffix as second argument";
956 return map { $p . $_ . $s } grep { defined($_) } @_;
957 }
958
959 =head2 first
960
961 Return first element
962
963 $v = first( @v );
964
965 =cut
966
967 sub first {
968 my $r = shift;
969 return $r;
970 }
971
972 =head2 lookup
973
974 Consult lookup hashes for some value
975
976 @v = lookup(
977 sub {
978 'ffkk/peri/mfn'.rec('000')
979 },
980 'ffkk','peri','200-a-200-e',
981 sub {
982 first(rec(200,'a')).' '.first(rec('200','e'))
983 }
984 );
985
986 Code like above will be B<automatically generated> using L<WebPAC::Parse> from
987 normal lookup definition in C<conf/lookup/something.pl> which looks like:
988
989 lookup(
990 # which results to return from record recorded in lookup
991 sub { 'ffkk/peri/mfn' . rec('000') },
992 # from which database and input
993 'ffkk','peri',
994 # such that following values match
995 sub { first(rec(200,'a')) . ' ' . first(rec('200','e')) },
996 # if this part is missing, we will try to match same fields
997 # from lookup record and current one, or you can override
998 # which records to use from current record using
999 sub { rec('900','x') . ' ' . rec('900','y') },
1000 )
1001
1002 You can think about this lookup as SQL (if that helps):
1003
1004 select
1005 sub { what }
1006 from
1007 database, input
1008 where
1009 sub { filter from lookuped record }
1010 having
1011 sub { optional filter on current record }
1012
1013 Easy as pie, right?
1014
1015 =cut
1016
1017 sub lookup {
1018 my ($what, $database, $input, $key, $having) = @_;
1019
1020 confess "lookup needs 5 arguments: what, database, input, key, having" unless ($#_ == 4);
1021
1022 warn "## lookup ($database, $input, $key)", $/ if ($debug > 1);
1023 return unless (defined($lookup->{$database}->{$input}->{$key}));
1024
1025 confess "lookup really need load_ds_coderef added to data_structure\n" unless ($load_ds_coderef);
1026
1027 my $mfns;
1028 my @having = $having->();
1029
1030 warn "## having = ", dump( @having ) if ($debug > 2);
1031
1032 foreach my $h ( @having ) {
1033 if (defined($lookup->{$database}->{$input}->{$key}->{$h})) {
1034 warn "lookup for $database/$input/$key/$h return ",dump($lookup->{$database}->{$input}->{$key}->{$h}),"\n";
1035 $mfns->{$_}++ foreach keys %{ $lookup->{$database}->{$input}->{$key}->{$h} };
1036 }
1037 }
1038
1039 return unless ($mfns);
1040
1041 my @mfns = sort keys %$mfns;
1042
1043 warn "# lookup loading $database/$input/$key mfn ", join(",",@mfns)," having ",dump(@having),"\n";
1044
1045 my $old_rec = $rec;
1046 my @out;
1047
1048 foreach my $mfn (@mfns) {
1049 $rec = $load_ds_coderef->( $database, $input, $mfn );
1050
1051 warn "got $database/$input/$mfn = ", dump($rec), $/;
1052
1053 my @vals = $what->();
1054
1055 push @out, ( @vals );
1056
1057 warn "lookup for mfn $mfn returned ", dump(@vals), $/;
1058 }
1059
1060 # if (ref($lookup->{$k}) eq 'ARRAY') {
1061 # return @{ $lookup->{$k} };
1062 # } else {
1063 # return $lookup->{$k};
1064 # }
1065
1066 $rec = $old_rec;
1067
1068 warn "## lookup returns = ", dump(@out), $/;
1069
1070 return @out;
1071 }
1072
1073 =head2 save_into_lookup
1074
1075 Save value into lookup. It associates current database, input
1076 and specific keys with one or more values which will be
1077 associated over MFN.
1078
1079 MFN will be extracted from first occurence current of field 000
1080 in current record, or if it doesn't exist from L<_set_config> C<_mfn>.
1081
1082 my $nr = save_into_lookup($database,$input,$key,sub {
1083 # code which produce one or more values
1084 });
1085
1086 It returns number of items saved.
1087
1088 This function shouldn't be called directly, it's called from code created by
1089 L<WebPAC::Parser>.
1090
1091 =cut
1092
1093 sub save_into_lookup {
1094 my ($database,$input,$key,$coderef) = @_;
1095 die "save_into_lookup needs database" unless defined($database);
1096 die "save_into_lookup needs input" unless defined($input);
1097 die "save_into_lookup needs key" unless defined($key);
1098 die "save_into_lookup needs CODE" unless ( defined($coderef) && ref($coderef) eq 'CODE' );
1099
1100 warn "## save_into_lookup rec = ", dump($rec), " config = ", dump($config), $/ if ($debug > 2);
1101
1102 my $mfn =
1103 defined($rec->{'000'}->[0]) ? $rec->{'000'}->[0] :
1104 defined($config->{_mfn}) ? $config->{_mfn} :
1105 die "mfn not defined or zero";
1106
1107 my $nr = 0;
1108
1109 foreach my $v ( $coderef->() ) {
1110 $lookup->{$database}->{$input}->{$key}->{$v}->{$mfn}++;
1111 warn "# saved lookup $database/$input/$key [$v] $mfn\n" if ($debug > 1);
1112 $nr++;
1113 }
1114
1115 return $nr;
1116 }
1117
1118 =head2 config
1119
1120 Consult config values stored in C<config.yml>
1121
1122 # return database code (key under databases in yaml)
1123 $database_code = config(); # use _ from hash
1124 $database_name = config('name');
1125 $database_input_name = config('input name');
1126 $tag = config('input normalize tag');
1127
1128 Up to three levels are supported.
1129
1130 =cut
1131
1132 sub config {
1133 return unless ($config);
1134
1135 my $p = shift;
1136
1137 $p ||= '';
1138
1139 my $v;
1140
1141 warn "### getting config($p)\n" if ($debug > 1);
1142
1143 my @p = split(/\s+/,$p);
1144 if ($#p < 0) {
1145 $v = $config->{ '_' }; # special, database code
1146 } else {
1147
1148 my $c = dclone( $config );
1149
1150 foreach my $k (@p) {
1151 warn "### k: $k c = ",dump($c),$/ if ($debug > 1);
1152 if (ref($c) eq 'ARRAY') {
1153 $c = shift @$c;
1154 warn "config($p) taking first occurence of '$k', probably not what you wanted!\n";
1155 last;
1156 }
1157
1158 if (! defined($c->{$k}) ) {
1159 $c = undef;
1160 last;
1161 } else {
1162 $c = $c->{$k};
1163 }
1164 }
1165 $v = $c if ($c);
1166
1167 }
1168
1169 warn "## config( '$p' ) = ",dump( $v ),$/ if ($v && $debug);
1170 warn "config( '$p' ) is empty\n" if (! $v);
1171
1172 return $v;
1173 }
1174
1175 =head2 id
1176
1177 Returns unique id of this record
1178
1179 $id = id();
1180
1181 Returns C<42/2> for 2nd occurence of MFN 42.
1182
1183 =cut
1184
1185 sub id {
1186 my $mfn = $config->{_mfn} || die "no _mfn in config data";
1187 return $mfn . $#{$marc_record} ? $#{$marc_record} + 1 : '';
1188 }
1189
1190 =head2 join_with
1191
1192 Joins walues with some delimiter
1193
1194 $v = join_with(", ", @v);
1195
1196 =cut
1197
1198 sub join_with {
1199 my $d = shift;
1200 warn "### join_with('$d',",dump(@_),")\n" if ($debug > 2);
1201 my $v = join($d, grep { defined($_) && $_ ne '' } @_);
1202 return '' unless defined($v);
1203 return $v;
1204 }
1205
1206 =head2 split_rec_on
1207
1208 Split record subfield on some regex and take one of parts out
1209
1210 $a_before_semi_column =
1211 split_rec_on('200','a', /\s*;\s*/, $part);
1212
1213 C<$part> is optional number of element. First element is
1214 B<1>, not 0!
1215
1216 If there is no C<$part> parameter or C<$part> is 0, this function will
1217 return all values produced by splitting.
1218
1219 =cut
1220
1221 sub split_rec_on {
1222 die "split_rec_on need (fld,sf,regex[,part]" if ($#_ < 2);
1223
1224 my ($fld, $sf, $regex, $part) = @_;
1225 warn "### regex ", ref($regex), $regex, $/ if ($debug > 2);
1226
1227 my @r = rec( $fld, $sf );
1228 my $v = shift @r;
1229 warn "### first rec($fld,$sf) = ",dump($v),$/ if ($debug > 2);
1230
1231 return '' if ( ! defined($v) || $v =~ /^\s*$/);
1232
1233 my @s = split( $regex, $v );
1234 warn "## split_rec_on($fld,$sf,$regex,$part) = ",dump(@s),$/ if ($debug > 1);
1235 if ($part && $part > 0) {
1236 return $s[ $part - 1 ];
1237 } else {
1238 return @s;
1239 }
1240 }
1241
1242 # END
1243 1;

  ViewVC Help
Powered by ViewVC 1.1.26