/[webpac2]/trunk/lib/WebPAC/Normalize.pm
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Contents of /trunk/lib/WebPAC/Normalize.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 711 - (show annotations)
Tue Sep 26 10:05:25 2006 UTC (17 years, 6 months ago) by dpavlin
File size: 23830 byte(s)
 r1016@llin:  dpavlin | 2006-09-26 12:03:11 +0200
 make it less chatty

1 package WebPAC::Normalize;
2 use Exporter 'import';
3 @EXPORT = qw/
4 _set_rec _set_lookup
5 _get_ds _clean_ds
6 _debug
7 _pack_subfields_hash
8
9 tag search display
10 marc marc_indicators marc_repeatable_subfield
11 marc_compose marc_leader
12 marc_duplicate marc_remove
13 marc_original_order
14
15 rec1 rec2 rec
16 regex prefix suffix surround
17 first lookup join_with
18 save_into_lookup
19
20 split_rec_on
21 /;
22
23 use warnings;
24 use strict;
25
26 #use base qw/WebPAC::Common/;
27 use Data::Dump qw/dump/;
28 use Storable qw/dclone/;
29
30 # debugging warn(s)
31 my $debug = 0;
32
33
34 =head1 NAME
35
36 WebPAC::Normalize - describe normalisaton rules using sets
37
38 =head1 VERSION
39
40 Version 0.20
41
42 =cut
43
44 our $VERSION = '0.20';
45
46 =head1 SYNOPSIS
47
48 This module uses C<conf/normalize/*.pl> files to perform normalisation
49 from input records using perl functions which are specialized for set
50 processing.
51
52 Sets are implemented as arrays, and normalisation file is valid perl, which
53 means that you check it's validity before running WebPAC using
54 C<perl -c normalize.pl>.
55
56 Normalisation can generate multiple output normalized data. For now, supported output
57 types (on the left side of definition) are: C<tag>, C<display>, C<search> and
58 C<marc>.
59
60 =head1 FUNCTIONS
61
62 Functions which start with C<_> are private and used by WebPAC internally.
63 All other functions are available for use within normalisation rules.
64
65 =head2 data_structure
66
67 Return data structure
68
69 my $ds = WebPAC::Normalize::data_structure(
70 lookup => $lookup_variable,
71 row => $row,
72 rules => $normalize_pl_config,
73 marc_encoding => 'utf-8',
74 config => $config,
75 );
76
77 Options C<row>, C<rules> and C<log> are mandatory while all
78 other are optional.
79
80 This function will B<die> if normalizastion can't be evaled.
81
82 Since this function isn't exported you have to call it with
83 C<WebPAC::Normalize::data_structure>.
84
85 =cut
86
87 sub data_structure {
88 my $arg = {@_};
89
90 die "need row argument" unless ($arg->{row});
91 die "need normalisation argument" unless ($arg->{rules});
92
93 no strict 'subs';
94 _set_lookup( $arg->{lookup} ) if (defined( $arg->{lookup} ));
95 _set_rec( $arg->{row} );
96 _set_config( $arg->{config} ) if (defined( $arg->{config} ));
97 _clean_ds( %{ $arg } );
98 eval "$arg->{rules}";
99 die "error evaling $arg->{rules}: $@\n" if ($@);
100
101 return _get_ds();
102 }
103
104 =head2 _set_rec
105
106 Set current record hash
107
108 _set_rec( $rec );
109
110 =cut
111
112 my $rec;
113
114 sub _set_rec {
115 $rec = shift or die "no record hash";
116 }
117
118 =head2 _set_config
119
120 Set current config hash
121
122 _set_config( $config );
123
124 Magic keys are:
125
126 =over 4
127
128 =item _
129
130 Code of current database
131
132 =item _mfn
133
134 Current MFN
135
136 =back
137
138 =cut
139
140 my $config;
141
142 sub _set_config {
143 $config = shift;
144 }
145
146 =head2 _get_ds
147
148 Return hash formatted as data structure
149
150 my $ds = _get_ds();
151
152 =cut
153
154 my ($out, $marc_record, $marc_encoding, $marc_repeatable_subfield, $marc_indicators);
155 my ($marc_record_offset, $marc_fetch_offset) = (0, 0);
156
157 sub _get_ds {
158 return $out;
159 }
160
161 =head2 _clean_ds
162
163 Clean data structure hash for next record
164
165 _clean_ds();
166
167 =cut
168
169 sub _clean_ds {
170 my $a = {@_};
171 ($out,$marc_record, $marc_encoding, $marc_repeatable_subfield, $marc_indicators) = ();
172 ($marc_record_offset, $marc_fetch_offset) = (0,0);
173 $marc_encoding = $a->{marc_encoding};
174 }
175
176 =head2 _set_lookup
177
178 Set current lookup hash
179
180 _set_lookup( $lookup );
181
182 =cut
183
184 my $lookup;
185
186 sub _set_lookup {
187 $lookup = shift;
188 }
189
190 =head2 _get_lookup
191
192 Get current lookup hash
193
194 my $lookup = _get_lookup();
195
196 =cut
197
198 sub _get_lookup {
199 return $lookup;
200 }
201
202 =head2 _get_marc_fields
203
204 Get all fields defined by calls to C<marc>
205
206 $marc->add_fields( WebPAC::Normalize:_get_marc_fields() );
207
208 We are using I<magic> which detect repeatable fields only from
209 sequence of field/subfield data generated by normalization.
210
211 Repeatable field is created when there is second occurence of same subfield or
212 if any of indicators are different.
213
214 This is sane for most cases. Something like:
215
216 900a-1 900b-1 900c-1
217 900a-2 900b-2
218 900a-3
219
220 will be created from any combination of:
221
222 900a-1 900a-2 900a-3 900b-1 900b-2 900c-1
223
224 and following rules:
225
226 marc('900','a', rec('200','a') );
227 marc('900','b', rec('200','b') );
228 marc('900','c', rec('200','c') );
229
230 which might not be what you have in mind. If you need repeatable subfield,
231 define it using C<marc_repeatable_subfield> like this:
232
233 marc_repeatable_subfield('900','a');
234 marc('900','a', rec('200','a') );
235 marc('900','b', rec('200','b') );
236 marc('900','c', rec('200','c') );
237
238 will create:
239
240 900a-1 900a-2 900a-3 900b-1 900c-1
241 900b-2
242
243 There is also support for returning next or specific using:
244
245 while (my $mf = WebPAC::Normalize:_get_marc_fields( fetch_next => 1 ) ) {
246 # do something with $mf
247 }
248
249 will always return fields from next MARC record or
250
251 my $mf = WebPAC::Normalize::_get_marc_fields( offset => 42 );
252
253 will return 42th copy record (if it exists).
254
255 =cut
256
257 sub _get_marc_fields {
258
259 my $arg = {@_};
260 warn "### _get_marc_fields arg: ", dump($arg), $/ if ($debug > 2);
261 my $offset = $marc_fetch_offset;
262 if ($arg->{offset}) {
263 $offset = $arg->{offset};
264 } elsif($arg->{fetch_next}) {
265 $marc_fetch_offset++;
266 }
267
268 return if (! $marc_record || ref($marc_record) ne 'ARRAY');
269
270 warn "### full marc_record = ", dump( @{ $marc_record }), $/ if ($debug > 2);
271
272 my $marc_rec = $marc_record->[ $offset ];
273
274 warn "## _get_marc_fields (at offset: $offset) -- marc_record = ", dump( @$marc_rec ), $/ if ($debug > 1);
275
276 return if (! $marc_rec || ref($marc_rec) ne 'ARRAY' || $#{ $marc_rec } < 0);
277
278 # first, sort all existing fields
279 # XXX might not be needed, but modern perl might randomize elements in hash
280 my @sorted_marc_record = sort {
281 $a->[0] . ( $a->[3] || '' ) cmp $b->[0] . ( $b->[3] || '')
282 } @{ $marc_rec };
283
284 @sorted_marc_record = @{ $marc_rec }; ### FIXME disable sorting
285
286 # output marc fields
287 my @m;
288
289 # count unique field-subfields (used for offset when walking to next subfield)
290 my $u;
291 map { $u->{ $_->[0] . ( $_->[3] || '') }++ } @sorted_marc_record;
292
293 if ($debug) {
294 warn "## marc_repeatable_subfield = ", dump( $marc_repeatable_subfield ), $/ if ( $marc_repeatable_subfield );
295 warn "## marc_record[$offset] = ", dump( $marc_rec ), $/;
296 warn "## sorted_marc_record = ", dump( \@sorted_marc_record ), $/;
297 warn "## subfield count = ", dump( $u ), $/;
298 }
299
300 my $len = $#sorted_marc_record;
301 my $visited;
302 my $i = 0;
303 my $field;
304
305 foreach ( 0 .. $len ) {
306
307 # find next element which isn't visited
308 while ($visited->{$i}) {
309 $i = ($i + 1) % ($len + 1);
310 }
311
312 # mark it visited
313 $visited->{$i}++;
314
315 my $row = dclone( $sorted_marc_record[$i] );
316
317 # field and subfield which is key for
318 # marc_repeatable_subfield and u
319 my $fsf = $row->[0] . ( $row->[3] || '' );
320
321 if ($debug > 1) {
322
323 print "### field so far [", $#$field, "] : ", dump( $field ), " ", $field ? 'T' : 'F', $/;
324 print "### this [$i]: ", dump( $row ),$/;
325 print "### sf: ", $row->[3], " vs ", $field->[3],
326 $marc_repeatable_subfield->{ $row->[0] . $row->[3] } ? ' (repeatable)' : '', $/,
327 if ($#$field >= 0);
328
329 }
330
331 # if field exists
332 if ( $#$field >= 0 ) {
333 if (
334 $row->[0] ne $field->[0] || # field
335 $row->[1] ne $field->[1] || # i1
336 $row->[2] ne $field->[2] # i2
337 ) {
338 push @m, $field;
339 warn "## saved/1 ", dump( $field ),$/ if ($debug);
340 $field = $row;
341
342 } elsif (
343 ( $row->[3] lt $field->[-2] ) # subfield which is not next (e.g. a after c)
344 ||
345 ( $row->[3] eq $field->[-2] && # same subfield, but not repeatable
346 ! $marc_repeatable_subfield->{ $fsf }
347 )
348 ) {
349 push @m, $field;
350 warn "## saved/2 ", dump( $field ),$/ if ($debug);
351 $field = $row;
352
353 } else {
354 # append new subfields to existing field
355 push @$field, ( $row->[3], $row->[4] );
356 }
357 } else {
358 # insert first field
359 $field = $row;
360 }
361
362 if (! $marc_repeatable_subfield->{ $fsf }) {
363 # make step to next subfield
364 $i = ($i + $u->{ $fsf } ) % ($len + 1);
365 }
366 }
367
368 if ($#$field >= 0) {
369 push @m, $field;
370 warn "## saved/3 ", dump( $field ),$/ if ($debug);
371 }
372
373 return \@m;
374 }
375
376 =head2 _debug
377
378 Change level of debug warnings
379
380 _debug( 2 );
381
382 =cut
383
384 sub _debug {
385 my $l = shift;
386 return $debug unless defined($l);
387 warn "debug level $l",$/ if ($l > 0);
388 $debug = $l;
389 }
390
391 =head1 Functions to create C<data_structure>
392
393 Those functions generally have to first in your normalization file.
394
395 =head2 tag
396
397 Define new tag for I<search> and I<display>.
398
399 tag('Title', rec('200','a') );
400
401
402 =cut
403
404 sub tag {
405 my $name = shift or die "tag needs name as first argument";
406 my @o = grep { defined($_) && $_ ne '' } @_;
407 return unless (@o);
408 $out->{$name}->{tag} = $name;
409 $out->{$name}->{search} = \@o;
410 $out->{$name}->{display} = \@o;
411 }
412
413 =head2 display
414
415 Define tag just for I<display>
416
417 @v = display('Title', rec('200','a') );
418
419 =cut
420
421 sub display {
422 my $name = shift or die "display needs name as first argument";
423 my @o = grep { defined($_) && $_ ne '' } @_;
424 return unless (@o);
425 $out->{$name}->{tag} = $name;
426 $out->{$name}->{display} = \@o;
427 }
428
429 =head2 search
430
431 Prepare values just for I<search>
432
433 @v = search('Title', rec('200','a') );
434
435 =cut
436
437 sub search {
438 my $name = shift or die "search needs name as first argument";
439 my @o = grep { defined($_) && $_ ne '' } @_;
440 return unless (@o);
441 $out->{$name}->{tag} = $name;
442 $out->{$name}->{search} = \@o;
443 }
444
445 =head2 marc_leader
446
447 Setup fields within MARC leader or get leader
448
449 marc_leader('05','c');
450 my $leader = marc_leader();
451
452 =cut
453
454 sub marc_leader {
455 my ($offset,$value) = @_;
456
457 if ($offset) {
458 $out->{' leader'}->{ $offset } = $value;
459 } else {
460 return $out->{' leader'};
461 }
462 }
463
464 =head2 marc
465
466 Save value for MARC field
467
468 marc('900','a', rec('200','a') );
469 marc('001', rec('000') );
470
471 =cut
472
473 sub marc {
474 my $f = shift or die "marc needs field";
475 die "marc field must be numer" unless ($f =~ /^\d+$/);
476
477 my $sf;
478 if ($f >= 10) {
479 $sf = shift or die "marc needs subfield";
480 }
481
482 foreach (@_) {
483 my $v = $_; # make var read-write for Encode
484 next unless (defined($v) && $v !~ /^\s*$/);
485 my ($i1,$i2) = defined($marc_indicators->{$f}) ? @{ $marc_indicators->{$f} } : (' ',' ');
486 if (defined $sf) {
487 push @{ $marc_record->[ $marc_record_offset ] }, [ $f, $i1, $i2, $sf => $v ];
488 } else {
489 push @{ $marc_record->[ $marc_record_offset ] }, [ $f, $v ];
490 }
491 }
492 }
493
494 =head2 marc_repeatable_subfield
495
496 Save values for MARC repetable subfield
497
498 marc_repeatable_subfield('910', 'z', rec('909') );
499
500 =cut
501
502 sub marc_repeatable_subfield {
503 my ($f,$sf) = @_;
504 die "marc_repeatable_subfield need field and subfield!\n" unless ($f && $sf);
505 $marc_repeatable_subfield->{ $f . $sf }++;
506 marc(@_);
507 }
508
509 =head2 marc_indicators
510
511 Set both indicators for MARC field
512
513 marc_indicators('900', ' ', 1);
514
515 Any indicator value other than C<0-9> will be treated as undefined.
516
517 =cut
518
519 sub marc_indicators {
520 my $f = shift || die "marc_indicators need field!\n";
521 my ($i1,$i2) = @_;
522 die "marc_indicators($f, ...) need i1!\n" unless(defined($i1));
523 die "marc_indicators($f, $i1, ...) need i2!\n" unless(defined($i2));
524
525 $i1 = ' ' if ($i1 !~ /^\d$/);
526 $i2 = ' ' if ($i2 !~ /^\d$/);
527 @{ $marc_indicators->{$f} } = ($i1,$i2);
528 }
529
530 =head2 marc_compose
531
532 Save values for each MARC subfield explicitly
533
534 marc_compose('900',
535 'a', rec('200','a')
536 'b', rec('201','a')
537 'a', rec('200','b')
538 'c', rec('200','c')
539 );
540
541 If you specify C<+> for subfield, value will be appended
542 to previous defined subfield.
543
544 =cut
545
546 sub marc_compose {
547 my $f = shift or die "marc_compose needs field";
548 die "marc_compose field must be numer" unless ($f =~ /^\d+$/);
549
550 my ($i1,$i2) = defined($marc_indicators->{$f}) ? @{ $marc_indicators->{$f} } : (' ',' ');
551 my $m = [ $f, $i1, $i2 ];
552
553 warn "### marc_compose input subfields = ", dump(@_),$/ if ($debug > 2);
554
555 if ($#_ % 2 != 1) {
556 die "ERROR: marc_compose",dump($f,@_)," not valid (must be even).\nDo you need to add first() or join() around some argument?\n";
557 }
558
559 while (@_) {
560 my $sf = shift;
561 my $v = shift;
562
563 next unless (defined($v) && $v !~ /^\s*$/);
564 warn "## ++ marc_compose($f,$sf,$v) ", dump( $m ),$/ if ($debug > 1);
565 if ($sf ne '+') {
566 push @$m, ( $sf, $v );
567 } else {
568 $m->[ $#$m ] .= $v;
569 }
570 }
571
572 warn "## marc_compose current marc = ", dump( $m ),$/ if ($debug > 1);
573
574 push @{ $marc_record->[ $marc_record_offset ] }, $m if ($#{$m} > 2);
575 }
576
577 =head2 marc_duplicate
578
579 Generate copy of current MARC record and continue working on copy
580
581 marc_duplicate();
582
583 Copies can be accessed using C<< _get_marc_fields( fetch_next => 1 ) >> or
584 C<< _get_marc_fields( offset => 42 ) >>.
585
586 =cut
587
588 sub marc_duplicate {
589 my $m = $marc_record->[ -1 ];
590 die "can't duplicate record which isn't defined" unless ($m);
591 push @{ $marc_record }, dclone( $m );
592 warn "## marc_duplicate = ", dump(@$marc_record), $/ if ($debug > 1);
593 $marc_record_offset = $#{ $marc_record };
594 warn "## marc_record_offset = $marc_record_offset", $/ if ($debug > 1);
595 }
596
597 =head2 marc_remove
598
599 Remove some field or subfield from MARC record.
600
601 marc_remove('200');
602 marc_remove('200','a');
603
604 This will erase field C<200> or C<200^a> from current MARC record.
605
606 This is useful after calling C<marc_duplicate> or on it's own (but, you
607 should probably just remove that subfield definition if you are not
608 using C<marc_duplicate>).
609
610 FIXME: support fields < 10.
611
612 =cut
613
614 sub marc_remove {
615 my ($f, $sf) = @_;
616
617 die "marc_remove needs record number" unless defined($f);
618
619 my $marc = $marc_record->[ $marc_record_offset ];
620
621 warn "### marc_remove before = ", dump( $marc ), $/ if ($debug > 2);
622
623 my $i = 0;
624 foreach ( 0 .. $#{ $marc } ) {
625 last unless (defined $marc->[$i]);
626 warn "#### working on ",dump( @{ $marc->[$i] }), $/ if ($debug > 3);
627 if ($marc->[$i]->[0] eq $f) {
628 if (! defined $sf) {
629 # remove whole field
630 splice @$marc, $i, 1;
631 warn "#### slice \@\$marc, $i, 1 = ",dump( @{ $marc }), $/ if ($debug > 3);
632 $i--;
633 } else {
634 foreach my $j ( 0 .. (( $#{ $marc->[$i] } - 3 ) / 2) ) {
635 my $o = ($j * 2) + 3;
636 if ($marc->[$i]->[$o] eq $sf) {
637 # remove subfield
638 splice @{$marc->[$i]}, $o, 2;
639 warn "#### slice \@{\$marc->[$i]}, $o, 2 = ", dump( @{ $marc }), $/ if ($debug > 3);
640 # is record now empty?
641 if ($#{ $marc->[$i] } == 2) {
642 splice @$marc, $i, 1;
643 warn "#### slice \@\$marc, $i, 1 = ", dump( @{ $marc }), $/ if ($debug > 3);
644 $i--;
645 };
646 }
647 }
648 }
649 }
650 $i++;
651 }
652
653 warn "### marc_remove($f", $sf ? ",$sf" : "", ") after = ", dump( $marc ), $/ if ($debug > 2);
654
655 $marc_record->[ $marc_record_offset ] = $marc;
656
657 warn "## full marc_record = ", dump( @{ $marc_record }), $/ if ($debug > 1);
658 }
659
660 =head2 marc_original_order
661
662 Copy all subfields preserving original order to marc field.
663
664 marc_original_order( marc_field_number, original_input_field_number );
665
666 Please note that field numbers are consistent with other commands (marc
667 field number first), but somewhat counter-intuitive (destination and then
668 source).
669
670 You might want to use this command if you are just renaming subfields or
671 using pre-processing modify_record in C<config.yml> and don't need any
672 post-processing or want to preserve order of original subfields.
673
674
675 =cut
676
677 sub marc_original_order {
678
679 my ($to, $from) = @_;
680 die "marc_original_order needs from and to fields\n" unless ($from && $to);
681
682 return unless defined($rec->{$from});
683
684 my $r = $rec->{$from};
685 die "record field $from isn't array\n" unless (ref($r) eq 'ARRAY');
686
687 my ($i1,$i2) = defined($marc_indicators->{$to}) ? @{ $marc_indicators->{$to} } : (' ',' ');
688 warn "## marc_original_order($to,$from) source = ", dump( $r ),$/ if ($debug > 1);
689
690 foreach my $d (@$r) {
691
692 if (! defined($d->{subfields}) && ref($d->{subfields}) ne 'ARRAY') {
693 warn "# marc_original_order($to,$from): field $from doesn't have subfields specification\n";
694 next;
695 }
696
697 my @sfs = @{ $d->{subfields} };
698
699 die "field $from doesn't have even number of subfields specifications\n" unless($#sfs % 2 == 1);
700
701 warn "#--> d: ",dump($d), "\n#--> sfs: ",dump(@sfs),$/ if ($debug > 2);
702
703 my $m = [ $to, $i1, $i2 ];
704
705 while (my $sf = shift @sfs) {
706
707 warn "#--> sf: ",dump($sf), $/ if ($debug > 2);
708 my $offset = shift @sfs;
709 die "corrupted sufields specification for field $from\n" unless defined($offset);
710
711 my $v;
712 if (ref($d->{$sf}) eq 'ARRAY') {
713 $v = $d->{$sf}->[$offset] if (defined($d->{$sf}->[$offset]));
714 } elsif ($offset == 0) {
715 $v = $d->{$sf};
716 } else {
717 die "field $from subfield '$sf' need occurence $offset which doesn't exist", dump($d->{$sf});
718 }
719 push @$m, ( $sf, $v ) if (defined($v));
720 }
721
722 if ($#{$m} > 2) {
723 push @{ $marc_record->[ $marc_record_offset ] }, $m;
724 }
725 }
726
727 warn "## marc_record = ", dump( $marc_record ),$/ if ($debug > 1);
728 }
729
730
731 =head1 Functions to extract data from input
732
733 This function should be used inside functions to create C<data_structure> described
734 above.
735
736 =head2 _pack_subfields_hash
737
738 @subfields = _pack_subfields_hash( $h );
739 $subfields = _pack_subfields_hash( $h, 1 );
740
741 Return each subfield value in array or pack them all together and return scalar
742 with subfields (denoted by C<^>) and values.
743
744 =cut
745
746 sub _pack_subfields_hash {
747
748 warn "## _pack_subfields_hash( ",dump(@_), " )\n" if ($debug > 1);
749
750 my ($h,$include_subfields) = @_;
751
752 if ( defined($h->{subfields}) ) {
753 my $sfs = delete $h->{subfields} || die "no subfields?";
754 my @out;
755 while (@$sfs) {
756 my $sf = shift @$sfs;
757 push @out, '^' . $sf if ($include_subfields);
758 my $o = shift @$sfs;
759 if ($o == 0 && ref( $h->{$sf} ) ne 'ARRAY' ) {
760 # single element subfields are not arrays
761 #warn "====> $sf $o / $#$sfs ", dump( $sfs, $h->{$sf} ), "\n";
762
763 push @out, $h->{$sf};
764 } else {
765 #warn "====> $sf $o / $#$sfs ", dump( $sfs, $h->{$sf} ), "\n";
766 push @out, $h->{$sf}->[$o];
767 }
768 }
769 if ($include_subfields) {
770 return join('', @out);
771 } else {
772 return @out;
773 }
774 } else {
775 if ($include_subfields) {
776 my $out = '';
777 foreach my $sf (sort keys %$h) {
778 if (ref($h->{$sf}) eq 'ARRAY') {
779 $out .= '^' . $sf . join('^' . $sf, @{ $h->{$sf} });
780 } else {
781 $out .= '^' . $sf . $h->{$sf};
782 }
783 }
784 return $out;
785 } else {
786 # FIXME this should probably be in alphabetical order instead of hash order
787 values %{$h};
788 }
789 }
790 }
791
792 =head2 rec1
793
794 Return all values in some field
795
796 @v = rec1('200')
797
798 TODO: order of values is probably same as in source data, need to investigate that
799
800 =cut
801
802 sub rec1 {
803 my $f = shift;
804 warn "rec1($f) = ", dump( $rec->{$f} ), $/ if ($debug > 1);
805 return unless (defined($rec) && defined($rec->{$f}));
806 warn "rec1($f) = ", dump( $rec->{$f} ), $/ if ($debug > 1);
807 if (ref($rec->{$f}) eq 'ARRAY') {
808 my @out;
809 foreach my $h ( @{ $rec->{$f} } ) {
810 if (ref($h) eq 'HASH') {
811 push @out, ( _pack_subfields_hash( $h ) );
812 } else {
813 push @out, $h;
814 }
815 }
816 return @out;
817 } elsif( defined($rec->{$f}) ) {
818 return $rec->{$f};
819 }
820 }
821
822 =head2 rec2
823
824 Return all values in specific field and subfield
825
826 @v = rec2('200','a')
827
828 =cut
829
830 sub rec2 {
831 my $f = shift;
832 return unless (defined($rec && $rec->{$f}));
833 my $sf = shift;
834 warn "rec2($f,$sf) = ", dump( $rec->{$f} ), $/ if ($debug > 1);
835 return map {
836 if (ref($_->{$sf}) eq 'ARRAY') {
837 @{ $_->{$sf} };
838 } else {
839 $_->{$sf};
840 }
841 } grep { ref($_) eq 'HASH' && $_->{$sf} } @{ $rec->{$f} };
842 }
843
844 =head2 rec
845
846 syntaxtic sugar for
847
848 @v = rec('200')
849 @v = rec('200','a')
850
851 =cut
852
853 sub rec {
854 my @out;
855 if ($#_ == 0) {
856 @out = rec1(@_);
857 } elsif ($#_ == 1) {
858 @out = rec2(@_);
859 }
860 if (@out) {
861 return @out;
862 } else {
863 return '';
864 }
865 }
866
867 =head2 regex
868
869 Apply regex to some or all values
870
871 @v = regex( 's/foo/bar/g', @v );
872
873 =cut
874
875 sub regex {
876 my $r = shift;
877 my @out;
878 #warn "r: $r\n", dump(\@_);
879 foreach my $t (@_) {
880 next unless ($t);
881 eval "\$t =~ $r";
882 push @out, $t if ($t && $t ne '');
883 }
884 return @out;
885 }
886
887 =head2 prefix
888
889 Prefix all values with a string
890
891 @v = prefix( 'my_', @v );
892
893 =cut
894
895 sub prefix {
896 my $p = shift or return;
897 return map { $p . $_ } grep { defined($_) } @_;
898 }
899
900 =head2 suffix
901
902 suffix all values with a string
903
904 @v = suffix( '_my', @v );
905
906 =cut
907
908 sub suffix {
909 my $s = shift or die "suffix needs string as first argument";
910 return map { $_ . $s } grep { defined($_) } @_;
911 }
912
913 =head2 surround
914
915 surround all values with a two strings
916
917 @v = surround( 'prefix_', '_suffix', @v );
918
919 =cut
920
921 sub surround {
922 my $p = shift or die "surround need prefix as first argument";
923 my $s = shift or die "surround needs suffix as second argument";
924 return map { $p . $_ . $s } grep { defined($_) } @_;
925 }
926
927 =head2 first
928
929 Return first element
930
931 $v = first( @v );
932
933 =cut
934
935 sub first {
936 my $r = shift;
937 return $r;
938 }
939
940 =head2 lookup
941
942 Consult lookup hashes for some value
943
944 @v = lookup( $v );
945 @v = lookup( @v );
946
947 FIXME B<currently this one is broken!>
948
949 =cut
950
951 sub lookup {
952 my $k = shift or return;
953 return unless (defined($lookup->{$k}));
954 if (ref($lookup->{$k}) eq 'ARRAY') {
955 return @{ $lookup->{$k} };
956 } else {
957 return $lookup->{$k};
958 }
959 }
960
961 =head2 save_into_lookup
962
963 Save value into lookup.
964
965 save_into_lookup($key,sub {
966 # code which produce one or more values
967 });
968
969 This function shouldn't be called directly, it's called from code created by L<WebPAC::Parser>.
970
971 =cut
972
973 sub save_into_lookup {
974 my ($k,$coderef) = @_;
975 die "save_into_lookup needs key" unless defined($k);
976 die "save_into_lookup needs CODE" unless ( defined($coderef) && ref($coderef) eq 'CODE' );
977 my $mfn = $rec->{'000'}->[0] || die "mfn not defined or zero";
978 foreach my $v ( $coderef->() ) {
979 $lookup->{$k}->{$v}->{$mfn}++;
980 warn "# lookup $k $v $mfn saved\n" if ($debug > 1);
981 }
982 }
983
984 =head2 config
985
986 Consult config values stored in C<config.yml>
987
988 # return database code (key under databases in yaml)
989 $database_code = config(); # use _ from hash
990 $database_name = config('name');
991 $database_input_name = config('input name');
992 $tag = config('input normalize tag');
993
994 Up to three levels are supported.
995
996 =cut
997
998 sub config {
999 return unless ($config);
1000
1001 my $p = shift;
1002
1003 $p ||= '';
1004
1005 my $v;
1006
1007 warn "### getting config($p)\n" if ($debug > 1);
1008
1009 my @p = split(/\s+/,$p);
1010 if ($#p < 0) {
1011 $v = $config->{ '_' }; # special, database code
1012 } else {
1013
1014 my $c = dclone( $config );
1015
1016 foreach my $k (@p) {
1017 warn "### k: $k c = ",dump($c),$/ if ($debug > 1);
1018 if (ref($c) eq 'ARRAY') {
1019 $c = shift @$c;
1020 warn "config($p) taking first occurence of '$k', probably not what you wanted!\n";
1021 last;
1022 }
1023
1024 if (! defined($c->{$k}) ) {
1025 $c = undef;
1026 last;
1027 } else {
1028 $c = $c->{$k};
1029 }
1030 }
1031 $v = $c if ($c);
1032
1033 }
1034
1035 warn "## config( '$p' ) = ",dump( $v ),$/ if ($v && $debug);
1036 warn "config( '$p' ) is empty\n" if (! $v);
1037
1038 return $v;
1039 }
1040
1041 =head2 id
1042
1043 Returns unique id of this record
1044
1045 $id = id();
1046
1047 Returns C<42/2> for 2nd occurence of MFN 42.
1048
1049 =cut
1050
1051 sub id {
1052 my $mfn = $config->{_mfn} || die "no _mfn in config data";
1053 return $mfn . $#{$marc_record} ? $#{$marc_record} + 1 : '';
1054 }
1055
1056 =head2 join_with
1057
1058 Joins walues with some delimiter
1059
1060 $v = join_with(", ", @v);
1061
1062 =cut
1063
1064 sub join_with {
1065 my $d = shift;
1066 warn "### join_with('$d',",dump(@_),")\n" if ($debug > 2);
1067 my $v = join($d, grep { defined($_) && $_ ne '' } @_);
1068 return '' unless defined($v);
1069 return $v;
1070 }
1071
1072 =head2 split_rec_on
1073
1074 Split record subfield on some regex and take one of parts out
1075
1076 $a_before_semi_column =
1077 split_rec_on('200','a', /\s*;\s*/, $part);
1078
1079 C<$part> is optional number of element. First element is
1080 B<1>, not 0!
1081
1082 If there is no C<$part> parameter or C<$part> is 0, this function will
1083 return all values produced by splitting.
1084
1085 =cut
1086
1087 sub split_rec_on {
1088 die "split_rec_on need (fld,sf,regex[,part]" if ($#_ < 2);
1089
1090 my ($fld, $sf, $regex, $part) = @_;
1091 warn "### regex ", ref($regex), $regex, $/ if ($debug > 2);
1092
1093 my @r = rec( $fld, $sf );
1094 my $v = shift @r;
1095 warn "### first rec($fld,$sf) = ",dump($v),$/ if ($debug > 2);
1096
1097 return '' if ( ! defined($v) || $v =~ /^\s*$/);
1098
1099 my @s = split( $regex, $v );
1100 warn "## split_rec_on($fld,$sf,$regex,$part) = ",dump(@s),$/ if ($debug > 1);
1101 if ($part && $part > 0) {
1102 return $s[ $part - 1 ];
1103 } else {
1104 return @s;
1105 }
1106 }
1107
1108 # END
1109 1;

  ViewVC Help
Powered by ViewVC 1.1.26