/[webpac2]/trunk/lib/WebPAC/Normalize.pm
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Contents of /trunk/lib/WebPAC/Normalize.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 631 - (show annotations)
Wed Sep 6 14:25:16 2006 UTC (17 years, 7 months ago) by dpavlin
File size: 22254 byte(s)
 r890@llin:  dpavlin | 2006-09-06 16:24:27 +0200
 rec1 now unrolls subfields in correct order (as in source file)

1 package WebPAC::Normalize;
2 use Exporter 'import';
3 @EXPORT = qw/
4 _set_rec _set_lookup
5 _get_ds _clean_ds
6 _debug
7
8 tag search display
9 marc marc_indicators marc_repeatable_subfield
10 marc_compose marc_leader
11 marc_duplicate marc_remove
12 marc_original_order
13
14 rec1 rec2 rec
15 regex prefix suffix surround
16 first lookup join_with
17
18 split_rec_on
19 /;
20
21 use warnings;
22 use strict;
23
24 #use base qw/WebPAC::Common/;
25 use Data::Dump qw/dump/;
26 use Encode qw/from_to/;
27 use Storable qw/dclone/;
28
29 # debugging warn(s)
30 my $debug = 0;
31
32
33 =head1 NAME
34
35 WebPAC::Normalize - describe normalisaton rules using sets
36
37 =head1 VERSION
38
39 Version 0.17
40
41 =cut
42
43 our $VERSION = '0.17';
44
45 =head1 SYNOPSIS
46
47 This module uses C<conf/normalize/*.pl> files to perform normalisation
48 from input records using perl functions which are specialized for set
49 processing.
50
51 Sets are implemented as arrays, and normalisation file is valid perl, which
52 means that you check it's validity before running WebPAC using
53 C<perl -c normalize.pl>.
54
55 Normalisation can generate multiple output normalized data. For now, supported output
56 types (on the left side of definition) are: C<tag>, C<display>, C<search> and
57 C<marc>.
58
59 =head1 FUNCTIONS
60
61 Functions which start with C<_> are private and used by WebPAC internally.
62 All other functions are available for use within normalisation rules.
63
64 =head2 data_structure
65
66 Return data structure
67
68 my $ds = WebPAC::Normalize::data_structure(
69 lookup => $lookup->lookup_hash,
70 row => $row,
71 rules => $normalize_pl_config,
72 marc_encoding => 'utf-8',
73 config => $config,
74 );
75
76 Options C<lookup>, C<row>, C<rules> and C<log> are mandatory while all
77 other are optional.
78
79 This function will B<die> if normalizastion can't be evaled.
80
81 Since this function isn't exported you have to call it with
82 C<WebPAC::Normalize::data_structure>.
83
84 =cut
85
86 sub data_structure {
87 my $arg = {@_};
88
89 die "need row argument" unless ($arg->{row});
90 die "need normalisation argument" unless ($arg->{rules});
91
92 no strict 'subs';
93 _set_lookup( $arg->{lookup} );
94 _set_rec( $arg->{row} );
95 _set_config( $arg->{config} );
96 _clean_ds( %{ $arg } );
97 eval "$arg->{rules}";
98 die "error evaling $arg->{rules}: $@\n" if ($@);
99
100 return _get_ds();
101 }
102
103 =head2 _set_rec
104
105 Set current record hash
106
107 _set_rec( $rec );
108
109 =cut
110
111 my $rec;
112
113 sub _set_rec {
114 $rec = shift or die "no record hash";
115 }
116
117 =head2 _set_config
118
119 Set current config hash
120
121 _set_config( $config );
122
123 Magic keys are:
124
125 =over 4
126
127 =item _
128
129 Code of current database
130
131 =item _mfn
132
133 Current MFN
134
135 =back
136
137 =cut
138
139 my $config;
140
141 sub _set_config {
142 $config = shift;
143 }
144
145 =head2 _get_ds
146
147 Return hash formatted as data structure
148
149 my $ds = _get_ds();
150
151 =cut
152
153 my ($out, $marc_record, $marc_encoding, $marc_repeatable_subfield, $marc_indicators);
154 my ($marc_record_offset, $marc_fetch_offset) = (0, 0);
155
156 sub _get_ds {
157 return $out;
158 }
159
160 =head2 _clean_ds
161
162 Clean data structure hash for next record
163
164 _clean_ds();
165
166 =cut
167
168 sub _clean_ds {
169 my $a = {@_};
170 ($out,$marc_record, $marc_encoding, $marc_repeatable_subfield, $marc_indicators) = ();
171 ($marc_record_offset, $marc_fetch_offset) = (0,0);
172 $marc_encoding = $a->{marc_encoding};
173 }
174
175 =head2 _set_lookup
176
177 Set current lookup hash
178
179 _set_lookup( $lookup );
180
181 =cut
182
183 my $lookup;
184
185 sub _set_lookup {
186 $lookup = shift;
187 }
188
189 =head2 _get_marc_fields
190
191 Get all fields defined by calls to C<marc>
192
193 $marc->add_fields( WebPAC::Normalize:_get_marc_fields() );
194
195 We are using I<magic> which detect repeatable fields only from
196 sequence of field/subfield data generated by normalization.
197
198 Repeatable field is created when there is second occurence of same subfield or
199 if any of indicators are different.
200
201 This is sane for most cases. Something like:
202
203 900a-1 900b-1 900c-1
204 900a-2 900b-2
205 900a-3
206
207 will be created from any combination of:
208
209 900a-1 900a-2 900a-3 900b-1 900b-2 900c-1
210
211 and following rules:
212
213 marc('900','a', rec('200','a') );
214 marc('900','b', rec('200','b') );
215 marc('900','c', rec('200','c') );
216
217 which might not be what you have in mind. If you need repeatable subfield,
218 define it using C<marc_repeatable_subfield> like this:
219
220 marc_repeatable_subfield('900','a');
221 marc('900','a', rec('200','a') );
222 marc('900','b', rec('200','b') );
223 marc('900','c', rec('200','c') );
224
225 will create:
226
227 900a-1 900a-2 900a-3 900b-1 900c-1
228 900b-2
229
230 There is also support for returning next or specific using:
231
232 while (my $mf = WebPAC::Normalize:_get_marc_fields( fetch_next => 1 ) ) {
233 # do something with $mf
234 }
235
236 will always return fields from next MARC record or
237
238 my $mf = WebPAC::Normalize::_get_marc_fields( offset => 42 );
239
240 will return 42th copy record (if it exists).
241
242 =cut
243
244 sub _get_marc_fields {
245
246 my $arg = {@_};
247 warn "### _get_marc_fields arg: ", dump($arg), $/ if ($debug > 2);
248 my $offset = $marc_fetch_offset;
249 if ($arg->{offset}) {
250 $offset = $arg->{offset};
251 } elsif($arg->{fetch_next}) {
252 $marc_fetch_offset++;
253 }
254
255 return if (! $marc_record || ref($marc_record) ne 'ARRAY');
256
257 warn "### full marc_record = ", dump( @{ $marc_record }), $/ if ($debug > 2);
258
259 my $marc_rec = $marc_record->[ $offset ];
260
261 warn "## _get_marc_fields (at offset: $offset) -- marc_record = ", dump( @$marc_rec ), $/ if ($debug > 1);
262
263 return if (! $marc_rec || ref($marc_rec) ne 'ARRAY' || $#{ $marc_rec } < 0);
264
265 # first, sort all existing fields
266 # XXX might not be needed, but modern perl might randomize elements in hash
267 my @sorted_marc_record = sort {
268 $a->[0] . ( $a->[3] || '' ) cmp $b->[0] . ( $b->[3] || '')
269 } @{ $marc_rec };
270
271 @sorted_marc_record = @{ $marc_rec }; ### FIXME disable sorting
272
273 # output marc fields
274 my @m;
275
276 # count unique field-subfields (used for offset when walking to next subfield)
277 my $u;
278 map { $u->{ $_->[0] . ( $_->[3] || '') }++ } @sorted_marc_record;
279
280 if ($debug) {
281 warn "## marc_repeatable_subfield = ", dump( $marc_repeatable_subfield ), $/ if ( $marc_repeatable_subfield );
282 warn "## marc_record[$offset] = ", dump( $marc_rec ), $/;
283 warn "## sorted_marc_record = ", dump( \@sorted_marc_record ), $/;
284 warn "## subfield count = ", dump( $u ), $/;
285 }
286
287 my $len = $#sorted_marc_record;
288 my $visited;
289 my $i = 0;
290 my $field;
291
292 foreach ( 0 .. $len ) {
293
294 # find next element which isn't visited
295 while ($visited->{$i}) {
296 $i = ($i + 1) % ($len + 1);
297 }
298
299 # mark it visited
300 $visited->{$i}++;
301
302 my $row = dclone( $sorted_marc_record[$i] );
303
304 # field and subfield which is key for
305 # marc_repeatable_subfield and u
306 my $fsf = $row->[0] . ( $row->[3] || '' );
307
308 if ($debug > 1) {
309
310 print "### field so far [", $#$field, "] : ", dump( $field ), " ", $field ? 'T' : 'F', $/;
311 print "### this [$i]: ", dump( $row ),$/;
312 print "### sf: ", $row->[3], " vs ", $field->[3],
313 $marc_repeatable_subfield->{ $row->[0] . $row->[3] } ? ' (repeatable)' : '', $/,
314 if ($#$field >= 0);
315
316 }
317
318 # if field exists
319 if ( $#$field >= 0 ) {
320 if (
321 $row->[0] ne $field->[0] || # field
322 $row->[1] ne $field->[1] || # i1
323 $row->[2] ne $field->[2] # i2
324 ) {
325 push @m, $field;
326 warn "## saved/1 ", dump( $field ),$/ if ($debug);
327 $field = $row;
328
329 } elsif (
330 ( $row->[3] lt $field->[-2] ) # subfield which is not next (e.g. a after c)
331 ||
332 ( $row->[3] eq $field->[-2] && # same subfield, but not repeatable
333 ! $marc_repeatable_subfield->{ $fsf }
334 )
335 ) {
336 push @m, $field;
337 warn "## saved/2 ", dump( $field ),$/ if ($debug);
338 $field = $row;
339
340 } else {
341 # append new subfields to existing field
342 push @$field, ( $row->[3], $row->[4] );
343 }
344 } else {
345 # insert first field
346 $field = $row;
347 }
348
349 if (! $marc_repeatable_subfield->{ $fsf }) {
350 # make step to next subfield
351 $i = ($i + $u->{ $fsf } ) % ($len + 1);
352 }
353 }
354
355 if ($#$field >= 0) {
356 push @m, $field;
357 warn "## saved/3 ", dump( $field ),$/ if ($debug);
358 }
359
360 return \@m;
361 }
362
363 =head2 _debug
364
365 Change level of debug warnings
366
367 _debug( 2 );
368
369 =cut
370
371 sub _debug {
372 my $l = shift;
373 return $debug unless defined($l);
374 warn "debug level $l",$/ if ($l > 0);
375 $debug = $l;
376 }
377
378 =head1 Functions to create C<data_structure>
379
380 Those functions generally have to first in your normalization file.
381
382 =head2 tag
383
384 Define new tag for I<search> and I<display>.
385
386 tag('Title', rec('200','a') );
387
388
389 =cut
390
391 sub tag {
392 my $name = shift or die "tag needs name as first argument";
393 my @o = grep { defined($_) && $_ ne '' } @_;
394 return unless (@o);
395 $out->{$name}->{tag} = $name;
396 $out->{$name}->{search} = \@o;
397 $out->{$name}->{display} = \@o;
398 }
399
400 =head2 display
401
402 Define tag just for I<display>
403
404 @v = display('Title', rec('200','a') );
405
406 =cut
407
408 sub display {
409 my $name = shift or die "display needs name as first argument";
410 my @o = grep { defined($_) && $_ ne '' } @_;
411 return unless (@o);
412 $out->{$name}->{tag} = $name;
413 $out->{$name}->{display} = \@o;
414 }
415
416 =head2 search
417
418 Prepare values just for I<search>
419
420 @v = search('Title', rec('200','a') );
421
422 =cut
423
424 sub search {
425 my $name = shift or die "search needs name as first argument";
426 my @o = grep { defined($_) && $_ ne '' } @_;
427 return unless (@o);
428 $out->{$name}->{tag} = $name;
429 $out->{$name}->{search} = \@o;
430 }
431
432 =head2 marc_leader
433
434 Setup fields within MARC leader or get leader
435
436 marc_leader('05','c');
437 my $leader = marc_leader();
438
439 =cut
440
441 sub marc_leader {
442 my ($offset,$value) = @_;
443
444 if ($offset) {
445 $out->{' leader'}->{ $offset } = $value;
446 } else {
447 return $out->{' leader'};
448 }
449 }
450
451 =head2 marc
452
453 Save value for MARC field
454
455 marc('900','a', rec('200','a') );
456 marc('001', rec('000') );
457
458 =cut
459
460 sub marc {
461 my $f = shift or die "marc needs field";
462 die "marc field must be numer" unless ($f =~ /^\d+$/);
463
464 my $sf;
465 if ($f >= 10) {
466 $sf = shift or die "marc needs subfield";
467 }
468
469 foreach (@_) {
470 my $v = $_; # make var read-write for Encode
471 next unless (defined($v) && $v !~ /^\s*$/);
472 from_to($v, 'iso-8859-2', $marc_encoding) if ($marc_encoding);
473 my ($i1,$i2) = defined($marc_indicators->{$f}) ? @{ $marc_indicators->{$f} } : (' ',' ');
474 if (defined $sf) {
475 push @{ $marc_record->[ $marc_record_offset ] }, [ $f, $i1, $i2, $sf => $v ];
476 } else {
477 push @{ $marc_record->[ $marc_record_offset ] }, [ $f, $v ];
478 }
479 }
480 }
481
482 =head2 marc_repeatable_subfield
483
484 Save values for MARC repetable subfield
485
486 marc_repeatable_subfield('910', 'z', rec('909') );
487
488 =cut
489
490 sub marc_repeatable_subfield {
491 my ($f,$sf) = @_;
492 die "marc_repeatable_subfield need field and subfield!\n" unless ($f && $sf);
493 $marc_repeatable_subfield->{ $f . $sf }++;
494 marc(@_);
495 }
496
497 =head2 marc_indicators
498
499 Set both indicators for MARC field
500
501 marc_indicators('900', ' ', 1);
502
503 Any indicator value other than C<0-9> will be treated as undefined.
504
505 =cut
506
507 sub marc_indicators {
508 my $f = shift || die "marc_indicators need field!\n";
509 my ($i1,$i2) = @_;
510 die "marc_indicators($f, ...) need i1!\n" unless(defined($i1));
511 die "marc_indicators($f, $i1, ...) need i2!\n" unless(defined($i2));
512
513 $i1 = ' ' if ($i1 !~ /^\d$/);
514 $i2 = ' ' if ($i2 !~ /^\d$/);
515 @{ $marc_indicators->{$f} } = ($i1,$i2);
516 }
517
518 =head2 marc_compose
519
520 Save values for each MARC subfield explicitly
521
522 marc_compose('900',
523 'a', rec('200','a')
524 'b', rec('201','a')
525 'a', rec('200','b')
526 'c', rec('200','c')
527 );
528
529 If you specify C<+> for subfield, value will be appended
530 to previous defined subfield.
531
532 =cut
533
534 sub marc_compose {
535 my $f = shift or die "marc_compose needs field";
536 die "marc_compose field must be numer" unless ($f =~ /^\d+$/);
537
538 my ($i1,$i2) = defined($marc_indicators->{$f}) ? @{ $marc_indicators->{$f} } : (' ',' ');
539 my $m = [ $f, $i1, $i2 ];
540
541 warn "### marc_compose input subfields = ", dump(@_),$/ if ($debug > 2);
542
543 if ($#_ % 2 != 1) {
544 die "ERROR: marc_compose",dump($f,@_)," not valid (must be even).\nDo you need to add first() or join() around some argument?\n";
545 }
546
547 while (@_) {
548 my $sf = shift;
549 my $v = shift;
550
551 next unless (defined($v) && $v !~ /^\s*$/);
552 from_to($v, 'iso-8859-2', $marc_encoding) if ($marc_encoding);
553 warn "## ++ marc_compose($f,$sf,$v) ", dump( $m ),$/ if ($debug > 1);
554 if ($sf ne '+') {
555 push @$m, ( $sf, $v );
556 } else {
557 $m->[ $#$m ] .= $v;
558 }
559 }
560
561 warn "## marc_compose current marc = ", dump( $m ),$/ if ($debug > 1);
562
563 push @{ $marc_record->[ $marc_record_offset ] }, $m if ($#{$m} > 2);
564 }
565
566 =head2 marc_duplicate
567
568 Generate copy of current MARC record and continue working on copy
569
570 marc_duplicate();
571
572 Copies can be accessed using C<< _get_marc_fields( fetch_next => 1 ) >> or
573 C<< _get_marc_fields( offset => 42 ) >>.
574
575 =cut
576
577 sub marc_duplicate {
578 my $m = $marc_record->[ -1 ];
579 die "can't duplicate record which isn't defined" unless ($m);
580 push @{ $marc_record }, dclone( $m );
581 warn "## marc_duplicate = ", dump(@$marc_record), $/ if ($debug > 1);
582 $marc_record_offset = $#{ $marc_record };
583 warn "## marc_record_offset = $marc_record_offset", $/ if ($debug > 1);
584 }
585
586 =head2 marc_remove
587
588 Remove some field or subfield from MARC record.
589
590 marc_remove('200');
591 marc_remove('200','a');
592
593 This will erase field C<200> or C<200^a> from current MARC record.
594
595 This is useful after calling C<marc_duplicate> or on it's own (but, you
596 should probably just remove that subfield definition if you are not
597 using C<marc_duplicate>).
598
599 FIXME: support fields < 10.
600
601 =cut
602
603 sub marc_remove {
604 my ($f, $sf) = @_;
605
606 die "marc_remove needs record number" unless defined($f);
607
608 my $marc = $marc_record->[ $marc_record_offset ];
609
610 warn "### marc_remove before = ", dump( $marc ), $/ if ($debug > 2);
611
612 my $i = 0;
613 foreach ( 0 .. $#{ $marc } ) {
614 last unless (defined $marc->[$i]);
615 warn "#### working on ",dump( @{ $marc->[$i] }), $/ if ($debug > 3);
616 if ($marc->[$i]->[0] eq $f) {
617 if (! defined $sf) {
618 # remove whole field
619 splice @$marc, $i, 1;
620 warn "#### slice \@\$marc, $i, 1 = ",dump( @{ $marc }), $/ if ($debug > 3);
621 $i--;
622 } else {
623 foreach my $j ( 0 .. (( $#{ $marc->[$i] } - 3 ) / 2) ) {
624 my $o = ($j * 2) + 3;
625 if ($marc->[$i]->[$o] eq $sf) {
626 # remove subfield
627 splice @{$marc->[$i]}, $o, 2;
628 warn "#### slice \@{\$marc->[$i]}, $o, 2 = ", dump( @{ $marc }), $/ if ($debug > 3);
629 # is record now empty?
630 if ($#{ $marc->[$i] } == 2) {
631 splice @$marc, $i, 1;
632 warn "#### slice \@\$marc, $i, 1 = ", dump( @{ $marc }), $/ if ($debug > 3);
633 $i--;
634 };
635 }
636 }
637 }
638 }
639 $i++;
640 }
641
642 warn "### marc_remove($f", $sf ? ",$sf" : "", ") after = ", dump( $marc ), $/ if ($debug > 2);
643
644 $marc_record->[ $marc_record_offset ] = $marc;
645
646 warn "## full marc_record = ", dump( @{ $marc_record }), $/ if ($debug > 1);
647 }
648
649 =head2 marc_original_order
650
651 Copy all subfields preserving original order to marc field.
652
653 marc_original_order( marc_field_number, original_input_field_number );
654
655 Please note that field numbers are consistent with other commands (marc
656 field number first), but somewhat counter-intuitive (destination and then
657 source).
658
659 You might want to use this command if you are just renaming subfields or
660 using pre-processing modify_record in C<config.yml> and don't need any
661 post-processing or want to preserve order of original subfields.
662
663
664 =cut
665
666 sub marc_original_order {
667
668 my ($to, $from) = @_;
669 die "marc_original_order needs from and to fields\n" unless ($from && $to);
670
671 return unless defined($rec->{$from});
672
673 my $r = $rec->{$from};
674 die "record field $from isn't array\n" unless (ref($r) eq 'ARRAY');
675
676 my ($i1,$i2) = defined($marc_indicators->{$to}) ? @{ $marc_indicators->{$to} } : (' ',' ');
677 warn "## marc_original_order($to,$from) source = ", dump( $r ),$/ if ($debug > 1);
678
679 foreach my $d (@$r) {
680
681 if (! defined($d->{subfields}) && ref($d->{subfields}) ne 'ARRAY') {
682 warn "# marc_original_order($to,$from): field $from doesn't have subfields specification\n";
683 next;
684 }
685
686 my @sfs = @{ $d->{subfields} };
687
688 die "field $from doesn't have even number of subfields specifications\n" unless($#sfs % 2 == 1);
689
690 warn "#--> d: ",dump($d), "\n#--> sfs: ",dump(@sfs),$/ if ($debug > 2);
691
692 my $m = [ $to, $i1, $i2 ];
693
694 while (my $sf = shift @sfs) {
695
696 warn "#--> sf: ",dump($sf), $/ if ($debug > 2);
697 my $offset = shift @sfs;
698 die "corrupted sufields specification for field $from\n" unless defined($offset);
699
700 my $v;
701 if (ref($d->{$sf}) eq 'ARRAY') {
702 $v = $d->{$sf}->[$offset] if (defined($d->{$sf}->[$offset]));
703 } elsif ($offset == 0) {
704 $v = $d->{$sf};
705 } else {
706 die "field $from subfield '$sf' need occurence $offset which doesn't exist", dump($d->{$sf});
707 }
708 push @$m, ( $sf, $v ) if (defined($v));
709 }
710
711 if ($#{$m} > 2) {
712 push @{ $marc_record->[ $marc_record_offset ] }, $m;
713 }
714 }
715
716 warn "## marc_record = ", dump( $marc_record ),$/ if ($debug > 1);
717 }
718
719
720 =head1 Functions to extract data from input
721
722 This function should be used inside functions to create C<data_structure> described
723 above.
724
725 =head2 rec1
726
727 Return all values in some field
728
729 @v = rec1('200')
730
731 TODO: order of values is probably same as in source data, need to investigate that
732
733 =cut
734
735 sub rec1 {
736 my $f = shift;
737 warn "rec1($f) = ", dump( $rec->{$f} ), $/ if ($debug > 1);
738 return unless (defined($rec) && defined($rec->{$f}));
739 warn "rec1($f) = ", dump( $rec->{$f} ), $/ if ($debug > 1);
740 if (ref($rec->{$f}) eq 'ARRAY') {
741 return map {
742 if (ref($_) eq 'HASH') {
743 my $h = $_;
744 if ( defined($h->{subfields}) ) {
745 my $sfs = delete $h->{subfields} || die "no subfields?";
746 my @out;
747 while (@$sfs) {
748 my $sf = shift @$sfs;
749 my $o = shift @$sfs;
750 if ($o == 0 && ref( $h->{$sf} ) ne 'ARRAY' ) {
751 # single element subfields are not arrays
752 push @out, $h->{$sf};
753 } else {
754 warn "====> $f $sf $o $#$sfs ", dump( $sfs ), "\n";
755 push @out, $h->{$sf}->[$o];
756 }
757 }
758 return @out;
759 } else {
760 # FIXME this should probably be in alphabetical order instead of hash order
761 values %{$h};
762 }
763 } else {
764 $_;
765 }
766 } @{ $rec->{$f} };
767 } elsif( defined($rec->{$f}) ) {
768 return $rec->{$f};
769 }
770 }
771
772 =head2 rec2
773
774 Return all values in specific field and subfield
775
776 @v = rec2('200','a')
777
778 =cut
779
780 sub rec2 {
781 my $f = shift;
782 return unless (defined($rec && $rec->{$f}));
783 my $sf = shift;
784 warn "rec2($f,$sf) = ", dump( $rec->{$f} ), $/ if ($debug > 1);
785 return map {
786 if (ref($_->{$sf}) eq 'ARRAY') {
787 @{ $_->{$sf} };
788 } else {
789 $_->{$sf};
790 }
791 } grep { ref($_) eq 'HASH' && $_->{$sf} } @{ $rec->{$f} };
792 }
793
794 =head2 rec
795
796 syntaxtic sugar for
797
798 @v = rec('200')
799 @v = rec('200','a')
800
801 =cut
802
803 sub rec {
804 my @out;
805 if ($#_ == 0) {
806 @out = rec1(@_);
807 } elsif ($#_ == 1) {
808 @out = rec2(@_);
809 }
810 if (@out) {
811 return @out;
812 } else {
813 return '';
814 }
815 }
816
817 =head2 regex
818
819 Apply regex to some or all values
820
821 @v = regex( 's/foo/bar/g', @v );
822
823 =cut
824
825 sub regex {
826 my $r = shift;
827 my @out;
828 #warn "r: $r\n", dump(\@_);
829 foreach my $t (@_) {
830 next unless ($t);
831 eval "\$t =~ $r";
832 push @out, $t if ($t && $t ne '');
833 }
834 return @out;
835 }
836
837 =head2 prefix
838
839 Prefix all values with a string
840
841 @v = prefix( 'my_', @v );
842
843 =cut
844
845 sub prefix {
846 my $p = shift or return;
847 return map { $p . $_ } grep { defined($_) } @_;
848 }
849
850 =head2 suffix
851
852 suffix all values with a string
853
854 @v = suffix( '_my', @v );
855
856 =cut
857
858 sub suffix {
859 my $s = shift or die "suffix needs string as first argument";
860 return map { $_ . $s } grep { defined($_) } @_;
861 }
862
863 =head2 surround
864
865 surround all values with a two strings
866
867 @v = surround( 'prefix_', '_suffix', @v );
868
869 =cut
870
871 sub surround {
872 my $p = shift or die "surround need prefix as first argument";
873 my $s = shift or die "surround needs suffix as second argument";
874 return map { $p . $_ . $s } grep { defined($_) } @_;
875 }
876
877 =head2 first
878
879 Return first element
880
881 $v = first( @v );
882
883 =cut
884
885 sub first {
886 my $r = shift;
887 return $r;
888 }
889
890 =head2 lookup
891
892 Consult lookup hashes for some value
893
894 @v = lookup( $v );
895 @v = lookup( @v );
896
897 =cut
898
899 sub lookup {
900 my $k = shift or return;
901 return unless (defined($lookup->{$k}));
902 if (ref($lookup->{$k}) eq 'ARRAY') {
903 return @{ $lookup->{$k} };
904 } else {
905 return $lookup->{$k};
906 }
907 }
908
909 =head2 config
910
911 Consult config values stored in C<config.yml>
912
913 # return database code (key under databases in yaml)
914 $database_code = config(); # use _ from hash
915 $database_name = config('name');
916 $database_input_name = config('input name');
917 $tag = config('input normalize tag');
918
919 Up to three levels are supported.
920
921 =cut
922
923 sub config {
924 return unless ($config);
925
926 my $p = shift;
927
928 $p ||= '';
929
930 my $v;
931
932 warn "### getting config($p)\n" if ($debug > 1);
933
934 my @p = split(/\s+/,$p);
935 if ($#p < 0) {
936 $v = $config->{ '_' }; # special, database code
937 } else {
938
939 my $c = dclone( $config );
940
941 foreach my $k (@p) {
942 warn "### k: $k c = ",dump($c),$/ if ($debug > 1);
943 if (ref($c) eq 'ARRAY') {
944 $c = shift @$c;
945 warn "config($p) taking first occurence of '$k', probably not what you wanted!\n";
946 last;
947 }
948
949 if (! defined($c->{$k}) ) {
950 $c = undef;
951 last;
952 } else {
953 $c = $c->{$k};
954 }
955 }
956 $v = $c if ($c);
957
958 }
959
960 warn "## config( '$p' ) = ",dump( $v ),$/ if ($v && $debug);
961 warn "config( '$p' ) is empty\n" if (! $v);
962
963 return $v;
964 }
965
966 =head2 id
967
968 Returns unique id of this record
969
970 $id = id();
971
972 Returns C<42/2> for 2nd occurence of MFN 42.
973
974 =cut
975
976 sub id {
977 my $mfn = $config->{_mfn} || die "no _mfn in config data";
978 return $mfn . $#{$marc_record} ? $#{$marc_record} + 1 : '';
979 }
980
981 =head2 join_with
982
983 Joins walues with some delimiter
984
985 $v = join_with(", ", @v);
986
987 =cut
988
989 sub join_with {
990 my $d = shift;
991 warn "### join_with('$d',",dump(@_),")\n" if ($debug > 2);
992 my $v = join($d, grep { defined($_) && $_ ne '' } @_);
993 return '' unless defined($v);
994 return $v;
995 }
996
997 =head2 split_rec_on
998
999 Split record subfield on some regex and take one of parts out
1000
1001 $a_before_semi_column =
1002 split_rec_on('200','a', /\s*;\s*/, $part);
1003
1004 C<$part> is optional number of element. First element is
1005 B<1>, not 0!
1006
1007 If there is no C<$part> parameter or C<$part> is 0, this function will
1008 return all values produced by splitting.
1009
1010 =cut
1011
1012 sub split_rec_on {
1013 die "split_rec_on need (fld,sf,regex[,part]" if ($#_ < 2);
1014
1015 my ($fld, $sf, $regex, $part) = @_;
1016 warn "### regex ", ref($regex), $regex, $/ if ($debug > 2);
1017
1018 my @r = rec( $fld, $sf );
1019 my $v = shift @r;
1020 warn "### first rec($fld,$sf) = ",dump($v),$/ if ($debug > 2);
1021
1022 return '' if ( ! defined($v) || $v =~ /^\s*$/);
1023
1024 my @s = split( $regex, $v );
1025 warn "## split_rec_on($fld,$sf,$regex,$part) = ",dump(@s),$/ if ($debug > 1);
1026 if ($part && $part > 0) {
1027 return $s[ $part - 1 ];
1028 } else {
1029 return @s;
1030 }
1031 }
1032
1033 # END
1034 1;

  ViewVC Help
Powered by ViewVC 1.1.26