/[webpac2]/trunk/lib/WebPAC/Normalize.pm
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Contents of /trunk/lib/WebPAC/Normalize.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 642 - (show annotations)
Wed Sep 6 21:09:30 2006 UTC (17 years, 7 months ago) by dpavlin
File size: 22575 byte(s)
make it less chatty

1 package WebPAC::Normalize;
2 use Exporter 'import';
3 @EXPORT = qw/
4 _set_rec _set_lookup
5 _get_ds _clean_ds
6 _debug
7 _pack_subfields_hash
8
9 tag search display
10 marc marc_indicators marc_repeatable_subfield
11 marc_compose marc_leader
12 marc_duplicate marc_remove
13 marc_original_order
14
15 rec1 rec2 rec
16 regex prefix suffix surround
17 first lookup join_with
18
19 split_rec_on
20 /;
21
22 use warnings;
23 use strict;
24
25 #use base qw/WebPAC::Common/;
26 use Data::Dump qw/dump/;
27 use Encode qw/from_to/;
28 use Storable qw/dclone/;
29
30 # debugging warn(s)
31 my $debug = 0;
32
33
34 =head1 NAME
35
36 WebPAC::Normalize - describe normalisaton rules using sets
37
38 =head1 VERSION
39
40 Version 0.17
41
42 =cut
43
44 our $VERSION = '0.17';
45
46 =head1 SYNOPSIS
47
48 This module uses C<conf/normalize/*.pl> files to perform normalisation
49 from input records using perl functions which are specialized for set
50 processing.
51
52 Sets are implemented as arrays, and normalisation file is valid perl, which
53 means that you check it's validity before running WebPAC using
54 C<perl -c normalize.pl>.
55
56 Normalisation can generate multiple output normalized data. For now, supported output
57 types (on the left side of definition) are: C<tag>, C<display>, C<search> and
58 C<marc>.
59
60 =head1 FUNCTIONS
61
62 Functions which start with C<_> are private and used by WebPAC internally.
63 All other functions are available for use within normalisation rules.
64
65 =head2 data_structure
66
67 Return data structure
68
69 my $ds = WebPAC::Normalize::data_structure(
70 lookup => $lookup->lookup_hash,
71 row => $row,
72 rules => $normalize_pl_config,
73 marc_encoding => 'utf-8',
74 config => $config,
75 );
76
77 Options C<lookup>, C<row>, C<rules> and C<log> are mandatory while all
78 other are optional.
79
80 This function will B<die> if normalizastion can't be evaled.
81
82 Since this function isn't exported you have to call it with
83 C<WebPAC::Normalize::data_structure>.
84
85 =cut
86
87 sub data_structure {
88 my $arg = {@_};
89
90 die "need row argument" unless ($arg->{row});
91 die "need normalisation argument" unless ($arg->{rules});
92
93 no strict 'subs';
94 _set_lookup( $arg->{lookup} );
95 _set_rec( $arg->{row} );
96 _set_config( $arg->{config} );
97 _clean_ds( %{ $arg } );
98 eval "$arg->{rules}";
99 die "error evaling $arg->{rules}: $@\n" if ($@);
100
101 return _get_ds();
102 }
103
104 =head2 _set_rec
105
106 Set current record hash
107
108 _set_rec( $rec );
109
110 =cut
111
112 my $rec;
113
114 sub _set_rec {
115 $rec = shift or die "no record hash";
116 }
117
118 =head2 _set_config
119
120 Set current config hash
121
122 _set_config( $config );
123
124 Magic keys are:
125
126 =over 4
127
128 =item _
129
130 Code of current database
131
132 =item _mfn
133
134 Current MFN
135
136 =back
137
138 =cut
139
140 my $config;
141
142 sub _set_config {
143 $config = shift;
144 }
145
146 =head2 _get_ds
147
148 Return hash formatted as data structure
149
150 my $ds = _get_ds();
151
152 =cut
153
154 my ($out, $marc_record, $marc_encoding, $marc_repeatable_subfield, $marc_indicators);
155 my ($marc_record_offset, $marc_fetch_offset) = (0, 0);
156
157 sub _get_ds {
158 return $out;
159 }
160
161 =head2 _clean_ds
162
163 Clean data structure hash for next record
164
165 _clean_ds();
166
167 =cut
168
169 sub _clean_ds {
170 my $a = {@_};
171 ($out,$marc_record, $marc_encoding, $marc_repeatable_subfield, $marc_indicators) = ();
172 ($marc_record_offset, $marc_fetch_offset) = (0,0);
173 $marc_encoding = $a->{marc_encoding};
174 }
175
176 =head2 _set_lookup
177
178 Set current lookup hash
179
180 _set_lookup( $lookup );
181
182 =cut
183
184 my $lookup;
185
186 sub _set_lookup {
187 $lookup = shift;
188 }
189
190 =head2 _get_marc_fields
191
192 Get all fields defined by calls to C<marc>
193
194 $marc->add_fields( WebPAC::Normalize:_get_marc_fields() );
195
196 We are using I<magic> which detect repeatable fields only from
197 sequence of field/subfield data generated by normalization.
198
199 Repeatable field is created when there is second occurence of same subfield or
200 if any of indicators are different.
201
202 This is sane for most cases. Something like:
203
204 900a-1 900b-1 900c-1
205 900a-2 900b-2
206 900a-3
207
208 will be created from any combination of:
209
210 900a-1 900a-2 900a-3 900b-1 900b-2 900c-1
211
212 and following rules:
213
214 marc('900','a', rec('200','a') );
215 marc('900','b', rec('200','b') );
216 marc('900','c', rec('200','c') );
217
218 which might not be what you have in mind. If you need repeatable subfield,
219 define it using C<marc_repeatable_subfield> like this:
220
221 marc_repeatable_subfield('900','a');
222 marc('900','a', rec('200','a') );
223 marc('900','b', rec('200','b') );
224 marc('900','c', rec('200','c') );
225
226 will create:
227
228 900a-1 900a-2 900a-3 900b-1 900c-1
229 900b-2
230
231 There is also support for returning next or specific using:
232
233 while (my $mf = WebPAC::Normalize:_get_marc_fields( fetch_next => 1 ) ) {
234 # do something with $mf
235 }
236
237 will always return fields from next MARC record or
238
239 my $mf = WebPAC::Normalize::_get_marc_fields( offset => 42 );
240
241 will return 42th copy record (if it exists).
242
243 =cut
244
245 sub _get_marc_fields {
246
247 my $arg = {@_};
248 warn "### _get_marc_fields arg: ", dump($arg), $/ if ($debug > 2);
249 my $offset = $marc_fetch_offset;
250 if ($arg->{offset}) {
251 $offset = $arg->{offset};
252 } elsif($arg->{fetch_next}) {
253 $marc_fetch_offset++;
254 }
255
256 return if (! $marc_record || ref($marc_record) ne 'ARRAY');
257
258 warn "### full marc_record = ", dump( @{ $marc_record }), $/ if ($debug > 2);
259
260 my $marc_rec = $marc_record->[ $offset ];
261
262 warn "## _get_marc_fields (at offset: $offset) -- marc_record = ", dump( @$marc_rec ), $/ if ($debug > 1);
263
264 return if (! $marc_rec || ref($marc_rec) ne 'ARRAY' || $#{ $marc_rec } < 0);
265
266 # first, sort all existing fields
267 # XXX might not be needed, but modern perl might randomize elements in hash
268 my @sorted_marc_record = sort {
269 $a->[0] . ( $a->[3] || '' ) cmp $b->[0] . ( $b->[3] || '')
270 } @{ $marc_rec };
271
272 @sorted_marc_record = @{ $marc_rec }; ### FIXME disable sorting
273
274 # output marc fields
275 my @m;
276
277 # count unique field-subfields (used for offset when walking to next subfield)
278 my $u;
279 map { $u->{ $_->[0] . ( $_->[3] || '') }++ } @sorted_marc_record;
280
281 if ($debug) {
282 warn "## marc_repeatable_subfield = ", dump( $marc_repeatable_subfield ), $/ if ( $marc_repeatable_subfield );
283 warn "## marc_record[$offset] = ", dump( $marc_rec ), $/;
284 warn "## sorted_marc_record = ", dump( \@sorted_marc_record ), $/;
285 warn "## subfield count = ", dump( $u ), $/;
286 }
287
288 my $len = $#sorted_marc_record;
289 my $visited;
290 my $i = 0;
291 my $field;
292
293 foreach ( 0 .. $len ) {
294
295 # find next element which isn't visited
296 while ($visited->{$i}) {
297 $i = ($i + 1) % ($len + 1);
298 }
299
300 # mark it visited
301 $visited->{$i}++;
302
303 my $row = dclone( $sorted_marc_record[$i] );
304
305 # field and subfield which is key for
306 # marc_repeatable_subfield and u
307 my $fsf = $row->[0] . ( $row->[3] || '' );
308
309 if ($debug > 1) {
310
311 print "### field so far [", $#$field, "] : ", dump( $field ), " ", $field ? 'T' : 'F', $/;
312 print "### this [$i]: ", dump( $row ),$/;
313 print "### sf: ", $row->[3], " vs ", $field->[3],
314 $marc_repeatable_subfield->{ $row->[0] . $row->[3] } ? ' (repeatable)' : '', $/,
315 if ($#$field >= 0);
316
317 }
318
319 # if field exists
320 if ( $#$field >= 0 ) {
321 if (
322 $row->[0] ne $field->[0] || # field
323 $row->[1] ne $field->[1] || # i1
324 $row->[2] ne $field->[2] # i2
325 ) {
326 push @m, $field;
327 warn "## saved/1 ", dump( $field ),$/ if ($debug);
328 $field = $row;
329
330 } elsif (
331 ( $row->[3] lt $field->[-2] ) # subfield which is not next (e.g. a after c)
332 ||
333 ( $row->[3] eq $field->[-2] && # same subfield, but not repeatable
334 ! $marc_repeatable_subfield->{ $fsf }
335 )
336 ) {
337 push @m, $field;
338 warn "## saved/2 ", dump( $field ),$/ if ($debug);
339 $field = $row;
340
341 } else {
342 # append new subfields to existing field
343 push @$field, ( $row->[3], $row->[4] );
344 }
345 } else {
346 # insert first field
347 $field = $row;
348 }
349
350 if (! $marc_repeatable_subfield->{ $fsf }) {
351 # make step to next subfield
352 $i = ($i + $u->{ $fsf } ) % ($len + 1);
353 }
354 }
355
356 if ($#$field >= 0) {
357 push @m, $field;
358 warn "## saved/3 ", dump( $field ),$/ if ($debug);
359 }
360
361 return \@m;
362 }
363
364 =head2 _debug
365
366 Change level of debug warnings
367
368 _debug( 2 );
369
370 =cut
371
372 sub _debug {
373 my $l = shift;
374 return $debug unless defined($l);
375 warn "debug level $l",$/ if ($l > 0);
376 $debug = $l;
377 }
378
379 =head1 Functions to create C<data_structure>
380
381 Those functions generally have to first in your normalization file.
382
383 =head2 tag
384
385 Define new tag for I<search> and I<display>.
386
387 tag('Title', rec('200','a') );
388
389
390 =cut
391
392 sub tag {
393 my $name = shift or die "tag needs name as first argument";
394 my @o = grep { defined($_) && $_ ne '' } @_;
395 return unless (@o);
396 $out->{$name}->{tag} = $name;
397 $out->{$name}->{search} = \@o;
398 $out->{$name}->{display} = \@o;
399 }
400
401 =head2 display
402
403 Define tag just for I<display>
404
405 @v = display('Title', rec('200','a') );
406
407 =cut
408
409 sub display {
410 my $name = shift or die "display needs name as first argument";
411 my @o = grep { defined($_) && $_ ne '' } @_;
412 return unless (@o);
413 $out->{$name}->{tag} = $name;
414 $out->{$name}->{display} = \@o;
415 }
416
417 =head2 search
418
419 Prepare values just for I<search>
420
421 @v = search('Title', rec('200','a') );
422
423 =cut
424
425 sub search {
426 my $name = shift or die "search needs name as first argument";
427 my @o = grep { defined($_) && $_ ne '' } @_;
428 return unless (@o);
429 $out->{$name}->{tag} = $name;
430 $out->{$name}->{search} = \@o;
431 }
432
433 =head2 marc_leader
434
435 Setup fields within MARC leader or get leader
436
437 marc_leader('05','c');
438 my $leader = marc_leader();
439
440 =cut
441
442 sub marc_leader {
443 my ($offset,$value) = @_;
444
445 if ($offset) {
446 $out->{' leader'}->{ $offset } = $value;
447 } else {
448 return $out->{' leader'};
449 }
450 }
451
452 =head2 marc
453
454 Save value for MARC field
455
456 marc('900','a', rec('200','a') );
457 marc('001', rec('000') );
458
459 =cut
460
461 sub marc {
462 my $f = shift or die "marc needs field";
463 die "marc field must be numer" unless ($f =~ /^\d+$/);
464
465 my $sf;
466 if ($f >= 10) {
467 $sf = shift or die "marc needs subfield";
468 }
469
470 foreach (@_) {
471 my $v = $_; # make var read-write for Encode
472 next unless (defined($v) && $v !~ /^\s*$/);
473 from_to($v, 'iso-8859-2', $marc_encoding) if ($marc_encoding);
474 my ($i1,$i2) = defined($marc_indicators->{$f}) ? @{ $marc_indicators->{$f} } : (' ',' ');
475 if (defined $sf) {
476 push @{ $marc_record->[ $marc_record_offset ] }, [ $f, $i1, $i2, $sf => $v ];
477 } else {
478 push @{ $marc_record->[ $marc_record_offset ] }, [ $f, $v ];
479 }
480 }
481 }
482
483 =head2 marc_repeatable_subfield
484
485 Save values for MARC repetable subfield
486
487 marc_repeatable_subfield('910', 'z', rec('909') );
488
489 =cut
490
491 sub marc_repeatable_subfield {
492 my ($f,$sf) = @_;
493 die "marc_repeatable_subfield need field and subfield!\n" unless ($f && $sf);
494 $marc_repeatable_subfield->{ $f . $sf }++;
495 marc(@_);
496 }
497
498 =head2 marc_indicators
499
500 Set both indicators for MARC field
501
502 marc_indicators('900', ' ', 1);
503
504 Any indicator value other than C<0-9> will be treated as undefined.
505
506 =cut
507
508 sub marc_indicators {
509 my $f = shift || die "marc_indicators need field!\n";
510 my ($i1,$i2) = @_;
511 die "marc_indicators($f, ...) need i1!\n" unless(defined($i1));
512 die "marc_indicators($f, $i1, ...) need i2!\n" unless(defined($i2));
513
514 $i1 = ' ' if ($i1 !~ /^\d$/);
515 $i2 = ' ' if ($i2 !~ /^\d$/);
516 @{ $marc_indicators->{$f} } = ($i1,$i2);
517 }
518
519 =head2 marc_compose
520
521 Save values for each MARC subfield explicitly
522
523 marc_compose('900',
524 'a', rec('200','a')
525 'b', rec('201','a')
526 'a', rec('200','b')
527 'c', rec('200','c')
528 );
529
530 If you specify C<+> for subfield, value will be appended
531 to previous defined subfield.
532
533 =cut
534
535 sub marc_compose {
536 my $f = shift or die "marc_compose needs field";
537 die "marc_compose field must be numer" unless ($f =~ /^\d+$/);
538
539 my ($i1,$i2) = defined($marc_indicators->{$f}) ? @{ $marc_indicators->{$f} } : (' ',' ');
540 my $m = [ $f, $i1, $i2 ];
541
542 warn "### marc_compose input subfields = ", dump(@_),$/ if ($debug > 2);
543
544 if ($#_ % 2 != 1) {
545 die "ERROR: marc_compose",dump($f,@_)," not valid (must be even).\nDo you need to add first() or join() around some argument?\n";
546 }
547
548 while (@_) {
549 my $sf = shift;
550 my $v = shift;
551
552 next unless (defined($v) && $v !~ /^\s*$/);
553 from_to($v, 'iso-8859-2', $marc_encoding) if ($marc_encoding);
554 warn "## ++ marc_compose($f,$sf,$v) ", dump( $m ),$/ if ($debug > 1);
555 if ($sf ne '+') {
556 push @$m, ( $sf, $v );
557 } else {
558 $m->[ $#$m ] .= $v;
559 }
560 }
561
562 warn "## marc_compose current marc = ", dump( $m ),$/ if ($debug > 1);
563
564 push @{ $marc_record->[ $marc_record_offset ] }, $m if ($#{$m} > 2);
565 }
566
567 =head2 marc_duplicate
568
569 Generate copy of current MARC record and continue working on copy
570
571 marc_duplicate();
572
573 Copies can be accessed using C<< _get_marc_fields( fetch_next => 1 ) >> or
574 C<< _get_marc_fields( offset => 42 ) >>.
575
576 =cut
577
578 sub marc_duplicate {
579 my $m = $marc_record->[ -1 ];
580 die "can't duplicate record which isn't defined" unless ($m);
581 push @{ $marc_record }, dclone( $m );
582 warn "## marc_duplicate = ", dump(@$marc_record), $/ if ($debug > 1);
583 $marc_record_offset = $#{ $marc_record };
584 warn "## marc_record_offset = $marc_record_offset", $/ if ($debug > 1);
585 }
586
587 =head2 marc_remove
588
589 Remove some field or subfield from MARC record.
590
591 marc_remove('200');
592 marc_remove('200','a');
593
594 This will erase field C<200> or C<200^a> from current MARC record.
595
596 This is useful after calling C<marc_duplicate> or on it's own (but, you
597 should probably just remove that subfield definition if you are not
598 using C<marc_duplicate>).
599
600 FIXME: support fields < 10.
601
602 =cut
603
604 sub marc_remove {
605 my ($f, $sf) = @_;
606
607 die "marc_remove needs record number" unless defined($f);
608
609 my $marc = $marc_record->[ $marc_record_offset ];
610
611 warn "### marc_remove before = ", dump( $marc ), $/ if ($debug > 2);
612
613 my $i = 0;
614 foreach ( 0 .. $#{ $marc } ) {
615 last unless (defined $marc->[$i]);
616 warn "#### working on ",dump( @{ $marc->[$i] }), $/ if ($debug > 3);
617 if ($marc->[$i]->[0] eq $f) {
618 if (! defined $sf) {
619 # remove whole field
620 splice @$marc, $i, 1;
621 warn "#### slice \@\$marc, $i, 1 = ",dump( @{ $marc }), $/ if ($debug > 3);
622 $i--;
623 } else {
624 foreach my $j ( 0 .. (( $#{ $marc->[$i] } - 3 ) / 2) ) {
625 my $o = ($j * 2) + 3;
626 if ($marc->[$i]->[$o] eq $sf) {
627 # remove subfield
628 splice @{$marc->[$i]}, $o, 2;
629 warn "#### slice \@{\$marc->[$i]}, $o, 2 = ", dump( @{ $marc }), $/ if ($debug > 3);
630 # is record now empty?
631 if ($#{ $marc->[$i] } == 2) {
632 splice @$marc, $i, 1;
633 warn "#### slice \@\$marc, $i, 1 = ", dump( @{ $marc }), $/ if ($debug > 3);
634 $i--;
635 };
636 }
637 }
638 }
639 }
640 $i++;
641 }
642
643 warn "### marc_remove($f", $sf ? ",$sf" : "", ") after = ", dump( $marc ), $/ if ($debug > 2);
644
645 $marc_record->[ $marc_record_offset ] = $marc;
646
647 warn "## full marc_record = ", dump( @{ $marc_record }), $/ if ($debug > 1);
648 }
649
650 =head2 marc_original_order
651
652 Copy all subfields preserving original order to marc field.
653
654 marc_original_order( marc_field_number, original_input_field_number );
655
656 Please note that field numbers are consistent with other commands (marc
657 field number first), but somewhat counter-intuitive (destination and then
658 source).
659
660 You might want to use this command if you are just renaming subfields or
661 using pre-processing modify_record in C<config.yml> and don't need any
662 post-processing or want to preserve order of original subfields.
663
664
665 =cut
666
667 sub marc_original_order {
668
669 my ($to, $from) = @_;
670 die "marc_original_order needs from and to fields\n" unless ($from && $to);
671
672 return unless defined($rec->{$from});
673
674 my $r = $rec->{$from};
675 die "record field $from isn't array\n" unless (ref($r) eq 'ARRAY');
676
677 my ($i1,$i2) = defined($marc_indicators->{$to}) ? @{ $marc_indicators->{$to} } : (' ',' ');
678 warn "## marc_original_order($to,$from) source = ", dump( $r ),$/ if ($debug > 1);
679
680 foreach my $d (@$r) {
681
682 if (! defined($d->{subfields}) && ref($d->{subfields}) ne 'ARRAY') {
683 warn "# marc_original_order($to,$from): field $from doesn't have subfields specification\n";
684 next;
685 }
686
687 my @sfs = @{ $d->{subfields} };
688
689 die "field $from doesn't have even number of subfields specifications\n" unless($#sfs % 2 == 1);
690
691 warn "#--> d: ",dump($d), "\n#--> sfs: ",dump(@sfs),$/ if ($debug > 2);
692
693 my $m = [ $to, $i1, $i2 ];
694
695 while (my $sf = shift @sfs) {
696
697 warn "#--> sf: ",dump($sf), $/ if ($debug > 2);
698 my $offset = shift @sfs;
699 die "corrupted sufields specification for field $from\n" unless defined($offset);
700
701 my $v;
702 if (ref($d->{$sf}) eq 'ARRAY') {
703 $v = $d->{$sf}->[$offset] if (defined($d->{$sf}->[$offset]));
704 } elsif ($offset == 0) {
705 $v = $d->{$sf};
706 } else {
707 die "field $from subfield '$sf' need occurence $offset which doesn't exist", dump($d->{$sf});
708 }
709 push @$m, ( $sf, $v ) if (defined($v));
710 }
711
712 if ($#{$m} > 2) {
713 push @{ $marc_record->[ $marc_record_offset ] }, $m;
714 }
715 }
716
717 warn "## marc_record = ", dump( $marc_record ),$/ if ($debug > 1);
718 }
719
720
721 =head1 Functions to extract data from input
722
723 This function should be used inside functions to create C<data_structure> described
724 above.
725
726 =head2 _pack_subfields_hash
727
728 @values = _pack_subfields_hash( $h, $include_subfields )
729
730 =cut
731
732 sub _pack_subfields_hash {
733
734 warn "## _pack_subfields_hash( ",dump(@_), " )\n" if ($debug > 1);
735
736 my ($h,$include_subfields) = @_;
737
738
739 if ( defined($h->{subfields}) ) {
740 my $sfs = delete $h->{subfields} || die "no subfields?";
741 my @out;
742 while (@$sfs) {
743 my $sf = shift @$sfs;
744 push @out, '^' . $sf if ($include_subfields);
745 my $o = shift @$sfs;
746 if ($o == 0 && ref( $h->{$sf} ) ne 'ARRAY' ) {
747 # single element subfields are not arrays
748 push @out, $h->{$sf};
749 } else {
750 #warn "====> $f $sf $o $#$sfs ", dump( $sfs ), "\n";
751 push @out, $h->{$sf}->[$o];
752 }
753 }
754 return @out;
755 } else {
756 # FIXME this should probably be in alphabetical order instead of hash order
757 values %{$h};
758 }
759 }
760
761 =head2 rec1
762
763 Return all values in some field
764
765 @v = rec1('200')
766
767 TODO: order of values is probably same as in source data, need to investigate that
768
769 =cut
770
771 sub rec1 {
772 my $f = shift;
773 warn "rec1($f) = ", dump( $rec->{$f} ), $/ if ($debug > 1);
774 return unless (defined($rec) && defined($rec->{$f}));
775 warn "rec1($f) = ", dump( $rec->{$f} ), $/ if ($debug > 1);
776 if (ref($rec->{$f}) eq 'ARRAY') {
777 my @out;
778 foreach my $h ( @{ $rec->{$f} } ) {
779 if (ref($h) eq 'HASH') {
780 push @out, ( _pack_subfields_hash( $h ) );
781 } else {
782 push @out, $h;
783 }
784 }
785 return @out;
786 } elsif( defined($rec->{$f}) ) {
787 return $rec->{$f};
788 }
789 }
790
791 =head2 rec2
792
793 Return all values in specific field and subfield
794
795 @v = rec2('200','a')
796
797 =cut
798
799 sub rec2 {
800 my $f = shift;
801 return unless (defined($rec && $rec->{$f}));
802 my $sf = shift;
803 warn "rec2($f,$sf) = ", dump( $rec->{$f} ), $/ if ($debug > 1);
804 return map {
805 if (ref($_->{$sf}) eq 'ARRAY') {
806 @{ $_->{$sf} };
807 } else {
808 $_->{$sf};
809 }
810 } grep { ref($_) eq 'HASH' && $_->{$sf} } @{ $rec->{$f} };
811 }
812
813 =head2 rec
814
815 syntaxtic sugar for
816
817 @v = rec('200')
818 @v = rec('200','a')
819
820 =cut
821
822 sub rec {
823 my @out;
824 if ($#_ == 0) {
825 @out = rec1(@_);
826 } elsif ($#_ == 1) {
827 @out = rec2(@_);
828 }
829 if (@out) {
830 return @out;
831 } else {
832 return '';
833 }
834 }
835
836 =head2 regex
837
838 Apply regex to some or all values
839
840 @v = regex( 's/foo/bar/g', @v );
841
842 =cut
843
844 sub regex {
845 my $r = shift;
846 my @out;
847 #warn "r: $r\n", dump(\@_);
848 foreach my $t (@_) {
849 next unless ($t);
850 eval "\$t =~ $r";
851 push @out, $t if ($t && $t ne '');
852 }
853 return @out;
854 }
855
856 =head2 prefix
857
858 Prefix all values with a string
859
860 @v = prefix( 'my_', @v );
861
862 =cut
863
864 sub prefix {
865 my $p = shift or return;
866 return map { $p . $_ } grep { defined($_) } @_;
867 }
868
869 =head2 suffix
870
871 suffix all values with a string
872
873 @v = suffix( '_my', @v );
874
875 =cut
876
877 sub suffix {
878 my $s = shift or die "suffix needs string as first argument";
879 return map { $_ . $s } grep { defined($_) } @_;
880 }
881
882 =head2 surround
883
884 surround all values with a two strings
885
886 @v = surround( 'prefix_', '_suffix', @v );
887
888 =cut
889
890 sub surround {
891 my $p = shift or die "surround need prefix as first argument";
892 my $s = shift or die "surround needs suffix as second argument";
893 return map { $p . $_ . $s } grep { defined($_) } @_;
894 }
895
896 =head2 first
897
898 Return first element
899
900 $v = first( @v );
901
902 =cut
903
904 sub first {
905 my $r = shift;
906 return $r;
907 }
908
909 =head2 lookup
910
911 Consult lookup hashes for some value
912
913 @v = lookup( $v );
914 @v = lookup( @v );
915
916 =cut
917
918 sub lookup {
919 my $k = shift or return;
920 return unless (defined($lookup->{$k}));
921 if (ref($lookup->{$k}) eq 'ARRAY') {
922 return @{ $lookup->{$k} };
923 } else {
924 return $lookup->{$k};
925 }
926 }
927
928 =head2 config
929
930 Consult config values stored in C<config.yml>
931
932 # return database code (key under databases in yaml)
933 $database_code = config(); # use _ from hash
934 $database_name = config('name');
935 $database_input_name = config('input name');
936 $tag = config('input normalize tag');
937
938 Up to three levels are supported.
939
940 =cut
941
942 sub config {
943 return unless ($config);
944
945 my $p = shift;
946
947 $p ||= '';
948
949 my $v;
950
951 warn "### getting config($p)\n" if ($debug > 1);
952
953 my @p = split(/\s+/,$p);
954 if ($#p < 0) {
955 $v = $config->{ '_' }; # special, database code
956 } else {
957
958 my $c = dclone( $config );
959
960 foreach my $k (@p) {
961 warn "### k: $k c = ",dump($c),$/ if ($debug > 1);
962 if (ref($c) eq 'ARRAY') {
963 $c = shift @$c;
964 warn "config($p) taking first occurence of '$k', probably not what you wanted!\n";
965 last;
966 }
967
968 if (! defined($c->{$k}) ) {
969 $c = undef;
970 last;
971 } else {
972 $c = $c->{$k};
973 }
974 }
975 $v = $c if ($c);
976
977 }
978
979 warn "## config( '$p' ) = ",dump( $v ),$/ if ($v && $debug);
980 warn "config( '$p' ) is empty\n" if (! $v);
981
982 return $v;
983 }
984
985 =head2 id
986
987 Returns unique id of this record
988
989 $id = id();
990
991 Returns C<42/2> for 2nd occurence of MFN 42.
992
993 =cut
994
995 sub id {
996 my $mfn = $config->{_mfn} || die "no _mfn in config data";
997 return $mfn . $#{$marc_record} ? $#{$marc_record} + 1 : '';
998 }
999
1000 =head2 join_with
1001
1002 Joins walues with some delimiter
1003
1004 $v = join_with(", ", @v);
1005
1006 =cut
1007
1008 sub join_with {
1009 my $d = shift;
1010 warn "### join_with('$d',",dump(@_),")\n" if ($debug > 2);
1011 my $v = join($d, grep { defined($_) && $_ ne '' } @_);
1012 return '' unless defined($v);
1013 return $v;
1014 }
1015
1016 =head2 split_rec_on
1017
1018 Split record subfield on some regex and take one of parts out
1019
1020 $a_before_semi_column =
1021 split_rec_on('200','a', /\s*;\s*/, $part);
1022
1023 C<$part> is optional number of element. First element is
1024 B<1>, not 0!
1025
1026 If there is no C<$part> parameter or C<$part> is 0, this function will
1027 return all values produced by splitting.
1028
1029 =cut
1030
1031 sub split_rec_on {
1032 die "split_rec_on need (fld,sf,regex[,part]" if ($#_ < 2);
1033
1034 my ($fld, $sf, $regex, $part) = @_;
1035 warn "### regex ", ref($regex), $regex, $/ if ($debug > 2);
1036
1037 my @r = rec( $fld, $sf );
1038 my $v = shift @r;
1039 warn "### first rec($fld,$sf) = ",dump($v),$/ if ($debug > 2);
1040
1041 return '' if ( ! defined($v) || $v =~ /^\s*$/);
1042
1043 my @s = split( $regex, $v );
1044 warn "## split_rec_on($fld,$sf,$regex,$part) = ",dump(@s),$/ if ($debug > 1);
1045 if ($part && $part > 0) {
1046 return $s[ $part - 1 ];
1047 } else {
1048 return @s;
1049 }
1050 }
1051
1052 # END
1053 1;

  ViewVC Help
Powered by ViewVC 1.1.26