/[webpac2]/trunk/lib/WebPAC/Normalize.pm
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Annotation of /trunk/lib/WebPAC/Normalize.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 601 - (hide annotations)
Sun Jul 23 17:33:11 2006 UTC (16 years, 6 months ago) by dpavlin
File size: 19296 byte(s)
added _debug(2) output to rec2(...)

1 dpavlin 10 package WebPAC::Normalize;
2 dpavlin 536 use Exporter 'import';
3     @EXPORT = qw/
4 dpavlin 538 _set_rec _set_lookup
5     _get_ds _clean_ds
6 dpavlin 554 _debug
7 dpavlin 538
8 dpavlin 536 tag search display
9 dpavlin 547 marc marc_indicators marc_repeatable_subfield
10 dpavlin 564 marc_compose marc_leader
11 dpavlin 574 marc_duplicate marc_remove
12 dpavlin 540
13 dpavlin 536 rec1 rec2 rec
14     regex prefix suffix surround
15     first lookup join_with
16 dpavlin 562
17     split_rec_on
18 dpavlin 536 /;
19 dpavlin 10
20     use warnings;
21     use strict;
22 dpavlin 536
23     #use base qw/WebPAC::Common/;
24 dpavlin 550 use Data::Dump qw/dump/;
25 dpavlin 541 use Encode qw/from_to/;
26 dpavlin 574 use Storable qw/dclone/;
27 dpavlin 10
28 dpavlin 550 # debugging warn(s)
29     my $debug = 0;
30    
31    
32 dpavlin 10 =head1 NAME
33    
34 dpavlin 536 WebPAC::Normalize - describe normalisaton rules using sets
35 dpavlin 10
36     =head1 VERSION
37    
38 dpavlin 595 Version 0.14
39 dpavlin 10
40     =cut
41    
42 dpavlin 595 our $VERSION = '0.14';
43 dpavlin 10
44     =head1 SYNOPSIS
45    
46 dpavlin 536 This module uses C<conf/normalize/*.pl> files to perform normalisation
47     from input records using perl functions which are specialized for set
48     processing.
49 dpavlin 10
50 dpavlin 536 Sets are implemented as arrays, and normalisation file is valid perl, which
51     means that you check it's validity before running WebPAC using
52     C<perl -c normalize.pl>.
53 dpavlin 15
54 dpavlin 536 Normalisation can generate multiple output normalized data. For now, supported output
55 dpavlin 540 types (on the left side of definition) are: C<tag>, C<display>, C<search> and
56 dpavlin 547 C<marc>.
57 dpavlin 15
58 dpavlin 10 =head1 FUNCTIONS
59    
60 dpavlin 538 Functions which start with C<_> are private and used by WebPAC internally.
61     All other functions are available for use within normalisation rules.
62    
63 dpavlin 536 =head2 data_structure
64 dpavlin 10
65 dpavlin 536 Return data structure
66 dpavlin 13
67 dpavlin 538 my $ds = WebPAC::Normalize::data_structure(
68 dpavlin 536 lookup => $lookup->lookup_hash,
69     row => $row,
70     rules => $normalize_pl_config,
71 dpavlin 541 marc_encoding => 'utf-8',
72 dpavlin 595 config => $config,
73 dpavlin 13 );
74    
75 dpavlin 540 Options C<lookup>, C<row>, C<rules> and C<log> are mandatory while all
76     other are optional.
77    
78 dpavlin 536 This function will B<die> if normalizastion can't be evaled.
79 dpavlin 15
80 dpavlin 538 Since this function isn't exported you have to call it with
81     C<WebPAC::Normalize::data_structure>.
82    
83 dpavlin 10 =cut
84    
85 dpavlin 536 sub data_structure {
86     my $arg = {@_};
87 dpavlin 13
88 dpavlin 536 die "need row argument" unless ($arg->{row});
89     die "need normalisation argument" unless ($arg->{rules});
90 dpavlin 31
91 dpavlin 536 no strict 'subs';
92 dpavlin 538 _set_lookup( $arg->{lookup} );
93     _set_rec( $arg->{row} );
94 dpavlin 595 _set_config( $arg->{config} );
95 dpavlin 541 _clean_ds( %{ $arg } );
96 dpavlin 536 eval "$arg->{rules}";
97     die "error evaling $arg->{rules}: $@\n" if ($@);
98 dpavlin 540
99 dpavlin 538 return _get_ds();
100 dpavlin 10 }
101    
102 dpavlin 538 =head2 _set_rec
103 dpavlin 13
104 dpavlin 536 Set current record hash
105 dpavlin 433
106 dpavlin 538 _set_rec( $rec );
107 dpavlin 433
108     =cut
109    
110 dpavlin 536 my $rec;
111 dpavlin 433
112 dpavlin 538 sub _set_rec {
113 dpavlin 536 $rec = shift or die "no record hash";
114 dpavlin 433 }
115    
116 dpavlin 595 =head2 _set_config
117    
118     Set current config hash
119    
120     _set_config( $config );
121    
122     Magic keys are:
123    
124     =over 4
125    
126     =item _
127    
128     Code of current database
129    
130     =item _mfn
131    
132     Current MFN
133    
134     =back
135    
136     =cut
137    
138     my $config;
139    
140     sub _set_config {
141     $config = shift;
142     }
143    
144 dpavlin 538 =head2 _get_ds
145    
146     Return hash formatted as data structure
147    
148     my $ds = _get_ds();
149    
150     =cut
151    
152 dpavlin 574 my ($out, $marc_record, $marc_encoding, $marc_repeatable_subfield, $marc_indicators);
153     my ($marc_record_offset, $marc_fetch_offset) = (0, 0);
154 dpavlin 538
155     sub _get_ds {
156     return $out;
157     }
158    
159     =head2 _clean_ds
160    
161     Clean data structure hash for next record
162    
163     _clean_ds();
164    
165     =cut
166    
167     sub _clean_ds {
168 dpavlin 541 my $a = {@_};
169 dpavlin 550 ($out,$marc_record, $marc_encoding, $marc_repeatable_subfield, $marc_indicators) = ();
170 dpavlin 574 ($marc_record_offset, $marc_fetch_offset) = (0,0);
171 dpavlin 541 $marc_encoding = $a->{marc_encoding};
172 dpavlin 538 }
173    
174     =head2 _set_lookup
175    
176     Set current lookup hash
177    
178     _set_lookup( $lookup );
179    
180     =cut
181    
182     my $lookup;
183    
184     sub _set_lookup {
185     $lookup = shift;
186     }
187    
188 dpavlin 547 =head2 _get_marc_fields
189 dpavlin 540
190 dpavlin 547 Get all fields defined by calls to C<marc>
191 dpavlin 540
192 dpavlin 547 $marc->add_fields( WebPAC::Normalize:_get_marc_fields() );
193 dpavlin 540
194 dpavlin 543 We are using I<magic> which detect repeatable fields only from
195     sequence of field/subfield data generated by normalization.
196    
197 dpavlin 554 Repeatable field is created when there is second occurence of same subfield or
198     if any of indicators are different.
199 dpavlin 543
200 dpavlin 554 This is sane for most cases. Something like:
201 dpavlin 543
202 dpavlin 554 900a-1 900b-1 900c-1
203     900a-2 900b-2
204     900a-3
205    
206     will be created from any combination of:
207    
208     900a-1 900a-2 900a-3 900b-1 900b-2 900c-1
209    
210     and following rules:
211    
212     marc('900','a', rec('200','a') );
213     marc('900','b', rec('200','b') );
214     marc('900','c', rec('200','c') );
215    
216     which might not be what you have in mind. If you need repeatable subfield,
217     define it using C<marc_repeatable_subfield> like this:
218    
219 dpavlin 574 marc_repeatable_subfield('900','a');
220     marc('900','a', rec('200','a') );
221     marc('900','b', rec('200','b') );
222     marc('900','c', rec('200','c') );
223 dpavlin 554
224 dpavlin 574 will create:
225    
226     900a-1 900a-2 900a-3 900b-1 900c-1
227     900b-2
228    
229     There is also support for returning next or specific using:
230    
231     while (my $mf = WebPAC::Normalize:_get_marc_fields( fetch_next => 1 ) ) {
232     # do something with $mf
233     }
234    
235     will always return fields from next MARC record or
236    
237     my $mf = WebPAC::Normalize::_get_marc_fields( offset => 42 );
238    
239     will return 42th copy record (if it exists).
240    
241 dpavlin 540 =cut
242    
243 dpavlin 547 sub _get_marc_fields {
244 dpavlin 550
245 dpavlin 574 my $arg = {@_};
246     warn "### _get_marc_fields arg: ", dump($arg), $/ if ($debug > 2);
247     my $offset = $marc_fetch_offset;
248     if ($arg->{offset}) {
249     $offset = $arg->{offset};
250     } elsif($arg->{fetch_next}) {
251     $marc_fetch_offset++;
252     }
253 dpavlin 550
254 dpavlin 574 return if (! $marc_record || ref($marc_record) ne 'ARRAY');
255    
256     warn "### full marc_record = ", dump( @{ $marc_record }), $/ if ($debug > 2);
257    
258     my $marc_rec = $marc_record->[ $offset ];
259    
260     warn "## _get_marc_fields (at offset: $offset) -- marc_record = ", dump( @$marc_rec ), $/ if ($debug > 1);
261    
262     return if (! $marc_rec || ref($marc_rec) ne 'ARRAY' || $#{ $marc_rec } < 0);
263    
264 dpavlin 550 # first, sort all existing fields
265     # XXX might not be needed, but modern perl might randomize elements in hash
266     my @sorted_marc_record = sort {
267 dpavlin 572 $a->[0] . ( $a->[3] || '' ) cmp $b->[0] . ( $b->[3] || '')
268 dpavlin 574 } @{ $marc_rec };
269 dpavlin 550
270 dpavlin 574 @sorted_marc_record = @{ $marc_rec }; ### FIXME disable sorting
271 dpavlin 562
272 dpavlin 550 # output marc fields
273 dpavlin 542 my @m;
274 dpavlin 550
275     # count unique field-subfields (used for offset when walking to next subfield)
276     my $u;
277 dpavlin 572 map { $u->{ $_->[0] . ( $_->[3] || '') }++ } @sorted_marc_record;
278 dpavlin 550
279     if ($debug) {
280 dpavlin 574 warn "## marc_repeatable_subfield = ", dump( $marc_repeatable_subfield ), $/ if ( $marc_repeatable_subfield );
281     warn "## marc_record[$offset] = ", dump( $marc_rec ), $/;
282     warn "## sorted_marc_record = ", dump( \@sorted_marc_record ), $/;
283     warn "## subfield count = ", dump( $u ), $/;
284 dpavlin 550 }
285    
286     my $len = $#sorted_marc_record;
287     my $visited;
288     my $i = 0;
289     my $field;
290    
291     foreach ( 0 .. $len ) {
292    
293     # find next element which isn't visited
294     while ($visited->{$i}) {
295     $i = ($i + 1) % ($len + 1);
296 dpavlin 542 }
297    
298 dpavlin 550 # mark it visited
299     $visited->{$i}++;
300    
301 dpavlin 574 my $row = dclone( $sorted_marc_record[$i] );
302 dpavlin 550
303     # field and subfield which is key for
304     # marc_repeatable_subfield and u
305 dpavlin 572 my $fsf = $row->[0] . ( $row->[3] || '' );
306 dpavlin 550
307     if ($debug > 1) {
308    
309     print "### field so far [", $#$field, "] : ", dump( $field ), " ", $field ? 'T' : 'F', $/;
310     print "### this [$i]: ", dump( $row ),$/;
311     print "### sf: ", $row->[3], " vs ", $field->[3],
312     $marc_repeatable_subfield->{ $row->[0] . $row->[3] } ? ' (repeatable)' : '', $/,
313     if ($#$field >= 0);
314    
315     }
316    
317     # if field exists
318     if ( $#$field >= 0 ) {
319     if (
320     $row->[0] ne $field->[0] || # field
321     $row->[1] ne $field->[1] || # i1
322     $row->[2] ne $field->[2] # i2
323     ) {
324     push @m, $field;
325     warn "## saved/1 ", dump( $field ),$/ if ($debug);
326     $field = $row;
327    
328     } elsif (
329     ( $row->[3] lt $field->[-2] ) # subfield which is not next (e.g. a after c)
330     ||
331     ( $row->[3] eq $field->[-2] && # same subfield, but not repeatable
332     ! $marc_repeatable_subfield->{ $fsf }
333     )
334     ) {
335     push @m, $field;
336     warn "## saved/2 ", dump( $field ),$/ if ($debug);
337     $field = $row;
338    
339     } else {
340     # append new subfields to existing field
341     push @$field, ( $row->[3], $row->[4] );
342     }
343     } else {
344     # insert first field
345     $field = $row;
346     }
347    
348     if (! $marc_repeatable_subfield->{ $fsf }) {
349     # make step to next subfield
350     $i = ($i + $u->{ $fsf } ) % ($len + 1);
351     }
352 dpavlin 542 }
353    
354 dpavlin 550 if ($#$field >= 0) {
355     push @m, $field;
356     warn "## saved/3 ", dump( $field ),$/ if ($debug);
357     }
358 dpavlin 542
359 dpavlin 579 return \@m;
360 dpavlin 540 }
361    
362 dpavlin 554 =head2 _debug
363    
364     Change level of debug warnings
365    
366     _debug( 2 );
367    
368     =cut
369    
370     sub _debug {
371     my $l = shift;
372     return $debug unless defined($l);
373 dpavlin 568 warn "debug level $l",$/ if ($l > 0);
374 dpavlin 554 $debug = $l;
375     }
376    
377 dpavlin 540 =head1 Functions to create C<data_structure>
378    
379     Those functions generally have to first in your normalization file.
380    
381 dpavlin 536 =head2 tag
382 dpavlin 433
383 dpavlin 536 Define new tag for I<search> and I<display>.
384 dpavlin 433
385 dpavlin 536 tag('Title', rec('200','a') );
386 dpavlin 13
387    
388     =cut
389    
390 dpavlin 536 sub tag {
391     my $name = shift or die "tag needs name as first argument";
392     my @o = grep { defined($_) && $_ ne '' } @_;
393     return unless (@o);
394     $out->{$name}->{tag} = $name;
395     $out->{$name}->{search} = \@o;
396     $out->{$name}->{display} = \@o;
397     }
398 dpavlin 13
399 dpavlin 536 =head2 display
400 dpavlin 13
401 dpavlin 536 Define tag just for I<display>
402 dpavlin 125
403 dpavlin 536 @v = display('Title', rec('200','a') );
404 dpavlin 125
405 dpavlin 536 =cut
406 dpavlin 125
407 dpavlin 536 sub display {
408     my $name = shift or die "display needs name as first argument";
409     my @o = grep { defined($_) && $_ ne '' } @_;
410     return unless (@o);
411     $out->{$name}->{tag} = $name;
412     $out->{$name}->{display} = \@o;
413     }
414 dpavlin 13
415 dpavlin 536 =head2 search
416 dpavlin 13
417 dpavlin 536 Prepare values just for I<search>
418 dpavlin 13
419 dpavlin 536 @v = search('Title', rec('200','a') );
420 dpavlin 433
421 dpavlin 536 =cut
422 dpavlin 13
423 dpavlin 536 sub search {
424     my $name = shift or die "search needs name as first argument";
425     my @o = grep { defined($_) && $_ ne '' } @_;
426     return unless (@o);
427     $out->{$name}->{tag} = $name;
428     $out->{$name}->{search} = \@o;
429 dpavlin 13 }
430    
431 dpavlin 564 =head2 marc_leader
432    
433     Setup fields within MARC leader or get leader
434    
435     marc_leader('05','c');
436     my $leader = marc_leader();
437    
438     =cut
439    
440     sub marc_leader {
441     my ($offset,$value) = @_;
442    
443     if ($offset) {
444     $out->{' leader'}->{ $offset } = $value;
445     } else {
446     return $out->{' leader'};
447     }
448     }
449    
450 dpavlin 547 =head2 marc
451 dpavlin 540
452     Save value for MARC field
453    
454 dpavlin 547 marc('900','a', rec('200','a') );
455 dpavlin 571 marc('001', rec('000') );
456 dpavlin 540
457     =cut
458    
459 dpavlin 547 sub marc {
460     my $f = shift or die "marc needs field";
461     die "marc field must be numer" unless ($f =~ /^\d+$/);
462 dpavlin 540
463 dpavlin 571 my $sf;
464     if ($f >= 10) {
465     $sf = shift or die "marc needs subfield";
466     }
467 dpavlin 540
468 dpavlin 541 foreach (@_) {
469     my $v = $_; # make var read-write for Encode
470 dpavlin 543 next unless (defined($v) && $v !~ /^\s*$/);
471 dpavlin 541 from_to($v, 'iso-8859-2', $marc_encoding) if ($marc_encoding);
472 dpavlin 548 my ($i1,$i2) = defined($marc_indicators->{$f}) ? @{ $marc_indicators->{$f} } : (' ',' ');
473 dpavlin 571 if (defined $sf) {
474 dpavlin 574 push @{ $marc_record->[ $marc_record_offset ] }, [ $f, $i1, $i2, $sf => $v ];
475 dpavlin 571 } else {
476 dpavlin 574 push @{ $marc_record->[ $marc_record_offset ] }, [ $f, $v ];
477 dpavlin 571 }
478 dpavlin 540 }
479     }
480    
481 dpavlin 547 =head2 marc_repeatable_subfield
482    
483     Save values for MARC repetable subfield
484    
485     marc_repeatable_subfield('910', 'z', rec('909') );
486    
487     =cut
488    
489     sub marc_repeatable_subfield {
490 dpavlin 550 my ($f,$sf) = @_;
491     die "marc_repeatable_subfield need field and subfield!\n" unless ($f && $sf);
492     $marc_repeatable_subfield->{ $f . $sf }++;
493 dpavlin 547 marc(@_);
494     }
495    
496     =head2 marc_indicators
497    
498     Set both indicators for MARC field
499    
500     marc_indicators('900', ' ', 1);
501    
502     Any indicator value other than C<0-9> will be treated as undefined.
503    
504     =cut
505    
506     sub marc_indicators {
507     my $f = shift || die "marc_indicators need field!\n";
508     my ($i1,$i2) = @_;
509     die "marc_indicators($f, ...) need i1!\n" unless(defined($i1));
510     die "marc_indicators($f, $i1, ...) need i2!\n" unless(defined($i2));
511    
512     $i1 = ' ' if ($i1 !~ /^\d$/);
513     $i2 = ' ' if ($i2 !~ /^\d$/);
514 dpavlin 548 @{ $marc_indicators->{$f} } = ($i1,$i2);
515 dpavlin 547 }
516    
517 dpavlin 562 =head2 marc_compose
518 dpavlin 547
519 dpavlin 562 Save values for each MARC subfield explicitly
520    
521     marc_compose('900',
522     'a', rec('200','a')
523     'b', rec('201','a')
524     'a', rec('200','b')
525     'c', rec('200','c')
526     );
527    
528     =cut
529    
530     sub marc_compose {
531     my $f = shift or die "marc_compose needs field";
532     die "marc_compose field must be numer" unless ($f =~ /^\d+$/);
533    
534     my ($i1,$i2) = defined($marc_indicators->{$f}) ? @{ $marc_indicators->{$f} } : (' ',' ');
535     my $m = [ $f, $i1, $i2 ];
536    
537 dpavlin 583 warn "### marc_compose input subfields = ", dump(@_),$/ if ($debug > 2);
538    
539 dpavlin 562 while (@_) {
540     my $sf = shift or die "marc_compose $f needs subfield";
541 dpavlin 565 my $v = shift;
542 dpavlin 562
543     next unless (defined($v) && $v !~ /^\s*$/);
544     from_to($v, 'iso-8859-2', $marc_encoding) if ($marc_encoding);
545     push @$m, ( $sf, $v );
546 dpavlin 568 warn "## ++ marc_compose($f,$sf,$v) ", dump( $m ),$/ if ($debug > 1);
547 dpavlin 562 }
548    
549 dpavlin 586 warn "## marc_compose current marc = ", dump( $m ),$/ if ($debug > 1);
550 dpavlin 562
551 dpavlin 574 push @{ $marc_record->[ $marc_record_offset ] }, $m if ($#{$m} > 2);
552 dpavlin 562 }
553    
554 dpavlin 574 =head2 marc_duplicate
555 dpavlin 562
556 dpavlin 574 Generate copy of current MARC record and continue working on copy
557    
558     marc_duplicate();
559    
560     Copies can be accessed using C<< _get_marc_fields( fetch_next => 1 ) >> or
561     C<< _get_marc_fields( offset => 42 ) >>.
562    
563     =cut
564    
565     sub marc_duplicate {
566     my $m = $marc_record->[ -1 ];
567     die "can't duplicate record which isn't defined" unless ($m);
568     push @{ $marc_record }, dclone( $m );
569     warn "## marc_duplicate = ", dump(@$marc_record), $/ if ($debug > 1);
570     $marc_record_offset = $#{ $marc_record };
571     warn "## marc_record_offset = $marc_record_offset", $/ if ($debug > 1);
572     }
573    
574     =head2 marc_remove
575    
576     Remove some field or subfield from MARC record.
577    
578     marc_remove('200');
579     marc_remove('200','a');
580    
581     This will erase field C<200> or C<200^a> from current MARC record.
582    
583     This is useful after calling C<marc_duplicate> or on it's own (but, you
584     should probably just remove that subfield definition if you are not
585     using C<marc_duplicate>).
586    
587     FIXME: support fields < 10.
588    
589     =cut
590    
591     sub marc_remove {
592     my ($f, $sf) = @_;
593    
594     die "marc_remove needs record number" unless defined($f);
595    
596     my $marc = $marc_record->[ $marc_record_offset ];
597    
598     warn "### marc_remove before = ", dump( $marc ), $/ if ($debug > 2);
599    
600 dpavlin 579 my $i = 0;
601     foreach ( 0 .. $#{ $marc } ) {
602 dpavlin 574 last unless (defined $marc->[$i]);
603     warn "#### working on ",dump( @{ $marc->[$i] }), $/ if ($debug > 3);
604     if ($marc->[$i]->[0] eq $f) {
605     if (! defined $sf) {
606     # remove whole field
607     splice @$marc, $i, 1;
608     warn "#### slice \@\$marc, $i, 1 = ",dump( @{ $marc }), $/ if ($debug > 3);
609     $i--;
610     } else {
611     foreach my $j ( 0 .. (( $#{ $marc->[$i] } - 3 ) / 2) ) {
612     my $o = ($j * 2) + 3;
613     if ($marc->[$i]->[$o] eq $sf) {
614     # remove subfield
615     splice @{$marc->[$i]}, $o, 2;
616     warn "#### slice \@{\$marc->[$i]}, $o, 2 = ", dump( @{ $marc }), $/ if ($debug > 3);
617     # is record now empty?
618     if ($#{ $marc->[$i] } == 2) {
619     splice @$marc, $i, 1;
620     warn "#### slice \@\$marc, $i, 1 = ", dump( @{ $marc }), $/ if ($debug > 3);
621     $i--;
622     };
623     }
624     }
625     }
626     }
627 dpavlin 579 $i++;
628 dpavlin 574 }
629    
630     warn "### marc_remove($f", $sf ? ",$sf" : "", ") after = ", dump( $marc ), $/ if ($debug > 2);
631    
632     $marc_record->[ $marc_record_offset ] = $marc;
633    
634     warn "## full marc_record = ", dump( @{ $marc_record }), $/ if ($debug > 1);
635     }
636    
637 dpavlin 540 =head1 Functions to extract data from input
638    
639     This function should be used inside functions to create C<data_structure> described
640     above.
641    
642 dpavlin 536 =head2 rec1
643 dpavlin 371
644 dpavlin 536 Return all values in some field
645 dpavlin 371
646 dpavlin 536 @v = rec1('200')
647 dpavlin 15
648 dpavlin 536 TODO: order of values is probably same as in source data, need to investigate that
649 dpavlin 15
650 dpavlin 536 =cut
651 dpavlin 15
652 dpavlin 536 sub rec1 {
653     my $f = shift;
654 dpavlin 571 warn "rec1($f) = ", dump( $rec->{$f} ), $/ if ($debug > 1);
655 dpavlin 536 return unless (defined($rec) && defined($rec->{$f}));
656 dpavlin 571 warn "rec1($f) = ", dump( $rec->{$f} ), $/ if ($debug > 1);
657 dpavlin 536 if (ref($rec->{$f}) eq 'ARRAY') {
658     return map {
659     if (ref($_) eq 'HASH') {
660     values %{$_};
661 dpavlin 31 } else {
662 dpavlin 536 $_;
663 dpavlin 31 }
664 dpavlin 536 } @{ $rec->{$f} };
665     } elsif( defined($rec->{$f}) ) {
666     return $rec->{$f};
667 dpavlin 15 }
668     }
669    
670 dpavlin 536 =head2 rec2
671 dpavlin 15
672 dpavlin 536 Return all values in specific field and subfield
673 dpavlin 13
674 dpavlin 536 @v = rec2('200','a')
675 dpavlin 13
676     =cut
677    
678 dpavlin 536 sub rec2 {
679     my $f = shift;
680     return unless (defined($rec && $rec->{$f}));
681     my $sf = shift;
682 dpavlin 601 warn "rec2($f,$sf) = ", dump( $rec->{$f} ), $/ if ($debug > 1);
683 dpavlin 589 return map {
684     if (ref($_->{$sf}) eq 'ARRAY') {
685     @{ $_->{$sf} };
686     } else {
687     $_->{$sf};
688     }
689     } grep { ref($_) eq 'HASH' && $_->{$sf} } @{ $rec->{$f} };
690 dpavlin 536 }
691 dpavlin 13
692 dpavlin 536 =head2 rec
693 dpavlin 13
694 dpavlin 536 syntaxtic sugar for
695 dpavlin 13
696 dpavlin 536 @v = rec('200')
697     @v = rec('200','a')
698 dpavlin 13
699 dpavlin 536 =cut
700 dpavlin 373
701 dpavlin 536 sub rec {
702 dpavlin 583 my @out;
703 dpavlin 536 if ($#_ == 0) {
704 dpavlin 583 @out = rec1(@_);
705 dpavlin 536 } elsif ($#_ == 1) {
706 dpavlin 583 @out = rec2(@_);
707 dpavlin 13 }
708 dpavlin 583 if (@out) {
709     return @out;
710     } else {
711     return '';
712     }
713 dpavlin 13 }
714    
715 dpavlin 536 =head2 regex
716 dpavlin 15
717 dpavlin 536 Apply regex to some or all values
718 dpavlin 15
719 dpavlin 536 @v = regex( 's/foo/bar/g', @v );
720 dpavlin 15
721     =cut
722    
723 dpavlin 536 sub regex {
724     my $r = shift;
725     my @out;
726 dpavlin 550 #warn "r: $r\n", dump(\@_);
727 dpavlin 536 foreach my $t (@_) {
728     next unless ($t);
729     eval "\$t =~ $r";
730     push @out, $t if ($t && $t ne '');
731 dpavlin 368 }
732 dpavlin 536 return @out;
733 dpavlin 15 }
734    
735 dpavlin 536 =head2 prefix
736 dpavlin 15
737 dpavlin 536 Prefix all values with a string
738 dpavlin 15
739 dpavlin 536 @v = prefix( 'my_', @v );
740 dpavlin 15
741     =cut
742    
743 dpavlin 536 sub prefix {
744 dpavlin 592 my $p = shift or return;
745 dpavlin 536 return map { $p . $_ } grep { defined($_) } @_;
746     }
747 dpavlin 15
748 dpavlin 536 =head2 suffix
749 dpavlin 15
750 dpavlin 536 suffix all values with a string
751 dpavlin 15
752 dpavlin 536 @v = suffix( '_my', @v );
753 dpavlin 15
754 dpavlin 536 =cut
755 dpavlin 15
756 dpavlin 536 sub suffix {
757     my $s = shift or die "suffix needs string as first argument";
758     return map { $_ . $s } grep { defined($_) } @_;
759 dpavlin 15 }
760    
761 dpavlin 536 =head2 surround
762 dpavlin 13
763 dpavlin 536 surround all values with a two strings
764 dpavlin 13
765 dpavlin 536 @v = surround( 'prefix_', '_suffix', @v );
766 dpavlin 13
767     =cut
768    
769 dpavlin 536 sub surround {
770     my $p = shift or die "surround need prefix as first argument";
771     my $s = shift or die "surround needs suffix as second argument";
772     return map { $p . $_ . $s } grep { defined($_) } @_;
773 dpavlin 13 }
774    
775 dpavlin 536 =head2 first
776 dpavlin 13
777 dpavlin 536 Return first element
778 dpavlin 15
779 dpavlin 536 $v = first( @v );
780 dpavlin 13
781     =cut
782    
783 dpavlin 536 sub first {
784     my $r = shift;
785     return $r;
786 dpavlin 13 }
787    
788 dpavlin 536 =head2 lookup
789 dpavlin 13
790 dpavlin 536 Consult lookup hashes for some value
791 dpavlin 13
792 dpavlin 536 @v = lookup( $v );
793     @v = lookup( @v );
794 dpavlin 13
795     =cut
796    
797 dpavlin 536 sub lookup {
798     my $k = shift or return;
799     return unless (defined($lookup->{$k}));
800     if (ref($lookup->{$k}) eq 'ARRAY') {
801     return @{ $lookup->{$k} };
802     } else {
803     return $lookup->{$k};
804     }
805 dpavlin 13 }
806    
807 dpavlin 595 =head2 config
808    
809     Consult config values stored in C<config.yml>
810    
811     # return database code (key under databases in yaml)
812     $database_code = config(); # use _ from hash
813     $database_name = config('name');
814     $database_input_name = config('input name');
815     $tag = config('input normalize tag');
816    
817     Up to three levels are supported.
818    
819     =cut
820    
821     sub config {
822     return unless ($config);
823    
824     my $p = shift;
825    
826     $p ||= '';
827    
828     my $v;
829    
830     warn "### getting config($p)\n" if ($debug > 1);
831    
832     my @p = split(/\s+/,$p);
833     if ($#p < 0) {
834     $v = $config->{ '_' }; # special, database code
835     } else {
836    
837     my $c = dclone( $config );
838    
839     foreach my $k (@p) {
840     warn "### k: $k c = ",dump($c),$/ if ($debug > 1);
841     if (ref($c) eq 'ARRAY') {
842     $c = shift @$c;
843     warn "config($p) taking first occurence of '$k', probably not what you wanted!\n";
844     last;
845     }
846    
847     if (! defined($c->{$k}) ) {
848     $c = undef;
849     last;
850     } else {
851     $c = $c->{$k};
852     }
853     }
854     $v = $c if ($c);
855    
856     }
857    
858     warn "## config( '$p' ) = ",dump( $v ),$/ if ($v && $debug);
859     warn "config( '$p' ) is empty\n" if (! $v);
860    
861     return $v;
862     }
863    
864     =head2 id
865    
866     Returns unique id of this record
867    
868     $id = id();
869    
870     Returns C<42/2> for 2nd occurence of MFN 42.
871    
872     =cut
873    
874     sub id {
875     my $mfn = $config->{_mfn} || die "no _mfn in config data";
876     return $mfn . $#{$marc_record} ? $#{$marc_record} + 1 : '';
877     }
878    
879 dpavlin 536 =head2 join_with
880 dpavlin 13
881 dpavlin 536 Joins walues with some delimiter
882 dpavlin 10
883 dpavlin 536 $v = join_with(", ", @v);
884 dpavlin 10
885 dpavlin 536 =cut
886 dpavlin 10
887 dpavlin 536 sub join_with {
888     my $d = shift;
889 dpavlin 586 warn "### join_with('$d',",dump(@_),")\n" if ($debug > 2);
890 dpavlin 583 my $v = join($d, grep { defined($_) && $_ ne '' } @_);
891     return '' unless defined($v);
892     return $v;
893 dpavlin 536 }
894 dpavlin 10
895 dpavlin 562 =head2 split_rec_on
896    
897     Split record subfield on some regex and take one of parts out
898    
899     $a_before_semi_column =
900     split_rec_on('200','a', /\s*;\s*/, $part);
901    
902     C<$part> is optional number of element. First element is
903     B<1>, not 0!
904    
905     If there is no C<$part> parameter or C<$part> is 0, this function will
906     return all values produced by splitting.
907    
908     =cut
909    
910     sub split_rec_on {
911     die "split_rec_on need (fld,sf,regex[,part]" if ($#_ < 2);
912    
913     my ($fld, $sf, $regex, $part) = @_;
914 dpavlin 568 warn "### regex ", ref($regex), $regex, $/ if ($debug > 2);
915 dpavlin 562
916     my @r = rec( $fld, $sf );
917     my $v = shift @r;
918 dpavlin 568 warn "### first rec($fld,$sf) = ",dump($v),$/ if ($debug > 2);
919 dpavlin 562
920 dpavlin 566 return '' if( ! defined($v) || $v =~ /^\s*$/);
921    
922 dpavlin 562 my @s = split( $regex, $v );
923 dpavlin 568 warn "## split_rec_on($fld,$sf,$regex,$part) = ",dump(@s),$/ if ($debug > 1);
924 dpavlin 566 if ($part && $part > 0) {
925 dpavlin 562 return $s[ $part - 1 ];
926     } else {
927 dpavlin 571 return @s;
928 dpavlin 562 }
929     }
930    
931 dpavlin 536 # END
932     1;

  ViewVC Help
Powered by ViewVC 1.1.26