/[webpac2]/trunk/lib/WebPAC/Normalize.pm
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Annotation of /trunk/lib/WebPAC/Normalize.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 669 - (hide annotations)
Mon Sep 11 14:29:01 2006 UTC (17 years, 7 months ago) by dpavlin
File size: 22988 byte(s)
 r937@llin:  dpavlin | 2006-09-11 16:26:07 +0200
 changed _pack_subfields_hash usage and document it

1 dpavlin 10 package WebPAC::Normalize;
2 dpavlin 536 use Exporter 'import';
3     @EXPORT = qw/
4 dpavlin 538 _set_rec _set_lookup
5     _get_ds _clean_ds
6 dpavlin 554 _debug
7 dpavlin 641 _pack_subfields_hash
8 dpavlin 538
9 dpavlin 536 tag search display
10 dpavlin 547 marc marc_indicators marc_repeatable_subfield
11 dpavlin 564 marc_compose marc_leader
12 dpavlin 574 marc_duplicate marc_remove
13 dpavlin 604 marc_original_order
14 dpavlin 540
15 dpavlin 536 rec1 rec2 rec
16     regex prefix suffix surround
17     first lookup join_with
18 dpavlin 562
19     split_rec_on
20 dpavlin 536 /;
21 dpavlin 10
22     use warnings;
23     use strict;
24 dpavlin 536
25     #use base qw/WebPAC::Common/;
26 dpavlin 550 use Data::Dump qw/dump/;
27 dpavlin 574 use Storable qw/dclone/;
28 dpavlin 10
29 dpavlin 550 # debugging warn(s)
30     my $debug = 0;
31    
32    
33 dpavlin 10 =head1 NAME
34    
35 dpavlin 536 WebPAC::Normalize - describe normalisaton rules using sets
36 dpavlin 10
37     =head1 VERSION
38    
39 dpavlin 667 Version 0.19
40 dpavlin 10
41     =cut
42    
43 dpavlin 667 our $VERSION = '0.19';
44 dpavlin 10
45     =head1 SYNOPSIS
46    
47 dpavlin 536 This module uses C<conf/normalize/*.pl> files to perform normalisation
48     from input records using perl functions which are specialized for set
49     processing.
50 dpavlin 10
51 dpavlin 536 Sets are implemented as arrays, and normalisation file is valid perl, which
52     means that you check it's validity before running WebPAC using
53     C<perl -c normalize.pl>.
54 dpavlin 15
55 dpavlin 536 Normalisation can generate multiple output normalized data. For now, supported output
56 dpavlin 540 types (on the left side of definition) are: C<tag>, C<display>, C<search> and
57 dpavlin 547 C<marc>.
58 dpavlin 15
59 dpavlin 10 =head1 FUNCTIONS
60    
61 dpavlin 538 Functions which start with C<_> are private and used by WebPAC internally.
62     All other functions are available for use within normalisation rules.
63    
64 dpavlin 536 =head2 data_structure
65 dpavlin 10
66 dpavlin 536 Return data structure
67 dpavlin 13
68 dpavlin 538 my $ds = WebPAC::Normalize::data_structure(
69 dpavlin 536 lookup => $lookup->lookup_hash,
70     row => $row,
71     rules => $normalize_pl_config,
72 dpavlin 541 marc_encoding => 'utf-8',
73 dpavlin 595 config => $config,
74 dpavlin 13 );
75    
76 dpavlin 540 Options C<lookup>, C<row>, C<rules> and C<log> are mandatory while all
77     other are optional.
78    
79 dpavlin 536 This function will B<die> if normalizastion can't be evaled.
80 dpavlin 15
81 dpavlin 538 Since this function isn't exported you have to call it with
82     C<WebPAC::Normalize::data_structure>.
83    
84 dpavlin 10 =cut
85    
86 dpavlin 536 sub data_structure {
87     my $arg = {@_};
88 dpavlin 13
89 dpavlin 536 die "need row argument" unless ($arg->{row});
90     die "need normalisation argument" unless ($arg->{rules});
91 dpavlin 31
92 dpavlin 536 no strict 'subs';
93 dpavlin 538 _set_lookup( $arg->{lookup} );
94     _set_rec( $arg->{row} );
95 dpavlin 595 _set_config( $arg->{config} );
96 dpavlin 541 _clean_ds( %{ $arg } );
97 dpavlin 536 eval "$arg->{rules}";
98     die "error evaling $arg->{rules}: $@\n" if ($@);
99 dpavlin 540
100 dpavlin 538 return _get_ds();
101 dpavlin 10 }
102    
103 dpavlin 538 =head2 _set_rec
104 dpavlin 13
105 dpavlin 536 Set current record hash
106 dpavlin 433
107 dpavlin 538 _set_rec( $rec );
108 dpavlin 433
109     =cut
110    
111 dpavlin 536 my $rec;
112 dpavlin 433
113 dpavlin 538 sub _set_rec {
114 dpavlin 536 $rec = shift or die "no record hash";
115 dpavlin 433 }
116    
117 dpavlin 595 =head2 _set_config
118    
119     Set current config hash
120    
121     _set_config( $config );
122    
123     Magic keys are:
124    
125     =over 4
126    
127     =item _
128    
129     Code of current database
130    
131     =item _mfn
132    
133     Current MFN
134    
135     =back
136    
137     =cut
138    
139     my $config;
140    
141     sub _set_config {
142     $config = shift;
143     }
144    
145 dpavlin 538 =head2 _get_ds
146    
147     Return hash formatted as data structure
148    
149     my $ds = _get_ds();
150    
151     =cut
152    
153 dpavlin 574 my ($out, $marc_record, $marc_encoding, $marc_repeatable_subfield, $marc_indicators);
154     my ($marc_record_offset, $marc_fetch_offset) = (0, 0);
155 dpavlin 538
156     sub _get_ds {
157     return $out;
158     }
159    
160     =head2 _clean_ds
161    
162     Clean data structure hash for next record
163    
164     _clean_ds();
165    
166     =cut
167    
168     sub _clean_ds {
169 dpavlin 541 my $a = {@_};
170 dpavlin 550 ($out,$marc_record, $marc_encoding, $marc_repeatable_subfield, $marc_indicators) = ();
171 dpavlin 574 ($marc_record_offset, $marc_fetch_offset) = (0,0);
172 dpavlin 541 $marc_encoding = $a->{marc_encoding};
173 dpavlin 538 }
174    
175     =head2 _set_lookup
176    
177     Set current lookup hash
178    
179     _set_lookup( $lookup );
180    
181     =cut
182    
183     my $lookup;
184    
185     sub _set_lookup {
186     $lookup = shift;
187     }
188    
189 dpavlin 547 =head2 _get_marc_fields
190 dpavlin 540
191 dpavlin 547 Get all fields defined by calls to C<marc>
192 dpavlin 540
193 dpavlin 547 $marc->add_fields( WebPAC::Normalize:_get_marc_fields() );
194 dpavlin 540
195 dpavlin 543 We are using I<magic> which detect repeatable fields only from
196     sequence of field/subfield data generated by normalization.
197    
198 dpavlin 554 Repeatable field is created when there is second occurence of same subfield or
199     if any of indicators are different.
200 dpavlin 543
201 dpavlin 554 This is sane for most cases. Something like:
202 dpavlin 543
203 dpavlin 554 900a-1 900b-1 900c-1
204     900a-2 900b-2
205     900a-3
206    
207     will be created from any combination of:
208    
209     900a-1 900a-2 900a-3 900b-1 900b-2 900c-1
210    
211     and following rules:
212    
213     marc('900','a', rec('200','a') );
214     marc('900','b', rec('200','b') );
215     marc('900','c', rec('200','c') );
216    
217     which might not be what you have in mind. If you need repeatable subfield,
218     define it using C<marc_repeatable_subfield> like this:
219    
220 dpavlin 574 marc_repeatable_subfield('900','a');
221     marc('900','a', rec('200','a') );
222     marc('900','b', rec('200','b') );
223     marc('900','c', rec('200','c') );
224 dpavlin 554
225 dpavlin 574 will create:
226    
227     900a-1 900a-2 900a-3 900b-1 900c-1
228     900b-2
229    
230     There is also support for returning next or specific using:
231    
232     while (my $mf = WebPAC::Normalize:_get_marc_fields( fetch_next => 1 ) ) {
233     # do something with $mf
234     }
235    
236     will always return fields from next MARC record or
237    
238     my $mf = WebPAC::Normalize::_get_marc_fields( offset => 42 );
239    
240     will return 42th copy record (if it exists).
241    
242 dpavlin 540 =cut
243    
244 dpavlin 547 sub _get_marc_fields {
245 dpavlin 550
246 dpavlin 574 my $arg = {@_};
247     warn "### _get_marc_fields arg: ", dump($arg), $/ if ($debug > 2);
248     my $offset = $marc_fetch_offset;
249     if ($arg->{offset}) {
250     $offset = $arg->{offset};
251     } elsif($arg->{fetch_next}) {
252     $marc_fetch_offset++;
253     }
254 dpavlin 550
255 dpavlin 574 return if (! $marc_record || ref($marc_record) ne 'ARRAY');
256    
257     warn "### full marc_record = ", dump( @{ $marc_record }), $/ if ($debug > 2);
258    
259     my $marc_rec = $marc_record->[ $offset ];
260    
261     warn "## _get_marc_fields (at offset: $offset) -- marc_record = ", dump( @$marc_rec ), $/ if ($debug > 1);
262    
263     return if (! $marc_rec || ref($marc_rec) ne 'ARRAY' || $#{ $marc_rec } < 0);
264    
265 dpavlin 550 # first, sort all existing fields
266     # XXX might not be needed, but modern perl might randomize elements in hash
267     my @sorted_marc_record = sort {
268 dpavlin 572 $a->[0] . ( $a->[3] || '' ) cmp $b->[0] . ( $b->[3] || '')
269 dpavlin 574 } @{ $marc_rec };
270 dpavlin 550
271 dpavlin 574 @sorted_marc_record = @{ $marc_rec }; ### FIXME disable sorting
272 dpavlin 562
273 dpavlin 550 # output marc fields
274 dpavlin 542 my @m;
275 dpavlin 550
276     # count unique field-subfields (used for offset when walking to next subfield)
277     my $u;
278 dpavlin 572 map { $u->{ $_->[0] . ( $_->[3] || '') }++ } @sorted_marc_record;
279 dpavlin 550
280     if ($debug) {
281 dpavlin 574 warn "## marc_repeatable_subfield = ", dump( $marc_repeatable_subfield ), $/ if ( $marc_repeatable_subfield );
282     warn "## marc_record[$offset] = ", dump( $marc_rec ), $/;
283     warn "## sorted_marc_record = ", dump( \@sorted_marc_record ), $/;
284     warn "## subfield count = ", dump( $u ), $/;
285 dpavlin 550 }
286    
287     my $len = $#sorted_marc_record;
288     my $visited;
289     my $i = 0;
290     my $field;
291    
292     foreach ( 0 .. $len ) {
293    
294     # find next element which isn't visited
295     while ($visited->{$i}) {
296     $i = ($i + 1) % ($len + 1);
297 dpavlin 542 }
298    
299 dpavlin 550 # mark it visited
300     $visited->{$i}++;
301    
302 dpavlin 574 my $row = dclone( $sorted_marc_record[$i] );
303 dpavlin 550
304     # field and subfield which is key for
305     # marc_repeatable_subfield and u
306 dpavlin 572 my $fsf = $row->[0] . ( $row->[3] || '' );
307 dpavlin 550
308     if ($debug > 1) {
309    
310     print "### field so far [", $#$field, "] : ", dump( $field ), " ", $field ? 'T' : 'F', $/;
311     print "### this [$i]: ", dump( $row ),$/;
312     print "### sf: ", $row->[3], " vs ", $field->[3],
313     $marc_repeatable_subfield->{ $row->[0] . $row->[3] } ? ' (repeatable)' : '', $/,
314     if ($#$field >= 0);
315    
316     }
317    
318     # if field exists
319     if ( $#$field >= 0 ) {
320     if (
321     $row->[0] ne $field->[0] || # field
322     $row->[1] ne $field->[1] || # i1
323     $row->[2] ne $field->[2] # i2
324     ) {
325     push @m, $field;
326     warn "## saved/1 ", dump( $field ),$/ if ($debug);
327     $field = $row;
328    
329     } elsif (
330     ( $row->[3] lt $field->[-2] ) # subfield which is not next (e.g. a after c)
331     ||
332     ( $row->[3] eq $field->[-2] && # same subfield, but not repeatable
333     ! $marc_repeatable_subfield->{ $fsf }
334     )
335     ) {
336     push @m, $field;
337     warn "## saved/2 ", dump( $field ),$/ if ($debug);
338     $field = $row;
339    
340     } else {
341     # append new subfields to existing field
342     push @$field, ( $row->[3], $row->[4] );
343     }
344     } else {
345     # insert first field
346     $field = $row;
347     }
348    
349     if (! $marc_repeatable_subfield->{ $fsf }) {
350     # make step to next subfield
351     $i = ($i + $u->{ $fsf } ) % ($len + 1);
352     }
353 dpavlin 542 }
354    
355 dpavlin 550 if ($#$field >= 0) {
356     push @m, $field;
357     warn "## saved/3 ", dump( $field ),$/ if ($debug);
358     }
359 dpavlin 542
360 dpavlin 579 return \@m;
361 dpavlin 540 }
362    
363 dpavlin 554 =head2 _debug
364    
365     Change level of debug warnings
366    
367     _debug( 2 );
368    
369     =cut
370    
371     sub _debug {
372     my $l = shift;
373     return $debug unless defined($l);
374 dpavlin 568 warn "debug level $l",$/ if ($l > 0);
375 dpavlin 554 $debug = $l;
376     }
377    
378 dpavlin 540 =head1 Functions to create C<data_structure>
379    
380     Those functions generally have to first in your normalization file.
381    
382 dpavlin 536 =head2 tag
383 dpavlin 433
384 dpavlin 536 Define new tag for I<search> and I<display>.
385 dpavlin 433
386 dpavlin 536 tag('Title', rec('200','a') );
387 dpavlin 13
388    
389     =cut
390    
391 dpavlin 536 sub tag {
392     my $name = shift or die "tag needs name as first argument";
393     my @o = grep { defined($_) && $_ ne '' } @_;
394     return unless (@o);
395     $out->{$name}->{tag} = $name;
396     $out->{$name}->{search} = \@o;
397     $out->{$name}->{display} = \@o;
398     }
399 dpavlin 13
400 dpavlin 536 =head2 display
401 dpavlin 13
402 dpavlin 536 Define tag just for I<display>
403 dpavlin 125
404 dpavlin 536 @v = display('Title', rec('200','a') );
405 dpavlin 125
406 dpavlin 536 =cut
407 dpavlin 125
408 dpavlin 536 sub display {
409     my $name = shift or die "display needs name as first argument";
410     my @o = grep { defined($_) && $_ ne '' } @_;
411     return unless (@o);
412     $out->{$name}->{tag} = $name;
413     $out->{$name}->{display} = \@o;
414     }
415 dpavlin 13
416 dpavlin 536 =head2 search
417 dpavlin 13
418 dpavlin 536 Prepare values just for I<search>
419 dpavlin 13
420 dpavlin 536 @v = search('Title', rec('200','a') );
421 dpavlin 433
422 dpavlin 536 =cut
423 dpavlin 13
424 dpavlin 536 sub search {
425     my $name = shift or die "search needs name as first argument";
426     my @o = grep { defined($_) && $_ ne '' } @_;
427     return unless (@o);
428     $out->{$name}->{tag} = $name;
429     $out->{$name}->{search} = \@o;
430 dpavlin 13 }
431    
432 dpavlin 564 =head2 marc_leader
433    
434     Setup fields within MARC leader or get leader
435    
436     marc_leader('05','c');
437     my $leader = marc_leader();
438    
439     =cut
440    
441     sub marc_leader {
442     my ($offset,$value) = @_;
443    
444     if ($offset) {
445     $out->{' leader'}->{ $offset } = $value;
446     } else {
447     return $out->{' leader'};
448     }
449     }
450    
451 dpavlin 547 =head2 marc
452 dpavlin 540
453     Save value for MARC field
454    
455 dpavlin 547 marc('900','a', rec('200','a') );
456 dpavlin 571 marc('001', rec('000') );
457 dpavlin 540
458     =cut
459    
460 dpavlin 547 sub marc {
461     my $f = shift or die "marc needs field";
462     die "marc field must be numer" unless ($f =~ /^\d+$/);
463 dpavlin 540
464 dpavlin 571 my $sf;
465     if ($f >= 10) {
466     $sf = shift or die "marc needs subfield";
467     }
468 dpavlin 540
469 dpavlin 541 foreach (@_) {
470     my $v = $_; # make var read-write for Encode
471 dpavlin 543 next unless (defined($v) && $v !~ /^\s*$/);
472 dpavlin 548 my ($i1,$i2) = defined($marc_indicators->{$f}) ? @{ $marc_indicators->{$f} } : (' ',' ');
473 dpavlin 571 if (defined $sf) {
474 dpavlin 574 push @{ $marc_record->[ $marc_record_offset ] }, [ $f, $i1, $i2, $sf => $v ];
475 dpavlin 571 } else {
476 dpavlin 574 push @{ $marc_record->[ $marc_record_offset ] }, [ $f, $v ];
477 dpavlin 571 }
478 dpavlin 540 }
479     }
480    
481 dpavlin 547 =head2 marc_repeatable_subfield
482    
483     Save values for MARC repetable subfield
484    
485     marc_repeatable_subfield('910', 'z', rec('909') );
486    
487     =cut
488    
489     sub marc_repeatable_subfield {
490 dpavlin 550 my ($f,$sf) = @_;
491     die "marc_repeatable_subfield need field and subfield!\n" unless ($f && $sf);
492     $marc_repeatable_subfield->{ $f . $sf }++;
493 dpavlin 547 marc(@_);
494     }
495    
496     =head2 marc_indicators
497    
498     Set both indicators for MARC field
499    
500     marc_indicators('900', ' ', 1);
501    
502     Any indicator value other than C<0-9> will be treated as undefined.
503    
504     =cut
505    
506     sub marc_indicators {
507     my $f = shift || die "marc_indicators need field!\n";
508     my ($i1,$i2) = @_;
509     die "marc_indicators($f, ...) need i1!\n" unless(defined($i1));
510     die "marc_indicators($f, $i1, ...) need i2!\n" unless(defined($i2));
511    
512     $i1 = ' ' if ($i1 !~ /^\d$/);
513     $i2 = ' ' if ($i2 !~ /^\d$/);
514 dpavlin 548 @{ $marc_indicators->{$f} } = ($i1,$i2);
515 dpavlin 547 }
516    
517 dpavlin 562 =head2 marc_compose
518 dpavlin 547
519 dpavlin 562 Save values for each MARC subfield explicitly
520    
521     marc_compose('900',
522     'a', rec('200','a')
523     'b', rec('201','a')
524     'a', rec('200','b')
525     'c', rec('200','c')
526     );
527    
528 dpavlin 603 If you specify C<+> for subfield, value will be appended
529     to previous defined subfield.
530    
531 dpavlin 562 =cut
532    
533     sub marc_compose {
534     my $f = shift or die "marc_compose needs field";
535     die "marc_compose field must be numer" unless ($f =~ /^\d+$/);
536    
537     my ($i1,$i2) = defined($marc_indicators->{$f}) ? @{ $marc_indicators->{$f} } : (' ',' ');
538     my $m = [ $f, $i1, $i2 ];
539    
540 dpavlin 583 warn "### marc_compose input subfields = ", dump(@_),$/ if ($debug > 2);
541    
542 dpavlin 619 if ($#_ % 2 != 1) {
543     die "ERROR: marc_compose",dump($f,@_)," not valid (must be even).\nDo you need to add first() or join() around some argument?\n";
544     }
545    
546 dpavlin 562 while (@_) {
547 dpavlin 619 my $sf = shift;
548 dpavlin 565 my $v = shift;
549 dpavlin 562
550     next unless (defined($v) && $v !~ /^\s*$/);
551 dpavlin 568 warn "## ++ marc_compose($f,$sf,$v) ", dump( $m ),$/ if ($debug > 1);
552 dpavlin 603 if ($sf ne '+') {
553     push @$m, ( $sf, $v );
554     } else {
555     $m->[ $#$m ] .= $v;
556     }
557 dpavlin 562 }
558    
559 dpavlin 586 warn "## marc_compose current marc = ", dump( $m ),$/ if ($debug > 1);
560 dpavlin 562
561 dpavlin 574 push @{ $marc_record->[ $marc_record_offset ] }, $m if ($#{$m} > 2);
562 dpavlin 562 }
563    
564 dpavlin 574 =head2 marc_duplicate
565 dpavlin 562
566 dpavlin 574 Generate copy of current MARC record and continue working on copy
567    
568     marc_duplicate();
569    
570     Copies can be accessed using C<< _get_marc_fields( fetch_next => 1 ) >> or
571     C<< _get_marc_fields( offset => 42 ) >>.
572    
573     =cut
574    
575     sub marc_duplicate {
576     my $m = $marc_record->[ -1 ];
577     die "can't duplicate record which isn't defined" unless ($m);
578     push @{ $marc_record }, dclone( $m );
579     warn "## marc_duplicate = ", dump(@$marc_record), $/ if ($debug > 1);
580     $marc_record_offset = $#{ $marc_record };
581     warn "## marc_record_offset = $marc_record_offset", $/ if ($debug > 1);
582     }
583    
584     =head2 marc_remove
585    
586     Remove some field or subfield from MARC record.
587    
588     marc_remove('200');
589     marc_remove('200','a');
590    
591     This will erase field C<200> or C<200^a> from current MARC record.
592    
593     This is useful after calling C<marc_duplicate> or on it's own (but, you
594     should probably just remove that subfield definition if you are not
595     using C<marc_duplicate>).
596    
597     FIXME: support fields < 10.
598    
599     =cut
600    
601     sub marc_remove {
602     my ($f, $sf) = @_;
603    
604     die "marc_remove needs record number" unless defined($f);
605    
606     my $marc = $marc_record->[ $marc_record_offset ];
607    
608     warn "### marc_remove before = ", dump( $marc ), $/ if ($debug > 2);
609    
610 dpavlin 579 my $i = 0;
611     foreach ( 0 .. $#{ $marc } ) {
612 dpavlin 574 last unless (defined $marc->[$i]);
613     warn "#### working on ",dump( @{ $marc->[$i] }), $/ if ($debug > 3);
614     if ($marc->[$i]->[0] eq $f) {
615     if (! defined $sf) {
616     # remove whole field
617     splice @$marc, $i, 1;
618     warn "#### slice \@\$marc, $i, 1 = ",dump( @{ $marc }), $/ if ($debug > 3);
619     $i--;
620     } else {
621     foreach my $j ( 0 .. (( $#{ $marc->[$i] } - 3 ) / 2) ) {
622     my $o = ($j * 2) + 3;
623     if ($marc->[$i]->[$o] eq $sf) {
624     # remove subfield
625     splice @{$marc->[$i]}, $o, 2;
626     warn "#### slice \@{\$marc->[$i]}, $o, 2 = ", dump( @{ $marc }), $/ if ($debug > 3);
627     # is record now empty?
628     if ($#{ $marc->[$i] } == 2) {
629     splice @$marc, $i, 1;
630     warn "#### slice \@\$marc, $i, 1 = ", dump( @{ $marc }), $/ if ($debug > 3);
631     $i--;
632     };
633     }
634     }
635     }
636     }
637 dpavlin 579 $i++;
638 dpavlin 574 }
639    
640     warn "### marc_remove($f", $sf ? ",$sf" : "", ") after = ", dump( $marc ), $/ if ($debug > 2);
641    
642     $marc_record->[ $marc_record_offset ] = $marc;
643    
644     warn "## full marc_record = ", dump( @{ $marc_record }), $/ if ($debug > 1);
645     }
646    
647 dpavlin 604 =head2 marc_original_order
648    
649     Copy all subfields preserving original order to marc field.
650    
651 dpavlin 616 marc_original_order( marc_field_number, original_input_field_number );
652 dpavlin 604
653 dpavlin 616 Please note that field numbers are consistent with other commands (marc
654     field number first), but somewhat counter-intuitive (destination and then
655     source).
656    
657 dpavlin 604 You might want to use this command if you are just renaming subfields or
658     using pre-processing modify_record in C<config.yml> and don't need any
659     post-processing or want to preserve order of original subfields.
660    
661 dpavlin 616
662 dpavlin 604 =cut
663    
664     sub marc_original_order {
665    
666 dpavlin 616 my ($to, $from) = @_;
667 dpavlin 604 die "marc_original_order needs from and to fields\n" unless ($from && $to);
668    
669 dpavlin 616 return unless defined($rec->{$from});
670    
671     my $r = $rec->{$from};
672 dpavlin 604 die "record field $from isn't array\n" unless (ref($r) eq 'ARRAY');
673    
674     my ($i1,$i2) = defined($marc_indicators->{$to}) ? @{ $marc_indicators->{$to} } : (' ',' ');
675 dpavlin 616 warn "## marc_original_order($to,$from) source = ", dump( $r ),$/ if ($debug > 1);
676 dpavlin 604
677     foreach my $d (@$r) {
678    
679 dpavlin 605 if (! defined($d->{subfields}) && ref($d->{subfields}) ne 'ARRAY') {
680 dpavlin 616 warn "# marc_original_order($to,$from): field $from doesn't have subfields specification\n";
681 dpavlin 605 next;
682     }
683    
684 dpavlin 604 my @sfs = @{ $d->{subfields} };
685    
686     die "field $from doesn't have even number of subfields specifications\n" unless($#sfs % 2 == 1);
687    
688 dpavlin 618 warn "#--> d: ",dump($d), "\n#--> sfs: ",dump(@sfs),$/ if ($debug > 2);
689 dpavlin 604
690     my $m = [ $to, $i1, $i2 ];
691    
692     while (my $sf = shift @sfs) {
693 dpavlin 618
694     warn "#--> sf: ",dump($sf), $/ if ($debug > 2);
695 dpavlin 604 my $offset = shift @sfs;
696     die "corrupted sufields specification for field $from\n" unless defined($offset);
697    
698     my $v;
699     if (ref($d->{$sf}) eq 'ARRAY') {
700     $v = $d->{$sf}->[$offset] if (defined($d->{$sf}->[$offset]));
701     } elsif ($offset == 0) {
702     $v = $d->{$sf};
703     } else {
704     die "field $from subfield '$sf' need occurence $offset which doesn't exist", dump($d->{$sf});
705     }
706     push @$m, ( $sf, $v ) if (defined($v));
707     }
708    
709     if ($#{$m} > 2) {
710     push @{ $marc_record->[ $marc_record_offset ] }, $m;
711     }
712     }
713    
714     warn "## marc_record = ", dump( $marc_record ),$/ if ($debug > 1);
715     }
716    
717    
718 dpavlin 540 =head1 Functions to extract data from input
719    
720     This function should be used inside functions to create C<data_structure> described
721     above.
722    
723 dpavlin 641 =head2 _pack_subfields_hash
724    
725 dpavlin 669 @subfields = _pack_subfields_hash( $h );
726     $subfields = _pack_subfields_hash( $h, 1 );
727 dpavlin 641
728 dpavlin 669 Return each subfield value in array or pack them all together and return scalar
729     with subfields (denoted by C<^>) and values.
730    
731 dpavlin 641 =cut
732    
733     sub _pack_subfields_hash {
734    
735 dpavlin 642 warn "## _pack_subfields_hash( ",dump(@_), " )\n" if ($debug > 1);
736 dpavlin 641
737     my ($h,$include_subfields) = @_;
738    
739     if ( defined($h->{subfields}) ) {
740     my $sfs = delete $h->{subfields} || die "no subfields?";
741     my @out;
742     while (@$sfs) {
743     my $sf = shift @$sfs;
744     push @out, '^' . $sf if ($include_subfields);
745     my $o = shift @$sfs;
746     if ($o == 0 && ref( $h->{$sf} ) ne 'ARRAY' ) {
747     # single element subfields are not arrays
748 dpavlin 669 #warn "====> $sf $o / $#$sfs ", dump( $sfs, $h->{$sf} ), "\n";
749 dpavlin 667
750 dpavlin 641 push @out, $h->{$sf};
751     } else {
752 dpavlin 669 #warn "====> $sf $o / $#$sfs ", dump( $sfs, $h->{$sf} ), "\n";
753 dpavlin 641 push @out, $h->{$sf}->[$o];
754     }
755     }
756 dpavlin 667 if ($include_subfields) {
757     return join('', @out);
758     } else {
759     return @out;
760     }
761 dpavlin 641 } else {
762 dpavlin 667 if ($include_subfields) {
763     my $out = '';
764 dpavlin 668 foreach my $sf (sort keys %$h) {
765 dpavlin 667 if (ref($h->{$sf}) eq 'ARRAY') {
766     $out .= '^' . $sf . join('^' . $sf, @{ $h->{$sf} });
767     } else {
768     $out .= '^' . $sf . $h->{$sf};
769     }
770     }
771     return $out;
772     } else {
773     # FIXME this should probably be in alphabetical order instead of hash order
774     values %{$h};
775     }
776 dpavlin 641 }
777     }
778    
779 dpavlin 536 =head2 rec1
780 dpavlin 371
781 dpavlin 536 Return all values in some field
782 dpavlin 371
783 dpavlin 536 @v = rec1('200')
784 dpavlin 15
785 dpavlin 536 TODO: order of values is probably same as in source data, need to investigate that
786 dpavlin 15
787 dpavlin 536 =cut
788 dpavlin 15
789 dpavlin 536 sub rec1 {
790     my $f = shift;
791 dpavlin 571 warn "rec1($f) = ", dump( $rec->{$f} ), $/ if ($debug > 1);
792 dpavlin 536 return unless (defined($rec) && defined($rec->{$f}));
793 dpavlin 571 warn "rec1($f) = ", dump( $rec->{$f} ), $/ if ($debug > 1);
794 dpavlin 536 if (ref($rec->{$f}) eq 'ARRAY') {
795 dpavlin 641 my @out;
796     foreach my $h ( @{ $rec->{$f} } ) {
797     if (ref($h) eq 'HASH') {
798     push @out, ( _pack_subfields_hash( $h ) );
799 dpavlin 31 } else {
800 dpavlin 641 push @out, $h;
801 dpavlin 31 }
802 dpavlin 641 }
803     return @out;
804 dpavlin 536 } elsif( defined($rec->{$f}) ) {
805     return $rec->{$f};
806 dpavlin 15 }
807     }
808    
809 dpavlin 536 =head2 rec2
810 dpavlin 15
811 dpavlin 536 Return all values in specific field and subfield
812 dpavlin 13
813 dpavlin 536 @v = rec2('200','a')
814 dpavlin 13
815     =cut
816    
817 dpavlin 536 sub rec2 {
818     my $f = shift;
819     return unless (defined($rec && $rec->{$f}));
820     my $sf = shift;
821 dpavlin 601 warn "rec2($f,$sf) = ", dump( $rec->{$f} ), $/ if ($debug > 1);
822 dpavlin 589 return map {
823     if (ref($_->{$sf}) eq 'ARRAY') {
824     @{ $_->{$sf} };
825     } else {
826     $_->{$sf};
827     }
828     } grep { ref($_) eq 'HASH' && $_->{$sf} } @{ $rec->{$f} };
829 dpavlin 536 }
830 dpavlin 13
831 dpavlin 536 =head2 rec
832 dpavlin 13
833 dpavlin 536 syntaxtic sugar for
834 dpavlin 13
835 dpavlin 536 @v = rec('200')
836     @v = rec('200','a')
837 dpavlin 13
838 dpavlin 536 =cut
839 dpavlin 373
840 dpavlin 536 sub rec {
841 dpavlin 583 my @out;
842 dpavlin 536 if ($#_ == 0) {
843 dpavlin 583 @out = rec1(@_);
844 dpavlin 536 } elsif ($#_ == 1) {
845 dpavlin 583 @out = rec2(@_);
846 dpavlin 13 }
847 dpavlin 583 if (@out) {
848     return @out;
849     } else {
850     return '';
851     }
852 dpavlin 13 }
853    
854 dpavlin 536 =head2 regex
855 dpavlin 15
856 dpavlin 536 Apply regex to some or all values
857 dpavlin 15
858 dpavlin 536 @v = regex( 's/foo/bar/g', @v );
859 dpavlin 15
860     =cut
861    
862 dpavlin 536 sub regex {
863     my $r = shift;
864     my @out;
865 dpavlin 550 #warn "r: $r\n", dump(\@_);
866 dpavlin 536 foreach my $t (@_) {
867     next unless ($t);
868     eval "\$t =~ $r";
869     push @out, $t if ($t && $t ne '');
870 dpavlin 368 }
871 dpavlin 536 return @out;
872 dpavlin 15 }
873    
874 dpavlin 536 =head2 prefix
875 dpavlin 15
876 dpavlin 536 Prefix all values with a string
877 dpavlin 15
878 dpavlin 536 @v = prefix( 'my_', @v );
879 dpavlin 15
880     =cut
881    
882 dpavlin 536 sub prefix {
883 dpavlin 592 my $p = shift or return;
884 dpavlin 536 return map { $p . $_ } grep { defined($_) } @_;
885     }
886 dpavlin 15
887 dpavlin 536 =head2 suffix
888 dpavlin 15
889 dpavlin 536 suffix all values with a string
890 dpavlin 15
891 dpavlin 536 @v = suffix( '_my', @v );
892 dpavlin 15
893 dpavlin 536 =cut
894 dpavlin 15
895 dpavlin 536 sub suffix {
896     my $s = shift or die "suffix needs string as first argument";
897     return map { $_ . $s } grep { defined($_) } @_;
898 dpavlin 15 }
899    
900 dpavlin 536 =head2 surround
901 dpavlin 13
902 dpavlin 536 surround all values with a two strings
903 dpavlin 13
904 dpavlin 536 @v = surround( 'prefix_', '_suffix', @v );
905 dpavlin 13
906     =cut
907    
908 dpavlin 536 sub surround {
909     my $p = shift or die "surround need prefix as first argument";
910     my $s = shift or die "surround needs suffix as second argument";
911     return map { $p . $_ . $s } grep { defined($_) } @_;
912 dpavlin 13 }
913    
914 dpavlin 536 =head2 first
915 dpavlin 13
916 dpavlin 536 Return first element
917 dpavlin 15
918 dpavlin 536 $v = first( @v );
919 dpavlin 13
920     =cut
921    
922 dpavlin 536 sub first {
923     my $r = shift;
924     return $r;
925 dpavlin 13 }
926    
927 dpavlin 536 =head2 lookup
928 dpavlin 13
929 dpavlin 536 Consult lookup hashes for some value
930 dpavlin 13
931 dpavlin 536 @v = lookup( $v );
932     @v = lookup( @v );
933 dpavlin 13
934     =cut
935    
936 dpavlin 536 sub lookup {
937     my $k = shift or return;
938     return unless (defined($lookup->{$k}));
939     if (ref($lookup->{$k}) eq 'ARRAY') {
940     return @{ $lookup->{$k} };
941     } else {
942     return $lookup->{$k};
943     }
944 dpavlin 13 }
945    
946 dpavlin 595 =head2 config
947    
948     Consult config values stored in C<config.yml>
949    
950     # return database code (key under databases in yaml)
951     $database_code = config(); # use _ from hash
952     $database_name = config('name');
953     $database_input_name = config('input name');
954     $tag = config('input normalize tag');
955    
956     Up to three levels are supported.
957    
958     =cut
959    
960     sub config {
961     return unless ($config);
962    
963     my $p = shift;
964    
965     $p ||= '';
966    
967     my $v;
968    
969     warn "### getting config($p)\n" if ($debug > 1);
970    
971     my @p = split(/\s+/,$p);
972     if ($#p < 0) {
973     $v = $config->{ '_' }; # special, database code
974     } else {
975    
976     my $c = dclone( $config );
977    
978     foreach my $k (@p) {
979     warn "### k: $k c = ",dump($c),$/ if ($debug > 1);
980     if (ref($c) eq 'ARRAY') {
981     $c = shift @$c;
982     warn "config($p) taking first occurence of '$k', probably not what you wanted!\n";
983     last;
984     }
985    
986     if (! defined($c->{$k}) ) {
987     $c = undef;
988     last;
989     } else {
990     $c = $c->{$k};
991     }
992     }
993     $v = $c if ($c);
994    
995     }
996    
997     warn "## config( '$p' ) = ",dump( $v ),$/ if ($v && $debug);
998     warn "config( '$p' ) is empty\n" if (! $v);
999    
1000     return $v;
1001     }
1002    
1003     =head2 id
1004    
1005     Returns unique id of this record
1006    
1007     $id = id();
1008    
1009     Returns C<42/2> for 2nd occurence of MFN 42.
1010    
1011     =cut
1012    
1013     sub id {
1014     my $mfn = $config->{_mfn} || die "no _mfn in config data";
1015     return $mfn . $#{$marc_record} ? $#{$marc_record} + 1 : '';
1016     }
1017    
1018 dpavlin 536 =head2 join_with
1019 dpavlin 13
1020 dpavlin 536 Joins walues with some delimiter
1021 dpavlin 10
1022 dpavlin 536 $v = join_with(", ", @v);
1023 dpavlin 10
1024 dpavlin 536 =cut
1025 dpavlin 10
1026 dpavlin 536 sub join_with {
1027     my $d = shift;
1028 dpavlin 586 warn "### join_with('$d',",dump(@_),")\n" if ($debug > 2);
1029 dpavlin 583 my $v = join($d, grep { defined($_) && $_ ne '' } @_);
1030     return '' unless defined($v);
1031     return $v;
1032 dpavlin 536 }
1033 dpavlin 10
1034 dpavlin 562 =head2 split_rec_on
1035    
1036     Split record subfield on some regex and take one of parts out
1037    
1038     $a_before_semi_column =
1039     split_rec_on('200','a', /\s*;\s*/, $part);
1040    
1041     C<$part> is optional number of element. First element is
1042     B<1>, not 0!
1043    
1044     If there is no C<$part> parameter or C<$part> is 0, this function will
1045     return all values produced by splitting.
1046    
1047     =cut
1048    
1049     sub split_rec_on {
1050     die "split_rec_on need (fld,sf,regex[,part]" if ($#_ < 2);
1051    
1052     my ($fld, $sf, $regex, $part) = @_;
1053 dpavlin 568 warn "### regex ", ref($regex), $regex, $/ if ($debug > 2);
1054 dpavlin 562
1055     my @r = rec( $fld, $sf );
1056     my $v = shift @r;
1057 dpavlin 568 warn "### first rec($fld,$sf) = ",dump($v),$/ if ($debug > 2);
1058 dpavlin 562
1059 dpavlin 604 return '' if ( ! defined($v) || $v =~ /^\s*$/);
1060 dpavlin 566
1061 dpavlin 562 my @s = split( $regex, $v );
1062 dpavlin 568 warn "## split_rec_on($fld,$sf,$regex,$part) = ",dump(@s),$/ if ($debug > 1);
1063 dpavlin 566 if ($part && $part > 0) {
1064 dpavlin 562 return $s[ $part - 1 ];
1065     } else {
1066 dpavlin 571 return @s;
1067 dpavlin 562 }
1068     }
1069    
1070 dpavlin 536 # END
1071     1;

  ViewVC Help
Powered by ViewVC 1.1.26