/[webpac2]/trunk/lib/WebPAC/Normalize.pm
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Annotation of /trunk/lib/WebPAC/Normalize.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 641 - (hide annotations)
Wed Sep 6 20:54:47 2006 UTC (17 years, 7 months ago) by dpavlin
File size: 22593 byte(s)
refactored _pack_subfields_hash in separate function

1 dpavlin 10 package WebPAC::Normalize;
2 dpavlin 536 use Exporter 'import';
3     @EXPORT = qw/
4 dpavlin 538 _set_rec _set_lookup
5     _get_ds _clean_ds
6 dpavlin 554 _debug
7 dpavlin 641 _pack_subfields_hash
8 dpavlin 538
9 dpavlin 536 tag search display
10 dpavlin 547 marc marc_indicators marc_repeatable_subfield
11 dpavlin 564 marc_compose marc_leader
12 dpavlin 574 marc_duplicate marc_remove
13 dpavlin 604 marc_original_order
14 dpavlin 540
15 dpavlin 536 rec1 rec2 rec
16     regex prefix suffix surround
17     first lookup join_with
18 dpavlin 562
19     split_rec_on
20 dpavlin 536 /;
21 dpavlin 10
22     use warnings;
23     use strict;
24 dpavlin 536
25     #use base qw/WebPAC::Common/;
26 dpavlin 550 use Data::Dump qw/dump/;
27 dpavlin 541 use Encode qw/from_to/;
28 dpavlin 574 use Storable qw/dclone/;
29 dpavlin 10
30 dpavlin 550 # debugging warn(s)
31     my $debug = 0;
32    
33    
34 dpavlin 10 =head1 NAME
35    
36 dpavlin 536 WebPAC::Normalize - describe normalisaton rules using sets
37 dpavlin 10
38     =head1 VERSION
39    
40 dpavlin 631 Version 0.17
41 dpavlin 10
42     =cut
43    
44 dpavlin 631 our $VERSION = '0.17';
45 dpavlin 10
46     =head1 SYNOPSIS
47    
48 dpavlin 536 This module uses C<conf/normalize/*.pl> files to perform normalisation
49     from input records using perl functions which are specialized for set
50     processing.
51 dpavlin 10
52 dpavlin 536 Sets are implemented as arrays, and normalisation file is valid perl, which
53     means that you check it's validity before running WebPAC using
54     C<perl -c normalize.pl>.
55 dpavlin 15
56 dpavlin 536 Normalisation can generate multiple output normalized data. For now, supported output
57 dpavlin 540 types (on the left side of definition) are: C<tag>, C<display>, C<search> and
58 dpavlin 547 C<marc>.
59 dpavlin 15
60 dpavlin 10 =head1 FUNCTIONS
61    
62 dpavlin 538 Functions which start with C<_> are private and used by WebPAC internally.
63     All other functions are available for use within normalisation rules.
64    
65 dpavlin 536 =head2 data_structure
66 dpavlin 10
67 dpavlin 536 Return data structure
68 dpavlin 13
69 dpavlin 538 my $ds = WebPAC::Normalize::data_structure(
70 dpavlin 536 lookup => $lookup->lookup_hash,
71     row => $row,
72     rules => $normalize_pl_config,
73 dpavlin 541 marc_encoding => 'utf-8',
74 dpavlin 595 config => $config,
75 dpavlin 13 );
76    
77 dpavlin 540 Options C<lookup>, C<row>, C<rules> and C<log> are mandatory while all
78     other are optional.
79    
80 dpavlin 536 This function will B<die> if normalizastion can't be evaled.
81 dpavlin 15
82 dpavlin 538 Since this function isn't exported you have to call it with
83     C<WebPAC::Normalize::data_structure>.
84    
85 dpavlin 10 =cut
86    
87 dpavlin 536 sub data_structure {
88     my $arg = {@_};
89 dpavlin 13
90 dpavlin 536 die "need row argument" unless ($arg->{row});
91     die "need normalisation argument" unless ($arg->{rules});
92 dpavlin 31
93 dpavlin 536 no strict 'subs';
94 dpavlin 538 _set_lookup( $arg->{lookup} );
95     _set_rec( $arg->{row} );
96 dpavlin 595 _set_config( $arg->{config} );
97 dpavlin 541 _clean_ds( %{ $arg } );
98 dpavlin 536 eval "$arg->{rules}";
99     die "error evaling $arg->{rules}: $@\n" if ($@);
100 dpavlin 540
101 dpavlin 538 return _get_ds();
102 dpavlin 10 }
103    
104 dpavlin 538 =head2 _set_rec
105 dpavlin 13
106 dpavlin 536 Set current record hash
107 dpavlin 433
108 dpavlin 538 _set_rec( $rec );
109 dpavlin 433
110     =cut
111    
112 dpavlin 536 my $rec;
113 dpavlin 433
114 dpavlin 538 sub _set_rec {
115 dpavlin 536 $rec = shift or die "no record hash";
116 dpavlin 433 }
117    
118 dpavlin 595 =head2 _set_config
119    
120     Set current config hash
121    
122     _set_config( $config );
123    
124     Magic keys are:
125    
126     =over 4
127    
128     =item _
129    
130     Code of current database
131    
132     =item _mfn
133    
134     Current MFN
135    
136     =back
137    
138     =cut
139    
140     my $config;
141    
142     sub _set_config {
143     $config = shift;
144     }
145    
146 dpavlin 538 =head2 _get_ds
147    
148     Return hash formatted as data structure
149    
150     my $ds = _get_ds();
151    
152     =cut
153    
154 dpavlin 574 my ($out, $marc_record, $marc_encoding, $marc_repeatable_subfield, $marc_indicators);
155     my ($marc_record_offset, $marc_fetch_offset) = (0, 0);
156 dpavlin 538
157     sub _get_ds {
158     return $out;
159     }
160    
161     =head2 _clean_ds
162    
163     Clean data structure hash for next record
164    
165     _clean_ds();
166    
167     =cut
168    
169     sub _clean_ds {
170 dpavlin 541 my $a = {@_};
171 dpavlin 550 ($out,$marc_record, $marc_encoding, $marc_repeatable_subfield, $marc_indicators) = ();
172 dpavlin 574 ($marc_record_offset, $marc_fetch_offset) = (0,0);
173 dpavlin 541 $marc_encoding = $a->{marc_encoding};
174 dpavlin 538 }
175    
176     =head2 _set_lookup
177    
178     Set current lookup hash
179    
180     _set_lookup( $lookup );
181    
182     =cut
183    
184     my $lookup;
185    
186     sub _set_lookup {
187     $lookup = shift;
188     }
189    
190 dpavlin 547 =head2 _get_marc_fields
191 dpavlin 540
192 dpavlin 547 Get all fields defined by calls to C<marc>
193 dpavlin 540
194 dpavlin 547 $marc->add_fields( WebPAC::Normalize:_get_marc_fields() );
195 dpavlin 540
196 dpavlin 543 We are using I<magic> which detect repeatable fields only from
197     sequence of field/subfield data generated by normalization.
198    
199 dpavlin 554 Repeatable field is created when there is second occurence of same subfield or
200     if any of indicators are different.
201 dpavlin 543
202 dpavlin 554 This is sane for most cases. Something like:
203 dpavlin 543
204 dpavlin 554 900a-1 900b-1 900c-1
205     900a-2 900b-2
206     900a-3
207    
208     will be created from any combination of:
209    
210     900a-1 900a-2 900a-3 900b-1 900b-2 900c-1
211    
212     and following rules:
213    
214     marc('900','a', rec('200','a') );
215     marc('900','b', rec('200','b') );
216     marc('900','c', rec('200','c') );
217    
218     which might not be what you have in mind. If you need repeatable subfield,
219     define it using C<marc_repeatable_subfield> like this:
220    
221 dpavlin 574 marc_repeatable_subfield('900','a');
222     marc('900','a', rec('200','a') );
223     marc('900','b', rec('200','b') );
224     marc('900','c', rec('200','c') );
225 dpavlin 554
226 dpavlin 574 will create:
227    
228     900a-1 900a-2 900a-3 900b-1 900c-1
229     900b-2
230    
231     There is also support for returning next or specific using:
232    
233     while (my $mf = WebPAC::Normalize:_get_marc_fields( fetch_next => 1 ) ) {
234     # do something with $mf
235     }
236    
237     will always return fields from next MARC record or
238    
239     my $mf = WebPAC::Normalize::_get_marc_fields( offset => 42 );
240    
241     will return 42th copy record (if it exists).
242    
243 dpavlin 540 =cut
244    
245 dpavlin 547 sub _get_marc_fields {
246 dpavlin 550
247 dpavlin 574 my $arg = {@_};
248     warn "### _get_marc_fields arg: ", dump($arg), $/ if ($debug > 2);
249     my $offset = $marc_fetch_offset;
250     if ($arg->{offset}) {
251     $offset = $arg->{offset};
252     } elsif($arg->{fetch_next}) {
253     $marc_fetch_offset++;
254     }
255 dpavlin 550
256 dpavlin 574 return if (! $marc_record || ref($marc_record) ne 'ARRAY');
257    
258     warn "### full marc_record = ", dump( @{ $marc_record }), $/ if ($debug > 2);
259    
260     my $marc_rec = $marc_record->[ $offset ];
261    
262     warn "## _get_marc_fields (at offset: $offset) -- marc_record = ", dump( @$marc_rec ), $/ if ($debug > 1);
263    
264     return if (! $marc_rec || ref($marc_rec) ne 'ARRAY' || $#{ $marc_rec } < 0);
265    
266 dpavlin 550 # first, sort all existing fields
267     # XXX might not be needed, but modern perl might randomize elements in hash
268     my @sorted_marc_record = sort {
269 dpavlin 572 $a->[0] . ( $a->[3] || '' ) cmp $b->[0] . ( $b->[3] || '')
270 dpavlin 574 } @{ $marc_rec };
271 dpavlin 550
272 dpavlin 574 @sorted_marc_record = @{ $marc_rec }; ### FIXME disable sorting
273 dpavlin 562
274 dpavlin 550 # output marc fields
275 dpavlin 542 my @m;
276 dpavlin 550
277     # count unique field-subfields (used for offset when walking to next subfield)
278     my $u;
279 dpavlin 572 map { $u->{ $_->[0] . ( $_->[3] || '') }++ } @sorted_marc_record;
280 dpavlin 550
281     if ($debug) {
282 dpavlin 574 warn "## marc_repeatable_subfield = ", dump( $marc_repeatable_subfield ), $/ if ( $marc_repeatable_subfield );
283     warn "## marc_record[$offset] = ", dump( $marc_rec ), $/;
284     warn "## sorted_marc_record = ", dump( \@sorted_marc_record ), $/;
285     warn "## subfield count = ", dump( $u ), $/;
286 dpavlin 550 }
287    
288     my $len = $#sorted_marc_record;
289     my $visited;
290     my $i = 0;
291     my $field;
292    
293     foreach ( 0 .. $len ) {
294    
295     # find next element which isn't visited
296     while ($visited->{$i}) {
297     $i = ($i + 1) % ($len + 1);
298 dpavlin 542 }
299    
300 dpavlin 550 # mark it visited
301     $visited->{$i}++;
302    
303 dpavlin 574 my $row = dclone( $sorted_marc_record[$i] );
304 dpavlin 550
305     # field and subfield which is key for
306     # marc_repeatable_subfield and u
307 dpavlin 572 my $fsf = $row->[0] . ( $row->[3] || '' );
308 dpavlin 550
309     if ($debug > 1) {
310    
311     print "### field so far [", $#$field, "] : ", dump( $field ), " ", $field ? 'T' : 'F', $/;
312     print "### this [$i]: ", dump( $row ),$/;
313     print "### sf: ", $row->[3], " vs ", $field->[3],
314     $marc_repeatable_subfield->{ $row->[0] . $row->[3] } ? ' (repeatable)' : '', $/,
315     if ($#$field >= 0);
316    
317     }
318    
319     # if field exists
320     if ( $#$field >= 0 ) {
321     if (
322     $row->[0] ne $field->[0] || # field
323     $row->[1] ne $field->[1] || # i1
324     $row->[2] ne $field->[2] # i2
325     ) {
326     push @m, $field;
327     warn "## saved/1 ", dump( $field ),$/ if ($debug);
328     $field = $row;
329    
330     } elsif (
331     ( $row->[3] lt $field->[-2] ) # subfield which is not next (e.g. a after c)
332     ||
333     ( $row->[3] eq $field->[-2] && # same subfield, but not repeatable
334     ! $marc_repeatable_subfield->{ $fsf }
335     )
336     ) {
337     push @m, $field;
338     warn "## saved/2 ", dump( $field ),$/ if ($debug);
339     $field = $row;
340    
341     } else {
342     # append new subfields to existing field
343     push @$field, ( $row->[3], $row->[4] );
344     }
345     } else {
346     # insert first field
347     $field = $row;
348     }
349    
350     if (! $marc_repeatable_subfield->{ $fsf }) {
351     # make step to next subfield
352     $i = ($i + $u->{ $fsf } ) % ($len + 1);
353     }
354 dpavlin 542 }
355    
356 dpavlin 550 if ($#$field >= 0) {
357     push @m, $field;
358     warn "## saved/3 ", dump( $field ),$/ if ($debug);
359     }
360 dpavlin 542
361 dpavlin 579 return \@m;
362 dpavlin 540 }
363    
364 dpavlin 554 =head2 _debug
365    
366     Change level of debug warnings
367    
368     _debug( 2 );
369    
370     =cut
371    
372     sub _debug {
373     my $l = shift;
374     return $debug unless defined($l);
375 dpavlin 568 warn "debug level $l",$/ if ($l > 0);
376 dpavlin 554 $debug = $l;
377     }
378    
379 dpavlin 540 =head1 Functions to create C<data_structure>
380    
381     Those functions generally have to first in your normalization file.
382    
383 dpavlin 536 =head2 tag
384 dpavlin 433
385 dpavlin 536 Define new tag for I<search> and I<display>.
386 dpavlin 433
387 dpavlin 536 tag('Title', rec('200','a') );
388 dpavlin 13
389    
390     =cut
391    
392 dpavlin 536 sub tag {
393     my $name = shift or die "tag needs name as first argument";
394     my @o = grep { defined($_) && $_ ne '' } @_;
395     return unless (@o);
396     $out->{$name}->{tag} = $name;
397     $out->{$name}->{search} = \@o;
398     $out->{$name}->{display} = \@o;
399     }
400 dpavlin 13
401 dpavlin 536 =head2 display
402 dpavlin 13
403 dpavlin 536 Define tag just for I<display>
404 dpavlin 125
405 dpavlin 536 @v = display('Title', rec('200','a') );
406 dpavlin 125
407 dpavlin 536 =cut
408 dpavlin 125
409 dpavlin 536 sub display {
410     my $name = shift or die "display needs name as first argument";
411     my @o = grep { defined($_) && $_ ne '' } @_;
412     return unless (@o);
413     $out->{$name}->{tag} = $name;
414     $out->{$name}->{display} = \@o;
415     }
416 dpavlin 13
417 dpavlin 536 =head2 search
418 dpavlin 13
419 dpavlin 536 Prepare values just for I<search>
420 dpavlin 13
421 dpavlin 536 @v = search('Title', rec('200','a') );
422 dpavlin 433
423 dpavlin 536 =cut
424 dpavlin 13
425 dpavlin 536 sub search {
426     my $name = shift or die "search needs name as first argument";
427     my @o = grep { defined($_) && $_ ne '' } @_;
428     return unless (@o);
429     $out->{$name}->{tag} = $name;
430     $out->{$name}->{search} = \@o;
431 dpavlin 13 }
432    
433 dpavlin 564 =head2 marc_leader
434    
435     Setup fields within MARC leader or get leader
436    
437     marc_leader('05','c');
438     my $leader = marc_leader();
439    
440     =cut
441    
442     sub marc_leader {
443     my ($offset,$value) = @_;
444    
445     if ($offset) {
446     $out->{' leader'}->{ $offset } = $value;
447     } else {
448     return $out->{' leader'};
449     }
450     }
451    
452 dpavlin 547 =head2 marc
453 dpavlin 540
454     Save value for MARC field
455    
456 dpavlin 547 marc('900','a', rec('200','a') );
457 dpavlin 571 marc('001', rec('000') );
458 dpavlin 540
459     =cut
460    
461 dpavlin 547 sub marc {
462     my $f = shift or die "marc needs field";
463     die "marc field must be numer" unless ($f =~ /^\d+$/);
464 dpavlin 540
465 dpavlin 571 my $sf;
466     if ($f >= 10) {
467     $sf = shift or die "marc needs subfield";
468     }
469 dpavlin 540
470 dpavlin 541 foreach (@_) {
471     my $v = $_; # make var read-write for Encode
472 dpavlin 543 next unless (defined($v) && $v !~ /^\s*$/);
473 dpavlin 541 from_to($v, 'iso-8859-2', $marc_encoding) if ($marc_encoding);
474 dpavlin 548 my ($i1,$i2) = defined($marc_indicators->{$f}) ? @{ $marc_indicators->{$f} } : (' ',' ');
475 dpavlin 571 if (defined $sf) {
476 dpavlin 574 push @{ $marc_record->[ $marc_record_offset ] }, [ $f, $i1, $i2, $sf => $v ];
477 dpavlin 571 } else {
478 dpavlin 574 push @{ $marc_record->[ $marc_record_offset ] }, [ $f, $v ];
479 dpavlin 571 }
480 dpavlin 540 }
481     }
482    
483 dpavlin 547 =head2 marc_repeatable_subfield
484    
485     Save values for MARC repetable subfield
486    
487     marc_repeatable_subfield('910', 'z', rec('909') );
488    
489     =cut
490    
491     sub marc_repeatable_subfield {
492 dpavlin 550 my ($f,$sf) = @_;
493     die "marc_repeatable_subfield need field and subfield!\n" unless ($f && $sf);
494     $marc_repeatable_subfield->{ $f . $sf }++;
495 dpavlin 547 marc(@_);
496     }
497    
498     =head2 marc_indicators
499    
500     Set both indicators for MARC field
501    
502     marc_indicators('900', ' ', 1);
503    
504     Any indicator value other than C<0-9> will be treated as undefined.
505    
506     =cut
507    
508     sub marc_indicators {
509     my $f = shift || die "marc_indicators need field!\n";
510     my ($i1,$i2) = @_;
511     die "marc_indicators($f, ...) need i1!\n" unless(defined($i1));
512     die "marc_indicators($f, $i1, ...) need i2!\n" unless(defined($i2));
513    
514     $i1 = ' ' if ($i1 !~ /^\d$/);
515     $i2 = ' ' if ($i2 !~ /^\d$/);
516 dpavlin 548 @{ $marc_indicators->{$f} } = ($i1,$i2);
517 dpavlin 547 }
518    
519 dpavlin 562 =head2 marc_compose
520 dpavlin 547
521 dpavlin 562 Save values for each MARC subfield explicitly
522    
523     marc_compose('900',
524     'a', rec('200','a')
525     'b', rec('201','a')
526     'a', rec('200','b')
527     'c', rec('200','c')
528     );
529    
530 dpavlin 603 If you specify C<+> for subfield, value will be appended
531     to previous defined subfield.
532    
533 dpavlin 562 =cut
534    
535     sub marc_compose {
536     my $f = shift or die "marc_compose needs field";
537     die "marc_compose field must be numer" unless ($f =~ /^\d+$/);
538    
539     my ($i1,$i2) = defined($marc_indicators->{$f}) ? @{ $marc_indicators->{$f} } : (' ',' ');
540     my $m = [ $f, $i1, $i2 ];
541    
542 dpavlin 583 warn "### marc_compose input subfields = ", dump(@_),$/ if ($debug > 2);
543    
544 dpavlin 619 if ($#_ % 2 != 1) {
545     die "ERROR: marc_compose",dump($f,@_)," not valid (must be even).\nDo you need to add first() or join() around some argument?\n";
546     }
547    
548 dpavlin 562 while (@_) {
549 dpavlin 619 my $sf = shift;
550 dpavlin 565 my $v = shift;
551 dpavlin 562
552     next unless (defined($v) && $v !~ /^\s*$/);
553     from_to($v, 'iso-8859-2', $marc_encoding) if ($marc_encoding);
554 dpavlin 568 warn "## ++ marc_compose($f,$sf,$v) ", dump( $m ),$/ if ($debug > 1);
555 dpavlin 603 if ($sf ne '+') {
556     push @$m, ( $sf, $v );
557     } else {
558     $m->[ $#$m ] .= $v;
559     }
560 dpavlin 562 }
561    
562 dpavlin 586 warn "## marc_compose current marc = ", dump( $m ),$/ if ($debug > 1);
563 dpavlin 562
564 dpavlin 574 push @{ $marc_record->[ $marc_record_offset ] }, $m if ($#{$m} > 2);
565 dpavlin 562 }
566    
567 dpavlin 574 =head2 marc_duplicate
568 dpavlin 562
569 dpavlin 574 Generate copy of current MARC record and continue working on copy
570    
571     marc_duplicate();
572    
573     Copies can be accessed using C<< _get_marc_fields( fetch_next => 1 ) >> or
574     C<< _get_marc_fields( offset => 42 ) >>.
575    
576     =cut
577    
578     sub marc_duplicate {
579     my $m = $marc_record->[ -1 ];
580     die "can't duplicate record which isn't defined" unless ($m);
581     push @{ $marc_record }, dclone( $m );
582     warn "## marc_duplicate = ", dump(@$marc_record), $/ if ($debug > 1);
583     $marc_record_offset = $#{ $marc_record };
584     warn "## marc_record_offset = $marc_record_offset", $/ if ($debug > 1);
585     }
586    
587     =head2 marc_remove
588    
589     Remove some field or subfield from MARC record.
590    
591     marc_remove('200');
592     marc_remove('200','a');
593    
594     This will erase field C<200> or C<200^a> from current MARC record.
595    
596     This is useful after calling C<marc_duplicate> or on it's own (but, you
597     should probably just remove that subfield definition if you are not
598     using C<marc_duplicate>).
599    
600     FIXME: support fields < 10.
601    
602     =cut
603    
604     sub marc_remove {
605     my ($f, $sf) = @_;
606    
607     die "marc_remove needs record number" unless defined($f);
608    
609     my $marc = $marc_record->[ $marc_record_offset ];
610    
611     warn "### marc_remove before = ", dump( $marc ), $/ if ($debug > 2);
612    
613 dpavlin 579 my $i = 0;
614     foreach ( 0 .. $#{ $marc } ) {
615 dpavlin 574 last unless (defined $marc->[$i]);
616     warn "#### working on ",dump( @{ $marc->[$i] }), $/ if ($debug > 3);
617     if ($marc->[$i]->[0] eq $f) {
618     if (! defined $sf) {
619     # remove whole field
620     splice @$marc, $i, 1;
621     warn "#### slice \@\$marc, $i, 1 = ",dump( @{ $marc }), $/ if ($debug > 3);
622     $i--;
623     } else {
624     foreach my $j ( 0 .. (( $#{ $marc->[$i] } - 3 ) / 2) ) {
625     my $o = ($j * 2) + 3;
626     if ($marc->[$i]->[$o] eq $sf) {
627     # remove subfield
628     splice @{$marc->[$i]}, $o, 2;
629     warn "#### slice \@{\$marc->[$i]}, $o, 2 = ", dump( @{ $marc }), $/ if ($debug > 3);
630     # is record now empty?
631     if ($#{ $marc->[$i] } == 2) {
632     splice @$marc, $i, 1;
633     warn "#### slice \@\$marc, $i, 1 = ", dump( @{ $marc }), $/ if ($debug > 3);
634     $i--;
635     };
636     }
637     }
638     }
639     }
640 dpavlin 579 $i++;
641 dpavlin 574 }
642    
643     warn "### marc_remove($f", $sf ? ",$sf" : "", ") after = ", dump( $marc ), $/ if ($debug > 2);
644    
645     $marc_record->[ $marc_record_offset ] = $marc;
646    
647     warn "## full marc_record = ", dump( @{ $marc_record }), $/ if ($debug > 1);
648     }
649    
650 dpavlin 604 =head2 marc_original_order
651    
652     Copy all subfields preserving original order to marc field.
653    
654 dpavlin 616 marc_original_order( marc_field_number, original_input_field_number );
655 dpavlin 604
656 dpavlin 616 Please note that field numbers are consistent with other commands (marc
657     field number first), but somewhat counter-intuitive (destination and then
658     source).
659    
660 dpavlin 604 You might want to use this command if you are just renaming subfields or
661     using pre-processing modify_record in C<config.yml> and don't need any
662     post-processing or want to preserve order of original subfields.
663    
664 dpavlin 616
665 dpavlin 604 =cut
666    
667     sub marc_original_order {
668    
669 dpavlin 616 my ($to, $from) = @_;
670 dpavlin 604 die "marc_original_order needs from and to fields\n" unless ($from && $to);
671    
672 dpavlin 616 return unless defined($rec->{$from});
673    
674     my $r = $rec->{$from};
675 dpavlin 604 die "record field $from isn't array\n" unless (ref($r) eq 'ARRAY');
676    
677     my ($i1,$i2) = defined($marc_indicators->{$to}) ? @{ $marc_indicators->{$to} } : (' ',' ');
678 dpavlin 616 warn "## marc_original_order($to,$from) source = ", dump( $r ),$/ if ($debug > 1);
679 dpavlin 604
680     foreach my $d (@$r) {
681    
682 dpavlin 605 if (! defined($d->{subfields}) && ref($d->{subfields}) ne 'ARRAY') {
683 dpavlin 616 warn "# marc_original_order($to,$from): field $from doesn't have subfields specification\n";
684 dpavlin 605 next;
685     }
686    
687 dpavlin 604 my @sfs = @{ $d->{subfields} };
688    
689     die "field $from doesn't have even number of subfields specifications\n" unless($#sfs % 2 == 1);
690    
691 dpavlin 618 warn "#--> d: ",dump($d), "\n#--> sfs: ",dump(@sfs),$/ if ($debug > 2);
692 dpavlin 604
693     my $m = [ $to, $i1, $i2 ];
694    
695     while (my $sf = shift @sfs) {
696 dpavlin 618
697     warn "#--> sf: ",dump($sf), $/ if ($debug > 2);
698 dpavlin 604 my $offset = shift @sfs;
699     die "corrupted sufields specification for field $from\n" unless defined($offset);
700    
701     my $v;
702     if (ref($d->{$sf}) eq 'ARRAY') {
703     $v = $d->{$sf}->[$offset] if (defined($d->{$sf}->[$offset]));
704     } elsif ($offset == 0) {
705     $v = $d->{$sf};
706     } else {
707     die "field $from subfield '$sf' need occurence $offset which doesn't exist", dump($d->{$sf});
708     }
709     push @$m, ( $sf, $v ) if (defined($v));
710     }
711    
712     if ($#{$m} > 2) {
713     push @{ $marc_record->[ $marc_record_offset ] }, $m;
714     }
715     }
716    
717     warn "## marc_record = ", dump( $marc_record ),$/ if ($debug > 1);
718     }
719    
720    
721 dpavlin 540 =head1 Functions to extract data from input
722    
723     This function should be used inside functions to create C<data_structure> described
724     above.
725    
726 dpavlin 641 =head2 _pack_subfields_hash
727    
728     @values = _pack_subfields_hash( $h, $include_subfields )
729    
730     =cut
731    
732     sub _pack_subfields_hash {
733    
734     warn "## _pack_subfields_hash( ",dump(@_), " )\n";
735    
736     my ($h,$include_subfields) = @_;
737    
738    
739     if ( defined($h->{subfields}) ) {
740     my $sfs = delete $h->{subfields} || die "no subfields?";
741     my @out;
742     while (@$sfs) {
743     my $sf = shift @$sfs;
744     push @out, '^' . $sf if ($include_subfields);
745     my $o = shift @$sfs;
746     if ($o == 0 && ref( $h->{$sf} ) ne 'ARRAY' ) {
747     # single element subfields are not arrays
748     push @out, $h->{$sf};
749     } else {
750     #warn "====> $f $sf $o $#$sfs ", dump( $sfs ), "\n";
751     push @out, $h->{$sf}->[$o];
752     }
753     }
754     return @out;
755     } else {
756     # FIXME this should probably be in alphabetical order instead of hash order
757     values %{$h};
758     }
759     }
760    
761 dpavlin 536 =head2 rec1
762 dpavlin 371
763 dpavlin 536 Return all values in some field
764 dpavlin 371
765 dpavlin 536 @v = rec1('200')
766 dpavlin 15
767 dpavlin 536 TODO: order of values is probably same as in source data, need to investigate that
768 dpavlin 15
769 dpavlin 536 =cut
770 dpavlin 15
771 dpavlin 536 sub rec1 {
772     my $f = shift;
773 dpavlin 571 warn "rec1($f) = ", dump( $rec->{$f} ), $/ if ($debug > 1);
774 dpavlin 536 return unless (defined($rec) && defined($rec->{$f}));
775 dpavlin 571 warn "rec1($f) = ", dump( $rec->{$f} ), $/ if ($debug > 1);
776 dpavlin 536 if (ref($rec->{$f}) eq 'ARRAY') {
777 dpavlin 641 my @out;
778     foreach my $h ( @{ $rec->{$f} } ) {
779     if (ref($h) eq 'HASH') {
780     warn "rec1 hash: ",dump($h),"\n";
781     push @out, ( _pack_subfields_hash( $h ) );
782 dpavlin 31 } else {
783 dpavlin 641 push @out, $h;
784 dpavlin 31 }
785 dpavlin 641 }
786     return @out;
787 dpavlin 536 } elsif( defined($rec->{$f}) ) {
788     return $rec->{$f};
789 dpavlin 15 }
790     }
791    
792 dpavlin 536 =head2 rec2
793 dpavlin 15
794 dpavlin 536 Return all values in specific field and subfield
795 dpavlin 13
796 dpavlin 536 @v = rec2('200','a')
797 dpavlin 13
798     =cut
799    
800 dpavlin 536 sub rec2 {
801     my $f = shift;
802     return unless (defined($rec && $rec->{$f}));
803     my $sf = shift;
804 dpavlin 601 warn "rec2($f,$sf) = ", dump( $rec->{$f} ), $/ if ($debug > 1);
805 dpavlin 589 return map {
806     if (ref($_->{$sf}) eq 'ARRAY') {
807     @{ $_->{$sf} };
808     } else {
809     $_->{$sf};
810     }
811     } grep { ref($_) eq 'HASH' && $_->{$sf} } @{ $rec->{$f} };
812 dpavlin 536 }
813 dpavlin 13
814 dpavlin 536 =head2 rec
815 dpavlin 13
816 dpavlin 536 syntaxtic sugar for
817 dpavlin 13
818 dpavlin 536 @v = rec('200')
819     @v = rec('200','a')
820 dpavlin 13
821 dpavlin 536 =cut
822 dpavlin 373
823 dpavlin 536 sub rec {
824 dpavlin 583 my @out;
825 dpavlin 536 if ($#_ == 0) {
826 dpavlin 583 @out = rec1(@_);
827 dpavlin 536 } elsif ($#_ == 1) {
828 dpavlin 583 @out = rec2(@_);
829 dpavlin 13 }
830 dpavlin 583 if (@out) {
831     return @out;
832     } else {
833     return '';
834     }
835 dpavlin 13 }
836    
837 dpavlin 536 =head2 regex
838 dpavlin 15
839 dpavlin 536 Apply regex to some or all values
840 dpavlin 15
841 dpavlin 536 @v = regex( 's/foo/bar/g', @v );
842 dpavlin 15
843     =cut
844    
845 dpavlin 536 sub regex {
846     my $r = shift;
847     my @out;
848 dpavlin 550 #warn "r: $r\n", dump(\@_);
849 dpavlin 536 foreach my $t (@_) {
850     next unless ($t);
851     eval "\$t =~ $r";
852     push @out, $t if ($t && $t ne '');
853 dpavlin 368 }
854 dpavlin 536 return @out;
855 dpavlin 15 }
856    
857 dpavlin 536 =head2 prefix
858 dpavlin 15
859 dpavlin 536 Prefix all values with a string
860 dpavlin 15
861 dpavlin 536 @v = prefix( 'my_', @v );
862 dpavlin 15
863     =cut
864    
865 dpavlin 536 sub prefix {
866 dpavlin 592 my $p = shift or return;
867 dpavlin 536 return map { $p . $_ } grep { defined($_) } @_;
868     }
869 dpavlin 15
870 dpavlin 536 =head2 suffix
871 dpavlin 15
872 dpavlin 536 suffix all values with a string
873 dpavlin 15
874 dpavlin 536 @v = suffix( '_my', @v );
875 dpavlin 15
876 dpavlin 536 =cut
877 dpavlin 15
878 dpavlin 536 sub suffix {
879     my $s = shift or die "suffix needs string as first argument";
880     return map { $_ . $s } grep { defined($_) } @_;
881 dpavlin 15 }
882    
883 dpavlin 536 =head2 surround
884 dpavlin 13
885 dpavlin 536 surround all values with a two strings
886 dpavlin 13
887 dpavlin 536 @v = surround( 'prefix_', '_suffix', @v );
888 dpavlin 13
889     =cut
890    
891 dpavlin 536 sub surround {
892     my $p = shift or die "surround need prefix as first argument";
893     my $s = shift or die "surround needs suffix as second argument";
894     return map { $p . $_ . $s } grep { defined($_) } @_;
895 dpavlin 13 }
896    
897 dpavlin 536 =head2 first
898 dpavlin 13
899 dpavlin 536 Return first element
900 dpavlin 15
901 dpavlin 536 $v = first( @v );
902 dpavlin 13
903     =cut
904    
905 dpavlin 536 sub first {
906     my $r = shift;
907     return $r;
908 dpavlin 13 }
909    
910 dpavlin 536 =head2 lookup
911 dpavlin 13
912 dpavlin 536 Consult lookup hashes for some value
913 dpavlin 13
914 dpavlin 536 @v = lookup( $v );
915     @v = lookup( @v );
916 dpavlin 13
917     =cut
918    
919 dpavlin 536 sub lookup {
920     my $k = shift or return;
921     return unless (defined($lookup->{$k}));
922     if (ref($lookup->{$k}) eq 'ARRAY') {
923     return @{ $lookup->{$k} };
924     } else {
925     return $lookup->{$k};
926     }
927 dpavlin 13 }
928    
929 dpavlin 595 =head2 config
930    
931     Consult config values stored in C<config.yml>
932    
933     # return database code (key under databases in yaml)
934     $database_code = config(); # use _ from hash
935     $database_name = config('name');
936     $database_input_name = config('input name');
937     $tag = config('input normalize tag');
938    
939     Up to three levels are supported.
940    
941     =cut
942    
943     sub config {
944     return unless ($config);
945    
946     my $p = shift;
947    
948     $p ||= '';
949    
950     my $v;
951    
952     warn "### getting config($p)\n" if ($debug > 1);
953    
954     my @p = split(/\s+/,$p);
955     if ($#p < 0) {
956     $v = $config->{ '_' }; # special, database code
957     } else {
958    
959     my $c = dclone( $config );
960    
961     foreach my $k (@p) {
962     warn "### k: $k c = ",dump($c),$/ if ($debug > 1);
963     if (ref($c) eq 'ARRAY') {
964     $c = shift @$c;
965     warn "config($p) taking first occurence of '$k', probably not what you wanted!\n";
966     last;
967     }
968    
969     if (! defined($c->{$k}) ) {
970     $c = undef;
971     last;
972     } else {
973     $c = $c->{$k};
974     }
975     }
976     $v = $c if ($c);
977    
978     }
979    
980     warn "## config( '$p' ) = ",dump( $v ),$/ if ($v && $debug);
981     warn "config( '$p' ) is empty\n" if (! $v);
982    
983     return $v;
984     }
985    
986     =head2 id
987    
988     Returns unique id of this record
989    
990     $id = id();
991    
992     Returns C<42/2> for 2nd occurence of MFN 42.
993    
994     =cut
995    
996     sub id {
997     my $mfn = $config->{_mfn} || die "no _mfn in config data";
998     return $mfn . $#{$marc_record} ? $#{$marc_record} + 1 : '';
999     }
1000    
1001 dpavlin 536 =head2 join_with
1002 dpavlin 13
1003 dpavlin 536 Joins walues with some delimiter
1004 dpavlin 10
1005 dpavlin 536 $v = join_with(", ", @v);
1006 dpavlin 10
1007 dpavlin 536 =cut
1008 dpavlin 10
1009 dpavlin 536 sub join_with {
1010     my $d = shift;
1011 dpavlin 586 warn "### join_with('$d',",dump(@_),")\n" if ($debug > 2);
1012 dpavlin 583 my $v = join($d, grep { defined($_) && $_ ne '' } @_);
1013     return '' unless defined($v);
1014     return $v;
1015 dpavlin 536 }
1016 dpavlin 10
1017 dpavlin 562 =head2 split_rec_on
1018    
1019     Split record subfield on some regex and take one of parts out
1020    
1021     $a_before_semi_column =
1022     split_rec_on('200','a', /\s*;\s*/, $part);
1023    
1024     C<$part> is optional number of element. First element is
1025     B<1>, not 0!
1026    
1027     If there is no C<$part> parameter or C<$part> is 0, this function will
1028     return all values produced by splitting.
1029    
1030     =cut
1031    
1032     sub split_rec_on {
1033     die "split_rec_on need (fld,sf,regex[,part]" if ($#_ < 2);
1034    
1035     my ($fld, $sf, $regex, $part) = @_;
1036 dpavlin 568 warn "### regex ", ref($regex), $regex, $/ if ($debug > 2);
1037 dpavlin 562
1038     my @r = rec( $fld, $sf );
1039     my $v = shift @r;
1040 dpavlin 568 warn "### first rec($fld,$sf) = ",dump($v),$/ if ($debug > 2);
1041 dpavlin 562
1042 dpavlin 604 return '' if ( ! defined($v) || $v =~ /^\s*$/);
1043 dpavlin 566
1044 dpavlin 562 my @s = split( $regex, $v );
1045 dpavlin 568 warn "## split_rec_on($fld,$sf,$regex,$part) = ",dump(@s),$/ if ($debug > 1);
1046 dpavlin 566 if ($part && $part > 0) {
1047 dpavlin 562 return $s[ $part - 1 ];
1048     } else {
1049 dpavlin 571 return @s;
1050 dpavlin 562 }
1051     }
1052    
1053 dpavlin 536 # END
1054     1;

  ViewVC Help
Powered by ViewVC 1.1.26