/[webpac2]/trunk/lib/WebPAC/Normalize.pm
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Annotation of /trunk/lib/WebPAC/Normalize.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 642 - (hide annotations)
Wed Sep 6 21:09:30 2006 UTC (17 years, 7 months ago) by dpavlin
File size: 22575 byte(s)
make it less chatty

1 dpavlin 10 package WebPAC::Normalize;
2 dpavlin 536 use Exporter 'import';
3     @EXPORT = qw/
4 dpavlin 538 _set_rec _set_lookup
5     _get_ds _clean_ds
6 dpavlin 554 _debug
7 dpavlin 641 _pack_subfields_hash
8 dpavlin 538
9 dpavlin 536 tag search display
10 dpavlin 547 marc marc_indicators marc_repeatable_subfield
11 dpavlin 564 marc_compose marc_leader
12 dpavlin 574 marc_duplicate marc_remove
13 dpavlin 604 marc_original_order
14 dpavlin 540
15 dpavlin 536 rec1 rec2 rec
16     regex prefix suffix surround
17     first lookup join_with
18 dpavlin 562
19     split_rec_on
20 dpavlin 536 /;
21 dpavlin 10
22     use warnings;
23     use strict;
24 dpavlin 536
25     #use base qw/WebPAC::Common/;
26 dpavlin 550 use Data::Dump qw/dump/;
27 dpavlin 541 use Encode qw/from_to/;
28 dpavlin 574 use Storable qw/dclone/;
29 dpavlin 10
30 dpavlin 550 # debugging warn(s)
31     my $debug = 0;
32    
33    
34 dpavlin 10 =head1 NAME
35    
36 dpavlin 536 WebPAC::Normalize - describe normalisaton rules using sets
37 dpavlin 10
38     =head1 VERSION
39    
40 dpavlin 631 Version 0.17
41 dpavlin 10
42     =cut
43    
44 dpavlin 631 our $VERSION = '0.17';
45 dpavlin 10
46     =head1 SYNOPSIS
47    
48 dpavlin 536 This module uses C<conf/normalize/*.pl> files to perform normalisation
49     from input records using perl functions which are specialized for set
50     processing.
51 dpavlin 10
52 dpavlin 536 Sets are implemented as arrays, and normalisation file is valid perl, which
53     means that you check it's validity before running WebPAC using
54     C<perl -c normalize.pl>.
55 dpavlin 15
56 dpavlin 536 Normalisation can generate multiple output normalized data. For now, supported output
57 dpavlin 540 types (on the left side of definition) are: C<tag>, C<display>, C<search> and
58 dpavlin 547 C<marc>.
59 dpavlin 15
60 dpavlin 10 =head1 FUNCTIONS
61    
62 dpavlin 538 Functions which start with C<_> are private and used by WebPAC internally.
63     All other functions are available for use within normalisation rules.
64    
65 dpavlin 536 =head2 data_structure
66 dpavlin 10
67 dpavlin 536 Return data structure
68 dpavlin 13
69 dpavlin 538 my $ds = WebPAC::Normalize::data_structure(
70 dpavlin 536 lookup => $lookup->lookup_hash,
71     row => $row,
72     rules => $normalize_pl_config,
73 dpavlin 541 marc_encoding => 'utf-8',
74 dpavlin 595 config => $config,
75 dpavlin 13 );
76    
77 dpavlin 540 Options C<lookup>, C<row>, C<rules> and C<log> are mandatory while all
78     other are optional.
79    
80 dpavlin 536 This function will B<die> if normalizastion can't be evaled.
81 dpavlin 15
82 dpavlin 538 Since this function isn't exported you have to call it with
83     C<WebPAC::Normalize::data_structure>.
84    
85 dpavlin 10 =cut
86    
87 dpavlin 536 sub data_structure {
88     my $arg = {@_};
89 dpavlin 13
90 dpavlin 536 die "need row argument" unless ($arg->{row});
91     die "need normalisation argument" unless ($arg->{rules});
92 dpavlin 31
93 dpavlin 536 no strict 'subs';
94 dpavlin 538 _set_lookup( $arg->{lookup} );
95     _set_rec( $arg->{row} );
96 dpavlin 595 _set_config( $arg->{config} );
97 dpavlin 541 _clean_ds( %{ $arg } );
98 dpavlin 536 eval "$arg->{rules}";
99     die "error evaling $arg->{rules}: $@\n" if ($@);
100 dpavlin 540
101 dpavlin 538 return _get_ds();
102 dpavlin 10 }
103    
104 dpavlin 538 =head2 _set_rec
105 dpavlin 13
106 dpavlin 536 Set current record hash
107 dpavlin 433
108 dpavlin 538 _set_rec( $rec );
109 dpavlin 433
110     =cut
111    
112 dpavlin 536 my $rec;
113 dpavlin 433
114 dpavlin 538 sub _set_rec {
115 dpavlin 536 $rec = shift or die "no record hash";
116 dpavlin 433 }
117    
118 dpavlin 595 =head2 _set_config
119    
120     Set current config hash
121    
122     _set_config( $config );
123    
124     Magic keys are:
125    
126     =over 4
127    
128     =item _
129    
130     Code of current database
131    
132     =item _mfn
133    
134     Current MFN
135    
136     =back
137    
138     =cut
139    
140     my $config;
141    
142     sub _set_config {
143     $config = shift;
144     }
145    
146 dpavlin 538 =head2 _get_ds
147    
148     Return hash formatted as data structure
149    
150     my $ds = _get_ds();
151    
152     =cut
153    
154 dpavlin 574 my ($out, $marc_record, $marc_encoding, $marc_repeatable_subfield, $marc_indicators);
155     my ($marc_record_offset, $marc_fetch_offset) = (0, 0);
156 dpavlin 538
157     sub _get_ds {
158     return $out;
159     }
160    
161     =head2 _clean_ds
162    
163     Clean data structure hash for next record
164    
165     _clean_ds();
166    
167     =cut
168    
169     sub _clean_ds {
170 dpavlin 541 my $a = {@_};
171 dpavlin 550 ($out,$marc_record, $marc_encoding, $marc_repeatable_subfield, $marc_indicators) = ();
172 dpavlin 574 ($marc_record_offset, $marc_fetch_offset) = (0,0);
173 dpavlin 541 $marc_encoding = $a->{marc_encoding};
174 dpavlin 538 }
175    
176     =head2 _set_lookup
177    
178     Set current lookup hash
179    
180     _set_lookup( $lookup );
181    
182     =cut
183    
184     my $lookup;
185    
186     sub _set_lookup {
187     $lookup = shift;
188     }
189    
190 dpavlin 547 =head2 _get_marc_fields
191 dpavlin 540
192 dpavlin 547 Get all fields defined by calls to C<marc>
193 dpavlin 540
194 dpavlin 547 $marc->add_fields( WebPAC::Normalize:_get_marc_fields() );
195 dpavlin 540
196 dpavlin 543 We are using I<magic> which detect repeatable fields only from
197     sequence of field/subfield data generated by normalization.
198    
199 dpavlin 554 Repeatable field is created when there is second occurence of same subfield or
200     if any of indicators are different.
201 dpavlin 543
202 dpavlin 554 This is sane for most cases. Something like:
203 dpavlin 543
204 dpavlin 554 900a-1 900b-1 900c-1
205     900a-2 900b-2
206     900a-3
207    
208     will be created from any combination of:
209    
210     900a-1 900a-2 900a-3 900b-1 900b-2 900c-1
211    
212     and following rules:
213    
214     marc('900','a', rec('200','a') );
215     marc('900','b', rec('200','b') );
216     marc('900','c', rec('200','c') );
217    
218     which might not be what you have in mind. If you need repeatable subfield,
219     define it using C<marc_repeatable_subfield> like this:
220    
221 dpavlin 574 marc_repeatable_subfield('900','a');
222     marc('900','a', rec('200','a') );
223     marc('900','b', rec('200','b') );
224     marc('900','c', rec('200','c') );
225 dpavlin 554
226 dpavlin 574 will create:
227    
228     900a-1 900a-2 900a-3 900b-1 900c-1
229     900b-2
230    
231     There is also support for returning next or specific using:
232    
233     while (my $mf = WebPAC::Normalize:_get_marc_fields( fetch_next => 1 ) ) {
234     # do something with $mf
235     }
236    
237     will always return fields from next MARC record or
238    
239     my $mf = WebPAC::Normalize::_get_marc_fields( offset => 42 );
240    
241     will return 42th copy record (if it exists).
242    
243 dpavlin 540 =cut
244    
245 dpavlin 547 sub _get_marc_fields {
246 dpavlin 550
247 dpavlin 574 my $arg = {@_};
248     warn "### _get_marc_fields arg: ", dump($arg), $/ if ($debug > 2);
249     my $offset = $marc_fetch_offset;
250     if ($arg->{offset}) {
251     $offset = $arg->{offset};
252     } elsif($arg->{fetch_next}) {
253     $marc_fetch_offset++;
254     }
255 dpavlin 550
256 dpavlin 574 return if (! $marc_record || ref($marc_record) ne 'ARRAY');
257    
258     warn "### full marc_record = ", dump( @{ $marc_record }), $/ if ($debug > 2);
259    
260     my $marc_rec = $marc_record->[ $offset ];
261    
262     warn "## _get_marc_fields (at offset: $offset) -- marc_record = ", dump( @$marc_rec ), $/ if ($debug > 1);
263    
264     return if (! $marc_rec || ref($marc_rec) ne 'ARRAY' || $#{ $marc_rec } < 0);
265    
266 dpavlin 550 # first, sort all existing fields
267     # XXX might not be needed, but modern perl might randomize elements in hash
268     my @sorted_marc_record = sort {
269 dpavlin 572 $a->[0] . ( $a->[3] || '' ) cmp $b->[0] . ( $b->[3] || '')
270 dpavlin 574 } @{ $marc_rec };
271 dpavlin 550
272 dpavlin 574 @sorted_marc_record = @{ $marc_rec }; ### FIXME disable sorting
273 dpavlin 562
274 dpavlin 550 # output marc fields
275 dpavlin 542 my @m;
276 dpavlin 550
277     # count unique field-subfields (used for offset when walking to next subfield)
278     my $u;
279 dpavlin 572 map { $u->{ $_->[0] . ( $_->[3] || '') }++ } @sorted_marc_record;
280 dpavlin 550
281     if ($debug) {
282 dpavlin 574 warn "## marc_repeatable_subfield = ", dump( $marc_repeatable_subfield ), $/ if ( $marc_repeatable_subfield );
283     warn "## marc_record[$offset] = ", dump( $marc_rec ), $/;
284     warn "## sorted_marc_record = ", dump( \@sorted_marc_record ), $/;
285     warn "## subfield count = ", dump( $u ), $/;
286 dpavlin 550 }
287    
288     my $len = $#sorted_marc_record;
289     my $visited;
290     my $i = 0;
291     my $field;
292    
293     foreach ( 0 .. $len ) {
294    
295     # find next element which isn't visited
296     while ($visited->{$i}) {
297     $i = ($i + 1) % ($len + 1);
298 dpavlin 542 }
299    
300 dpavlin 550 # mark it visited
301     $visited->{$i}++;
302    
303 dpavlin 574 my $row = dclone( $sorted_marc_record[$i] );
304 dpavlin 550
305     # field and subfield which is key for
306     # marc_repeatable_subfield and u
307 dpavlin 572 my $fsf = $row->[0] . ( $row->[3] || '' );
308 dpavlin 550
309     if ($debug > 1) {
310    
311     print "### field so far [", $#$field, "] : ", dump( $field ), " ", $field ? 'T' : 'F', $/;
312     print "### this [$i]: ", dump( $row ),$/;
313     print "### sf: ", $row->[3], " vs ", $field->[3],
314     $marc_repeatable_subfield->{ $row->[0] . $row->[3] } ? ' (repeatable)' : '', $/,
315     if ($#$field >= 0);
316    
317     }
318    
319     # if field exists
320     if ( $#$field >= 0 ) {
321     if (
322     $row->[0] ne $field->[0] || # field
323     $row->[1] ne $field->[1] || # i1
324     $row->[2] ne $field->[2] # i2
325     ) {
326     push @m, $field;
327     warn "## saved/1 ", dump( $field ),$/ if ($debug);
328     $field = $row;
329    
330     } elsif (
331     ( $row->[3] lt $field->[-2] ) # subfield which is not next (e.g. a after c)
332     ||
333     ( $row->[3] eq $field->[-2] && # same subfield, but not repeatable
334     ! $marc_repeatable_subfield->{ $fsf }
335     )
336     ) {
337     push @m, $field;
338     warn "## saved/2 ", dump( $field ),$/ if ($debug);
339     $field = $row;
340    
341     } else {
342     # append new subfields to existing field
343     push @$field, ( $row->[3], $row->[4] );
344     }
345     } else {
346     # insert first field
347     $field = $row;
348     }
349    
350     if (! $marc_repeatable_subfield->{ $fsf }) {
351     # make step to next subfield
352     $i = ($i + $u->{ $fsf } ) % ($len + 1);
353     }
354 dpavlin 542 }
355    
356 dpavlin 550 if ($#$field >= 0) {
357     push @m, $field;
358     warn "## saved/3 ", dump( $field ),$/ if ($debug);
359     }
360 dpavlin 542
361 dpavlin 579 return \@m;
362 dpavlin 540 }
363    
364 dpavlin 554 =head2 _debug
365    
366     Change level of debug warnings
367    
368     _debug( 2 );
369    
370     =cut
371    
372     sub _debug {
373     my $l = shift;
374     return $debug unless defined($l);
375 dpavlin 568 warn "debug level $l",$/ if ($l > 0);
376 dpavlin 554 $debug = $l;
377     }
378    
379 dpavlin 540 =head1 Functions to create C<data_structure>
380    
381     Those functions generally have to first in your normalization file.
382    
383 dpavlin 536 =head2 tag
384 dpavlin 433
385 dpavlin 536 Define new tag for I<search> and I<display>.
386 dpavlin 433
387 dpavlin 536 tag('Title', rec('200','a') );
388 dpavlin 13
389    
390     =cut
391    
392 dpavlin 536 sub tag {
393     my $name = shift or die "tag needs name as first argument";
394     my @o = grep { defined($_) && $_ ne '' } @_;
395     return unless (@o);
396     $out->{$name}->{tag} = $name;
397     $out->{$name}->{search} = \@o;
398     $out->{$name}->{display} = \@o;
399     }
400 dpavlin 13
401 dpavlin 536 =head2 display
402 dpavlin 13
403 dpavlin 536 Define tag just for I<display>
404 dpavlin 125
405 dpavlin 536 @v = display('Title', rec('200','a') );
406 dpavlin 125
407 dpavlin 536 =cut
408 dpavlin 125
409 dpavlin 536 sub display {
410     my $name = shift or die "display needs name as first argument";
411     my @o = grep { defined($_) && $_ ne '' } @_;
412     return unless (@o);
413     $out->{$name}->{tag} = $name;
414     $out->{$name}->{display} = \@o;
415     }
416 dpavlin 13
417 dpavlin 536 =head2 search
418 dpavlin 13
419 dpavlin 536 Prepare values just for I<search>
420 dpavlin 13
421 dpavlin 536 @v = search('Title', rec('200','a') );
422 dpavlin 433
423 dpavlin 536 =cut
424 dpavlin 13
425 dpavlin 536 sub search {
426     my $name = shift or die "search needs name as first argument";
427     my @o = grep { defined($_) && $_ ne '' } @_;
428     return unless (@o);
429     $out->{$name}->{tag} = $name;
430     $out->{$name}->{search} = \@o;
431 dpavlin 13 }
432    
433 dpavlin 564 =head2 marc_leader
434    
435     Setup fields within MARC leader or get leader
436    
437     marc_leader('05','c');
438     my $leader = marc_leader();
439    
440     =cut
441    
442     sub marc_leader {
443     my ($offset,$value) = @_;
444    
445     if ($offset) {
446     $out->{' leader'}->{ $offset } = $value;
447     } else {
448     return $out->{' leader'};
449     }
450     }
451    
452 dpavlin 547 =head2 marc
453 dpavlin 540
454     Save value for MARC field
455    
456 dpavlin 547 marc('900','a', rec('200','a') );
457 dpavlin 571 marc('001', rec('000') );
458 dpavlin 540
459     =cut
460    
461 dpavlin 547 sub marc {
462     my $f = shift or die "marc needs field";
463     die "marc field must be numer" unless ($f =~ /^\d+$/);
464 dpavlin 540
465 dpavlin 571 my $sf;
466     if ($f >= 10) {
467     $sf = shift or die "marc needs subfield";
468     }
469 dpavlin 540
470 dpavlin 541 foreach (@_) {
471     my $v = $_; # make var read-write for Encode
472 dpavlin 543 next unless (defined($v) && $v !~ /^\s*$/);
473 dpavlin 541 from_to($v, 'iso-8859-2', $marc_encoding) if ($marc_encoding);
474 dpavlin 548 my ($i1,$i2) = defined($marc_indicators->{$f}) ? @{ $marc_indicators->{$f} } : (' ',' ');
475 dpavlin 571 if (defined $sf) {
476 dpavlin 574 push @{ $marc_record->[ $marc_record_offset ] }, [ $f, $i1, $i2, $sf => $v ];
477 dpavlin 571 } else {
478 dpavlin 574 push @{ $marc_record->[ $marc_record_offset ] }, [ $f, $v ];
479 dpavlin 571 }
480 dpavlin 540 }
481     }
482    
483 dpavlin 547 =head2 marc_repeatable_subfield
484    
485     Save values for MARC repetable subfield
486    
487     marc_repeatable_subfield('910', 'z', rec('909') );
488    
489     =cut
490    
491     sub marc_repeatable_subfield {
492 dpavlin 550 my ($f,$sf) = @_;
493     die "marc_repeatable_subfield need field and subfield!\n" unless ($f && $sf);
494     $marc_repeatable_subfield->{ $f . $sf }++;
495 dpavlin 547 marc(@_);
496     }
497    
498     =head2 marc_indicators
499    
500     Set both indicators for MARC field
501    
502     marc_indicators('900', ' ', 1);
503    
504     Any indicator value other than C<0-9> will be treated as undefined.
505    
506     =cut
507    
508     sub marc_indicators {
509     my $f = shift || die "marc_indicators need field!\n";
510     my ($i1,$i2) = @_;
511     die "marc_indicators($f, ...) need i1!\n" unless(defined($i1));
512     die "marc_indicators($f, $i1, ...) need i2!\n" unless(defined($i2));
513    
514     $i1 = ' ' if ($i1 !~ /^\d$/);
515     $i2 = ' ' if ($i2 !~ /^\d$/);
516 dpavlin 548 @{ $marc_indicators->{$f} } = ($i1,$i2);
517 dpavlin 547 }
518    
519 dpavlin 562 =head2 marc_compose
520 dpavlin 547
521 dpavlin 562 Save values for each MARC subfield explicitly
522    
523     marc_compose('900',
524     'a', rec('200','a')
525     'b', rec('201','a')
526     'a', rec('200','b')
527     'c', rec('200','c')
528     );
529    
530 dpavlin 603 If you specify C<+> for subfield, value will be appended
531     to previous defined subfield.
532    
533 dpavlin 562 =cut
534    
535     sub marc_compose {
536     my $f = shift or die "marc_compose needs field";
537     die "marc_compose field must be numer" unless ($f =~ /^\d+$/);
538    
539     my ($i1,$i2) = defined($marc_indicators->{$f}) ? @{ $marc_indicators->{$f} } : (' ',' ');
540     my $m = [ $f, $i1, $i2 ];
541    
542 dpavlin 583 warn "### marc_compose input subfields = ", dump(@_),$/ if ($debug > 2);
543    
544 dpavlin 619 if ($#_ % 2 != 1) {
545     die "ERROR: marc_compose",dump($f,@_)," not valid (must be even).\nDo you need to add first() or join() around some argument?\n";
546     }
547    
548 dpavlin 562 while (@_) {
549 dpavlin 619 my $sf = shift;
550 dpavlin 565 my $v = shift;
551 dpavlin 562
552     next unless (defined($v) && $v !~ /^\s*$/);
553     from_to($v, 'iso-8859-2', $marc_encoding) if ($marc_encoding);
554 dpavlin 568 warn "## ++ marc_compose($f,$sf,$v) ", dump( $m ),$/ if ($debug > 1);
555 dpavlin 603 if ($sf ne '+') {
556     push @$m, ( $sf, $v );
557     } else {
558     $m->[ $#$m ] .= $v;
559     }
560 dpavlin 562 }
561    
562 dpavlin 586 warn "## marc_compose current marc = ", dump( $m ),$/ if ($debug > 1);
563 dpavlin 562
564 dpavlin 574 push @{ $marc_record->[ $marc_record_offset ] }, $m if ($#{$m} > 2);
565 dpavlin 562 }
566    
567 dpavlin 574 =head2 marc_duplicate
568 dpavlin 562
569 dpavlin 574 Generate copy of current MARC record and continue working on copy
570    
571     marc_duplicate();
572    
573     Copies can be accessed using C<< _get_marc_fields( fetch_next => 1 ) >> or
574     C<< _get_marc_fields( offset => 42 ) >>.
575    
576     =cut
577    
578     sub marc_duplicate {
579     my $m = $marc_record->[ -1 ];
580     die "can't duplicate record which isn't defined" unless ($m);
581     push @{ $marc_record }, dclone( $m );
582     warn "## marc_duplicate = ", dump(@$marc_record), $/ if ($debug > 1);
583     $marc_record_offset = $#{ $marc_record };
584     warn "## marc_record_offset = $marc_record_offset", $/ if ($debug > 1);
585     }
586    
587     =head2 marc_remove
588    
589     Remove some field or subfield from MARC record.
590    
591     marc_remove('200');
592     marc_remove('200','a');
593    
594     This will erase field C<200> or C<200^a> from current MARC record.
595    
596     This is useful after calling C<marc_duplicate> or on it's own (but, you
597     should probably just remove that subfield definition if you are not
598     using C<marc_duplicate>).
599    
600     FIXME: support fields < 10.
601    
602     =cut
603    
604     sub marc_remove {
605     my ($f, $sf) = @_;
606    
607     die "marc_remove needs record number" unless defined($f);
608    
609     my $marc = $marc_record->[ $marc_record_offset ];
610    
611     warn "### marc_remove before = ", dump( $marc ), $/ if ($debug > 2);
612    
613 dpavlin 579 my $i = 0;
614     foreach ( 0 .. $#{ $marc } ) {
615 dpavlin 574 last unless (defined $marc->[$i]);
616     warn "#### working on ",dump( @{ $marc->[$i] }), $/ if ($debug > 3);
617     if ($marc->[$i]->[0] eq $f) {
618     if (! defined $sf) {
619     # remove whole field
620     splice @$marc, $i, 1;
621     warn "#### slice \@\$marc, $i, 1 = ",dump( @{ $marc }), $/ if ($debug > 3);
622     $i--;
623     } else {
624     foreach my $j ( 0 .. (( $#{ $marc->[$i] } - 3 ) / 2) ) {
625     my $o = ($j * 2) + 3;
626     if ($marc->[$i]->[$o] eq $sf) {
627     # remove subfield
628     splice @{$marc->[$i]}, $o, 2;
629     warn "#### slice \@{\$marc->[$i]}, $o, 2 = ", dump( @{ $marc }), $/ if ($debug > 3);
630     # is record now empty?
631     if ($#{ $marc->[$i] } == 2) {
632     splice @$marc, $i, 1;
633     warn "#### slice \@\$marc, $i, 1 = ", dump( @{ $marc }), $/ if ($debug > 3);
634     $i--;
635     };
636     }
637     }
638     }
639     }
640 dpavlin 579 $i++;
641 dpavlin 574 }
642    
643     warn "### marc_remove($f", $sf ? ",$sf" : "", ") after = ", dump( $marc ), $/ if ($debug > 2);
644    
645     $marc_record->[ $marc_record_offset ] = $marc;
646    
647     warn "## full marc_record = ", dump( @{ $marc_record }), $/ if ($debug > 1);
648     }
649    
650 dpavlin 604 =head2 marc_original_order
651    
652     Copy all subfields preserving original order to marc field.
653    
654 dpavlin 616 marc_original_order( marc_field_number, original_input_field_number );
655 dpavlin 604
656 dpavlin 616 Please note that field numbers are consistent with other commands (marc
657     field number first), but somewhat counter-intuitive (destination and then
658     source).
659    
660 dpavlin 604 You might want to use this command if you are just renaming subfields or
661     using pre-processing modify_record in C<config.yml> and don't need any
662     post-processing or want to preserve order of original subfields.
663    
664 dpavlin 616
665 dpavlin 604 =cut
666    
667     sub marc_original_order {
668    
669 dpavlin 616 my ($to, $from) = @_;
670 dpavlin 604 die "marc_original_order needs from and to fields\n" unless ($from && $to);
671    
672 dpavlin 616 return unless defined($rec->{$from});
673    
674     my $r = $rec->{$from};
675 dpavlin 604 die "record field $from isn't array\n" unless (ref($r) eq 'ARRAY');
676    
677     my ($i1,$i2) = defined($marc_indicators->{$to}) ? @{ $marc_indicators->{$to} } : (' ',' ');
678 dpavlin 616 warn "## marc_original_order($to,$from) source = ", dump( $r ),$/ if ($debug > 1);
679 dpavlin 604
680     foreach my $d (@$r) {
681    
682 dpavlin 605 if (! defined($d->{subfields}) && ref($d->{subfields}) ne 'ARRAY') {
683 dpavlin 616 warn "# marc_original_order($to,$from): field $from doesn't have subfields specification\n";
684 dpavlin 605 next;
685     }
686    
687 dpavlin 604 my @sfs = @{ $d->{subfields} };
688    
689     die "field $from doesn't have even number of subfields specifications\n" unless($#sfs % 2 == 1);
690    
691 dpavlin 618 warn "#--> d: ",dump($d), "\n#--> sfs: ",dump(@sfs),$/ if ($debug > 2);
692 dpavlin 604
693     my $m = [ $to, $i1, $i2 ];
694    
695     while (my $sf = shift @sfs) {
696 dpavlin 618
697     warn "#--> sf: ",dump($sf), $/ if ($debug > 2);
698 dpavlin 604 my $offset = shift @sfs;
699     die "corrupted sufields specification for field $from\n" unless defined($offset);
700    
701     my $v;
702     if (ref($d->{$sf}) eq 'ARRAY') {
703     $v = $d->{$sf}->[$offset] if (defined($d->{$sf}->[$offset]));
704     } elsif ($offset == 0) {
705     $v = $d->{$sf};
706     } else {
707     die "field $from subfield '$sf' need occurence $offset which doesn't exist", dump($d->{$sf});
708     }
709     push @$m, ( $sf, $v ) if (defined($v));
710     }
711    
712     if ($#{$m} > 2) {
713     push @{ $marc_record->[ $marc_record_offset ] }, $m;
714     }
715     }
716    
717     warn "## marc_record = ", dump( $marc_record ),$/ if ($debug > 1);
718     }
719    
720    
721 dpavlin 540 =head1 Functions to extract data from input
722    
723     This function should be used inside functions to create C<data_structure> described
724     above.
725    
726 dpavlin 641 =head2 _pack_subfields_hash
727    
728     @values = _pack_subfields_hash( $h, $include_subfields )
729    
730     =cut
731    
732     sub _pack_subfields_hash {
733    
734 dpavlin 642 warn "## _pack_subfields_hash( ",dump(@_), " )\n" if ($debug > 1);
735 dpavlin 641
736     my ($h,$include_subfields) = @_;
737    
738    
739     if ( defined($h->{subfields}) ) {
740     my $sfs = delete $h->{subfields} || die "no subfields?";
741     my @out;
742     while (@$sfs) {
743     my $sf = shift @$sfs;
744     push @out, '^' . $sf if ($include_subfields);
745     my $o = shift @$sfs;
746     if ($o == 0 && ref( $h->{$sf} ) ne 'ARRAY' ) {
747     # single element subfields are not arrays
748     push @out, $h->{$sf};
749     } else {
750     #warn "====> $f $sf $o $#$sfs ", dump( $sfs ), "\n";
751     push @out, $h->{$sf}->[$o];
752     }
753     }
754     return @out;
755     } else {
756     # FIXME this should probably be in alphabetical order instead of hash order
757     values %{$h};
758     }
759     }
760    
761 dpavlin 536 =head2 rec1
762 dpavlin 371
763 dpavlin 536 Return all values in some field
764 dpavlin 371
765 dpavlin 536 @v = rec1('200')
766 dpavlin 15
767 dpavlin 536 TODO: order of values is probably same as in source data, need to investigate that
768 dpavlin 15
769 dpavlin 536 =cut
770 dpavlin 15
771 dpavlin 536 sub rec1 {
772     my $f = shift;
773 dpavlin 571 warn "rec1($f) = ", dump( $rec->{$f} ), $/ if ($debug > 1);
774 dpavlin 536 return unless (defined($rec) && defined($rec->{$f}));
775 dpavlin 571 warn "rec1($f) = ", dump( $rec->{$f} ), $/ if ($debug > 1);
776 dpavlin 536 if (ref($rec->{$f}) eq 'ARRAY') {
777 dpavlin 641 my @out;
778     foreach my $h ( @{ $rec->{$f} } ) {
779     if (ref($h) eq 'HASH') {
780     push @out, ( _pack_subfields_hash( $h ) );
781 dpavlin 31 } else {
782 dpavlin 641 push @out, $h;
783 dpavlin 31 }
784 dpavlin 641 }
785     return @out;
786 dpavlin 536 } elsif( defined($rec->{$f}) ) {
787     return $rec->{$f};
788 dpavlin 15 }
789     }
790    
791 dpavlin 536 =head2 rec2
792 dpavlin 15
793 dpavlin 536 Return all values in specific field and subfield
794 dpavlin 13
795 dpavlin 536 @v = rec2('200','a')
796 dpavlin 13
797     =cut
798    
799 dpavlin 536 sub rec2 {
800     my $f = shift;
801     return unless (defined($rec && $rec->{$f}));
802     my $sf = shift;
803 dpavlin 601 warn "rec2($f,$sf) = ", dump( $rec->{$f} ), $/ if ($debug > 1);
804 dpavlin 589 return map {
805     if (ref($_->{$sf}) eq 'ARRAY') {
806     @{ $_->{$sf} };
807     } else {
808     $_->{$sf};
809     }
810     } grep { ref($_) eq 'HASH' && $_->{$sf} } @{ $rec->{$f} };
811 dpavlin 536 }
812 dpavlin 13
813 dpavlin 536 =head2 rec
814 dpavlin 13
815 dpavlin 536 syntaxtic sugar for
816 dpavlin 13
817 dpavlin 536 @v = rec('200')
818     @v = rec('200','a')
819 dpavlin 13
820 dpavlin 536 =cut
821 dpavlin 373
822 dpavlin 536 sub rec {
823 dpavlin 583 my @out;
824 dpavlin 536 if ($#_ == 0) {
825 dpavlin 583 @out = rec1(@_);
826 dpavlin 536 } elsif ($#_ == 1) {
827 dpavlin 583 @out = rec2(@_);
828 dpavlin 13 }
829 dpavlin 583 if (@out) {
830     return @out;
831     } else {
832     return '';
833     }
834 dpavlin 13 }
835    
836 dpavlin 536 =head2 regex
837 dpavlin 15
838 dpavlin 536 Apply regex to some or all values
839 dpavlin 15
840 dpavlin 536 @v = regex( 's/foo/bar/g', @v );
841 dpavlin 15
842     =cut
843    
844 dpavlin 536 sub regex {
845     my $r = shift;
846     my @out;
847 dpavlin 550 #warn "r: $r\n", dump(\@_);
848 dpavlin 536 foreach my $t (@_) {
849     next unless ($t);
850     eval "\$t =~ $r";
851     push @out, $t if ($t && $t ne '');
852 dpavlin 368 }
853 dpavlin 536 return @out;
854 dpavlin 15 }
855    
856 dpavlin 536 =head2 prefix
857 dpavlin 15
858 dpavlin 536 Prefix all values with a string
859 dpavlin 15
860 dpavlin 536 @v = prefix( 'my_', @v );
861 dpavlin 15
862     =cut
863    
864 dpavlin 536 sub prefix {
865 dpavlin 592 my $p = shift or return;
866 dpavlin 536 return map { $p . $_ } grep { defined($_) } @_;
867     }
868 dpavlin 15
869 dpavlin 536 =head2 suffix
870 dpavlin 15
871 dpavlin 536 suffix all values with a string
872 dpavlin 15
873 dpavlin 536 @v = suffix( '_my', @v );
874 dpavlin 15
875 dpavlin 536 =cut
876 dpavlin 15
877 dpavlin 536 sub suffix {
878     my $s = shift or die "suffix needs string as first argument";
879     return map { $_ . $s } grep { defined($_) } @_;
880 dpavlin 15 }
881    
882 dpavlin 536 =head2 surround
883 dpavlin 13
884 dpavlin 536 surround all values with a two strings
885 dpavlin 13
886 dpavlin 536 @v = surround( 'prefix_', '_suffix', @v );
887 dpavlin 13
888     =cut
889    
890 dpavlin 536 sub surround {
891     my $p = shift or die "surround need prefix as first argument";
892     my $s = shift or die "surround needs suffix as second argument";
893     return map { $p . $_ . $s } grep { defined($_) } @_;
894 dpavlin 13 }
895    
896 dpavlin 536 =head2 first
897 dpavlin 13
898 dpavlin 536 Return first element
899 dpavlin 15
900 dpavlin 536 $v = first( @v );
901 dpavlin 13
902     =cut
903    
904 dpavlin 536 sub first {
905     my $r = shift;
906     return $r;
907 dpavlin 13 }
908    
909 dpavlin 536 =head2 lookup
910 dpavlin 13
911 dpavlin 536 Consult lookup hashes for some value
912 dpavlin 13
913 dpavlin 536 @v = lookup( $v );
914     @v = lookup( @v );
915 dpavlin 13
916     =cut
917    
918 dpavlin 536 sub lookup {
919     my $k = shift or return;
920     return unless (defined($lookup->{$k}));
921     if (ref($lookup->{$k}) eq 'ARRAY') {
922     return @{ $lookup->{$k} };
923     } else {
924     return $lookup->{$k};
925     }
926 dpavlin 13 }
927    
928 dpavlin 595 =head2 config
929    
930     Consult config values stored in C<config.yml>
931    
932     # return database code (key under databases in yaml)
933     $database_code = config(); # use _ from hash
934     $database_name = config('name');
935     $database_input_name = config('input name');
936     $tag = config('input normalize tag');
937    
938     Up to three levels are supported.
939    
940     =cut
941    
942     sub config {
943     return unless ($config);
944    
945     my $p = shift;
946    
947     $p ||= '';
948    
949     my $v;
950    
951     warn "### getting config($p)\n" if ($debug > 1);
952    
953     my @p = split(/\s+/,$p);
954     if ($#p < 0) {
955     $v = $config->{ '_' }; # special, database code
956     } else {
957    
958     my $c = dclone( $config );
959    
960     foreach my $k (@p) {
961     warn "### k: $k c = ",dump($c),$/ if ($debug > 1);
962     if (ref($c) eq 'ARRAY') {
963     $c = shift @$c;
964     warn "config($p) taking first occurence of '$k', probably not what you wanted!\n";
965     last;
966     }
967    
968     if (! defined($c->{$k}) ) {
969     $c = undef;
970     last;
971     } else {
972     $c = $c->{$k};
973     }
974     }
975     $v = $c if ($c);
976    
977     }
978    
979     warn "## config( '$p' ) = ",dump( $v ),$/ if ($v && $debug);
980     warn "config( '$p' ) is empty\n" if (! $v);
981    
982     return $v;
983     }
984    
985     =head2 id
986    
987     Returns unique id of this record
988    
989     $id = id();
990    
991     Returns C<42/2> for 2nd occurence of MFN 42.
992    
993     =cut
994    
995     sub id {
996     my $mfn = $config->{_mfn} || die "no _mfn in config data";
997     return $mfn . $#{$marc_record} ? $#{$marc_record} + 1 : '';
998     }
999    
1000 dpavlin 536 =head2 join_with
1001 dpavlin 13
1002 dpavlin 536 Joins walues with some delimiter
1003 dpavlin 10
1004 dpavlin 536 $v = join_with(", ", @v);
1005 dpavlin 10
1006 dpavlin 536 =cut
1007 dpavlin 10
1008 dpavlin 536 sub join_with {
1009     my $d = shift;
1010 dpavlin 586 warn "### join_with('$d',",dump(@_),")\n" if ($debug > 2);
1011 dpavlin 583 my $v = join($d, grep { defined($_) && $_ ne '' } @_);
1012     return '' unless defined($v);
1013     return $v;
1014 dpavlin 536 }
1015 dpavlin 10
1016 dpavlin 562 =head2 split_rec_on
1017    
1018     Split record subfield on some regex and take one of parts out
1019    
1020     $a_before_semi_column =
1021     split_rec_on('200','a', /\s*;\s*/, $part);
1022    
1023     C<$part> is optional number of element. First element is
1024     B<1>, not 0!
1025    
1026     If there is no C<$part> parameter or C<$part> is 0, this function will
1027     return all values produced by splitting.
1028    
1029     =cut
1030    
1031     sub split_rec_on {
1032     die "split_rec_on need (fld,sf,regex[,part]" if ($#_ < 2);
1033    
1034     my ($fld, $sf, $regex, $part) = @_;
1035 dpavlin 568 warn "### regex ", ref($regex), $regex, $/ if ($debug > 2);
1036 dpavlin 562
1037     my @r = rec( $fld, $sf );
1038     my $v = shift @r;
1039 dpavlin 568 warn "### first rec($fld,$sf) = ",dump($v),$/ if ($debug > 2);
1040 dpavlin 562
1041 dpavlin 604 return '' if ( ! defined($v) || $v =~ /^\s*$/);
1042 dpavlin 566
1043 dpavlin 562 my @s = split( $regex, $v );
1044 dpavlin 568 warn "## split_rec_on($fld,$sf,$regex,$part) = ",dump(@s),$/ if ($debug > 1);
1045 dpavlin 566 if ($part && $part > 0) {
1046 dpavlin 562 return $s[ $part - 1 ];
1047     } else {
1048 dpavlin 571 return @s;
1049 dpavlin 562 }
1050     }
1051    
1052 dpavlin 536 # END
1053     1;

  ViewVC Help
Powered by ViewVC 1.1.26