/[webpac2]/trunk/lib/WebPAC/Normalize.pm
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Annotation of /trunk/lib/WebPAC/Normalize.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 721 - (hide annotations)
Fri Sep 29 12:27:47 2006 UTC (16 years, 4 months ago) by dpavlin
File size: 24040 byte(s)
 r1035@llin:  dpavlin | 2006-09-29 14:25:03 +0200
 moved warn to debug 1

1 dpavlin 10 package WebPAC::Normalize;
2 dpavlin 536 use Exporter 'import';
3     @EXPORT = qw/
4 dpavlin 538 _set_rec _set_lookup
5     _get_ds _clean_ds
6 dpavlin 554 _debug
7 dpavlin 641 _pack_subfields_hash
8 dpavlin 538
9 dpavlin 536 tag search display
10 dpavlin 547 marc marc_indicators marc_repeatable_subfield
11 dpavlin 564 marc_compose marc_leader
12 dpavlin 574 marc_duplicate marc_remove
13 dpavlin 604 marc_original_order
14 dpavlin 540
15 dpavlin 536 rec1 rec2 rec
16     regex prefix suffix surround
17     first lookup join_with
18 dpavlin 707 save_into_lookup
19 dpavlin 562
20     split_rec_on
21 dpavlin 536 /;
22 dpavlin 10
23     use warnings;
24     use strict;
25 dpavlin 536
26     #use base qw/WebPAC::Common/;
27 dpavlin 550 use Data::Dump qw/dump/;
28 dpavlin 574 use Storable qw/dclone/;
29 dpavlin 10
30 dpavlin 550 # debugging warn(s)
31     my $debug = 0;
32    
33    
34 dpavlin 10 =head1 NAME
35    
36 dpavlin 536 WebPAC::Normalize - describe normalisaton rules using sets
37 dpavlin 10
38     =head1 VERSION
39    
40 dpavlin 707 Version 0.20
41 dpavlin 10
42     =cut
43    
44 dpavlin 707 our $VERSION = '0.20';
45 dpavlin 10
46     =head1 SYNOPSIS
47    
48 dpavlin 536 This module uses C<conf/normalize/*.pl> files to perform normalisation
49     from input records using perl functions which are specialized for set
50     processing.
51 dpavlin 10
52 dpavlin 536 Sets are implemented as arrays, and normalisation file is valid perl, which
53     means that you check it's validity before running WebPAC using
54     C<perl -c normalize.pl>.
55 dpavlin 15
56 dpavlin 536 Normalisation can generate multiple output normalized data. For now, supported output
57 dpavlin 540 types (on the left side of definition) are: C<tag>, C<display>, C<search> and
58 dpavlin 547 C<marc>.
59 dpavlin 15
60 dpavlin 10 =head1 FUNCTIONS
61    
62 dpavlin 538 Functions which start with C<_> are private and used by WebPAC internally.
63     All other functions are available for use within normalisation rules.
64    
65 dpavlin 536 =head2 data_structure
66 dpavlin 10
67 dpavlin 536 Return data structure
68 dpavlin 13
69 dpavlin 538 my $ds = WebPAC::Normalize::data_structure(
70 dpavlin 707 lookup => $lookup_variable,
71 dpavlin 536 row => $row,
72     rules => $normalize_pl_config,
73 dpavlin 541 marc_encoding => 'utf-8',
74 dpavlin 595 config => $config,
75 dpavlin 13 );
76    
77 dpavlin 707 Options C<row>, C<rules> and C<log> are mandatory while all
78 dpavlin 540 other are optional.
79    
80 dpavlin 536 This function will B<die> if normalizastion can't be evaled.
81 dpavlin 15
82 dpavlin 538 Since this function isn't exported you have to call it with
83     C<WebPAC::Normalize::data_structure>.
84    
85 dpavlin 10 =cut
86    
87 dpavlin 536 sub data_structure {
88     my $arg = {@_};
89 dpavlin 13
90 dpavlin 536 die "need row argument" unless ($arg->{row});
91     die "need normalisation argument" unless ($arg->{rules});
92 dpavlin 31
93 dpavlin 536 no strict 'subs';
94 dpavlin 707 _set_lookup( $arg->{lookup} ) if (defined( $arg->{lookup} ));
95 dpavlin 538 _set_rec( $arg->{row} );
96 dpavlin 707 _set_config( $arg->{config} ) if (defined( $arg->{config} ));
97 dpavlin 541 _clean_ds( %{ $arg } );
98 dpavlin 536 eval "$arg->{rules}";
99     die "error evaling $arg->{rules}: $@\n" if ($@);
100 dpavlin 540
101 dpavlin 538 return _get_ds();
102 dpavlin 10 }
103    
104 dpavlin 538 =head2 _set_rec
105 dpavlin 13
106 dpavlin 536 Set current record hash
107 dpavlin 433
108 dpavlin 538 _set_rec( $rec );
109 dpavlin 433
110     =cut
111    
112 dpavlin 536 my $rec;
113 dpavlin 433
114 dpavlin 538 sub _set_rec {
115 dpavlin 536 $rec = shift or die "no record hash";
116 dpavlin 433 }
117    
118 dpavlin 595 =head2 _set_config
119    
120     Set current config hash
121    
122     _set_config( $config );
123    
124     Magic keys are:
125    
126     =over 4
127    
128     =item _
129    
130     Code of current database
131    
132     =item _mfn
133    
134     Current MFN
135    
136     =back
137    
138     =cut
139    
140     my $config;
141    
142     sub _set_config {
143     $config = shift;
144     }
145    
146 dpavlin 538 =head2 _get_ds
147    
148     Return hash formatted as data structure
149    
150     my $ds = _get_ds();
151    
152     =cut
153    
154 dpavlin 574 my ($out, $marc_record, $marc_encoding, $marc_repeatable_subfield, $marc_indicators);
155     my ($marc_record_offset, $marc_fetch_offset) = (0, 0);
156 dpavlin 538
157     sub _get_ds {
158     return $out;
159     }
160    
161     =head2 _clean_ds
162    
163     Clean data structure hash for next record
164    
165     _clean_ds();
166    
167     =cut
168    
169     sub _clean_ds {
170 dpavlin 541 my $a = {@_};
171 dpavlin 550 ($out,$marc_record, $marc_encoding, $marc_repeatable_subfield, $marc_indicators) = ();
172 dpavlin 574 ($marc_record_offset, $marc_fetch_offset) = (0,0);
173 dpavlin 541 $marc_encoding = $a->{marc_encoding};
174 dpavlin 538 }
175    
176     =head2 _set_lookup
177    
178     Set current lookup hash
179    
180     _set_lookup( $lookup );
181    
182     =cut
183    
184     my $lookup;
185    
186     sub _set_lookup {
187     $lookup = shift;
188     }
189    
190 dpavlin 707 =head2 _get_lookup
191    
192     Get current lookup hash
193    
194     my $lookup = _get_lookup();
195    
196     =cut
197    
198     sub _get_lookup {
199     return $lookup;
200     }
201    
202 dpavlin 547 =head2 _get_marc_fields
203 dpavlin 540
204 dpavlin 547 Get all fields defined by calls to C<marc>
205 dpavlin 540
206 dpavlin 547 $marc->add_fields( WebPAC::Normalize:_get_marc_fields() );
207 dpavlin 540
208 dpavlin 543 We are using I<magic> which detect repeatable fields only from
209     sequence of field/subfield data generated by normalization.
210    
211 dpavlin 554 Repeatable field is created when there is second occurence of same subfield or
212     if any of indicators are different.
213 dpavlin 543
214 dpavlin 554 This is sane for most cases. Something like:
215 dpavlin 543
216 dpavlin 554 900a-1 900b-1 900c-1
217     900a-2 900b-2
218     900a-3
219    
220     will be created from any combination of:
221    
222     900a-1 900a-2 900a-3 900b-1 900b-2 900c-1
223    
224     and following rules:
225    
226     marc('900','a', rec('200','a') );
227     marc('900','b', rec('200','b') );
228     marc('900','c', rec('200','c') );
229    
230     which might not be what you have in mind. If you need repeatable subfield,
231     define it using C<marc_repeatable_subfield> like this:
232    
233 dpavlin 574 marc_repeatable_subfield('900','a');
234     marc('900','a', rec('200','a') );
235     marc('900','b', rec('200','b') );
236     marc('900','c', rec('200','c') );
237 dpavlin 554
238 dpavlin 574 will create:
239    
240     900a-1 900a-2 900a-3 900b-1 900c-1
241     900b-2
242    
243     There is also support for returning next or specific using:
244    
245     while (my $mf = WebPAC::Normalize:_get_marc_fields( fetch_next => 1 ) ) {
246     # do something with $mf
247     }
248    
249     will always return fields from next MARC record or
250    
251     my $mf = WebPAC::Normalize::_get_marc_fields( offset => 42 );
252    
253     will return 42th copy record (if it exists).
254    
255 dpavlin 540 =cut
256    
257 dpavlin 547 sub _get_marc_fields {
258 dpavlin 550
259 dpavlin 574 my $arg = {@_};
260     warn "### _get_marc_fields arg: ", dump($arg), $/ if ($debug > 2);
261     my $offset = $marc_fetch_offset;
262     if ($arg->{offset}) {
263     $offset = $arg->{offset};
264     } elsif($arg->{fetch_next}) {
265     $marc_fetch_offset++;
266     }
267 dpavlin 550
268 dpavlin 574 return if (! $marc_record || ref($marc_record) ne 'ARRAY');
269    
270     warn "### full marc_record = ", dump( @{ $marc_record }), $/ if ($debug > 2);
271    
272     my $marc_rec = $marc_record->[ $offset ];
273    
274     warn "## _get_marc_fields (at offset: $offset) -- marc_record = ", dump( @$marc_rec ), $/ if ($debug > 1);
275    
276     return if (! $marc_rec || ref($marc_rec) ne 'ARRAY' || $#{ $marc_rec } < 0);
277    
278 dpavlin 550 # first, sort all existing fields
279     # XXX might not be needed, but modern perl might randomize elements in hash
280     my @sorted_marc_record = sort {
281 dpavlin 572 $a->[0] . ( $a->[3] || '' ) cmp $b->[0] . ( $b->[3] || '')
282 dpavlin 574 } @{ $marc_rec };
283 dpavlin 550
284 dpavlin 574 @sorted_marc_record = @{ $marc_rec }; ### FIXME disable sorting
285 dpavlin 562
286 dpavlin 550 # output marc fields
287 dpavlin 542 my @m;
288 dpavlin 550
289     # count unique field-subfields (used for offset when walking to next subfield)
290     my $u;
291 dpavlin 572 map { $u->{ $_->[0] . ( $_->[3] || '') }++ } @sorted_marc_record;
292 dpavlin 550
293     if ($debug) {
294 dpavlin 574 warn "## marc_repeatable_subfield = ", dump( $marc_repeatable_subfield ), $/ if ( $marc_repeatable_subfield );
295     warn "## marc_record[$offset] = ", dump( $marc_rec ), $/;
296     warn "## sorted_marc_record = ", dump( \@sorted_marc_record ), $/;
297     warn "## subfield count = ", dump( $u ), $/;
298 dpavlin 550 }
299    
300     my $len = $#sorted_marc_record;
301     my $visited;
302     my $i = 0;
303     my $field;
304    
305     foreach ( 0 .. $len ) {
306    
307     # find next element which isn't visited
308     while ($visited->{$i}) {
309     $i = ($i + 1) % ($len + 1);
310 dpavlin 542 }
311    
312 dpavlin 550 # mark it visited
313     $visited->{$i}++;
314    
315 dpavlin 574 my $row = dclone( $sorted_marc_record[$i] );
316 dpavlin 550
317     # field and subfield which is key for
318     # marc_repeatable_subfield and u
319 dpavlin 572 my $fsf = $row->[0] . ( $row->[3] || '' );
320 dpavlin 550
321     if ($debug > 1) {
322    
323     print "### field so far [", $#$field, "] : ", dump( $field ), " ", $field ? 'T' : 'F', $/;
324     print "### this [$i]: ", dump( $row ),$/;
325     print "### sf: ", $row->[3], " vs ", $field->[3],
326     $marc_repeatable_subfield->{ $row->[0] . $row->[3] } ? ' (repeatable)' : '', $/,
327     if ($#$field >= 0);
328    
329     }
330    
331     # if field exists
332     if ( $#$field >= 0 ) {
333     if (
334     $row->[0] ne $field->[0] || # field
335     $row->[1] ne $field->[1] || # i1
336     $row->[2] ne $field->[2] # i2
337     ) {
338     push @m, $field;
339     warn "## saved/1 ", dump( $field ),$/ if ($debug);
340     $field = $row;
341    
342     } elsif (
343     ( $row->[3] lt $field->[-2] ) # subfield which is not next (e.g. a after c)
344     ||
345     ( $row->[3] eq $field->[-2] && # same subfield, but not repeatable
346     ! $marc_repeatable_subfield->{ $fsf }
347     )
348     ) {
349     push @m, $field;
350     warn "## saved/2 ", dump( $field ),$/ if ($debug);
351     $field = $row;
352    
353     } else {
354     # append new subfields to existing field
355     push @$field, ( $row->[3], $row->[4] );
356     }
357     } else {
358     # insert first field
359     $field = $row;
360     }
361    
362     if (! $marc_repeatable_subfield->{ $fsf }) {
363     # make step to next subfield
364     $i = ($i + $u->{ $fsf } ) % ($len + 1);
365     }
366 dpavlin 542 }
367    
368 dpavlin 550 if ($#$field >= 0) {
369     push @m, $field;
370     warn "## saved/3 ", dump( $field ),$/ if ($debug);
371     }
372 dpavlin 542
373 dpavlin 579 return \@m;
374 dpavlin 540 }
375    
376 dpavlin 554 =head2 _debug
377    
378     Change level of debug warnings
379    
380     _debug( 2 );
381    
382     =cut
383    
384     sub _debug {
385     my $l = shift;
386     return $debug unless defined($l);
387 dpavlin 568 warn "debug level $l",$/ if ($l > 0);
388 dpavlin 554 $debug = $l;
389     }
390    
391 dpavlin 540 =head1 Functions to create C<data_structure>
392    
393     Those functions generally have to first in your normalization file.
394    
395 dpavlin 536 =head2 tag
396 dpavlin 433
397 dpavlin 536 Define new tag for I<search> and I<display>.
398 dpavlin 433
399 dpavlin 536 tag('Title', rec('200','a') );
400 dpavlin 13
401    
402     =cut
403    
404 dpavlin 536 sub tag {
405     my $name = shift or die "tag needs name as first argument";
406     my @o = grep { defined($_) && $_ ne '' } @_;
407     return unless (@o);
408     $out->{$name}->{tag} = $name;
409     $out->{$name}->{search} = \@o;
410     $out->{$name}->{display} = \@o;
411     }
412 dpavlin 13
413 dpavlin 536 =head2 display
414 dpavlin 13
415 dpavlin 536 Define tag just for I<display>
416 dpavlin 125
417 dpavlin 536 @v = display('Title', rec('200','a') );
418 dpavlin 125
419 dpavlin 536 =cut
420 dpavlin 125
421 dpavlin 536 sub display {
422     my $name = shift or die "display needs name as first argument";
423     my @o = grep { defined($_) && $_ ne '' } @_;
424     return unless (@o);
425     $out->{$name}->{tag} = $name;
426     $out->{$name}->{display} = \@o;
427     }
428 dpavlin 13
429 dpavlin 536 =head2 search
430 dpavlin 13
431 dpavlin 536 Prepare values just for I<search>
432 dpavlin 13
433 dpavlin 536 @v = search('Title', rec('200','a') );
434 dpavlin 433
435 dpavlin 536 =cut
436 dpavlin 13
437 dpavlin 536 sub search {
438     my $name = shift or die "search needs name as first argument";
439     my @o = grep { defined($_) && $_ ne '' } @_;
440     return unless (@o);
441     $out->{$name}->{tag} = $name;
442     $out->{$name}->{search} = \@o;
443 dpavlin 13 }
444    
445 dpavlin 564 =head2 marc_leader
446    
447     Setup fields within MARC leader or get leader
448    
449     marc_leader('05','c');
450     my $leader = marc_leader();
451    
452     =cut
453    
454     sub marc_leader {
455     my ($offset,$value) = @_;
456    
457     if ($offset) {
458     $out->{' leader'}->{ $offset } = $value;
459     } else {
460     return $out->{' leader'};
461     }
462     }
463    
464 dpavlin 547 =head2 marc
465 dpavlin 540
466     Save value for MARC field
467    
468 dpavlin 547 marc('900','a', rec('200','a') );
469 dpavlin 571 marc('001', rec('000') );
470 dpavlin 540
471     =cut
472    
473 dpavlin 547 sub marc {
474     my $f = shift or die "marc needs field";
475     die "marc field must be numer" unless ($f =~ /^\d+$/);
476 dpavlin 540
477 dpavlin 571 my $sf;
478     if ($f >= 10) {
479     $sf = shift or die "marc needs subfield";
480     }
481 dpavlin 540
482 dpavlin 541 foreach (@_) {
483     my $v = $_; # make var read-write for Encode
484 dpavlin 543 next unless (defined($v) && $v !~ /^\s*$/);
485 dpavlin 548 my ($i1,$i2) = defined($marc_indicators->{$f}) ? @{ $marc_indicators->{$f} } : (' ',' ');
486 dpavlin 571 if (defined $sf) {
487 dpavlin 574 push @{ $marc_record->[ $marc_record_offset ] }, [ $f, $i1, $i2, $sf => $v ];
488 dpavlin 571 } else {
489 dpavlin 574 push @{ $marc_record->[ $marc_record_offset ] }, [ $f, $v ];
490 dpavlin 571 }
491 dpavlin 540 }
492     }
493    
494 dpavlin 547 =head2 marc_repeatable_subfield
495    
496     Save values for MARC repetable subfield
497    
498     marc_repeatable_subfield('910', 'z', rec('909') );
499    
500     =cut
501    
502     sub marc_repeatable_subfield {
503 dpavlin 550 my ($f,$sf) = @_;
504     die "marc_repeatable_subfield need field and subfield!\n" unless ($f && $sf);
505     $marc_repeatable_subfield->{ $f . $sf }++;
506 dpavlin 547 marc(@_);
507     }
508    
509     =head2 marc_indicators
510    
511     Set both indicators for MARC field
512    
513     marc_indicators('900', ' ', 1);
514    
515     Any indicator value other than C<0-9> will be treated as undefined.
516    
517     =cut
518    
519     sub marc_indicators {
520     my $f = shift || die "marc_indicators need field!\n";
521     my ($i1,$i2) = @_;
522     die "marc_indicators($f, ...) need i1!\n" unless(defined($i1));
523     die "marc_indicators($f, $i1, ...) need i2!\n" unless(defined($i2));
524    
525     $i1 = ' ' if ($i1 !~ /^\d$/);
526     $i2 = ' ' if ($i2 !~ /^\d$/);
527 dpavlin 548 @{ $marc_indicators->{$f} } = ($i1,$i2);
528 dpavlin 547 }
529    
530 dpavlin 562 =head2 marc_compose
531 dpavlin 547
532 dpavlin 562 Save values for each MARC subfield explicitly
533    
534     marc_compose('900',
535     'a', rec('200','a')
536     'b', rec('201','a')
537     'a', rec('200','b')
538     'c', rec('200','c')
539     );
540    
541 dpavlin 603 If you specify C<+> for subfield, value will be appended
542     to previous defined subfield.
543    
544 dpavlin 562 =cut
545    
546     sub marc_compose {
547     my $f = shift or die "marc_compose needs field";
548     die "marc_compose field must be numer" unless ($f =~ /^\d+$/);
549    
550     my ($i1,$i2) = defined($marc_indicators->{$f}) ? @{ $marc_indicators->{$f} } : (' ',' ');
551     my $m = [ $f, $i1, $i2 ];
552    
553 dpavlin 583 warn "### marc_compose input subfields = ", dump(@_),$/ if ($debug > 2);
554    
555 dpavlin 619 if ($#_ % 2 != 1) {
556     die "ERROR: marc_compose",dump($f,@_)," not valid (must be even).\nDo you need to add first() or join() around some argument?\n";
557     }
558    
559 dpavlin 562 while (@_) {
560 dpavlin 619 my $sf = shift;
561 dpavlin 565 my $v = shift;
562 dpavlin 562
563     next unless (defined($v) && $v !~ /^\s*$/);
564 dpavlin 568 warn "## ++ marc_compose($f,$sf,$v) ", dump( $m ),$/ if ($debug > 1);
565 dpavlin 603 if ($sf ne '+') {
566     push @$m, ( $sf, $v );
567     } else {
568     $m->[ $#$m ] .= $v;
569     }
570 dpavlin 562 }
571    
572 dpavlin 586 warn "## marc_compose current marc = ", dump( $m ),$/ if ($debug > 1);
573 dpavlin 562
574 dpavlin 574 push @{ $marc_record->[ $marc_record_offset ] }, $m if ($#{$m} > 2);
575 dpavlin 562 }
576    
577 dpavlin 574 =head2 marc_duplicate
578 dpavlin 562
579 dpavlin 574 Generate copy of current MARC record and continue working on copy
580    
581     marc_duplicate();
582    
583     Copies can be accessed using C<< _get_marc_fields( fetch_next => 1 ) >> or
584     C<< _get_marc_fields( offset => 42 ) >>.
585    
586     =cut
587    
588     sub marc_duplicate {
589     my $m = $marc_record->[ -1 ];
590     die "can't duplicate record which isn't defined" unless ($m);
591     push @{ $marc_record }, dclone( $m );
592     warn "## marc_duplicate = ", dump(@$marc_record), $/ if ($debug > 1);
593     $marc_record_offset = $#{ $marc_record };
594     warn "## marc_record_offset = $marc_record_offset", $/ if ($debug > 1);
595     }
596    
597     =head2 marc_remove
598    
599     Remove some field or subfield from MARC record.
600    
601     marc_remove('200');
602     marc_remove('200','a');
603    
604     This will erase field C<200> or C<200^a> from current MARC record.
605    
606     This is useful after calling C<marc_duplicate> or on it's own (but, you
607     should probably just remove that subfield definition if you are not
608     using C<marc_duplicate>).
609    
610     FIXME: support fields < 10.
611    
612     =cut
613    
614     sub marc_remove {
615     my ($f, $sf) = @_;
616    
617     die "marc_remove needs record number" unless defined($f);
618    
619     my $marc = $marc_record->[ $marc_record_offset ];
620    
621     warn "### marc_remove before = ", dump( $marc ), $/ if ($debug > 2);
622    
623 dpavlin 579 my $i = 0;
624     foreach ( 0 .. $#{ $marc } ) {
625 dpavlin 574 last unless (defined $marc->[$i]);
626     warn "#### working on ",dump( @{ $marc->[$i] }), $/ if ($debug > 3);
627     if ($marc->[$i]->[0] eq $f) {
628     if (! defined $sf) {
629     # remove whole field
630     splice @$marc, $i, 1;
631     warn "#### slice \@\$marc, $i, 1 = ",dump( @{ $marc }), $/ if ($debug > 3);
632     $i--;
633     } else {
634     foreach my $j ( 0 .. (( $#{ $marc->[$i] } - 3 ) / 2) ) {
635     my $o = ($j * 2) + 3;
636     if ($marc->[$i]->[$o] eq $sf) {
637     # remove subfield
638     splice @{$marc->[$i]}, $o, 2;
639     warn "#### slice \@{\$marc->[$i]}, $o, 2 = ", dump( @{ $marc }), $/ if ($debug > 3);
640     # is record now empty?
641     if ($#{ $marc->[$i] } == 2) {
642     splice @$marc, $i, 1;
643     warn "#### slice \@\$marc, $i, 1 = ", dump( @{ $marc }), $/ if ($debug > 3);
644     $i--;
645     };
646     }
647     }
648     }
649     }
650 dpavlin 579 $i++;
651 dpavlin 574 }
652    
653     warn "### marc_remove($f", $sf ? ",$sf" : "", ") after = ", dump( $marc ), $/ if ($debug > 2);
654    
655     $marc_record->[ $marc_record_offset ] = $marc;
656    
657     warn "## full marc_record = ", dump( @{ $marc_record }), $/ if ($debug > 1);
658     }
659    
660 dpavlin 604 =head2 marc_original_order
661    
662     Copy all subfields preserving original order to marc field.
663    
664 dpavlin 616 marc_original_order( marc_field_number, original_input_field_number );
665 dpavlin 604
666 dpavlin 616 Please note that field numbers are consistent with other commands (marc
667     field number first), but somewhat counter-intuitive (destination and then
668     source).
669    
670 dpavlin 604 You might want to use this command if you are just renaming subfields or
671     using pre-processing modify_record in C<config.yml> and don't need any
672     post-processing or want to preserve order of original subfields.
673    
674 dpavlin 616
675 dpavlin 604 =cut
676    
677     sub marc_original_order {
678    
679 dpavlin 616 my ($to, $from) = @_;
680 dpavlin 604 die "marc_original_order needs from and to fields\n" unless ($from && $to);
681    
682 dpavlin 616 return unless defined($rec->{$from});
683    
684     my $r = $rec->{$from};
685 dpavlin 604 die "record field $from isn't array\n" unless (ref($r) eq 'ARRAY');
686    
687     my ($i1,$i2) = defined($marc_indicators->{$to}) ? @{ $marc_indicators->{$to} } : (' ',' ');
688 dpavlin 616 warn "## marc_original_order($to,$from) source = ", dump( $r ),$/ if ($debug > 1);
689 dpavlin 604
690     foreach my $d (@$r) {
691    
692 dpavlin 605 if (! defined($d->{subfields}) && ref($d->{subfields}) ne 'ARRAY') {
693 dpavlin 616 warn "# marc_original_order($to,$from): field $from doesn't have subfields specification\n";
694 dpavlin 605 next;
695     }
696    
697 dpavlin 604 my @sfs = @{ $d->{subfields} };
698    
699     die "field $from doesn't have even number of subfields specifications\n" unless($#sfs % 2 == 1);
700    
701 dpavlin 618 warn "#--> d: ",dump($d), "\n#--> sfs: ",dump(@sfs),$/ if ($debug > 2);
702 dpavlin 604
703     my $m = [ $to, $i1, $i2 ];
704    
705     while (my $sf = shift @sfs) {
706 dpavlin 618
707     warn "#--> sf: ",dump($sf), $/ if ($debug > 2);
708 dpavlin 604 my $offset = shift @sfs;
709     die "corrupted sufields specification for field $from\n" unless defined($offset);
710    
711     my $v;
712     if (ref($d->{$sf}) eq 'ARRAY') {
713     $v = $d->{$sf}->[$offset] if (defined($d->{$sf}->[$offset]));
714     } elsif ($offset == 0) {
715     $v = $d->{$sf};
716     } else {
717     die "field $from subfield '$sf' need occurence $offset which doesn't exist", dump($d->{$sf});
718     }
719     push @$m, ( $sf, $v ) if (defined($v));
720     }
721    
722     if ($#{$m} > 2) {
723     push @{ $marc_record->[ $marc_record_offset ] }, $m;
724     }
725     }
726    
727     warn "## marc_record = ", dump( $marc_record ),$/ if ($debug > 1);
728     }
729    
730    
731 dpavlin 540 =head1 Functions to extract data from input
732    
733     This function should be used inside functions to create C<data_structure> described
734     above.
735    
736 dpavlin 641 =head2 _pack_subfields_hash
737    
738 dpavlin 669 @subfields = _pack_subfields_hash( $h );
739     $subfields = _pack_subfields_hash( $h, 1 );
740 dpavlin 641
741 dpavlin 669 Return each subfield value in array or pack them all together and return scalar
742     with subfields (denoted by C<^>) and values.
743    
744 dpavlin 641 =cut
745    
746     sub _pack_subfields_hash {
747    
748 dpavlin 642 warn "## _pack_subfields_hash( ",dump(@_), " )\n" if ($debug > 1);
749 dpavlin 641
750     my ($h,$include_subfields) = @_;
751    
752     if ( defined($h->{subfields}) ) {
753     my $sfs = delete $h->{subfields} || die "no subfields?";
754     my @out;
755     while (@$sfs) {
756     my $sf = shift @$sfs;
757     push @out, '^' . $sf if ($include_subfields);
758     my $o = shift @$sfs;
759     if ($o == 0 && ref( $h->{$sf} ) ne 'ARRAY' ) {
760     # single element subfields are not arrays
761 dpavlin 669 #warn "====> $sf $o / $#$sfs ", dump( $sfs, $h->{$sf} ), "\n";
762 dpavlin 667
763 dpavlin 641 push @out, $h->{$sf};
764     } else {
765 dpavlin 669 #warn "====> $sf $o / $#$sfs ", dump( $sfs, $h->{$sf} ), "\n";
766 dpavlin 641 push @out, $h->{$sf}->[$o];
767     }
768     }
769 dpavlin 667 if ($include_subfields) {
770     return join('', @out);
771     } else {
772     return @out;
773     }
774 dpavlin 641 } else {
775 dpavlin 667 if ($include_subfields) {
776     my $out = '';
777 dpavlin 668 foreach my $sf (sort keys %$h) {
778 dpavlin 667 if (ref($h->{$sf}) eq 'ARRAY') {
779     $out .= '^' . $sf . join('^' . $sf, @{ $h->{$sf} });
780     } else {
781     $out .= '^' . $sf . $h->{$sf};
782     }
783     }
784     return $out;
785     } else {
786     # FIXME this should probably be in alphabetical order instead of hash order
787     values %{$h};
788     }
789 dpavlin 641 }
790     }
791    
792 dpavlin 536 =head2 rec1
793 dpavlin 371
794 dpavlin 536 Return all values in some field
795 dpavlin 371
796 dpavlin 536 @v = rec1('200')
797 dpavlin 15
798 dpavlin 536 TODO: order of values is probably same as in source data, need to investigate that
799 dpavlin 15
800 dpavlin 536 =cut
801 dpavlin 15
802 dpavlin 536 sub rec1 {
803     my $f = shift;
804 dpavlin 571 warn "rec1($f) = ", dump( $rec->{$f} ), $/ if ($debug > 1);
805 dpavlin 536 return unless (defined($rec) && defined($rec->{$f}));
806 dpavlin 571 warn "rec1($f) = ", dump( $rec->{$f} ), $/ if ($debug > 1);
807 dpavlin 536 if (ref($rec->{$f}) eq 'ARRAY') {
808 dpavlin 641 my @out;
809     foreach my $h ( @{ $rec->{$f} } ) {
810     if (ref($h) eq 'HASH') {
811     push @out, ( _pack_subfields_hash( $h ) );
812 dpavlin 31 } else {
813 dpavlin 641 push @out, $h;
814 dpavlin 31 }
815 dpavlin 641 }
816     return @out;
817 dpavlin 536 } elsif( defined($rec->{$f}) ) {
818     return $rec->{$f};
819 dpavlin 15 }
820     }
821    
822 dpavlin 536 =head2 rec2
823 dpavlin 15
824 dpavlin 536 Return all values in specific field and subfield
825 dpavlin 13
826 dpavlin 536 @v = rec2('200','a')
827 dpavlin 13
828     =cut
829    
830 dpavlin 536 sub rec2 {
831     my $f = shift;
832     return unless (defined($rec && $rec->{$f}));
833     my $sf = shift;
834 dpavlin 601 warn "rec2($f,$sf) = ", dump( $rec->{$f} ), $/ if ($debug > 1);
835 dpavlin 589 return map {
836     if (ref($_->{$sf}) eq 'ARRAY') {
837     @{ $_->{$sf} };
838     } else {
839     $_->{$sf};
840     }
841     } grep { ref($_) eq 'HASH' && $_->{$sf} } @{ $rec->{$f} };
842 dpavlin 536 }
843 dpavlin 13
844 dpavlin 536 =head2 rec
845 dpavlin 13
846 dpavlin 536 syntaxtic sugar for
847 dpavlin 13
848 dpavlin 536 @v = rec('200')
849     @v = rec('200','a')
850 dpavlin 13
851 dpavlin 536 =cut
852 dpavlin 373
853 dpavlin 536 sub rec {
854 dpavlin 583 my @out;
855 dpavlin 536 if ($#_ == 0) {
856 dpavlin 583 @out = rec1(@_);
857 dpavlin 536 } elsif ($#_ == 1) {
858 dpavlin 583 @out = rec2(@_);
859 dpavlin 13 }
860 dpavlin 583 if (@out) {
861     return @out;
862     } else {
863     return '';
864     }
865 dpavlin 13 }
866    
867 dpavlin 536 =head2 regex
868 dpavlin 15
869 dpavlin 536 Apply regex to some or all values
870 dpavlin 15
871 dpavlin 536 @v = regex( 's/foo/bar/g', @v );
872 dpavlin 15
873     =cut
874    
875 dpavlin 536 sub regex {
876     my $r = shift;
877     my @out;
878 dpavlin 550 #warn "r: $r\n", dump(\@_);
879 dpavlin 536 foreach my $t (@_) {
880     next unless ($t);
881     eval "\$t =~ $r";
882     push @out, $t if ($t && $t ne '');
883 dpavlin 368 }
884 dpavlin 536 return @out;
885 dpavlin 15 }
886    
887 dpavlin 536 =head2 prefix
888 dpavlin 15
889 dpavlin 536 Prefix all values with a string
890 dpavlin 15
891 dpavlin 536 @v = prefix( 'my_', @v );
892 dpavlin 15
893     =cut
894    
895 dpavlin 536 sub prefix {
896 dpavlin 592 my $p = shift or return;
897 dpavlin 536 return map { $p . $_ } grep { defined($_) } @_;
898     }
899 dpavlin 15
900 dpavlin 536 =head2 suffix
901 dpavlin 15
902 dpavlin 536 suffix all values with a string
903 dpavlin 15
904 dpavlin 536 @v = suffix( '_my', @v );
905 dpavlin 15
906 dpavlin 536 =cut
907 dpavlin 15
908 dpavlin 536 sub suffix {
909     my $s = shift or die "suffix needs string as first argument";
910     return map { $_ . $s } grep { defined($_) } @_;
911 dpavlin 15 }
912    
913 dpavlin 536 =head2 surround
914 dpavlin 13
915 dpavlin 536 surround all values with a two strings
916 dpavlin 13
917 dpavlin 536 @v = surround( 'prefix_', '_suffix', @v );
918 dpavlin 13
919     =cut
920    
921 dpavlin 536 sub surround {
922     my $p = shift or die "surround need prefix as first argument";
923     my $s = shift or die "surround needs suffix as second argument";
924     return map { $p . $_ . $s } grep { defined($_) } @_;
925 dpavlin 13 }
926    
927 dpavlin 536 =head2 first
928 dpavlin 13
929 dpavlin 536 Return first element
930 dpavlin 15
931 dpavlin 536 $v = first( @v );
932 dpavlin 13
933     =cut
934    
935 dpavlin 536 sub first {
936     my $r = shift;
937     return $r;
938 dpavlin 13 }
939    
940 dpavlin 536 =head2 lookup
941 dpavlin 13
942 dpavlin 536 Consult lookup hashes for some value
943 dpavlin 13
944 dpavlin 536 @v = lookup( $v );
945     @v = lookup( @v );
946 dpavlin 13
947 dpavlin 707 FIXME B<currently this one is broken!>
948    
949 dpavlin 13 =cut
950    
951 dpavlin 536 sub lookup {
952     my $k = shift or return;
953     return unless (defined($lookup->{$k}));
954     if (ref($lookup->{$k}) eq 'ARRAY') {
955     return @{ $lookup->{$k} };
956     } else {
957     return $lookup->{$k};
958     }
959 dpavlin 13 }
960    
961 dpavlin 707 =head2 save_into_lookup
962    
963     Save value into lookup.
964    
965 dpavlin 712 save_into_lookup($database,$input,$key,sub {
966 dpavlin 707 # code which produce one or more values
967     });
968    
969     This function shouldn't be called directly, it's called from code created by L<WebPAC::Parser>.
970    
971     =cut
972    
973     sub save_into_lookup {
974 dpavlin 712 my ($database,$input,$key,$coderef) = @_;
975     die "save_into_lookup needs database" unless defined($database);
976     die "save_into_lookup needs input" unless defined($input);
977     die "save_into_lookup needs key" unless defined($key);
978 dpavlin 707 die "save_into_lookup needs CODE" unless ( defined($coderef) && ref($coderef) eq 'CODE' );
979     my $mfn = $rec->{'000'}->[0] || die "mfn not defined or zero";
980     foreach my $v ( $coderef->() ) {
981 dpavlin 712 $lookup->{$database}->{$input}->{$key}->{$v}->{$mfn}++;
982 dpavlin 721 warn "# saved lookup $database/$input/$key [$v] $mfn\n" if ($debug > 1);
983 dpavlin 707 }
984     }
985    
986 dpavlin 595 =head2 config
987    
988     Consult config values stored in C<config.yml>
989    
990     # return database code (key under databases in yaml)
991     $database_code = config(); # use _ from hash
992     $database_name = config('name');
993     $database_input_name = config('input name');
994     $tag = config('input normalize tag');
995    
996     Up to three levels are supported.
997    
998     =cut
999    
1000     sub config {
1001     return unless ($config);
1002    
1003     my $p = shift;
1004    
1005     $p ||= '';
1006    
1007     my $v;
1008    
1009     warn "### getting config($p)\n" if ($debug > 1);
1010    
1011     my @p = split(/\s+/,$p);
1012     if ($#p < 0) {
1013     $v = $config->{ '_' }; # special, database code
1014     } else {
1015    
1016     my $c = dclone( $config );
1017    
1018     foreach my $k (@p) {
1019     warn "### k: $k c = ",dump($c),$/ if ($debug > 1);
1020     if (ref($c) eq 'ARRAY') {
1021     $c = shift @$c;
1022     warn "config($p) taking first occurence of '$k', probably not what you wanted!\n";
1023     last;
1024     }
1025    
1026     if (! defined($c->{$k}) ) {
1027     $c = undef;
1028     last;
1029     } else {
1030     $c = $c->{$k};
1031     }
1032     }
1033     $v = $c if ($c);
1034    
1035     }
1036    
1037     warn "## config( '$p' ) = ",dump( $v ),$/ if ($v && $debug);
1038     warn "config( '$p' ) is empty\n" if (! $v);
1039    
1040     return $v;
1041     }
1042    
1043     =head2 id
1044    
1045     Returns unique id of this record
1046    
1047     $id = id();
1048    
1049     Returns C<42/2> for 2nd occurence of MFN 42.
1050    
1051     =cut
1052    
1053     sub id {
1054     my $mfn = $config->{_mfn} || die "no _mfn in config data";
1055     return $mfn . $#{$marc_record} ? $#{$marc_record} + 1 : '';
1056     }
1057    
1058 dpavlin 536 =head2 join_with
1059 dpavlin 13
1060 dpavlin 536 Joins walues with some delimiter
1061 dpavlin 10
1062 dpavlin 536 $v = join_with(", ", @v);
1063 dpavlin 10
1064 dpavlin 536 =cut
1065 dpavlin 10
1066 dpavlin 536 sub join_with {
1067     my $d = shift;
1068 dpavlin 586 warn "### join_with('$d',",dump(@_),")\n" if ($debug > 2);
1069 dpavlin 583 my $v = join($d, grep { defined($_) && $_ ne '' } @_);
1070     return '' unless defined($v);
1071     return $v;
1072 dpavlin 536 }
1073 dpavlin 10
1074 dpavlin 562 =head2 split_rec_on
1075    
1076     Split record subfield on some regex and take one of parts out
1077    
1078     $a_before_semi_column =
1079     split_rec_on('200','a', /\s*;\s*/, $part);
1080    
1081     C<$part> is optional number of element. First element is
1082     B<1>, not 0!
1083    
1084     If there is no C<$part> parameter or C<$part> is 0, this function will
1085     return all values produced by splitting.
1086    
1087     =cut
1088    
1089     sub split_rec_on {
1090     die "split_rec_on need (fld,sf,regex[,part]" if ($#_ < 2);
1091    
1092     my ($fld, $sf, $regex, $part) = @_;
1093 dpavlin 568 warn "### regex ", ref($regex), $regex, $/ if ($debug > 2);
1094 dpavlin 562
1095     my @r = rec( $fld, $sf );
1096     my $v = shift @r;
1097 dpavlin 568 warn "### first rec($fld,$sf) = ",dump($v),$/ if ($debug > 2);
1098 dpavlin 562
1099 dpavlin 604 return '' if ( ! defined($v) || $v =~ /^\s*$/);
1100 dpavlin 566
1101 dpavlin 562 my @s = split( $regex, $v );
1102 dpavlin 568 warn "## split_rec_on($fld,$sf,$regex,$part) = ",dump(@s),$/ if ($debug > 1);
1103 dpavlin 566 if ($part && $part > 0) {
1104 dpavlin 562 return $s[ $part - 1 ];
1105     } else {
1106 dpavlin 571 return @s;
1107 dpavlin 562 }
1108     }
1109    
1110 dpavlin 536 # END
1111     1;

  ViewVC Help
Powered by ViewVC 1.1.26