/[webpac2]/trunk/lib/WebPAC/Normalize.pm
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Annotation of /trunk/lib/WebPAC/Normalize.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 788 - (hide annotations)
Sun Dec 10 12:56:59 2006 UTC (17 years, 3 months ago) by dpavlin
File size: 28257 byte(s)
better logging

1 dpavlin 10 package WebPAC::Normalize;
2 dpavlin 536 use Exporter 'import';
3     @EXPORT = qw/
4 dpavlin 538 _set_rec _set_lookup
5 dpavlin 736 _set_load_row
6 dpavlin 538 _get_ds _clean_ds
7 dpavlin 554 _debug
8 dpavlin 641 _pack_subfields_hash
9 dpavlin 538
10 dpavlin 536 tag search display
11 dpavlin 547 marc marc_indicators marc_repeatable_subfield
12 dpavlin 564 marc_compose marc_leader
13 dpavlin 574 marc_duplicate marc_remove
14 dpavlin 604 marc_original_order
15 dpavlin 540
16 dpavlin 536 rec1 rec2 rec
17     regex prefix suffix surround
18     first lookup join_with
19 dpavlin 707 save_into_lookup
20 dpavlin 562
21     split_rec_on
22 dpavlin 785
23     get set
24 dpavlin 536 /;
25 dpavlin 10
26     use warnings;
27     use strict;
28 dpavlin 536
29     #use base qw/WebPAC::Common/;
30 dpavlin 550 use Data::Dump qw/dump/;
31 dpavlin 574 use Storable qw/dclone/;
32 dpavlin 725 use Carp qw/confess/;
33 dpavlin 10
34 dpavlin 550 # debugging warn(s)
35     my $debug = 0;
36    
37    
38 dpavlin 10 =head1 NAME
39    
40 dpavlin 536 WebPAC::Normalize - describe normalisaton rules using sets
41 dpavlin 10
42     =head1 VERSION
43    
44 dpavlin 786 Version 0.25
45 dpavlin 10
46     =cut
47    
48 dpavlin 786 our $VERSION = '0.25';
49 dpavlin 10
50     =head1 SYNOPSIS
51    
52 dpavlin 536 This module uses C<conf/normalize/*.pl> files to perform normalisation
53     from input records using perl functions which are specialized for set
54     processing.
55 dpavlin 10
56 dpavlin 536 Sets are implemented as arrays, and normalisation file is valid perl, which
57     means that you check it's validity before running WebPAC using
58     C<perl -c normalize.pl>.
59 dpavlin 15
60 dpavlin 536 Normalisation can generate multiple output normalized data. For now, supported output
61 dpavlin 540 types (on the left side of definition) are: C<tag>, C<display>, C<search> and
62 dpavlin 547 C<marc>.
63 dpavlin 15
64 dpavlin 10 =head1 FUNCTIONS
65    
66 dpavlin 538 Functions which start with C<_> are private and used by WebPAC internally.
67     All other functions are available for use within normalisation rules.
68    
69 dpavlin 536 =head2 data_structure
70 dpavlin 10
71 dpavlin 536 Return data structure
72 dpavlin 13
73 dpavlin 538 my $ds = WebPAC::Normalize::data_structure(
74 dpavlin 725 lookup => $lookup_hash,
75 dpavlin 536 row => $row,
76     rules => $normalize_pl_config,
77 dpavlin 541 marc_encoding => 'utf-8',
78 dpavlin 595 config => $config,
79 dpavlin 736 load_row_coderef => sub {
80 dpavlin 725 my ($database,$input,$mfn) = shift;
81 dpavlin 736 $store->load_row( database => $database, input => $input, id => $mfn );
82 dpavlin 725 },
83 dpavlin 13 );
84    
85 dpavlin 707 Options C<row>, C<rules> and C<log> are mandatory while all
86 dpavlin 540 other are optional.
87    
88 dpavlin 736 C<load_row_coderef> is closure only used when executing lookups, so they will
89 dpavlin 725 die if it's not defined.
90    
91 dpavlin 536 This function will B<die> if normalizastion can't be evaled.
92 dpavlin 15
93 dpavlin 538 Since this function isn't exported you have to call it with
94     C<WebPAC::Normalize::data_structure>.
95    
96 dpavlin 10 =cut
97    
98 dpavlin 736 my $load_row_coderef;
99 dpavlin 725
100 dpavlin 536 sub data_structure {
101     my $arg = {@_};
102 dpavlin 13
103 dpavlin 536 die "need row argument" unless ($arg->{row});
104     die "need normalisation argument" unless ($arg->{rules});
105 dpavlin 31
106 dpavlin 536 no strict 'subs';
107 dpavlin 730 _set_lookup( $arg->{lookup} ) if defined($arg->{lookup});
108 dpavlin 538 _set_rec( $arg->{row} );
109 dpavlin 730 _set_config( $arg->{config} ) if defined($arg->{config});
110 dpavlin 541 _clean_ds( %{ $arg } );
111 dpavlin 736 $load_row_coderef = $arg->{load_row_coderef};
112 dpavlin 725
113 dpavlin 536 eval "$arg->{rules}";
114     die "error evaling $arg->{rules}: $@\n" if ($@);
115 dpavlin 540
116 dpavlin 538 return _get_ds();
117 dpavlin 10 }
118    
119 dpavlin 538 =head2 _set_rec
120 dpavlin 13
121 dpavlin 536 Set current record hash
122 dpavlin 433
123 dpavlin 538 _set_rec( $rec );
124 dpavlin 433
125     =cut
126    
127 dpavlin 536 my $rec;
128 dpavlin 433
129 dpavlin 538 sub _set_rec {
130 dpavlin 536 $rec = shift or die "no record hash";
131 dpavlin 433 }
132    
133 dpavlin 595 =head2 _set_config
134    
135     Set current config hash
136    
137     _set_config( $config );
138    
139     Magic keys are:
140    
141     =over 4
142    
143     =item _
144    
145     Code of current database
146    
147     =item _mfn
148    
149     Current MFN
150    
151     =back
152    
153     =cut
154    
155     my $config;
156    
157     sub _set_config {
158     $config = shift;
159     }
160    
161 dpavlin 538 =head2 _get_ds
162    
163     Return hash formatted as data structure
164    
165     my $ds = _get_ds();
166    
167     =cut
168    
169 dpavlin 741 my ($out, $marc_record, $marc_encoding, $marc_repeatable_subfield, $marc_indicators, $leader);
170 dpavlin 574 my ($marc_record_offset, $marc_fetch_offset) = (0, 0);
171 dpavlin 538
172     sub _get_ds {
173     return $out;
174     }
175    
176     =head2 _clean_ds
177    
178     Clean data structure hash for next record
179    
180     _clean_ds();
181    
182     =cut
183    
184     sub _clean_ds {
185 dpavlin 541 my $a = {@_};
186 dpavlin 741 ($out,$marc_record, $marc_encoding, $marc_repeatable_subfield, $marc_indicators, $leader) = ();
187 dpavlin 574 ($marc_record_offset, $marc_fetch_offset) = (0,0);
188 dpavlin 541 $marc_encoding = $a->{marc_encoding};
189 dpavlin 538 }
190    
191     =head2 _set_lookup
192    
193     Set current lookup hash
194    
195     _set_lookup( $lookup );
196    
197     =cut
198    
199     my $lookup;
200    
201     sub _set_lookup {
202     $lookup = shift;
203     }
204    
205 dpavlin 707 =head2 _get_lookup
206    
207     Get current lookup hash
208    
209     my $lookup = _get_lookup();
210    
211     =cut
212    
213     sub _get_lookup {
214     return $lookup;
215     }
216    
217 dpavlin 736 =head2 _set_load_row
218 dpavlin 725
219     Setup code reference which will return L<data_structure> from
220     L<WebPAC::Store>
221    
222 dpavlin 736 _set_load_row(sub {
223 dpavlin 725 my ($database,$input,$mfn) = @_;
224 dpavlin 736 $store->load_row( database => $database, input => $input, id => $mfn );
225 dpavlin 725 });
226    
227     =cut
228    
229 dpavlin 736 sub _set_load_row {
230 dpavlin 725 my $coderef = shift;
231     confess "argument isn't CODE" unless ref($coderef) eq 'CODE';
232    
233 dpavlin 736 $load_row_coderef = $coderef;
234 dpavlin 725 }
235    
236 dpavlin 547 =head2 _get_marc_fields
237 dpavlin 540
238 dpavlin 547 Get all fields defined by calls to C<marc>
239 dpavlin 540
240 dpavlin 547 $marc->add_fields( WebPAC::Normalize:_get_marc_fields() );
241 dpavlin 540
242 dpavlin 543 We are using I<magic> which detect repeatable fields only from
243     sequence of field/subfield data generated by normalization.
244    
245 dpavlin 554 Repeatable field is created when there is second occurence of same subfield or
246     if any of indicators are different.
247 dpavlin 543
248 dpavlin 554 This is sane for most cases. Something like:
249 dpavlin 543
250 dpavlin 554 900a-1 900b-1 900c-1
251     900a-2 900b-2
252     900a-3
253    
254     will be created from any combination of:
255    
256     900a-1 900a-2 900a-3 900b-1 900b-2 900c-1
257    
258     and following rules:
259    
260     marc('900','a', rec('200','a') );
261     marc('900','b', rec('200','b') );
262     marc('900','c', rec('200','c') );
263    
264     which might not be what you have in mind. If you need repeatable subfield,
265     define it using C<marc_repeatable_subfield> like this:
266    
267 dpavlin 574 marc_repeatable_subfield('900','a');
268     marc('900','a', rec('200','a') );
269     marc('900','b', rec('200','b') );
270     marc('900','c', rec('200','c') );
271 dpavlin 554
272 dpavlin 574 will create:
273    
274     900a-1 900a-2 900a-3 900b-1 900c-1
275     900b-2
276    
277     There is also support for returning next or specific using:
278    
279     while (my $mf = WebPAC::Normalize:_get_marc_fields( fetch_next => 1 ) ) {
280     # do something with $mf
281     }
282    
283     will always return fields from next MARC record or
284    
285     my $mf = WebPAC::Normalize::_get_marc_fields( offset => 42 );
286    
287     will return 42th copy record (if it exists).
288    
289 dpavlin 540 =cut
290    
291 dpavlin 547 sub _get_marc_fields {
292 dpavlin 550
293 dpavlin 574 my $arg = {@_};
294     warn "### _get_marc_fields arg: ", dump($arg), $/ if ($debug > 2);
295     my $offset = $marc_fetch_offset;
296     if ($arg->{offset}) {
297     $offset = $arg->{offset};
298     } elsif($arg->{fetch_next}) {
299     $marc_fetch_offset++;
300     }
301 dpavlin 550
302 dpavlin 574 return if (! $marc_record || ref($marc_record) ne 'ARRAY');
303    
304     warn "### full marc_record = ", dump( @{ $marc_record }), $/ if ($debug > 2);
305    
306     my $marc_rec = $marc_record->[ $offset ];
307    
308     warn "## _get_marc_fields (at offset: $offset) -- marc_record = ", dump( @$marc_rec ), $/ if ($debug > 1);
309    
310     return if (! $marc_rec || ref($marc_rec) ne 'ARRAY' || $#{ $marc_rec } < 0);
311    
312 dpavlin 550 # first, sort all existing fields
313     # XXX might not be needed, but modern perl might randomize elements in hash
314     my @sorted_marc_record = sort {
315 dpavlin 572 $a->[0] . ( $a->[3] || '' ) cmp $b->[0] . ( $b->[3] || '')
316 dpavlin 574 } @{ $marc_rec };
317 dpavlin 550
318 dpavlin 574 @sorted_marc_record = @{ $marc_rec }; ### FIXME disable sorting
319 dpavlin 562
320 dpavlin 550 # output marc fields
321 dpavlin 542 my @m;
322 dpavlin 550
323     # count unique field-subfields (used for offset when walking to next subfield)
324     my $u;
325 dpavlin 572 map { $u->{ $_->[0] . ( $_->[3] || '') }++ } @sorted_marc_record;
326 dpavlin 550
327     if ($debug) {
328 dpavlin 574 warn "## marc_repeatable_subfield = ", dump( $marc_repeatable_subfield ), $/ if ( $marc_repeatable_subfield );
329     warn "## marc_record[$offset] = ", dump( $marc_rec ), $/;
330     warn "## sorted_marc_record = ", dump( \@sorted_marc_record ), $/;
331     warn "## subfield count = ", dump( $u ), $/;
332 dpavlin 550 }
333    
334     my $len = $#sorted_marc_record;
335     my $visited;
336     my $i = 0;
337     my $field;
338    
339     foreach ( 0 .. $len ) {
340    
341     # find next element which isn't visited
342     while ($visited->{$i}) {
343     $i = ($i + 1) % ($len + 1);
344 dpavlin 542 }
345    
346 dpavlin 550 # mark it visited
347     $visited->{$i}++;
348    
349 dpavlin 574 my $row = dclone( $sorted_marc_record[$i] );
350 dpavlin 550
351     # field and subfield which is key for
352     # marc_repeatable_subfield and u
353 dpavlin 572 my $fsf = $row->[0] . ( $row->[3] || '' );
354 dpavlin 550
355     if ($debug > 1) {
356    
357     print "### field so far [", $#$field, "] : ", dump( $field ), " ", $field ? 'T' : 'F', $/;
358     print "### this [$i]: ", dump( $row ),$/;
359     print "### sf: ", $row->[3], " vs ", $field->[3],
360     $marc_repeatable_subfield->{ $row->[0] . $row->[3] } ? ' (repeatable)' : '', $/,
361     if ($#$field >= 0);
362    
363     }
364    
365     # if field exists
366     if ( $#$field >= 0 ) {
367     if (
368     $row->[0] ne $field->[0] || # field
369     $row->[1] ne $field->[1] || # i1
370     $row->[2] ne $field->[2] # i2
371     ) {
372     push @m, $field;
373     warn "## saved/1 ", dump( $field ),$/ if ($debug);
374     $field = $row;
375    
376     } elsif (
377     ( $row->[3] lt $field->[-2] ) # subfield which is not next (e.g. a after c)
378     ||
379     ( $row->[3] eq $field->[-2] && # same subfield, but not repeatable
380     ! $marc_repeatable_subfield->{ $fsf }
381     )
382     ) {
383     push @m, $field;
384     warn "## saved/2 ", dump( $field ),$/ if ($debug);
385     $field = $row;
386    
387     } else {
388     # append new subfields to existing field
389     push @$field, ( $row->[3], $row->[4] );
390     }
391     } else {
392     # insert first field
393     $field = $row;
394     }
395    
396     if (! $marc_repeatable_subfield->{ $fsf }) {
397     # make step to next subfield
398     $i = ($i + $u->{ $fsf } ) % ($len + 1);
399     }
400 dpavlin 542 }
401    
402 dpavlin 550 if ($#$field >= 0) {
403     push @m, $field;
404     warn "## saved/3 ", dump( $field ),$/ if ($debug);
405     }
406 dpavlin 542
407 dpavlin 579 return \@m;
408 dpavlin 540 }
409    
410 dpavlin 554 =head2 _debug
411    
412     Change level of debug warnings
413    
414     _debug( 2 );
415    
416     =cut
417    
418     sub _debug {
419     my $l = shift;
420     return $debug unless defined($l);
421 dpavlin 568 warn "debug level $l",$/ if ($l > 0);
422 dpavlin 554 $debug = $l;
423     }
424    
425 dpavlin 540 =head1 Functions to create C<data_structure>
426    
427     Those functions generally have to first in your normalization file.
428    
429 dpavlin 536 =head2 tag
430 dpavlin 433
431 dpavlin 536 Define new tag for I<search> and I<display>.
432 dpavlin 433
433 dpavlin 536 tag('Title', rec('200','a') );
434 dpavlin 13
435    
436     =cut
437    
438 dpavlin 536 sub tag {
439     my $name = shift or die "tag needs name as first argument";
440     my @o = grep { defined($_) && $_ ne '' } @_;
441     return unless (@o);
442     $out->{$name}->{tag} = $name;
443     $out->{$name}->{search} = \@o;
444     $out->{$name}->{display} = \@o;
445     }
446 dpavlin 13
447 dpavlin 536 =head2 display
448 dpavlin 13
449 dpavlin 536 Define tag just for I<display>
450 dpavlin 125
451 dpavlin 536 @v = display('Title', rec('200','a') );
452 dpavlin 125
453 dpavlin 536 =cut
454 dpavlin 125
455 dpavlin 536 sub display {
456     my $name = shift or die "display needs name as first argument";
457     my @o = grep { defined($_) && $_ ne '' } @_;
458     return unless (@o);
459     $out->{$name}->{tag} = $name;
460     $out->{$name}->{display} = \@o;
461     }
462 dpavlin 13
463 dpavlin 536 =head2 search
464 dpavlin 13
465 dpavlin 536 Prepare values just for I<search>
466 dpavlin 13
467 dpavlin 536 @v = search('Title', rec('200','a') );
468 dpavlin 433
469 dpavlin 536 =cut
470 dpavlin 13
471 dpavlin 536 sub search {
472     my $name = shift or die "search needs name as first argument";
473     my @o = grep { defined($_) && $_ ne '' } @_;
474     return unless (@o);
475     $out->{$name}->{tag} = $name;
476     $out->{$name}->{search} = \@o;
477 dpavlin 13 }
478    
479 dpavlin 564 =head2 marc_leader
480    
481     Setup fields within MARC leader or get leader
482    
483     marc_leader('05','c');
484     my $leader = marc_leader();
485    
486     =cut
487    
488     sub marc_leader {
489     my ($offset,$value) = @_;
490    
491     if ($offset) {
492 dpavlin 741 $leader->{ $offset } = $value;
493 dpavlin 564 } else {
494 dpavlin 741 return $leader;
495 dpavlin 564 }
496     }
497    
498 dpavlin 547 =head2 marc
499 dpavlin 540
500     Save value for MARC field
501    
502 dpavlin 547 marc('900','a', rec('200','a') );
503 dpavlin 571 marc('001', rec('000') );
504 dpavlin 540
505     =cut
506    
507 dpavlin 547 sub marc {
508     my $f = shift or die "marc needs field";
509     die "marc field must be numer" unless ($f =~ /^\d+$/);
510 dpavlin 540
511 dpavlin 571 my $sf;
512     if ($f >= 10) {
513     $sf = shift or die "marc needs subfield";
514     }
515 dpavlin 540
516 dpavlin 541 foreach (@_) {
517     my $v = $_; # make var read-write for Encode
518 dpavlin 543 next unless (defined($v) && $v !~ /^\s*$/);
519 dpavlin 548 my ($i1,$i2) = defined($marc_indicators->{$f}) ? @{ $marc_indicators->{$f} } : (' ',' ');
520 dpavlin 571 if (defined $sf) {
521 dpavlin 574 push @{ $marc_record->[ $marc_record_offset ] }, [ $f, $i1, $i2, $sf => $v ];
522 dpavlin 571 } else {
523 dpavlin 574 push @{ $marc_record->[ $marc_record_offset ] }, [ $f, $v ];
524 dpavlin 571 }
525 dpavlin 540 }
526     }
527    
528 dpavlin 547 =head2 marc_repeatable_subfield
529    
530     Save values for MARC repetable subfield
531    
532     marc_repeatable_subfield('910', 'z', rec('909') );
533    
534     =cut
535    
536     sub marc_repeatable_subfield {
537 dpavlin 550 my ($f,$sf) = @_;
538     die "marc_repeatable_subfield need field and subfield!\n" unless ($f && $sf);
539     $marc_repeatable_subfield->{ $f . $sf }++;
540 dpavlin 547 marc(@_);
541     }
542    
543     =head2 marc_indicators
544    
545     Set both indicators for MARC field
546    
547     marc_indicators('900', ' ', 1);
548    
549     Any indicator value other than C<0-9> will be treated as undefined.
550    
551     =cut
552    
553     sub marc_indicators {
554     my $f = shift || die "marc_indicators need field!\n";
555     my ($i1,$i2) = @_;
556     die "marc_indicators($f, ...) need i1!\n" unless(defined($i1));
557     die "marc_indicators($f, $i1, ...) need i2!\n" unless(defined($i2));
558    
559     $i1 = ' ' if ($i1 !~ /^\d$/);
560     $i2 = ' ' if ($i2 !~ /^\d$/);
561 dpavlin 548 @{ $marc_indicators->{$f} } = ($i1,$i2);
562 dpavlin 547 }
563    
564 dpavlin 562 =head2 marc_compose
565 dpavlin 547
566 dpavlin 562 Save values for each MARC subfield explicitly
567    
568     marc_compose('900',
569     'a', rec('200','a')
570     'b', rec('201','a')
571     'a', rec('200','b')
572     'c', rec('200','c')
573     );
574    
575 dpavlin 603 If you specify C<+> for subfield, value will be appended
576     to previous defined subfield.
577    
578 dpavlin 562 =cut
579    
580     sub marc_compose {
581     my $f = shift or die "marc_compose needs field";
582     die "marc_compose field must be numer" unless ($f =~ /^\d+$/);
583    
584     my ($i1,$i2) = defined($marc_indicators->{$f}) ? @{ $marc_indicators->{$f} } : (' ',' ');
585     my $m = [ $f, $i1, $i2 ];
586    
587 dpavlin 583 warn "### marc_compose input subfields = ", dump(@_),$/ if ($debug > 2);
588    
589 dpavlin 619 if ($#_ % 2 != 1) {
590     die "ERROR: marc_compose",dump($f,@_)," not valid (must be even).\nDo you need to add first() or join() around some argument?\n";
591     }
592    
593 dpavlin 562 while (@_) {
594 dpavlin 619 my $sf = shift;
595 dpavlin 565 my $v = shift;
596 dpavlin 562
597     next unless (defined($v) && $v !~ /^\s*$/);
598 dpavlin 568 warn "## ++ marc_compose($f,$sf,$v) ", dump( $m ),$/ if ($debug > 1);
599 dpavlin 603 if ($sf ne '+') {
600     push @$m, ( $sf, $v );
601     } else {
602     $m->[ $#$m ] .= $v;
603     }
604 dpavlin 562 }
605    
606 dpavlin 586 warn "## marc_compose current marc = ", dump( $m ),$/ if ($debug > 1);
607 dpavlin 562
608 dpavlin 574 push @{ $marc_record->[ $marc_record_offset ] }, $m if ($#{$m} > 2);
609 dpavlin 562 }
610    
611 dpavlin 574 =head2 marc_duplicate
612 dpavlin 562
613 dpavlin 574 Generate copy of current MARC record and continue working on copy
614    
615     marc_duplicate();
616    
617     Copies can be accessed using C<< _get_marc_fields( fetch_next => 1 ) >> or
618     C<< _get_marc_fields( offset => 42 ) >>.
619    
620     =cut
621    
622     sub marc_duplicate {
623     my $m = $marc_record->[ -1 ];
624     die "can't duplicate record which isn't defined" unless ($m);
625     push @{ $marc_record }, dclone( $m );
626     warn "## marc_duplicate = ", dump(@$marc_record), $/ if ($debug > 1);
627     $marc_record_offset = $#{ $marc_record };
628     warn "## marc_record_offset = $marc_record_offset", $/ if ($debug > 1);
629     }
630    
631     =head2 marc_remove
632    
633     Remove some field or subfield from MARC record.
634    
635     marc_remove('200');
636     marc_remove('200','a');
637    
638     This will erase field C<200> or C<200^a> from current MARC record.
639    
640 dpavlin 786 marc_remove('*');
641    
642     Will remove all fields in current MARC record.
643    
644 dpavlin 574 This is useful after calling C<marc_duplicate> or on it's own (but, you
645     should probably just remove that subfield definition if you are not
646     using C<marc_duplicate>).
647    
648     FIXME: support fields < 10.
649    
650     =cut
651    
652     sub marc_remove {
653     my ($f, $sf) = @_;
654    
655     die "marc_remove needs record number" unless defined($f);
656    
657     my $marc = $marc_record->[ $marc_record_offset ];
658    
659     warn "### marc_remove before = ", dump( $marc ), $/ if ($debug > 2);
660    
661 dpavlin 786 if ($f eq '*') {
662    
663     delete( $marc_record->[ $marc_record_offset ] );
664 dpavlin 788 warn "## full marc_record = ", dump( @{ $marc_record }), $/ if ($debug > 1);
665 dpavlin 786
666     } else {
667    
668     my $i = 0;
669     foreach ( 0 .. $#{ $marc } ) {
670     last unless (defined $marc->[$i]);
671     warn "#### working on ",dump( @{ $marc->[$i] }), $/ if ($debug > 3);
672     if ($marc->[$i]->[0] eq $f) {
673     if (! defined $sf) {
674     # remove whole field
675     splice @$marc, $i, 1;
676     warn "#### slice \@\$marc, $i, 1 = ",dump( @{ $marc }), $/ if ($debug > 3);
677     $i--;
678     } else {
679     foreach my $j ( 0 .. (( $#{ $marc->[$i] } - 3 ) / 2) ) {
680     my $o = ($j * 2) + 3;
681     if ($marc->[$i]->[$o] eq $sf) {
682     # remove subfield
683     splice @{$marc->[$i]}, $o, 2;
684     warn "#### slice \@{\$marc->[$i]}, $o, 2 = ", dump( @{ $marc }), $/ if ($debug > 3);
685     # is record now empty?
686     if ($#{ $marc->[$i] } == 2) {
687     splice @$marc, $i, 1;
688     warn "#### slice \@\$marc, $i, 1 = ", dump( @{ $marc }), $/ if ($debug > 3);
689     $i--;
690     };
691     }
692 dpavlin 574 }
693     }
694     }
695 dpavlin 786 $i++;
696 dpavlin 574 }
697 dpavlin 786
698     warn "### marc_remove($f", $sf ? ",$sf" : "", ") after = ", dump( $marc ), $/ if ($debug > 2);
699    
700     $marc_record->[ $marc_record_offset ] = $marc;
701 dpavlin 574 }
702    
703     warn "## full marc_record = ", dump( @{ $marc_record }), $/ if ($debug > 1);
704     }
705    
706 dpavlin 604 =head2 marc_original_order
707    
708     Copy all subfields preserving original order to marc field.
709    
710 dpavlin 616 marc_original_order( marc_field_number, original_input_field_number );
711 dpavlin 604
712 dpavlin 616 Please note that field numbers are consistent with other commands (marc
713     field number first), but somewhat counter-intuitive (destination and then
714     source).
715    
716 dpavlin 604 You might want to use this command if you are just renaming subfields or
717     using pre-processing modify_record in C<config.yml> and don't need any
718     post-processing or want to preserve order of original subfields.
719    
720 dpavlin 616
721 dpavlin 604 =cut
722    
723     sub marc_original_order {
724    
725 dpavlin 616 my ($to, $from) = @_;
726 dpavlin 604 die "marc_original_order needs from and to fields\n" unless ($from && $to);
727    
728 dpavlin 616 return unless defined($rec->{$from});
729    
730     my $r = $rec->{$from};
731 dpavlin 604 die "record field $from isn't array\n" unless (ref($r) eq 'ARRAY');
732    
733     my ($i1,$i2) = defined($marc_indicators->{$to}) ? @{ $marc_indicators->{$to} } : (' ',' ');
734 dpavlin 616 warn "## marc_original_order($to,$from) source = ", dump( $r ),$/ if ($debug > 1);
735 dpavlin 604
736     foreach my $d (@$r) {
737    
738 dpavlin 605 if (! defined($d->{subfields}) && ref($d->{subfields}) ne 'ARRAY') {
739 dpavlin 616 warn "# marc_original_order($to,$from): field $from doesn't have subfields specification\n";
740 dpavlin 605 next;
741     }
742    
743 dpavlin 604 my @sfs = @{ $d->{subfields} };
744    
745     die "field $from doesn't have even number of subfields specifications\n" unless($#sfs % 2 == 1);
746    
747 dpavlin 618 warn "#--> d: ",dump($d), "\n#--> sfs: ",dump(@sfs),$/ if ($debug > 2);
748 dpavlin 604
749     my $m = [ $to, $i1, $i2 ];
750    
751     while (my $sf = shift @sfs) {
752 dpavlin 618
753     warn "#--> sf: ",dump($sf), $/ if ($debug > 2);
754 dpavlin 604 my $offset = shift @sfs;
755     die "corrupted sufields specification for field $from\n" unless defined($offset);
756    
757     my $v;
758     if (ref($d->{$sf}) eq 'ARRAY') {
759     $v = $d->{$sf}->[$offset] if (defined($d->{$sf}->[$offset]));
760     } elsif ($offset == 0) {
761     $v = $d->{$sf};
762     } else {
763     die "field $from subfield '$sf' need occurence $offset which doesn't exist", dump($d->{$sf});
764     }
765     push @$m, ( $sf, $v ) if (defined($v));
766     }
767    
768     if ($#{$m} > 2) {
769     push @{ $marc_record->[ $marc_record_offset ] }, $m;
770     }
771     }
772    
773     warn "## marc_record = ", dump( $marc_record ),$/ if ($debug > 1);
774     }
775    
776    
777 dpavlin 540 =head1 Functions to extract data from input
778    
779     This function should be used inside functions to create C<data_structure> described
780     above.
781    
782 dpavlin 641 =head2 _pack_subfields_hash
783    
784 dpavlin 669 @subfields = _pack_subfields_hash( $h );
785     $subfields = _pack_subfields_hash( $h, 1 );
786 dpavlin 641
787 dpavlin 669 Return each subfield value in array or pack them all together and return scalar
788     with subfields (denoted by C<^>) and values.
789    
790 dpavlin 641 =cut
791    
792     sub _pack_subfields_hash {
793    
794 dpavlin 642 warn "## _pack_subfields_hash( ",dump(@_), " )\n" if ($debug > 1);
795 dpavlin 641
796     my ($h,$include_subfields) = @_;
797    
798     if ( defined($h->{subfields}) ) {
799     my $sfs = delete $h->{subfields} || die "no subfields?";
800     my @out;
801     while (@$sfs) {
802     my $sf = shift @$sfs;
803     push @out, '^' . $sf if ($include_subfields);
804     my $o = shift @$sfs;
805     if ($o == 0 && ref( $h->{$sf} ) ne 'ARRAY' ) {
806     # single element subfields are not arrays
807 dpavlin 669 #warn "====> $sf $o / $#$sfs ", dump( $sfs, $h->{$sf} ), "\n";
808 dpavlin 667
809 dpavlin 641 push @out, $h->{$sf};
810     } else {
811 dpavlin 669 #warn "====> $sf $o / $#$sfs ", dump( $sfs, $h->{$sf} ), "\n";
812 dpavlin 641 push @out, $h->{$sf}->[$o];
813     }
814     }
815 dpavlin 667 if ($include_subfields) {
816     return join('', @out);
817     } else {
818     return @out;
819     }
820 dpavlin 641 } else {
821 dpavlin 667 if ($include_subfields) {
822     my $out = '';
823 dpavlin 668 foreach my $sf (sort keys %$h) {
824 dpavlin 667 if (ref($h->{$sf}) eq 'ARRAY') {
825     $out .= '^' . $sf . join('^' . $sf, @{ $h->{$sf} });
826     } else {
827     $out .= '^' . $sf . $h->{$sf};
828     }
829     }
830     return $out;
831     } else {
832     # FIXME this should probably be in alphabetical order instead of hash order
833     values %{$h};
834     }
835 dpavlin 641 }
836     }
837    
838 dpavlin 536 =head2 rec1
839 dpavlin 371
840 dpavlin 536 Return all values in some field
841 dpavlin 371
842 dpavlin 536 @v = rec1('200')
843 dpavlin 15
844 dpavlin 536 TODO: order of values is probably same as in source data, need to investigate that
845 dpavlin 15
846 dpavlin 536 =cut
847 dpavlin 15
848 dpavlin 536 sub rec1 {
849     my $f = shift;
850 dpavlin 571 warn "rec1($f) = ", dump( $rec->{$f} ), $/ if ($debug > 1);
851 dpavlin 536 return unless (defined($rec) && defined($rec->{$f}));
852 dpavlin 571 warn "rec1($f) = ", dump( $rec->{$f} ), $/ if ($debug > 1);
853 dpavlin 536 if (ref($rec->{$f}) eq 'ARRAY') {
854 dpavlin 641 my @out;
855     foreach my $h ( @{ $rec->{$f} } ) {
856     if (ref($h) eq 'HASH') {
857     push @out, ( _pack_subfields_hash( $h ) );
858 dpavlin 31 } else {
859 dpavlin 641 push @out, $h;
860 dpavlin 31 }
861 dpavlin 641 }
862     return @out;
863 dpavlin 536 } elsif( defined($rec->{$f}) ) {
864     return $rec->{$f};
865 dpavlin 15 }
866     }
867    
868 dpavlin 536 =head2 rec2
869 dpavlin 15
870 dpavlin 536 Return all values in specific field and subfield
871 dpavlin 13
872 dpavlin 536 @v = rec2('200','a')
873 dpavlin 13
874     =cut
875    
876 dpavlin 536 sub rec2 {
877     my $f = shift;
878     return unless (defined($rec && $rec->{$f}));
879     my $sf = shift;
880 dpavlin 601 warn "rec2($f,$sf) = ", dump( $rec->{$f} ), $/ if ($debug > 1);
881 dpavlin 589 return map {
882     if (ref($_->{$sf}) eq 'ARRAY') {
883     @{ $_->{$sf} };
884     } else {
885     $_->{$sf};
886     }
887     } grep { ref($_) eq 'HASH' && $_->{$sf} } @{ $rec->{$f} };
888 dpavlin 536 }
889 dpavlin 13
890 dpavlin 536 =head2 rec
891 dpavlin 13
892 dpavlin 536 syntaxtic sugar for
893 dpavlin 13
894 dpavlin 536 @v = rec('200')
895     @v = rec('200','a')
896 dpavlin 13
897 dpavlin 750 If rec() returns just single value, it will
898     return scalar, not array.
899    
900 dpavlin 536 =cut
901 dpavlin 373
902 dpavlin 536 sub rec {
903 dpavlin 583 my @out;
904 dpavlin 536 if ($#_ == 0) {
905 dpavlin 583 @out = rec1(@_);
906 dpavlin 536 } elsif ($#_ == 1) {
907 dpavlin 583 @out = rec2(@_);
908 dpavlin 13 }
909 dpavlin 750 if ($#out == 0 && ! wantarray) {
910     return $out[0];
911     } elsif (@out) {
912 dpavlin 583 return @out;
913     } else {
914     return '';
915     }
916 dpavlin 13 }
917    
918 dpavlin 536 =head2 regex
919 dpavlin 15
920 dpavlin 536 Apply regex to some or all values
921 dpavlin 15
922 dpavlin 536 @v = regex( 's/foo/bar/g', @v );
923 dpavlin 15
924     =cut
925    
926 dpavlin 536 sub regex {
927     my $r = shift;
928     my @out;
929 dpavlin 550 #warn "r: $r\n", dump(\@_);
930 dpavlin 536 foreach my $t (@_) {
931     next unless ($t);
932     eval "\$t =~ $r";
933     push @out, $t if ($t && $t ne '');
934 dpavlin 368 }
935 dpavlin 536 return @out;
936 dpavlin 15 }
937    
938 dpavlin 536 =head2 prefix
939 dpavlin 15
940 dpavlin 536 Prefix all values with a string
941 dpavlin 15
942 dpavlin 536 @v = prefix( 'my_', @v );
943 dpavlin 15
944     =cut
945    
946 dpavlin 536 sub prefix {
947 dpavlin 592 my $p = shift or return;
948 dpavlin 536 return map { $p . $_ } grep { defined($_) } @_;
949     }
950 dpavlin 15
951 dpavlin 536 =head2 suffix
952 dpavlin 15
953 dpavlin 536 suffix all values with a string
954 dpavlin 15
955 dpavlin 536 @v = suffix( '_my', @v );
956 dpavlin 15
957 dpavlin 536 =cut
958 dpavlin 15
959 dpavlin 536 sub suffix {
960     my $s = shift or die "suffix needs string as first argument";
961     return map { $_ . $s } grep { defined($_) } @_;
962 dpavlin 15 }
963    
964 dpavlin 536 =head2 surround
965 dpavlin 13
966 dpavlin 536 surround all values with a two strings
967 dpavlin 13
968 dpavlin 536 @v = surround( 'prefix_', '_suffix', @v );
969 dpavlin 13
970     =cut
971    
972 dpavlin 536 sub surround {
973     my $p = shift or die "surround need prefix as first argument";
974     my $s = shift or die "surround needs suffix as second argument";
975     return map { $p . $_ . $s } grep { defined($_) } @_;
976 dpavlin 13 }
977    
978 dpavlin 536 =head2 first
979 dpavlin 13
980 dpavlin 536 Return first element
981 dpavlin 15
982 dpavlin 536 $v = first( @v );
983 dpavlin 13
984     =cut
985    
986 dpavlin 536 sub first {
987     my $r = shift;
988     return $r;
989 dpavlin 13 }
990    
991 dpavlin 536 =head2 lookup
992 dpavlin 13
993 dpavlin 536 Consult lookup hashes for some value
994 dpavlin 13
995 dpavlin 725 @v = lookup(
996     sub {
997     'ffkk/peri/mfn'.rec('000')
998     },
999     'ffkk','peri','200-a-200-e',
1000     sub {
1001     first(rec(200,'a')).' '.first(rec('200','e'))
1002     }
1003     );
1004 dpavlin 13
1005 dpavlin 725 Code like above will be B<automatically generated> using L<WebPAC::Parse> from
1006     normal lookup definition in C<conf/lookup/something.pl> which looks like:
1007 dpavlin 707
1008 dpavlin 725 lookup(
1009     # which results to return from record recorded in lookup
1010     sub { 'ffkk/peri/mfn' . rec('000') },
1011     # from which database and input
1012     'ffkk','peri',
1013     # such that following values match
1014     sub { first(rec(200,'a')) . ' ' . first(rec('200','e')) },
1015     # if this part is missing, we will try to match same fields
1016     # from lookup record and current one, or you can override
1017     # which records to use from current record using
1018     sub { rec('900','x') . ' ' . rec('900','y') },
1019     )
1020    
1021     You can think about this lookup as SQL (if that helps):
1022    
1023     select
1024     sub { what }
1025     from
1026     database, input
1027     where
1028     sub { filter from lookuped record }
1029     having
1030     sub { optional filter on current record }
1031    
1032     Easy as pie, right?
1033    
1034 dpavlin 13 =cut
1035    
1036 dpavlin 536 sub lookup {
1037 dpavlin 725 my ($what, $database, $input, $key, $having) = @_;
1038    
1039 dpavlin 766 confess "lookup needs 5 arguments: what, database, input, key, having\n" unless ($#_ == 4);
1040 dpavlin 725
1041 dpavlin 729 warn "## lookup ($database, $input, $key)", $/ if ($debug > 1);
1042 dpavlin 725 return unless (defined($lookup->{$database}->{$input}->{$key}));
1043    
1044 dpavlin 736 confess "lookup really need load_row_coderef added to data_structure\n" unless ($load_row_coderef);
1045 dpavlin 725
1046     my $mfns;
1047     my @having = $having->();
1048    
1049 dpavlin 729 warn "## having = ", dump( @having ) if ($debug > 2);
1050 dpavlin 725
1051     foreach my $h ( @having ) {
1052     if (defined($lookup->{$database}->{$input}->{$key}->{$h})) {
1053 dpavlin 752 warn "lookup for $database/$input/$key/$h return ",dump($lookup->{$database}->{$input}->{$key}->{$h}),"\n" if ($debug);
1054 dpavlin 725 $mfns->{$_}++ foreach keys %{ $lookup->{$database}->{$input}->{$key}->{$h} };
1055     }
1056 dpavlin 536 }
1057 dpavlin 725
1058     return unless ($mfns);
1059    
1060     my @mfns = sort keys %$mfns;
1061    
1062 dpavlin 750 warn "# lookup loading $database/$input/$key mfn ", join(",",@mfns)," having ",dump(@having),"\n" if ($debug);
1063 dpavlin 725
1064     my $old_rec = $rec;
1065     my @out;
1066    
1067     foreach my $mfn (@mfns) {
1068 dpavlin 736 $rec = $load_row_coderef->( $database, $input, $mfn );
1069 dpavlin 725
1070 dpavlin 752 warn "got $database/$input/$mfn = ", dump($rec), $/ if ($debug);
1071 dpavlin 725
1072     my @vals = $what->();
1073    
1074     push @out, ( @vals );
1075    
1076 dpavlin 752 warn "lookup for mfn $mfn returned ", dump(@vals), $/ if ($debug);
1077 dpavlin 725 }
1078    
1079     # if (ref($lookup->{$k}) eq 'ARRAY') {
1080     # return @{ $lookup->{$k} };
1081     # } else {
1082     # return $lookup->{$k};
1083     # }
1084    
1085     $rec = $old_rec;
1086    
1087 dpavlin 750 warn "## lookup returns = ", dump(@out), $/ if ($debug);
1088 dpavlin 725
1089 dpavlin 740 if ($#out == 0) {
1090     return $out[0];
1091     } else {
1092     return @out;
1093     }
1094 dpavlin 13 }
1095    
1096 dpavlin 707 =head2 save_into_lookup
1097    
1098 dpavlin 725 Save value into lookup. It associates current database, input
1099     and specific keys with one or more values which will be
1100     associated over MFN.
1101 dpavlin 707
1102 dpavlin 725 MFN will be extracted from first occurence current of field 000
1103     in current record, or if it doesn't exist from L<_set_config> C<_mfn>.
1104    
1105     my $nr = save_into_lookup($database,$input,$key,sub {
1106 dpavlin 707 # code which produce one or more values
1107     });
1108    
1109 dpavlin 725 It returns number of items saved.
1110 dpavlin 707
1111 dpavlin 725 This function shouldn't be called directly, it's called from code created by
1112     L<WebPAC::Parser>.
1113    
1114 dpavlin 707 =cut
1115    
1116     sub save_into_lookup {
1117 dpavlin 712 my ($database,$input,$key,$coderef) = @_;
1118     die "save_into_lookup needs database" unless defined($database);
1119     die "save_into_lookup needs input" unless defined($input);
1120     die "save_into_lookup needs key" unless defined($key);
1121 dpavlin 707 die "save_into_lookup needs CODE" unless ( defined($coderef) && ref($coderef) eq 'CODE' );
1122 dpavlin 725
1123 dpavlin 729 warn "## save_into_lookup rec = ", dump($rec), " config = ", dump($config), $/ if ($debug > 2);
1124 dpavlin 725
1125     my $mfn =
1126     defined($rec->{'000'}->[0]) ? $rec->{'000'}->[0] :
1127     defined($config->{_mfn}) ? $config->{_mfn} :
1128     die "mfn not defined or zero";
1129    
1130     my $nr = 0;
1131    
1132 dpavlin 707 foreach my $v ( $coderef->() ) {
1133 dpavlin 712 $lookup->{$database}->{$input}->{$key}->{$v}->{$mfn}++;
1134 dpavlin 721 warn "# saved lookup $database/$input/$key [$v] $mfn\n" if ($debug > 1);
1135 dpavlin 725 $nr++;
1136 dpavlin 707 }
1137 dpavlin 725
1138     return $nr;
1139 dpavlin 707 }
1140    
1141 dpavlin 595 =head2 config
1142    
1143     Consult config values stored in C<config.yml>
1144    
1145     # return database code (key under databases in yaml)
1146     $database_code = config(); # use _ from hash
1147     $database_name = config('name');
1148     $database_input_name = config('input name');
1149     $tag = config('input normalize tag');
1150    
1151     Up to three levels are supported.
1152    
1153     =cut
1154    
1155     sub config {
1156     return unless ($config);
1157    
1158     my $p = shift;
1159    
1160     $p ||= '';
1161    
1162     my $v;
1163    
1164     warn "### getting config($p)\n" if ($debug > 1);
1165    
1166     my @p = split(/\s+/,$p);
1167     if ($#p < 0) {
1168     $v = $config->{ '_' }; # special, database code
1169     } else {
1170    
1171     my $c = dclone( $config );
1172    
1173     foreach my $k (@p) {
1174     warn "### k: $k c = ",dump($c),$/ if ($debug > 1);
1175     if (ref($c) eq 'ARRAY') {
1176     $c = shift @$c;
1177     warn "config($p) taking first occurence of '$k', probably not what you wanted!\n";
1178     last;
1179     }
1180    
1181     if (! defined($c->{$k}) ) {
1182     $c = undef;
1183     last;
1184     } else {
1185     $c = $c->{$k};
1186     }
1187     }
1188     $v = $c if ($c);
1189    
1190     }
1191    
1192     warn "## config( '$p' ) = ",dump( $v ),$/ if ($v && $debug);
1193     warn "config( '$p' ) is empty\n" if (! $v);
1194    
1195     return $v;
1196     }
1197    
1198     =head2 id
1199    
1200     Returns unique id of this record
1201    
1202     $id = id();
1203    
1204     Returns C<42/2> for 2nd occurence of MFN 42.
1205    
1206     =cut
1207    
1208     sub id {
1209     my $mfn = $config->{_mfn} || die "no _mfn in config data";
1210     return $mfn . $#{$marc_record} ? $#{$marc_record} + 1 : '';
1211     }
1212    
1213 dpavlin 536 =head2 join_with
1214 dpavlin 13
1215 dpavlin 536 Joins walues with some delimiter
1216 dpavlin 10
1217 dpavlin 536 $v = join_with(", ", @v);
1218 dpavlin 10
1219 dpavlin 536 =cut
1220 dpavlin 10
1221 dpavlin 536 sub join_with {
1222     my $d = shift;
1223 dpavlin 586 warn "### join_with('$d',",dump(@_),")\n" if ($debug > 2);
1224 dpavlin 583 my $v = join($d, grep { defined($_) && $_ ne '' } @_);
1225     return '' unless defined($v);
1226     return $v;
1227 dpavlin 536 }
1228 dpavlin 10
1229 dpavlin 562 =head2 split_rec_on
1230    
1231     Split record subfield on some regex and take one of parts out
1232    
1233     $a_before_semi_column =
1234     split_rec_on('200','a', /\s*;\s*/, $part);
1235    
1236     C<$part> is optional number of element. First element is
1237     B<1>, not 0!
1238    
1239     If there is no C<$part> parameter or C<$part> is 0, this function will
1240     return all values produced by splitting.
1241    
1242     =cut
1243    
1244     sub split_rec_on {
1245     die "split_rec_on need (fld,sf,regex[,part]" if ($#_ < 2);
1246    
1247     my ($fld, $sf, $regex, $part) = @_;
1248 dpavlin 568 warn "### regex ", ref($regex), $regex, $/ if ($debug > 2);
1249 dpavlin 562
1250     my @r = rec( $fld, $sf );
1251     my $v = shift @r;
1252 dpavlin 568 warn "### first rec($fld,$sf) = ",dump($v),$/ if ($debug > 2);
1253 dpavlin 562
1254 dpavlin 604 return '' if ( ! defined($v) || $v =~ /^\s*$/);
1255 dpavlin 566
1256 dpavlin 562 my @s = split( $regex, $v );
1257 dpavlin 568 warn "## split_rec_on($fld,$sf,$regex,$part) = ",dump(@s),$/ if ($debug > 1);
1258 dpavlin 566 if ($part && $part > 0) {
1259 dpavlin 562 return $s[ $part - 1 ];
1260     } else {
1261 dpavlin 571 return @s;
1262 dpavlin 562 }
1263     }
1264    
1265 dpavlin 785 my $hash;
1266    
1267     =head2 set
1268    
1269     set( key => 'value' );
1270    
1271     =cut
1272    
1273     sub set {
1274     my ($k,$v) = @_;
1275     warn "## set ( $k => ", dump($v), " )", $/;
1276     $hash->{$k} = $v;
1277     };
1278    
1279     =head2 get
1280    
1281     get( 'key' );
1282    
1283     =cut
1284    
1285     sub get {
1286     my $k = shift || return;
1287     my $v = $hash->{$k};
1288     warn "## get $k = ", dump( $v ), $/;
1289     return $v;
1290     }
1291    
1292    
1293 dpavlin 536 # END
1294     1;

  ViewVC Help
Powered by ViewVC 1.1.26