/[webpac2]/trunk/lib/WebPAC/Normalize.pm
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Annotation of /trunk/lib/WebPAC/Normalize.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 810 - (hide annotations)
Sun Apr 1 21:47:38 2007 UTC (17 years ago) by dpavlin
File size: 28470 byte(s)
 r1178@llin:  dpavlin | 2007-04-01 22:39:11 +0200
 cleanup debug output

1 dpavlin 10 package WebPAC::Normalize;
2 dpavlin 536 use Exporter 'import';
3     @EXPORT = qw/
4 dpavlin 538 _set_rec _set_lookup
5 dpavlin 736 _set_load_row
6 dpavlin 538 _get_ds _clean_ds
7 dpavlin 554 _debug
8 dpavlin 641 _pack_subfields_hash
9 dpavlin 538
10 dpavlin 536 tag search display
11 dpavlin 547 marc marc_indicators marc_repeatable_subfield
12 dpavlin 564 marc_compose marc_leader
13 dpavlin 574 marc_duplicate marc_remove
14 dpavlin 604 marc_original_order
15 dpavlin 540
16 dpavlin 536 rec1 rec2 rec
17     regex prefix suffix surround
18     first lookup join_with
19 dpavlin 707 save_into_lookup
20 dpavlin 562
21     split_rec_on
22 dpavlin 785
23     get set
24 dpavlin 791 count
25 dpavlin 536 /;
26 dpavlin 10
27     use warnings;
28     use strict;
29 dpavlin 536
30     #use base qw/WebPAC::Common/;
31 dpavlin 550 use Data::Dump qw/dump/;
32 dpavlin 574 use Storable qw/dclone/;
33 dpavlin 725 use Carp qw/confess/;
34 dpavlin 10
35 dpavlin 550 # debugging warn(s)
36     my $debug = 0;
37    
38    
39 dpavlin 10 =head1 NAME
40    
41 dpavlin 536 WebPAC::Normalize - describe normalisaton rules using sets
42 dpavlin 10
43     =head1 VERSION
44    
45 dpavlin 791 Version 0.26
46 dpavlin 10
47     =cut
48    
49 dpavlin 791 our $VERSION = '0.26';
50 dpavlin 10
51     =head1 SYNOPSIS
52    
53 dpavlin 536 This module uses C<conf/normalize/*.pl> files to perform normalisation
54     from input records using perl functions which are specialized for set
55     processing.
56 dpavlin 10
57 dpavlin 536 Sets are implemented as arrays, and normalisation file is valid perl, which
58     means that you check it's validity before running WebPAC using
59     C<perl -c normalize.pl>.
60 dpavlin 15
61 dpavlin 536 Normalisation can generate multiple output normalized data. For now, supported output
62 dpavlin 540 types (on the left side of definition) are: C<tag>, C<display>, C<search> and
63 dpavlin 547 C<marc>.
64 dpavlin 15
65 dpavlin 10 =head1 FUNCTIONS
66    
67 dpavlin 538 Functions which start with C<_> are private and used by WebPAC internally.
68     All other functions are available for use within normalisation rules.
69    
70 dpavlin 536 =head2 data_structure
71 dpavlin 10
72 dpavlin 536 Return data structure
73 dpavlin 13
74 dpavlin 538 my $ds = WebPAC::Normalize::data_structure(
75 dpavlin 725 lookup => $lookup_hash,
76 dpavlin 536 row => $row,
77     rules => $normalize_pl_config,
78 dpavlin 541 marc_encoding => 'utf-8',
79 dpavlin 595 config => $config,
80 dpavlin 736 load_row_coderef => sub {
81 dpavlin 725 my ($database,$input,$mfn) = shift;
82 dpavlin 736 $store->load_row( database => $database, input => $input, id => $mfn );
83 dpavlin 725 },
84 dpavlin 13 );
85    
86 dpavlin 707 Options C<row>, C<rules> and C<log> are mandatory while all
87 dpavlin 540 other are optional.
88    
89 dpavlin 736 C<load_row_coderef> is closure only used when executing lookups, so they will
90 dpavlin 725 die if it's not defined.
91    
92 dpavlin 536 This function will B<die> if normalizastion can't be evaled.
93 dpavlin 15
94 dpavlin 538 Since this function isn't exported you have to call it with
95     C<WebPAC::Normalize::data_structure>.
96    
97 dpavlin 10 =cut
98    
99 dpavlin 736 my $load_row_coderef;
100 dpavlin 725
101 dpavlin 536 sub data_structure {
102     my $arg = {@_};
103 dpavlin 13
104 dpavlin 536 die "need row argument" unless ($arg->{row});
105     die "need normalisation argument" unless ($arg->{rules});
106 dpavlin 31
107 dpavlin 536 no strict 'subs';
108 dpavlin 730 _set_lookup( $arg->{lookup} ) if defined($arg->{lookup});
109 dpavlin 538 _set_rec( $arg->{row} );
110 dpavlin 730 _set_config( $arg->{config} ) if defined($arg->{config});
111 dpavlin 541 _clean_ds( %{ $arg } );
112 dpavlin 736 $load_row_coderef = $arg->{load_row_coderef};
113 dpavlin 725
114 dpavlin 536 eval "$arg->{rules}";
115     die "error evaling $arg->{rules}: $@\n" if ($@);
116 dpavlin 540
117 dpavlin 538 return _get_ds();
118 dpavlin 10 }
119    
120 dpavlin 538 =head2 _set_rec
121 dpavlin 13
122 dpavlin 536 Set current record hash
123 dpavlin 433
124 dpavlin 538 _set_rec( $rec );
125 dpavlin 433
126     =cut
127    
128 dpavlin 536 my $rec;
129 dpavlin 433
130 dpavlin 538 sub _set_rec {
131 dpavlin 536 $rec = shift or die "no record hash";
132 dpavlin 433 }
133    
134 dpavlin 595 =head2 _set_config
135    
136     Set current config hash
137    
138     _set_config( $config );
139    
140     Magic keys are:
141    
142     =over 4
143    
144     =item _
145    
146     Code of current database
147    
148     =item _mfn
149    
150     Current MFN
151    
152     =back
153    
154     =cut
155    
156     my $config;
157    
158     sub _set_config {
159     $config = shift;
160     }
161    
162 dpavlin 538 =head2 _get_ds
163    
164     Return hash formatted as data structure
165    
166     my $ds = _get_ds();
167    
168     =cut
169    
170 dpavlin 741 my ($out, $marc_record, $marc_encoding, $marc_repeatable_subfield, $marc_indicators, $leader);
171 dpavlin 574 my ($marc_record_offset, $marc_fetch_offset) = (0, 0);
172 dpavlin 538
173     sub _get_ds {
174     return $out;
175     }
176    
177     =head2 _clean_ds
178    
179     Clean data structure hash for next record
180    
181     _clean_ds();
182    
183     =cut
184    
185     sub _clean_ds {
186 dpavlin 541 my $a = {@_};
187 dpavlin 741 ($out,$marc_record, $marc_encoding, $marc_repeatable_subfield, $marc_indicators, $leader) = ();
188 dpavlin 574 ($marc_record_offset, $marc_fetch_offset) = (0,0);
189 dpavlin 541 $marc_encoding = $a->{marc_encoding};
190 dpavlin 538 }
191    
192     =head2 _set_lookup
193    
194     Set current lookup hash
195    
196     _set_lookup( $lookup );
197    
198     =cut
199    
200     my $lookup;
201    
202     sub _set_lookup {
203     $lookup = shift;
204     }
205    
206 dpavlin 707 =head2 _get_lookup
207    
208     Get current lookup hash
209    
210     my $lookup = _get_lookup();
211    
212     =cut
213    
214     sub _get_lookup {
215     return $lookup;
216     }
217    
218 dpavlin 736 =head2 _set_load_row
219 dpavlin 725
220     Setup code reference which will return L<data_structure> from
221     L<WebPAC::Store>
222    
223 dpavlin 736 _set_load_row(sub {
224 dpavlin 725 my ($database,$input,$mfn) = @_;
225 dpavlin 736 $store->load_row( database => $database, input => $input, id => $mfn );
226 dpavlin 725 });
227    
228     =cut
229    
230 dpavlin 736 sub _set_load_row {
231 dpavlin 725 my $coderef = shift;
232     confess "argument isn't CODE" unless ref($coderef) eq 'CODE';
233    
234 dpavlin 736 $load_row_coderef = $coderef;
235 dpavlin 725 }
236    
237 dpavlin 547 =head2 _get_marc_fields
238 dpavlin 540
239 dpavlin 547 Get all fields defined by calls to C<marc>
240 dpavlin 540
241 dpavlin 547 $marc->add_fields( WebPAC::Normalize:_get_marc_fields() );
242 dpavlin 540
243 dpavlin 543 We are using I<magic> which detect repeatable fields only from
244     sequence of field/subfield data generated by normalization.
245    
246 dpavlin 554 Repeatable field is created when there is second occurence of same subfield or
247     if any of indicators are different.
248 dpavlin 543
249 dpavlin 554 This is sane for most cases. Something like:
250 dpavlin 543
251 dpavlin 554 900a-1 900b-1 900c-1
252     900a-2 900b-2
253     900a-3
254    
255     will be created from any combination of:
256    
257     900a-1 900a-2 900a-3 900b-1 900b-2 900c-1
258    
259     and following rules:
260    
261     marc('900','a', rec('200','a') );
262     marc('900','b', rec('200','b') );
263     marc('900','c', rec('200','c') );
264    
265     which might not be what you have in mind. If you need repeatable subfield,
266     define it using C<marc_repeatable_subfield> like this:
267    
268 dpavlin 574 marc_repeatable_subfield('900','a');
269     marc('900','a', rec('200','a') );
270     marc('900','b', rec('200','b') );
271     marc('900','c', rec('200','c') );
272 dpavlin 554
273 dpavlin 574 will create:
274    
275     900a-1 900a-2 900a-3 900b-1 900c-1
276     900b-2
277    
278     There is also support for returning next or specific using:
279    
280     while (my $mf = WebPAC::Normalize:_get_marc_fields( fetch_next => 1 ) ) {
281     # do something with $mf
282     }
283    
284     will always return fields from next MARC record or
285    
286     my $mf = WebPAC::Normalize::_get_marc_fields( offset => 42 );
287    
288     will return 42th copy record (if it exists).
289    
290 dpavlin 540 =cut
291    
292 dpavlin 547 sub _get_marc_fields {
293 dpavlin 550
294 dpavlin 574 my $arg = {@_};
295     warn "### _get_marc_fields arg: ", dump($arg), $/ if ($debug > 2);
296     my $offset = $marc_fetch_offset;
297     if ($arg->{offset}) {
298     $offset = $arg->{offset};
299     } elsif($arg->{fetch_next}) {
300     $marc_fetch_offset++;
301     }
302 dpavlin 550
303 dpavlin 574 return if (! $marc_record || ref($marc_record) ne 'ARRAY');
304    
305     warn "### full marc_record = ", dump( @{ $marc_record }), $/ if ($debug > 2);
306    
307     my $marc_rec = $marc_record->[ $offset ];
308    
309     warn "## _get_marc_fields (at offset: $offset) -- marc_record = ", dump( @$marc_rec ), $/ if ($debug > 1);
310    
311     return if (! $marc_rec || ref($marc_rec) ne 'ARRAY' || $#{ $marc_rec } < 0);
312    
313 dpavlin 550 # first, sort all existing fields
314     # XXX might not be needed, but modern perl might randomize elements in hash
315     my @sorted_marc_record = sort {
316 dpavlin 572 $a->[0] . ( $a->[3] || '' ) cmp $b->[0] . ( $b->[3] || '')
317 dpavlin 574 } @{ $marc_rec };
318 dpavlin 550
319 dpavlin 574 @sorted_marc_record = @{ $marc_rec }; ### FIXME disable sorting
320 dpavlin 562
321 dpavlin 550 # output marc fields
322 dpavlin 542 my @m;
323 dpavlin 550
324     # count unique field-subfields (used for offset when walking to next subfield)
325     my $u;
326 dpavlin 572 map { $u->{ $_->[0] . ( $_->[3] || '') }++ } @sorted_marc_record;
327 dpavlin 550
328     if ($debug) {
329 dpavlin 574 warn "## marc_repeatable_subfield = ", dump( $marc_repeatable_subfield ), $/ if ( $marc_repeatable_subfield );
330     warn "## marc_record[$offset] = ", dump( $marc_rec ), $/;
331     warn "## sorted_marc_record = ", dump( \@sorted_marc_record ), $/;
332     warn "## subfield count = ", dump( $u ), $/;
333 dpavlin 550 }
334    
335     my $len = $#sorted_marc_record;
336     my $visited;
337     my $i = 0;
338     my $field;
339    
340     foreach ( 0 .. $len ) {
341    
342     # find next element which isn't visited
343     while ($visited->{$i}) {
344     $i = ($i + 1) % ($len + 1);
345 dpavlin 542 }
346    
347 dpavlin 550 # mark it visited
348     $visited->{$i}++;
349    
350 dpavlin 574 my $row = dclone( $sorted_marc_record[$i] );
351 dpavlin 550
352     # field and subfield which is key for
353     # marc_repeatable_subfield and u
354 dpavlin 572 my $fsf = $row->[0] . ( $row->[3] || '' );
355 dpavlin 550
356     if ($debug > 1) {
357    
358     print "### field so far [", $#$field, "] : ", dump( $field ), " ", $field ? 'T' : 'F', $/;
359     print "### this [$i]: ", dump( $row ),$/;
360     print "### sf: ", $row->[3], " vs ", $field->[3],
361     $marc_repeatable_subfield->{ $row->[0] . $row->[3] } ? ' (repeatable)' : '', $/,
362     if ($#$field >= 0);
363    
364     }
365    
366     # if field exists
367     if ( $#$field >= 0 ) {
368     if (
369     $row->[0] ne $field->[0] || # field
370     $row->[1] ne $field->[1] || # i1
371     $row->[2] ne $field->[2] # i2
372     ) {
373     push @m, $field;
374     warn "## saved/1 ", dump( $field ),$/ if ($debug);
375     $field = $row;
376    
377     } elsif (
378     ( $row->[3] lt $field->[-2] ) # subfield which is not next (e.g. a after c)
379     ||
380     ( $row->[3] eq $field->[-2] && # same subfield, but not repeatable
381     ! $marc_repeatable_subfield->{ $fsf }
382     )
383     ) {
384     push @m, $field;
385     warn "## saved/2 ", dump( $field ),$/ if ($debug);
386     $field = $row;
387    
388     } else {
389     # append new subfields to existing field
390     push @$field, ( $row->[3], $row->[4] );
391     }
392     } else {
393     # insert first field
394     $field = $row;
395     }
396    
397     if (! $marc_repeatable_subfield->{ $fsf }) {
398     # make step to next subfield
399     $i = ($i + $u->{ $fsf } ) % ($len + 1);
400     }
401 dpavlin 542 }
402    
403 dpavlin 550 if ($#$field >= 0) {
404     push @m, $field;
405     warn "## saved/3 ", dump( $field ),$/ if ($debug);
406     }
407 dpavlin 542
408 dpavlin 579 return \@m;
409 dpavlin 540 }
410    
411 dpavlin 554 =head2 _debug
412    
413     Change level of debug warnings
414    
415     _debug( 2 );
416    
417     =cut
418    
419     sub _debug {
420     my $l = shift;
421     return $debug unless defined($l);
422 dpavlin 568 warn "debug level $l",$/ if ($l > 0);
423 dpavlin 554 $debug = $l;
424     }
425    
426 dpavlin 540 =head1 Functions to create C<data_structure>
427    
428     Those functions generally have to first in your normalization file.
429    
430 dpavlin 536 =head2 tag
431 dpavlin 433
432 dpavlin 536 Define new tag for I<search> and I<display>.
433 dpavlin 433
434 dpavlin 536 tag('Title', rec('200','a') );
435 dpavlin 13
436    
437     =cut
438    
439 dpavlin 536 sub tag {
440     my $name = shift or die "tag needs name as first argument";
441     my @o = grep { defined($_) && $_ ne '' } @_;
442     return unless (@o);
443     $out->{$name}->{tag} = $name;
444     $out->{$name}->{search} = \@o;
445     $out->{$name}->{display} = \@o;
446     }
447 dpavlin 13
448 dpavlin 536 =head2 display
449 dpavlin 13
450 dpavlin 536 Define tag just for I<display>
451 dpavlin 125
452 dpavlin 536 @v = display('Title', rec('200','a') );
453 dpavlin 125
454 dpavlin 536 =cut
455 dpavlin 125
456 dpavlin 536 sub display {
457     my $name = shift or die "display needs name as first argument";
458     my @o = grep { defined($_) && $_ ne '' } @_;
459     return unless (@o);
460     $out->{$name}->{tag} = $name;
461     $out->{$name}->{display} = \@o;
462     }
463 dpavlin 13
464 dpavlin 536 =head2 search
465 dpavlin 13
466 dpavlin 536 Prepare values just for I<search>
467 dpavlin 13
468 dpavlin 536 @v = search('Title', rec('200','a') );
469 dpavlin 433
470 dpavlin 536 =cut
471 dpavlin 13
472 dpavlin 536 sub search {
473     my $name = shift or die "search needs name as first argument";
474     my @o = grep { defined($_) && $_ ne '' } @_;
475     return unless (@o);
476     $out->{$name}->{tag} = $name;
477     $out->{$name}->{search} = \@o;
478 dpavlin 13 }
479    
480 dpavlin 564 =head2 marc_leader
481    
482     Setup fields within MARC leader or get leader
483    
484     marc_leader('05','c');
485     my $leader = marc_leader();
486    
487     =cut
488    
489     sub marc_leader {
490     my ($offset,$value) = @_;
491    
492     if ($offset) {
493 dpavlin 741 $leader->{ $offset } = $value;
494 dpavlin 564 } else {
495 dpavlin 741 return $leader;
496 dpavlin 564 }
497     }
498    
499 dpavlin 547 =head2 marc
500 dpavlin 540
501     Save value for MARC field
502    
503 dpavlin 547 marc('900','a', rec('200','a') );
504 dpavlin 571 marc('001', rec('000') );
505 dpavlin 540
506     =cut
507    
508 dpavlin 547 sub marc {
509     my $f = shift or die "marc needs field";
510     die "marc field must be numer" unless ($f =~ /^\d+$/);
511 dpavlin 540
512 dpavlin 571 my $sf;
513     if ($f >= 10) {
514     $sf = shift or die "marc needs subfield";
515     }
516 dpavlin 540
517 dpavlin 541 foreach (@_) {
518     my $v = $_; # make var read-write for Encode
519 dpavlin 543 next unless (defined($v) && $v !~ /^\s*$/);
520 dpavlin 548 my ($i1,$i2) = defined($marc_indicators->{$f}) ? @{ $marc_indicators->{$f} } : (' ',' ');
521 dpavlin 571 if (defined $sf) {
522 dpavlin 574 push @{ $marc_record->[ $marc_record_offset ] }, [ $f, $i1, $i2, $sf => $v ];
523 dpavlin 571 } else {
524 dpavlin 574 push @{ $marc_record->[ $marc_record_offset ] }, [ $f, $v ];
525 dpavlin 571 }
526 dpavlin 540 }
527     }
528    
529 dpavlin 547 =head2 marc_repeatable_subfield
530    
531     Save values for MARC repetable subfield
532    
533     marc_repeatable_subfield('910', 'z', rec('909') );
534    
535     =cut
536    
537     sub marc_repeatable_subfield {
538 dpavlin 550 my ($f,$sf) = @_;
539     die "marc_repeatable_subfield need field and subfield!\n" unless ($f && $sf);
540     $marc_repeatable_subfield->{ $f . $sf }++;
541 dpavlin 547 marc(@_);
542     }
543    
544     =head2 marc_indicators
545    
546     Set both indicators for MARC field
547    
548     marc_indicators('900', ' ', 1);
549    
550     Any indicator value other than C<0-9> will be treated as undefined.
551    
552     =cut
553    
554     sub marc_indicators {
555     my $f = shift || die "marc_indicators need field!\n";
556     my ($i1,$i2) = @_;
557     die "marc_indicators($f, ...) need i1!\n" unless(defined($i1));
558     die "marc_indicators($f, $i1, ...) need i2!\n" unless(defined($i2));
559    
560     $i1 = ' ' if ($i1 !~ /^\d$/);
561     $i2 = ' ' if ($i2 !~ /^\d$/);
562 dpavlin 548 @{ $marc_indicators->{$f} } = ($i1,$i2);
563 dpavlin 547 }
564    
565 dpavlin 562 =head2 marc_compose
566 dpavlin 547
567 dpavlin 562 Save values for each MARC subfield explicitly
568    
569     marc_compose('900',
570     'a', rec('200','a')
571     'b', rec('201','a')
572     'a', rec('200','b')
573     'c', rec('200','c')
574     );
575    
576 dpavlin 603 If you specify C<+> for subfield, value will be appended
577     to previous defined subfield.
578    
579 dpavlin 562 =cut
580    
581     sub marc_compose {
582     my $f = shift or die "marc_compose needs field";
583     die "marc_compose field must be numer" unless ($f =~ /^\d+$/);
584    
585     my ($i1,$i2) = defined($marc_indicators->{$f}) ? @{ $marc_indicators->{$f} } : (' ',' ');
586     my $m = [ $f, $i1, $i2 ];
587    
588 dpavlin 583 warn "### marc_compose input subfields = ", dump(@_),$/ if ($debug > 2);
589    
590 dpavlin 619 if ($#_ % 2 != 1) {
591     die "ERROR: marc_compose",dump($f,@_)," not valid (must be even).\nDo you need to add first() or join() around some argument?\n";
592     }
593    
594 dpavlin 562 while (@_) {
595 dpavlin 619 my $sf = shift;
596 dpavlin 565 my $v = shift;
597 dpavlin 562
598     next unless (defined($v) && $v !~ /^\s*$/);
599 dpavlin 568 warn "## ++ marc_compose($f,$sf,$v) ", dump( $m ),$/ if ($debug > 1);
600 dpavlin 603 if ($sf ne '+') {
601     push @$m, ( $sf, $v );
602     } else {
603     $m->[ $#$m ] .= $v;
604     }
605 dpavlin 562 }
606    
607 dpavlin 586 warn "## marc_compose current marc = ", dump( $m ),$/ if ($debug > 1);
608 dpavlin 562
609 dpavlin 574 push @{ $marc_record->[ $marc_record_offset ] }, $m if ($#{$m} > 2);
610 dpavlin 562 }
611    
612 dpavlin 574 =head2 marc_duplicate
613 dpavlin 562
614 dpavlin 574 Generate copy of current MARC record and continue working on copy
615    
616     marc_duplicate();
617    
618     Copies can be accessed using C<< _get_marc_fields( fetch_next => 1 ) >> or
619     C<< _get_marc_fields( offset => 42 ) >>.
620    
621     =cut
622    
623     sub marc_duplicate {
624     my $m = $marc_record->[ -1 ];
625     die "can't duplicate record which isn't defined" unless ($m);
626     push @{ $marc_record }, dclone( $m );
627     warn "## marc_duplicate = ", dump(@$marc_record), $/ if ($debug > 1);
628     $marc_record_offset = $#{ $marc_record };
629     warn "## marc_record_offset = $marc_record_offset", $/ if ($debug > 1);
630     }
631    
632     =head2 marc_remove
633    
634     Remove some field or subfield from MARC record.
635    
636     marc_remove('200');
637     marc_remove('200','a');
638    
639     This will erase field C<200> or C<200^a> from current MARC record.
640    
641 dpavlin 786 marc_remove('*');
642    
643     Will remove all fields in current MARC record.
644    
645 dpavlin 574 This is useful after calling C<marc_duplicate> or on it's own (but, you
646     should probably just remove that subfield definition if you are not
647     using C<marc_duplicate>).
648    
649     FIXME: support fields < 10.
650    
651     =cut
652    
653     sub marc_remove {
654     my ($f, $sf) = @_;
655    
656     die "marc_remove needs record number" unless defined($f);
657    
658     my $marc = $marc_record->[ $marc_record_offset ];
659    
660     warn "### marc_remove before = ", dump( $marc ), $/ if ($debug > 2);
661    
662 dpavlin 786 if ($f eq '*') {
663    
664     delete( $marc_record->[ $marc_record_offset ] );
665 dpavlin 788 warn "## full marc_record = ", dump( @{ $marc_record }), $/ if ($debug > 1);
666 dpavlin 786
667     } else {
668    
669     my $i = 0;
670     foreach ( 0 .. $#{ $marc } ) {
671     last unless (defined $marc->[$i]);
672     warn "#### working on ",dump( @{ $marc->[$i] }), $/ if ($debug > 3);
673     if ($marc->[$i]->[0] eq $f) {
674     if (! defined $sf) {
675     # remove whole field
676     splice @$marc, $i, 1;
677     warn "#### slice \@\$marc, $i, 1 = ",dump( @{ $marc }), $/ if ($debug > 3);
678     $i--;
679     } else {
680     foreach my $j ( 0 .. (( $#{ $marc->[$i] } - 3 ) / 2) ) {
681     my $o = ($j * 2) + 3;
682     if ($marc->[$i]->[$o] eq $sf) {
683     # remove subfield
684     splice @{$marc->[$i]}, $o, 2;
685     warn "#### slice \@{\$marc->[$i]}, $o, 2 = ", dump( @{ $marc }), $/ if ($debug > 3);
686     # is record now empty?
687     if ($#{ $marc->[$i] } == 2) {
688     splice @$marc, $i, 1;
689     warn "#### slice \@\$marc, $i, 1 = ", dump( @{ $marc }), $/ if ($debug > 3);
690     $i--;
691     };
692     }
693 dpavlin 574 }
694     }
695     }
696 dpavlin 786 $i++;
697 dpavlin 574 }
698 dpavlin 786
699     warn "### marc_remove($f", $sf ? ",$sf" : "", ") after = ", dump( $marc ), $/ if ($debug > 2);
700    
701     $marc_record->[ $marc_record_offset ] = $marc;
702 dpavlin 574 }
703    
704     warn "## full marc_record = ", dump( @{ $marc_record }), $/ if ($debug > 1);
705     }
706    
707 dpavlin 604 =head2 marc_original_order
708    
709     Copy all subfields preserving original order to marc field.
710    
711 dpavlin 616 marc_original_order( marc_field_number, original_input_field_number );
712 dpavlin 604
713 dpavlin 616 Please note that field numbers are consistent with other commands (marc
714     field number first), but somewhat counter-intuitive (destination and then
715     source).
716    
717 dpavlin 604 You might want to use this command if you are just renaming subfields or
718     using pre-processing modify_record in C<config.yml> and don't need any
719     post-processing or want to preserve order of original subfields.
720    
721 dpavlin 616
722 dpavlin 604 =cut
723    
724     sub marc_original_order {
725    
726 dpavlin 616 my ($to, $from) = @_;
727 dpavlin 604 die "marc_original_order needs from and to fields\n" unless ($from && $to);
728    
729 dpavlin 616 return unless defined($rec->{$from});
730    
731     my $r = $rec->{$from};
732 dpavlin 604 die "record field $from isn't array\n" unless (ref($r) eq 'ARRAY');
733    
734     my ($i1,$i2) = defined($marc_indicators->{$to}) ? @{ $marc_indicators->{$to} } : (' ',' ');
735 dpavlin 616 warn "## marc_original_order($to,$from) source = ", dump( $r ),$/ if ($debug > 1);
736 dpavlin 604
737     foreach my $d (@$r) {
738    
739 dpavlin 605 if (! defined($d->{subfields}) && ref($d->{subfields}) ne 'ARRAY') {
740 dpavlin 616 warn "# marc_original_order($to,$from): field $from doesn't have subfields specification\n";
741 dpavlin 605 next;
742     }
743    
744 dpavlin 604 my @sfs = @{ $d->{subfields} };
745    
746     die "field $from doesn't have even number of subfields specifications\n" unless($#sfs % 2 == 1);
747    
748 dpavlin 618 warn "#--> d: ",dump($d), "\n#--> sfs: ",dump(@sfs),$/ if ($debug > 2);
749 dpavlin 604
750     my $m = [ $to, $i1, $i2 ];
751    
752     while (my $sf = shift @sfs) {
753 dpavlin 618
754     warn "#--> sf: ",dump($sf), $/ if ($debug > 2);
755 dpavlin 604 my $offset = shift @sfs;
756     die "corrupted sufields specification for field $from\n" unless defined($offset);
757    
758     my $v;
759     if (ref($d->{$sf}) eq 'ARRAY') {
760     $v = $d->{$sf}->[$offset] if (defined($d->{$sf}->[$offset]));
761     } elsif ($offset == 0) {
762     $v = $d->{$sf};
763     } else {
764     die "field $from subfield '$sf' need occurence $offset which doesn't exist", dump($d->{$sf});
765     }
766     push @$m, ( $sf, $v ) if (defined($v));
767     }
768    
769     if ($#{$m} > 2) {
770     push @{ $marc_record->[ $marc_record_offset ] }, $m;
771     }
772     }
773    
774     warn "## marc_record = ", dump( $marc_record ),$/ if ($debug > 1);
775     }
776    
777    
778 dpavlin 540 =head1 Functions to extract data from input
779    
780     This function should be used inside functions to create C<data_structure> described
781     above.
782    
783 dpavlin 641 =head2 _pack_subfields_hash
784    
785 dpavlin 669 @subfields = _pack_subfields_hash( $h );
786     $subfields = _pack_subfields_hash( $h, 1 );
787 dpavlin 641
788 dpavlin 669 Return each subfield value in array or pack them all together and return scalar
789     with subfields (denoted by C<^>) and values.
790    
791 dpavlin 641 =cut
792    
793     sub _pack_subfields_hash {
794    
795 dpavlin 642 warn "## _pack_subfields_hash( ",dump(@_), " )\n" if ($debug > 1);
796 dpavlin 641
797     my ($h,$include_subfields) = @_;
798    
799     if ( defined($h->{subfields}) ) {
800     my $sfs = delete $h->{subfields} || die "no subfields?";
801     my @out;
802     while (@$sfs) {
803     my $sf = shift @$sfs;
804     push @out, '^' . $sf if ($include_subfields);
805     my $o = shift @$sfs;
806     if ($o == 0 && ref( $h->{$sf} ) ne 'ARRAY' ) {
807     # single element subfields are not arrays
808 dpavlin 669 #warn "====> $sf $o / $#$sfs ", dump( $sfs, $h->{$sf} ), "\n";
809 dpavlin 667
810 dpavlin 641 push @out, $h->{$sf};
811     } else {
812 dpavlin 669 #warn "====> $sf $o / $#$sfs ", dump( $sfs, $h->{$sf} ), "\n";
813 dpavlin 641 push @out, $h->{$sf}->[$o];
814     }
815     }
816 dpavlin 667 if ($include_subfields) {
817     return join('', @out);
818     } else {
819     return @out;
820     }
821 dpavlin 641 } else {
822 dpavlin 667 if ($include_subfields) {
823     my $out = '';
824 dpavlin 668 foreach my $sf (sort keys %$h) {
825 dpavlin 667 if (ref($h->{$sf}) eq 'ARRAY') {
826     $out .= '^' . $sf . join('^' . $sf, @{ $h->{$sf} });
827     } else {
828     $out .= '^' . $sf . $h->{$sf};
829     }
830     }
831     return $out;
832     } else {
833     # FIXME this should probably be in alphabetical order instead of hash order
834     values %{$h};
835     }
836 dpavlin 641 }
837     }
838    
839 dpavlin 536 =head2 rec1
840 dpavlin 371
841 dpavlin 536 Return all values in some field
842 dpavlin 371
843 dpavlin 536 @v = rec1('200')
844 dpavlin 15
845 dpavlin 536 TODO: order of values is probably same as in source data, need to investigate that
846 dpavlin 15
847 dpavlin 536 =cut
848 dpavlin 15
849 dpavlin 536 sub rec1 {
850     my $f = shift;
851 dpavlin 571 warn "rec1($f) = ", dump( $rec->{$f} ), $/ if ($debug > 1);
852 dpavlin 536 return unless (defined($rec) && defined($rec->{$f}));
853 dpavlin 571 warn "rec1($f) = ", dump( $rec->{$f} ), $/ if ($debug > 1);
854 dpavlin 536 if (ref($rec->{$f}) eq 'ARRAY') {
855 dpavlin 641 my @out;
856     foreach my $h ( @{ $rec->{$f} } ) {
857     if (ref($h) eq 'HASH') {
858     push @out, ( _pack_subfields_hash( $h ) );
859 dpavlin 31 } else {
860 dpavlin 641 push @out, $h;
861 dpavlin 31 }
862 dpavlin 641 }
863     return @out;
864 dpavlin 536 } elsif( defined($rec->{$f}) ) {
865     return $rec->{$f};
866 dpavlin 15 }
867     }
868    
869 dpavlin 536 =head2 rec2
870 dpavlin 15
871 dpavlin 536 Return all values in specific field and subfield
872 dpavlin 13
873 dpavlin 536 @v = rec2('200','a')
874 dpavlin 13
875     =cut
876    
877 dpavlin 536 sub rec2 {
878     my $f = shift;
879     return unless (defined($rec && $rec->{$f}));
880     my $sf = shift;
881 dpavlin 601 warn "rec2($f,$sf) = ", dump( $rec->{$f} ), $/ if ($debug > 1);
882 dpavlin 589 return map {
883     if (ref($_->{$sf}) eq 'ARRAY') {
884     @{ $_->{$sf} };
885     } else {
886     $_->{$sf};
887     }
888     } grep { ref($_) eq 'HASH' && $_->{$sf} } @{ $rec->{$f} };
889 dpavlin 536 }
890 dpavlin 13
891 dpavlin 536 =head2 rec
892 dpavlin 13
893 dpavlin 536 syntaxtic sugar for
894 dpavlin 13
895 dpavlin 536 @v = rec('200')
896     @v = rec('200','a')
897 dpavlin 13
898 dpavlin 750 If rec() returns just single value, it will
899     return scalar, not array.
900    
901 dpavlin 536 =cut
902 dpavlin 373
903 dpavlin 536 sub rec {
904 dpavlin 583 my @out;
905 dpavlin 536 if ($#_ == 0) {
906 dpavlin 583 @out = rec1(@_);
907 dpavlin 536 } elsif ($#_ == 1) {
908 dpavlin 583 @out = rec2(@_);
909 dpavlin 13 }
910 dpavlin 750 if ($#out == 0 && ! wantarray) {
911     return $out[0];
912     } elsif (@out) {
913 dpavlin 583 return @out;
914     } else {
915     return '';
916     }
917 dpavlin 13 }
918    
919 dpavlin 536 =head2 regex
920 dpavlin 15
921 dpavlin 536 Apply regex to some or all values
922 dpavlin 15
923 dpavlin 536 @v = regex( 's/foo/bar/g', @v );
924 dpavlin 15
925     =cut
926    
927 dpavlin 536 sub regex {
928     my $r = shift;
929     my @out;
930 dpavlin 550 #warn "r: $r\n", dump(\@_);
931 dpavlin 536 foreach my $t (@_) {
932     next unless ($t);
933     eval "\$t =~ $r";
934     push @out, $t if ($t && $t ne '');
935 dpavlin 368 }
936 dpavlin 536 return @out;
937 dpavlin 15 }
938    
939 dpavlin 536 =head2 prefix
940 dpavlin 15
941 dpavlin 536 Prefix all values with a string
942 dpavlin 15
943 dpavlin 536 @v = prefix( 'my_', @v );
944 dpavlin 15
945     =cut
946    
947 dpavlin 536 sub prefix {
948 dpavlin 592 my $p = shift or return;
949 dpavlin 536 return map { $p . $_ } grep { defined($_) } @_;
950     }
951 dpavlin 15
952 dpavlin 536 =head2 suffix
953 dpavlin 15
954 dpavlin 536 suffix all values with a string
955 dpavlin 15
956 dpavlin 536 @v = suffix( '_my', @v );
957 dpavlin 15
958 dpavlin 536 =cut
959 dpavlin 15
960 dpavlin 536 sub suffix {
961     my $s = shift or die "suffix needs string as first argument";
962     return map { $_ . $s } grep { defined($_) } @_;
963 dpavlin 15 }
964    
965 dpavlin 536 =head2 surround
966 dpavlin 13
967 dpavlin 536 surround all values with a two strings
968 dpavlin 13
969 dpavlin 536 @v = surround( 'prefix_', '_suffix', @v );
970 dpavlin 13
971     =cut
972    
973 dpavlin 536 sub surround {
974     my $p = shift or die "surround need prefix as first argument";
975     my $s = shift or die "surround needs suffix as second argument";
976     return map { $p . $_ . $s } grep { defined($_) } @_;
977 dpavlin 13 }
978    
979 dpavlin 536 =head2 first
980 dpavlin 13
981 dpavlin 536 Return first element
982 dpavlin 15
983 dpavlin 536 $v = first( @v );
984 dpavlin 13
985     =cut
986    
987 dpavlin 536 sub first {
988     my $r = shift;
989     return $r;
990 dpavlin 13 }
991    
992 dpavlin 536 =head2 lookup
993 dpavlin 13
994 dpavlin 536 Consult lookup hashes for some value
995 dpavlin 13
996 dpavlin 725 @v = lookup(
997     sub {
998     'ffkk/peri/mfn'.rec('000')
999     },
1000     'ffkk','peri','200-a-200-e',
1001     sub {
1002     first(rec(200,'a')).' '.first(rec('200','e'))
1003     }
1004     );
1005 dpavlin 13
1006 dpavlin 725 Code like above will be B<automatically generated> using L<WebPAC::Parse> from
1007     normal lookup definition in C<conf/lookup/something.pl> which looks like:
1008 dpavlin 707
1009 dpavlin 725 lookup(
1010     # which results to return from record recorded in lookup
1011     sub { 'ffkk/peri/mfn' . rec('000') },
1012     # from which database and input
1013     'ffkk','peri',
1014     # such that following values match
1015     sub { first(rec(200,'a')) . ' ' . first(rec('200','e')) },
1016     # if this part is missing, we will try to match same fields
1017     # from lookup record and current one, or you can override
1018     # which records to use from current record using
1019     sub { rec('900','x') . ' ' . rec('900','y') },
1020     )
1021    
1022     You can think about this lookup as SQL (if that helps):
1023    
1024     select
1025     sub { what }
1026     from
1027     database, input
1028     where
1029     sub { filter from lookuped record }
1030     having
1031     sub { optional filter on current record }
1032    
1033     Easy as pie, right?
1034    
1035 dpavlin 13 =cut
1036    
1037 dpavlin 536 sub lookup {
1038 dpavlin 725 my ($what, $database, $input, $key, $having) = @_;
1039    
1040 dpavlin 766 confess "lookup needs 5 arguments: what, database, input, key, having\n" unless ($#_ == 4);
1041 dpavlin 725
1042 dpavlin 729 warn "## lookup ($database, $input, $key)", $/ if ($debug > 1);
1043 dpavlin 725 return unless (defined($lookup->{$database}->{$input}->{$key}));
1044    
1045 dpavlin 736 confess "lookup really need load_row_coderef added to data_structure\n" unless ($load_row_coderef);
1046 dpavlin 725
1047     my $mfns;
1048     my @having = $having->();
1049    
1050 dpavlin 729 warn "## having = ", dump( @having ) if ($debug > 2);
1051 dpavlin 725
1052     foreach my $h ( @having ) {
1053     if (defined($lookup->{$database}->{$input}->{$key}->{$h})) {
1054 dpavlin 752 warn "lookup for $database/$input/$key/$h return ",dump($lookup->{$database}->{$input}->{$key}->{$h}),"\n" if ($debug);
1055 dpavlin 725 $mfns->{$_}++ foreach keys %{ $lookup->{$database}->{$input}->{$key}->{$h} };
1056     }
1057 dpavlin 536 }
1058 dpavlin 725
1059     return unless ($mfns);
1060    
1061     my @mfns = sort keys %$mfns;
1062    
1063 dpavlin 750 warn "# lookup loading $database/$input/$key mfn ", join(",",@mfns)," having ",dump(@having),"\n" if ($debug);
1064 dpavlin 725
1065     my $old_rec = $rec;
1066     my @out;
1067    
1068     foreach my $mfn (@mfns) {
1069 dpavlin 736 $rec = $load_row_coderef->( $database, $input, $mfn );
1070 dpavlin 725
1071 dpavlin 752 warn "got $database/$input/$mfn = ", dump($rec), $/ if ($debug);
1072 dpavlin 725
1073     my @vals = $what->();
1074    
1075     push @out, ( @vals );
1076    
1077 dpavlin 752 warn "lookup for mfn $mfn returned ", dump(@vals), $/ if ($debug);
1078 dpavlin 725 }
1079    
1080     # if (ref($lookup->{$k}) eq 'ARRAY') {
1081     # return @{ $lookup->{$k} };
1082     # } else {
1083     # return $lookup->{$k};
1084     # }
1085    
1086     $rec = $old_rec;
1087    
1088 dpavlin 750 warn "## lookup returns = ", dump(@out), $/ if ($debug);
1089 dpavlin 725
1090 dpavlin 740 if ($#out == 0) {
1091     return $out[0];
1092     } else {
1093     return @out;
1094     }
1095 dpavlin 13 }
1096    
1097 dpavlin 707 =head2 save_into_lookup
1098    
1099 dpavlin 725 Save value into lookup. It associates current database, input
1100     and specific keys with one or more values which will be
1101     associated over MFN.
1102 dpavlin 707
1103 dpavlin 725 MFN will be extracted from first occurence current of field 000
1104     in current record, or if it doesn't exist from L<_set_config> C<_mfn>.
1105    
1106     my $nr = save_into_lookup($database,$input,$key,sub {
1107 dpavlin 707 # code which produce one or more values
1108     });
1109    
1110 dpavlin 725 It returns number of items saved.
1111 dpavlin 707
1112 dpavlin 725 This function shouldn't be called directly, it's called from code created by
1113     L<WebPAC::Parser>.
1114    
1115 dpavlin 707 =cut
1116    
1117     sub save_into_lookup {
1118 dpavlin 712 my ($database,$input,$key,$coderef) = @_;
1119     die "save_into_lookup needs database" unless defined($database);
1120     die "save_into_lookup needs input" unless defined($input);
1121     die "save_into_lookup needs key" unless defined($key);
1122 dpavlin 707 die "save_into_lookup needs CODE" unless ( defined($coderef) && ref($coderef) eq 'CODE' );
1123 dpavlin 725
1124 dpavlin 729 warn "## save_into_lookup rec = ", dump($rec), " config = ", dump($config), $/ if ($debug > 2);
1125 dpavlin 725
1126     my $mfn =
1127     defined($rec->{'000'}->[0]) ? $rec->{'000'}->[0] :
1128     defined($config->{_mfn}) ? $config->{_mfn} :
1129     die "mfn not defined or zero";
1130    
1131     my $nr = 0;
1132    
1133 dpavlin 707 foreach my $v ( $coderef->() ) {
1134 dpavlin 712 $lookup->{$database}->{$input}->{$key}->{$v}->{$mfn}++;
1135 dpavlin 721 warn "# saved lookup $database/$input/$key [$v] $mfn\n" if ($debug > 1);
1136 dpavlin 725 $nr++;
1137 dpavlin 707 }
1138 dpavlin 725
1139     return $nr;
1140 dpavlin 707 }
1141    
1142 dpavlin 595 =head2 config
1143    
1144     Consult config values stored in C<config.yml>
1145    
1146     # return database code (key under databases in yaml)
1147     $database_code = config(); # use _ from hash
1148     $database_name = config('name');
1149     $database_input_name = config('input name');
1150     $tag = config('input normalize tag');
1151    
1152     Up to three levels are supported.
1153    
1154     =cut
1155    
1156     sub config {
1157     return unless ($config);
1158    
1159     my $p = shift;
1160    
1161     $p ||= '';
1162    
1163     my $v;
1164    
1165     warn "### getting config($p)\n" if ($debug > 1);
1166    
1167     my @p = split(/\s+/,$p);
1168     if ($#p < 0) {
1169     $v = $config->{ '_' }; # special, database code
1170     } else {
1171    
1172     my $c = dclone( $config );
1173    
1174     foreach my $k (@p) {
1175     warn "### k: $k c = ",dump($c),$/ if ($debug > 1);
1176     if (ref($c) eq 'ARRAY') {
1177     $c = shift @$c;
1178     warn "config($p) taking first occurence of '$k', probably not what you wanted!\n";
1179     last;
1180     }
1181    
1182     if (! defined($c->{$k}) ) {
1183     $c = undef;
1184     last;
1185     } else {
1186     $c = $c->{$k};
1187     }
1188     }
1189     $v = $c if ($c);
1190    
1191     }
1192    
1193     warn "## config( '$p' ) = ",dump( $v ),$/ if ($v && $debug);
1194     warn "config( '$p' ) is empty\n" if (! $v);
1195    
1196     return $v;
1197     }
1198    
1199     =head2 id
1200    
1201     Returns unique id of this record
1202    
1203     $id = id();
1204    
1205     Returns C<42/2> for 2nd occurence of MFN 42.
1206    
1207     =cut
1208    
1209     sub id {
1210     my $mfn = $config->{_mfn} || die "no _mfn in config data";
1211     return $mfn . $#{$marc_record} ? $#{$marc_record} + 1 : '';
1212     }
1213    
1214 dpavlin 536 =head2 join_with
1215 dpavlin 13
1216 dpavlin 536 Joins walues with some delimiter
1217 dpavlin 10
1218 dpavlin 536 $v = join_with(", ", @v);
1219 dpavlin 10
1220 dpavlin 536 =cut
1221 dpavlin 10
1222 dpavlin 536 sub join_with {
1223     my $d = shift;
1224 dpavlin 586 warn "### join_with('$d',",dump(@_),")\n" if ($debug > 2);
1225 dpavlin 583 my $v = join($d, grep { defined($_) && $_ ne '' } @_);
1226     return '' unless defined($v);
1227     return $v;
1228 dpavlin 536 }
1229 dpavlin 10
1230 dpavlin 562 =head2 split_rec_on
1231    
1232     Split record subfield on some regex and take one of parts out
1233    
1234     $a_before_semi_column =
1235     split_rec_on('200','a', /\s*;\s*/, $part);
1236    
1237     C<$part> is optional number of element. First element is
1238     B<1>, not 0!
1239    
1240     If there is no C<$part> parameter or C<$part> is 0, this function will
1241     return all values produced by splitting.
1242    
1243     =cut
1244    
1245     sub split_rec_on {
1246     die "split_rec_on need (fld,sf,regex[,part]" if ($#_ < 2);
1247    
1248     my ($fld, $sf, $regex, $part) = @_;
1249 dpavlin 568 warn "### regex ", ref($regex), $regex, $/ if ($debug > 2);
1250 dpavlin 562
1251     my @r = rec( $fld, $sf );
1252     my $v = shift @r;
1253 dpavlin 568 warn "### first rec($fld,$sf) = ",dump($v),$/ if ($debug > 2);
1254 dpavlin 562
1255 dpavlin 604 return '' if ( ! defined($v) || $v =~ /^\s*$/);
1256 dpavlin 566
1257 dpavlin 562 my @s = split( $regex, $v );
1258 dpavlin 568 warn "## split_rec_on($fld,$sf,$regex,$part) = ",dump(@s),$/ if ($debug > 1);
1259 dpavlin 566 if ($part && $part > 0) {
1260 dpavlin 562 return $s[ $part - 1 ];
1261     } else {
1262 dpavlin 571 return @s;
1263 dpavlin 562 }
1264     }
1265    
1266 dpavlin 785 my $hash;
1267    
1268     =head2 set
1269    
1270     set( key => 'value' );
1271    
1272     =cut
1273    
1274     sub set {
1275     my ($k,$v) = @_;
1276 dpavlin 810 warn "## set ( $k => ", dump($v), " )", $/ if ( $debug );
1277 dpavlin 785 $hash->{$k} = $v;
1278     };
1279    
1280     =head2 get
1281    
1282     get( 'key' );
1283    
1284     =cut
1285    
1286     sub get {
1287     my $k = shift || return;
1288     my $v = $hash->{$k};
1289 dpavlin 810 warn "## get $k = ", dump( $v ), $/ if ( $debug );
1290 dpavlin 785 return $v;
1291     }
1292    
1293 dpavlin 791 =head2 count
1294 dpavlin 785
1295 dpavlin 791 if ( count( @result ) == 1 ) {
1296     # do something if only 1 result is there
1297     }
1298    
1299     =cut
1300    
1301     sub count {
1302 dpavlin 810 warn "## count ",dump(@_),$/ if ( $debug );
1303 dpavlin 791 return @_ . '';
1304     }
1305    
1306 dpavlin 536 # END
1307     1;

  ViewVC Help
Powered by ViewVC 1.1.26