/[webpac2]/trunk/lib/WebPAC/Normalize.pm
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Annotation of /trunk/lib/WebPAC/Normalize.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 766 - (hide annotations)
Tue Oct 31 13:19:47 2006 UTC (17 years, 5 months ago) by dpavlin
File size: 27685 byte(s)
 r1113@llin:  dpavlin | 2006-10-31 14:17:02 +0100
 cleanup normalization error message for lookups and modify tests so they don't fail anymore

1 dpavlin 10 package WebPAC::Normalize;
2 dpavlin 536 use Exporter 'import';
3     @EXPORT = qw/
4 dpavlin 538 _set_rec _set_lookup
5 dpavlin 736 _set_load_row
6 dpavlin 538 _get_ds _clean_ds
7 dpavlin 554 _debug
8 dpavlin 641 _pack_subfields_hash
9 dpavlin 538
10 dpavlin 536 tag search display
11 dpavlin 547 marc marc_indicators marc_repeatable_subfield
12 dpavlin 564 marc_compose marc_leader
13 dpavlin 574 marc_duplicate marc_remove
14 dpavlin 604 marc_original_order
15 dpavlin 540
16 dpavlin 536 rec1 rec2 rec
17     regex prefix suffix surround
18     first lookup join_with
19 dpavlin 707 save_into_lookup
20 dpavlin 562
21     split_rec_on
22 dpavlin 536 /;
23 dpavlin 10
24     use warnings;
25     use strict;
26 dpavlin 536
27     #use base qw/WebPAC::Common/;
28 dpavlin 550 use Data::Dump qw/dump/;
29 dpavlin 574 use Storable qw/dclone/;
30 dpavlin 725 use Carp qw/confess/;
31 dpavlin 10
32 dpavlin 550 # debugging warn(s)
33     my $debug = 0;
34    
35    
36 dpavlin 10 =head1 NAME
37    
38 dpavlin 536 WebPAC::Normalize - describe normalisaton rules using sets
39 dpavlin 10
40     =head1 VERSION
41    
42 dpavlin 750 Version 0.23
43 dpavlin 10
44     =cut
45    
46 dpavlin 750 our $VERSION = '0.23';
47 dpavlin 10
48     =head1 SYNOPSIS
49    
50 dpavlin 536 This module uses C<conf/normalize/*.pl> files to perform normalisation
51     from input records using perl functions which are specialized for set
52     processing.
53 dpavlin 10
54 dpavlin 536 Sets are implemented as arrays, and normalisation file is valid perl, which
55     means that you check it's validity before running WebPAC using
56     C<perl -c normalize.pl>.
57 dpavlin 15
58 dpavlin 536 Normalisation can generate multiple output normalized data. For now, supported output
59 dpavlin 540 types (on the left side of definition) are: C<tag>, C<display>, C<search> and
60 dpavlin 547 C<marc>.
61 dpavlin 15
62 dpavlin 10 =head1 FUNCTIONS
63    
64 dpavlin 538 Functions which start with C<_> are private and used by WebPAC internally.
65     All other functions are available for use within normalisation rules.
66    
67 dpavlin 536 =head2 data_structure
68 dpavlin 10
69 dpavlin 536 Return data structure
70 dpavlin 13
71 dpavlin 538 my $ds = WebPAC::Normalize::data_structure(
72 dpavlin 725 lookup => $lookup_hash,
73 dpavlin 536 row => $row,
74     rules => $normalize_pl_config,
75 dpavlin 541 marc_encoding => 'utf-8',
76 dpavlin 595 config => $config,
77 dpavlin 736 load_row_coderef => sub {
78 dpavlin 725 my ($database,$input,$mfn) = shift;
79 dpavlin 736 $store->load_row( database => $database, input => $input, id => $mfn );
80 dpavlin 725 },
81 dpavlin 13 );
82    
83 dpavlin 707 Options C<row>, C<rules> and C<log> are mandatory while all
84 dpavlin 540 other are optional.
85    
86 dpavlin 736 C<load_row_coderef> is closure only used when executing lookups, so they will
87 dpavlin 725 die if it's not defined.
88    
89 dpavlin 536 This function will B<die> if normalizastion can't be evaled.
90 dpavlin 15
91 dpavlin 538 Since this function isn't exported you have to call it with
92     C<WebPAC::Normalize::data_structure>.
93    
94 dpavlin 10 =cut
95    
96 dpavlin 736 my $load_row_coderef;
97 dpavlin 725
98 dpavlin 536 sub data_structure {
99     my $arg = {@_};
100 dpavlin 13
101 dpavlin 536 die "need row argument" unless ($arg->{row});
102     die "need normalisation argument" unless ($arg->{rules});
103 dpavlin 31
104 dpavlin 536 no strict 'subs';
105 dpavlin 730 _set_lookup( $arg->{lookup} ) if defined($arg->{lookup});
106 dpavlin 538 _set_rec( $arg->{row} );
107 dpavlin 730 _set_config( $arg->{config} ) if defined($arg->{config});
108 dpavlin 541 _clean_ds( %{ $arg } );
109 dpavlin 736 $load_row_coderef = $arg->{load_row_coderef};
110 dpavlin 725
111 dpavlin 536 eval "$arg->{rules}";
112     die "error evaling $arg->{rules}: $@\n" if ($@);
113 dpavlin 540
114 dpavlin 538 return _get_ds();
115 dpavlin 10 }
116    
117 dpavlin 538 =head2 _set_rec
118 dpavlin 13
119 dpavlin 536 Set current record hash
120 dpavlin 433
121 dpavlin 538 _set_rec( $rec );
122 dpavlin 433
123     =cut
124    
125 dpavlin 536 my $rec;
126 dpavlin 433
127 dpavlin 538 sub _set_rec {
128 dpavlin 536 $rec = shift or die "no record hash";
129 dpavlin 433 }
130    
131 dpavlin 595 =head2 _set_config
132    
133     Set current config hash
134    
135     _set_config( $config );
136    
137     Magic keys are:
138    
139     =over 4
140    
141     =item _
142    
143     Code of current database
144    
145     =item _mfn
146    
147     Current MFN
148    
149     =back
150    
151     =cut
152    
153     my $config;
154    
155     sub _set_config {
156     $config = shift;
157     }
158    
159 dpavlin 538 =head2 _get_ds
160    
161     Return hash formatted as data structure
162    
163     my $ds = _get_ds();
164    
165     =cut
166    
167 dpavlin 741 my ($out, $marc_record, $marc_encoding, $marc_repeatable_subfield, $marc_indicators, $leader);
168 dpavlin 574 my ($marc_record_offset, $marc_fetch_offset) = (0, 0);
169 dpavlin 538
170     sub _get_ds {
171     return $out;
172     }
173    
174     =head2 _clean_ds
175    
176     Clean data structure hash for next record
177    
178     _clean_ds();
179    
180     =cut
181    
182     sub _clean_ds {
183 dpavlin 541 my $a = {@_};
184 dpavlin 741 ($out,$marc_record, $marc_encoding, $marc_repeatable_subfield, $marc_indicators, $leader) = ();
185 dpavlin 574 ($marc_record_offset, $marc_fetch_offset) = (0,0);
186 dpavlin 541 $marc_encoding = $a->{marc_encoding};
187 dpavlin 538 }
188    
189     =head2 _set_lookup
190    
191     Set current lookup hash
192    
193     _set_lookup( $lookup );
194    
195     =cut
196    
197     my $lookup;
198    
199     sub _set_lookup {
200     $lookup = shift;
201     }
202    
203 dpavlin 707 =head2 _get_lookup
204    
205     Get current lookup hash
206    
207     my $lookup = _get_lookup();
208    
209     =cut
210    
211     sub _get_lookup {
212     return $lookup;
213     }
214    
215 dpavlin 736 =head2 _set_load_row
216 dpavlin 725
217     Setup code reference which will return L<data_structure> from
218     L<WebPAC::Store>
219    
220 dpavlin 736 _set_load_row(sub {
221 dpavlin 725 my ($database,$input,$mfn) = @_;
222 dpavlin 736 $store->load_row( database => $database, input => $input, id => $mfn );
223 dpavlin 725 });
224    
225     =cut
226    
227 dpavlin 736 sub _set_load_row {
228 dpavlin 725 my $coderef = shift;
229     confess "argument isn't CODE" unless ref($coderef) eq 'CODE';
230    
231 dpavlin 736 $load_row_coderef = $coderef;
232 dpavlin 725 }
233    
234 dpavlin 547 =head2 _get_marc_fields
235 dpavlin 540
236 dpavlin 547 Get all fields defined by calls to C<marc>
237 dpavlin 540
238 dpavlin 547 $marc->add_fields( WebPAC::Normalize:_get_marc_fields() );
239 dpavlin 540
240 dpavlin 543 We are using I<magic> which detect repeatable fields only from
241     sequence of field/subfield data generated by normalization.
242    
243 dpavlin 554 Repeatable field is created when there is second occurence of same subfield or
244     if any of indicators are different.
245 dpavlin 543
246 dpavlin 554 This is sane for most cases. Something like:
247 dpavlin 543
248 dpavlin 554 900a-1 900b-1 900c-1
249     900a-2 900b-2
250     900a-3
251    
252     will be created from any combination of:
253    
254     900a-1 900a-2 900a-3 900b-1 900b-2 900c-1
255    
256     and following rules:
257    
258     marc('900','a', rec('200','a') );
259     marc('900','b', rec('200','b') );
260     marc('900','c', rec('200','c') );
261    
262     which might not be what you have in mind. If you need repeatable subfield,
263     define it using C<marc_repeatable_subfield> like this:
264    
265 dpavlin 574 marc_repeatable_subfield('900','a');
266     marc('900','a', rec('200','a') );
267     marc('900','b', rec('200','b') );
268     marc('900','c', rec('200','c') );
269 dpavlin 554
270 dpavlin 574 will create:
271    
272     900a-1 900a-2 900a-3 900b-1 900c-1
273     900b-2
274    
275     There is also support for returning next or specific using:
276    
277     while (my $mf = WebPAC::Normalize:_get_marc_fields( fetch_next => 1 ) ) {
278     # do something with $mf
279     }
280    
281     will always return fields from next MARC record or
282    
283     my $mf = WebPAC::Normalize::_get_marc_fields( offset => 42 );
284    
285     will return 42th copy record (if it exists).
286    
287 dpavlin 540 =cut
288    
289 dpavlin 547 sub _get_marc_fields {
290 dpavlin 550
291 dpavlin 574 my $arg = {@_};
292     warn "### _get_marc_fields arg: ", dump($arg), $/ if ($debug > 2);
293     my $offset = $marc_fetch_offset;
294     if ($arg->{offset}) {
295     $offset = $arg->{offset};
296     } elsif($arg->{fetch_next}) {
297     $marc_fetch_offset++;
298     }
299 dpavlin 550
300 dpavlin 574 return if (! $marc_record || ref($marc_record) ne 'ARRAY');
301    
302     warn "### full marc_record = ", dump( @{ $marc_record }), $/ if ($debug > 2);
303    
304     my $marc_rec = $marc_record->[ $offset ];
305    
306     warn "## _get_marc_fields (at offset: $offset) -- marc_record = ", dump( @$marc_rec ), $/ if ($debug > 1);
307    
308     return if (! $marc_rec || ref($marc_rec) ne 'ARRAY' || $#{ $marc_rec } < 0);
309    
310 dpavlin 550 # first, sort all existing fields
311     # XXX might not be needed, but modern perl might randomize elements in hash
312     my @sorted_marc_record = sort {
313 dpavlin 572 $a->[0] . ( $a->[3] || '' ) cmp $b->[0] . ( $b->[3] || '')
314 dpavlin 574 } @{ $marc_rec };
315 dpavlin 550
316 dpavlin 574 @sorted_marc_record = @{ $marc_rec }; ### FIXME disable sorting
317 dpavlin 562
318 dpavlin 550 # output marc fields
319 dpavlin 542 my @m;
320 dpavlin 550
321     # count unique field-subfields (used for offset when walking to next subfield)
322     my $u;
323 dpavlin 572 map { $u->{ $_->[0] . ( $_->[3] || '') }++ } @sorted_marc_record;
324 dpavlin 550
325     if ($debug) {
326 dpavlin 574 warn "## marc_repeatable_subfield = ", dump( $marc_repeatable_subfield ), $/ if ( $marc_repeatable_subfield );
327     warn "## marc_record[$offset] = ", dump( $marc_rec ), $/;
328     warn "## sorted_marc_record = ", dump( \@sorted_marc_record ), $/;
329     warn "## subfield count = ", dump( $u ), $/;
330 dpavlin 550 }
331    
332     my $len = $#sorted_marc_record;
333     my $visited;
334     my $i = 0;
335     my $field;
336    
337     foreach ( 0 .. $len ) {
338    
339     # find next element which isn't visited
340     while ($visited->{$i}) {
341     $i = ($i + 1) % ($len + 1);
342 dpavlin 542 }
343    
344 dpavlin 550 # mark it visited
345     $visited->{$i}++;
346    
347 dpavlin 574 my $row = dclone( $sorted_marc_record[$i] );
348 dpavlin 550
349     # field and subfield which is key for
350     # marc_repeatable_subfield and u
351 dpavlin 572 my $fsf = $row->[0] . ( $row->[3] || '' );
352 dpavlin 550
353     if ($debug > 1) {
354    
355     print "### field so far [", $#$field, "] : ", dump( $field ), " ", $field ? 'T' : 'F', $/;
356     print "### this [$i]: ", dump( $row ),$/;
357     print "### sf: ", $row->[3], " vs ", $field->[3],
358     $marc_repeatable_subfield->{ $row->[0] . $row->[3] } ? ' (repeatable)' : '', $/,
359     if ($#$field >= 0);
360    
361     }
362    
363     # if field exists
364     if ( $#$field >= 0 ) {
365     if (
366     $row->[0] ne $field->[0] || # field
367     $row->[1] ne $field->[1] || # i1
368     $row->[2] ne $field->[2] # i2
369     ) {
370     push @m, $field;
371     warn "## saved/1 ", dump( $field ),$/ if ($debug);
372     $field = $row;
373    
374     } elsif (
375     ( $row->[3] lt $field->[-2] ) # subfield which is not next (e.g. a after c)
376     ||
377     ( $row->[3] eq $field->[-2] && # same subfield, but not repeatable
378     ! $marc_repeatable_subfield->{ $fsf }
379     )
380     ) {
381     push @m, $field;
382     warn "## saved/2 ", dump( $field ),$/ if ($debug);
383     $field = $row;
384    
385     } else {
386     # append new subfields to existing field
387     push @$field, ( $row->[3], $row->[4] );
388     }
389     } else {
390     # insert first field
391     $field = $row;
392     }
393    
394     if (! $marc_repeatable_subfield->{ $fsf }) {
395     # make step to next subfield
396     $i = ($i + $u->{ $fsf } ) % ($len + 1);
397     }
398 dpavlin 542 }
399    
400 dpavlin 550 if ($#$field >= 0) {
401     push @m, $field;
402     warn "## saved/3 ", dump( $field ),$/ if ($debug);
403     }
404 dpavlin 542
405 dpavlin 579 return \@m;
406 dpavlin 540 }
407    
408 dpavlin 554 =head2 _debug
409    
410     Change level of debug warnings
411    
412     _debug( 2 );
413    
414     =cut
415    
416     sub _debug {
417     my $l = shift;
418     return $debug unless defined($l);
419 dpavlin 568 warn "debug level $l",$/ if ($l > 0);
420 dpavlin 554 $debug = $l;
421     }
422    
423 dpavlin 540 =head1 Functions to create C<data_structure>
424    
425     Those functions generally have to first in your normalization file.
426    
427 dpavlin 536 =head2 tag
428 dpavlin 433
429 dpavlin 536 Define new tag for I<search> and I<display>.
430 dpavlin 433
431 dpavlin 536 tag('Title', rec('200','a') );
432 dpavlin 13
433    
434     =cut
435    
436 dpavlin 536 sub tag {
437     my $name = shift or die "tag needs name as first argument";
438     my @o = grep { defined($_) && $_ ne '' } @_;
439     return unless (@o);
440     $out->{$name}->{tag} = $name;
441     $out->{$name}->{search} = \@o;
442     $out->{$name}->{display} = \@o;
443     }
444 dpavlin 13
445 dpavlin 536 =head2 display
446 dpavlin 13
447 dpavlin 536 Define tag just for I<display>
448 dpavlin 125
449 dpavlin 536 @v = display('Title', rec('200','a') );
450 dpavlin 125
451 dpavlin 536 =cut
452 dpavlin 125
453 dpavlin 536 sub display {
454     my $name = shift or die "display needs name as first argument";
455     my @o = grep { defined($_) && $_ ne '' } @_;
456     return unless (@o);
457     $out->{$name}->{tag} = $name;
458     $out->{$name}->{display} = \@o;
459     }
460 dpavlin 13
461 dpavlin 536 =head2 search
462 dpavlin 13
463 dpavlin 536 Prepare values just for I<search>
464 dpavlin 13
465 dpavlin 536 @v = search('Title', rec('200','a') );
466 dpavlin 433
467 dpavlin 536 =cut
468 dpavlin 13
469 dpavlin 536 sub search {
470     my $name = shift or die "search needs name as first argument";
471     my @o = grep { defined($_) && $_ ne '' } @_;
472     return unless (@o);
473     $out->{$name}->{tag} = $name;
474     $out->{$name}->{search} = \@o;
475 dpavlin 13 }
476    
477 dpavlin 564 =head2 marc_leader
478    
479     Setup fields within MARC leader or get leader
480    
481     marc_leader('05','c');
482     my $leader = marc_leader();
483    
484     =cut
485    
486     sub marc_leader {
487     my ($offset,$value) = @_;
488    
489     if ($offset) {
490 dpavlin 741 $leader->{ $offset } = $value;
491 dpavlin 564 } else {
492 dpavlin 741 return $leader;
493 dpavlin 564 }
494     }
495    
496 dpavlin 547 =head2 marc
497 dpavlin 540
498     Save value for MARC field
499    
500 dpavlin 547 marc('900','a', rec('200','a') );
501 dpavlin 571 marc('001', rec('000') );
502 dpavlin 540
503     =cut
504    
505 dpavlin 547 sub marc {
506     my $f = shift or die "marc needs field";
507     die "marc field must be numer" unless ($f =~ /^\d+$/);
508 dpavlin 540
509 dpavlin 571 my $sf;
510     if ($f >= 10) {
511     $sf = shift or die "marc needs subfield";
512     }
513 dpavlin 540
514 dpavlin 541 foreach (@_) {
515     my $v = $_; # make var read-write for Encode
516 dpavlin 543 next unless (defined($v) && $v !~ /^\s*$/);
517 dpavlin 548 my ($i1,$i2) = defined($marc_indicators->{$f}) ? @{ $marc_indicators->{$f} } : (' ',' ');
518 dpavlin 571 if (defined $sf) {
519 dpavlin 574 push @{ $marc_record->[ $marc_record_offset ] }, [ $f, $i1, $i2, $sf => $v ];
520 dpavlin 571 } else {
521 dpavlin 574 push @{ $marc_record->[ $marc_record_offset ] }, [ $f, $v ];
522 dpavlin 571 }
523 dpavlin 540 }
524     }
525    
526 dpavlin 547 =head2 marc_repeatable_subfield
527    
528     Save values for MARC repetable subfield
529    
530     marc_repeatable_subfield('910', 'z', rec('909') );
531    
532     =cut
533    
534     sub marc_repeatable_subfield {
535 dpavlin 550 my ($f,$sf) = @_;
536     die "marc_repeatable_subfield need field and subfield!\n" unless ($f && $sf);
537     $marc_repeatable_subfield->{ $f . $sf }++;
538 dpavlin 547 marc(@_);
539     }
540    
541     =head2 marc_indicators
542    
543     Set both indicators for MARC field
544    
545     marc_indicators('900', ' ', 1);
546    
547     Any indicator value other than C<0-9> will be treated as undefined.
548    
549     =cut
550    
551     sub marc_indicators {
552     my $f = shift || die "marc_indicators need field!\n";
553     my ($i1,$i2) = @_;
554     die "marc_indicators($f, ...) need i1!\n" unless(defined($i1));
555     die "marc_indicators($f, $i1, ...) need i2!\n" unless(defined($i2));
556    
557     $i1 = ' ' if ($i1 !~ /^\d$/);
558     $i2 = ' ' if ($i2 !~ /^\d$/);
559 dpavlin 548 @{ $marc_indicators->{$f} } = ($i1,$i2);
560 dpavlin 547 }
561    
562 dpavlin 562 =head2 marc_compose
563 dpavlin 547
564 dpavlin 562 Save values for each MARC subfield explicitly
565    
566     marc_compose('900',
567     'a', rec('200','a')
568     'b', rec('201','a')
569     'a', rec('200','b')
570     'c', rec('200','c')
571     );
572    
573 dpavlin 603 If you specify C<+> for subfield, value will be appended
574     to previous defined subfield.
575    
576 dpavlin 562 =cut
577    
578     sub marc_compose {
579     my $f = shift or die "marc_compose needs field";
580     die "marc_compose field must be numer" unless ($f =~ /^\d+$/);
581    
582     my ($i1,$i2) = defined($marc_indicators->{$f}) ? @{ $marc_indicators->{$f} } : (' ',' ');
583     my $m = [ $f, $i1, $i2 ];
584    
585 dpavlin 583 warn "### marc_compose input subfields = ", dump(@_),$/ if ($debug > 2);
586    
587 dpavlin 619 if ($#_ % 2 != 1) {
588     die "ERROR: marc_compose",dump($f,@_)," not valid (must be even).\nDo you need to add first() or join() around some argument?\n";
589     }
590    
591 dpavlin 562 while (@_) {
592 dpavlin 619 my $sf = shift;
593 dpavlin 565 my $v = shift;
594 dpavlin 562
595     next unless (defined($v) && $v !~ /^\s*$/);
596 dpavlin 568 warn "## ++ marc_compose($f,$sf,$v) ", dump( $m ),$/ if ($debug > 1);
597 dpavlin 603 if ($sf ne '+') {
598     push @$m, ( $sf, $v );
599     } else {
600     $m->[ $#$m ] .= $v;
601     }
602 dpavlin 562 }
603    
604 dpavlin 586 warn "## marc_compose current marc = ", dump( $m ),$/ if ($debug > 1);
605 dpavlin 562
606 dpavlin 574 push @{ $marc_record->[ $marc_record_offset ] }, $m if ($#{$m} > 2);
607 dpavlin 562 }
608    
609 dpavlin 574 =head2 marc_duplicate
610 dpavlin 562
611 dpavlin 574 Generate copy of current MARC record and continue working on copy
612    
613     marc_duplicate();
614    
615     Copies can be accessed using C<< _get_marc_fields( fetch_next => 1 ) >> or
616     C<< _get_marc_fields( offset => 42 ) >>.
617    
618     =cut
619    
620     sub marc_duplicate {
621     my $m = $marc_record->[ -1 ];
622     die "can't duplicate record which isn't defined" unless ($m);
623     push @{ $marc_record }, dclone( $m );
624     warn "## marc_duplicate = ", dump(@$marc_record), $/ if ($debug > 1);
625     $marc_record_offset = $#{ $marc_record };
626     warn "## marc_record_offset = $marc_record_offset", $/ if ($debug > 1);
627     }
628    
629     =head2 marc_remove
630    
631     Remove some field or subfield from MARC record.
632    
633     marc_remove('200');
634     marc_remove('200','a');
635    
636     This will erase field C<200> or C<200^a> from current MARC record.
637    
638     This is useful after calling C<marc_duplicate> or on it's own (but, you
639     should probably just remove that subfield definition if you are not
640     using C<marc_duplicate>).
641    
642     FIXME: support fields < 10.
643    
644     =cut
645    
646     sub marc_remove {
647     my ($f, $sf) = @_;
648    
649     die "marc_remove needs record number" unless defined($f);
650    
651     my $marc = $marc_record->[ $marc_record_offset ];
652    
653     warn "### marc_remove before = ", dump( $marc ), $/ if ($debug > 2);
654    
655 dpavlin 579 my $i = 0;
656     foreach ( 0 .. $#{ $marc } ) {
657 dpavlin 574 last unless (defined $marc->[$i]);
658     warn "#### working on ",dump( @{ $marc->[$i] }), $/ if ($debug > 3);
659     if ($marc->[$i]->[0] eq $f) {
660     if (! defined $sf) {
661     # remove whole field
662     splice @$marc, $i, 1;
663     warn "#### slice \@\$marc, $i, 1 = ",dump( @{ $marc }), $/ if ($debug > 3);
664     $i--;
665     } else {
666     foreach my $j ( 0 .. (( $#{ $marc->[$i] } - 3 ) / 2) ) {
667     my $o = ($j * 2) + 3;
668     if ($marc->[$i]->[$o] eq $sf) {
669     # remove subfield
670     splice @{$marc->[$i]}, $o, 2;
671     warn "#### slice \@{\$marc->[$i]}, $o, 2 = ", dump( @{ $marc }), $/ if ($debug > 3);
672     # is record now empty?
673     if ($#{ $marc->[$i] } == 2) {
674     splice @$marc, $i, 1;
675     warn "#### slice \@\$marc, $i, 1 = ", dump( @{ $marc }), $/ if ($debug > 3);
676     $i--;
677     };
678     }
679     }
680     }
681     }
682 dpavlin 579 $i++;
683 dpavlin 574 }
684    
685     warn "### marc_remove($f", $sf ? ",$sf" : "", ") after = ", dump( $marc ), $/ if ($debug > 2);
686    
687     $marc_record->[ $marc_record_offset ] = $marc;
688    
689     warn "## full marc_record = ", dump( @{ $marc_record }), $/ if ($debug > 1);
690     }
691    
692 dpavlin 604 =head2 marc_original_order
693    
694     Copy all subfields preserving original order to marc field.
695    
696 dpavlin 616 marc_original_order( marc_field_number, original_input_field_number );
697 dpavlin 604
698 dpavlin 616 Please note that field numbers are consistent with other commands (marc
699     field number first), but somewhat counter-intuitive (destination and then
700     source).
701    
702 dpavlin 604 You might want to use this command if you are just renaming subfields or
703     using pre-processing modify_record in C<config.yml> and don't need any
704     post-processing or want to preserve order of original subfields.
705    
706 dpavlin 616
707 dpavlin 604 =cut
708    
709     sub marc_original_order {
710    
711 dpavlin 616 my ($to, $from) = @_;
712 dpavlin 604 die "marc_original_order needs from and to fields\n" unless ($from && $to);
713    
714 dpavlin 616 return unless defined($rec->{$from});
715    
716     my $r = $rec->{$from};
717 dpavlin 604 die "record field $from isn't array\n" unless (ref($r) eq 'ARRAY');
718    
719     my ($i1,$i2) = defined($marc_indicators->{$to}) ? @{ $marc_indicators->{$to} } : (' ',' ');
720 dpavlin 616 warn "## marc_original_order($to,$from) source = ", dump( $r ),$/ if ($debug > 1);
721 dpavlin 604
722     foreach my $d (@$r) {
723    
724 dpavlin 605 if (! defined($d->{subfields}) && ref($d->{subfields}) ne 'ARRAY') {
725 dpavlin 616 warn "# marc_original_order($to,$from): field $from doesn't have subfields specification\n";
726 dpavlin 605 next;
727     }
728    
729 dpavlin 604 my @sfs = @{ $d->{subfields} };
730    
731     die "field $from doesn't have even number of subfields specifications\n" unless($#sfs % 2 == 1);
732    
733 dpavlin 618 warn "#--> d: ",dump($d), "\n#--> sfs: ",dump(@sfs),$/ if ($debug > 2);
734 dpavlin 604
735     my $m = [ $to, $i1, $i2 ];
736    
737     while (my $sf = shift @sfs) {
738 dpavlin 618
739     warn "#--> sf: ",dump($sf), $/ if ($debug > 2);
740 dpavlin 604 my $offset = shift @sfs;
741     die "corrupted sufields specification for field $from\n" unless defined($offset);
742    
743     my $v;
744     if (ref($d->{$sf}) eq 'ARRAY') {
745     $v = $d->{$sf}->[$offset] if (defined($d->{$sf}->[$offset]));
746     } elsif ($offset == 0) {
747     $v = $d->{$sf};
748     } else {
749     die "field $from subfield '$sf' need occurence $offset which doesn't exist", dump($d->{$sf});
750     }
751     push @$m, ( $sf, $v ) if (defined($v));
752     }
753    
754     if ($#{$m} > 2) {
755     push @{ $marc_record->[ $marc_record_offset ] }, $m;
756     }
757     }
758    
759     warn "## marc_record = ", dump( $marc_record ),$/ if ($debug > 1);
760     }
761    
762    
763 dpavlin 540 =head1 Functions to extract data from input
764    
765     This function should be used inside functions to create C<data_structure> described
766     above.
767    
768 dpavlin 641 =head2 _pack_subfields_hash
769    
770 dpavlin 669 @subfields = _pack_subfields_hash( $h );
771     $subfields = _pack_subfields_hash( $h, 1 );
772 dpavlin 641
773 dpavlin 669 Return each subfield value in array or pack them all together and return scalar
774     with subfields (denoted by C<^>) and values.
775    
776 dpavlin 641 =cut
777    
778     sub _pack_subfields_hash {
779    
780 dpavlin 642 warn "## _pack_subfields_hash( ",dump(@_), " )\n" if ($debug > 1);
781 dpavlin 641
782     my ($h,$include_subfields) = @_;
783    
784     if ( defined($h->{subfields}) ) {
785     my $sfs = delete $h->{subfields} || die "no subfields?";
786     my @out;
787     while (@$sfs) {
788     my $sf = shift @$sfs;
789     push @out, '^' . $sf if ($include_subfields);
790     my $o = shift @$sfs;
791     if ($o == 0 && ref( $h->{$sf} ) ne 'ARRAY' ) {
792     # single element subfields are not arrays
793 dpavlin 669 #warn "====> $sf $o / $#$sfs ", dump( $sfs, $h->{$sf} ), "\n";
794 dpavlin 667
795 dpavlin 641 push @out, $h->{$sf};
796     } else {
797 dpavlin 669 #warn "====> $sf $o / $#$sfs ", dump( $sfs, $h->{$sf} ), "\n";
798 dpavlin 641 push @out, $h->{$sf}->[$o];
799     }
800     }
801 dpavlin 667 if ($include_subfields) {
802     return join('', @out);
803     } else {
804     return @out;
805     }
806 dpavlin 641 } else {
807 dpavlin 667 if ($include_subfields) {
808     my $out = '';
809 dpavlin 668 foreach my $sf (sort keys %$h) {
810 dpavlin 667 if (ref($h->{$sf}) eq 'ARRAY') {
811     $out .= '^' . $sf . join('^' . $sf, @{ $h->{$sf} });
812     } else {
813     $out .= '^' . $sf . $h->{$sf};
814     }
815     }
816     return $out;
817     } else {
818     # FIXME this should probably be in alphabetical order instead of hash order
819     values %{$h};
820     }
821 dpavlin 641 }
822     }
823    
824 dpavlin 536 =head2 rec1
825 dpavlin 371
826 dpavlin 536 Return all values in some field
827 dpavlin 371
828 dpavlin 536 @v = rec1('200')
829 dpavlin 15
830 dpavlin 536 TODO: order of values is probably same as in source data, need to investigate that
831 dpavlin 15
832 dpavlin 536 =cut
833 dpavlin 15
834 dpavlin 536 sub rec1 {
835     my $f = shift;
836 dpavlin 571 warn "rec1($f) = ", dump( $rec->{$f} ), $/ if ($debug > 1);
837 dpavlin 536 return unless (defined($rec) && defined($rec->{$f}));
838 dpavlin 571 warn "rec1($f) = ", dump( $rec->{$f} ), $/ if ($debug > 1);
839 dpavlin 536 if (ref($rec->{$f}) eq 'ARRAY') {
840 dpavlin 641 my @out;
841     foreach my $h ( @{ $rec->{$f} } ) {
842     if (ref($h) eq 'HASH') {
843     push @out, ( _pack_subfields_hash( $h ) );
844 dpavlin 31 } else {
845 dpavlin 641 push @out, $h;
846 dpavlin 31 }
847 dpavlin 641 }
848     return @out;
849 dpavlin 536 } elsif( defined($rec->{$f}) ) {
850     return $rec->{$f};
851 dpavlin 15 }
852     }
853    
854 dpavlin 536 =head2 rec2
855 dpavlin 15
856 dpavlin 536 Return all values in specific field and subfield
857 dpavlin 13
858 dpavlin 536 @v = rec2('200','a')
859 dpavlin 13
860     =cut
861    
862 dpavlin 536 sub rec2 {
863     my $f = shift;
864     return unless (defined($rec && $rec->{$f}));
865     my $sf = shift;
866 dpavlin 601 warn "rec2($f,$sf) = ", dump( $rec->{$f} ), $/ if ($debug > 1);
867 dpavlin 589 return map {
868     if (ref($_->{$sf}) eq 'ARRAY') {
869     @{ $_->{$sf} };
870     } else {
871     $_->{$sf};
872     }
873     } grep { ref($_) eq 'HASH' && $_->{$sf} } @{ $rec->{$f} };
874 dpavlin 536 }
875 dpavlin 13
876 dpavlin 536 =head2 rec
877 dpavlin 13
878 dpavlin 536 syntaxtic sugar for
879 dpavlin 13
880 dpavlin 536 @v = rec('200')
881     @v = rec('200','a')
882 dpavlin 13
883 dpavlin 750 If rec() returns just single value, it will
884     return scalar, not array.
885    
886 dpavlin 536 =cut
887 dpavlin 373
888 dpavlin 536 sub rec {
889 dpavlin 583 my @out;
890 dpavlin 536 if ($#_ == 0) {
891 dpavlin 583 @out = rec1(@_);
892 dpavlin 536 } elsif ($#_ == 1) {
893 dpavlin 583 @out = rec2(@_);
894 dpavlin 13 }
895 dpavlin 750 if ($#out == 0 && ! wantarray) {
896     return $out[0];
897     } elsif (@out) {
898 dpavlin 583 return @out;
899     } else {
900     return '';
901     }
902 dpavlin 13 }
903    
904 dpavlin 536 =head2 regex
905 dpavlin 15
906 dpavlin 536 Apply regex to some or all values
907 dpavlin 15
908 dpavlin 536 @v = regex( 's/foo/bar/g', @v );
909 dpavlin 15
910     =cut
911    
912 dpavlin 536 sub regex {
913     my $r = shift;
914     my @out;
915 dpavlin 550 #warn "r: $r\n", dump(\@_);
916 dpavlin 536 foreach my $t (@_) {
917     next unless ($t);
918     eval "\$t =~ $r";
919     push @out, $t if ($t && $t ne '');
920 dpavlin 368 }
921 dpavlin 536 return @out;
922 dpavlin 15 }
923    
924 dpavlin 536 =head2 prefix
925 dpavlin 15
926 dpavlin 536 Prefix all values with a string
927 dpavlin 15
928 dpavlin 536 @v = prefix( 'my_', @v );
929 dpavlin 15
930     =cut
931    
932 dpavlin 536 sub prefix {
933 dpavlin 592 my $p = shift or return;
934 dpavlin 536 return map { $p . $_ } grep { defined($_) } @_;
935     }
936 dpavlin 15
937 dpavlin 536 =head2 suffix
938 dpavlin 15
939 dpavlin 536 suffix all values with a string
940 dpavlin 15
941 dpavlin 536 @v = suffix( '_my', @v );
942 dpavlin 15
943 dpavlin 536 =cut
944 dpavlin 15
945 dpavlin 536 sub suffix {
946     my $s = shift or die "suffix needs string as first argument";
947     return map { $_ . $s } grep { defined($_) } @_;
948 dpavlin 15 }
949    
950 dpavlin 536 =head2 surround
951 dpavlin 13
952 dpavlin 536 surround all values with a two strings
953 dpavlin 13
954 dpavlin 536 @v = surround( 'prefix_', '_suffix', @v );
955 dpavlin 13
956     =cut
957    
958 dpavlin 536 sub surround {
959     my $p = shift or die "surround need prefix as first argument";
960     my $s = shift or die "surround needs suffix as second argument";
961     return map { $p . $_ . $s } grep { defined($_) } @_;
962 dpavlin 13 }
963    
964 dpavlin 536 =head2 first
965 dpavlin 13
966 dpavlin 536 Return first element
967 dpavlin 15
968 dpavlin 536 $v = first( @v );
969 dpavlin 13
970     =cut
971    
972 dpavlin 536 sub first {
973     my $r = shift;
974     return $r;
975 dpavlin 13 }
976    
977 dpavlin 536 =head2 lookup
978 dpavlin 13
979 dpavlin 536 Consult lookup hashes for some value
980 dpavlin 13
981 dpavlin 725 @v = lookup(
982     sub {
983     'ffkk/peri/mfn'.rec('000')
984     },
985     'ffkk','peri','200-a-200-e',
986     sub {
987     first(rec(200,'a')).' '.first(rec('200','e'))
988     }
989     );
990 dpavlin 13
991 dpavlin 725 Code like above will be B<automatically generated> using L<WebPAC::Parse> from
992     normal lookup definition in C<conf/lookup/something.pl> which looks like:
993 dpavlin 707
994 dpavlin 725 lookup(
995     # which results to return from record recorded in lookup
996     sub { 'ffkk/peri/mfn' . rec('000') },
997     # from which database and input
998     'ffkk','peri',
999     # such that following values match
1000     sub { first(rec(200,'a')) . ' ' . first(rec('200','e')) },
1001     # if this part is missing, we will try to match same fields
1002     # from lookup record and current one, or you can override
1003     # which records to use from current record using
1004     sub { rec('900','x') . ' ' . rec('900','y') },
1005     )
1006    
1007     You can think about this lookup as SQL (if that helps):
1008    
1009     select
1010     sub { what }
1011     from
1012     database, input
1013     where
1014     sub { filter from lookuped record }
1015     having
1016     sub { optional filter on current record }
1017    
1018     Easy as pie, right?
1019    
1020 dpavlin 13 =cut
1021    
1022 dpavlin 536 sub lookup {
1023 dpavlin 725 my ($what, $database, $input, $key, $having) = @_;
1024    
1025 dpavlin 766 confess "lookup needs 5 arguments: what, database, input, key, having\n" unless ($#_ == 4);
1026 dpavlin 725
1027 dpavlin 729 warn "## lookup ($database, $input, $key)", $/ if ($debug > 1);
1028 dpavlin 725 return unless (defined($lookup->{$database}->{$input}->{$key}));
1029    
1030 dpavlin 736 confess "lookup really need load_row_coderef added to data_structure\n" unless ($load_row_coderef);
1031 dpavlin 725
1032     my $mfns;
1033     my @having = $having->();
1034    
1035 dpavlin 729 warn "## having = ", dump( @having ) if ($debug > 2);
1036 dpavlin 725
1037     foreach my $h ( @having ) {
1038     if (defined($lookup->{$database}->{$input}->{$key}->{$h})) {
1039 dpavlin 752 warn "lookup for $database/$input/$key/$h return ",dump($lookup->{$database}->{$input}->{$key}->{$h}),"\n" if ($debug);
1040 dpavlin 725 $mfns->{$_}++ foreach keys %{ $lookup->{$database}->{$input}->{$key}->{$h} };
1041     }
1042 dpavlin 536 }
1043 dpavlin 725
1044     return unless ($mfns);
1045    
1046     my @mfns = sort keys %$mfns;
1047    
1048 dpavlin 750 warn "# lookup loading $database/$input/$key mfn ", join(",",@mfns)," having ",dump(@having),"\n" if ($debug);
1049 dpavlin 725
1050     my $old_rec = $rec;
1051     my @out;
1052    
1053     foreach my $mfn (@mfns) {
1054 dpavlin 736 $rec = $load_row_coderef->( $database, $input, $mfn );
1055 dpavlin 725
1056 dpavlin 752 warn "got $database/$input/$mfn = ", dump($rec), $/ if ($debug);
1057 dpavlin 725
1058     my @vals = $what->();
1059    
1060     push @out, ( @vals );
1061    
1062 dpavlin 752 warn "lookup for mfn $mfn returned ", dump(@vals), $/ if ($debug);
1063 dpavlin 725 }
1064    
1065     # if (ref($lookup->{$k}) eq 'ARRAY') {
1066     # return @{ $lookup->{$k} };
1067     # } else {
1068     # return $lookup->{$k};
1069     # }
1070    
1071     $rec = $old_rec;
1072    
1073 dpavlin 750 warn "## lookup returns = ", dump(@out), $/ if ($debug);
1074 dpavlin 725
1075 dpavlin 740 if ($#out == 0) {
1076     return $out[0];
1077     } else {
1078     return @out;
1079     }
1080 dpavlin 13 }
1081    
1082 dpavlin 707 =head2 save_into_lookup
1083    
1084 dpavlin 725 Save value into lookup. It associates current database, input
1085     and specific keys with one or more values which will be
1086     associated over MFN.
1087 dpavlin 707
1088 dpavlin 725 MFN will be extracted from first occurence current of field 000
1089     in current record, or if it doesn't exist from L<_set_config> C<_mfn>.
1090    
1091     my $nr = save_into_lookup($database,$input,$key,sub {
1092 dpavlin 707 # code which produce one or more values
1093     });
1094    
1095 dpavlin 725 It returns number of items saved.
1096 dpavlin 707
1097 dpavlin 725 This function shouldn't be called directly, it's called from code created by
1098     L<WebPAC::Parser>.
1099    
1100 dpavlin 707 =cut
1101    
1102     sub save_into_lookup {
1103 dpavlin 712 my ($database,$input,$key,$coderef) = @_;
1104     die "save_into_lookup needs database" unless defined($database);
1105     die "save_into_lookup needs input" unless defined($input);
1106     die "save_into_lookup needs key" unless defined($key);
1107 dpavlin 707 die "save_into_lookup needs CODE" unless ( defined($coderef) && ref($coderef) eq 'CODE' );
1108 dpavlin 725
1109 dpavlin 729 warn "## save_into_lookup rec = ", dump($rec), " config = ", dump($config), $/ if ($debug > 2);
1110 dpavlin 725
1111     my $mfn =
1112     defined($rec->{'000'}->[0]) ? $rec->{'000'}->[0] :
1113     defined($config->{_mfn}) ? $config->{_mfn} :
1114     die "mfn not defined or zero";
1115    
1116     my $nr = 0;
1117    
1118 dpavlin 707 foreach my $v ( $coderef->() ) {
1119 dpavlin 712 $lookup->{$database}->{$input}->{$key}->{$v}->{$mfn}++;
1120 dpavlin 721 warn "# saved lookup $database/$input/$key [$v] $mfn\n" if ($debug > 1);
1121 dpavlin 725 $nr++;
1122 dpavlin 707 }
1123 dpavlin 725
1124     return $nr;
1125 dpavlin 707 }
1126    
1127 dpavlin 595 =head2 config
1128    
1129     Consult config values stored in C<config.yml>
1130    
1131     # return database code (key under databases in yaml)
1132     $database_code = config(); # use _ from hash
1133     $database_name = config('name');
1134     $database_input_name = config('input name');
1135     $tag = config('input normalize tag');
1136    
1137     Up to three levels are supported.
1138    
1139     =cut
1140    
1141     sub config {
1142     return unless ($config);
1143    
1144     my $p = shift;
1145    
1146     $p ||= '';
1147    
1148     my $v;
1149    
1150     warn "### getting config($p)\n" if ($debug > 1);
1151    
1152     my @p = split(/\s+/,$p);
1153     if ($#p < 0) {
1154     $v = $config->{ '_' }; # special, database code
1155     } else {
1156    
1157     my $c = dclone( $config );
1158    
1159     foreach my $k (@p) {
1160     warn "### k: $k c = ",dump($c),$/ if ($debug > 1);
1161     if (ref($c) eq 'ARRAY') {
1162     $c = shift @$c;
1163     warn "config($p) taking first occurence of '$k', probably not what you wanted!\n";
1164     last;
1165     }
1166    
1167     if (! defined($c->{$k}) ) {
1168     $c = undef;
1169     last;
1170     } else {
1171     $c = $c->{$k};
1172     }
1173     }
1174     $v = $c if ($c);
1175    
1176     }
1177    
1178     warn "## config( '$p' ) = ",dump( $v ),$/ if ($v && $debug);
1179     warn "config( '$p' ) is empty\n" if (! $v);
1180    
1181     return $v;
1182     }
1183    
1184     =head2 id
1185    
1186     Returns unique id of this record
1187    
1188     $id = id();
1189    
1190     Returns C<42/2> for 2nd occurence of MFN 42.
1191    
1192     =cut
1193    
1194     sub id {
1195     my $mfn = $config->{_mfn} || die "no _mfn in config data";
1196     return $mfn . $#{$marc_record} ? $#{$marc_record} + 1 : '';
1197     }
1198    
1199 dpavlin 536 =head2 join_with
1200 dpavlin 13
1201 dpavlin 536 Joins walues with some delimiter
1202 dpavlin 10
1203 dpavlin 536 $v = join_with(", ", @v);
1204 dpavlin 10
1205 dpavlin 536 =cut
1206 dpavlin 10
1207 dpavlin 536 sub join_with {
1208     my $d = shift;
1209 dpavlin 586 warn "### join_with('$d',",dump(@_),")\n" if ($debug > 2);
1210 dpavlin 583 my $v = join($d, grep { defined($_) && $_ ne '' } @_);
1211     return '' unless defined($v);
1212     return $v;
1213 dpavlin 536 }
1214 dpavlin 10
1215 dpavlin 562 =head2 split_rec_on
1216    
1217     Split record subfield on some regex and take one of parts out
1218    
1219     $a_before_semi_column =
1220     split_rec_on('200','a', /\s*;\s*/, $part);
1221    
1222     C<$part> is optional number of element. First element is
1223     B<1>, not 0!
1224    
1225     If there is no C<$part> parameter or C<$part> is 0, this function will
1226     return all values produced by splitting.
1227    
1228     =cut
1229    
1230     sub split_rec_on {
1231     die "split_rec_on need (fld,sf,regex[,part]" if ($#_ < 2);
1232    
1233     my ($fld, $sf, $regex, $part) = @_;
1234 dpavlin 568 warn "### regex ", ref($regex), $regex, $/ if ($debug > 2);
1235 dpavlin 562
1236     my @r = rec( $fld, $sf );
1237     my $v = shift @r;
1238 dpavlin 568 warn "### first rec($fld,$sf) = ",dump($v),$/ if ($debug > 2);
1239 dpavlin 562
1240 dpavlin 604 return '' if ( ! defined($v) || $v =~ /^\s*$/);
1241 dpavlin 566
1242 dpavlin 562 my @s = split( $regex, $v );
1243 dpavlin 568 warn "## split_rec_on($fld,$sf,$regex,$part) = ",dump(@s),$/ if ($debug > 1);
1244 dpavlin 566 if ($part && $part > 0) {
1245 dpavlin 562 return $s[ $part - 1 ];
1246     } else {
1247 dpavlin 571 return @s;
1248 dpavlin 562 }
1249     }
1250    
1251 dpavlin 536 # END
1252     1;

  ViewVC Help
Powered by ViewVC 1.1.26