/[webpac2]/trunk/lib/WebPAC/Normalize.pm
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Annotation of /trunk/lib/WebPAC/Normalize.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 736 - (hide annotations)
Thu Oct 5 12:57:51 2006 UTC (17 years, 6 months ago) by dpavlin
File size: 27446 byte(s)
 r1065@llin:  dpavlin | 2006-10-05 14:54:48 +0200
 actually we don't need *_load_ds, but _load_row for lookups

1 dpavlin 10 package WebPAC::Normalize;
2 dpavlin 536 use Exporter 'import';
3     @EXPORT = qw/
4 dpavlin 538 _set_rec _set_lookup
5 dpavlin 736 _set_load_row
6 dpavlin 538 _get_ds _clean_ds
7 dpavlin 554 _debug
8 dpavlin 641 _pack_subfields_hash
9 dpavlin 538
10 dpavlin 536 tag search display
11 dpavlin 547 marc marc_indicators marc_repeatable_subfield
12 dpavlin 564 marc_compose marc_leader
13 dpavlin 574 marc_duplicate marc_remove
14 dpavlin 604 marc_original_order
15 dpavlin 540
16 dpavlin 536 rec1 rec2 rec
17     regex prefix suffix surround
18     first lookup join_with
19 dpavlin 707 save_into_lookup
20 dpavlin 562
21     split_rec_on
22 dpavlin 536 /;
23 dpavlin 10
24     use warnings;
25     use strict;
26 dpavlin 536
27     #use base qw/WebPAC::Common/;
28 dpavlin 550 use Data::Dump qw/dump/;
29 dpavlin 574 use Storable qw/dclone/;
30 dpavlin 725 use Carp qw/confess/;
31 dpavlin 10
32 dpavlin 550 # debugging warn(s)
33     my $debug = 0;
34    
35    
36 dpavlin 10 =head1 NAME
37    
38 dpavlin 536 WebPAC::Normalize - describe normalisaton rules using sets
39 dpavlin 10
40     =head1 VERSION
41    
42 dpavlin 736 Version 0.22
43 dpavlin 10
44     =cut
45    
46 dpavlin 736 our $VERSION = '0.22';
47 dpavlin 10
48     =head1 SYNOPSIS
49    
50 dpavlin 536 This module uses C<conf/normalize/*.pl> files to perform normalisation
51     from input records using perl functions which are specialized for set
52     processing.
53 dpavlin 10
54 dpavlin 536 Sets are implemented as arrays, and normalisation file is valid perl, which
55     means that you check it's validity before running WebPAC using
56     C<perl -c normalize.pl>.
57 dpavlin 15
58 dpavlin 536 Normalisation can generate multiple output normalized data. For now, supported output
59 dpavlin 540 types (on the left side of definition) are: C<tag>, C<display>, C<search> and
60 dpavlin 547 C<marc>.
61 dpavlin 15
62 dpavlin 10 =head1 FUNCTIONS
63    
64 dpavlin 538 Functions which start with C<_> are private and used by WebPAC internally.
65     All other functions are available for use within normalisation rules.
66    
67 dpavlin 536 =head2 data_structure
68 dpavlin 10
69 dpavlin 536 Return data structure
70 dpavlin 13
71 dpavlin 538 my $ds = WebPAC::Normalize::data_structure(
72 dpavlin 725 lookup => $lookup_hash,
73 dpavlin 536 row => $row,
74     rules => $normalize_pl_config,
75 dpavlin 541 marc_encoding => 'utf-8',
76 dpavlin 595 config => $config,
77 dpavlin 736 load_row_coderef => sub {
78 dpavlin 725 my ($database,$input,$mfn) = shift;
79 dpavlin 736 $store->load_row( database => $database, input => $input, id => $mfn );
80 dpavlin 725 },
81 dpavlin 13 );
82    
83 dpavlin 707 Options C<row>, C<rules> and C<log> are mandatory while all
84 dpavlin 540 other are optional.
85    
86 dpavlin 736 C<load_row_coderef> is closure only used when executing lookups, so they will
87 dpavlin 725 die if it's not defined.
88    
89 dpavlin 536 This function will B<die> if normalizastion can't be evaled.
90 dpavlin 15
91 dpavlin 538 Since this function isn't exported you have to call it with
92     C<WebPAC::Normalize::data_structure>.
93    
94 dpavlin 10 =cut
95    
96 dpavlin 736 my $load_row_coderef;
97 dpavlin 725
98 dpavlin 536 sub data_structure {
99     my $arg = {@_};
100 dpavlin 13
101 dpavlin 536 die "need row argument" unless ($arg->{row});
102     die "need normalisation argument" unless ($arg->{rules});
103 dpavlin 31
104 dpavlin 536 no strict 'subs';
105 dpavlin 730 _set_lookup( $arg->{lookup} ) if defined($arg->{lookup});
106 dpavlin 538 _set_rec( $arg->{row} );
107 dpavlin 730 _set_config( $arg->{config} ) if defined($arg->{config});
108 dpavlin 541 _clean_ds( %{ $arg } );
109 dpavlin 736 $load_row_coderef = $arg->{load_row_coderef};
110 dpavlin 725
111 dpavlin 536 eval "$arg->{rules}";
112     die "error evaling $arg->{rules}: $@\n" if ($@);
113 dpavlin 540
114 dpavlin 538 return _get_ds();
115 dpavlin 10 }
116    
117 dpavlin 538 =head2 _set_rec
118 dpavlin 13
119 dpavlin 536 Set current record hash
120 dpavlin 433
121 dpavlin 538 _set_rec( $rec );
122 dpavlin 433
123     =cut
124    
125 dpavlin 536 my $rec;
126 dpavlin 433
127 dpavlin 538 sub _set_rec {
128 dpavlin 536 $rec = shift or die "no record hash";
129 dpavlin 433 }
130    
131 dpavlin 595 =head2 _set_config
132    
133     Set current config hash
134    
135     _set_config( $config );
136    
137     Magic keys are:
138    
139     =over 4
140    
141     =item _
142    
143     Code of current database
144    
145     =item _mfn
146    
147     Current MFN
148    
149     =back
150    
151     =cut
152    
153     my $config;
154    
155     sub _set_config {
156     $config = shift;
157     }
158    
159 dpavlin 538 =head2 _get_ds
160    
161     Return hash formatted as data structure
162    
163     my $ds = _get_ds();
164    
165     =cut
166    
167 dpavlin 574 my ($out, $marc_record, $marc_encoding, $marc_repeatable_subfield, $marc_indicators);
168     my ($marc_record_offset, $marc_fetch_offset) = (0, 0);
169 dpavlin 538
170     sub _get_ds {
171     return $out;
172     }
173    
174     =head2 _clean_ds
175    
176     Clean data structure hash for next record
177    
178     _clean_ds();
179    
180     =cut
181    
182     sub _clean_ds {
183 dpavlin 541 my $a = {@_};
184 dpavlin 550 ($out,$marc_record, $marc_encoding, $marc_repeatable_subfield, $marc_indicators) = ();
185 dpavlin 574 ($marc_record_offset, $marc_fetch_offset) = (0,0);
186 dpavlin 541 $marc_encoding = $a->{marc_encoding};
187 dpavlin 538 }
188    
189     =head2 _set_lookup
190    
191     Set current lookup hash
192    
193     _set_lookup( $lookup );
194    
195     =cut
196    
197     my $lookup;
198    
199     sub _set_lookup {
200     $lookup = shift;
201     }
202    
203 dpavlin 707 =head2 _get_lookup
204    
205     Get current lookup hash
206    
207     my $lookup = _get_lookup();
208    
209     =cut
210    
211     sub _get_lookup {
212     return $lookup;
213     }
214    
215 dpavlin 736 =head2 _set_load_row
216 dpavlin 725
217     Setup code reference which will return L<data_structure> from
218     L<WebPAC::Store>
219    
220 dpavlin 736 _set_load_row(sub {
221 dpavlin 725 my ($database,$input,$mfn) = @_;
222 dpavlin 736 $store->load_row( database => $database, input => $input, id => $mfn );
223 dpavlin 725 });
224    
225     =cut
226    
227 dpavlin 736 sub _set_load_row {
228 dpavlin 725 my $coderef = shift;
229     confess "argument isn't CODE" unless ref($coderef) eq 'CODE';
230    
231 dpavlin 736 $load_row_coderef = $coderef;
232 dpavlin 725 }
233    
234 dpavlin 547 =head2 _get_marc_fields
235 dpavlin 540
236 dpavlin 547 Get all fields defined by calls to C<marc>
237 dpavlin 540
238 dpavlin 547 $marc->add_fields( WebPAC::Normalize:_get_marc_fields() );
239 dpavlin 540
240 dpavlin 543 We are using I<magic> which detect repeatable fields only from
241     sequence of field/subfield data generated by normalization.
242    
243 dpavlin 554 Repeatable field is created when there is second occurence of same subfield or
244     if any of indicators are different.
245 dpavlin 543
246 dpavlin 554 This is sane for most cases. Something like:
247 dpavlin 543
248 dpavlin 554 900a-1 900b-1 900c-1
249     900a-2 900b-2
250     900a-3
251    
252     will be created from any combination of:
253    
254     900a-1 900a-2 900a-3 900b-1 900b-2 900c-1
255    
256     and following rules:
257    
258     marc('900','a', rec('200','a') );
259     marc('900','b', rec('200','b') );
260     marc('900','c', rec('200','c') );
261    
262     which might not be what you have in mind. If you need repeatable subfield,
263     define it using C<marc_repeatable_subfield> like this:
264    
265 dpavlin 574 marc_repeatable_subfield('900','a');
266     marc('900','a', rec('200','a') );
267     marc('900','b', rec('200','b') );
268     marc('900','c', rec('200','c') );
269 dpavlin 554
270 dpavlin 574 will create:
271    
272     900a-1 900a-2 900a-3 900b-1 900c-1
273     900b-2
274    
275     There is also support for returning next or specific using:
276    
277     while (my $mf = WebPAC::Normalize:_get_marc_fields( fetch_next => 1 ) ) {
278     # do something with $mf
279     }
280    
281     will always return fields from next MARC record or
282    
283     my $mf = WebPAC::Normalize::_get_marc_fields( offset => 42 );
284    
285     will return 42th copy record (if it exists).
286    
287 dpavlin 540 =cut
288    
289 dpavlin 547 sub _get_marc_fields {
290 dpavlin 550
291 dpavlin 574 my $arg = {@_};
292     warn "### _get_marc_fields arg: ", dump($arg), $/ if ($debug > 2);
293     my $offset = $marc_fetch_offset;
294     if ($arg->{offset}) {
295     $offset = $arg->{offset};
296     } elsif($arg->{fetch_next}) {
297     $marc_fetch_offset++;
298     }
299 dpavlin 550
300 dpavlin 574 return if (! $marc_record || ref($marc_record) ne 'ARRAY');
301    
302     warn "### full marc_record = ", dump( @{ $marc_record }), $/ if ($debug > 2);
303    
304     my $marc_rec = $marc_record->[ $offset ];
305    
306     warn "## _get_marc_fields (at offset: $offset) -- marc_record = ", dump( @$marc_rec ), $/ if ($debug > 1);
307    
308     return if (! $marc_rec || ref($marc_rec) ne 'ARRAY' || $#{ $marc_rec } < 0);
309    
310 dpavlin 550 # first, sort all existing fields
311     # XXX might not be needed, but modern perl might randomize elements in hash
312     my @sorted_marc_record = sort {
313 dpavlin 572 $a->[0] . ( $a->[3] || '' ) cmp $b->[0] . ( $b->[3] || '')
314 dpavlin 574 } @{ $marc_rec };
315 dpavlin 550
316 dpavlin 574 @sorted_marc_record = @{ $marc_rec }; ### FIXME disable sorting
317 dpavlin 562
318 dpavlin 550 # output marc fields
319 dpavlin 542 my @m;
320 dpavlin 550
321     # count unique field-subfields (used for offset when walking to next subfield)
322     my $u;
323 dpavlin 572 map { $u->{ $_->[0] . ( $_->[3] || '') }++ } @sorted_marc_record;
324 dpavlin 550
325     if ($debug) {
326 dpavlin 574 warn "## marc_repeatable_subfield = ", dump( $marc_repeatable_subfield ), $/ if ( $marc_repeatable_subfield );
327     warn "## marc_record[$offset] = ", dump( $marc_rec ), $/;
328     warn "## sorted_marc_record = ", dump( \@sorted_marc_record ), $/;
329     warn "## subfield count = ", dump( $u ), $/;
330 dpavlin 550 }
331    
332     my $len = $#sorted_marc_record;
333     my $visited;
334     my $i = 0;
335     my $field;
336    
337     foreach ( 0 .. $len ) {
338    
339     # find next element which isn't visited
340     while ($visited->{$i}) {
341     $i = ($i + 1) % ($len + 1);
342 dpavlin 542 }
343    
344 dpavlin 550 # mark it visited
345     $visited->{$i}++;
346    
347 dpavlin 574 my $row = dclone( $sorted_marc_record[$i] );
348 dpavlin 550
349     # field and subfield which is key for
350     # marc_repeatable_subfield and u
351 dpavlin 572 my $fsf = $row->[0] . ( $row->[3] || '' );
352 dpavlin 550
353     if ($debug > 1) {
354    
355     print "### field so far [", $#$field, "] : ", dump( $field ), " ", $field ? 'T' : 'F', $/;
356     print "### this [$i]: ", dump( $row ),$/;
357     print "### sf: ", $row->[3], " vs ", $field->[3],
358     $marc_repeatable_subfield->{ $row->[0] . $row->[3] } ? ' (repeatable)' : '', $/,
359     if ($#$field >= 0);
360    
361     }
362    
363     # if field exists
364     if ( $#$field >= 0 ) {
365     if (
366     $row->[0] ne $field->[0] || # field
367     $row->[1] ne $field->[1] || # i1
368     $row->[2] ne $field->[2] # i2
369     ) {
370     push @m, $field;
371     warn "## saved/1 ", dump( $field ),$/ if ($debug);
372     $field = $row;
373    
374     } elsif (
375     ( $row->[3] lt $field->[-2] ) # subfield which is not next (e.g. a after c)
376     ||
377     ( $row->[3] eq $field->[-2] && # same subfield, but not repeatable
378     ! $marc_repeatable_subfield->{ $fsf }
379     )
380     ) {
381     push @m, $field;
382     warn "## saved/2 ", dump( $field ),$/ if ($debug);
383     $field = $row;
384    
385     } else {
386     # append new subfields to existing field
387     push @$field, ( $row->[3], $row->[4] );
388     }
389     } else {
390     # insert first field
391     $field = $row;
392     }
393    
394     if (! $marc_repeatable_subfield->{ $fsf }) {
395     # make step to next subfield
396     $i = ($i + $u->{ $fsf } ) % ($len + 1);
397     }
398 dpavlin 542 }
399    
400 dpavlin 550 if ($#$field >= 0) {
401     push @m, $field;
402     warn "## saved/3 ", dump( $field ),$/ if ($debug);
403     }
404 dpavlin 542
405 dpavlin 579 return \@m;
406 dpavlin 540 }
407    
408 dpavlin 554 =head2 _debug
409    
410     Change level of debug warnings
411    
412     _debug( 2 );
413    
414     =cut
415    
416     sub _debug {
417     my $l = shift;
418     return $debug unless defined($l);
419 dpavlin 568 warn "debug level $l",$/ if ($l > 0);
420 dpavlin 554 $debug = $l;
421     }
422    
423 dpavlin 540 =head1 Functions to create C<data_structure>
424    
425     Those functions generally have to first in your normalization file.
426    
427 dpavlin 536 =head2 tag
428 dpavlin 433
429 dpavlin 536 Define new tag for I<search> and I<display>.
430 dpavlin 433
431 dpavlin 536 tag('Title', rec('200','a') );
432 dpavlin 13
433    
434     =cut
435    
436 dpavlin 536 sub tag {
437     my $name = shift or die "tag needs name as first argument";
438     my @o = grep { defined($_) && $_ ne '' } @_;
439     return unless (@o);
440     $out->{$name}->{tag} = $name;
441     $out->{$name}->{search} = \@o;
442     $out->{$name}->{display} = \@o;
443     }
444 dpavlin 13
445 dpavlin 536 =head2 display
446 dpavlin 13
447 dpavlin 536 Define tag just for I<display>
448 dpavlin 125
449 dpavlin 536 @v = display('Title', rec('200','a') );
450 dpavlin 125
451 dpavlin 536 =cut
452 dpavlin 125
453 dpavlin 536 sub display {
454     my $name = shift or die "display needs name as first argument";
455     my @o = grep { defined($_) && $_ ne '' } @_;
456     return unless (@o);
457     $out->{$name}->{tag} = $name;
458     $out->{$name}->{display} = \@o;
459     }
460 dpavlin 13
461 dpavlin 536 =head2 search
462 dpavlin 13
463 dpavlin 536 Prepare values just for I<search>
464 dpavlin 13
465 dpavlin 536 @v = search('Title', rec('200','a') );
466 dpavlin 433
467 dpavlin 536 =cut
468 dpavlin 13
469 dpavlin 536 sub search {
470     my $name = shift or die "search needs name as first argument";
471     my @o = grep { defined($_) && $_ ne '' } @_;
472     return unless (@o);
473     $out->{$name}->{tag} = $name;
474     $out->{$name}->{search} = \@o;
475 dpavlin 13 }
476    
477 dpavlin 564 =head2 marc_leader
478    
479     Setup fields within MARC leader or get leader
480    
481     marc_leader('05','c');
482     my $leader = marc_leader();
483    
484     =cut
485    
486     sub marc_leader {
487     my ($offset,$value) = @_;
488    
489     if ($offset) {
490     $out->{' leader'}->{ $offset } = $value;
491     } else {
492     return $out->{' leader'};
493     }
494     }
495    
496 dpavlin 547 =head2 marc
497 dpavlin 540
498     Save value for MARC field
499    
500 dpavlin 547 marc('900','a', rec('200','a') );
501 dpavlin 571 marc('001', rec('000') );
502 dpavlin 540
503     =cut
504    
505 dpavlin 547 sub marc {
506     my $f = shift or die "marc needs field";
507     die "marc field must be numer" unless ($f =~ /^\d+$/);
508 dpavlin 540
509 dpavlin 571 my $sf;
510     if ($f >= 10) {
511     $sf = shift or die "marc needs subfield";
512     }
513 dpavlin 540
514 dpavlin 541 foreach (@_) {
515     my $v = $_; # make var read-write for Encode
516 dpavlin 543 next unless (defined($v) && $v !~ /^\s*$/);
517 dpavlin 548 my ($i1,$i2) = defined($marc_indicators->{$f}) ? @{ $marc_indicators->{$f} } : (' ',' ');
518 dpavlin 571 if (defined $sf) {
519 dpavlin 574 push @{ $marc_record->[ $marc_record_offset ] }, [ $f, $i1, $i2, $sf => $v ];
520 dpavlin 571 } else {
521 dpavlin 574 push @{ $marc_record->[ $marc_record_offset ] }, [ $f, $v ];
522 dpavlin 571 }
523 dpavlin 540 }
524     }
525    
526 dpavlin 547 =head2 marc_repeatable_subfield
527    
528     Save values for MARC repetable subfield
529    
530     marc_repeatable_subfield('910', 'z', rec('909') );
531    
532     =cut
533    
534     sub marc_repeatable_subfield {
535 dpavlin 550 my ($f,$sf) = @_;
536     die "marc_repeatable_subfield need field and subfield!\n" unless ($f && $sf);
537     $marc_repeatable_subfield->{ $f . $sf }++;
538 dpavlin 547 marc(@_);
539     }
540    
541     =head2 marc_indicators
542    
543     Set both indicators for MARC field
544    
545     marc_indicators('900', ' ', 1);
546    
547     Any indicator value other than C<0-9> will be treated as undefined.
548    
549     =cut
550    
551     sub marc_indicators {
552     my $f = shift || die "marc_indicators need field!\n";
553     my ($i1,$i2) = @_;
554     die "marc_indicators($f, ...) need i1!\n" unless(defined($i1));
555     die "marc_indicators($f, $i1, ...) need i2!\n" unless(defined($i2));
556    
557     $i1 = ' ' if ($i1 !~ /^\d$/);
558     $i2 = ' ' if ($i2 !~ /^\d$/);
559 dpavlin 548 @{ $marc_indicators->{$f} } = ($i1,$i2);
560 dpavlin 547 }
561    
562 dpavlin 562 =head2 marc_compose
563 dpavlin 547
564 dpavlin 562 Save values for each MARC subfield explicitly
565    
566     marc_compose('900',
567     'a', rec('200','a')
568     'b', rec('201','a')
569     'a', rec('200','b')
570     'c', rec('200','c')
571     );
572    
573 dpavlin 603 If you specify C<+> for subfield, value will be appended
574     to previous defined subfield.
575    
576 dpavlin 562 =cut
577    
578     sub marc_compose {
579     my $f = shift or die "marc_compose needs field";
580     die "marc_compose field must be numer" unless ($f =~ /^\d+$/);
581    
582     my ($i1,$i2) = defined($marc_indicators->{$f}) ? @{ $marc_indicators->{$f} } : (' ',' ');
583     my $m = [ $f, $i1, $i2 ];
584    
585 dpavlin 583 warn "### marc_compose input subfields = ", dump(@_),$/ if ($debug > 2);
586    
587 dpavlin 619 if ($#_ % 2 != 1) {
588     die "ERROR: marc_compose",dump($f,@_)," not valid (must be even).\nDo you need to add first() or join() around some argument?\n";
589     }
590    
591 dpavlin 562 while (@_) {
592 dpavlin 619 my $sf = shift;
593 dpavlin 565 my $v = shift;
594 dpavlin 562
595     next unless (defined($v) && $v !~ /^\s*$/);
596 dpavlin 568 warn "## ++ marc_compose($f,$sf,$v) ", dump( $m ),$/ if ($debug > 1);
597 dpavlin 603 if ($sf ne '+') {
598     push @$m, ( $sf, $v );
599     } else {
600     $m->[ $#$m ] .= $v;
601     }
602 dpavlin 562 }
603    
604 dpavlin 586 warn "## marc_compose current marc = ", dump( $m ),$/ if ($debug > 1);
605 dpavlin 562
606 dpavlin 574 push @{ $marc_record->[ $marc_record_offset ] }, $m if ($#{$m} > 2);
607 dpavlin 562 }
608    
609 dpavlin 574 =head2 marc_duplicate
610 dpavlin 562
611 dpavlin 574 Generate copy of current MARC record and continue working on copy
612    
613     marc_duplicate();
614    
615     Copies can be accessed using C<< _get_marc_fields( fetch_next => 1 ) >> or
616     C<< _get_marc_fields( offset => 42 ) >>.
617    
618     =cut
619    
620     sub marc_duplicate {
621     my $m = $marc_record->[ -1 ];
622     die "can't duplicate record which isn't defined" unless ($m);
623     push @{ $marc_record }, dclone( $m );
624     warn "## marc_duplicate = ", dump(@$marc_record), $/ if ($debug > 1);
625     $marc_record_offset = $#{ $marc_record };
626     warn "## marc_record_offset = $marc_record_offset", $/ if ($debug > 1);
627     }
628    
629     =head2 marc_remove
630    
631     Remove some field or subfield from MARC record.
632    
633     marc_remove('200');
634     marc_remove('200','a');
635    
636     This will erase field C<200> or C<200^a> from current MARC record.
637    
638     This is useful after calling C<marc_duplicate> or on it's own (but, you
639     should probably just remove that subfield definition if you are not
640     using C<marc_duplicate>).
641    
642     FIXME: support fields < 10.
643    
644     =cut
645    
646     sub marc_remove {
647     my ($f, $sf) = @_;
648    
649     die "marc_remove needs record number" unless defined($f);
650    
651     my $marc = $marc_record->[ $marc_record_offset ];
652    
653     warn "### marc_remove before = ", dump( $marc ), $/ if ($debug > 2);
654    
655 dpavlin 579 my $i = 0;
656     foreach ( 0 .. $#{ $marc } ) {
657 dpavlin 574 last unless (defined $marc->[$i]);
658     warn "#### working on ",dump( @{ $marc->[$i] }), $/ if ($debug > 3);
659     if ($marc->[$i]->[0] eq $f) {
660     if (! defined $sf) {
661     # remove whole field
662     splice @$marc, $i, 1;
663     warn "#### slice \@\$marc, $i, 1 = ",dump( @{ $marc }), $/ if ($debug > 3);
664     $i--;
665     } else {
666     foreach my $j ( 0 .. (( $#{ $marc->[$i] } - 3 ) / 2) ) {
667     my $o = ($j * 2) + 3;
668     if ($marc->[$i]->[$o] eq $sf) {
669     # remove subfield
670     splice @{$marc->[$i]}, $o, 2;
671     warn "#### slice \@{\$marc->[$i]}, $o, 2 = ", dump( @{ $marc }), $/ if ($debug > 3);
672     # is record now empty?
673     if ($#{ $marc->[$i] } == 2) {
674     splice @$marc, $i, 1;
675     warn "#### slice \@\$marc, $i, 1 = ", dump( @{ $marc }), $/ if ($debug > 3);
676     $i--;
677     };
678     }
679     }
680     }
681     }
682 dpavlin 579 $i++;
683 dpavlin 574 }
684    
685     warn "### marc_remove($f", $sf ? ",$sf" : "", ") after = ", dump( $marc ), $/ if ($debug > 2);
686    
687     $marc_record->[ $marc_record_offset ] = $marc;
688    
689     warn "## full marc_record = ", dump( @{ $marc_record }), $/ if ($debug > 1);
690     }
691    
692 dpavlin 604 =head2 marc_original_order
693    
694     Copy all subfields preserving original order to marc field.
695    
696 dpavlin 616 marc_original_order( marc_field_number, original_input_field_number );
697 dpavlin 604
698 dpavlin 616 Please note that field numbers are consistent with other commands (marc
699     field number first), but somewhat counter-intuitive (destination and then
700     source).
701    
702 dpavlin 604 You might want to use this command if you are just renaming subfields or
703     using pre-processing modify_record in C<config.yml> and don't need any
704     post-processing or want to preserve order of original subfields.
705    
706 dpavlin 616
707 dpavlin 604 =cut
708    
709     sub marc_original_order {
710    
711 dpavlin 616 my ($to, $from) = @_;
712 dpavlin 604 die "marc_original_order needs from and to fields\n" unless ($from && $to);
713    
714 dpavlin 616 return unless defined($rec->{$from});
715    
716     my $r = $rec->{$from};
717 dpavlin 604 die "record field $from isn't array\n" unless (ref($r) eq 'ARRAY');
718    
719     my ($i1,$i2) = defined($marc_indicators->{$to}) ? @{ $marc_indicators->{$to} } : (' ',' ');
720 dpavlin 616 warn "## marc_original_order($to,$from) source = ", dump( $r ),$/ if ($debug > 1);
721 dpavlin 604
722     foreach my $d (@$r) {
723    
724 dpavlin 605 if (! defined($d->{subfields}) && ref($d->{subfields}) ne 'ARRAY') {
725 dpavlin 616 warn "# marc_original_order($to,$from): field $from doesn't have subfields specification\n";
726 dpavlin 605 next;
727     }
728    
729 dpavlin 604 my @sfs = @{ $d->{subfields} };
730    
731     die "field $from doesn't have even number of subfields specifications\n" unless($#sfs % 2 == 1);
732    
733 dpavlin 618 warn "#--> d: ",dump($d), "\n#--> sfs: ",dump(@sfs),$/ if ($debug > 2);
734 dpavlin 604
735     my $m = [ $to, $i1, $i2 ];
736    
737     while (my $sf = shift @sfs) {
738 dpavlin 618
739     warn "#--> sf: ",dump($sf), $/ if ($debug > 2);
740 dpavlin 604 my $offset = shift @sfs;
741     die "corrupted sufields specification for field $from\n" unless defined($offset);
742    
743     my $v;
744     if (ref($d->{$sf}) eq 'ARRAY') {
745     $v = $d->{$sf}->[$offset] if (defined($d->{$sf}->[$offset]));
746     } elsif ($offset == 0) {
747     $v = $d->{$sf};
748     } else {
749     die "field $from subfield '$sf' need occurence $offset which doesn't exist", dump($d->{$sf});
750     }
751     push @$m, ( $sf, $v ) if (defined($v));
752     }
753    
754     if ($#{$m} > 2) {
755     push @{ $marc_record->[ $marc_record_offset ] }, $m;
756     }
757     }
758    
759     warn "## marc_record = ", dump( $marc_record ),$/ if ($debug > 1);
760     }
761    
762    
763 dpavlin 540 =head1 Functions to extract data from input
764    
765     This function should be used inside functions to create C<data_structure> described
766     above.
767    
768 dpavlin 641 =head2 _pack_subfields_hash
769    
770 dpavlin 669 @subfields = _pack_subfields_hash( $h );
771     $subfields = _pack_subfields_hash( $h, 1 );
772 dpavlin 641
773 dpavlin 669 Return each subfield value in array or pack them all together and return scalar
774     with subfields (denoted by C<^>) and values.
775    
776 dpavlin 641 =cut
777    
778     sub _pack_subfields_hash {
779    
780 dpavlin 642 warn "## _pack_subfields_hash( ",dump(@_), " )\n" if ($debug > 1);
781 dpavlin 641
782     my ($h,$include_subfields) = @_;
783    
784     if ( defined($h->{subfields}) ) {
785     my $sfs = delete $h->{subfields} || die "no subfields?";
786     my @out;
787     while (@$sfs) {
788     my $sf = shift @$sfs;
789     push @out, '^' . $sf if ($include_subfields);
790     my $o = shift @$sfs;
791     if ($o == 0 && ref( $h->{$sf} ) ne 'ARRAY' ) {
792     # single element subfields are not arrays
793 dpavlin 669 #warn "====> $sf $o / $#$sfs ", dump( $sfs, $h->{$sf} ), "\n";
794 dpavlin 667
795 dpavlin 641 push @out, $h->{$sf};
796     } else {
797 dpavlin 669 #warn "====> $sf $o / $#$sfs ", dump( $sfs, $h->{$sf} ), "\n";
798 dpavlin 641 push @out, $h->{$sf}->[$o];
799     }
800     }
801 dpavlin 667 if ($include_subfields) {
802     return join('', @out);
803     } else {
804     return @out;
805     }
806 dpavlin 641 } else {
807 dpavlin 667 if ($include_subfields) {
808     my $out = '';
809 dpavlin 668 foreach my $sf (sort keys %$h) {
810 dpavlin 667 if (ref($h->{$sf}) eq 'ARRAY') {
811     $out .= '^' . $sf . join('^' . $sf, @{ $h->{$sf} });
812     } else {
813     $out .= '^' . $sf . $h->{$sf};
814     }
815     }
816     return $out;
817     } else {
818     # FIXME this should probably be in alphabetical order instead of hash order
819     values %{$h};
820     }
821 dpavlin 641 }
822     }
823    
824 dpavlin 536 =head2 rec1
825 dpavlin 371
826 dpavlin 536 Return all values in some field
827 dpavlin 371
828 dpavlin 536 @v = rec1('200')
829 dpavlin 15
830 dpavlin 536 TODO: order of values is probably same as in source data, need to investigate that
831 dpavlin 15
832 dpavlin 536 =cut
833 dpavlin 15
834 dpavlin 536 sub rec1 {
835     my $f = shift;
836 dpavlin 571 warn "rec1($f) = ", dump( $rec->{$f} ), $/ if ($debug > 1);
837 dpavlin 536 return unless (defined($rec) && defined($rec->{$f}));
838 dpavlin 571 warn "rec1($f) = ", dump( $rec->{$f} ), $/ if ($debug > 1);
839 dpavlin 536 if (ref($rec->{$f}) eq 'ARRAY') {
840 dpavlin 641 my @out;
841     foreach my $h ( @{ $rec->{$f} } ) {
842     if (ref($h) eq 'HASH') {
843     push @out, ( _pack_subfields_hash( $h ) );
844 dpavlin 31 } else {
845 dpavlin 641 push @out, $h;
846 dpavlin 31 }
847 dpavlin 641 }
848     return @out;
849 dpavlin 536 } elsif( defined($rec->{$f}) ) {
850     return $rec->{$f};
851 dpavlin 15 }
852     }
853    
854 dpavlin 536 =head2 rec2
855 dpavlin 15
856 dpavlin 536 Return all values in specific field and subfield
857 dpavlin 13
858 dpavlin 536 @v = rec2('200','a')
859 dpavlin 13
860     =cut
861    
862 dpavlin 536 sub rec2 {
863     my $f = shift;
864     return unless (defined($rec && $rec->{$f}));
865     my $sf = shift;
866 dpavlin 601 warn "rec2($f,$sf) = ", dump( $rec->{$f} ), $/ if ($debug > 1);
867 dpavlin 589 return map {
868     if (ref($_->{$sf}) eq 'ARRAY') {
869     @{ $_->{$sf} };
870     } else {
871     $_->{$sf};
872     }
873     } grep { ref($_) eq 'HASH' && $_->{$sf} } @{ $rec->{$f} };
874 dpavlin 536 }
875 dpavlin 13
876 dpavlin 536 =head2 rec
877 dpavlin 13
878 dpavlin 536 syntaxtic sugar for
879 dpavlin 13
880 dpavlin 536 @v = rec('200')
881     @v = rec('200','a')
882 dpavlin 13
883 dpavlin 536 =cut
884 dpavlin 373
885 dpavlin 536 sub rec {
886 dpavlin 583 my @out;
887 dpavlin 536 if ($#_ == 0) {
888 dpavlin 583 @out = rec1(@_);
889 dpavlin 536 } elsif ($#_ == 1) {
890 dpavlin 583 @out = rec2(@_);
891 dpavlin 13 }
892 dpavlin 583 if (@out) {
893     return @out;
894     } else {
895     return '';
896     }
897 dpavlin 13 }
898    
899 dpavlin 536 =head2 regex
900 dpavlin 15
901 dpavlin 536 Apply regex to some or all values
902 dpavlin 15
903 dpavlin 536 @v = regex( 's/foo/bar/g', @v );
904 dpavlin 15
905     =cut
906    
907 dpavlin 536 sub regex {
908     my $r = shift;
909     my @out;
910 dpavlin 550 #warn "r: $r\n", dump(\@_);
911 dpavlin 536 foreach my $t (@_) {
912     next unless ($t);
913     eval "\$t =~ $r";
914     push @out, $t if ($t && $t ne '');
915 dpavlin 368 }
916 dpavlin 536 return @out;
917 dpavlin 15 }
918    
919 dpavlin 536 =head2 prefix
920 dpavlin 15
921 dpavlin 536 Prefix all values with a string
922 dpavlin 15
923 dpavlin 536 @v = prefix( 'my_', @v );
924 dpavlin 15
925     =cut
926    
927 dpavlin 536 sub prefix {
928 dpavlin 592 my $p = shift or return;
929 dpavlin 536 return map { $p . $_ } grep { defined($_) } @_;
930     }
931 dpavlin 15
932 dpavlin 536 =head2 suffix
933 dpavlin 15
934 dpavlin 536 suffix all values with a string
935 dpavlin 15
936 dpavlin 536 @v = suffix( '_my', @v );
937 dpavlin 15
938 dpavlin 536 =cut
939 dpavlin 15
940 dpavlin 536 sub suffix {
941     my $s = shift or die "suffix needs string as first argument";
942     return map { $_ . $s } grep { defined($_) } @_;
943 dpavlin 15 }
944    
945 dpavlin 536 =head2 surround
946 dpavlin 13
947 dpavlin 536 surround all values with a two strings
948 dpavlin 13
949 dpavlin 536 @v = surround( 'prefix_', '_suffix', @v );
950 dpavlin 13
951     =cut
952    
953 dpavlin 536 sub surround {
954     my $p = shift or die "surround need prefix as first argument";
955     my $s = shift or die "surround needs suffix as second argument";
956     return map { $p . $_ . $s } grep { defined($_) } @_;
957 dpavlin 13 }
958    
959 dpavlin 536 =head2 first
960 dpavlin 13
961 dpavlin 536 Return first element
962 dpavlin 15
963 dpavlin 536 $v = first( @v );
964 dpavlin 13
965     =cut
966    
967 dpavlin 536 sub first {
968     my $r = shift;
969     return $r;
970 dpavlin 13 }
971    
972 dpavlin 536 =head2 lookup
973 dpavlin 13
974 dpavlin 536 Consult lookup hashes for some value
975 dpavlin 13
976 dpavlin 725 @v = lookup(
977     sub {
978     'ffkk/peri/mfn'.rec('000')
979     },
980     'ffkk','peri','200-a-200-e',
981     sub {
982     first(rec(200,'a')).' '.first(rec('200','e'))
983     }
984     );
985 dpavlin 13
986 dpavlin 725 Code like above will be B<automatically generated> using L<WebPAC::Parse> from
987     normal lookup definition in C<conf/lookup/something.pl> which looks like:
988 dpavlin 707
989 dpavlin 725 lookup(
990     # which results to return from record recorded in lookup
991     sub { 'ffkk/peri/mfn' . rec('000') },
992     # from which database and input
993     'ffkk','peri',
994     # such that following values match
995     sub { first(rec(200,'a')) . ' ' . first(rec('200','e')) },
996     # if this part is missing, we will try to match same fields
997     # from lookup record and current one, or you can override
998     # which records to use from current record using
999     sub { rec('900','x') . ' ' . rec('900','y') },
1000     )
1001    
1002     You can think about this lookup as SQL (if that helps):
1003    
1004     select
1005     sub { what }
1006     from
1007     database, input
1008     where
1009     sub { filter from lookuped record }
1010     having
1011     sub { optional filter on current record }
1012    
1013     Easy as pie, right?
1014    
1015 dpavlin 13 =cut
1016    
1017 dpavlin 536 sub lookup {
1018 dpavlin 725 my ($what, $database, $input, $key, $having) = @_;
1019    
1020     confess "lookup needs 5 arguments: what, database, input, key, having" unless ($#_ == 4);
1021    
1022 dpavlin 729 warn "## lookup ($database, $input, $key)", $/ if ($debug > 1);
1023 dpavlin 725 return unless (defined($lookup->{$database}->{$input}->{$key}));
1024    
1025 dpavlin 736 confess "lookup really need load_row_coderef added to data_structure\n" unless ($load_row_coderef);
1026 dpavlin 725
1027     my $mfns;
1028     my @having = $having->();
1029    
1030 dpavlin 729 warn "## having = ", dump( @having ) if ($debug > 2);
1031 dpavlin 725
1032     foreach my $h ( @having ) {
1033     if (defined($lookup->{$database}->{$input}->{$key}->{$h})) {
1034     warn "lookup for $database/$input/$key/$h return ",dump($lookup->{$database}->{$input}->{$key}->{$h}),"\n";
1035     $mfns->{$_}++ foreach keys %{ $lookup->{$database}->{$input}->{$key}->{$h} };
1036     }
1037 dpavlin 536 }
1038 dpavlin 725
1039     return unless ($mfns);
1040    
1041     my @mfns = sort keys %$mfns;
1042    
1043     warn "# lookup loading $database/$input/$key mfn ", join(",",@mfns)," having ",dump(@having),"\n";
1044    
1045     my $old_rec = $rec;
1046     my @out;
1047    
1048     foreach my $mfn (@mfns) {
1049 dpavlin 736 $rec = $load_row_coderef->( $database, $input, $mfn );
1050 dpavlin 725
1051     warn "got $database/$input/$mfn = ", dump($rec), $/;
1052    
1053     my @vals = $what->();
1054    
1055     push @out, ( @vals );
1056    
1057     warn "lookup for mfn $mfn returned ", dump(@vals), $/;
1058     }
1059    
1060     # if (ref($lookup->{$k}) eq 'ARRAY') {
1061     # return @{ $lookup->{$k} };
1062     # } else {
1063     # return $lookup->{$k};
1064     # }
1065    
1066     $rec = $old_rec;
1067    
1068     warn "## lookup returns = ", dump(@out), $/;
1069    
1070     return @out;
1071 dpavlin 13 }
1072    
1073 dpavlin 707 =head2 save_into_lookup
1074    
1075 dpavlin 725 Save value into lookup. It associates current database, input
1076     and specific keys with one or more values which will be
1077     associated over MFN.
1078 dpavlin 707
1079 dpavlin 725 MFN will be extracted from first occurence current of field 000
1080     in current record, or if it doesn't exist from L<_set_config> C<_mfn>.
1081    
1082     my $nr = save_into_lookup($database,$input,$key,sub {
1083 dpavlin 707 # code which produce one or more values
1084     });
1085    
1086 dpavlin 725 It returns number of items saved.
1087 dpavlin 707
1088 dpavlin 725 This function shouldn't be called directly, it's called from code created by
1089     L<WebPAC::Parser>.
1090    
1091 dpavlin 707 =cut
1092    
1093     sub save_into_lookup {
1094 dpavlin 712 my ($database,$input,$key,$coderef) = @_;
1095     die "save_into_lookup needs database" unless defined($database);
1096     die "save_into_lookup needs input" unless defined($input);
1097     die "save_into_lookup needs key" unless defined($key);
1098 dpavlin 707 die "save_into_lookup needs CODE" unless ( defined($coderef) && ref($coderef) eq 'CODE' );
1099 dpavlin 725
1100 dpavlin 729 warn "## save_into_lookup rec = ", dump($rec), " config = ", dump($config), $/ if ($debug > 2);
1101 dpavlin 725
1102     my $mfn =
1103     defined($rec->{'000'}->[0]) ? $rec->{'000'}->[0] :
1104     defined($config->{_mfn}) ? $config->{_mfn} :
1105     die "mfn not defined or zero";
1106    
1107     my $nr = 0;
1108    
1109 dpavlin 707 foreach my $v ( $coderef->() ) {
1110 dpavlin 712 $lookup->{$database}->{$input}->{$key}->{$v}->{$mfn}++;
1111 dpavlin 721 warn "# saved lookup $database/$input/$key [$v] $mfn\n" if ($debug > 1);
1112 dpavlin 725 $nr++;
1113 dpavlin 707 }
1114 dpavlin 725
1115     return $nr;
1116 dpavlin 707 }
1117    
1118 dpavlin 595 =head2 config
1119    
1120     Consult config values stored in C<config.yml>
1121    
1122     # return database code (key under databases in yaml)
1123     $database_code = config(); # use _ from hash
1124     $database_name = config('name');
1125     $database_input_name = config('input name');
1126     $tag = config('input normalize tag');
1127    
1128     Up to three levels are supported.
1129    
1130     =cut
1131    
1132     sub config {
1133     return unless ($config);
1134    
1135     my $p = shift;
1136    
1137     $p ||= '';
1138    
1139     my $v;
1140    
1141     warn "### getting config($p)\n" if ($debug > 1);
1142    
1143     my @p = split(/\s+/,$p);
1144     if ($#p < 0) {
1145     $v = $config->{ '_' }; # special, database code
1146     } else {
1147    
1148     my $c = dclone( $config );
1149    
1150     foreach my $k (@p) {
1151     warn "### k: $k c = ",dump($c),$/ if ($debug > 1);
1152     if (ref($c) eq 'ARRAY') {
1153     $c = shift @$c;
1154     warn "config($p) taking first occurence of '$k', probably not what you wanted!\n";
1155     last;
1156     }
1157    
1158     if (! defined($c->{$k}) ) {
1159     $c = undef;
1160     last;
1161     } else {
1162     $c = $c->{$k};
1163     }
1164     }
1165     $v = $c if ($c);
1166    
1167     }
1168    
1169     warn "## config( '$p' ) = ",dump( $v ),$/ if ($v && $debug);
1170     warn "config( '$p' ) is empty\n" if (! $v);
1171    
1172     return $v;
1173     }
1174    
1175     =head2 id
1176    
1177     Returns unique id of this record
1178    
1179     $id = id();
1180    
1181     Returns C<42/2> for 2nd occurence of MFN 42.
1182    
1183     =cut
1184    
1185     sub id {
1186     my $mfn = $config->{_mfn} || die "no _mfn in config data";
1187     return $mfn . $#{$marc_record} ? $#{$marc_record} + 1 : '';
1188     }
1189    
1190 dpavlin 536 =head2 join_with
1191 dpavlin 13
1192 dpavlin 536 Joins walues with some delimiter
1193 dpavlin 10
1194 dpavlin 536 $v = join_with(", ", @v);
1195 dpavlin 10
1196 dpavlin 536 =cut
1197 dpavlin 10
1198 dpavlin 536 sub join_with {
1199     my $d = shift;
1200 dpavlin 586 warn "### join_with('$d',",dump(@_),")\n" if ($debug > 2);
1201 dpavlin 583 my $v = join($d, grep { defined($_) && $_ ne '' } @_);
1202     return '' unless defined($v);
1203     return $v;
1204 dpavlin 536 }
1205 dpavlin 10
1206 dpavlin 562 =head2 split_rec_on
1207    
1208     Split record subfield on some regex and take one of parts out
1209    
1210     $a_before_semi_column =
1211     split_rec_on('200','a', /\s*;\s*/, $part);
1212    
1213     C<$part> is optional number of element. First element is
1214     B<1>, not 0!
1215    
1216     If there is no C<$part> parameter or C<$part> is 0, this function will
1217     return all values produced by splitting.
1218    
1219     =cut
1220    
1221     sub split_rec_on {
1222     die "split_rec_on need (fld,sf,regex[,part]" if ($#_ < 2);
1223    
1224     my ($fld, $sf, $regex, $part) = @_;
1225 dpavlin 568 warn "### regex ", ref($regex), $regex, $/ if ($debug > 2);
1226 dpavlin 562
1227     my @r = rec( $fld, $sf );
1228     my $v = shift @r;
1229 dpavlin 568 warn "### first rec($fld,$sf) = ",dump($v),$/ if ($debug > 2);
1230 dpavlin 562
1231 dpavlin 604 return '' if ( ! defined($v) || $v =~ /^\s*$/);
1232 dpavlin 566
1233 dpavlin 562 my @s = split( $regex, $v );
1234 dpavlin 568 warn "## split_rec_on($fld,$sf,$regex,$part) = ",dump(@s),$/ if ($debug > 1);
1235 dpavlin 566 if ($part && $part > 0) {
1236 dpavlin 562 return $s[ $part - 1 ];
1237     } else {
1238 dpavlin 571 return @s;
1239 dpavlin 562 }
1240     }
1241    
1242 dpavlin 536 # END
1243     1;

  ViewVC Help
Powered by ViewVC 1.1.26