/[webpac2]/trunk/lib/WebPAC/Normalize.pm
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Contents of /trunk/lib/WebPAC/Normalize.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 812 - (show annotations)
Sun Apr 1 21:47:42 2007 UTC (15 years, 10 months ago) by dpavlin
File size: 28490 byte(s)
 r1181@llin:  dpavlin | 2007-04-01 22:47:56 +0200
  r1180@llin:  dpavlin | 2007-04-01 22:47:43 +0200
  renamed $leader to $marc_leader to be consistent with other vars
 

1 package WebPAC::Normalize;
2 use Exporter 'import';
3 @EXPORT = qw/
4 _set_rec _set_lookup
5 _set_load_row
6 _get_ds _clean_ds
7 _debug
8 _pack_subfields_hash
9
10 tag search display
11 marc marc_indicators marc_repeatable_subfield
12 marc_compose marc_leader
13 marc_duplicate marc_remove
14 marc_original_order
15
16 rec1 rec2 rec
17 regex prefix suffix surround
18 first lookup join_with
19 save_into_lookup
20
21 split_rec_on
22
23 get set
24 count
25 /;
26
27 use warnings;
28 use strict;
29
30 #use base qw/WebPAC::Common/;
31 use Data::Dump qw/dump/;
32 use Storable qw/dclone/;
33 use Carp qw/confess/;
34
35 # debugging warn(s)
36 my $debug = 0;
37
38
39 =head1 NAME
40
41 WebPAC::Normalize - describe normalisaton rules using sets
42
43 =head1 VERSION
44
45 Version 0.26
46
47 =cut
48
49 our $VERSION = '0.26';
50
51 =head1 SYNOPSIS
52
53 This module uses C<conf/normalize/*.pl> files to perform normalisation
54 from input records using perl functions which are specialized for set
55 processing.
56
57 Sets are implemented as arrays, and normalisation file is valid perl, which
58 means that you check it's validity before running WebPAC using
59 C<perl -c normalize.pl>.
60
61 Normalisation can generate multiple output normalized data. For now, supported output
62 types (on the left side of definition) are: C<tag>, C<display>, C<search> and
63 C<marc>.
64
65 =head1 FUNCTIONS
66
67 Functions which start with C<_> are private and used by WebPAC internally.
68 All other functions are available for use within normalisation rules.
69
70 =head2 data_structure
71
72 Return data structure
73
74 my $ds = WebPAC::Normalize::data_structure(
75 lookup => $lookup_hash,
76 row => $row,
77 rules => $normalize_pl_config,
78 marc_encoding => 'utf-8',
79 config => $config,
80 load_row_coderef => sub {
81 my ($database,$input,$mfn) = shift;
82 $store->load_row( database => $database, input => $input, id => $mfn );
83 },
84 );
85
86 Options C<row>, C<rules> and C<log> are mandatory while all
87 other are optional.
88
89 C<load_row_coderef> is closure only used when executing lookups, so they will
90 die if it's not defined.
91
92 This function will B<die> if normalizastion can't be evaled.
93
94 Since this function isn't exported you have to call it with
95 C<WebPAC::Normalize::data_structure>.
96
97 =cut
98
99 my $load_row_coderef;
100
101 sub data_structure {
102 my $arg = {@_};
103
104 die "need row argument" unless ($arg->{row});
105 die "need normalisation argument" unless ($arg->{rules});
106
107 no strict 'subs';
108 _set_lookup( $arg->{lookup} ) if defined($arg->{lookup});
109 _set_rec( $arg->{row} );
110 _set_config( $arg->{config} ) if defined($arg->{config});
111 _clean_ds( %{ $arg } );
112 $load_row_coderef = $arg->{load_row_coderef};
113
114 eval "$arg->{rules}";
115 die "error evaling $arg->{rules}: $@\n" if ($@);
116
117 return _get_ds();
118 }
119
120 =head2 _set_rec
121
122 Set current record hash
123
124 _set_rec( $rec );
125
126 =cut
127
128 my $rec;
129
130 sub _set_rec {
131 $rec = shift or die "no record hash";
132 }
133
134 =head2 _set_config
135
136 Set current config hash
137
138 _set_config( $config );
139
140 Magic keys are:
141
142 =over 4
143
144 =item _
145
146 Code of current database
147
148 =item _mfn
149
150 Current MFN
151
152 =back
153
154 =cut
155
156 my $config;
157
158 sub _set_config {
159 $config = shift;
160 }
161
162 =head2 _get_ds
163
164 Return hash formatted as data structure
165
166 my $ds = _get_ds();
167
168 =cut
169
170 my ($out, $marc_record, $marc_encoding, $marc_repeatable_subfield, $marc_indicators, $marc_leader);
171 my ($marc_record_offset, $marc_fetch_offset) = (0, 0);
172
173 sub _get_ds {
174 return $out;
175 }
176
177 =head2 _clean_ds
178
179 Clean data structure hash for next record
180
181 _clean_ds();
182
183 =cut
184
185 sub _clean_ds {
186 my $a = {@_};
187 ($out,$marc_record, $marc_encoding, $marc_repeatable_subfield, $marc_indicators, $marc_leader) = ();
188 ($marc_record_offset, $marc_fetch_offset) = (0,0);
189 $marc_encoding = $a->{marc_encoding};
190 }
191
192 =head2 _set_lookup
193
194 Set current lookup hash
195
196 _set_lookup( $lookup );
197
198 =cut
199
200 my $lookup;
201
202 sub _set_lookup {
203 $lookup = shift;
204 }
205
206 =head2 _get_lookup
207
208 Get current lookup hash
209
210 my $lookup = _get_lookup();
211
212 =cut
213
214 sub _get_lookup {
215 return $lookup;
216 }
217
218 =head2 _set_load_row
219
220 Setup code reference which will return L<data_structure> from
221 L<WebPAC::Store>
222
223 _set_load_row(sub {
224 my ($database,$input,$mfn) = @_;
225 $store->load_row( database => $database, input => $input, id => $mfn );
226 });
227
228 =cut
229
230 sub _set_load_row {
231 my $coderef = shift;
232 confess "argument isn't CODE" unless ref($coderef) eq 'CODE';
233
234 $load_row_coderef = $coderef;
235 }
236
237 =head2 _get_marc_fields
238
239 Get all fields defined by calls to C<marc>
240
241 $marc->add_fields( WebPAC::Normalize:_get_marc_fields() );
242
243 We are using I<magic> which detect repeatable fields only from
244 sequence of field/subfield data generated by normalization.
245
246 Repeatable field is created when there is second occurence of same subfield or
247 if any of indicators are different.
248
249 This is sane for most cases. Something like:
250
251 900a-1 900b-1 900c-1
252 900a-2 900b-2
253 900a-3
254
255 will be created from any combination of:
256
257 900a-1 900a-2 900a-3 900b-1 900b-2 900c-1
258
259 and following rules:
260
261 marc('900','a', rec('200','a') );
262 marc('900','b', rec('200','b') );
263 marc('900','c', rec('200','c') );
264
265 which might not be what you have in mind. If you need repeatable subfield,
266 define it using C<marc_repeatable_subfield> like this:
267
268 marc_repeatable_subfield('900','a');
269 marc('900','a', rec('200','a') );
270 marc('900','b', rec('200','b') );
271 marc('900','c', rec('200','c') );
272
273 will create:
274
275 900a-1 900a-2 900a-3 900b-1 900c-1
276 900b-2
277
278 There is also support for returning next or specific using:
279
280 while (my $mf = WebPAC::Normalize:_get_marc_fields( fetch_next => 1 ) ) {
281 # do something with $mf
282 }
283
284 will always return fields from next MARC record or
285
286 my $mf = WebPAC::Normalize::_get_marc_fields( offset => 42 );
287
288 will return 42th copy record (if it exists).
289
290 =cut
291
292 sub _get_marc_fields {
293
294 my $arg = {@_};
295 warn "### _get_marc_fields arg: ", dump($arg), $/ if ($debug > 2);
296 my $offset = $marc_fetch_offset;
297 if ($arg->{offset}) {
298 $offset = $arg->{offset};
299 } elsif($arg->{fetch_next}) {
300 $marc_fetch_offset++;
301 }
302
303 return if (! $marc_record || ref($marc_record) ne 'ARRAY');
304
305 warn "### full marc_record = ", dump( @{ $marc_record }), $/ if ($debug > 2);
306
307 my $marc_rec = $marc_record->[ $offset ];
308
309 warn "## _get_marc_fields (at offset: $offset) -- marc_record = ", dump( @$marc_rec ), $/ if ($debug > 1);
310
311 return if (! $marc_rec || ref($marc_rec) ne 'ARRAY' || $#{ $marc_rec } < 0);
312
313 # first, sort all existing fields
314 # XXX might not be needed, but modern perl might randomize elements in hash
315 my @sorted_marc_record = sort {
316 $a->[0] . ( $a->[3] || '' ) cmp $b->[0] . ( $b->[3] || '')
317 } @{ $marc_rec };
318
319 @sorted_marc_record = @{ $marc_rec }; ### FIXME disable sorting
320
321 # output marc fields
322 my @m;
323
324 # count unique field-subfields (used for offset when walking to next subfield)
325 my $u;
326 map { $u->{ $_->[0] . ( $_->[3] || '') }++ } @sorted_marc_record;
327
328 if ($debug) {
329 warn "## marc_repeatable_subfield = ", dump( $marc_repeatable_subfield ), $/ if ( $marc_repeatable_subfield );
330 warn "## marc_record[$offset] = ", dump( $marc_rec ), $/;
331 warn "## sorted_marc_record = ", dump( \@sorted_marc_record ), $/;
332 warn "## subfield count = ", dump( $u ), $/;
333 }
334
335 my $len = $#sorted_marc_record;
336 my $visited;
337 my $i = 0;
338 my $field;
339
340 foreach ( 0 .. $len ) {
341
342 # find next element which isn't visited
343 while ($visited->{$i}) {
344 $i = ($i + 1) % ($len + 1);
345 }
346
347 # mark it visited
348 $visited->{$i}++;
349
350 my $row = dclone( $sorted_marc_record[$i] );
351
352 # field and subfield which is key for
353 # marc_repeatable_subfield and u
354 my $fsf = $row->[0] . ( $row->[3] || '' );
355
356 if ($debug > 1) {
357
358 print "### field so far [", $#$field, "] : ", dump( $field ), " ", $field ? 'T' : 'F', $/;
359 print "### this [$i]: ", dump( $row ),$/;
360 print "### sf: ", $row->[3], " vs ", $field->[3],
361 $marc_repeatable_subfield->{ $row->[0] . $row->[3] } ? ' (repeatable)' : '', $/,
362 if ($#$field >= 0);
363
364 }
365
366 # if field exists
367 if ( $#$field >= 0 ) {
368 if (
369 $row->[0] ne $field->[0] || # field
370 $row->[1] ne $field->[1] || # i1
371 $row->[2] ne $field->[2] # i2
372 ) {
373 push @m, $field;
374 warn "## saved/1 ", dump( $field ),$/ if ($debug);
375 $field = $row;
376
377 } elsif (
378 ( $row->[3] lt $field->[-2] ) # subfield which is not next (e.g. a after c)
379 ||
380 ( $row->[3] eq $field->[-2] && # same subfield, but not repeatable
381 ! $marc_repeatable_subfield->{ $fsf }
382 )
383 ) {
384 push @m, $field;
385 warn "## saved/2 ", dump( $field ),$/ if ($debug);
386 $field = $row;
387
388 } else {
389 # append new subfields to existing field
390 push @$field, ( $row->[3], $row->[4] );
391 }
392 } else {
393 # insert first field
394 $field = $row;
395 }
396
397 if (! $marc_repeatable_subfield->{ $fsf }) {
398 # make step to next subfield
399 $i = ($i + $u->{ $fsf } ) % ($len + 1);
400 }
401 }
402
403 if ($#$field >= 0) {
404 push @m, $field;
405 warn "## saved/3 ", dump( $field ),$/ if ($debug);
406 }
407
408 return \@m;
409 }
410
411 =head2 _debug
412
413 Change level of debug warnings
414
415 _debug( 2 );
416
417 =cut
418
419 sub _debug {
420 my $l = shift;
421 return $debug unless defined($l);
422 warn "debug level $l",$/ if ($l > 0);
423 $debug = $l;
424 }
425
426 =head1 Functions to create C<data_structure>
427
428 Those functions generally have to first in your normalization file.
429
430 =head2 tag
431
432 Define new tag for I<search> and I<display>.
433
434 tag('Title', rec('200','a') );
435
436
437 =cut
438
439 sub tag {
440 my $name = shift or die "tag needs name as first argument";
441 my @o = grep { defined($_) && $_ ne '' } @_;
442 return unless (@o);
443 $out->{$name}->{tag} = $name;
444 $out->{$name}->{search} = \@o;
445 $out->{$name}->{display} = \@o;
446 }
447
448 =head2 display
449
450 Define tag just for I<display>
451
452 @v = display('Title', rec('200','a') );
453
454 =cut
455
456 sub display {
457 my $name = shift or die "display needs name as first argument";
458 my @o = grep { defined($_) && $_ ne '' } @_;
459 return unless (@o);
460 $out->{$name}->{tag} = $name;
461 $out->{$name}->{display} = \@o;
462 }
463
464 =head2 search
465
466 Prepare values just for I<search>
467
468 @v = search('Title', rec('200','a') );
469
470 =cut
471
472 sub search {
473 my $name = shift or die "search needs name as first argument";
474 my @o = grep { defined($_) && $_ ne '' } @_;
475 return unless (@o);
476 $out->{$name}->{tag} = $name;
477 $out->{$name}->{search} = \@o;
478 }
479
480 =head2 marc_leader
481
482 Setup fields within MARC leader or get leader
483
484 marc_leader('05','c');
485 my $leader = marc_leader();
486
487 =cut
488
489 sub marc_leader {
490 my ($offset,$value) = @_;
491
492 if ($offset) {
493 $marc_leader->{ $offset } = $value;
494 } else {
495 return $marc_leader;
496 }
497 }
498
499 =head2 marc
500
501 Save value for MARC field
502
503 marc('900','a', rec('200','a') );
504 marc('001', rec('000') );
505
506 =cut
507
508 sub marc {
509 my $f = shift or die "marc needs field";
510 die "marc field must be numer" unless ($f =~ /^\d+$/);
511
512 my $sf;
513 if ($f >= 10) {
514 $sf = shift or die "marc needs subfield";
515 }
516
517 foreach (@_) {
518 my $v = $_; # make var read-write for Encode
519 next unless (defined($v) && $v !~ /^\s*$/);
520 my ($i1,$i2) = defined($marc_indicators->{$f}) ? @{ $marc_indicators->{$f} } : (' ',' ');
521 if (defined $sf) {
522 push @{ $marc_record->[ $marc_record_offset ] }, [ $f, $i1, $i2, $sf => $v ];
523 } else {
524 push @{ $marc_record->[ $marc_record_offset ] }, [ $f, $v ];
525 }
526 }
527 }
528
529 =head2 marc_repeatable_subfield
530
531 Save values for MARC repetable subfield
532
533 marc_repeatable_subfield('910', 'z', rec('909') );
534
535 =cut
536
537 sub marc_repeatable_subfield {
538 my ($f,$sf) = @_;
539 die "marc_repeatable_subfield need field and subfield!\n" unless ($f && $sf);
540 $marc_repeatable_subfield->{ $f . $sf }++;
541 marc(@_);
542 }
543
544 =head2 marc_indicators
545
546 Set both indicators for MARC field
547
548 marc_indicators('900', ' ', 1);
549
550 Any indicator value other than C<0-9> will be treated as undefined.
551
552 =cut
553
554 sub marc_indicators {
555 my $f = shift || die "marc_indicators need field!\n";
556 my ($i1,$i2) = @_;
557 die "marc_indicators($f, ...) need i1!\n" unless(defined($i1));
558 die "marc_indicators($f, $i1, ...) need i2!\n" unless(defined($i2));
559
560 $i1 = ' ' if ($i1 !~ /^\d$/);
561 $i2 = ' ' if ($i2 !~ /^\d$/);
562 @{ $marc_indicators->{$f} } = ($i1,$i2);
563 }
564
565 =head2 marc_compose
566
567 Save values for each MARC subfield explicitly
568
569 marc_compose('900',
570 'a', rec('200','a')
571 'b', rec('201','a')
572 'a', rec('200','b')
573 'c', rec('200','c')
574 );
575
576 If you specify C<+> for subfield, value will be appended
577 to previous defined subfield.
578
579 =cut
580
581 sub marc_compose {
582 my $f = shift or die "marc_compose needs field";
583 die "marc_compose field must be numer" unless ($f =~ /^\d+$/);
584
585 my ($i1,$i2) = defined($marc_indicators->{$f}) ? @{ $marc_indicators->{$f} } : (' ',' ');
586 my $m = [ $f, $i1, $i2 ];
587
588 warn "### marc_compose input subfields = ", dump(@_),$/ if ($debug > 2);
589
590 if ($#_ % 2 != 1) {
591 die "ERROR: marc_compose",dump($f,@_)," not valid (must be even).\nDo you need to add first() or join() around some argument?\n";
592 }
593
594 while (@_) {
595 my $sf = shift;
596 my $v = shift;
597
598 next unless (defined($v) && $v !~ /^\s*$/);
599 warn "## ++ marc_compose($f,$sf,$v) ", dump( $m ),$/ if ($debug > 1);
600 if ($sf ne '+') {
601 push @$m, ( $sf, $v );
602 } else {
603 $m->[ $#$m ] .= $v;
604 }
605 }
606
607 warn "## marc_compose current marc = ", dump( $m ),$/ if ($debug > 1);
608
609 push @{ $marc_record->[ $marc_record_offset ] }, $m if ($#{$m} > 2);
610 }
611
612 =head2 marc_duplicate
613
614 Generate copy of current MARC record and continue working on copy
615
616 marc_duplicate();
617
618 Copies can be accessed using C<< _get_marc_fields( fetch_next => 1 ) >> or
619 C<< _get_marc_fields( offset => 42 ) >>.
620
621 =cut
622
623 sub marc_duplicate {
624 my $m = $marc_record->[ -1 ];
625 die "can't duplicate record which isn't defined" unless ($m);
626 push @{ $marc_record }, dclone( $m );
627 warn "## marc_duplicate = ", dump(@$marc_record), $/ if ($debug > 1);
628 $marc_record_offset = $#{ $marc_record };
629 warn "## marc_record_offset = $marc_record_offset", $/ if ($debug > 1);
630 }
631
632 =head2 marc_remove
633
634 Remove some field or subfield from MARC record.
635
636 marc_remove('200');
637 marc_remove('200','a');
638
639 This will erase field C<200> or C<200^a> from current MARC record.
640
641 marc_remove('*');
642
643 Will remove all fields in current MARC record.
644
645 This is useful after calling C<marc_duplicate> or on it's own (but, you
646 should probably just remove that subfield definition if you are not
647 using C<marc_duplicate>).
648
649 FIXME: support fields < 10.
650
651 =cut
652
653 sub marc_remove {
654 my ($f, $sf) = @_;
655
656 die "marc_remove needs record number" unless defined($f);
657
658 my $marc = $marc_record->[ $marc_record_offset ];
659
660 warn "### marc_remove before = ", dump( $marc ), $/ if ($debug > 2);
661
662 if ($f eq '*') {
663
664 delete( $marc_record->[ $marc_record_offset ] );
665 warn "## full marc_record = ", dump( @{ $marc_record }), $/ if ($debug > 1);
666
667 } else {
668
669 my $i = 0;
670 foreach ( 0 .. $#{ $marc } ) {
671 last unless (defined $marc->[$i]);
672 warn "#### working on ",dump( @{ $marc->[$i] }), $/ if ($debug > 3);
673 if ($marc->[$i]->[0] eq $f) {
674 if (! defined $sf) {
675 # remove whole field
676 splice @$marc, $i, 1;
677 warn "#### slice \@\$marc, $i, 1 = ",dump( @{ $marc }), $/ if ($debug > 3);
678 $i--;
679 } else {
680 foreach my $j ( 0 .. (( $#{ $marc->[$i] } - 3 ) / 2) ) {
681 my $o = ($j * 2) + 3;
682 if ($marc->[$i]->[$o] eq $sf) {
683 # remove subfield
684 splice @{$marc->[$i]}, $o, 2;
685 warn "#### slice \@{\$marc->[$i]}, $o, 2 = ", dump( @{ $marc }), $/ if ($debug > 3);
686 # is record now empty?
687 if ($#{ $marc->[$i] } == 2) {
688 splice @$marc, $i, 1;
689 warn "#### slice \@\$marc, $i, 1 = ", dump( @{ $marc }), $/ if ($debug > 3);
690 $i--;
691 };
692 }
693 }
694 }
695 }
696 $i++;
697 }
698
699 warn "### marc_remove($f", $sf ? ",$sf" : "", ") after = ", dump( $marc ), $/ if ($debug > 2);
700
701 $marc_record->[ $marc_record_offset ] = $marc;
702 }
703
704 warn "## full marc_record = ", dump( @{ $marc_record }), $/ if ($debug > 1);
705 }
706
707 =head2 marc_original_order
708
709 Copy all subfields preserving original order to marc field.
710
711 marc_original_order( marc_field_number, original_input_field_number );
712
713 Please note that field numbers are consistent with other commands (marc
714 field number first), but somewhat counter-intuitive (destination and then
715 source).
716
717 You might want to use this command if you are just renaming subfields or
718 using pre-processing modify_record in C<config.yml> and don't need any
719 post-processing or want to preserve order of original subfields.
720
721
722 =cut
723
724 sub marc_original_order {
725
726 my ($to, $from) = @_;
727 die "marc_original_order needs from and to fields\n" unless ($from && $to);
728
729 return unless defined($rec->{$from});
730
731 my $r = $rec->{$from};
732 die "record field $from isn't array\n" unless (ref($r) eq 'ARRAY');
733
734 my ($i1,$i2) = defined($marc_indicators->{$to}) ? @{ $marc_indicators->{$to} } : (' ',' ');
735 warn "## marc_original_order($to,$from) source = ", dump( $r ),$/ if ($debug > 1);
736
737 foreach my $d (@$r) {
738
739 if (! defined($d->{subfields}) && ref($d->{subfields}) ne 'ARRAY') {
740 warn "# marc_original_order($to,$from): field $from doesn't have subfields specification\n";
741 next;
742 }
743
744 my @sfs = @{ $d->{subfields} };
745
746 die "field $from doesn't have even number of subfields specifications\n" unless($#sfs % 2 == 1);
747
748 warn "#--> d: ",dump($d), "\n#--> sfs: ",dump(@sfs),$/ if ($debug > 2);
749
750 my $m = [ $to, $i1, $i2 ];
751
752 while (my $sf = shift @sfs) {
753
754 warn "#--> sf: ",dump($sf), $/ if ($debug > 2);
755 my $offset = shift @sfs;
756 die "corrupted sufields specification for field $from\n" unless defined($offset);
757
758 my $v;
759 if (ref($d->{$sf}) eq 'ARRAY') {
760 $v = $d->{$sf}->[$offset] if (defined($d->{$sf}->[$offset]));
761 } elsif ($offset == 0) {
762 $v = $d->{$sf};
763 } else {
764 die "field $from subfield '$sf' need occurence $offset which doesn't exist", dump($d->{$sf});
765 }
766 push @$m, ( $sf, $v ) if (defined($v));
767 }
768
769 if ($#{$m} > 2) {
770 push @{ $marc_record->[ $marc_record_offset ] }, $m;
771 }
772 }
773
774 warn "## marc_record = ", dump( $marc_record ),$/ if ($debug > 1);
775 }
776
777
778 =head1 Functions to extract data from input
779
780 This function should be used inside functions to create C<data_structure> described
781 above.
782
783 =head2 _pack_subfields_hash
784
785 @subfields = _pack_subfields_hash( $h );
786 $subfields = _pack_subfields_hash( $h, 1 );
787
788 Return each subfield value in array or pack them all together and return scalar
789 with subfields (denoted by C<^>) and values.
790
791 =cut
792
793 sub _pack_subfields_hash {
794
795 warn "## _pack_subfields_hash( ",dump(@_), " )\n" if ($debug > 1);
796
797 my ($h,$include_subfields) = @_;
798
799 if ( defined($h->{subfields}) ) {
800 my $sfs = delete $h->{subfields} || die "no subfields?";
801 my @out;
802 while (@$sfs) {
803 my $sf = shift @$sfs;
804 push @out, '^' . $sf if ($include_subfields);
805 my $o = shift @$sfs;
806 if ($o == 0 && ref( $h->{$sf} ) ne 'ARRAY' ) {
807 # single element subfields are not arrays
808 #warn "====> $sf $o / $#$sfs ", dump( $sfs, $h->{$sf} ), "\n";
809
810 push @out, $h->{$sf};
811 } else {
812 #warn "====> $sf $o / $#$sfs ", dump( $sfs, $h->{$sf} ), "\n";
813 push @out, $h->{$sf}->[$o];
814 }
815 }
816 if ($include_subfields) {
817 return join('', @out);
818 } else {
819 return @out;
820 }
821 } else {
822 if ($include_subfields) {
823 my $out = '';
824 foreach my $sf (sort keys %$h) {
825 if (ref($h->{$sf}) eq 'ARRAY') {
826 $out .= '^' . $sf . join('^' . $sf, @{ $h->{$sf} });
827 } else {
828 $out .= '^' . $sf . $h->{$sf};
829 }
830 }
831 return $out;
832 } else {
833 # FIXME this should probably be in alphabetical order instead of hash order
834 values %{$h};
835 }
836 }
837 }
838
839 =head2 rec1
840
841 Return all values in some field
842
843 @v = rec1('200')
844
845 TODO: order of values is probably same as in source data, need to investigate that
846
847 =cut
848
849 sub rec1 {
850 my $f = shift;
851 warn "rec1($f) = ", dump( $rec->{$f} ), $/ if ($debug > 1);
852 return unless (defined($rec) && defined($rec->{$f}));
853 warn "rec1($f) = ", dump( $rec->{$f} ), $/ if ($debug > 1);
854 if (ref($rec->{$f}) eq 'ARRAY') {
855 my @out;
856 foreach my $h ( @{ $rec->{$f} } ) {
857 if (ref($h) eq 'HASH') {
858 push @out, ( _pack_subfields_hash( $h ) );
859 } else {
860 push @out, $h;
861 }
862 }
863 return @out;
864 } elsif( defined($rec->{$f}) ) {
865 return $rec->{$f};
866 }
867 }
868
869 =head2 rec2
870
871 Return all values in specific field and subfield
872
873 @v = rec2('200','a')
874
875 =cut
876
877 sub rec2 {
878 my $f = shift;
879 return unless (defined($rec && $rec->{$f}));
880 my $sf = shift;
881 warn "rec2($f,$sf) = ", dump( $rec->{$f} ), $/ if ($debug > 1);
882 return map {
883 if (ref($_->{$sf}) eq 'ARRAY') {
884 @{ $_->{$sf} };
885 } else {
886 $_->{$sf};
887 }
888 } grep { ref($_) eq 'HASH' && $_->{$sf} } @{ $rec->{$f} };
889 }
890
891 =head2 rec
892
893 syntaxtic sugar for
894
895 @v = rec('200')
896 @v = rec('200','a')
897
898 If rec() returns just single value, it will
899 return scalar, not array.
900
901 =cut
902
903 sub rec {
904 my @out;
905 if ($#_ == 0) {
906 @out = rec1(@_);
907 } elsif ($#_ == 1) {
908 @out = rec2(@_);
909 }
910 if ($#out == 0 && ! wantarray) {
911 return $out[0];
912 } elsif (@out) {
913 return @out;
914 } else {
915 return '';
916 }
917 }
918
919 =head2 regex
920
921 Apply regex to some or all values
922
923 @v = regex( 's/foo/bar/g', @v );
924
925 =cut
926
927 sub regex {
928 my $r = shift;
929 my @out;
930 #warn "r: $r\n", dump(\@_);
931 foreach my $t (@_) {
932 next unless ($t);
933 eval "\$t =~ $r";
934 push @out, $t if ($t && $t ne '');
935 }
936 return @out;
937 }
938
939 =head2 prefix
940
941 Prefix all values with a string
942
943 @v = prefix( 'my_', @v );
944
945 =cut
946
947 sub prefix {
948 my $p = shift or return;
949 return map { $p . $_ } grep { defined($_) } @_;
950 }
951
952 =head2 suffix
953
954 suffix all values with a string
955
956 @v = suffix( '_my', @v );
957
958 =cut
959
960 sub suffix {
961 my $s = shift or die "suffix needs string as first argument";
962 return map { $_ . $s } grep { defined($_) } @_;
963 }
964
965 =head2 surround
966
967 surround all values with a two strings
968
969 @v = surround( 'prefix_', '_suffix', @v );
970
971 =cut
972
973 sub surround {
974 my $p = shift or die "surround need prefix as first argument";
975 my $s = shift or die "surround needs suffix as second argument";
976 return map { $p . $_ . $s } grep { defined($_) } @_;
977 }
978
979 =head2 first
980
981 Return first element
982
983 $v = first( @v );
984
985 =cut
986
987 sub first {
988 my $r = shift;
989 return $r;
990 }
991
992 =head2 lookup
993
994 Consult lookup hashes for some value
995
996 @v = lookup(
997 sub {
998 'ffkk/peri/mfn'.rec('000')
999 },
1000 'ffkk','peri','200-a-200-e',
1001 sub {
1002 first(rec(200,'a')).' '.first(rec('200','e'))
1003 }
1004 );
1005
1006 Code like above will be B<automatically generated> using L<WebPAC::Parse> from
1007 normal lookup definition in C<conf/lookup/something.pl> which looks like:
1008
1009 lookup(
1010 # which results to return from record recorded in lookup
1011 sub { 'ffkk/peri/mfn' . rec('000') },
1012 # from which database and input
1013 'ffkk','peri',
1014 # such that following values match
1015 sub { first(rec(200,'a')) . ' ' . first(rec('200','e')) },
1016 # if this part is missing, we will try to match same fields
1017 # from lookup record and current one, or you can override
1018 # which records to use from current record using
1019 sub { rec('900','x') . ' ' . rec('900','y') },
1020 )
1021
1022 You can think about this lookup as SQL (if that helps):
1023
1024 select
1025 sub { what }
1026 from
1027 database, input
1028 where
1029 sub { filter from lookuped record }
1030 having
1031 sub { optional filter on current record }
1032
1033 Easy as pie, right?
1034
1035 =cut
1036
1037 sub lookup {
1038 my ($what, $database, $input, $key, $having) = @_;
1039
1040 confess "lookup needs 5 arguments: what, database, input, key, having\n" unless ($#_ == 4);
1041
1042 warn "## lookup ($database, $input, $key)", $/ if ($debug > 1);
1043 return unless (defined($lookup->{$database}->{$input}->{$key}));
1044
1045 confess "lookup really need load_row_coderef added to data_structure\n" unless ($load_row_coderef);
1046
1047 my $mfns;
1048 my @having = $having->();
1049
1050 warn "## having = ", dump( @having ) if ($debug > 2);
1051
1052 foreach my $h ( @having ) {
1053 if (defined($lookup->{$database}->{$input}->{$key}->{$h})) {
1054 warn "lookup for $database/$input/$key/$h return ",dump($lookup->{$database}->{$input}->{$key}->{$h}),"\n" if ($debug);
1055 $mfns->{$_}++ foreach keys %{ $lookup->{$database}->{$input}->{$key}->{$h} };
1056 }
1057 }
1058
1059 return unless ($mfns);
1060
1061 my @mfns = sort keys %$mfns;
1062
1063 warn "# lookup loading $database/$input/$key mfn ", join(",",@mfns)," having ",dump(@having),"\n" if ($debug);
1064
1065 my $old_rec = $rec;
1066 my @out;
1067
1068 foreach my $mfn (@mfns) {
1069 $rec = $load_row_coderef->( $database, $input, $mfn );
1070
1071 warn "got $database/$input/$mfn = ", dump($rec), $/ if ($debug);
1072
1073 my @vals = $what->();
1074
1075 push @out, ( @vals );
1076
1077 warn "lookup for mfn $mfn returned ", dump(@vals), $/ if ($debug);
1078 }
1079
1080 # if (ref($lookup->{$k}) eq 'ARRAY') {
1081 # return @{ $lookup->{$k} };
1082 # } else {
1083 # return $lookup->{$k};
1084 # }
1085
1086 $rec = $old_rec;
1087
1088 warn "## lookup returns = ", dump(@out), $/ if ($debug);
1089
1090 if ($#out == 0) {
1091 return $out[0];
1092 } else {
1093 return @out;
1094 }
1095 }
1096
1097 =head2 save_into_lookup
1098
1099 Save value into lookup. It associates current database, input
1100 and specific keys with one or more values which will be
1101 associated over MFN.
1102
1103 MFN will be extracted from first occurence current of field 000
1104 in current record, or if it doesn't exist from L<_set_config> C<_mfn>.
1105
1106 my $nr = save_into_lookup($database,$input,$key,sub {
1107 # code which produce one or more values
1108 });
1109
1110 It returns number of items saved.
1111
1112 This function shouldn't be called directly, it's called from code created by
1113 L<WebPAC::Parser>.
1114
1115 =cut
1116
1117 sub save_into_lookup {
1118 my ($database,$input,$key,$coderef) = @_;
1119 die "save_into_lookup needs database" unless defined($database);
1120 die "save_into_lookup needs input" unless defined($input);
1121 die "save_into_lookup needs key" unless defined($key);
1122 die "save_into_lookup needs CODE" unless ( defined($coderef) && ref($coderef) eq 'CODE' );
1123
1124 warn "## save_into_lookup rec = ", dump($rec), " config = ", dump($config), $/ if ($debug > 2);
1125
1126 my $mfn =
1127 defined($rec->{'000'}->[0]) ? $rec->{'000'}->[0] :
1128 defined($config->{_mfn}) ? $config->{_mfn} :
1129 die "mfn not defined or zero";
1130
1131 my $nr = 0;
1132
1133 foreach my $v ( $coderef->() ) {
1134 $lookup->{$database}->{$input}->{$key}->{$v}->{$mfn}++;
1135 warn "# saved lookup $database/$input/$key [$v] $mfn\n" if ($debug > 1);
1136 $nr++;
1137 }
1138
1139 return $nr;
1140 }
1141
1142 =head2 config
1143
1144 Consult config values stored in C<config.yml>
1145
1146 # return database code (key under databases in yaml)
1147 $database_code = config(); # use _ from hash
1148 $database_name = config('name');
1149 $database_input_name = config('input name');
1150 $tag = config('input normalize tag');
1151
1152 Up to three levels are supported.
1153
1154 =cut
1155
1156 sub config {
1157 return unless ($config);
1158
1159 my $p = shift;
1160
1161 $p ||= '';
1162
1163 my $v;
1164
1165 warn "### getting config($p)\n" if ($debug > 1);
1166
1167 my @p = split(/\s+/,$p);
1168 if ($#p < 0) {
1169 $v = $config->{ '_' }; # special, database code
1170 } else {
1171
1172 my $c = dclone( $config );
1173
1174 foreach my $k (@p) {
1175 warn "### k: $k c = ",dump($c),$/ if ($debug > 1);
1176 if (ref($c) eq 'ARRAY') {
1177 $c = shift @$c;
1178 warn "config($p) taking first occurence of '$k', probably not what you wanted!\n";
1179 last;
1180 }
1181
1182 if (! defined($c->{$k}) ) {
1183 $c = undef;
1184 last;
1185 } else {
1186 $c = $c->{$k};
1187 }
1188 }
1189 $v = $c if ($c);
1190
1191 }
1192
1193 warn "## config( '$p' ) = ",dump( $v ),$/ if ($v && $debug);
1194 warn "config( '$p' ) is empty\n" if (! $v);
1195
1196 return $v;
1197 }
1198
1199 =head2 id
1200
1201 Returns unique id of this record
1202
1203 $id = id();
1204
1205 Returns C<42/2> for 2nd occurence of MFN 42.
1206
1207 =cut
1208
1209 sub id {
1210 my $mfn = $config->{_mfn} || die "no _mfn in config data";
1211 return $mfn . $#{$marc_record} ? $#{$marc_record} + 1 : '';
1212 }
1213
1214 =head2 join_with
1215
1216 Joins walues with some delimiter
1217
1218 $v = join_with(", ", @v);
1219
1220 =cut
1221
1222 sub join_with {
1223 my $d = shift;
1224 warn "### join_with('$d',",dump(@_),")\n" if ($debug > 2);
1225 my $v = join($d, grep { defined($_) && $_ ne '' } @_);
1226 return '' unless defined($v);
1227 return $v;
1228 }
1229
1230 =head2 split_rec_on
1231
1232 Split record subfield on some regex and take one of parts out
1233
1234 $a_before_semi_column =
1235 split_rec_on('200','a', /\s*;\s*/, $part);
1236
1237 C<$part> is optional number of element. First element is
1238 B<1>, not 0!
1239
1240 If there is no C<$part> parameter or C<$part> is 0, this function will
1241 return all values produced by splitting.
1242
1243 =cut
1244
1245 sub split_rec_on {
1246 die "split_rec_on need (fld,sf,regex[,part]" if ($#_ < 2);
1247
1248 my ($fld, $sf, $regex, $part) = @_;
1249 warn "### regex ", ref($regex), $regex, $/ if ($debug > 2);
1250
1251 my @r = rec( $fld, $sf );
1252 my $v = shift @r;
1253 warn "### first rec($fld,$sf) = ",dump($v),$/ if ($debug > 2);
1254
1255 return '' if ( ! defined($v) || $v =~ /^\s*$/);
1256
1257 my @s = split( $regex, $v );
1258 warn "## split_rec_on($fld,$sf,$regex,$part) = ",dump(@s),$/ if ($debug > 1);
1259 if ($part && $part > 0) {
1260 return $s[ $part - 1 ];
1261 } else {
1262 return @s;
1263 }
1264 }
1265
1266 my $hash;
1267
1268 =head2 set
1269
1270 set( key => 'value' );
1271
1272 =cut
1273
1274 sub set {
1275 my ($k,$v) = @_;
1276 warn "## set ( $k => ", dump($v), " )", $/ if ( $debug );
1277 $hash->{$k} = $v;
1278 };
1279
1280 =head2 get
1281
1282 get( 'key' );
1283
1284 =cut
1285
1286 sub get {
1287 my $k = shift || return;
1288 my $v = $hash->{$k};
1289 warn "## get $k = ", dump( $v ), $/ if ( $debug );
1290 return $v;
1291 }
1292
1293 =head2 count
1294
1295 if ( count( @result ) == 1 ) {
1296 # do something if only 1 result is there
1297 }
1298
1299 =cut
1300
1301 sub count {
1302 warn "## count ",dump(@_),$/ if ( $debug );
1303 return @_ . '';
1304 }
1305
1306 # END
1307 1;

  ViewVC Help
Powered by ViewVC 1.1.26