/[webpac2]/trunk/lib/WebPAC/Normalize.pm
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Contents of /trunk/lib/WebPAC/Normalize.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 813 - (show annotations)
Sun Apr 1 21:47:47 2007 UTC (17 years ago) by dpavlin
File size: 29293 byte(s)
 r1184@llin:  dpavlin | 2007-04-01 23:47:04 +0200
  r1182@llin:  dpavlin | 2007-04-01 23:44:03 +0200
  marc_duplicate now correctly duplicates leader data, new _get_marc_leader to
  return leader corresponding to last _get_marc_fields call, added marc_count
  which returns number of created MARC records
 

1 package WebPAC::Normalize;
2 use Exporter 'import';
3 @EXPORT = qw/
4 _set_rec _set_lookup
5 _set_load_row
6 _get_ds _clean_ds
7 _debug
8 _pack_subfields_hash
9
10 tag search display
11 marc marc_indicators marc_repeatable_subfield
12 marc_compose marc_leader
13 marc_duplicate marc_remove marc_count
14 marc_original_order
15
16 rec1 rec2 rec
17 regex prefix suffix surround
18 first lookup join_with
19 save_into_lookup
20
21 split_rec_on
22
23 get set
24 count
25 /;
26
27 use warnings;
28 use strict;
29
30 #use base qw/WebPAC::Common/;
31 use Data::Dump qw/dump/;
32 use Storable qw/dclone/;
33 use Carp qw/confess/;
34
35 # debugging warn(s)
36 my $debug = 0;
37
38
39 =head1 NAME
40
41 WebPAC::Normalize - describe normalisaton rules using sets
42
43 =head1 VERSION
44
45 Version 0.27
46
47 =cut
48
49 our $VERSION = '0.27';
50
51 =head1 SYNOPSIS
52
53 This module uses C<conf/normalize/*.pl> files to perform normalisation
54 from input records using perl functions which are specialized for set
55 processing.
56
57 Sets are implemented as arrays, and normalisation file is valid perl, which
58 means that you check it's validity before running WebPAC using
59 C<perl -c normalize.pl>.
60
61 Normalisation can generate multiple output normalized data. For now, supported output
62 types (on the left side of definition) are: C<tag>, C<display>, C<search> and
63 C<marc>.
64
65 =head1 FUNCTIONS
66
67 Functions which start with C<_> are private and used by WebPAC internally.
68 All other functions are available for use within normalisation rules.
69
70 =head2 data_structure
71
72 Return data structure
73
74 my $ds = WebPAC::Normalize::data_structure(
75 lookup => $lookup_hash,
76 row => $row,
77 rules => $normalize_pl_config,
78 marc_encoding => 'utf-8',
79 config => $config,
80 load_row_coderef => sub {
81 my ($database,$input,$mfn) = shift;
82 $store->load_row( database => $database, input => $input, id => $mfn );
83 },
84 );
85
86 Options C<row>, C<rules> and C<log> are mandatory while all
87 other are optional.
88
89 C<load_row_coderef> is closure only used when executing lookups, so they will
90 die if it's not defined.
91
92 This function will B<die> if normalizastion can't be evaled.
93
94 Since this function isn't exported you have to call it with
95 C<WebPAC::Normalize::data_structure>.
96
97 =cut
98
99 my $load_row_coderef;
100
101 sub data_structure {
102 my $arg = {@_};
103
104 die "need row argument" unless ($arg->{row});
105 die "need normalisation argument" unless ($arg->{rules});
106
107 no strict 'subs';
108 _set_lookup( $arg->{lookup} ) if defined($arg->{lookup});
109 _set_rec( $arg->{row} );
110 _set_config( $arg->{config} ) if defined($arg->{config});
111 _clean_ds( %{ $arg } );
112 $load_row_coderef = $arg->{load_row_coderef};
113
114 eval "$arg->{rules}";
115 die "error evaling $arg->{rules}: $@\n" if ($@);
116
117 return _get_ds();
118 }
119
120 =head2 _set_rec
121
122 Set current record hash
123
124 _set_rec( $rec );
125
126 =cut
127
128 my $rec;
129
130 sub _set_rec {
131 $rec = shift or die "no record hash";
132 }
133
134 =head2 _set_config
135
136 Set current config hash
137
138 _set_config( $config );
139
140 Magic keys are:
141
142 =over 4
143
144 =item _
145
146 Code of current database
147
148 =item _mfn
149
150 Current MFN
151
152 =back
153
154 =cut
155
156 my $config;
157
158 sub _set_config {
159 $config = shift;
160 }
161
162 =head2 _get_ds
163
164 Return hash formatted as data structure
165
166 my $ds = _get_ds();
167
168 =cut
169
170 my ($out, $marc_record, $marc_encoding, $marc_repeatable_subfield, $marc_indicators, $marc_leader);
171 my ($marc_record_offset, $marc_fetch_offset) = (0, 0);
172
173 sub _get_ds {
174 return $out;
175 }
176
177 =head2 _clean_ds
178
179 Clean data structure hash for next record
180
181 _clean_ds();
182
183 =cut
184
185 sub _clean_ds {
186 my $a = {@_};
187 ($out,$marc_record, $marc_encoding, $marc_repeatable_subfield, $marc_indicators, $marc_leader) = ();
188 ($marc_record_offset, $marc_fetch_offset) = (0,0);
189 $marc_encoding = $a->{marc_encoding};
190 }
191
192 =head2 _set_lookup
193
194 Set current lookup hash
195
196 _set_lookup( $lookup );
197
198 =cut
199
200 my $lookup;
201
202 sub _set_lookup {
203 $lookup = shift;
204 }
205
206 =head2 _get_lookup
207
208 Get current lookup hash
209
210 my $lookup = _get_lookup();
211
212 =cut
213
214 sub _get_lookup {
215 return $lookup;
216 }
217
218 =head2 _set_load_row
219
220 Setup code reference which will return L<data_structure> from
221 L<WebPAC::Store>
222
223 _set_load_row(sub {
224 my ($database,$input,$mfn) = @_;
225 $store->load_row( database => $database, input => $input, id => $mfn );
226 });
227
228 =cut
229
230 sub _set_load_row {
231 my $coderef = shift;
232 confess "argument isn't CODE" unless ref($coderef) eq 'CODE';
233
234 $load_row_coderef = $coderef;
235 }
236
237 =head2 _get_marc_fields
238
239 Get all fields defined by calls to C<marc>
240
241 $marc->add_fields( WebPAC::Normalize:_get_marc_fields() );
242
243 We are using I<magic> which detect repeatable fields only from
244 sequence of field/subfield data generated by normalization.
245
246 Repeatable field is created when there is second occurence of same subfield or
247 if any of indicators are different.
248
249 This is sane for most cases. Something like:
250
251 900a-1 900b-1 900c-1
252 900a-2 900b-2
253 900a-3
254
255 will be created from any combination of:
256
257 900a-1 900a-2 900a-3 900b-1 900b-2 900c-1
258
259 and following rules:
260
261 marc('900','a', rec('200','a') );
262 marc('900','b', rec('200','b') );
263 marc('900','c', rec('200','c') );
264
265 which might not be what you have in mind. If you need repeatable subfield,
266 define it using C<marc_repeatable_subfield> like this:
267
268 marc_repeatable_subfield('900','a');
269 marc('900','a', rec('200','a') );
270 marc('900','b', rec('200','b') );
271 marc('900','c', rec('200','c') );
272
273 will create:
274
275 900a-1 900a-2 900a-3 900b-1 900c-1
276 900b-2
277
278 There is also support for returning next or specific using:
279
280 while (my $mf = WebPAC::Normalize:_get_marc_fields( fetch_next => 1 ) ) {
281 # do something with $mf
282 }
283
284 will always return fields from next MARC record or
285
286 my $mf = WebPAC::Normalize::_get_marc_fields( offset => 42 );
287
288 will return 42th copy record (if it exists).
289
290 =cut
291
292 my $fetch_pos;
293
294 sub _get_marc_fields {
295
296 my $arg = {@_};
297 warn "### _get_marc_fields arg: ", dump($arg), $/ if ($debug > 2);
298 $fetch_pos = $marc_fetch_offset;
299 if ($arg->{offset}) {
300 $fetch_pos = $arg->{offset};
301 } elsif($arg->{fetch_next}) {
302 $marc_fetch_offset++;
303 }
304
305 return if (! $marc_record || ref($marc_record) ne 'ARRAY');
306
307 warn "### full marc_record = ", dump( @{ $marc_record }), $/ if ($debug > 2);
308
309 my $marc_rec = $marc_record->[ $fetch_pos ];
310
311 warn "## _get_marc_fields (at offset: $fetch_pos) -- marc_record = ", dump( @$marc_rec ), $/ if ($debug > 1);
312
313 return if (! $marc_rec || ref($marc_rec) ne 'ARRAY' || $#{ $marc_rec } < 0);
314
315 # first, sort all existing fields
316 # XXX might not be needed, but modern perl might randomize elements in hash
317 my @sorted_marc_record = sort {
318 $a->[0] . ( $a->[3] || '' ) cmp $b->[0] . ( $b->[3] || '')
319 } @{ $marc_rec };
320
321 @sorted_marc_record = @{ $marc_rec }; ### FIXME disable sorting
322
323 # output marc fields
324 my @m;
325
326 # count unique field-subfields (used for offset when walking to next subfield)
327 my $u;
328 map { $u->{ $_->[0] . ( $_->[3] || '') }++ } @sorted_marc_record;
329
330 if ($debug) {
331 warn "## marc_repeatable_subfield = ", dump( $marc_repeatable_subfield ), $/ if ( $marc_repeatable_subfield );
332 warn "## marc_record[$fetch_pos] = ", dump( $marc_rec ), $/;
333 warn "## sorted_marc_record = ", dump( \@sorted_marc_record ), $/;
334 warn "## subfield count = ", dump( $u ), $/;
335 }
336
337 my $len = $#sorted_marc_record;
338 my $visited;
339 my $i = 0;
340 my $field;
341
342 foreach ( 0 .. $len ) {
343
344 # find next element which isn't visited
345 while ($visited->{$i}) {
346 $i = ($i + 1) % ($len + 1);
347 }
348
349 # mark it visited
350 $visited->{$i}++;
351
352 my $row = dclone( $sorted_marc_record[$i] );
353
354 # field and subfield which is key for
355 # marc_repeatable_subfield and u
356 my $fsf = $row->[0] . ( $row->[3] || '' );
357
358 if ($debug > 1) {
359
360 print "### field so far [", $#$field, "] : ", dump( $field ), " ", $field ? 'T' : 'F', $/;
361 print "### this [$i]: ", dump( $row ),$/;
362 print "### sf: ", $row->[3], " vs ", $field->[3],
363 $marc_repeatable_subfield->{ $row->[0] . $row->[3] } ? ' (repeatable)' : '', $/,
364 if ($#$field >= 0);
365
366 }
367
368 # if field exists
369 if ( $#$field >= 0 ) {
370 if (
371 $row->[0] ne $field->[0] || # field
372 $row->[1] ne $field->[1] || # i1
373 $row->[2] ne $field->[2] # i2
374 ) {
375 push @m, $field;
376 warn "## saved/1 ", dump( $field ),$/ if ($debug);
377 $field = $row;
378
379 } elsif (
380 ( $row->[3] lt $field->[-2] ) # subfield which is not next (e.g. a after c)
381 ||
382 ( $row->[3] eq $field->[-2] && # same subfield, but not repeatable
383 ! $marc_repeatable_subfield->{ $fsf }
384 )
385 ) {
386 push @m, $field;
387 warn "## saved/2 ", dump( $field ),$/ if ($debug);
388 $field = $row;
389
390 } else {
391 # append new subfields to existing field
392 push @$field, ( $row->[3], $row->[4] );
393 }
394 } else {
395 # insert first field
396 $field = $row;
397 }
398
399 if (! $marc_repeatable_subfield->{ $fsf }) {
400 # make step to next subfield
401 $i = ($i + $u->{ $fsf } ) % ($len + 1);
402 }
403 }
404
405 if ($#$field >= 0) {
406 push @m, $field;
407 warn "## saved/3 ", dump( $field ),$/ if ($debug);
408 }
409
410 return \@m;
411 }
412
413 =head2 _get_marc_leader
414
415 Return leader from currently fetched record by L</_get_marc_fields>
416
417 print WebPAC::Normalize::_get_marc_leader();
418
419 =cut
420
421 sub _get_marc_leader {
422 die "no fetch_pos, did you called _get_marc_fields first?" unless ( defined( $fetch_pos ) );
423 return $marc_leader->[ $fetch_pos ];
424 }
425
426 =head2 _debug
427
428 Change level of debug warnings
429
430 _debug( 2 );
431
432 =cut
433
434 sub _debug {
435 my $l = shift;
436 return $debug unless defined($l);
437 warn "debug level $l",$/ if ($l > 0);
438 $debug = $l;
439 }
440
441 =head1 Functions to create C<data_structure>
442
443 Those functions generally have to first in your normalization file.
444
445 =head2 tag
446
447 Define new tag for I<search> and I<display>.
448
449 tag('Title', rec('200','a') );
450
451
452 =cut
453
454 sub tag {
455 my $name = shift or die "tag needs name as first argument";
456 my @o = grep { defined($_) && $_ ne '' } @_;
457 return unless (@o);
458 $out->{$name}->{tag} = $name;
459 $out->{$name}->{search} = \@o;
460 $out->{$name}->{display} = \@o;
461 }
462
463 =head2 display
464
465 Define tag just for I<display>
466
467 @v = display('Title', rec('200','a') );
468
469 =cut
470
471 sub display {
472 my $name = shift or die "display needs name as first argument";
473 my @o = grep { defined($_) && $_ ne '' } @_;
474 return unless (@o);
475 $out->{$name}->{tag} = $name;
476 $out->{$name}->{display} = \@o;
477 }
478
479 =head2 search
480
481 Prepare values just for I<search>
482
483 @v = search('Title', rec('200','a') );
484
485 =cut
486
487 sub search {
488 my $name = shift or die "search needs name as first argument";
489 my @o = grep { defined($_) && $_ ne '' } @_;
490 return unless (@o);
491 $out->{$name}->{tag} = $name;
492 $out->{$name}->{search} = \@o;
493 }
494
495 =head2 marc_leader
496
497 Setup fields within MARC leader or get leader
498
499 marc_leader('05','c');
500 my $leader = marc_leader();
501
502 =cut
503
504 sub marc_leader {
505 my ($offset,$value) = @_;
506
507 if ($offset) {
508 $marc_leader->[ $marc_record_offset ]->{ $offset } = $value;
509 } else {
510
511 if (defined($marc_leader)) {
512 die "marc_leader not array = ", dump( $marc_leader ) unless (ref($marc_leader) eq 'ARRAY');
513 return $marc_leader->[ $marc_record_offset ];
514 } else {
515 return;
516 }
517 }
518 }
519
520 =head2 marc
521
522 Save value for MARC field
523
524 marc('900','a', rec('200','a') );
525 marc('001', rec('000') );
526
527 =cut
528
529 sub marc {
530 my $f = shift or die "marc needs field";
531 die "marc field must be numer" unless ($f =~ /^\d+$/);
532
533 my $sf;
534 if ($f >= 10) {
535 $sf = shift or die "marc needs subfield";
536 }
537
538 foreach (@_) {
539 my $v = $_; # make var read-write for Encode
540 next unless (defined($v) && $v !~ /^\s*$/);
541 my ($i1,$i2) = defined($marc_indicators->{$f}) ? @{ $marc_indicators->{$f} } : (' ',' ');
542 if (defined $sf) {
543 push @{ $marc_record->[ $marc_record_offset ] }, [ $f, $i1, $i2, $sf => $v ];
544 } else {
545 push @{ $marc_record->[ $marc_record_offset ] }, [ $f, $v ];
546 }
547 }
548 }
549
550 =head2 marc_repeatable_subfield
551
552 Save values for MARC repetable subfield
553
554 marc_repeatable_subfield('910', 'z', rec('909') );
555
556 =cut
557
558 sub marc_repeatable_subfield {
559 my ($f,$sf) = @_;
560 die "marc_repeatable_subfield need field and subfield!\n" unless ($f && $sf);
561 $marc_repeatable_subfield->{ $f . $sf }++;
562 marc(@_);
563 }
564
565 =head2 marc_indicators
566
567 Set both indicators for MARC field
568
569 marc_indicators('900', ' ', 1);
570
571 Any indicator value other than C<0-9> will be treated as undefined.
572
573 =cut
574
575 sub marc_indicators {
576 my $f = shift || die "marc_indicators need field!\n";
577 my ($i1,$i2) = @_;
578 die "marc_indicators($f, ...) need i1!\n" unless(defined($i1));
579 die "marc_indicators($f, $i1, ...) need i2!\n" unless(defined($i2));
580
581 $i1 = ' ' if ($i1 !~ /^\d$/);
582 $i2 = ' ' if ($i2 !~ /^\d$/);
583 @{ $marc_indicators->{$f} } = ($i1,$i2);
584 }
585
586 =head2 marc_compose
587
588 Save values for each MARC subfield explicitly
589
590 marc_compose('900',
591 'a', rec('200','a')
592 'b', rec('201','a')
593 'a', rec('200','b')
594 'c', rec('200','c')
595 );
596
597 If you specify C<+> for subfield, value will be appended
598 to previous defined subfield.
599
600 =cut
601
602 sub marc_compose {
603 my $f = shift or die "marc_compose needs field";
604 die "marc_compose field must be numer" unless ($f =~ /^\d+$/);
605
606 my ($i1,$i2) = defined($marc_indicators->{$f}) ? @{ $marc_indicators->{$f} } : (' ',' ');
607 my $m = [ $f, $i1, $i2 ];
608
609 warn "### marc_compose input subfields = ", dump(@_),$/ if ($debug > 2);
610
611 if ($#_ % 2 != 1) {
612 die "ERROR: marc_compose",dump($f,@_)," not valid (must be even).\nDo you need to add first() or join() around some argument?\n";
613 }
614
615 while (@_) {
616 my $sf = shift;
617 my $v = shift;
618
619 next unless (defined($v) && $v !~ /^\s*$/);
620 warn "## ++ marc_compose($f,$sf,$v) ", dump( $m ),$/ if ($debug > 1);
621 if ($sf ne '+') {
622 push @$m, ( $sf, $v );
623 } else {
624 $m->[ $#$m ] .= $v;
625 }
626 }
627
628 warn "## marc_compose current marc = ", dump( $m ),$/ if ($debug > 1);
629
630 push @{ $marc_record->[ $marc_record_offset ] }, $m if ($#{$m} > 2);
631 }
632
633 =head2 marc_duplicate
634
635 Generate copy of current MARC record and continue working on copy
636
637 marc_duplicate();
638
639 Copies can be accessed using C<< _get_marc_fields( fetch_next => 1 ) >> or
640 C<< _get_marc_fields( offset => 42 ) >>.
641
642 =cut
643
644 sub marc_duplicate {
645 my $m = $marc_record->[ -1 ];
646 die "can't duplicate record which isn't defined" unless ($m);
647 push @{ $marc_record }, dclone( $m );
648 push @{ $marc_leader }, dclone( marc_leader() );
649 warn "## marc_duplicate = ", dump(@$marc_leader, @$marc_record), $/ if ($debug > 1);
650 $marc_record_offset = $#{ $marc_record };
651 warn "## marc_record_offset = $marc_record_offset", $/ if ($debug > 1);
652
653 }
654
655 =head2 marc_remove
656
657 Remove some field or subfield from MARC record.
658
659 marc_remove('200');
660 marc_remove('200','a');
661
662 This will erase field C<200> or C<200^a> from current MARC record.
663
664 marc_remove('*');
665
666 Will remove all fields in current MARC record.
667
668 This is useful after calling C<marc_duplicate> or on it's own (but, you
669 should probably just remove that subfield definition if you are not
670 using C<marc_duplicate>).
671
672 FIXME: support fields < 10.
673
674 =cut
675
676 sub marc_remove {
677 my ($f, $sf) = @_;
678
679 die "marc_remove needs record number" unless defined($f);
680
681 my $marc = $marc_record->[ $marc_record_offset ];
682
683 warn "### marc_remove before = ", dump( $marc ), $/ if ($debug > 2);
684
685 if ($f eq '*') {
686
687 delete( $marc_record->[ $marc_record_offset ] );
688 warn "## full marc_record = ", dump( @{ $marc_record }), $/ if ($debug > 1);
689
690 } else {
691
692 my $i = 0;
693 foreach ( 0 .. $#{ $marc } ) {
694 last unless (defined $marc->[$i]);
695 warn "#### working on ",dump( @{ $marc->[$i] }), $/ if ($debug > 3);
696 if ($marc->[$i]->[0] eq $f) {
697 if (! defined $sf) {
698 # remove whole field
699 splice @$marc, $i, 1;
700 warn "#### slice \@\$marc, $i, 1 = ",dump( @{ $marc }), $/ if ($debug > 3);
701 $i--;
702 } else {
703 foreach my $j ( 0 .. (( $#{ $marc->[$i] } - 3 ) / 2) ) {
704 my $o = ($j * 2) + 3;
705 if ($marc->[$i]->[$o] eq $sf) {
706 # remove subfield
707 splice @{$marc->[$i]}, $o, 2;
708 warn "#### slice \@{\$marc->[$i]}, $o, 2 = ", dump( @{ $marc }), $/ if ($debug > 3);
709 # is record now empty?
710 if ($#{ $marc->[$i] } == 2) {
711 splice @$marc, $i, 1;
712 warn "#### slice \@\$marc, $i, 1 = ", dump( @{ $marc }), $/ if ($debug > 3);
713 $i--;
714 };
715 }
716 }
717 }
718 }
719 $i++;
720 }
721
722 warn "### marc_remove($f", $sf ? ",$sf" : "", ") after = ", dump( $marc ), $/ if ($debug > 2);
723
724 $marc_record->[ $marc_record_offset ] = $marc;
725 }
726
727 warn "## full marc_record = ", dump( @{ $marc_record }), $/ if ($debug > 1);
728 }
729
730 =head2 marc_original_order
731
732 Copy all subfields preserving original order to marc field.
733
734 marc_original_order( marc_field_number, original_input_field_number );
735
736 Please note that field numbers are consistent with other commands (marc
737 field number first), but somewhat counter-intuitive (destination and then
738 source).
739
740 You might want to use this command if you are just renaming subfields or
741 using pre-processing modify_record in C<config.yml> and don't need any
742 post-processing or want to preserve order of original subfields.
743
744
745 =cut
746
747 sub marc_original_order {
748
749 my ($to, $from) = @_;
750 die "marc_original_order needs from and to fields\n" unless ($from && $to);
751
752 return unless defined($rec->{$from});
753
754 my $r = $rec->{$from};
755 die "record field $from isn't array\n" unless (ref($r) eq 'ARRAY');
756
757 my ($i1,$i2) = defined($marc_indicators->{$to}) ? @{ $marc_indicators->{$to} } : (' ',' ');
758 warn "## marc_original_order($to,$from) source = ", dump( $r ),$/ if ($debug > 1);
759
760 foreach my $d (@$r) {
761
762 if (! defined($d->{subfields}) && ref($d->{subfields}) ne 'ARRAY') {
763 warn "# marc_original_order($to,$from): field $from doesn't have subfields specification\n";
764 next;
765 }
766
767 my @sfs = @{ $d->{subfields} };
768
769 die "field $from doesn't have even number of subfields specifications\n" unless($#sfs % 2 == 1);
770
771 warn "#--> d: ",dump($d), "\n#--> sfs: ",dump(@sfs),$/ if ($debug > 2);
772
773 my $m = [ $to, $i1, $i2 ];
774
775 while (my $sf = shift @sfs) {
776
777 warn "#--> sf: ",dump($sf), $/ if ($debug > 2);
778 my $offset = shift @sfs;
779 die "corrupted sufields specification for field $from\n" unless defined($offset);
780
781 my $v;
782 if (ref($d->{$sf}) eq 'ARRAY') {
783 $v = $d->{$sf}->[$offset] if (defined($d->{$sf}->[$offset]));
784 } elsif ($offset == 0) {
785 $v = $d->{$sf};
786 } else {
787 die "field $from subfield '$sf' need occurence $offset which doesn't exist", dump($d->{$sf});
788 }
789 push @$m, ( $sf, $v ) if (defined($v));
790 }
791
792 if ($#{$m} > 2) {
793 push @{ $marc_record->[ $marc_record_offset ] }, $m;
794 }
795 }
796
797 warn "## marc_record = ", dump( $marc_record ),$/ if ($debug > 1);
798 }
799
800 =head2 marc_count
801
802 Return number of MARC records created using L</marc_duplicate>.
803
804 print "created ", marc_count(), " records";
805
806 =cut
807
808 sub marc_count {
809 return $#{ $marc_record };
810 }
811
812
813 =head1 Functions to extract data from input
814
815 This function should be used inside functions to create C<data_structure> described
816 above.
817
818 =head2 _pack_subfields_hash
819
820 @subfields = _pack_subfields_hash( $h );
821 $subfields = _pack_subfields_hash( $h, 1 );
822
823 Return each subfield value in array or pack them all together and return scalar
824 with subfields (denoted by C<^>) and values.
825
826 =cut
827
828 sub _pack_subfields_hash {
829
830 warn "## _pack_subfields_hash( ",dump(@_), " )\n" if ($debug > 1);
831
832 my ($h,$include_subfields) = @_;
833
834 if ( defined($h->{subfields}) ) {
835 my $sfs = delete $h->{subfields} || die "no subfields?";
836 my @out;
837 while (@$sfs) {
838 my $sf = shift @$sfs;
839 push @out, '^' . $sf if ($include_subfields);
840 my $o = shift @$sfs;
841 if ($o == 0 && ref( $h->{$sf} ) ne 'ARRAY' ) {
842 # single element subfields are not arrays
843 #warn "====> $sf $o / $#$sfs ", dump( $sfs, $h->{$sf} ), "\n";
844
845 push @out, $h->{$sf};
846 } else {
847 #warn "====> $sf $o / $#$sfs ", dump( $sfs, $h->{$sf} ), "\n";
848 push @out, $h->{$sf}->[$o];
849 }
850 }
851 if ($include_subfields) {
852 return join('', @out);
853 } else {
854 return @out;
855 }
856 } else {
857 if ($include_subfields) {
858 my $out = '';
859 foreach my $sf (sort keys %$h) {
860 if (ref($h->{$sf}) eq 'ARRAY') {
861 $out .= '^' . $sf . join('^' . $sf, @{ $h->{$sf} });
862 } else {
863 $out .= '^' . $sf . $h->{$sf};
864 }
865 }
866 return $out;
867 } else {
868 # FIXME this should probably be in alphabetical order instead of hash order
869 values %{$h};
870 }
871 }
872 }
873
874 =head2 rec1
875
876 Return all values in some field
877
878 @v = rec1('200')
879
880 TODO: order of values is probably same as in source data, need to investigate that
881
882 =cut
883
884 sub rec1 {
885 my $f = shift;
886 warn "rec1($f) = ", dump( $rec->{$f} ), $/ if ($debug > 1);
887 return unless (defined($rec) && defined($rec->{$f}));
888 warn "rec1($f) = ", dump( $rec->{$f} ), $/ if ($debug > 1);
889 if (ref($rec->{$f}) eq 'ARRAY') {
890 my @out;
891 foreach my $h ( @{ $rec->{$f} } ) {
892 if (ref($h) eq 'HASH') {
893 push @out, ( _pack_subfields_hash( $h ) );
894 } else {
895 push @out, $h;
896 }
897 }
898 return @out;
899 } elsif( defined($rec->{$f}) ) {
900 return $rec->{$f};
901 }
902 }
903
904 =head2 rec2
905
906 Return all values in specific field and subfield
907
908 @v = rec2('200','a')
909
910 =cut
911
912 sub rec2 {
913 my $f = shift;
914 return unless (defined($rec && $rec->{$f}));
915 my $sf = shift;
916 warn "rec2($f,$sf) = ", dump( $rec->{$f} ), $/ if ($debug > 1);
917 return map {
918 if (ref($_->{$sf}) eq 'ARRAY') {
919 @{ $_->{$sf} };
920 } else {
921 $_->{$sf};
922 }
923 } grep { ref($_) eq 'HASH' && $_->{$sf} } @{ $rec->{$f} };
924 }
925
926 =head2 rec
927
928 syntaxtic sugar for
929
930 @v = rec('200')
931 @v = rec('200','a')
932
933 If rec() returns just single value, it will
934 return scalar, not array.
935
936 =cut
937
938 sub rec {
939 my @out;
940 if ($#_ == 0) {
941 @out = rec1(@_);
942 } elsif ($#_ == 1) {
943 @out = rec2(@_);
944 }
945 if ($#out == 0 && ! wantarray) {
946 return $out[0];
947 } elsif (@out) {
948 return @out;
949 } else {
950 return '';
951 }
952 }
953
954 =head2 regex
955
956 Apply regex to some or all values
957
958 @v = regex( 's/foo/bar/g', @v );
959
960 =cut
961
962 sub regex {
963 my $r = shift;
964 my @out;
965 #warn "r: $r\n", dump(\@_);
966 foreach my $t (@_) {
967 next unless ($t);
968 eval "\$t =~ $r";
969 push @out, $t if ($t && $t ne '');
970 }
971 return @out;
972 }
973
974 =head2 prefix
975
976 Prefix all values with a string
977
978 @v = prefix( 'my_', @v );
979
980 =cut
981
982 sub prefix {
983 my $p = shift or return;
984 return map { $p . $_ } grep { defined($_) } @_;
985 }
986
987 =head2 suffix
988
989 suffix all values with a string
990
991 @v = suffix( '_my', @v );
992
993 =cut
994
995 sub suffix {
996 my $s = shift or die "suffix needs string as first argument";
997 return map { $_ . $s } grep { defined($_) } @_;
998 }
999
1000 =head2 surround
1001
1002 surround all values with a two strings
1003
1004 @v = surround( 'prefix_', '_suffix', @v );
1005
1006 =cut
1007
1008 sub surround {
1009 my $p = shift or die "surround need prefix as first argument";
1010 my $s = shift or die "surround needs suffix as second argument";
1011 return map { $p . $_ . $s } grep { defined($_) } @_;
1012 }
1013
1014 =head2 first
1015
1016 Return first element
1017
1018 $v = first( @v );
1019
1020 =cut
1021
1022 sub first {
1023 my $r = shift;
1024 return $r;
1025 }
1026
1027 =head2 lookup
1028
1029 Consult lookup hashes for some value
1030
1031 @v = lookup(
1032 sub {
1033 'ffkk/peri/mfn'.rec('000')
1034 },
1035 'ffkk','peri','200-a-200-e',
1036 sub {
1037 first(rec(200,'a')).' '.first(rec('200','e'))
1038 }
1039 );
1040
1041 Code like above will be B<automatically generated> using L<WebPAC::Parse> from
1042 normal lookup definition in C<conf/lookup/something.pl> which looks like:
1043
1044 lookup(
1045 # which results to return from record recorded in lookup
1046 sub { 'ffkk/peri/mfn' . rec('000') },
1047 # from which database and input
1048 'ffkk','peri',
1049 # such that following values match
1050 sub { first(rec(200,'a')) . ' ' . first(rec('200','e')) },
1051 # if this part is missing, we will try to match same fields
1052 # from lookup record and current one, or you can override
1053 # which records to use from current record using
1054 sub { rec('900','x') . ' ' . rec('900','y') },
1055 )
1056
1057 You can think about this lookup as SQL (if that helps):
1058
1059 select
1060 sub { what }
1061 from
1062 database, input
1063 where
1064 sub { filter from lookuped record }
1065 having
1066 sub { optional filter on current record }
1067
1068 Easy as pie, right?
1069
1070 =cut
1071
1072 sub lookup {
1073 my ($what, $database, $input, $key, $having) = @_;
1074
1075 confess "lookup needs 5 arguments: what, database, input, key, having\n" unless ($#_ == 4);
1076
1077 warn "## lookup ($database, $input, $key)", $/ if ($debug > 1);
1078 return unless (defined($lookup->{$database}->{$input}->{$key}));
1079
1080 confess "lookup really need load_row_coderef added to data_structure\n" unless ($load_row_coderef);
1081
1082 my $mfns;
1083 my @having = $having->();
1084
1085 warn "## having = ", dump( @having ) if ($debug > 2);
1086
1087 foreach my $h ( @having ) {
1088 if (defined($lookup->{$database}->{$input}->{$key}->{$h})) {
1089 warn "lookup for $database/$input/$key/$h return ",dump($lookup->{$database}->{$input}->{$key}->{$h}),"\n" if ($debug);
1090 $mfns->{$_}++ foreach keys %{ $lookup->{$database}->{$input}->{$key}->{$h} };
1091 }
1092 }
1093
1094 return unless ($mfns);
1095
1096 my @mfns = sort keys %$mfns;
1097
1098 warn "# lookup loading $database/$input/$key mfn ", join(",",@mfns)," having ",dump(@having),"\n" if ($debug);
1099
1100 my $old_rec = $rec;
1101 my @out;
1102
1103 foreach my $mfn (@mfns) {
1104 $rec = $load_row_coderef->( $database, $input, $mfn );
1105
1106 warn "got $database/$input/$mfn = ", dump($rec), $/ if ($debug);
1107
1108 my @vals = $what->();
1109
1110 push @out, ( @vals );
1111
1112 warn "lookup for mfn $mfn returned ", dump(@vals), $/ if ($debug);
1113 }
1114
1115 # if (ref($lookup->{$k}) eq 'ARRAY') {
1116 # return @{ $lookup->{$k} };
1117 # } else {
1118 # return $lookup->{$k};
1119 # }
1120
1121 $rec = $old_rec;
1122
1123 warn "## lookup returns = ", dump(@out), $/ if ($debug);
1124
1125 if ($#out == 0) {
1126 return $out[0];
1127 } else {
1128 return @out;
1129 }
1130 }
1131
1132 =head2 save_into_lookup
1133
1134 Save value into lookup. It associates current database, input
1135 and specific keys with one or more values which will be
1136 associated over MFN.
1137
1138 MFN will be extracted from first occurence current of field 000
1139 in current record, or if it doesn't exist from L<_set_config> C<_mfn>.
1140
1141 my $nr = save_into_lookup($database,$input,$key,sub {
1142 # code which produce one or more values
1143 });
1144
1145 It returns number of items saved.
1146
1147 This function shouldn't be called directly, it's called from code created by
1148 L<WebPAC::Parser>.
1149
1150 =cut
1151
1152 sub save_into_lookup {
1153 my ($database,$input,$key,$coderef) = @_;
1154 die "save_into_lookup needs database" unless defined($database);
1155 die "save_into_lookup needs input" unless defined($input);
1156 die "save_into_lookup needs key" unless defined($key);
1157 die "save_into_lookup needs CODE" unless ( defined($coderef) && ref($coderef) eq 'CODE' );
1158
1159 warn "## save_into_lookup rec = ", dump($rec), " config = ", dump($config), $/ if ($debug > 2);
1160
1161 my $mfn =
1162 defined($rec->{'000'}->[0]) ? $rec->{'000'}->[0] :
1163 defined($config->{_mfn}) ? $config->{_mfn} :
1164 die "mfn not defined or zero";
1165
1166 my $nr = 0;
1167
1168 foreach my $v ( $coderef->() ) {
1169 $lookup->{$database}->{$input}->{$key}->{$v}->{$mfn}++;
1170 warn "# saved lookup $database/$input/$key [$v] $mfn\n" if ($debug > 1);
1171 $nr++;
1172 }
1173
1174 return $nr;
1175 }
1176
1177 =head2 config
1178
1179 Consult config values stored in C<config.yml>
1180
1181 # return database code (key under databases in yaml)
1182 $database_code = config(); # use _ from hash
1183 $database_name = config('name');
1184 $database_input_name = config('input name');
1185 $tag = config('input normalize tag');
1186
1187 Up to three levels are supported.
1188
1189 =cut
1190
1191 sub config {
1192 return unless ($config);
1193
1194 my $p = shift;
1195
1196 $p ||= '';
1197
1198 my $v;
1199
1200 warn "### getting config($p)\n" if ($debug > 1);
1201
1202 my @p = split(/\s+/,$p);
1203 if ($#p < 0) {
1204 $v = $config->{ '_' }; # special, database code
1205 } else {
1206
1207 my $c = dclone( $config );
1208
1209 foreach my $k (@p) {
1210 warn "### k: $k c = ",dump($c),$/ if ($debug > 1);
1211 if (ref($c) eq 'ARRAY') {
1212 $c = shift @$c;
1213 warn "config($p) taking first occurence of '$k', probably not what you wanted!\n";
1214 last;
1215 }
1216
1217 if (! defined($c->{$k}) ) {
1218 $c = undef;
1219 last;
1220 } else {
1221 $c = $c->{$k};
1222 }
1223 }
1224 $v = $c if ($c);
1225
1226 }
1227
1228 warn "## config( '$p' ) = ",dump( $v ),$/ if ($v && $debug);
1229 warn "config( '$p' ) is empty\n" if (! $v);
1230
1231 return $v;
1232 }
1233
1234 =head2 id
1235
1236 Returns unique id of this record
1237
1238 $id = id();
1239
1240 Returns C<42/2> for 2nd occurence of MFN 42.
1241
1242 =cut
1243
1244 sub id {
1245 my $mfn = $config->{_mfn} || die "no _mfn in config data";
1246 return $mfn . $#{$marc_record} ? $#{$marc_record} + 1 : '';
1247 }
1248
1249 =head2 join_with
1250
1251 Joins walues with some delimiter
1252
1253 $v = join_with(", ", @v);
1254
1255 =cut
1256
1257 sub join_with {
1258 my $d = shift;
1259 warn "### join_with('$d',",dump(@_),")\n" if ($debug > 2);
1260 my $v = join($d, grep { defined($_) && $_ ne '' } @_);
1261 return '' unless defined($v);
1262 return $v;
1263 }
1264
1265 =head2 split_rec_on
1266
1267 Split record subfield on some regex and take one of parts out
1268
1269 $a_before_semi_column =
1270 split_rec_on('200','a', /\s*;\s*/, $part);
1271
1272 C<$part> is optional number of element. First element is
1273 B<1>, not 0!
1274
1275 If there is no C<$part> parameter or C<$part> is 0, this function will
1276 return all values produced by splitting.
1277
1278 =cut
1279
1280 sub split_rec_on {
1281 die "split_rec_on need (fld,sf,regex[,part]" if ($#_ < 2);
1282
1283 my ($fld, $sf, $regex, $part) = @_;
1284 warn "### regex ", ref($regex), $regex, $/ if ($debug > 2);
1285
1286 my @r = rec( $fld, $sf );
1287 my $v = shift @r;
1288 warn "### first rec($fld,$sf) = ",dump($v),$/ if ($debug > 2);
1289
1290 return '' if ( ! defined($v) || $v =~ /^\s*$/);
1291
1292 my @s = split( $regex, $v );
1293 warn "## split_rec_on($fld,$sf,$regex,$part) = ",dump(@s),$/ if ($debug > 1);
1294 if ($part && $part > 0) {
1295 return $s[ $part - 1 ];
1296 } else {
1297 return @s;
1298 }
1299 }
1300
1301 my $hash;
1302
1303 =head2 set
1304
1305 set( key => 'value' );
1306
1307 =cut
1308
1309 sub set {
1310 my ($k,$v) = @_;
1311 warn "## set ( $k => ", dump($v), " )", $/ if ( $debug );
1312 $hash->{$k} = $v;
1313 };
1314
1315 =head2 get
1316
1317 get( 'key' );
1318
1319 =cut
1320
1321 sub get {
1322 my $k = shift || return;
1323 my $v = $hash->{$k};
1324 warn "## get $k = ", dump( $v ), $/ if ( $debug );
1325 return $v;
1326 }
1327
1328 =head2 count
1329
1330 if ( count( @result ) == 1 ) {
1331 # do something if only 1 result is there
1332 }
1333
1334 =cut
1335
1336 sub count {
1337 warn "## count ",dump(@_),$/ if ( $debug );
1338 return @_ . '';
1339 }
1340
1341 # END
1342 1;

  ViewVC Help
Powered by ViewVC 1.1.26