/[webpac2]/trunk/lib/WebPAC/Normalize.pm
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Contents of /trunk/lib/WebPAC/Normalize.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1118 - (show annotations)
Sun Oct 26 15:57:37 2008 UTC (15 years, 6 months ago) by dpavlin
File size: 17791 byte(s)
 r1747@llin:  dpavlin | 2008-10-26 16:55:31 +0100
 bug fix: don't destroy subfields values in data hash

1 package WebPAC::Normalize;
2 use Exporter 'import';
3 our @EXPORT = qw/
4 _set_ds _set_lookup
5 _set_load_row
6 _get_ds _clean_ds
7 _debug
8 _pack_subfields_hash
9
10 to
11 search_display search display sorted
12
13 rec1 rec2 rec
14 frec frec_eq frec_ne
15 regex prefix suffix surround
16 first lookup join_with
17 save_into_lookup
18
19 split_rec_on
20
21 get set
22 count
23
24 /;
25
26 use warnings;
27 use strict;
28
29 #use base qw/WebPAC::Common/;
30 use Data::Dump qw/dump/;
31 use Carp qw/confess/;
32
33 # debugging warn(s)
34 my $debug = 0;
35 _debug( $debug );
36
37 # FIXME
38 use WebPAC::Normalize::ISBN;
39 push @EXPORT, ( 'isbn_10', 'isbn_13' );
40
41 use WebPAC::Normalize::MARC;
42 push @EXPORT, ( qw/
43 marc marc_indicators marc_repeatable_subfield
44 marc_compose marc_leader marc_fixed
45 marc_duplicate marc_remove marc_count
46 marc_original_order
47 marc_template
48 /);
49
50 use Storable qw/dclone/;
51
52 =head1 NAME
53
54 WebPAC::Normalize - describe normalisaton rules using sets
55
56 =cut
57
58 our $VERSION = '0.36';
59
60 =head1 SYNOPSIS
61
62 This module uses C<conf/normalize/*.pl> files to perform normalisation
63 from input records using perl functions which are specialized for set
64 processing.
65
66 Sets are implemented as arrays, and normalisation file is valid perl, which
67 means that you check it's validity before running WebPAC using
68 C<perl -c normalize.pl>.
69
70 Normalisation can generate multiple output normalized data. For now, supported output
71 types (on the left side of definition) are: C<search_display>, C<display>, C<search> and
72 C<marc>.
73
74 =head1 FUNCTIONS
75
76 Functions which start with C<_> are private and used by WebPAC internally.
77 All other functions are available for use within normalisation rules.
78
79 =head2 data_structure
80
81 Return data structure
82
83 my $ds = WebPAC::Normalize::data_structure(
84 lookup => $lookup_hash,
85 row => $row,
86 rules => $normalize_pl_config,
87 marc_encoding => 'utf-8',
88 config => $config,
89 load_row_coderef => sub {
90 my ($database,$input,$mfn) = @_;
91 $store->load_row( database => $database, input => $input, id => $mfn );
92 },
93 );
94
95 Options C<row>, C<rules> and C<log> are mandatory while all
96 other are optional.
97
98 C<load_row_coderef> is closure only used when executing lookups, so they will
99 die if it's not defined.
100
101 This function will B<die> if normalizastion can't be evaled.
102
103 Since this function isn't exported you have to call it with
104 C<WebPAC::Normalize::data_structure>.
105
106 =cut
107
108 my $load_row_coderef;
109
110 sub data_structure {
111 my $arg = {@_};
112
113 die "need row argument" unless ($arg->{row});
114 die "need normalisation argument" unless ($arg->{rules});
115
116 _set_lookup( $arg->{lookup} ) if defined($arg->{lookup});
117 _set_ds( $arg->{row} );
118 _set_config( $arg->{config} ) if defined($arg->{config});
119 _clean_ds( %{ $arg } );
120 $load_row_coderef = $arg->{load_row_coderef};
121
122 no strict 'subs';
123 no warnings 'redefine';
124 eval "$arg->{rules};";
125 die "error evaling $arg->{rules}: $@\n" if ($@);
126
127 return _get_ds();
128 }
129
130 =head2 _set_ds
131
132 Set current record hash
133
134 _set_ds( $rec );
135
136 =cut
137
138 my $rec;
139
140 sub _set_ds {
141 $rec = shift or die "no record hash";
142 $WebPAC::Normalize::MARC::rec = $rec;
143 }
144
145 =head2
146
147 my $rec = _get_rec();
148
149 =cut
150
151 sub _get_rec { $rec };
152
153 =head2 _set_config
154
155 Set current config hash
156
157 _set_config( $config );
158
159 Magic keys are:
160
161 =over 4
162
163 =item _
164
165 Code of current database
166
167 =item _mfn
168
169 Current MFN
170
171 =back
172
173 =cut
174
175 my $config;
176
177 sub _set_config {
178 $config = shift;
179 }
180
181 =head2 _get_ds
182
183 Return hash formatted as data structure
184
185 my $ds = _get_ds();
186
187 =cut
188
189 my $out;
190
191 sub _get_ds {
192 #warn "## out = ",dump($out);
193 return $out;
194 }
195
196 =head2 _clean_ds
197
198 Clean data structure hash for next record
199
200 _clean_ds();
201
202 =cut
203
204 sub _clean_ds {
205 my $a = {@_};
206 $out = undef;
207 WebPAC::Normalize::MARC::_clean();
208 }
209
210 =head2 _set_lookup
211
212 Set current lookup hash
213
214 _set_lookup( $lookup );
215
216 =cut
217
218 my $lookup;
219
220 sub _set_lookup {
221 $lookup = shift;
222 }
223
224 =head2 _get_lookup
225
226 Get current lookup hash
227
228 my $lookup = _get_lookup();
229
230 =cut
231
232 sub _get_lookup {
233 return $lookup;
234 }
235
236 =head2 _set_load_row
237
238 Setup code reference which will return L<data_structure> from
239 L<WebPAC::Store>
240
241 _set_load_row(sub {
242 my ($database,$input,$mfn) = @_;
243 $store->load_row( database => $database, input => $input, id => $mfn );
244 });
245
246 =cut
247
248 sub _set_load_row {
249 my $coderef = shift;
250 confess "argument isn't CODE" unless ref($coderef) eq 'CODE';
251
252 $load_row_coderef = $coderef;
253 }
254
255 =head2 _debug
256
257 Change level of debug warnings
258
259 _debug( 2 );
260
261 =cut
262
263 sub _debug {
264 my $l = shift;
265 return $debug unless defined($l);
266 warn "debug level $l",$/ if ($l > 0);
267 $debug = $l;
268 $WebPAC::Normalize::MARC::debug = $debug;
269 }
270
271 =head1 Functions to create C<data_structure>
272
273 Those functions generally have to first in your normalization file.
274
275 =head2 to
276
277 Generic way to set values for some name
278
279 to('field-name', 'name-value' => rec('200','a') );
280
281 There are many helpers defined below which might be easier to use.
282
283 =cut
284
285 sub to {
286 my $type = shift or confess "need type -- BUG?";
287 my $name = shift or confess "needs name as first argument";
288 my @o = grep { defined($_) && $_ ne '' } @_;
289 return unless (@o);
290 $out->{$name}->{$type} = \@o;
291 }
292
293 =head2 search_display
294
295 Define output for L<search> and L<display> at the same time
296
297 search_display('Title', rec('200','a') );
298
299 =cut
300
301 sub search_display {
302 my $name = shift or die "search_display needs name as first argument";
303 my @o = grep { defined($_) && $_ ne '' } @_;
304 return unless (@o);
305 $out->{$name}->{search} = \@o;
306 $out->{$name}->{display} = \@o;
307 }
308
309 =head2 tag
310
311 Old name for L<search_display>, it will probably be removed at one point.
312
313 =cut
314
315 sub tag {
316 search_display( @_ );
317 }
318
319 =head2 display
320
321 Define output just for I<display>
322
323 @v = display('Title', rec('200','a') );
324
325 =cut
326
327 sub display { to( 'display', @_ ) }
328
329 =head2 search
330
331 Prepare values just for I<search>
332
333 @v = search('Title', rec('200','a') );
334
335 =cut
336
337 sub search { to( 'search', @_ ) }
338
339 =head2 sorted
340
341 Insert into lists which will be automatically sorted
342
343 sorted('Title', rec('200','a') );
344
345 =cut
346
347 sub sorted { to( 'sorted', @_ ) }
348
349
350 =head1 Functions to extract data from input
351
352 This function should be used inside functions to create C<data_structure> described
353 above.
354
355 =head2 _pack_subfields_hash
356
357 @subfields = _pack_subfields_hash( $h );
358 $subfields = _pack_subfields_hash( $h, 1 );
359
360 Return each subfield value in array or pack them all together and return scalar
361 with subfields (denoted by C<^>) and values.
362
363 =cut
364
365 sub _pack_subfields_hash {
366
367 warn "## _pack_subfields_hash( ",dump(@_), " )\n" if ($debug > 1);
368
369 my ($hash,$include_subfields) = @_;
370
371 # sanity and ease of use
372 return $hash if (ref($hash) ne 'HASH');
373
374 my $h = dclone( $hash );
375
376 if ( defined($h->{subfields}) ) {
377 my $sfs = delete $h->{subfields} || die "no subfields?";
378 my @out;
379 while (@$sfs) {
380 my $sf = shift @$sfs;
381 push @out, '^' . $sf if ($include_subfields);
382 my $o = shift @$sfs;
383 if ($o == 0 && ref( $h->{$sf} ) ne 'ARRAY' ) {
384 # single element subfields are not arrays
385 #warn "====> $sf $o / $#$sfs ", dump( $sfs, $h->{$sf} ), "\n";
386
387 push @out, $h->{$sf};
388 } else {
389 #warn "====> $sf $o / $#$sfs ", dump( $sfs, $h->{$sf} ), "\n";
390 push @out, $h->{$sf}->[$o];
391 }
392 }
393 if ($include_subfields) {
394 return join('', @out);
395 } else {
396 return @out;
397 }
398 } else {
399 if ($include_subfields) {
400 my $out = '';
401 foreach my $sf (sort keys %$h) {
402 if (ref($h->{$sf}) eq 'ARRAY') {
403 $out .= '^' . $sf . join('^' . $sf, @{ $h->{$sf} });
404 } else {
405 $out .= '^' . $sf . $h->{$sf};
406 }
407 }
408 return $out;
409 } else {
410 # FIXME this should probably be in alphabetical order instead of hash order
411 values %{$h};
412 }
413 }
414 }
415
416 =head2 rec1
417
418 Return all values in some field
419
420 @v = rec1('200')
421
422 TODO: order of values is probably same as in source data, need to investigate that
423
424 =cut
425
426 sub rec1 {
427 my $f = shift;
428 warn "rec1($f) = ", dump( $rec->{$f} ), $/ if ($debug > 1);
429 return unless (defined($rec) && defined($rec->{$f}));
430 warn "rec1($f) = ", dump( $rec->{$f} ), $/ if ($debug > 1);
431 if (ref($rec->{$f}) eq 'ARRAY') {
432 my @out;
433 foreach my $h ( @{ $rec->{$f} } ) {
434 if (ref($h) eq 'HASH') {
435 push @out, ( _pack_subfields_hash( $h ) );
436 } else {
437 push @out, $h;
438 }
439 }
440 return @out;
441 } elsif( defined($rec->{$f}) ) {
442 return $rec->{$f};
443 }
444 }
445
446 =head2 rec2
447
448 Return all values in specific field and subfield
449
450 @v = rec2('200','a')
451
452 =cut
453
454 sub rec2 {
455 my $f = shift;
456 return unless (defined($rec && $rec->{$f}));
457 my $sf = shift;
458 warn "rec2($f,$sf) = ", dump( $rec->{$f} ), $/ if ($debug > 1);
459 return map {
460 if (ref($_->{$sf}) eq 'ARRAY') {
461 @{ $_->{$sf} };
462 } else {
463 $_->{$sf};
464 }
465 } grep { ref($_) eq 'HASH' && defined $_->{$sf} } @{ $rec->{$f} };
466 }
467
468 =head2 rec
469
470 syntaxtic sugar for
471
472 @v = rec('200')
473 @v = rec('200','a')
474
475 If rec() returns just single value, it will
476 return scalar, not array.
477
478 =cut
479
480 sub rec {
481 my @out;
482 if ($#_ == 0) {
483 @out = rec1(@_);
484 } elsif ($#_ == 1) {
485 @out = rec2(@_);
486 }
487 if ($#out == 0 && ! wantarray) {
488 return $out[0];
489 } elsif (@out) {
490 return @out;
491 } else {
492 return '';
493 }
494 }
495
496 =head2 frec
497
498 Returns first value from field
499
500 $v = frec('200');
501 $v = frec('200','a');
502
503 =cut
504
505 sub frec {
506 my @out = rec(@_);
507 warn "rec(",dump(@_),") has more than one return value, ignoring\n" if $#out > 0;
508 return shift @out;
509 }
510
511 =head2 frec_eq
512
513 =head2 frec_ne
514
515 Check if first values from two fields are same or different
516
517 if ( frec_eq( 900 => 'a', 910 => 'c' ) ) {
518 # values are same
519 } else {
520 # values are different
521 }
522
523 Strictly speaking C<frec_eq> and C<frec_ne> wouldn't be needed if you
524 could write something like:
525
526 if ( frec( '900','a' ) eq frec( '910','c' ) ) {
527 # yada tada
528 }
529
530 but you can't since our parser L<WebPAC::Parser> will remove all whitespaces
531 in order to parse text and create invalid function C<eqfrec>.
532
533 =cut
534
535 sub frec_eq {
536 my ( $f1,$sf1, $f2, $sf2 ) = @_;
537 return (rec( $f1, $sf1 ))[0] eq (rec( $f2, $sf2 ))[0];
538 }
539
540 sub frec_ne {
541 return ! frec_eq( @_ );
542 }
543
544 =head2 regex
545
546 Apply regex to some or all values
547
548 @v = regex( 's/foo/bar/g', @v );
549
550 =cut
551
552 sub regex {
553 my $r = shift;
554 my @out;
555 #warn "r: $r\n", dump(\@_);
556 foreach my $t (@_) {
557 next unless ($t);
558 eval "\$t =~ $r";
559 push @out, $t if ($t && $t ne '');
560 }
561 return @out;
562 }
563
564 =head2 prefix
565
566 Prefix all values with a string
567
568 @v = prefix( 'my_', @v );
569
570 =cut
571
572 sub prefix {
573 my $p = shift;
574 return @_ unless defined( $p );
575 return map { $p . $_ } grep { defined($_) } @_;
576 }
577
578 =head2 suffix
579
580 suffix all values with a string
581
582 @v = suffix( '_my', @v );
583
584 =cut
585
586 sub suffix {
587 my $s = shift;
588 return @_ unless defined( $s );
589 return map { $_ . $s } grep { defined($_) } @_;
590 }
591
592 =head2 surround
593
594 surround all values with a two strings
595
596 @v = surround( 'prefix_', '_suffix', @v );
597
598 =cut
599
600 sub surround {
601 my $p = shift;
602 my $s = shift;
603 $p = '' unless defined( $p );
604 $s = '' unless defined( $s );
605 return map { $p . $_ . $s } grep { defined($_) } @_;
606 }
607
608 =head2 first
609
610 Return first element
611
612 $v = first( @v );
613
614 =cut
615
616 sub first {
617 my $r = shift;
618 return $r;
619 }
620
621 =head2 lookup
622
623 Consult lookup hashes for some value
624
625 @v = lookup(
626 sub {
627 'ffkk/peri/mfn'.rec('000')
628 },
629 'ffkk','peri','200-a-200-e',
630 sub {
631 first(rec(200,'a')).' '.first(rec('200','e'))
632 }
633 );
634
635 Code like above will be B<automatically generated> using L<WebPAC::Parse> from
636 normal lookup definition in C<conf/lookup/something.pl> which looks like:
637
638 lookup(
639 # which results to return from record recorded in lookup
640 sub { 'ffkk/peri/mfn' . rec('000') },
641 # from which database and input
642 'ffkk','peri',
643 # such that following values match
644 sub { first(rec(200,'a')) . ' ' . first(rec('200','e')) },
645 # if this part is missing, we will try to match same fields
646 # from lookup record and current one, or you can override
647 # which records to use from current record using
648 sub { rec('900','x') . ' ' . rec('900','y') },
649 )
650
651 You can think about this lookup as SQL (if that helps):
652
653 select
654 sub { what }
655 from
656 database, input
657 where
658 sub { filter from lookuped record }
659 having
660 sub { optional filter on current record }
661
662 Easy as pie, right?
663
664 =cut
665
666 sub lookup {
667 my ($what, $database, $input, $key, $having) = @_;
668
669 confess "lookup needs 5 arguments: what, database, input, key, having\n" unless ($#_ == 4);
670
671 warn "## lookup ($database, $input, $key)", $/ if ($debug > 1);
672 return unless (defined($lookup->{$database}->{$input}->{$key}));
673
674 confess "lookup really need load_row_coderef added to data_structure\n" unless ($load_row_coderef);
675
676 my $mfns;
677 my @having = $having->();
678
679 warn "## having = ", dump( @having ) if ($debug > 2);
680
681 foreach my $h ( @having ) {
682 if (defined($lookup->{$database}->{$input}->{$key}->{$h})) {
683 warn "lookup for $database/$input/$key/$h return ",dump($lookup->{$database}->{$input}->{$key}->{$h}),"\n" if ($debug);
684 $mfns->{$_}++ foreach keys %{ $lookup->{$database}->{$input}->{$key}->{$h} };
685 }
686 }
687
688 return unless ($mfns);
689
690 my @mfns = sort keys %$mfns;
691
692 warn "# lookup loading $database/$input/$key mfn ", join(",",@mfns)," having ",dump(@having),"\n" if ($debug);
693
694 my $old_rec = $rec;
695 my @out;
696
697 foreach my $mfn (@mfns) {
698 $rec = $load_row_coderef->( $database, $input, $mfn );
699
700 warn "got $database/$input/$mfn = ", dump($rec), $/ if ($debug);
701
702 my @vals = $what->();
703
704 push @out, ( @vals );
705
706 warn "lookup for mfn $mfn returned ", dump(@vals), $/ if ($debug);
707 }
708
709 # if (ref($lookup->{$k}) eq 'ARRAY') {
710 # return @{ $lookup->{$k} };
711 # } else {
712 # return $lookup->{$k};
713 # }
714
715 $rec = $old_rec;
716
717 warn "## lookup returns = ", dump(@out), $/ if ($debug);
718
719 if ($#out == 0) {
720 return $out[0];
721 } else {
722 return @out;
723 }
724 }
725
726 =head2 save_into_lookup
727
728 Save value into lookup. It associates current database, input
729 and specific keys with one or more values which will be
730 associated over MFN.
731
732 MFN will be extracted from first occurence current of field 000
733 in current record, or if it doesn't exist from L<_set_config> C<_mfn>.
734
735 my $nr = save_into_lookup($database,$input,$key,sub {
736 # code which produce one or more values
737 });
738
739 It returns number of items saved.
740
741 This function shouldn't be called directly, it's called from code created by
742 L<WebPAC::Parser>.
743
744 =cut
745
746 sub save_into_lookup {
747 my ($database,$input,$key,$coderef) = @_;
748 die "save_into_lookup needs database" unless defined($database);
749 die "save_into_lookup needs input" unless defined($input);
750 die "save_into_lookup needs key" unless defined($key);
751 die "save_into_lookup needs CODE" unless ( defined($coderef) && ref($coderef) eq 'CODE' );
752
753 warn "## save_into_lookup rec = ", dump($rec), " config = ", dump($config), $/ if ($debug > 2);
754
755 my $mfn =
756 defined($rec->{'000'}->[0]) ? $rec->{'000'}->[0] :
757 defined($config->{_mfn}) ? $config->{_mfn} :
758 die "mfn not defined or zero";
759
760 my $nr = 0;
761
762 foreach my $v ( $coderef->() ) {
763 $lookup->{$database}->{$input}->{$key}->{$v}->{$mfn}++;
764 warn "# saved lookup $database/$input/$key [$v] $mfn\n" if ($debug > 1);
765 $nr++;
766 }
767
768 return $nr;
769 }
770
771 =head2 config
772
773 Consult config values stored in C<config.yml>
774
775 # return database code (key under databases in yaml)
776 $database_code = config(); # use _ from hash
777 $database_name = config('name');
778 $database_input_name = config('input name');
779
780 Up to three levels are supported.
781
782 =cut
783
784 sub config {
785 return unless ($config);
786
787 my $p = shift;
788
789 $p ||= '';
790
791 my $v;
792
793 warn "### getting config($p)\n" if ($debug > 1);
794
795 my @p = split(/\s+/,$p);
796 if ($#p < 0) {
797 $v = $config->{ '_' }; # special, database code
798 } else {
799
800 my $c = dclone( $config );
801
802 foreach my $k (@p) {
803 warn "### k: $k c = ",dump($c),$/ if ($debug > 1);
804 if (ref($c) eq 'ARRAY') {
805 $c = shift @$c;
806 warn "config($p) taking first occurence of '$k', probably not what you wanted!\n";
807 last;
808 }
809
810 if (! defined($c->{$k}) ) {
811 $c = undef;
812 last;
813 } else {
814 $c = $c->{$k};
815 }
816 }
817 $v = $c if ($c);
818
819 }
820
821 warn "## config( '$p' ) = ",dump( $v ),$/ if ($v && $debug);
822 warn "config( '$p' ) is empty\n" if (! $v);
823
824 return $v;
825 }
826
827 =head2 id
828
829 Returns unique id of this record
830
831 $id = id();
832
833 Returns C<42/2> for 2nd occurence of MFN 42.
834
835 =cut
836
837 sub id {
838 my $mfn = $config->{_mfn} || die "no _mfn in config data";
839 return $mfn . ( WebPAC::Normalize::MARC::_created_marc_records() || '' );
840 }
841
842 =head2 join_with
843
844 Joins walues with some delimiter
845
846 $v = join_with(", ", @v);
847
848 =cut
849
850 sub join_with {
851 my $d = shift;
852 warn "### join_with('$d',",dump(@_),")\n" if ($debug > 2);
853 my $v = join($d, grep { defined($_) && $_ ne '' } @_);
854 return '' unless defined($v);
855 return $v;
856 }
857
858 =head2 split_rec_on
859
860 Split record subfield on some regex and take one of parts out
861
862 $a_before_semi_column =
863 split_rec_on('200','a', /\s*;\s*/, $part);
864
865 C<$part> is optional number of element. First element is
866 B<1>, not 0!
867
868 If there is no C<$part> parameter or C<$part> is 0, this function will
869 return all values produced by splitting.
870
871 =cut
872
873 sub split_rec_on {
874 die "split_rec_on need (fld,sf,regex[,part]" if ($#_ < 2);
875
876 my ($fld, $sf, $regex, $part) = @_;
877 warn "### regex ", ref($regex), $regex, $/ if ($debug > 2);
878
879 my @r = rec( $fld, $sf );
880 my $v = shift @r;
881 warn "### first rec($fld,$sf) = ",dump($v),$/ if ($debug > 2);
882
883 return '' if ( ! defined($v) || $v =~ /^\s*$/);
884
885 my @s = split( $regex, $v );
886 warn "## split_rec_on($fld,$sf,$regex,$part) = ",dump(@s),$/ if ($debug > 1);
887 if ($part && $part > 0) {
888 return $s[ $part - 1 ];
889 } else {
890 return @s;
891 }
892 }
893
894 my $hash;
895
896 =head2 set
897
898 set( key => 'value' );
899
900 =cut
901
902 sub set {
903 my ($k,$v) = @_;
904 warn "## set ( $k => ", dump($v), " )", $/ if ( $debug );
905 $hash->{$k} = $v;
906 };
907
908 =head2 get
909
910 get( 'key' );
911
912 =cut
913
914 sub get {
915 my $k = shift || return;
916 my $v = $hash->{$k};
917 warn "## get $k = ", dump( $v ), $/ if ( $debug );
918 return $v;
919 }
920
921 =head2 count
922
923 if ( count( @result ) == 1 ) {
924 # do something if only 1 result is there
925 }
926
927 =cut
928
929 sub count {
930 warn "## count ",dump(@_),$/ if ( $debug );
931 return @_ . '';
932 }
933
934 # END
935 1;

  ViewVC Help
Powered by ViewVC 1.1.26