/[webpac2]/trunk/lib/WebPAC/Normalize.pm
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Contents of /trunk/lib/WebPAC/Normalize.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1037 - (show annotations)
Mon Nov 12 11:17:19 2007 UTC (15 years, 2 months ago) by dpavlin
File size: 17593 byte(s)
 r1612@llin:  dpavlin | 2007-11-12 12:17:17 +0100
 fixed debug levels: now tests run with -d will display
 own diag messages, while -d -d will be 1st level of
 debug for WebPAC, and so on...

1 package WebPAC::Normalize;
2 use Exporter 'import';
3 our @EXPORT = qw/
4 _set_ds _set_lookup
5 _set_load_row
6 _get_ds _clean_ds
7 _debug
8 _pack_subfields_hash
9
10 search_display search display sorted
11
12 rec1 rec2 rec
13 frec frec_eq frec_ne
14 regex prefix suffix surround
15 first lookup join_with
16 save_into_lookup
17
18 split_rec_on
19
20 get set
21 count
22
23 /;
24
25 use warnings;
26 use strict;
27
28 #use base qw/WebPAC::Common/;
29 use Data::Dump qw/dump/;
30 use Carp qw/confess/;
31
32 # debugging warn(s)
33 my $debug = 0;
34 _debug( $debug );
35
36 # FIXME
37 use WebPAC::Normalize::ISBN;
38 push @EXPORT, ( 'isbn_10', 'isbn_13' );
39
40 use WebPAC::Normalize::MARC;
41 push @EXPORT, ( qw/
42 marc marc_indicators marc_repeatable_subfield
43 marc_compose marc_leader marc_fixed
44 marc_duplicate marc_remove marc_count
45 marc_original_order
46 marc_template
47 /);
48
49 =head1 NAME
50
51 WebPAC::Normalize - describe normalisaton rules using sets
52
53 =cut
54
55 our $VERSION = '0.35';
56
57 =head1 SYNOPSIS
58
59 This module uses C<conf/normalize/*.pl> files to perform normalisation
60 from input records using perl functions which are specialized for set
61 processing.
62
63 Sets are implemented as arrays, and normalisation file is valid perl, which
64 means that you check it's validity before running WebPAC using
65 C<perl -c normalize.pl>.
66
67 Normalisation can generate multiple output normalized data. For now, supported output
68 types (on the left side of definition) are: C<search_display>, C<display>, C<search> and
69 C<marc>.
70
71 =head1 FUNCTIONS
72
73 Functions which start with C<_> are private and used by WebPAC internally.
74 All other functions are available for use within normalisation rules.
75
76 =head2 data_structure
77
78 Return data structure
79
80 my $ds = WebPAC::Normalize::data_structure(
81 lookup => $lookup_hash,
82 row => $row,
83 rules => $normalize_pl_config,
84 marc_encoding => 'utf-8',
85 config => $config,
86 load_row_coderef => sub {
87 my ($database,$input,$mfn) = @_;
88 $store->load_row( database => $database, input => $input, id => $mfn );
89 },
90 );
91
92 Options C<row>, C<rules> and C<log> are mandatory while all
93 other are optional.
94
95 C<load_row_coderef> is closure only used when executing lookups, so they will
96 die if it's not defined.
97
98 This function will B<die> if normalizastion can't be evaled.
99
100 Since this function isn't exported you have to call it with
101 C<WebPAC::Normalize::data_structure>.
102
103 =cut
104
105 my $load_row_coderef;
106
107 sub data_structure {
108 my $arg = {@_};
109
110 die "need row argument" unless ($arg->{row});
111 die "need normalisation argument" unless ($arg->{rules});
112
113 _set_lookup( $arg->{lookup} ) if defined($arg->{lookup});
114 _set_ds( $arg->{row} );
115 _set_config( $arg->{config} ) if defined($arg->{config});
116 _clean_ds( %{ $arg } );
117 $load_row_coderef = $arg->{load_row_coderef};
118
119 no strict 'subs';
120 no warnings 'redefine';
121 eval "$arg->{rules};";
122 die "error evaling $arg->{rules}: $@\n" if ($@);
123
124 return _get_ds();
125 }
126
127 =head2 _set_ds
128
129 Set current record hash
130
131 _set_ds( $rec );
132
133 =cut
134
135 my $rec;
136
137 sub _set_ds {
138 $rec = shift or die "no record hash";
139 $WebPAC::Normalize::MARC::rec = $rec;
140 }
141
142 =head2
143
144 my $rec = _get_rec();
145
146 =cut
147
148 sub _get_rec { $rec };
149
150 =head2 _set_config
151
152 Set current config hash
153
154 _set_config( $config );
155
156 Magic keys are:
157
158 =over 4
159
160 =item _
161
162 Code of current database
163
164 =item _mfn
165
166 Current MFN
167
168 =back
169
170 =cut
171
172 my $config;
173
174 sub _set_config {
175 $config = shift;
176 }
177
178 =head2 _get_ds
179
180 Return hash formatted as data structure
181
182 my $ds = _get_ds();
183
184 =cut
185
186 my $out;
187
188 sub _get_ds {
189 #warn "## out = ",dump($out);
190 return $out;
191 }
192
193 =head2 _clean_ds
194
195 Clean data structure hash for next record
196
197 _clean_ds();
198
199 =cut
200
201 sub _clean_ds {
202 my $a = {@_};
203 $out = undef;
204 WebPAC::Normalize::MARC::_clean();
205 }
206
207 =head2 _set_lookup
208
209 Set current lookup hash
210
211 _set_lookup( $lookup );
212
213 =cut
214
215 my $lookup;
216
217 sub _set_lookup {
218 $lookup = shift;
219 }
220
221 =head2 _get_lookup
222
223 Get current lookup hash
224
225 my $lookup = _get_lookup();
226
227 =cut
228
229 sub _get_lookup {
230 return $lookup;
231 }
232
233 =head2 _set_load_row
234
235 Setup code reference which will return L<data_structure> from
236 L<WebPAC::Store>
237
238 _set_load_row(sub {
239 my ($database,$input,$mfn) = @_;
240 $store->load_row( database => $database, input => $input, id => $mfn );
241 });
242
243 =cut
244
245 sub _set_load_row {
246 my $coderef = shift;
247 confess "argument isn't CODE" unless ref($coderef) eq 'CODE';
248
249 $load_row_coderef = $coderef;
250 }
251
252 =head2 _debug
253
254 Change level of debug warnings
255
256 _debug( 2 );
257
258 =cut
259
260 sub _debug {
261 my $l = shift;
262 return $debug unless defined($l);
263 warn "debug level $l",$/ if ($l > 0);
264 $debug = $l;
265 $WebPAC::Normalize::MARC::debug = $debug;
266 warn "#### MARC::debug = ",dump($WebPAC::Normalize::MARC::debug);
267 }
268
269 =head1 Functions to create C<data_structure>
270
271 Those functions generally have to first in your normalization file.
272
273 =head2 search_display
274
275 Define output for L<search> and L<display> at the same time
276
277 search_display('Title', rec('200','a') );
278
279
280 =cut
281
282 sub search_display {
283 my $name = shift or die "search_display needs name as first argument";
284 my @o = grep { defined($_) && $_ ne '' } @_;
285 return unless (@o);
286 $out->{$name}->{search} = \@o;
287 $out->{$name}->{display} = \@o;
288 }
289
290 =head2 tag
291
292 Old name for L<search_display>, but supported
293
294 =cut
295
296 sub tag {
297 search_display( @_ );
298 }
299
300 =head2 display
301
302 Define output just for I<display>
303
304 @v = display('Title', rec('200','a') );
305
306 =cut
307
308 sub _field {
309 my $type = shift or confess "need type -- BUG?";
310 my $name = shift or confess "needs name as first argument";
311 my @o = grep { defined($_) && $_ ne '' } @_;
312 return unless (@o);
313 $out->{$name}->{$type} = \@o;
314 }
315
316 sub display { _field( 'display', @_ ) }
317
318 =head2 search
319
320 Prepare values just for I<search>
321
322 @v = search('Title', rec('200','a') );
323
324 =cut
325
326 sub search { _field( 'search', @_ ) }
327
328 =head2 sorted
329
330 Insert into lists which will be automatically sorted
331
332 sorted('Title', rec('200','a') );
333
334 =cut
335
336 sub sorted { _field( 'sorted', @_ ) }
337
338
339
340 =head1 Functions to extract data from input
341
342 This function should be used inside functions to create C<data_structure> described
343 above.
344
345 =head2 _pack_subfields_hash
346
347 @subfields = _pack_subfields_hash( $h );
348 $subfields = _pack_subfields_hash( $h, 1 );
349
350 Return each subfield value in array or pack them all together and return scalar
351 with subfields (denoted by C<^>) and values.
352
353 =cut
354
355 sub _pack_subfields_hash {
356
357 warn "## _pack_subfields_hash( ",dump(@_), " )\n" if ($debug > 1);
358
359 my ($h,$include_subfields) = @_;
360
361 # sanity and ease of use
362 return $h if (ref($h) ne 'HASH');
363
364 if ( defined($h->{subfields}) ) {
365 my $sfs = delete $h->{subfields} || die "no subfields?";
366 my @out;
367 while (@$sfs) {
368 my $sf = shift @$sfs;
369 push @out, '^' . $sf if ($include_subfields);
370 my $o = shift @$sfs;
371 if ($o == 0 && ref( $h->{$sf} ) ne 'ARRAY' ) {
372 # single element subfields are not arrays
373 #warn "====> $sf $o / $#$sfs ", dump( $sfs, $h->{$sf} ), "\n";
374
375 push @out, $h->{$sf};
376 } else {
377 #warn "====> $sf $o / $#$sfs ", dump( $sfs, $h->{$sf} ), "\n";
378 push @out, $h->{$sf}->[$o];
379 }
380 }
381 if ($include_subfields) {
382 return join('', @out);
383 } else {
384 return @out;
385 }
386 } else {
387 if ($include_subfields) {
388 my $out = '';
389 foreach my $sf (sort keys %$h) {
390 if (ref($h->{$sf}) eq 'ARRAY') {
391 $out .= '^' . $sf . join('^' . $sf, @{ $h->{$sf} });
392 } else {
393 $out .= '^' . $sf . $h->{$sf};
394 }
395 }
396 return $out;
397 } else {
398 # FIXME this should probably be in alphabetical order instead of hash order
399 values %{$h};
400 }
401 }
402 }
403
404 =head2 rec1
405
406 Return all values in some field
407
408 @v = rec1('200')
409
410 TODO: order of values is probably same as in source data, need to investigate that
411
412 =cut
413
414 sub rec1 {
415 my $f = shift;
416 warn "rec1($f) = ", dump( $rec->{$f} ), $/ if ($debug > 1);
417 return unless (defined($rec) && defined($rec->{$f}));
418 warn "rec1($f) = ", dump( $rec->{$f} ), $/ if ($debug > 1);
419 if (ref($rec->{$f}) eq 'ARRAY') {
420 my @out;
421 foreach my $h ( @{ $rec->{$f} } ) {
422 if (ref($h) eq 'HASH') {
423 push @out, ( _pack_subfields_hash( $h ) );
424 } else {
425 push @out, $h;
426 }
427 }
428 return @out;
429 } elsif( defined($rec->{$f}) ) {
430 return $rec->{$f};
431 }
432 }
433
434 =head2 rec2
435
436 Return all values in specific field and subfield
437
438 @v = rec2('200','a')
439
440 =cut
441
442 sub rec2 {
443 my $f = shift;
444 return unless (defined($rec && $rec->{$f}));
445 my $sf = shift;
446 warn "rec2($f,$sf) = ", dump( $rec->{$f} ), $/ if ($debug > 1);
447 return map {
448 if (ref($_->{$sf}) eq 'ARRAY') {
449 @{ $_->{$sf} };
450 } else {
451 $_->{$sf};
452 }
453 } grep { ref($_) eq 'HASH' && $_->{$sf} } @{ $rec->{$f} };
454 }
455
456 =head2 rec
457
458 syntaxtic sugar for
459
460 @v = rec('200')
461 @v = rec('200','a')
462
463 If rec() returns just single value, it will
464 return scalar, not array.
465
466 =cut
467
468 sub rec {
469 my @out;
470 if ($#_ == 0) {
471 @out = rec1(@_);
472 } elsif ($#_ == 1) {
473 @out = rec2(@_);
474 }
475 if ($#out == 0 && ! wantarray) {
476 return $out[0];
477 } elsif (@out) {
478 return @out;
479 } else {
480 return '';
481 }
482 }
483
484 =head2 frec
485
486 Returns first value from field
487
488 $v = frec('200');
489 $v = frec('200','a');
490
491 =cut
492
493 sub frec {
494 my @out = rec(@_);
495 warn "rec(",dump(@_),") has more than one return value, ignoring\n" if $#out > 0;
496 return shift @out;
497 }
498
499 =head2 frec_eq
500
501 =head2 frec_ne
502
503 Check if first values from two fields are same or different
504
505 if ( frec_eq( 900 => 'a', 910 => 'c' ) ) {
506 # values are same
507 } else {
508 # values are different
509 }
510
511 Strictly speaking C<frec_eq> and C<frec_ne> wouldn't be needed if you
512 could write something like:
513
514 if ( frec( '900','a' ) eq frec( '910','c' ) ) {
515 # yada tada
516 }
517
518 but you can't since our parser L<WebPAC::Parser> will remove all whitespaces
519 in order to parse text and create invalid function C<eqfrec>.
520
521 =cut
522
523 sub frec_eq {
524 my ( $f1,$sf1, $f2, $sf2 ) = @_;
525 return (rec( $f1, $sf1 ))[0] eq (rec( $f2, $sf2 ))[0];
526 }
527
528 sub frec_ne {
529 return ! frec_eq( @_ );
530 }
531
532 =head2 regex
533
534 Apply regex to some or all values
535
536 @v = regex( 's/foo/bar/g', @v );
537
538 =cut
539
540 sub regex {
541 my $r = shift;
542 my @out;
543 #warn "r: $r\n", dump(\@_);
544 foreach my $t (@_) {
545 next unless ($t);
546 eval "\$t =~ $r";
547 push @out, $t if ($t && $t ne '');
548 }
549 return @out;
550 }
551
552 =head2 prefix
553
554 Prefix all values with a string
555
556 @v = prefix( 'my_', @v );
557
558 =cut
559
560 sub prefix {
561 my $p = shift;
562 return @_ unless defined( $p );
563 return map { $p . $_ } grep { defined($_) } @_;
564 }
565
566 =head2 suffix
567
568 suffix all values with a string
569
570 @v = suffix( '_my', @v );
571
572 =cut
573
574 sub suffix {
575 my $s = shift;
576 return @_ unless defined( $s );
577 return map { $_ . $s } grep { defined($_) } @_;
578 }
579
580 =head2 surround
581
582 surround all values with a two strings
583
584 @v = surround( 'prefix_', '_suffix', @v );
585
586 =cut
587
588 sub surround {
589 my $p = shift;
590 my $s = shift;
591 $p = '' unless defined( $p );
592 $s = '' unless defined( $s );
593 return map { $p . $_ . $s } grep { defined($_) } @_;
594 }
595
596 =head2 first
597
598 Return first element
599
600 $v = first( @v );
601
602 =cut
603
604 sub first {
605 my $r = shift;
606 return $r;
607 }
608
609 =head2 lookup
610
611 Consult lookup hashes for some value
612
613 @v = lookup(
614 sub {
615 'ffkk/peri/mfn'.rec('000')
616 },
617 'ffkk','peri','200-a-200-e',
618 sub {
619 first(rec(200,'a')).' '.first(rec('200','e'))
620 }
621 );
622
623 Code like above will be B<automatically generated> using L<WebPAC::Parse> from
624 normal lookup definition in C<conf/lookup/something.pl> which looks like:
625
626 lookup(
627 # which results to return from record recorded in lookup
628 sub { 'ffkk/peri/mfn' . rec('000') },
629 # from which database and input
630 'ffkk','peri',
631 # such that following values match
632 sub { first(rec(200,'a')) . ' ' . first(rec('200','e')) },
633 # if this part is missing, we will try to match same fields
634 # from lookup record and current one, or you can override
635 # which records to use from current record using
636 sub { rec('900','x') . ' ' . rec('900','y') },
637 )
638
639 You can think about this lookup as SQL (if that helps):
640
641 select
642 sub { what }
643 from
644 database, input
645 where
646 sub { filter from lookuped record }
647 having
648 sub { optional filter on current record }
649
650 Easy as pie, right?
651
652 =cut
653
654 sub lookup {
655 my ($what, $database, $input, $key, $having) = @_;
656
657 confess "lookup needs 5 arguments: what, database, input, key, having\n" unless ($#_ == 4);
658
659 warn "## lookup ($database, $input, $key)", $/ if ($debug > 1);
660 return unless (defined($lookup->{$database}->{$input}->{$key}));
661
662 confess "lookup really need load_row_coderef added to data_structure\n" unless ($load_row_coderef);
663
664 my $mfns;
665 my @having = $having->();
666
667 warn "## having = ", dump( @having ) if ($debug > 2);
668
669 foreach my $h ( @having ) {
670 if (defined($lookup->{$database}->{$input}->{$key}->{$h})) {
671 warn "lookup for $database/$input/$key/$h return ",dump($lookup->{$database}->{$input}->{$key}->{$h}),"\n" if ($debug);
672 $mfns->{$_}++ foreach keys %{ $lookup->{$database}->{$input}->{$key}->{$h} };
673 }
674 }
675
676 return unless ($mfns);
677
678 my @mfns = sort keys %$mfns;
679
680 warn "# lookup loading $database/$input/$key mfn ", join(",",@mfns)," having ",dump(@having),"\n" if ($debug);
681
682 my $old_rec = $rec;
683 my @out;
684
685 foreach my $mfn (@mfns) {
686 $rec = $load_row_coderef->( $database, $input, $mfn );
687
688 warn "got $database/$input/$mfn = ", dump($rec), $/ if ($debug);
689
690 my @vals = $what->();
691
692 push @out, ( @vals );
693
694 warn "lookup for mfn $mfn returned ", dump(@vals), $/ if ($debug);
695 }
696
697 # if (ref($lookup->{$k}) eq 'ARRAY') {
698 # return @{ $lookup->{$k} };
699 # } else {
700 # return $lookup->{$k};
701 # }
702
703 $rec = $old_rec;
704
705 warn "## lookup returns = ", dump(@out), $/ if ($debug);
706
707 if ($#out == 0) {
708 return $out[0];
709 } else {
710 return @out;
711 }
712 }
713
714 =head2 save_into_lookup
715
716 Save value into lookup. It associates current database, input
717 and specific keys with one or more values which will be
718 associated over MFN.
719
720 MFN will be extracted from first occurence current of field 000
721 in current record, or if it doesn't exist from L<_set_config> C<_mfn>.
722
723 my $nr = save_into_lookup($database,$input,$key,sub {
724 # code which produce one or more values
725 });
726
727 It returns number of items saved.
728
729 This function shouldn't be called directly, it's called from code created by
730 L<WebPAC::Parser>.
731
732 =cut
733
734 sub save_into_lookup {
735 my ($database,$input,$key,$coderef) = @_;
736 die "save_into_lookup needs database" unless defined($database);
737 die "save_into_lookup needs input" unless defined($input);
738 die "save_into_lookup needs key" unless defined($key);
739 die "save_into_lookup needs CODE" unless ( defined($coderef) && ref($coderef) eq 'CODE' );
740
741 warn "## save_into_lookup rec = ", dump($rec), " config = ", dump($config), $/ if ($debug > 2);
742
743 my $mfn =
744 defined($rec->{'000'}->[0]) ? $rec->{'000'}->[0] :
745 defined($config->{_mfn}) ? $config->{_mfn} :
746 die "mfn not defined or zero";
747
748 my $nr = 0;
749
750 foreach my $v ( $coderef->() ) {
751 $lookup->{$database}->{$input}->{$key}->{$v}->{$mfn}++;
752 warn "# saved lookup $database/$input/$key [$v] $mfn\n" if ($debug > 1);
753 $nr++;
754 }
755
756 return $nr;
757 }
758
759 =head2 config
760
761 Consult config values stored in C<config.yml>
762
763 # return database code (key under databases in yaml)
764 $database_code = config(); # use _ from hash
765 $database_name = config('name');
766 $database_input_name = config('input name');
767
768 Up to three levels are supported.
769
770 =cut
771
772 sub config {
773 return unless ($config);
774
775 my $p = shift;
776
777 $p ||= '';
778
779 my $v;
780
781 warn "### getting config($p)\n" if ($debug > 1);
782
783 my @p = split(/\s+/,$p);
784 if ($#p < 0) {
785 $v = $config->{ '_' }; # special, database code
786 } else {
787
788 my $c = dclone( $config );
789
790 foreach my $k (@p) {
791 warn "### k: $k c = ",dump($c),$/ if ($debug > 1);
792 if (ref($c) eq 'ARRAY') {
793 $c = shift @$c;
794 warn "config($p) taking first occurence of '$k', probably not what you wanted!\n";
795 last;
796 }
797
798 if (! defined($c->{$k}) ) {
799 $c = undef;
800 last;
801 } else {
802 $c = $c->{$k};
803 }
804 }
805 $v = $c if ($c);
806
807 }
808
809 warn "## config( '$p' ) = ",dump( $v ),$/ if ($v && $debug);
810 warn "config( '$p' ) is empty\n" if (! $v);
811
812 return $v;
813 }
814
815 =head2 id
816
817 Returns unique id of this record
818
819 $id = id();
820
821 Returns C<42/2> for 2nd occurence of MFN 42.
822
823 =cut
824
825 sub id {
826 my $mfn = $config->{_mfn} || die "no _mfn in config data";
827 return $mfn . ( WebPAC::Normalize::MARC::_created_marc_records() || '' );
828 }
829
830 =head2 join_with
831
832 Joins walues with some delimiter
833
834 $v = join_with(", ", @v);
835
836 =cut
837
838 sub join_with {
839 my $d = shift;
840 warn "### join_with('$d',",dump(@_),")\n" if ($debug > 2);
841 my $v = join($d, grep { defined($_) && $_ ne '' } @_);
842 return '' unless defined($v);
843 return $v;
844 }
845
846 =head2 split_rec_on
847
848 Split record subfield on some regex and take one of parts out
849
850 $a_before_semi_column =
851 split_rec_on('200','a', /\s*;\s*/, $part);
852
853 C<$part> is optional number of element. First element is
854 B<1>, not 0!
855
856 If there is no C<$part> parameter or C<$part> is 0, this function will
857 return all values produced by splitting.
858
859 =cut
860
861 sub split_rec_on {
862 die "split_rec_on need (fld,sf,regex[,part]" if ($#_ < 2);
863
864 my ($fld, $sf, $regex, $part) = @_;
865 warn "### regex ", ref($regex), $regex, $/ if ($debug > 2);
866
867 my @r = rec( $fld, $sf );
868 my $v = shift @r;
869 warn "### first rec($fld,$sf) = ",dump($v),$/ if ($debug > 2);
870
871 return '' if ( ! defined($v) || $v =~ /^\s*$/);
872
873 my @s = split( $regex, $v );
874 warn "## split_rec_on($fld,$sf,$regex,$part) = ",dump(@s),$/ if ($debug > 1);
875 if ($part && $part > 0) {
876 return $s[ $part - 1 ];
877 } else {
878 return @s;
879 }
880 }
881
882 my $hash;
883
884 =head2 set
885
886 set( key => 'value' );
887
888 =cut
889
890 sub set {
891 my ($k,$v) = @_;
892 warn "## set ( $k => ", dump($v), " )", $/ if ( $debug );
893 $hash->{$k} = $v;
894 };
895
896 =head2 get
897
898 get( 'key' );
899
900 =cut
901
902 sub get {
903 my $k = shift || return;
904 my $v = $hash->{$k};
905 warn "## get $k = ", dump( $v ), $/ if ( $debug );
906 return $v;
907 }
908
909 =head2 count
910
911 if ( count( @result ) == 1 ) {
912 # do something if only 1 result is there
913 }
914
915 =cut
916
917 sub count {
918 warn "## count ",dump(@_),$/ if ( $debug );
919 return @_ . '';
920 }
921
922 # END
923 1;

  ViewVC Help
Powered by ViewVC 1.1.26