/[webpac2]/trunk/lib/WebPAC/Normalize.pm
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Contents of /trunk/lib/WebPAC/Normalize.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1036 - (show annotations)
Mon Nov 12 11:10:48 2007 UTC (15 years, 2 months ago) by dpavlin
File size: 17534 byte(s)
 r1610@llin:  dpavlin | 2007-11-12 12:10:45 +0100
 split MARC handling routines into WebPAC::Normalize::MARC [2.31]

1 package WebPAC::Normalize;
2 use Exporter 'import';
3 our @EXPORT = qw/
4 _set_ds _set_lookup
5 _set_load_row
6 _get_ds _clean_ds
7 _debug
8 _pack_subfields_hash
9
10 search_display search display sorted
11
12 rec1 rec2 rec
13 frec frec_eq frec_ne
14 regex prefix suffix surround
15 first lookup join_with
16 save_into_lookup
17
18 split_rec_on
19
20 get set
21 count
22
23 /;
24
25 use warnings;
26 use strict;
27
28 #use base qw/WebPAC::Common/;
29 use Data::Dump qw/dump/;
30 use Storable qw/dclone/;
31 use Carp qw/confess/;
32
33 # debugging warn(s)
34 my $debug = 0;
35
36 # FIXME
37 use WebPAC::Normalize::ISBN;
38 push @EXPORT, ( 'isbn_10', 'isbn_13' );
39
40 use WebPAC::Normalize::MARC;
41 push @EXPORT, ( qw/
42 marc marc_indicators marc_repeatable_subfield
43 marc_compose marc_leader marc_fixed
44 marc_duplicate marc_remove marc_count
45 marc_original_order
46 marc_template
47 /);
48
49 =head1 NAME
50
51 WebPAC::Normalize - describe normalisaton rules using sets
52
53 =cut
54
55 our $VERSION = '0.35';
56
57 =head1 SYNOPSIS
58
59 This module uses C<conf/normalize/*.pl> files to perform normalisation
60 from input records using perl functions which are specialized for set
61 processing.
62
63 Sets are implemented as arrays, and normalisation file is valid perl, which
64 means that you check it's validity before running WebPAC using
65 C<perl -c normalize.pl>.
66
67 Normalisation can generate multiple output normalized data. For now, supported output
68 types (on the left side of definition) are: C<search_display>, C<display>, C<search> and
69 C<marc>.
70
71 =head1 FUNCTIONS
72
73 Functions which start with C<_> are private and used by WebPAC internally.
74 All other functions are available for use within normalisation rules.
75
76 =head2 data_structure
77
78 Return data structure
79
80 my $ds = WebPAC::Normalize::data_structure(
81 lookup => $lookup_hash,
82 row => $row,
83 rules => $normalize_pl_config,
84 marc_encoding => 'utf-8',
85 config => $config,
86 load_row_coderef => sub {
87 my ($database,$input,$mfn) = @_;
88 $store->load_row( database => $database, input => $input, id => $mfn );
89 },
90 );
91
92 Options C<row>, C<rules> and C<log> are mandatory while all
93 other are optional.
94
95 C<load_row_coderef> is closure only used when executing lookups, so they will
96 die if it's not defined.
97
98 This function will B<die> if normalizastion can't be evaled.
99
100 Since this function isn't exported you have to call it with
101 C<WebPAC::Normalize::data_structure>.
102
103 =cut
104
105 my $load_row_coderef;
106
107 sub data_structure {
108 my $arg = {@_};
109
110 die "need row argument" unless ($arg->{row});
111 die "need normalisation argument" unless ($arg->{rules});
112
113 _set_lookup( $arg->{lookup} ) if defined($arg->{lookup});
114 _set_ds( $arg->{row} );
115 _set_config( $arg->{config} ) if defined($arg->{config});
116 _clean_ds( %{ $arg } );
117 $load_row_coderef = $arg->{load_row_coderef};
118
119 no strict 'subs';
120 no warnings 'redefine';
121 eval "$arg->{rules};";
122 die "error evaling $arg->{rules}: $@\n" if ($@);
123
124 return _get_ds();
125 }
126
127 =head2 _set_ds
128
129 Set current record hash
130
131 _set_ds( $rec );
132
133 =cut
134
135 my $rec;
136
137 sub _set_ds {
138 $rec = shift or die "no record hash";
139 $WebPAC::Normalize::MARC::rec = $rec;
140 }
141
142 =head2
143
144 my $rec = _get_rec();
145
146 =cut
147
148 sub _get_rec { $rec };
149
150 =head2 _set_config
151
152 Set current config hash
153
154 _set_config( $config );
155
156 Magic keys are:
157
158 =over 4
159
160 =item _
161
162 Code of current database
163
164 =item _mfn
165
166 Current MFN
167
168 =back
169
170 =cut
171
172 my $config;
173
174 sub _set_config {
175 $config = shift;
176 }
177
178 =head2 _get_ds
179
180 Return hash formatted as data structure
181
182 my $ds = _get_ds();
183
184 =cut
185
186 my $out;
187
188 sub _get_ds {
189 #warn "## out = ",dump($out);
190 return $out;
191 }
192
193 =head2 _clean_ds
194
195 Clean data structure hash for next record
196
197 _clean_ds();
198
199 =cut
200
201 sub _clean_ds {
202 my $a = {@_};
203 $out = undef;
204 WebPAC::Normalize::MARC::_clean();
205 }
206
207 =head2 _set_lookup
208
209 Set current lookup hash
210
211 _set_lookup( $lookup );
212
213 =cut
214
215 my $lookup;
216
217 sub _set_lookup {
218 $lookup = shift;
219 }
220
221 =head2 _get_lookup
222
223 Get current lookup hash
224
225 my $lookup = _get_lookup();
226
227 =cut
228
229 sub _get_lookup {
230 return $lookup;
231 }
232
233 =head2 _set_load_row
234
235 Setup code reference which will return L<data_structure> from
236 L<WebPAC::Store>
237
238 _set_load_row(sub {
239 my ($database,$input,$mfn) = @_;
240 $store->load_row( database => $database, input => $input, id => $mfn );
241 });
242
243 =cut
244
245 sub _set_load_row {
246 my $coderef = shift;
247 confess "argument isn't CODE" unless ref($coderef) eq 'CODE';
248
249 $load_row_coderef = $coderef;
250 }
251
252 =head2 _debug
253
254 Change level of debug warnings
255
256 _debug( 2 );
257
258 =cut
259
260 sub _debug {
261 my $l = shift;
262 return $debug unless defined($l);
263 warn "debug level $l",$/ if ($l > 0);
264 $debug = $l;
265 $WebPAC::Normalize::MARC::debug = $debug;
266 }
267
268 =head1 Functions to create C<data_structure>
269
270 Those functions generally have to first in your normalization file.
271
272 =head2 search_display
273
274 Define output for L<search> and L<display> at the same time
275
276 search_display('Title', rec('200','a') );
277
278
279 =cut
280
281 sub search_display {
282 my $name = shift or die "search_display needs name as first argument";
283 my @o = grep { defined($_) && $_ ne '' } @_;
284 return unless (@o);
285 $out->{$name}->{search} = \@o;
286 $out->{$name}->{display} = \@o;
287 }
288
289 =head2 tag
290
291 Old name for L<search_display>, but supported
292
293 =cut
294
295 sub tag {
296 search_display( @_ );
297 }
298
299 =head2 display
300
301 Define output just for I<display>
302
303 @v = display('Title', rec('200','a') );
304
305 =cut
306
307 sub _field {
308 my $type = shift or confess "need type -- BUG?";
309 my $name = shift or confess "needs name as first argument";
310 my @o = grep { defined($_) && $_ ne '' } @_;
311 return unless (@o);
312 $out->{$name}->{$type} = \@o;
313 }
314
315 sub display { _field( 'display', @_ ) }
316
317 =head2 search
318
319 Prepare values just for I<search>
320
321 @v = search('Title', rec('200','a') );
322
323 =cut
324
325 sub search { _field( 'search', @_ ) }
326
327 =head2 sorted
328
329 Insert into lists which will be automatically sorted
330
331 sorted('Title', rec('200','a') );
332
333 =cut
334
335 sub sorted { _field( 'sorted', @_ ) }
336
337
338
339 =head1 Functions to extract data from input
340
341 This function should be used inside functions to create C<data_structure> described
342 above.
343
344 =head2 _pack_subfields_hash
345
346 @subfields = _pack_subfields_hash( $h );
347 $subfields = _pack_subfields_hash( $h, 1 );
348
349 Return each subfield value in array or pack them all together and return scalar
350 with subfields (denoted by C<^>) and values.
351
352 =cut
353
354 sub _pack_subfields_hash {
355
356 warn "## _pack_subfields_hash( ",dump(@_), " )\n" if ($debug > 1);
357
358 my ($h,$include_subfields) = @_;
359
360 # sanity and ease of use
361 return $h if (ref($h) ne 'HASH');
362
363 if ( defined($h->{subfields}) ) {
364 my $sfs = delete $h->{subfields} || die "no subfields?";
365 my @out;
366 while (@$sfs) {
367 my $sf = shift @$sfs;
368 push @out, '^' . $sf if ($include_subfields);
369 my $o = shift @$sfs;
370 if ($o == 0 && ref( $h->{$sf} ) ne 'ARRAY' ) {
371 # single element subfields are not arrays
372 #warn "====> $sf $o / $#$sfs ", dump( $sfs, $h->{$sf} ), "\n";
373
374 push @out, $h->{$sf};
375 } else {
376 #warn "====> $sf $o / $#$sfs ", dump( $sfs, $h->{$sf} ), "\n";
377 push @out, $h->{$sf}->[$o];
378 }
379 }
380 if ($include_subfields) {
381 return join('', @out);
382 } else {
383 return @out;
384 }
385 } else {
386 if ($include_subfields) {
387 my $out = '';
388 foreach my $sf (sort keys %$h) {
389 if (ref($h->{$sf}) eq 'ARRAY') {
390 $out .= '^' . $sf . join('^' . $sf, @{ $h->{$sf} });
391 } else {
392 $out .= '^' . $sf . $h->{$sf};
393 }
394 }
395 return $out;
396 } else {
397 # FIXME this should probably be in alphabetical order instead of hash order
398 values %{$h};
399 }
400 }
401 }
402
403 =head2 rec1
404
405 Return all values in some field
406
407 @v = rec1('200')
408
409 TODO: order of values is probably same as in source data, need to investigate that
410
411 =cut
412
413 sub rec1 {
414 my $f = shift;
415 warn "rec1($f) = ", dump( $rec->{$f} ), $/ if ($debug > 1);
416 return unless (defined($rec) && defined($rec->{$f}));
417 warn "rec1($f) = ", dump( $rec->{$f} ), $/ if ($debug > 1);
418 if (ref($rec->{$f}) eq 'ARRAY') {
419 my @out;
420 foreach my $h ( @{ $rec->{$f} } ) {
421 if (ref($h) eq 'HASH') {
422 push @out, ( _pack_subfields_hash( $h ) );
423 } else {
424 push @out, $h;
425 }
426 }
427 return @out;
428 } elsif( defined($rec->{$f}) ) {
429 return $rec->{$f};
430 }
431 }
432
433 =head2 rec2
434
435 Return all values in specific field and subfield
436
437 @v = rec2('200','a')
438
439 =cut
440
441 sub rec2 {
442 my $f = shift;
443 return unless (defined($rec && $rec->{$f}));
444 my $sf = shift;
445 warn "rec2($f,$sf) = ", dump( $rec->{$f} ), $/ if ($debug > 1);
446 return map {
447 if (ref($_->{$sf}) eq 'ARRAY') {
448 @{ $_->{$sf} };
449 } else {
450 $_->{$sf};
451 }
452 } grep { ref($_) eq 'HASH' && $_->{$sf} } @{ $rec->{$f} };
453 }
454
455 =head2 rec
456
457 syntaxtic sugar for
458
459 @v = rec('200')
460 @v = rec('200','a')
461
462 If rec() returns just single value, it will
463 return scalar, not array.
464
465 =cut
466
467 sub rec {
468 my @out;
469 if ($#_ == 0) {
470 @out = rec1(@_);
471 } elsif ($#_ == 1) {
472 @out = rec2(@_);
473 }
474 if ($#out == 0 && ! wantarray) {
475 return $out[0];
476 } elsif (@out) {
477 return @out;
478 } else {
479 return '';
480 }
481 }
482
483 =head2 frec
484
485 Returns first value from field
486
487 $v = frec('200');
488 $v = frec('200','a');
489
490 =cut
491
492 sub frec {
493 my @out = rec(@_);
494 warn "rec(",dump(@_),") has more than one return value, ignoring\n" if $#out > 0;
495 return shift @out;
496 }
497
498 =head2 frec_eq
499
500 =head2 frec_ne
501
502 Check if first values from two fields are same or different
503
504 if ( frec_eq( 900 => 'a', 910 => 'c' ) ) {
505 # values are same
506 } else {
507 # values are different
508 }
509
510 Strictly speaking C<frec_eq> and C<frec_ne> wouldn't be needed if you
511 could write something like:
512
513 if ( frec( '900','a' ) eq frec( '910','c' ) ) {
514 # yada tada
515 }
516
517 but you can't since our parser L<WebPAC::Parser> will remove all whitespaces
518 in order to parse text and create invalid function C<eqfrec>.
519
520 =cut
521
522 sub frec_eq {
523 my ( $f1,$sf1, $f2, $sf2 ) = @_;
524 return (rec( $f1, $sf1 ))[0] eq (rec( $f2, $sf2 ))[0];
525 }
526
527 sub frec_ne {
528 return ! frec_eq( @_ );
529 }
530
531 =head2 regex
532
533 Apply regex to some or all values
534
535 @v = regex( 's/foo/bar/g', @v );
536
537 =cut
538
539 sub regex {
540 my $r = shift;
541 my @out;
542 #warn "r: $r\n", dump(\@_);
543 foreach my $t (@_) {
544 next unless ($t);
545 eval "\$t =~ $r";
546 push @out, $t if ($t && $t ne '');
547 }
548 return @out;
549 }
550
551 =head2 prefix
552
553 Prefix all values with a string
554
555 @v = prefix( 'my_', @v );
556
557 =cut
558
559 sub prefix {
560 my $p = shift;
561 return @_ unless defined( $p );
562 return map { $p . $_ } grep { defined($_) } @_;
563 }
564
565 =head2 suffix
566
567 suffix all values with a string
568
569 @v = suffix( '_my', @v );
570
571 =cut
572
573 sub suffix {
574 my $s = shift;
575 return @_ unless defined( $s );
576 return map { $_ . $s } grep { defined($_) } @_;
577 }
578
579 =head2 surround
580
581 surround all values with a two strings
582
583 @v = surround( 'prefix_', '_suffix', @v );
584
585 =cut
586
587 sub surround {
588 my $p = shift;
589 my $s = shift;
590 $p = '' unless defined( $p );
591 $s = '' unless defined( $s );
592 return map { $p . $_ . $s } grep { defined($_) } @_;
593 }
594
595 =head2 first
596
597 Return first element
598
599 $v = first( @v );
600
601 =cut
602
603 sub first {
604 my $r = shift;
605 return $r;
606 }
607
608 =head2 lookup
609
610 Consult lookup hashes for some value
611
612 @v = lookup(
613 sub {
614 'ffkk/peri/mfn'.rec('000')
615 },
616 'ffkk','peri','200-a-200-e',
617 sub {
618 first(rec(200,'a')).' '.first(rec('200','e'))
619 }
620 );
621
622 Code like above will be B<automatically generated> using L<WebPAC::Parse> from
623 normal lookup definition in C<conf/lookup/something.pl> which looks like:
624
625 lookup(
626 # which results to return from record recorded in lookup
627 sub { 'ffkk/peri/mfn' . rec('000') },
628 # from which database and input
629 'ffkk','peri',
630 # such that following values match
631 sub { first(rec(200,'a')) . ' ' . first(rec('200','e')) },
632 # if this part is missing, we will try to match same fields
633 # from lookup record and current one, or you can override
634 # which records to use from current record using
635 sub { rec('900','x') . ' ' . rec('900','y') },
636 )
637
638 You can think about this lookup as SQL (if that helps):
639
640 select
641 sub { what }
642 from
643 database, input
644 where
645 sub { filter from lookuped record }
646 having
647 sub { optional filter on current record }
648
649 Easy as pie, right?
650
651 =cut
652
653 sub lookup {
654 my ($what, $database, $input, $key, $having) = @_;
655
656 confess "lookup needs 5 arguments: what, database, input, key, having\n" unless ($#_ == 4);
657
658 warn "## lookup ($database, $input, $key)", $/ if ($debug > 1);
659 return unless (defined($lookup->{$database}->{$input}->{$key}));
660
661 confess "lookup really need load_row_coderef added to data_structure\n" unless ($load_row_coderef);
662
663 my $mfns;
664 my @having = $having->();
665
666 warn "## having = ", dump( @having ) if ($debug > 2);
667
668 foreach my $h ( @having ) {
669 if (defined($lookup->{$database}->{$input}->{$key}->{$h})) {
670 warn "lookup for $database/$input/$key/$h return ",dump($lookup->{$database}->{$input}->{$key}->{$h}),"\n" if ($debug);
671 $mfns->{$_}++ foreach keys %{ $lookup->{$database}->{$input}->{$key}->{$h} };
672 }
673 }
674
675 return unless ($mfns);
676
677 my @mfns = sort keys %$mfns;
678
679 warn "# lookup loading $database/$input/$key mfn ", join(",",@mfns)," having ",dump(@having),"\n" if ($debug);
680
681 my $old_rec = $rec;
682 my @out;
683
684 foreach my $mfn (@mfns) {
685 $rec = $load_row_coderef->( $database, $input, $mfn );
686
687 warn "got $database/$input/$mfn = ", dump($rec), $/ if ($debug);
688
689 my @vals = $what->();
690
691 push @out, ( @vals );
692
693 warn "lookup for mfn $mfn returned ", dump(@vals), $/ if ($debug);
694 }
695
696 # if (ref($lookup->{$k}) eq 'ARRAY') {
697 # return @{ $lookup->{$k} };
698 # } else {
699 # return $lookup->{$k};
700 # }
701
702 $rec = $old_rec;
703
704 warn "## lookup returns = ", dump(@out), $/ if ($debug);
705
706 if ($#out == 0) {
707 return $out[0];
708 } else {
709 return @out;
710 }
711 }
712
713 =head2 save_into_lookup
714
715 Save value into lookup. It associates current database, input
716 and specific keys with one or more values which will be
717 associated over MFN.
718
719 MFN will be extracted from first occurence current of field 000
720 in current record, or if it doesn't exist from L<_set_config> C<_mfn>.
721
722 my $nr = save_into_lookup($database,$input,$key,sub {
723 # code which produce one or more values
724 });
725
726 It returns number of items saved.
727
728 This function shouldn't be called directly, it's called from code created by
729 L<WebPAC::Parser>.
730
731 =cut
732
733 sub save_into_lookup {
734 my ($database,$input,$key,$coderef) = @_;
735 die "save_into_lookup needs database" unless defined($database);
736 die "save_into_lookup needs input" unless defined($input);
737 die "save_into_lookup needs key" unless defined($key);
738 die "save_into_lookup needs CODE" unless ( defined($coderef) && ref($coderef) eq 'CODE' );
739
740 warn "## save_into_lookup rec = ", dump($rec), " config = ", dump($config), $/ if ($debug > 2);
741
742 my $mfn =
743 defined($rec->{'000'}->[0]) ? $rec->{'000'}->[0] :
744 defined($config->{_mfn}) ? $config->{_mfn} :
745 die "mfn not defined or zero";
746
747 my $nr = 0;
748
749 foreach my $v ( $coderef->() ) {
750 $lookup->{$database}->{$input}->{$key}->{$v}->{$mfn}++;
751 warn "# saved lookup $database/$input/$key [$v] $mfn\n" if ($debug > 1);
752 $nr++;
753 }
754
755 return $nr;
756 }
757
758 =head2 config
759
760 Consult config values stored in C<config.yml>
761
762 # return database code (key under databases in yaml)
763 $database_code = config(); # use _ from hash
764 $database_name = config('name');
765 $database_input_name = config('input name');
766
767 Up to three levels are supported.
768
769 =cut
770
771 sub config {
772 return unless ($config);
773
774 my $p = shift;
775
776 $p ||= '';
777
778 my $v;
779
780 warn "### getting config($p)\n" if ($debug > 1);
781
782 my @p = split(/\s+/,$p);
783 if ($#p < 0) {
784 $v = $config->{ '_' }; # special, database code
785 } else {
786
787 my $c = dclone( $config );
788
789 foreach my $k (@p) {
790 warn "### k: $k c = ",dump($c),$/ if ($debug > 1);
791 if (ref($c) eq 'ARRAY') {
792 $c = shift @$c;
793 warn "config($p) taking first occurence of '$k', probably not what you wanted!\n";
794 last;
795 }
796
797 if (! defined($c->{$k}) ) {
798 $c = undef;
799 last;
800 } else {
801 $c = $c->{$k};
802 }
803 }
804 $v = $c if ($c);
805
806 }
807
808 warn "## config( '$p' ) = ",dump( $v ),$/ if ($v && $debug);
809 warn "config( '$p' ) is empty\n" if (! $v);
810
811 return $v;
812 }
813
814 =head2 id
815
816 Returns unique id of this record
817
818 $id = id();
819
820 Returns C<42/2> for 2nd occurence of MFN 42.
821
822 =cut
823
824 sub id {
825 my $mfn = $config->{_mfn} || die "no _mfn in config data";
826 return $mfn . ( WebPAC::Normalize::MARC::_created_marc_records() || '' );
827 }
828
829 =head2 join_with
830
831 Joins walues with some delimiter
832
833 $v = join_with(", ", @v);
834
835 =cut
836
837 sub join_with {
838 my $d = shift;
839 warn "### join_with('$d',",dump(@_),")\n" if ($debug > 2);
840 my $v = join($d, grep { defined($_) && $_ ne '' } @_);
841 return '' unless defined($v);
842 return $v;
843 }
844
845 =head2 split_rec_on
846
847 Split record subfield on some regex and take one of parts out
848
849 $a_before_semi_column =
850 split_rec_on('200','a', /\s*;\s*/, $part);
851
852 C<$part> is optional number of element. First element is
853 B<1>, not 0!
854
855 If there is no C<$part> parameter or C<$part> is 0, this function will
856 return all values produced by splitting.
857
858 =cut
859
860 sub split_rec_on {
861 die "split_rec_on need (fld,sf,regex[,part]" if ($#_ < 2);
862
863 my ($fld, $sf, $regex, $part) = @_;
864 warn "### regex ", ref($regex), $regex, $/ if ($debug > 2);
865
866 my @r = rec( $fld, $sf );
867 my $v = shift @r;
868 warn "### first rec($fld,$sf) = ",dump($v),$/ if ($debug > 2);
869
870 return '' if ( ! defined($v) || $v =~ /^\s*$/);
871
872 my @s = split( $regex, $v );
873 warn "## split_rec_on($fld,$sf,$regex,$part) = ",dump(@s),$/ if ($debug > 1);
874 if ($part && $part > 0) {
875 return $s[ $part - 1 ];
876 } else {
877 return @s;
878 }
879 }
880
881 my $hash;
882
883 =head2 set
884
885 set( key => 'value' );
886
887 =cut
888
889 sub set {
890 my ($k,$v) = @_;
891 warn "## set ( $k => ", dump($v), " )", $/ if ( $debug );
892 $hash->{$k} = $v;
893 };
894
895 =head2 get
896
897 get( 'key' );
898
899 =cut
900
901 sub get {
902 my $k = shift || return;
903 my $v = $hash->{$k};
904 warn "## get $k = ", dump( $v ), $/ if ( $debug );
905 return $v;
906 }
907
908 =head2 count
909
910 if ( count( @result ) == 1 ) {
911 # do something if only 1 result is there
912 }
913
914 =cut
915
916 sub count {
917 warn "## count ",dump(@_),$/ if ( $debug );
918 return @_ . '';
919 }
920
921 # END
922 1;

  ViewVC Help
Powered by ViewVC 1.1.26