/[webpac2]/trunk/lib/WebPAC/Normalize.pm
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Diff of /trunk/lib/WebPAC/Normalize.pm

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 536 by dpavlin, Mon Jun 26 16:39:51 2006 UTC revision 547 by dpavlin, Thu Jun 29 23:19:26 2006 UTC
# Line 1  Line 1 
1  package WebPAC::Normalize;  package WebPAC::Normalize;
2  use Exporter 'import';  use Exporter 'import';
3  @EXPORT = qw/  @EXPORT = qw/
4          set_rec set_lookup          _set_rec _set_lookup
5          get_ds clean_ds          _get_ds _clean_ds
6    
7          tag search display          tag search display
8            marc marc_indicators marc_repeatable_subfield
9    
10          rec1 rec2 rec          rec1 rec2 rec
11          regex prefix suffix surround          regex prefix suffix surround
12          first lookup join_with          first lookup join_with
# Line 14  use strict; Line 17  use strict;
17    
18  #use base qw/WebPAC::Common/;  #use base qw/WebPAC::Common/;
19  use Data::Dumper;  use Data::Dumper;
20    use Encode qw/from_to/;
21    
22  =head1 NAME  =head1 NAME
23    
# Line 21  WebPAC::Normalize - describe normalisato Line 25  WebPAC::Normalize - describe normalisato
25    
26  =head1 VERSION  =head1 VERSION
27    
28  Version 0.04  Version 0.06
29    
30  =cut  =cut
31    
32  our $VERSION = '0.04';  our $VERSION = '0.06';
33    
34  =head1 SYNOPSIS  =head1 SYNOPSIS
35    
# Line 38  means that you check it's validity befor Line 42  means that you check it's validity befor
42  C<perl -c normalize.pl>.  C<perl -c normalize.pl>.
43    
44  Normalisation can generate multiple output normalized data. For now, supported output  Normalisation can generate multiple output normalized data. For now, supported output
45  types (on the left side of definition) are: C<tag>, C<display> and C<search>.  types (on the left side of definition) are: C<tag>, C<display>, C<search> and
46    C<marc>.
47    
48  =head1 FUNCTIONS  =head1 FUNCTIONS
49    
50    Functions which start with C<_> are private and used by WebPAC internally.
51    All other functions are available for use within normalisation rules.
52    
53  =head2 data_structure  =head2 data_structure
54    
55  Return data structure  Return data structure
56    
57    my $ds = WebPAC::Normalize(    my $ds = WebPAC::Normalize::data_structure(
58          lookup => $lookup->lookup_hash,          lookup => $lookup->lookup_hash,
59          row => $row,          row => $row,
60          rules => $normalize_pl_config,          rules => $normalize_pl_config,
61            marc_encoding => 'utf-8',
62    );    );
63    
64    Options C<lookup>, C<row>, C<rules> and C<log> are mandatory while all
65    other are optional.
66    
67  This function will B<die> if normalizastion can't be evaled.  This function will B<die> if normalizastion can't be evaled.
68    
69    Since this function isn't exported you have to call it with
70    C<WebPAC::Normalize::data_structure>.
71    
72  =cut  =cut
73    
74  sub data_structure {  sub data_structure {
# Line 63  sub data_structure { Line 78  sub data_structure {
78          die "need normalisation argument" unless ($arg->{rules});          die "need normalisation argument" unless ($arg->{rules});
79    
80          no strict 'subs';          no strict 'subs';
81          set_lookup( $arg->{lookup} );          _set_lookup( $arg->{lookup} );
82          set_rec( $arg->{row} );          _set_rec( $arg->{row} );
83          clean_ds();          _clean_ds( %{ $arg } );
84          eval "$arg->{rules}";          eval "$arg->{rules}";
85          die "error evaling $arg->{rules}: $@\n" if ($@);          die "error evaling $arg->{rules}: $@\n" if ($@);
86          return get_ds();  
87            return _get_ds();
88  }  }
89    
90  =head2 set_rec  =head2 _set_rec
91    
92  Set current record hash  Set current record hash
93    
94    set_rec( $rec );    _set_rec( $rec );
95    
96  =cut  =cut
97    
98  my $rec;  my $rec;
99    
100  sub set_rec {  sub _set_rec {
101          $rec = shift or die "no record hash";          $rec = shift or die "no record hash";
102  }  }
103    
104    =head2 _get_ds
105    
106    Return hash formatted as data structure
107    
108      my $ds = _get_ds();
109    
110    =cut
111    
112    my ($out,$marc_record, $marc_encoding, $marc_repeatable_subfield, $marc_indicators);
113    
114    sub _get_ds {
115            return $out;
116    }
117    
118    =head2 _clean_ds
119    
120    Clean data structure hash for next record
121    
122      _clean_ds();
123    
124    =cut
125    
126    sub _clean_ds {
127            my $a = {@_};
128            ($out,$marc_record, $marc_encoding, $marc_repeatable_subfield, $marc_indicators) = (undef);
129            $marc_encoding = $a->{marc_encoding};
130    }
131    
132    =head2 _set_lookup
133    
134    Set current lookup hash
135    
136      _set_lookup( $lookup );
137    
138    =cut
139    
140    my $lookup;
141    
142    sub _set_lookup {
143            $lookup = shift;
144    }
145    
146    =head2 _get_marc_fields
147    
148    Get all fields defined by calls to C<marc>
149    
150            $marc->add_fields( WebPAC::Normalize:_get_marc_fields() );
151    
152    
153    
154    We are using I<magic> which detect repeatable fields only from
155    sequence of field/subfield data generated by normalization.
156    
157    Repeatable field is created if there is second occurence of same subfield or
158    if any of indicators are different. This is sane for most cases except for
159    non-repeatable fields with repeatable subfields.
160    
161    You can change behaviour of that using C<marc_repeatable_subfield>.
162    
163    =cut
164    
165    sub _get_marc_fields {
166            my @m;
167            my $last;
168            foreach my $row (@{ $marc_record }) {
169                    if ($last &&
170                            $last->[0] eq $row->[0] &&              # check if field is same
171                            $last->[1] eq $row->[1] &&              # check for i1
172                            $last->[2] eq $row->[2] &&              # and for i2
173                                    ( $last->[3] ne $row->[3] ||                            # and subfield is different
174                                    $last->[3] eq $row->[3] &&                                      # or subfield is same,
175                                    $marc_repeatable_subfield->{ $row->[3] }        # but is repeatable
176                            )
177                    ) {
178                            push @$last, ( $row->[3] , $row->[4] );
179                            warn "## ++ added $row->[0] ^$row->[3] to $last->[0]\n";
180                            next;
181                    } elsif ($last) {
182                            push @m, $last;
183                    }
184    
185                    $last = $row;
186            }
187    
188            push @m, $last if ($last);
189    
190            return @m;
191    }
192    
193    =head1 Functions to create C<data_structure>
194    
195    Those functions generally have to first in your normalization file.
196    
197  =head2 tag  =head2 tag
198    
199  Define new tag for I<search> and I<display>.  Define new tag for I<search> and I<display>.
# Line 94  Define new tag for I<search> and I<displ Line 203  Define new tag for I<search> and I<displ
203    
204  =cut  =cut
205    
 my $out;  
   
206  sub tag {  sub tag {
207          my $name = shift or die "tag needs name as first argument";          my $name = shift or die "tag needs name as first argument";
208          my @o = grep { defined($_) && $_ ne '' } @_;          my @o = grep { defined($_) && $_ ne '' } @_;
# Line 137  sub search { Line 244  sub search {
244          $out->{$name}->{search} = \@o;          $out->{$name}->{search} = \@o;
245  }  }
246    
247  =head2 get_ds  =head2 marc
248    
249  Return hash formatted as data structure  Save value for MARC field
250    
251    my $ds = get_ds();    marc('900','a', rec('200','a') );
252    
253  =cut  =cut
254    
255  sub get_ds {  sub marc {
256          return $out;          my $f = shift or die "marc needs field";
257            die "marc field must be numer" unless ($f =~ /^\d+$/);
258    
259            my $sf = shift or die "marc needs subfield";
260    
261            foreach (@_) {
262                    my $v = $_;             # make var read-write for Encode
263                    next unless (defined($v) && $v !~ /^\s*$/);
264                    from_to($v, 'iso-8859-2', $marc_encoding) if ($marc_encoding);
265                    push @{ $marc_record }, [
266                            $f,
267                            $marc_indicators->{$f}->{i1} || ' ',
268                            $marc_indicators->{$f}->{i2} || ' ',
269                            $sf => $v
270                    ];
271            }
272  }  }
273    
274  =head2 clean_ds  =head2 marc_repeatable_subfield
275    
276  Clean data structure hash for next record  Save values for MARC repetable subfield
277    
278    clean_ds();    marc_repeatable_subfield('910', 'z', rec('909') );
279    
280  =cut  =cut
281    
282  sub clean_ds {  sub marc_repeatable_subfield {
283          $out = undef;          die "marc_repeatable_subfield need subfield!\n" unless (defined($_[1]));
284            $marc_repeatable_subfield->{ $_[1] }++;
285            marc(@_);
286  }  }
287    
288  =head2 set_lookup  =head2 marc_indicators
289    
290  Set current lookup hash  Set both indicators for MARC field
291    
292    set_lookup( $lookup );    marc_indicators('900', ' ', 1);
293    
294  =cut  Any indicator value other than C<0-9> will be treated as undefined.
295    
296  my $lookup;  =cut
297    
298  sub set_lookup {  sub marc_indicators {
299          $lookup = shift;          my $f = shift || die "marc_indicators need field!\n";
300            my ($i1,$i2) = @_;
301            die "marc_indicators($f, ...) need i1!\n" unless(defined($i1));
302            die "marc_indicators($f, $i1, ...) need i2!\n" unless(defined($i2));
303    
304            $i1 = ' ' if ($i1 !~ /^\d$/);
305            $i2 = ' ' if ($i2 !~ /^\d$/);
306            $marc_indicators->{$f}->{i1} = $i1;
307            $marc_indicators->{$f}->{i2} = $i2;
308  }  }
309    
310    
311    =head1 Functions to extract data from input
312    
313    This function should be used inside functions to create C<data_structure> described
314    above.
315    
316  =head2 rec1  =head2 rec1
317    
318  Return all values in some field  Return all values in some field

Legend:
Removed from v.536  
changed lines
  Added in v.547

  ViewVC Help
Powered by ViewVC 1.1.26