/[webpac2]/trunk/lib/WebPAC/Normalize.pm
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Contents of /trunk/lib/WebPAC/Normalize.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 544 - (show annotations)
Thu Jun 29 21:52:51 2006 UTC (17 years, 9 months ago) by dpavlin
File size: 7737 byte(s)
 r736@llin:  dpavlin | 2006-06-29 23:54:24 +0200
 oh, another bit of magic missing...

1 package WebPAC::Normalize;
2 use Exporter 'import';
3 @EXPORT = qw/
4 _set_rec _set_lookup
5 _get_ds _clean_ds
6
7 tag search display
8 marc21
9
10 rec1 rec2 rec
11 regex prefix suffix surround
12 first lookup join_with
13 /;
14
15 use warnings;
16 use strict;
17
18 #use base qw/WebPAC::Common/;
19 use Data::Dumper;
20 use Encode qw/from_to/;
21
22 =head1 NAME
23
24 WebPAC::Normalize - describe normalisaton rules using sets
25
26 =head1 VERSION
27
28 Version 0.06
29
30 =cut
31
32 our $VERSION = '0.06';
33
34 =head1 SYNOPSIS
35
36 This module uses C<conf/normalize/*.pl> files to perform normalisation
37 from input records using perl functions which are specialized for set
38 processing.
39
40 Sets are implemented as arrays, and normalisation file is valid perl, which
41 means that you check it's validity before running WebPAC using
42 C<perl -c normalize.pl>.
43
44 Normalisation can generate multiple output normalized data. For now, supported output
45 types (on the left side of definition) are: C<tag>, C<display>, C<search> and
46 C<marc21>.
47
48 =head1 FUNCTIONS
49
50 Functions which start with C<_> are private and used by WebPAC internally.
51 All other functions are available for use within normalisation rules.
52
53 =head2 data_structure
54
55 Return data structure
56
57 my $ds = WebPAC::Normalize::data_structure(
58 lookup => $lookup->lookup_hash,
59 row => $row,
60 rules => $normalize_pl_config,
61 marc_encoding => 'utf-8',
62 );
63
64 Options C<lookup>, C<row>, C<rules> and C<log> are mandatory while all
65 other are optional.
66
67 This function will B<die> if normalizastion can't be evaled.
68
69 Since this function isn't exported you have to call it with
70 C<WebPAC::Normalize::data_structure>.
71
72 =cut
73
74 sub data_structure {
75 my $arg = {@_};
76
77 die "need row argument" unless ($arg->{row});
78 die "need normalisation argument" unless ($arg->{rules});
79
80 no strict 'subs';
81 _set_lookup( $arg->{lookup} );
82 _set_rec( $arg->{row} );
83 _clean_ds( %{ $arg } );
84 eval "$arg->{rules}";
85 die "error evaling $arg->{rules}: $@\n" if ($@);
86
87 return _get_ds();
88 }
89
90 =head2 _set_rec
91
92 Set current record hash
93
94 _set_rec( $rec );
95
96 =cut
97
98 my $rec;
99
100 sub _set_rec {
101 $rec = shift or die "no record hash";
102 }
103
104 =head2 _get_ds
105
106 Return hash formatted as data structure
107
108 my $ds = _get_ds();
109
110 =cut
111
112 my $out;
113 my $marc21;
114 my $marc_encoding;
115
116 sub _get_ds {
117 return $out;
118 }
119
120 =head2 _clean_ds
121
122 Clean data structure hash for next record
123
124 _clean_ds();
125
126 =cut
127
128 sub _clean_ds {
129 my $a = {@_};
130 $out = undef;
131 $marc21 = undef;
132 $marc_encoding = $a->{marc_encoding};
133 }
134
135 =head2 _set_lookup
136
137 Set current lookup hash
138
139 _set_lookup( $lookup );
140
141 =cut
142
143 my $lookup;
144
145 sub _set_lookup {
146 $lookup = shift;
147 }
148
149 =head2 _get_marc21_fields
150
151 Get all fields defined by calls to C<marc21>
152
153 $marc->add_fields( WebPAC::Normalize:_get_marc21_fields() );
154
155
156
157 We are using I<magic> which detect repeatable fields only from
158 sequence of field/subfield data generated by normalization.
159
160 Repeatable field is created if there is second occurence of same subfield or
161 if any of indicators are different. This is sane for most cases except for
162 non-repeatable fields with repeatable subfields.
163
164 B<TODO>: implement exceptions to magic
165
166 =cut
167
168 sub _get_marc21_fields {
169 my @m;
170 my $last;
171 foreach my $row (@{ $marc21 }) {
172 if ($last &&
173 $last->[0] eq $row->[0] && # check if field is same
174 $last->[1] eq $row->[1] && # check for i1
175 $last->[2] eq $row->[2] && # and for i2
176 $last->[3] ne $row->[3] # and subfield is different
177 ) {
178 push @$last, ( $row->[3] , $row->[4] );
179 warn "## ++ added $row->[0] ^$row->[3] to $last->[0]\n";
180 next;
181 } elsif ($last) {
182 push @m, $last;
183 }
184
185 $last = $row;
186 }
187
188 push @m, $last if ($last);
189
190 return @m;
191 }
192
193 =head1 Functions to create C<data_structure>
194
195 Those functions generally have to first in your normalization file.
196
197 =head2 tag
198
199 Define new tag for I<search> and I<display>.
200
201 tag('Title', rec('200','a') );
202
203
204 =cut
205
206 sub tag {
207 my $name = shift or die "tag needs name as first argument";
208 my @o = grep { defined($_) && $_ ne '' } @_;
209 return unless (@o);
210 $out->{$name}->{tag} = $name;
211 $out->{$name}->{search} = \@o;
212 $out->{$name}->{display} = \@o;
213 }
214
215 =head2 display
216
217 Define tag just for I<display>
218
219 @v = display('Title', rec('200','a') );
220
221 =cut
222
223 sub display {
224 my $name = shift or die "display needs name as first argument";
225 my @o = grep { defined($_) && $_ ne '' } @_;
226 return unless (@o);
227 $out->{$name}->{tag} = $name;
228 $out->{$name}->{display} = \@o;
229 }
230
231 =head2 search
232
233 Prepare values just for I<search>
234
235 @v = search('Title', rec('200','a') );
236
237 =cut
238
239 sub search {
240 my $name = shift or die "search needs name as first argument";
241 my @o = grep { defined($_) && $_ ne '' } @_;
242 return unless (@o);
243 $out->{$name}->{tag} = $name;
244 $out->{$name}->{search} = \@o;
245 }
246
247 =head2 marc21
248
249 Save value for MARC field
250
251 marc21('900','a', rec('200','a') );
252
253 =cut
254
255 sub marc21 {
256 my $f = shift or die "marc21 needs field";
257 die "marc21 field must be numer" unless ($f =~ /^\d+$/);
258
259 my $sf = shift or die "marc21 needs subfield";
260
261 foreach (@_) {
262 my $v = $_; # make var read-write for Encode
263 next unless (defined($v) && $v !~ /^\s*$/);
264 from_to($v, 'iso-8859-2', $marc_encoding) if ($marc_encoding);
265 push @{ $marc21 }, [ $f, ' ', ' ', $sf => $v ];
266 }
267 }
268
269 =head1 Functions to extract data from input
270
271 This function should be used inside functions to create C<data_structure> described
272 above.
273
274 =head2 rec1
275
276 Return all values in some field
277
278 @v = rec1('200')
279
280 TODO: order of values is probably same as in source data, need to investigate that
281
282 =cut
283
284 sub rec1 {
285 my $f = shift;
286 return unless (defined($rec) && defined($rec->{$f}));
287 if (ref($rec->{$f}) eq 'ARRAY') {
288 return map {
289 if (ref($_) eq 'HASH') {
290 values %{$_};
291 } else {
292 $_;
293 }
294 } @{ $rec->{$f} };
295 } elsif( defined($rec->{$f}) ) {
296 return $rec->{$f};
297 }
298 }
299
300 =head2 rec2
301
302 Return all values in specific field and subfield
303
304 @v = rec2('200','a')
305
306 =cut
307
308 sub rec2 {
309 my $f = shift;
310 return unless (defined($rec && $rec->{$f}));
311 my $sf = shift;
312 return map { $_->{$sf} } grep { ref($_) eq 'HASH' && $_->{$sf} } @{ $rec->{$f} };
313 }
314
315 =head2 rec
316
317 syntaxtic sugar for
318
319 @v = rec('200')
320 @v = rec('200','a')
321
322 =cut
323
324 sub rec {
325 if ($#_ == 0) {
326 return rec1(@_);
327 } elsif ($#_ == 1) {
328 return rec2(@_);
329 }
330 }
331
332 =head2 regex
333
334 Apply regex to some or all values
335
336 @v = regex( 's/foo/bar/g', @v );
337
338 =cut
339
340 sub regex {
341 my $r = shift;
342 my @out;
343 #warn "r: $r\n",Dumper(\@_);
344 foreach my $t (@_) {
345 next unless ($t);
346 eval "\$t =~ $r";
347 push @out, $t if ($t && $t ne '');
348 }
349 return @out;
350 }
351
352 =head2 prefix
353
354 Prefix all values with a string
355
356 @v = prefix( 'my_', @v );
357
358 =cut
359
360 sub prefix {
361 my $p = shift or die "prefix needs string as first argument";
362 return map { $p . $_ } grep { defined($_) } @_;
363 }
364
365 =head2 suffix
366
367 suffix all values with a string
368
369 @v = suffix( '_my', @v );
370
371 =cut
372
373 sub suffix {
374 my $s = shift or die "suffix needs string as first argument";
375 return map { $_ . $s } grep { defined($_) } @_;
376 }
377
378 =head2 surround
379
380 surround all values with a two strings
381
382 @v = surround( 'prefix_', '_suffix', @v );
383
384 =cut
385
386 sub surround {
387 my $p = shift or die "surround need prefix as first argument";
388 my $s = shift or die "surround needs suffix as second argument";
389 return map { $p . $_ . $s } grep { defined($_) } @_;
390 }
391
392 =head2 first
393
394 Return first element
395
396 $v = first( @v );
397
398 =cut
399
400 sub first {
401 my $r = shift;
402 return $r;
403 }
404
405 =head2 lookup
406
407 Consult lookup hashes for some value
408
409 @v = lookup( $v );
410 @v = lookup( @v );
411
412 =cut
413
414 sub lookup {
415 my $k = shift or return;
416 return unless (defined($lookup->{$k}));
417 if (ref($lookup->{$k}) eq 'ARRAY') {
418 return @{ $lookup->{$k} };
419 } else {
420 return $lookup->{$k};
421 }
422 }
423
424 =head2 join_with
425
426 Joins walues with some delimiter
427
428 $v = join_with(", ", @v);
429
430 =cut
431
432 sub join_with {
433 my $d = shift;
434 return join($d, grep { defined($_) && $_ ne '' } @_);
435 }
436
437 # END
438 1;

  ViewVC Help
Powered by ViewVC 1.1.26