/[webpac2]/trunk/lib/WebPAC/Normalize.pm
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Contents of /trunk/lib/WebPAC/Normalize.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 542 - (show annotations)
Thu Jun 29 21:18:59 2006 UTC (17 years, 9 months ago) by dpavlin
File size: 7406 byte(s)
 r731@llin:  dpavlin | 2006-06-29 23:02:08 +0200
 implement magic to create fields and repeatable fields (which might be broken for some cases).

1 package WebPAC::Normalize;
2 use Exporter 'import';
3 @EXPORT = qw/
4 _set_rec _set_lookup
5 _get_ds _clean_ds
6
7 tag search display
8 marc21
9
10 rec1 rec2 rec
11 regex prefix suffix surround
12 first lookup join_with
13 /;
14
15 use warnings;
16 use strict;
17
18 #use base qw/WebPAC::Common/;
19 use Data::Dumper;
20 use Encode qw/from_to/;
21
22 =head1 NAME
23
24 WebPAC::Normalize - describe normalisaton rules using sets
25
26 =head1 VERSION
27
28 Version 0.06
29
30 =cut
31
32 our $VERSION = '0.06';
33
34 =head1 SYNOPSIS
35
36 This module uses C<conf/normalize/*.pl> files to perform normalisation
37 from input records using perl functions which are specialized for set
38 processing.
39
40 Sets are implemented as arrays, and normalisation file is valid perl, which
41 means that you check it's validity before running WebPAC using
42 C<perl -c normalize.pl>.
43
44 Normalisation can generate multiple output normalized data. For now, supported output
45 types (on the left side of definition) are: C<tag>, C<display>, C<search> and
46 C<marc21>.
47
48 =head1 FUNCTIONS
49
50 Functions which start with C<_> are private and used by WebPAC internally.
51 All other functions are available for use within normalisation rules.
52
53 =head2 data_structure
54
55 Return data structure
56
57 my $ds = WebPAC::Normalize::data_structure(
58 lookup => $lookup->lookup_hash,
59 row => $row,
60 rules => $normalize_pl_config,
61 marc_encoding => 'utf-8',
62 );
63
64 Options C<lookup>, C<row>, C<rules> and C<log> are mandatory while all
65 other are optional.
66
67 This function will B<die> if normalizastion can't be evaled.
68
69 Since this function isn't exported you have to call it with
70 C<WebPAC::Normalize::data_structure>.
71
72 =cut
73
74 sub data_structure {
75 my $arg = {@_};
76
77 die "need row argument" unless ($arg->{row});
78 die "need normalisation argument" unless ($arg->{rules});
79
80 no strict 'subs';
81 _set_lookup( $arg->{lookup} );
82 _set_rec( $arg->{row} );
83 _clean_ds( %{ $arg } );
84 eval "$arg->{rules}";
85 die "error evaling $arg->{rules}: $@\n" if ($@);
86
87 return _get_ds();
88 }
89
90 =head2 _set_rec
91
92 Set current record hash
93
94 _set_rec( $rec );
95
96 =cut
97
98 my $rec;
99
100 sub _set_rec {
101 $rec = shift or die "no record hash";
102 }
103
104 =head2 _get_ds
105
106 Return hash formatted as data structure
107
108 my $ds = _get_ds();
109
110 =cut
111
112 my $out;
113 my $marc21;
114 my $marc_encoding;
115
116 sub _get_ds {
117 return $out;
118 }
119
120 =head2 _clean_ds
121
122 Clean data structure hash for next record
123
124 _clean_ds();
125
126 =cut
127
128 sub _clean_ds {
129 my $a = {@_};
130 $out = undef;
131 $marc21 = undef;
132 $marc_encoding = $a->{marc_encoding};
133 }
134
135 =head2 _set_lookup
136
137 Set current lookup hash
138
139 _set_lookup( $lookup );
140
141 =cut
142
143 my $lookup;
144
145 sub _set_lookup {
146 $lookup = shift;
147 }
148
149 =head2 _get_marc21_fields
150
151 Get all fields defined by calls to C<marc21>
152
153 $marc->add_fields( WebPAC::Normalize:_get_marc21_fields() );
154
155 B<TODO>: implement exceptions to magic which unrolls repeated subfields
156 as new field with that subfield.
157
158 =cut
159
160 sub _get_marc21_fields {
161 my @m;
162 my $last;
163 foreach my $row (@{ $marc21 }) {
164 if ($last && (
165 $last->[0] eq $row->[0] || # check if field is same
166 $last->[1] eq $row->[1] || # check for i1
167 $last->[2] eq $row->[2] # and for i2
168 ) ) {
169 $last->[3]->{ $row->[3] } = $row->[4];
170 warn "## ++ added $row->[0] ^$row->[3]\n";
171 next;
172 } elsif ($last) {
173 push @m, $last;
174 }
175
176 push @m, $row;
177 }
178
179 push @m, $last if ($last);
180
181 return @m;
182 }
183
184 =head1 Functions to create C<data_structure>
185
186 Those functions generally have to first in your normalization file.
187
188 =head2 tag
189
190 Define new tag for I<search> and I<display>.
191
192 tag('Title', rec('200','a') );
193
194
195 =cut
196
197 sub tag {
198 my $name = shift or die "tag needs name as first argument";
199 my @o = grep { defined($_) && $_ ne '' } @_;
200 return unless (@o);
201 $out->{$name}->{tag} = $name;
202 $out->{$name}->{search} = \@o;
203 $out->{$name}->{display} = \@o;
204 }
205
206 =head2 display
207
208 Define tag just for I<display>
209
210 @v = display('Title', rec('200','a') );
211
212 =cut
213
214 sub display {
215 my $name = shift or die "display needs name as first argument";
216 my @o = grep { defined($_) && $_ ne '' } @_;
217 return unless (@o);
218 $out->{$name}->{tag} = $name;
219 $out->{$name}->{display} = \@o;
220 }
221
222 =head2 search
223
224 Prepare values just for I<search>
225
226 @v = search('Title', rec('200','a') );
227
228 =cut
229
230 sub search {
231 my $name = shift or die "search needs name as first argument";
232 my @o = grep { defined($_) && $_ ne '' } @_;
233 return unless (@o);
234 $out->{$name}->{tag} = $name;
235 $out->{$name}->{search} = \@o;
236 }
237
238 =head2 marc21
239
240 Save value for MARC field
241
242 marc21('900','a', rec('200','a') );
243
244 =cut
245
246 sub marc21 {
247 my $f = shift or die "marc21 needs field";
248 die "marc21 field must be numer" unless ($f =~ /^\d+$/);
249
250 my $sf = shift or die "marc21 needs subfield";
251
252 foreach (@_) {
253 my $v = $_; # make var read-write for Encode
254 next unless (defined($v) && $v !~ /^\s+$/);
255 from_to($v, 'iso-8859-2', $marc_encoding) if ($marc_encoding);
256 push @{ $marc21 }, [ $f, ' ', ' ', $sf => $v ];
257 }
258 }
259
260 =head1 Functions to extract data from input
261
262 This function should be used inside functions to create C<data_structure> described
263 above.
264
265 =head2 rec1
266
267 Return all values in some field
268
269 @v = rec1('200')
270
271 TODO: order of values is probably same as in source data, need to investigate that
272
273 =cut
274
275 sub rec1 {
276 my $f = shift;
277 return unless (defined($rec) && defined($rec->{$f}));
278 if (ref($rec->{$f}) eq 'ARRAY') {
279 return map {
280 if (ref($_) eq 'HASH') {
281 values %{$_};
282 } else {
283 $_;
284 }
285 } @{ $rec->{$f} };
286 } elsif( defined($rec->{$f}) ) {
287 return $rec->{$f};
288 }
289 }
290
291 =head2 rec2
292
293 Return all values in specific field and subfield
294
295 @v = rec2('200','a')
296
297 =cut
298
299 sub rec2 {
300 my $f = shift;
301 return unless (defined($rec && $rec->{$f}));
302 my $sf = shift;
303 return map { $_->{$sf} } grep { ref($_) eq 'HASH' && $_->{$sf} } @{ $rec->{$f} };
304 }
305
306 =head2 rec
307
308 syntaxtic sugar for
309
310 @v = rec('200')
311 @v = rec('200','a')
312
313 =cut
314
315 sub rec {
316 if ($#_ == 0) {
317 return rec1(@_);
318 } elsif ($#_ == 1) {
319 return rec2(@_);
320 }
321 }
322
323 =head2 regex
324
325 Apply regex to some or all values
326
327 @v = regex( 's/foo/bar/g', @v );
328
329 =cut
330
331 sub regex {
332 my $r = shift;
333 my @out;
334 #warn "r: $r\n",Dumper(\@_);
335 foreach my $t (@_) {
336 next unless ($t);
337 eval "\$t =~ $r";
338 push @out, $t if ($t && $t ne '');
339 }
340 return @out;
341 }
342
343 =head2 prefix
344
345 Prefix all values with a string
346
347 @v = prefix( 'my_', @v );
348
349 =cut
350
351 sub prefix {
352 my $p = shift or die "prefix needs string as first argument";
353 return map { $p . $_ } grep { defined($_) } @_;
354 }
355
356 =head2 suffix
357
358 suffix all values with a string
359
360 @v = suffix( '_my', @v );
361
362 =cut
363
364 sub suffix {
365 my $s = shift or die "suffix needs string as first argument";
366 return map { $_ . $s } grep { defined($_) } @_;
367 }
368
369 =head2 surround
370
371 surround all values with a two strings
372
373 @v = surround( 'prefix_', '_suffix', @v );
374
375 =cut
376
377 sub surround {
378 my $p = shift or die "surround need prefix as first argument";
379 my $s = shift or die "surround needs suffix as second argument";
380 return map { $p . $_ . $s } grep { defined($_) } @_;
381 }
382
383 =head2 first
384
385 Return first element
386
387 $v = first( @v );
388
389 =cut
390
391 sub first {
392 my $r = shift;
393 return $r;
394 }
395
396 =head2 lookup
397
398 Consult lookup hashes for some value
399
400 @v = lookup( $v );
401 @v = lookup( @v );
402
403 =cut
404
405 sub lookup {
406 my $k = shift or return;
407 return unless (defined($lookup->{$k}));
408 if (ref($lookup->{$k}) eq 'ARRAY') {
409 return @{ $lookup->{$k} };
410 } else {
411 return $lookup->{$k};
412 }
413 }
414
415 =head2 join_with
416
417 Joins walues with some delimiter
418
419 $v = join_with(", ", @v);
420
421 =cut
422
423 sub join_with {
424 my $d = shift;
425 return join($d, grep { defined($_) && $_ ne '' } @_);
426 }
427
428 # END
429 1;

  ViewVC Help
Powered by ViewVC 1.1.26