/[webpac2]/trunk/lib/WebPAC/Normalize.pm
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Contents of /trunk/lib/WebPAC/Normalize.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 14 - (show annotations)
Sun Jul 17 00:04:25 2005 UTC (18 years, 9 months ago) by dpavlin
File size: 11728 byte(s)
small fixes

1 package WebPAC::Normalize;
2
3 use warnings;
4 use strict;
5 use Data::Dumper;
6 use Storable;
7
8 =head1 NAME
9
10 WebPAC::Normalize - normalisation of source file
11
12 =head1 VERSION
13
14 Version 0.01
15
16 =cut
17
18 our $VERSION = '0.01';
19
20 =head1 SYNOPSIS
21
22 This package contains code that could be helpful in implementing different
23 normalisation front-ends.
24
25 =head1 FUNCTIONS
26
27 =head2 new
28
29 Create new normalisation object
30
31 my $n = new WebPAC::Normalize::Something(
32 cache_data_structure => './cache/ds/',
33 lookup_regex => $lookup->regex,
34 );
35
36 Optional parameter C<cache_data_structure> defines path to directory
37 in which cache file for C<data_structure> call will be created.
38
39 Recommended parametar C<lookup_regex> is used to enable parsing of lookups
40 in structures.
41
42 =cut
43
44 sub new {
45 my $class = shift;
46 my $self = {@_};
47 bless($self, $class);
48
49 $self->setup_cache_dir( $self->{'cache_data_structure'} );
50
51 $self ? return $self : return undef;
52 }
53
54 =head2 setup_cache_dir
55
56 Check if specified cache directory exist, and if not, disable caching.
57
58 $setup_cache_dir('./cache/ds/');
59
60 If you pass false or zero value to this function, it will disable
61 cacheing.
62
63 =cut
64
65 sub setup_cache_dir {
66 my $self = shift;
67
68 my $dir = shift;
69
70 my $log = $self->_get_logger();
71
72 if ($dir) {
73 my $msg;
74 if (! -e $dir) {
75 $msg = "doesn't exist";
76 } elsif (! -d $dir) {
77 $msg = "is not directory";
78 } elsif (! -w $dir) {
79 $msg = "not writable";
80 }
81
82 if ($msg) {
83 undef $self->{'cache_data_structure'};
84 $log->warn("cache_data_structure $dir $msg, disabling...");
85 } else {
86 $log->debug("using cache dir $dir");
87 }
88 } else {
89 $log->debug("disabling cache");
90 undef $self->{'cache_data_structure'};
91 }
92 }
93
94
95 =head2 data_structure
96
97 Create in-memory data structure which represents normalized layout from
98 C<conf/normalize/*.xml>.
99
100 This structures are used to produce output.
101
102 my @ds = $webpac->data_structure($rec);
103
104 B<Note: historical oddity follows>
105
106 This method will also set C<< $webpac->{'currnet_filename'} >> if there is
107 C<< <filename> >> tag and C<< $webpac->{'headline'} >> if there is
108 C<< <headline> >> tag.
109
110 =cut
111
112 sub data_structure {
113 my $self = shift;
114
115 my $log = $self->_get_logger();
116
117 my $rec = shift;
118 $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
119
120 my $cache_file;
121
122 if (my $cache_path = $self->{'cache_data_structure'}) {
123 my $id = $rec->{'000'};
124 $id = $rec->{'000'}->[0] if ($id =~ m/^ARRAY/o);
125 unless (defined($id)) {
126 $log->warn("Can't use cache_data_structure on records without unique identifier in field 000");
127 undef $self->{'cache_data_structure'};
128 } else {
129 $cache_file = "$cache_path/$id";
130 if (-r $cache_file) {
131 my $ds_ref = retrieve($cache_file);
132 if ($ds_ref) {
133 $log->debug("cache hit: $cache_file");
134 my $ok = 1;
135 foreach my $f (qw(current_filename headline)) {
136 if ($ds_ref->{$f}) {
137 $self->{$f} = $ds_ref->{$f};
138 } else {
139 $ok = 0;
140 }
141 };
142 if ($ok && $ds_ref->{'ds'}) {
143 return @{ $ds_ref->{'ds'} };
144 } else {
145 $log->warn("cache_data_structure $cache_path corrupt. Use rm $cache_path/* to re-create it on next run!");
146 undef $self->{'cache_data_structure'};
147 }
148 }
149 }
150 }
151 }
152
153 undef $self->{'currnet_filename'};
154 undef $self->{'headline'};
155
156 my @sorted_tags;
157 if ($self->{tags_by_order}) {
158 @sorted_tags = @{$self->{tags_by_order}};
159 } else {
160 @sorted_tags = sort { $self->_sort_by_order } keys %{$self->{'import_xml'}->{'indexer'}};
161 $self->{tags_by_order} = \@sorted_tags;
162 }
163
164 my @ds;
165
166 $log->debug("tags: ",sub { join(", ",@sorted_tags) });
167
168 foreach my $field (@sorted_tags) {
169
170 my $row;
171
172 #print "field $field [",$self->{'tag'},"] = ",Dumper($self->{'import_xml'}->{'indexer'}->{$field}->{$self->{'tag'}});
173
174 foreach my $tag (@{$self->{'import_xml'}->{'indexer'}->{$field}->{$self->{'tag'}}}) {
175 my $format = $tag->{'value'} || $tag->{'content'};
176
177 $log->debug("format: $format");
178
179 my @v;
180 if ($self->{'lookup_regex'} && $format =~ $self->{'lookup_regex'}) {
181 @v = $self->fill_in_to_arr($rec,$format);
182 } else {
183 @v = $self->parse_to_arr($rec,$format);
184 }
185 next if (! @v);
186
187 if ($tag->{'sort'}) {
188 @v = $self->sort_arr(@v);
189 }
190
191 # use format?
192 if ($tag->{'format_name'}) {
193 @v = map { $self->apply_format($tag->{'format_name'},$tag->{'format_delimiter'},$_) } @v;
194 }
195
196 if ($field eq 'filename') {
197 $self->{'current_filename'} = join('',@v);
198 $log->debug("filename: ",$self->{'current_filename'});
199 } elsif ($field eq 'headline') {
200 $self->{'headline'} .= join('',@v);
201 $log->debug("headline: ",$self->{'headline'});
202 next; # don't return headline in data_structure!
203 }
204
205 # delimiter will join repeatable fields
206 if ($tag->{'delimiter'}) {
207 @v = ( join($tag->{'delimiter'}, @v) );
208 }
209
210 # default types
211 my @types = qw(display swish);
212 # override by type attribute
213 @types = ( $tag->{'type'} ) if ($tag->{'type'});
214
215 foreach my $type (@types) {
216 # append to previous line?
217 $log->debug("type: $type ",sub { join(" ",@v) }, $row->{'append'} || 'no append');
218 if ($tag->{'append'}) {
219
220 # I will delimit appended part with
221 # delimiter (or ,)
222 my $d = $tag->{'delimiter'};
223 # default delimiter
224 $d ||= " ";
225
226 my $last = pop @{$row->{$type}};
227 $d = "" if (! $last);
228 $last .= $d . join($d, @v);
229 push @{$row->{$type}}, $last;
230
231 } else {
232 push @{$row->{$type}}, @v;
233 }
234 }
235
236
237 }
238
239 if ($row) {
240 $row->{'tag'} = $field;
241
242 # TODO: name_sigular, name_plural
243 my $name = $self->{'import_xml'}->{'indexer'}->{$field}->{'name'};
244 $row->{'name'} = $name ? $self->_x($name) : $field;
245
246 # post-sort all values in field
247 if ($self->{'import_xml'}->{'indexer'}->{$field}->{'sort'}) {
248 $log->warn("sort at field tag not implemented");
249 }
250
251 push @ds, $row;
252
253 $log->debug("row $field: ",sub { Dumper($row) });
254 }
255
256 }
257
258 if ($cache_file) {
259 store {
260 ds => \@ds,
261 current_filename => $self->{'current_filename'},
262 headline => $self->{'headline'},
263 }, $cache_file;
264 $log->debug("created storable cache file $cache_file");
265 }
266
267 return @ds;
268
269 }
270
271 =head2 apply_format
272
273 Apply format specified in tag with C<format_name="name"> and
274 C<format_delimiter=";;">.
275
276 my $text = $webpac->apply_format($format_name,$format_delimiter,$data);
277
278 Formats can contain C<lookup{...}> if you need them.
279
280 =cut
281
282 sub apply_format {
283 my $self = shift;
284
285 my ($name,$delimiter,$data) = @_;
286
287 my $log = $self->_get_logger();
288
289 if (! $self->{'import_xml'}->{'format'}->{$name}) {
290 $log->warn("<format name=\"$name\"> is not defined in ",$self->{'import_xml_file'});
291 return $data;
292 }
293
294 $log->warn("no delimiter for format $name") if (! $delimiter);
295
296 my $format = $self->_x($self->{'import_xml'}->{'format'}->{$name}->{'content'}) || $log->logdie("can't find format '$name'");
297
298 my @data = split(/\Q$delimiter\E/, $data);
299
300 my $out = sprintf($format, @data);
301 $log->debug("using format $name [$format] on $data to produce: $out");
302
303 if ($self->{'lookup_regex'} && $out =~ $self->{'lookup_regex'}) {
304 return $self->lookup($out);
305 } else {
306 return $out;
307 }
308
309 }
310
311 =head2 parse
312
313 Perform smart parsing of string, skipping delimiters for fields which aren't
314 defined. It can also eval code in format starting with C<eval{...}> and
315 return output or nothing depending on eval code.
316
317 my $text = $webpac->parse($rec,'eval{"v901^a" eq "Deskriptor"}descriptor: v250^a', $i);
318
319 =cut
320
321 sub parse {
322 my $self = shift;
323
324 my ($rec, $format_utf8, $i) = @_;
325
326 return if (! $format_utf8);
327
328 my $log = $self->_get_logger();
329
330 $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
331
332 $i = 0 if (! $i);
333
334 my $format = $self->_x($format_utf8) || $log->logconfess("can't convert '$format_utf8' from UTF-8 to ",$self->{'code_page'});
335
336 my @out;
337
338 $log->debug("format: $format");
339
340 my $eval_code;
341 # remove eval{...} from beginning
342 $eval_code = $1 if ($format =~ s/^eval{([^}]+)}//s);
343
344 my $filter_name;
345 # remove filter{...} from beginning
346 $filter_name = $1 if ($format =~ s/^filter{([^}]+)}//s);
347
348 my $prefix;
349 my $all_found=0;
350
351 while ($format =~ s/^(.*?)(v|s)(\d+)(?:\^(\w))?//s) {
352
353 my $del = $1 || '';
354 $prefix ||= $del if ($all_found == 0);
355
356 # repeatable index
357 my $r = $i;
358 $r = 0 if (lc("$2") eq 's');
359
360 my $found = 0;
361 my $tmp = $self->get_data(\$rec,$3,$4,$r,\$found);
362
363 if ($found) {
364 push @out, $del;
365 push @out, $tmp;
366 $all_found += $found;
367 }
368 }
369
370 return if (! $all_found);
371
372 my $out = join('',@out);
373
374 if ($out) {
375 # add rest of format (suffix)
376 $out .= $format;
377
378 # add prefix if not there
379 $out = $prefix . $out if ($out !~ m/^\Q$prefix\E/);
380
381 $log->debug("result: $out");
382 }
383
384 if ($eval_code) {
385 my $eval = $self->fill_in($rec,$eval_code,$i) || return;
386 $log->debug("about to eval{$eval} format: $out");
387 return if (! $self->_eval($eval));
388 }
389
390 if ($filter_name && $self->{'filter'}->{$filter_name}) {
391 $log->debug("about to filter{$filter_name} format: $out");
392 $out = $self->{'filter'}->{$filter_name}->($out);
393 return unless(defined($out));
394 $log->debug("filter result: $out");
395 }
396
397 return $out;
398 }
399
400 =head2 parse_to_arr
401
402 Similar to C<parse>, but returns array of all repeatable fields
403
404 my @arr = $webpac->parse_to_arr($rec,'v250^a');
405
406 =cut
407
408 sub parse_to_arr {
409 my $self = shift;
410
411 my ($rec, $format_utf8) = @_;
412
413 my $log = $self->_get_logger();
414
415 $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
416 return if (! $format_utf8);
417
418 my $i = 0;
419 my @arr;
420
421 while (my $v = $self->parse($rec,$format_utf8,$i++)) {
422 push @arr, $v;
423 }
424
425 $log->debug("format '$format_utf8' returned ",--$i," elements: ", sub { join(" | ",@arr) }) if (@arr);
426
427 return @arr;
428 }
429
430 =head2 fill_in_to_arr
431
432 Similar to C<fill_in>, but returns array of all repeatable fields. Usable
433 for fields which have lookups, so they shouldn't be parsed but rather
434 C<fill_id>ed.
435
436 my @arr = $webpac->fill_in_to_arr($rec,'[v900];;[v250^a]');
437
438 =cut
439
440 sub fill_in_to_arr {
441 my $self = shift;
442
443 my ($rec, $format_utf8) = @_;
444
445 my $log = $self->_get_logger();
446
447 $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
448 return if (! $format_utf8);
449
450 my $i = 0;
451 my @arr;
452
453 while (my @v = $self->fill_in($rec,$format_utf8,$i++)) {
454 push @arr, @v;
455 }
456
457 $log->debug("format '$format_utf8' returned ",--$i," elements: ", sub { join(" | ",@arr) }) if (@arr);
458
459 return @arr;
460 }
461
462 =head2 sort_arr
463
464 Sort array ignoring case and html in data
465
466 my @sorted = $webpac->sort_arr(@unsorted);
467
468 =cut
469
470 sub sort_arr {
471 my $self = shift;
472
473 my $log = $self->_get_logger();
474
475 # FIXME add Schwartzian Transformation?
476
477 my @sorted = sort {
478 $a =~ s#<[^>]+/*>##;
479 $b =~ s#<[^>]+/*>##;
480 lc($b) cmp lc($a)
481 } @_;
482 $log->debug("sorted values: ",sub { join(", ",@sorted) });
483
484 return @sorted;
485 }
486
487
488 =head2 _sort_by_order
489
490 Sort xml tags data structure accoding to C<order=""> attribute.
491
492 =cut
493
494 sub _sort_by_order {
495 my $self = shift;
496
497 my $va = $self->{'import_xml'}->{'indexer'}->{$a}->{'order'} ||
498 $self->{'import_xml'}->{'indexer'}->{$a};
499 my $vb = $self->{'import_xml'}->{'indexer'}->{$b}->{'order'} ||
500 $self->{'import_xml'}->{'indexer'}->{$b};
501
502 return $va <=> $vb;
503 }
504
505 =head2 _x
506
507 Convert strings from C<conf/normalize> encoding into application specific
508 (optinally specified using C<code_page> to C<new> constructor.
509
510 my $text = $n->_x('normalize text string');
511
512 This is a stub so that other modules doesn't have to implement it.
513
514 =cut
515
516 sub _x {
517 my $self = shift;
518 return shift;
519 }
520
521
522 =head1 AUTHOR
523
524 Dobrica Pavlinusic, C<< <dpavlin@rot13.org> >>
525
526 =head1 COPYRIGHT & LICENSE
527
528 Copyright 2005 Dobrica Pavlinusic, All Rights Reserved.
529
530 This program is free software; you can redistribute it and/or modify it
531 under the same terms as Perl itself.
532
533 =cut
534
535 1; # End of WebPAC::DB

  ViewVC Help
Powered by ViewVC 1.1.26