/[webpac2]/trunk/lib/WebPAC/Normalize.pm
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Contents of /trunk/lib/WebPAC/Normalize.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 13 - (show annotations)
Sat Jul 16 23:56:14 2005 UTC (17 years, 6 months ago) by dpavlin
File size: 11714 byte(s)
data_source seems to work

1 package WebPAC::Normalize;
2
3 use warnings;
4 use strict;
5 use Data::Dumper;
6
7 =head1 NAME
8
9 WebPAC::Normalize - normalisation of source file
10
11 =head1 VERSION
12
13 Version 0.01
14
15 =cut
16
17 our $VERSION = '0.01';
18
19 =head1 SYNOPSIS
20
21 This package contains code that could be helpful in implementing different
22 normalisation front-ends.
23
24 =head1 FUNCTIONS
25
26 =head2 new
27
28 Create new normalisation object
29
30 my $n = new WebPAC::Normalize::Something(
31 cache_data_structure => './cache/ds/',
32 lookup_regex => $lookup->regex,
33 );
34
35 Optional parameter C<cache_data_structure> defines path to directory
36 in which cache file for C<data_structure> call will be created.
37
38 Recommended parametar C<lookup_regex> is used to enable parsing of lookups
39 in structures.
40
41 =cut
42
43 sub new {
44 my $class = shift;
45 my $self = {@_};
46 bless($self, $class);
47
48 $self->setup_cache_dir( $self->{'cache_data_structure'} );
49
50 $self ? return $self : return undef;
51 }
52
53 =head2 setup_cache_dir
54
55 Check if specified cache directory exist, and if not, disable caching.
56
57 $setup_cache_dir('./cache/ds/');
58
59 If you pass false or zero value to this function, it will disable
60 cacheing.
61
62 =cut
63
64 sub setup_cache_dir {
65 my $self = shift;
66
67 my $dir = shift;
68
69 my $log = $self->_get_logger();
70
71 if ($dir) {
72 my $msg;
73 if (! -e $dir) {
74 $msg = "doesn't exist";
75 } elsif (! -d $dir) {
76 $msg = "is not directory";
77 } elsif (! -w $dir) {
78 $msg = "not writable";
79 }
80
81 if ($msg) {
82 undef $self->{'cache_data_structure'};
83 $log->warn("cache_data_structure $dir $msg, disabling...");
84 } else {
85 $log->debug("using cache dir $dir");
86 }
87 } else {
88 $log->debug("disabling cache");
89 undef $self->{'cache_data_structure'};
90 }
91 }
92
93
94 =head2 data_structure
95
96 Create in-memory data structure which represents normalized layout from
97 C<conf/normalize/*.xml>.
98
99 This structures are used to produce output.
100
101 my @ds = $webpac->data_structure($rec);
102
103 B<Note: historical oddity follows>
104
105 This method will also set C<< $webpac->{'currnet_filename'} >> if there is
106 C<< <filename> >> tag and C<< $webpac->{'headline'} >> if there is
107 C<< <headline> >> tag.
108
109 =cut
110
111 sub data_structure {
112 my $self = shift;
113
114 my $log = $self->_get_logger();
115
116 my $rec = shift;
117 $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
118
119 my $cache_file;
120
121 if (my $cache_path = $self->{'cache_data_structure'}) {
122 my $id = $rec->{'000'};
123 $id = $rec->{'000'}->[0] if ($id =~ m/^ARRAY/o);
124 unless (defined($id)) {
125 $log->warn("Can't use cache_data_structure on records without unique identifier in field 000");
126 undef $self->{'cache_data_structure'};
127 } else {
128 $cache_file = "$cache_path/$id";
129 if (-r $cache_file) {
130 my $ds_ref = retrieve($cache_file);
131 if ($ds_ref) {
132 $log->debug("cache hit: $cache_file");
133 my $ok = 1;
134 foreach my $f (qw(current_filename headline)) {
135 if ($ds_ref->{$f}) {
136 $self->{$f} = $ds_ref->{$f};
137 } else {
138 $ok = 0;
139 }
140 };
141 if ($ok && $ds_ref->{'ds'}) {
142 return @{ $ds_ref->{'ds'} };
143 } else {
144 $log->warn("cache_data_structure $cache_path corrupt. Use rm $cache_path/* to re-create it on next run!");
145 undef $self->{'cache_data_structure'};
146 }
147 }
148 }
149 }
150 }
151
152 undef $self->{'currnet_filename'};
153 undef $self->{'headline'};
154
155 my @sorted_tags;
156 if ($self->{tags_by_order}) {
157 @sorted_tags = @{$self->{tags_by_order}};
158 } else {
159 @sorted_tags = sort { $self->_sort_by_order } keys %{$self->{'import_xml'}->{'indexer'}};
160 $self->{tags_by_order} = \@sorted_tags;
161 }
162
163 my @ds;
164
165 $log->debug("tags: ",sub { join(", ",@sorted_tags) });
166
167 foreach my $field (@sorted_tags) {
168
169 my $row;
170
171 #print "field $field [",$self->{'tag'},"] = ",Dumper($self->{'import_xml'}->{'indexer'}->{$field}->{$self->{'tag'}});
172
173 foreach my $tag (@{$self->{'import_xml'}->{'indexer'}->{$field}->{$self->{'tag'}}}) {
174 my $format = $tag->{'value'} || $tag->{'content'};
175
176 $log->debug("format: $format");
177
178 my @v;
179 if ($self->{'lookup_regex'} && $format =~ $self->{'lookup_regex'}) {
180 @v = $self->fill_in_to_arr($rec,$format);
181 } else {
182 @v = $self->parse_to_arr($rec,$format);
183 }
184 next if (! @v);
185
186 if ($tag->{'sort'}) {
187 @v = $self->sort_arr(@v);
188 }
189
190 # use format?
191 if ($tag->{'format_name'}) {
192 @v = map { $self->apply_format($tag->{'format_name'},$tag->{'format_delimiter'},$_) } @v;
193 }
194
195 if ($field eq 'filename') {
196 $self->{'current_filename'} = join('',@v);
197 $log->debug("filename: ",$self->{'current_filename'});
198 } elsif ($field eq 'headline') {
199 $self->{'headline'} .= join('',@v);
200 $log->debug("headline: ",$self->{'headline'});
201 next; # don't return headline in data_structure!
202 }
203
204 # delimiter will join repeatable fields
205 if ($tag->{'delimiter'}) {
206 @v = ( join($tag->{'delimiter'}, @v) );
207 }
208
209 # default types
210 my @types = qw(display swish);
211 # override by type attribute
212 @types = ( $tag->{'type'} ) if ($tag->{'type'});
213
214 foreach my $type (@types) {
215 # append to previous line?
216 $log->debug("type: $type ",sub { join(" ",@v) }, $row->{'append'} || 'no append');
217 if ($tag->{'append'}) {
218
219 # I will delimit appended part with
220 # delimiter (or ,)
221 my $d = $tag->{'delimiter'};
222 # default delimiter
223 $d ||= " ";
224
225 my $last = pop @{$row->{$type}};
226 $d = "" if (! $last);
227 $last .= $d . join($d, @v);
228 push @{$row->{$type}}, $last;
229
230 } else {
231 push @{$row->{$type}}, @v;
232 }
233 }
234
235
236 }
237
238 if ($row) {
239 $row->{'tag'} = $field;
240
241 # TODO: name_sigular, name_plural
242 my $name = $self->{'import_xml'}->{'indexer'}->{$field}->{'name'};
243 $row->{'name'} = $name ? $self->_x($name) : $field;
244
245 # post-sort all values in field
246 if ($self->{'import_xml'}->{'indexer'}->{$field}->{'sort'}) {
247 $log->warn("sort at field tag not implemented");
248 }
249
250 push @ds, $row;
251
252 $log->debug("row $field: ",sub { Dumper($row) });
253 }
254
255 }
256
257 if ($cache_file) {
258 store {
259 ds => \@ds,
260 current_filename => $self->{'current_filename'},
261 headline => $self->{'headline'},
262 }, $cache_file;
263 $log->debug("created storable cache file $cache_file");
264 }
265
266 return @ds;
267
268 }
269
270 =head2 apply_format
271
272 Apply format specified in tag with C<format_name="name"> and
273 C<format_delimiter=";;">.
274
275 my $text = $webpac->apply_format($format_name,$format_delimiter,$data);
276
277 Formats can contain C<lookup{...}> if you need them.
278
279 =cut
280
281 sub apply_format {
282 my $self = shift;
283
284 my ($name,$delimiter,$data) = @_;
285
286 my $log = $self->_get_logger();
287
288 if (! $self->{'import_xml'}->{'format'}->{$name}) {
289 $log->warn("<format name=\"$name\"> is not defined in ",$self->{'import_xml_file'});
290 return $data;
291 }
292
293 $log->warn("no delimiter for format $name") if (! $delimiter);
294
295 my $format = $self->_x($self->{'import_xml'}->{'format'}->{$name}->{'content'}) || $log->logdie("can't find format '$name'");
296
297 my @data = split(/\Q$delimiter\E/, $data);
298
299 my $out = sprintf($format, @data);
300 $log->debug("using format $name [$format] on $data to produce: $out");
301
302 if ($self->{'lookup_regex'} && $out =~ $self->{'lookup_regex'}) {
303 return $self->lookup($out);
304 } else {
305 return $out;
306 }
307
308 }
309
310 =head2 parse
311
312 Perform smart parsing of string, skipping delimiters for fields which aren't
313 defined. It can also eval code in format starting with C<eval{...}> and
314 return output or nothing depending on eval code.
315
316 my $text = $webpac->parse($rec,'eval{"v901^a" eq "Deskriptor"}descriptor: v250^a', $i);
317
318 =cut
319
320 sub parse {
321 my $self = shift;
322
323 my ($rec, $format_utf8, $i) = @_;
324
325 return if (! $format_utf8);
326
327 my $log = $self->_get_logger();
328
329 $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
330
331 $i = 0 if (! $i);
332
333 my $format = $self->_x($format_utf8) || $log->logconfess("can't convert '$format_utf8' from UTF-8 to ",$self->{'code_page'});
334
335 my @out;
336
337 $log->debug("format: $format");
338
339 my $eval_code;
340 # remove eval{...} from beginning
341 $eval_code = $1 if ($format =~ s/^eval{([^}]+)}//s);
342
343 my $filter_name;
344 # remove filter{...} from beginning
345 $filter_name = $1 if ($format =~ s/^filter{([^}]+)}//s);
346
347 my $prefix;
348 my $all_found=0;
349
350 while ($format =~ s/^(.*?)(v|s)(\d+)(?:\^(\w))?//s) {
351
352 my $del = $1 || '';
353 $prefix ||= $del if ($all_found == 0);
354
355 # repeatable index
356 my $r = $i;
357 $r = 0 if (lc("$2") eq 's');
358
359 my $found = 0;
360 my $tmp = $self->get_data(\$rec,$3,$4,$r,\$found);
361
362 if ($found) {
363 push @out, $del;
364 push @out, $tmp;
365 $all_found += $found;
366 }
367 }
368
369 return if (! $all_found);
370
371 my $out = join('',@out);
372
373 if ($out) {
374 # add rest of format (suffix)
375 $out .= $format;
376
377 # add prefix if not there
378 $out = $prefix . $out if ($out !~ m/^\Q$prefix\E/);
379
380 $log->debug("result: $out");
381 }
382
383 if ($eval_code) {
384 my $eval = $self->fill_in($rec,$eval_code,$i) || return;
385 $log->debug("about to eval{$eval} format: $out");
386 return if (! $self->_eval($eval));
387 }
388
389 if ($filter_name && $self->{'filter'}->{$filter_name}) {
390 $log->debug("about to filter{$filter_name} format: $out");
391 $out = $self->{'filter'}->{$filter_name}->($out);
392 return unless(defined($out));
393 $log->debug("filter result: $out");
394 }
395
396 return $out;
397 }
398
399 =head2 parse_to_arr
400
401 Similar to C<parse>, but returns array of all repeatable fields
402
403 my @arr = $webpac->parse_to_arr($rec,'v250^a');
404
405 =cut
406
407 sub parse_to_arr {
408 my $self = shift;
409
410 my ($rec, $format_utf8) = @_;
411
412 my $log = $self->_get_logger();
413
414 $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
415 return if (! $format_utf8);
416
417 my $i = 0;
418 my @arr;
419
420 while (my $v = $self->parse($rec,$format_utf8,$i++)) {
421 push @arr, $v;
422 }
423
424 $log->debug("format '$format_utf8' returned ",--$i," elements: ", sub { join(" | ",@arr) }) if (@arr);
425
426 return @arr;
427 }
428
429 =head2 fill_in_to_arr
430
431 Similar to C<fill_in>, but returns array of all repeatable fields. Usable
432 for fields which have lookups, so they shouldn't be parsed but rather
433 C<fill_id>ed.
434
435 my @arr = $webpac->fill_in_to_arr($rec,'[v900];;[v250^a]');
436
437 =cut
438
439 sub fill_in_to_arr {
440 my $self = shift;
441
442 my ($rec, $format_utf8) = @_;
443
444 my $log = $self->_get_logger();
445
446 $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
447 return if (! $format_utf8);
448
449 my $i = 0;
450 my @arr;
451
452 while (my @v = $self->fill_in($rec,$format_utf8,$i++)) {
453 push @arr, @v;
454 }
455
456 $log->debug("format '$format_utf8' returned ",--$i," elements: ", sub { join(" | ",@arr) }) if (@arr);
457
458 return @arr;
459 }
460
461 =head2 sort_arr
462
463 Sort array ignoring case and html in data
464
465 my @sorted = $webpac->sort_arr(@unsorted);
466
467 =cut
468
469 sub sort_arr {
470 my $self = shift;
471
472 my $log = $self->_get_logger();
473
474 # FIXME add Schwartzian Transformation?
475
476 my @sorted = sort {
477 $a =~ s#<[^>]+/*>##;
478 $b =~ s#<[^>]+/*>##;
479 lc($b) cmp lc($a)
480 } @_;
481 $log->debug("sorted values: ",sub { join(", ",@sorted) });
482
483 return @sorted;
484 }
485
486
487 =head2 _sort_by_order
488
489 Sort xml tags data structure accoding to C<order=""> attribute.
490
491 =cut
492
493 sub _sort_by_order {
494 my $self = shift;
495
496 my $va = $self->{'import_xml'}->{'indexer'}->{$a}->{'order'} ||
497 $self->{'import_xml'}->{'indexer'}->{$a};
498 my $vb = $self->{'import_xml'}->{'indexer'}->{$b}->{'order'} ||
499 $self->{'import_xml'}->{'indexer'}->{$b};
500
501 return $va <=> $vb;
502 }
503
504 =head2 _x
505
506 Convert strings from C<conf/normalize> encoding into application specific
507 (optinally specified using C<code_page> to C<new> constructor.
508
509 my $text = $n->_x('normalize text string');
510
511 This is a stub so that other modules doesn't have to implement it.
512
513 =cut
514
515 sub _x {
516 my $self = shift;
517 return shift;
518 }
519
520
521 =head1 AUTHOR
522
523 Dobrica Pavlinusic, C<< <dpavlin@rot13.org> >>
524
525 =head1 COPYRIGHT & LICENSE
526
527 Copyright 2005 Dobrica Pavlinusic, All Rights Reserved.
528
529 This program is free software; you can redistribute it and/or modify it
530 under the same terms as Perl itself.
531
532 =cut
533
534 1; # End of WebPAC::DB

  ViewVC Help
Powered by ViewVC 1.1.26