/[webpac2]/trunk/lib/WebPAC/Normalize.pm
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Annotation of /trunk/lib/WebPAC/Normalize.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 13 - (hide annotations)
Sat Jul 16 23:56:14 2005 UTC (18 years, 8 months ago) by dpavlin
File size: 11714 byte(s)
data_source seems to work

1 dpavlin 10 package WebPAC::Normalize;
2    
3     use warnings;
4     use strict;
5 dpavlin 13 use Data::Dumper;
6 dpavlin 10
7     =head1 NAME
8    
9     WebPAC::Normalize - normalisation of source file
10    
11     =head1 VERSION
12    
13     Version 0.01
14    
15     =cut
16    
17     our $VERSION = '0.01';
18    
19     =head1 SYNOPSIS
20    
21     This package contains code that could be helpful in implementing different
22     normalisation front-ends.
23    
24     =head1 FUNCTIONS
25    
26 dpavlin 13 =head2 new
27 dpavlin 10
28 dpavlin 13 Create new normalisation object
29    
30     my $n = new WebPAC::Normalize::Something(
31     cache_data_structure => './cache/ds/',
32     lookup_regex => $lookup->regex,
33     );
34    
35     Optional parameter C<cache_data_structure> defines path to directory
36     in which cache file for C<data_structure> call will be created.
37    
38     Recommended parametar C<lookup_regex> is used to enable parsing of lookups
39     in structures.
40    
41 dpavlin 10 =cut
42    
43 dpavlin 13 sub new {
44     my $class = shift;
45     my $self = {@_};
46     bless($self, $class);
47    
48     $self->setup_cache_dir( $self->{'cache_data_structure'} );
49    
50     $self ? return $self : return undef;
51 dpavlin 10 }
52    
53 dpavlin 13 =head2 setup_cache_dir
54    
55     Check if specified cache directory exist, and if not, disable caching.
56    
57     $setup_cache_dir('./cache/ds/');
58    
59     If you pass false or zero value to this function, it will disable
60     cacheing.
61    
62     =cut
63    
64     sub setup_cache_dir {
65     my $self = shift;
66    
67     my $dir = shift;
68    
69     my $log = $self->_get_logger();
70    
71     if ($dir) {
72     my $msg;
73     if (! -e $dir) {
74     $msg = "doesn't exist";
75     } elsif (! -d $dir) {
76     $msg = "is not directory";
77     } elsif (! -w $dir) {
78     $msg = "not writable";
79     }
80    
81     if ($msg) {
82     undef $self->{'cache_data_structure'};
83     $log->warn("cache_data_structure $dir $msg, disabling...");
84     } else {
85     $log->debug("using cache dir $dir");
86     }
87     } else {
88     $log->debug("disabling cache");
89     undef $self->{'cache_data_structure'};
90     }
91     }
92    
93    
94     =head2 data_structure
95    
96     Create in-memory data structure which represents normalized layout from
97     C<conf/normalize/*.xml>.
98    
99     This structures are used to produce output.
100    
101     my @ds = $webpac->data_structure($rec);
102    
103     B<Note: historical oddity follows>
104    
105     This method will also set C<< $webpac->{'currnet_filename'} >> if there is
106     C<< <filename> >> tag and C<< $webpac->{'headline'} >> if there is
107     C<< <headline> >> tag.
108    
109     =cut
110    
111     sub data_structure {
112     my $self = shift;
113    
114     my $log = $self->_get_logger();
115    
116     my $rec = shift;
117     $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
118    
119     my $cache_file;
120    
121     if (my $cache_path = $self->{'cache_data_structure'}) {
122     my $id = $rec->{'000'};
123     $id = $rec->{'000'}->[0] if ($id =~ m/^ARRAY/o);
124     unless (defined($id)) {
125     $log->warn("Can't use cache_data_structure on records without unique identifier in field 000");
126     undef $self->{'cache_data_structure'};
127     } else {
128     $cache_file = "$cache_path/$id";
129     if (-r $cache_file) {
130     my $ds_ref = retrieve($cache_file);
131     if ($ds_ref) {
132     $log->debug("cache hit: $cache_file");
133     my $ok = 1;
134     foreach my $f (qw(current_filename headline)) {
135     if ($ds_ref->{$f}) {
136     $self->{$f} = $ds_ref->{$f};
137     } else {
138     $ok = 0;
139     }
140     };
141     if ($ok && $ds_ref->{'ds'}) {
142     return @{ $ds_ref->{'ds'} };
143     } else {
144     $log->warn("cache_data_structure $cache_path corrupt. Use rm $cache_path/* to re-create it on next run!");
145     undef $self->{'cache_data_structure'};
146     }
147     }
148     }
149     }
150     }
151    
152     undef $self->{'currnet_filename'};
153     undef $self->{'headline'};
154    
155     my @sorted_tags;
156     if ($self->{tags_by_order}) {
157     @sorted_tags = @{$self->{tags_by_order}};
158     } else {
159     @sorted_tags = sort { $self->_sort_by_order } keys %{$self->{'import_xml'}->{'indexer'}};
160     $self->{tags_by_order} = \@sorted_tags;
161     }
162    
163     my @ds;
164    
165     $log->debug("tags: ",sub { join(", ",@sorted_tags) });
166    
167     foreach my $field (@sorted_tags) {
168    
169     my $row;
170    
171     #print "field $field [",$self->{'tag'},"] = ",Dumper($self->{'import_xml'}->{'indexer'}->{$field}->{$self->{'tag'}});
172    
173     foreach my $tag (@{$self->{'import_xml'}->{'indexer'}->{$field}->{$self->{'tag'}}}) {
174     my $format = $tag->{'value'} || $tag->{'content'};
175    
176     $log->debug("format: $format");
177    
178     my @v;
179     if ($self->{'lookup_regex'} && $format =~ $self->{'lookup_regex'}) {
180     @v = $self->fill_in_to_arr($rec,$format);
181     } else {
182     @v = $self->parse_to_arr($rec,$format);
183     }
184     next if (! @v);
185    
186     if ($tag->{'sort'}) {
187     @v = $self->sort_arr(@v);
188     }
189    
190     # use format?
191     if ($tag->{'format_name'}) {
192     @v = map { $self->apply_format($tag->{'format_name'},$tag->{'format_delimiter'},$_) } @v;
193     }
194    
195     if ($field eq 'filename') {
196     $self->{'current_filename'} = join('',@v);
197     $log->debug("filename: ",$self->{'current_filename'});
198     } elsif ($field eq 'headline') {
199     $self->{'headline'} .= join('',@v);
200     $log->debug("headline: ",$self->{'headline'});
201     next; # don't return headline in data_structure!
202     }
203    
204     # delimiter will join repeatable fields
205     if ($tag->{'delimiter'}) {
206     @v = ( join($tag->{'delimiter'}, @v) );
207     }
208    
209     # default types
210     my @types = qw(display swish);
211     # override by type attribute
212     @types = ( $tag->{'type'} ) if ($tag->{'type'});
213    
214     foreach my $type (@types) {
215     # append to previous line?
216     $log->debug("type: $type ",sub { join(" ",@v) }, $row->{'append'} || 'no append');
217     if ($tag->{'append'}) {
218    
219     # I will delimit appended part with
220     # delimiter (or ,)
221     my $d = $tag->{'delimiter'};
222     # default delimiter
223     $d ||= " ";
224    
225     my $last = pop @{$row->{$type}};
226     $d = "" if (! $last);
227     $last .= $d . join($d, @v);
228     push @{$row->{$type}}, $last;
229    
230     } else {
231     push @{$row->{$type}}, @v;
232     }
233     }
234    
235    
236     }
237    
238     if ($row) {
239     $row->{'tag'} = $field;
240    
241     # TODO: name_sigular, name_plural
242     my $name = $self->{'import_xml'}->{'indexer'}->{$field}->{'name'};
243     $row->{'name'} = $name ? $self->_x($name) : $field;
244    
245     # post-sort all values in field
246     if ($self->{'import_xml'}->{'indexer'}->{$field}->{'sort'}) {
247     $log->warn("sort at field tag not implemented");
248     }
249    
250     push @ds, $row;
251    
252     $log->debug("row $field: ",sub { Dumper($row) });
253     }
254    
255     }
256    
257     if ($cache_file) {
258     store {
259     ds => \@ds,
260     current_filename => $self->{'current_filename'},
261     headline => $self->{'headline'},
262     }, $cache_file;
263     $log->debug("created storable cache file $cache_file");
264     }
265    
266     return @ds;
267    
268     }
269    
270     =head2 apply_format
271    
272     Apply format specified in tag with C<format_name="name"> and
273     C<format_delimiter=";;">.
274    
275     my $text = $webpac->apply_format($format_name,$format_delimiter,$data);
276    
277     Formats can contain C<lookup{...}> if you need them.
278    
279     =cut
280    
281     sub apply_format {
282     my $self = shift;
283    
284     my ($name,$delimiter,$data) = @_;
285    
286     my $log = $self->_get_logger();
287    
288     if (! $self->{'import_xml'}->{'format'}->{$name}) {
289     $log->warn("<format name=\"$name\"> is not defined in ",$self->{'import_xml_file'});
290     return $data;
291     }
292    
293     $log->warn("no delimiter for format $name") if (! $delimiter);
294    
295     my $format = $self->_x($self->{'import_xml'}->{'format'}->{$name}->{'content'}) || $log->logdie("can't find format '$name'");
296    
297     my @data = split(/\Q$delimiter\E/, $data);
298    
299     my $out = sprintf($format, @data);
300     $log->debug("using format $name [$format] on $data to produce: $out");
301    
302     if ($self->{'lookup_regex'} && $out =~ $self->{'lookup_regex'}) {
303     return $self->lookup($out);
304     } else {
305     return $out;
306     }
307    
308     }
309    
310     =head2 parse
311    
312     Perform smart parsing of string, skipping delimiters for fields which aren't
313     defined. It can also eval code in format starting with C<eval{...}> and
314     return output or nothing depending on eval code.
315    
316     my $text = $webpac->parse($rec,'eval{"v901^a" eq "Deskriptor"}descriptor: v250^a', $i);
317    
318     =cut
319    
320     sub parse {
321     my $self = shift;
322    
323     my ($rec, $format_utf8, $i) = @_;
324    
325     return if (! $format_utf8);
326    
327     my $log = $self->_get_logger();
328    
329     $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
330    
331     $i = 0 if (! $i);
332    
333     my $format = $self->_x($format_utf8) || $log->logconfess("can't convert '$format_utf8' from UTF-8 to ",$self->{'code_page'});
334    
335     my @out;
336    
337     $log->debug("format: $format");
338    
339     my $eval_code;
340     # remove eval{...} from beginning
341     $eval_code = $1 if ($format =~ s/^eval{([^}]+)}//s);
342    
343     my $filter_name;
344     # remove filter{...} from beginning
345     $filter_name = $1 if ($format =~ s/^filter{([^}]+)}//s);
346    
347     my $prefix;
348     my $all_found=0;
349    
350     while ($format =~ s/^(.*?)(v|s)(\d+)(?:\^(\w))?//s) {
351    
352     my $del = $1 || '';
353     $prefix ||= $del if ($all_found == 0);
354    
355     # repeatable index
356     my $r = $i;
357     $r = 0 if (lc("$2") eq 's');
358    
359     my $found = 0;
360     my $tmp = $self->get_data(\$rec,$3,$4,$r,\$found);
361    
362     if ($found) {
363     push @out, $del;
364     push @out, $tmp;
365     $all_found += $found;
366     }
367     }
368    
369     return if (! $all_found);
370    
371     my $out = join('',@out);
372    
373     if ($out) {
374     # add rest of format (suffix)
375     $out .= $format;
376    
377     # add prefix if not there
378     $out = $prefix . $out if ($out !~ m/^\Q$prefix\E/);
379    
380     $log->debug("result: $out");
381     }
382    
383     if ($eval_code) {
384     my $eval = $self->fill_in($rec,$eval_code,$i) || return;
385     $log->debug("about to eval{$eval} format: $out");
386     return if (! $self->_eval($eval));
387     }
388    
389     if ($filter_name && $self->{'filter'}->{$filter_name}) {
390     $log->debug("about to filter{$filter_name} format: $out");
391     $out = $self->{'filter'}->{$filter_name}->($out);
392     return unless(defined($out));
393     $log->debug("filter result: $out");
394     }
395    
396     return $out;
397     }
398    
399     =head2 parse_to_arr
400    
401     Similar to C<parse>, but returns array of all repeatable fields
402    
403     my @arr = $webpac->parse_to_arr($rec,'v250^a');
404    
405     =cut
406    
407     sub parse_to_arr {
408     my $self = shift;
409    
410     my ($rec, $format_utf8) = @_;
411    
412     my $log = $self->_get_logger();
413    
414     $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
415     return if (! $format_utf8);
416    
417     my $i = 0;
418     my @arr;
419    
420     while (my $v = $self->parse($rec,$format_utf8,$i++)) {
421     push @arr, $v;
422     }
423    
424     $log->debug("format '$format_utf8' returned ",--$i," elements: ", sub { join(" | ",@arr) }) if (@arr);
425    
426     return @arr;
427     }
428    
429     =head2 fill_in_to_arr
430    
431     Similar to C<fill_in>, but returns array of all repeatable fields. Usable
432     for fields which have lookups, so they shouldn't be parsed but rather
433     C<fill_id>ed.
434    
435     my @arr = $webpac->fill_in_to_arr($rec,'[v900];;[v250^a]');
436    
437     =cut
438    
439     sub fill_in_to_arr {
440     my $self = shift;
441    
442     my ($rec, $format_utf8) = @_;
443    
444     my $log = $self->_get_logger();
445    
446     $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
447     return if (! $format_utf8);
448    
449     my $i = 0;
450     my @arr;
451    
452     while (my @v = $self->fill_in($rec,$format_utf8,$i++)) {
453     push @arr, @v;
454     }
455    
456     $log->debug("format '$format_utf8' returned ",--$i," elements: ", sub { join(" | ",@arr) }) if (@arr);
457    
458     return @arr;
459     }
460    
461     =head2 sort_arr
462    
463     Sort array ignoring case and html in data
464    
465     my @sorted = $webpac->sort_arr(@unsorted);
466    
467     =cut
468    
469     sub sort_arr {
470     my $self = shift;
471    
472     my $log = $self->_get_logger();
473    
474     # FIXME add Schwartzian Transformation?
475    
476     my @sorted = sort {
477     $a =~ s#<[^>]+/*>##;
478     $b =~ s#<[^>]+/*>##;
479     lc($b) cmp lc($a)
480     } @_;
481     $log->debug("sorted values: ",sub { join(", ",@sorted) });
482    
483     return @sorted;
484     }
485    
486    
487     =head2 _sort_by_order
488    
489     Sort xml tags data structure accoding to C<order=""> attribute.
490    
491     =cut
492    
493     sub _sort_by_order {
494     my $self = shift;
495    
496     my $va = $self->{'import_xml'}->{'indexer'}->{$a}->{'order'} ||
497     $self->{'import_xml'}->{'indexer'}->{$a};
498     my $vb = $self->{'import_xml'}->{'indexer'}->{$b}->{'order'} ||
499     $self->{'import_xml'}->{'indexer'}->{$b};
500    
501     return $va <=> $vb;
502     }
503    
504     =head2 _x
505    
506     Convert strings from C<conf/normalize> encoding into application specific
507     (optinally specified using C<code_page> to C<new> constructor.
508    
509     my $text = $n->_x('normalize text string');
510    
511     This is a stub so that other modules doesn't have to implement it.
512    
513     =cut
514    
515     sub _x {
516     my $self = shift;
517     return shift;
518     }
519    
520    
521 dpavlin 10 =head1 AUTHOR
522    
523     Dobrica Pavlinusic, C<< <dpavlin@rot13.org> >>
524    
525     =head1 COPYRIGHT & LICENSE
526    
527     Copyright 2005 Dobrica Pavlinusic, All Rights Reserved.
528    
529     This program is free software; you can redistribute it and/or modify it
530     under the same terms as Perl itself.
531    
532     =cut
533    
534     1; # End of WebPAC::DB

  ViewVC Help
Powered by ViewVC 1.1.26