/[webpac2]/trunk/lib/WebPAC/Normalize/XML.pm
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Annotation of /trunk/lib/WebPAC/Normalize/XML.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 12 - (hide annotations)
Sat Jul 16 22:57:26 2005 UTC (18 years, 9 months ago) by dpavlin
File size: 8171 byte(s)
improvements to WebPAC::Normalize::XML

1 dpavlin 12 package WebPAC::Normalize::XML;
2 dpavlin 8
3     use warnings;
4     use strict;
5    
6     use base qw/WebPAC::Common/;
7 dpavlin 10 use Storable;
8 dpavlin 12 use XML::Simple;
9     use Data::Dumper;
10 dpavlin 8
11     =head1 NAME
12    
13 dpavlin 12 WebPAC::Normalize::XML - apply XML normalisaton rules
14 dpavlin 8
15     =head1 VERSION
16    
17     Version 0.01
18    
19     =cut
20    
21     our $VERSION = '0.01';
22    
23     =head1 SYNOPSIS
24    
25     This module uses C<conf/normalize/*.xml> files to perform normalisation
26     from input records
27    
28     =cut
29    
30     =head1 FUNCTIONS
31    
32     =head2 new
33    
34 dpavlin 12 Read normalisation rules defined using XML from C<conf/normalize/*.xml> and
35     parse it.
36 dpavlin 8
37     my $n = new WebPAC::Normalize::XML(
38 dpavlin 12 tag => 'isis',
39     xml_file => '/path/to/conf/normalize/isis.xml',
40 dpavlin 8 cache_data_structure => './cache/ds/',
41 dpavlin 12 lookup_regex => $lookup->regex,
42 dpavlin 8 }
43    
44 dpavlin 12 C<tag> defines tag to use within C<xml_file>
45    
46     C<xml_file> defines path to normalize XML.
47    
48 dpavlin 8 Optional parameter C<cache_data_structure> defines path to directory
49     in which cache file for C<data_structure> call will be created.
50    
51 dpavlin 12 Recommended parametar C<lookup_regex> specify ...
52    
53 dpavlin 8 =cut
54    
55     sub new {
56     my $class = shift;
57     my $self = {@_};
58     bless($self, $class);
59    
60     $self->setup_cache_dir( $self->{'cache_data_structure'} );
61    
62     my $log = $self->_get_logger();
63    
64 dpavlin 12 foreach my $req (qw/tag xml_file/) {
65     $log->logconfess("need argument $req") unless $self->{$req};
66     }
67 dpavlin 8
68 dpavlin 12 my $f =
69 dpavlin 8
70 dpavlin 12 my $xml_file = $self->{'xml_file'};
71 dpavlin 8
72 dpavlin 12 $log->info("using $xml_file tag <",$self->{'tag'},">");
73 dpavlin 8
74 dpavlin 12 $log->logdie("normalisation xml file '$xml_file' doesn't exist!") if (! -e $xml_file);
75 dpavlin 8
76 dpavlin 12 $self->{'import_xml_file'} = $xml_file;
77 dpavlin 8
78     $self->{'import_xml'} = XMLin($f,
79 dpavlin 12 ForceArray => [ $self->{'tag'}, $self->{'tags'}, 'config', 'format' ],
80 dpavlin 8 );
81    
82     $log->debug("import xml is ",sub { Dumper($self->{'import_xml'}) });
83    
84 dpavlin 12 return $self;
85 dpavlin 8 }
86    
87     =head2 setup_cache_dir
88    
89     Check if specified cache directory exist, and if not, disable caching.
90    
91     $setup_cache_dir('./cache/ds/');
92    
93     If you pass false or zero value to this function, it will disable
94     cacheing.
95    
96     =cut
97    
98     sub setup_cache_dir {
99     my $self = shift;
100    
101     my $dir = shift;
102    
103     my $log = $self->_get_logger();
104    
105     if ($dir) {
106     my $msg;
107     if (! -e $dir) {
108     $msg = "doesn't exist";
109     } elsif (! -d $dir) {
110     $msg = "is not directory";
111     } elsif (! -w $dir) {
112     $msg = "not writable";
113     }
114    
115     if ($msg) {
116     undef $self->{'cache_data_structure'};
117     $log->warn("cache_data_structure $dir $msg, disabling...");
118     } else {
119     $log->debug("using cache dir $dir");
120     }
121     } else {
122     $log->debug("disabling cache");
123     undef $self->{'cache_data_structure'};
124     }
125     }
126    
127    
128     =head2 data_structure
129    
130 dpavlin 12 Create in-memory data structure which represents normalized layout from
131     C<conf/normalize/*.xml>.
132 dpavlin 8
133 dpavlin 12 This structures are used to produce output.
134    
135 dpavlin 8 my @ds = $webpac->data_structure($rec);
136    
137 dpavlin 12 B<Note: historical oddity follows>
138 dpavlin 8
139 dpavlin 12 This method will also set C<< $webpac->{'currnet_filename'} >> if there is
140     C<< <filename> >> tag and C<< $webpac->{'headline'} >> if there is
141     C<< <headline> >> tag.
142    
143 dpavlin 8 =cut
144    
145     sub data_structure {
146     my $self = shift;
147    
148     my $log = $self->_get_logger();
149    
150     my $rec = shift;
151     $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
152    
153     my $cache_file;
154    
155     if (my $cache_path = $self->{'cache_data_structure'}) {
156     my $id = $rec->{'000'};
157     $id = $rec->{'000'}->[0] if ($id =~ m/^ARRAY/o);
158     unless (defined($id)) {
159     $log->warn("Can't use cache_data_structure on records without unique identifier in field 000");
160     undef $self->{'cache_data_structure'};
161     } else {
162     $cache_file = "$cache_path/$id";
163     if (-r $cache_file) {
164     my $ds_ref = retrieve($cache_file);
165     if ($ds_ref) {
166     $log->debug("cache hit: $cache_file");
167     my $ok = 1;
168     foreach my $f (qw(current_filename headline)) {
169     if ($ds_ref->{$f}) {
170     $self->{$f} = $ds_ref->{$f};
171     } else {
172     $ok = 0;
173     }
174     };
175     if ($ok && $ds_ref->{'ds'}) {
176     return @{ $ds_ref->{'ds'} };
177     } else {
178     $log->warn("cache_data_structure $cache_path corrupt. Use rm $cache_path/* to re-create it on next run!");
179     undef $self->{'cache_data_structure'};
180     }
181     }
182     }
183     }
184     }
185    
186     undef $self->{'currnet_filename'};
187     undef $self->{'headline'};
188    
189     my @sorted_tags;
190     if ($self->{tags_by_order}) {
191     @sorted_tags = @{$self->{tags_by_order}};
192     } else {
193     @sorted_tags = sort { $self->_sort_by_order } keys %{$self->{'import_xml'}->{'indexer'}};
194     $self->{tags_by_order} = \@sorted_tags;
195     }
196    
197     my @ds;
198    
199     $log->debug("tags: ",sub { join(", ",@sorted_tags) });
200    
201     foreach my $field (@sorted_tags) {
202    
203     my $row;
204    
205     #print "field $field [",$self->{'tag'},"] = ",Dumper($self->{'import_xml'}->{'indexer'}->{$field}->{$self->{'tag'}});
206    
207     foreach my $tag (@{$self->{'import_xml'}->{'indexer'}->{$field}->{$self->{'tag'}}}) {
208     my $format = $tag->{'value'} || $tag->{'content'};
209    
210     $log->debug("format: $format");
211    
212     my @v;
213 dpavlin 12 if ($self->{'lookup_regex'} && $format =~ $self->{'lookup_regex'}) {
214 dpavlin 8 @v = $self->fill_in_to_arr($rec,$format);
215     } else {
216     @v = $self->parse_to_arr($rec,$format);
217     }
218     next if (! @v);
219    
220     if ($tag->{'sort'}) {
221     @v = $self->sort_arr(@v);
222     }
223    
224     # use format?
225     if ($tag->{'format_name'}) {
226     @v = map { $self->apply_format($tag->{'format_name'},$tag->{'format_delimiter'},$_) } @v;
227     }
228    
229     if ($field eq 'filename') {
230     $self->{'current_filename'} = join('',@v);
231     $log->debug("filename: ",$self->{'current_filename'});
232     } elsif ($field eq 'headline') {
233     $self->{'headline'} .= join('',@v);
234     $log->debug("headline: ",$self->{'headline'});
235     next; # don't return headline in data_structure!
236     }
237    
238     # delimiter will join repeatable fields
239     if ($tag->{'delimiter'}) {
240     @v = ( join($tag->{'delimiter'}, @v) );
241     }
242    
243     # default types
244     my @types = qw(display swish);
245     # override by type attribute
246     @types = ( $tag->{'type'} ) if ($tag->{'type'});
247    
248     foreach my $type (@types) {
249     # append to previous line?
250     $log->debug("type: $type ",sub { join(" ",@v) }, $row->{'append'} || 'no append');
251     if ($tag->{'append'}) {
252    
253     # I will delimit appended part with
254     # delimiter (or ,)
255     my $d = $tag->{'delimiter'};
256     # default delimiter
257     $d ||= " ";
258    
259     my $last = pop @{$row->{$type}};
260     $d = "" if (! $last);
261     $last .= $d . join($d, @v);
262     push @{$row->{$type}}, $last;
263    
264     } else {
265     push @{$row->{$type}}, @v;
266     }
267     }
268    
269    
270     }
271    
272     if ($row) {
273     $row->{'tag'} = $field;
274    
275     # TODO: name_sigular, name_plural
276     my $name = $self->{'import_xml'}->{'indexer'}->{$field}->{'name'};
277     $row->{'name'} = $name ? $self->_x($name) : $field;
278    
279     # post-sort all values in field
280     if ($self->{'import_xml'}->{'indexer'}->{$field}->{'sort'}) {
281     $log->warn("sort at field tag not implemented");
282     }
283    
284     push @ds, $row;
285    
286     $log->debug("row $field: ",sub { Dumper($row) });
287     }
288    
289     }
290    
291     if ($cache_file) {
292     store {
293     ds => \@ds,
294     current_filename => $self->{'current_filename'},
295     headline => $self->{'headline'},
296     }, $cache_file;
297     $log->debug("created storable cache file $cache_file");
298     }
299    
300     return @ds;
301    
302     }
303    
304 dpavlin 12 =head2 apply_format
305 dpavlin 8
306 dpavlin 12 Apply format specified in tag with C<format_name="name"> and
307     C<format_delimiter=";;">.
308    
309     my $text = $webpac->apply_format($format_name,$format_delimiter,$data);
310    
311     Formats can contain C<lookup{...}> if you need them.
312    
313     =cut
314    
315     sub apply_format {
316     my $self = shift;
317    
318     my ($name,$delimiter,$data) = @_;
319    
320     my $log = $self->_get_logger();
321    
322     if (! $self->{'import_xml'}->{'format'}->{$name}) {
323     $log->warn("<format name=\"$name\"> is not defined in ",$self->{'import_xml_file'});
324     return $data;
325     }
326    
327     $log->warn("no delimiter for format $name") if (! $delimiter);
328    
329     my $format = $self->_x($self->{'import_xml'}->{'format'}->{$name}->{'content'}) || $log->logdie("can't find format '$name'");
330    
331     my @data = split(/\Q$delimiter\E/, $data);
332    
333     my $out = sprintf($format, @data);
334     $log->debug("using format $name [$format] on $data to produce: $out");
335    
336     if ($self->{'lookup_regex'} && $out =~ $self->{'lookup_regex'}) {
337     return $self->lookup($out);
338     } else {
339     return $out;
340     }
341    
342     }
343    
344    
345 dpavlin 8 =head1 AUTHOR
346    
347     Dobrica Pavlinusic, C<< <dpavlin@rot13.org> >>
348    
349     =head1 COPYRIGHT & LICENSE
350    
351     Copyright 2005 Dobrica Pavlinusic, All Rights Reserved.
352    
353     This program is free software; you can redistribute it and/or modify it
354     under the same terms as Perl itself.
355    
356     =cut
357    
358 dpavlin 12 1; # End of WebPAC::Normalize::XML

  ViewVC Help
Powered by ViewVC 1.1.26