/[webpac2]/trunk/lib/WebPAC/Normalize/XML.pm
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Contents of /trunk/lib/WebPAC/Normalize/XML.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 12 - (show annotations)
Sat Jul 16 22:57:26 2005 UTC (18 years, 9 months ago) by dpavlin
File size: 8171 byte(s)
improvements to WebPAC::Normalize::XML

1 package WebPAC::Normalize::XML;
2
3 use warnings;
4 use strict;
5
6 use base qw/WebPAC::Common/;
7 use Storable;
8 use XML::Simple;
9 use Data::Dumper;
10
11 =head1 NAME
12
13 WebPAC::Normalize::XML - apply XML normalisaton rules
14
15 =head1 VERSION
16
17 Version 0.01
18
19 =cut
20
21 our $VERSION = '0.01';
22
23 =head1 SYNOPSIS
24
25 This module uses C<conf/normalize/*.xml> files to perform normalisation
26 from input records
27
28 =cut
29
30 =head1 FUNCTIONS
31
32 =head2 new
33
34 Read normalisation rules defined using XML from C<conf/normalize/*.xml> and
35 parse it.
36
37 my $n = new WebPAC::Normalize::XML(
38 tag => 'isis',
39 xml_file => '/path/to/conf/normalize/isis.xml',
40 cache_data_structure => './cache/ds/',
41 lookup_regex => $lookup->regex,
42 }
43
44 C<tag> defines tag to use within C<xml_file>
45
46 C<xml_file> defines path to normalize XML.
47
48 Optional parameter C<cache_data_structure> defines path to directory
49 in which cache file for C<data_structure> call will be created.
50
51 Recommended parametar C<lookup_regex> specify ...
52
53 =cut
54
55 sub new {
56 my $class = shift;
57 my $self = {@_};
58 bless($self, $class);
59
60 $self->setup_cache_dir( $self->{'cache_data_structure'} );
61
62 my $log = $self->_get_logger();
63
64 foreach my $req (qw/tag xml_file/) {
65 $log->logconfess("need argument $req") unless $self->{$req};
66 }
67
68 my $f =
69
70 my $xml_file = $self->{'xml_file'};
71
72 $log->info("using $xml_file tag <",$self->{'tag'},">");
73
74 $log->logdie("normalisation xml file '$xml_file' doesn't exist!") if (! -e $xml_file);
75
76 $self->{'import_xml_file'} = $xml_file;
77
78 $self->{'import_xml'} = XMLin($f,
79 ForceArray => [ $self->{'tag'}, $self->{'tags'}, 'config', 'format' ],
80 );
81
82 $log->debug("import xml is ",sub { Dumper($self->{'import_xml'}) });
83
84 return $self;
85 }
86
87 =head2 setup_cache_dir
88
89 Check if specified cache directory exist, and if not, disable caching.
90
91 $setup_cache_dir('./cache/ds/');
92
93 If you pass false or zero value to this function, it will disable
94 cacheing.
95
96 =cut
97
98 sub setup_cache_dir {
99 my $self = shift;
100
101 my $dir = shift;
102
103 my $log = $self->_get_logger();
104
105 if ($dir) {
106 my $msg;
107 if (! -e $dir) {
108 $msg = "doesn't exist";
109 } elsif (! -d $dir) {
110 $msg = "is not directory";
111 } elsif (! -w $dir) {
112 $msg = "not writable";
113 }
114
115 if ($msg) {
116 undef $self->{'cache_data_structure'};
117 $log->warn("cache_data_structure $dir $msg, disabling...");
118 } else {
119 $log->debug("using cache dir $dir");
120 }
121 } else {
122 $log->debug("disabling cache");
123 undef $self->{'cache_data_structure'};
124 }
125 }
126
127
128 =head2 data_structure
129
130 Create in-memory data structure which represents normalized layout from
131 C<conf/normalize/*.xml>.
132
133 This structures are used to produce output.
134
135 my @ds = $webpac->data_structure($rec);
136
137 B<Note: historical oddity follows>
138
139 This method will also set C<< $webpac->{'currnet_filename'} >> if there is
140 C<< <filename> >> tag and C<< $webpac->{'headline'} >> if there is
141 C<< <headline> >> tag.
142
143 =cut
144
145 sub data_structure {
146 my $self = shift;
147
148 my $log = $self->_get_logger();
149
150 my $rec = shift;
151 $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
152
153 my $cache_file;
154
155 if (my $cache_path = $self->{'cache_data_structure'}) {
156 my $id = $rec->{'000'};
157 $id = $rec->{'000'}->[0] if ($id =~ m/^ARRAY/o);
158 unless (defined($id)) {
159 $log->warn("Can't use cache_data_structure on records without unique identifier in field 000");
160 undef $self->{'cache_data_structure'};
161 } else {
162 $cache_file = "$cache_path/$id";
163 if (-r $cache_file) {
164 my $ds_ref = retrieve($cache_file);
165 if ($ds_ref) {
166 $log->debug("cache hit: $cache_file");
167 my $ok = 1;
168 foreach my $f (qw(current_filename headline)) {
169 if ($ds_ref->{$f}) {
170 $self->{$f} = $ds_ref->{$f};
171 } else {
172 $ok = 0;
173 }
174 };
175 if ($ok && $ds_ref->{'ds'}) {
176 return @{ $ds_ref->{'ds'} };
177 } else {
178 $log->warn("cache_data_structure $cache_path corrupt. Use rm $cache_path/* to re-create it on next run!");
179 undef $self->{'cache_data_structure'};
180 }
181 }
182 }
183 }
184 }
185
186 undef $self->{'currnet_filename'};
187 undef $self->{'headline'};
188
189 my @sorted_tags;
190 if ($self->{tags_by_order}) {
191 @sorted_tags = @{$self->{tags_by_order}};
192 } else {
193 @sorted_tags = sort { $self->_sort_by_order } keys %{$self->{'import_xml'}->{'indexer'}};
194 $self->{tags_by_order} = \@sorted_tags;
195 }
196
197 my @ds;
198
199 $log->debug("tags: ",sub { join(", ",@sorted_tags) });
200
201 foreach my $field (@sorted_tags) {
202
203 my $row;
204
205 #print "field $field [",$self->{'tag'},"] = ",Dumper($self->{'import_xml'}->{'indexer'}->{$field}->{$self->{'tag'}});
206
207 foreach my $tag (@{$self->{'import_xml'}->{'indexer'}->{$field}->{$self->{'tag'}}}) {
208 my $format = $tag->{'value'} || $tag->{'content'};
209
210 $log->debug("format: $format");
211
212 my @v;
213 if ($self->{'lookup_regex'} && $format =~ $self->{'lookup_regex'}) {
214 @v = $self->fill_in_to_arr($rec,$format);
215 } else {
216 @v = $self->parse_to_arr($rec,$format);
217 }
218 next if (! @v);
219
220 if ($tag->{'sort'}) {
221 @v = $self->sort_arr(@v);
222 }
223
224 # use format?
225 if ($tag->{'format_name'}) {
226 @v = map { $self->apply_format($tag->{'format_name'},$tag->{'format_delimiter'},$_) } @v;
227 }
228
229 if ($field eq 'filename') {
230 $self->{'current_filename'} = join('',@v);
231 $log->debug("filename: ",$self->{'current_filename'});
232 } elsif ($field eq 'headline') {
233 $self->{'headline'} .= join('',@v);
234 $log->debug("headline: ",$self->{'headline'});
235 next; # don't return headline in data_structure!
236 }
237
238 # delimiter will join repeatable fields
239 if ($tag->{'delimiter'}) {
240 @v = ( join($tag->{'delimiter'}, @v) );
241 }
242
243 # default types
244 my @types = qw(display swish);
245 # override by type attribute
246 @types = ( $tag->{'type'} ) if ($tag->{'type'});
247
248 foreach my $type (@types) {
249 # append to previous line?
250 $log->debug("type: $type ",sub { join(" ",@v) }, $row->{'append'} || 'no append');
251 if ($tag->{'append'}) {
252
253 # I will delimit appended part with
254 # delimiter (or ,)
255 my $d = $tag->{'delimiter'};
256 # default delimiter
257 $d ||= " ";
258
259 my $last = pop @{$row->{$type}};
260 $d = "" if (! $last);
261 $last .= $d . join($d, @v);
262 push @{$row->{$type}}, $last;
263
264 } else {
265 push @{$row->{$type}}, @v;
266 }
267 }
268
269
270 }
271
272 if ($row) {
273 $row->{'tag'} = $field;
274
275 # TODO: name_sigular, name_plural
276 my $name = $self->{'import_xml'}->{'indexer'}->{$field}->{'name'};
277 $row->{'name'} = $name ? $self->_x($name) : $field;
278
279 # post-sort all values in field
280 if ($self->{'import_xml'}->{'indexer'}->{$field}->{'sort'}) {
281 $log->warn("sort at field tag not implemented");
282 }
283
284 push @ds, $row;
285
286 $log->debug("row $field: ",sub { Dumper($row) });
287 }
288
289 }
290
291 if ($cache_file) {
292 store {
293 ds => \@ds,
294 current_filename => $self->{'current_filename'},
295 headline => $self->{'headline'},
296 }, $cache_file;
297 $log->debug("created storable cache file $cache_file");
298 }
299
300 return @ds;
301
302 }
303
304 =head2 apply_format
305
306 Apply format specified in tag with C<format_name="name"> and
307 C<format_delimiter=";;">.
308
309 my $text = $webpac->apply_format($format_name,$format_delimiter,$data);
310
311 Formats can contain C<lookup{...}> if you need them.
312
313 =cut
314
315 sub apply_format {
316 my $self = shift;
317
318 my ($name,$delimiter,$data) = @_;
319
320 my $log = $self->_get_logger();
321
322 if (! $self->{'import_xml'}->{'format'}->{$name}) {
323 $log->warn("<format name=\"$name\"> is not defined in ",$self->{'import_xml_file'});
324 return $data;
325 }
326
327 $log->warn("no delimiter for format $name") if (! $delimiter);
328
329 my $format = $self->_x($self->{'import_xml'}->{'format'}->{$name}->{'content'}) || $log->logdie("can't find format '$name'");
330
331 my @data = split(/\Q$delimiter\E/, $data);
332
333 my $out = sprintf($format, @data);
334 $log->debug("using format $name [$format] on $data to produce: $out");
335
336 if ($self->{'lookup_regex'} && $out =~ $self->{'lookup_regex'}) {
337 return $self->lookup($out);
338 } else {
339 return $out;
340 }
341
342 }
343
344
345 =head1 AUTHOR
346
347 Dobrica Pavlinusic, C<< <dpavlin@rot13.org> >>
348
349 =head1 COPYRIGHT & LICENSE
350
351 Copyright 2005 Dobrica Pavlinusic, All Rights Reserved.
352
353 This program is free software; you can redistribute it and/or modify it
354 under the same terms as Perl itself.
355
356 =cut
357
358 1; # End of WebPAC::Normalize::XML

  ViewVC Help
Powered by ViewVC 1.1.26