/[webpac2]/trunk/lib/WebPAC/Normalize/XML.pm
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Contents of /trunk/lib/WebPAC/Normalize/XML.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 8 - (show annotations)
Sat Jul 16 16:48:35 2005 UTC (18 years, 9 months ago) by dpavlin
File size: 7520 byte(s)
little cleanup and first cut into WebPAC::Normalize::XML

1 package WebPAC::Normalise::XML;
2
3 use warnings;
4 use strict;
5
6 use base qw/WebPAC::Common/;
7
8 =head1 NAME
9
10 WebPAC::Normalise::XML - apply XML normalisaton rules
11
12 =head1 VERSION
13
14 Version 0.01
15
16 =cut
17
18 our $VERSION = '0.01';
19
20 =head1 SYNOPSIS
21
22 This module uses C<conf/normalize/*.xml> files to perform normalisation
23 from input records
24
25 use WebPAC::Normalise::XML;
26
27 my $foo = WebPAC::Normalise::XML->new();
28 ...
29
30 =cut
31
32 # mapping between data type and tag which specify
33 # format in XML file
34 my %type2tag = (
35 'isis' => 'isis',
36 # 'excel' => 'column',
37 # 'marc' => 'marc',
38 # 'feed' => 'feed'
39 );
40
41
42 =head1 EXPORT
43
44 A list of functions that can be exported. You can delete this section
45 if you don't export anything, such as for a purely object-oriented module.
46
47 =head1 FUNCTIONS
48
49 =head2 new
50
51 Create new instance of WebPAC using configuration specified by C<config_file>.
52
53 my $n = new WebPAC::Normalize::XML(
54 cache_data_structure => './cache/ds/',
55 }
56
57 Optional parameter C<cache_data_structure> defines path to directory
58 in which cache file for C<data_structure> call will be created.
59
60 =cut
61
62 sub new {
63 my $class = shift;
64 my $self = {@_};
65 bless($self, $class);
66
67 $self->setup_cache_dir( $self->{'cache_data_structure'} );
68
69 return $self;
70 }
71
72 =head2 open_import_xml
73
74 Read file from C<import_xml/> directory and parse it.
75
76 $webpac->open_import_xml(type => 'isis');
77
78 =cut
79
80 sub open_import_xml {
81 my $self = shift;
82
83 my $log = $self->_get_logger();
84
85 my $arg = {@_};
86 $log->logconfess("need type to load file from import_xml/") if (! $arg->{'type'});
87
88 $self->{'type'} = $arg->{'type'};
89
90 my $type_base = $arg->{'type'};
91 $type_base =~ s/_.*$//g;
92
93 $self->{'tag'} = $type2tag{$type_base};
94
95 $log->info("using type '",$self->{'type'},"' tag <",$self->{'tag'},">");
96
97 my $f = "./import_xml/".$self->{'type'}.".xml";
98 $log->logconfess("import_xml file '$f' doesn't exist!") if (! -e "$f");
99
100 $log->info("reading '$f'");
101
102 $self->{'import_xml_file'} = $f;
103
104 $self->{'import_xml'} = XMLin($f,
105 ForceArray => [ $self->{'tag'}, 'config', 'format' ],
106 );
107
108 $log->debug("import xml is ",sub { Dumper($self->{'import_xml'}) });
109
110 }
111
112 =head2 setup_cache_dir
113
114 Check if specified cache directory exist, and if not, disable caching.
115
116 $setup_cache_dir('./cache/ds/');
117
118 If you pass false or zero value to this function, it will disable
119 cacheing.
120
121 =cut
122
123 sub setup_cache_dir {
124 my $self = shift;
125
126 my $dir = shift;
127
128 my $log = $self->_get_logger();
129
130 if ($dir) {
131 my $msg;
132 if (! -e $dir) {
133 $msg = "doesn't exist";
134 } elsif (! -d $dir) {
135 $msg = "is not directory";
136 } elsif (! -w $dir) {
137 $msg = "not writable";
138 }
139
140 if ($msg) {
141 undef $self->{'cache_data_structure'};
142 $log->warn("cache_data_structure $dir $msg, disabling...");
143 } else {
144 $log->debug("using cache dir $dir");
145 }
146 } else {
147 $log->debug("disabling cache");
148 undef $self->{'cache_data_structure'};
149 }
150 }
151
152
153 =head2 data_structure
154
155 Create in-memory data structure which represents layout from C<import_xml>.
156 It is used later to produce output.
157
158 my @ds = $webpac->data_structure($rec);
159
160 This method will also set C<$webpac->{'currnet_filename'}> if there is
161 <filename> tag in C<import_xml> and C<$webpac->{'headline'}> if there is
162 <headline> tag.
163
164 =cut
165
166 sub data_structure {
167 my $self = shift;
168
169 my $log = $self->_get_logger();
170
171 my $rec = shift;
172 $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
173
174 my $cache_file;
175
176 if (my $cache_path = $self->{'cache_data_structure'}) {
177 my $id = $rec->{'000'};
178 $id = $rec->{'000'}->[0] if ($id =~ m/^ARRAY/o);
179 unless (defined($id)) {
180 $log->warn("Can't use cache_data_structure on records without unique identifier in field 000");
181 undef $self->{'cache_data_structure'};
182 } else {
183 $cache_file = "$cache_path/$id";
184 if (-r $cache_file) {
185 my $ds_ref = retrieve($cache_file);
186 if ($ds_ref) {
187 $log->debug("cache hit: $cache_file");
188 my $ok = 1;
189 foreach my $f (qw(current_filename headline)) {
190 if ($ds_ref->{$f}) {
191 $self->{$f} = $ds_ref->{$f};
192 } else {
193 $ok = 0;
194 }
195 };
196 if ($ok && $ds_ref->{'ds'}) {
197 return @{ $ds_ref->{'ds'} };
198 } else {
199 $log->warn("cache_data_structure $cache_path corrupt. Use rm $cache_path/* to re-create it on next run!");
200 undef $self->{'cache_data_structure'};
201 }
202 }
203 }
204 }
205 }
206
207 undef $self->{'currnet_filename'};
208 undef $self->{'headline'};
209
210 my @sorted_tags;
211 if ($self->{tags_by_order}) {
212 @sorted_tags = @{$self->{tags_by_order}};
213 } else {
214 @sorted_tags = sort { $self->_sort_by_order } keys %{$self->{'import_xml'}->{'indexer'}};
215 $self->{tags_by_order} = \@sorted_tags;
216 }
217
218 my @ds;
219
220 $log->debug("tags: ",sub { join(", ",@sorted_tags) });
221
222 foreach my $field (@sorted_tags) {
223
224 my $row;
225
226 #print "field $field [",$self->{'tag'},"] = ",Dumper($self->{'import_xml'}->{'indexer'}->{$field}->{$self->{'tag'}});
227
228 foreach my $tag (@{$self->{'import_xml'}->{'indexer'}->{$field}->{$self->{'tag'}}}) {
229 my $format = $tag->{'value'} || $tag->{'content'};
230
231 $log->debug("format: $format");
232
233 my @v;
234 if ($format =~ /$LOOKUP_REGEX/o) {
235 @v = $self->fill_in_to_arr($rec,$format);
236 } else {
237 @v = $self->parse_to_arr($rec,$format);
238 }
239 next if (! @v);
240
241 if ($tag->{'sort'}) {
242 @v = $self->sort_arr(@v);
243 }
244
245 # use format?
246 if ($tag->{'format_name'}) {
247 @v = map { $self->apply_format($tag->{'format_name'},$tag->{'format_delimiter'},$_) } @v;
248 }
249
250 if ($field eq 'filename') {
251 $self->{'current_filename'} = join('',@v);
252 $log->debug("filename: ",$self->{'current_filename'});
253 } elsif ($field eq 'headline') {
254 $self->{'headline'} .= join('',@v);
255 $log->debug("headline: ",$self->{'headline'});
256 next; # don't return headline in data_structure!
257 }
258
259 # delimiter will join repeatable fields
260 if ($tag->{'delimiter'}) {
261 @v = ( join($tag->{'delimiter'}, @v) );
262 }
263
264 # default types
265 my @types = qw(display swish);
266 # override by type attribute
267 @types = ( $tag->{'type'} ) if ($tag->{'type'});
268
269 foreach my $type (@types) {
270 # append to previous line?
271 $log->debug("type: $type ",sub { join(" ",@v) }, $row->{'append'} || 'no append');
272 if ($tag->{'append'}) {
273
274 # I will delimit appended part with
275 # delimiter (or ,)
276 my $d = $tag->{'delimiter'};
277 # default delimiter
278 $d ||= " ";
279
280 my $last = pop @{$row->{$type}};
281 $d = "" if (! $last);
282 $last .= $d . join($d, @v);
283 push @{$row->{$type}}, $last;
284
285 } else {
286 push @{$row->{$type}}, @v;
287 }
288 }
289
290
291 }
292
293 if ($row) {
294 $row->{'tag'} = $field;
295
296 # TODO: name_sigular, name_plural
297 my $name = $self->{'import_xml'}->{'indexer'}->{$field}->{'name'};
298 $row->{'name'} = $name ? $self->_x($name) : $field;
299
300 # post-sort all values in field
301 if ($self->{'import_xml'}->{'indexer'}->{$field}->{'sort'}) {
302 $log->warn("sort at field tag not implemented");
303 }
304
305 push @ds, $row;
306
307 $log->debug("row $field: ",sub { Dumper($row) });
308 }
309
310 }
311
312 if ($cache_file) {
313 store {
314 ds => \@ds,
315 current_filename => $self->{'current_filename'},
316 headline => $self->{'headline'},
317 }, $cache_file;
318 $log->debug("created storable cache file $cache_file");
319 }
320
321 return @ds;
322
323 }
324
325
326 =head1 AUTHOR
327
328 Dobrica Pavlinusic, C<< <dpavlin@rot13.org> >>
329
330 =head1 COPYRIGHT & LICENSE
331
332 Copyright 2005 Dobrica Pavlinusic, All Rights Reserved.
333
334 This program is free software; you can redistribute it and/or modify it
335 under the same terms as Perl itself.
336
337 =cut
338
339 1; # End of WebPAC::Normalise::XML

  ViewVC Help
Powered by ViewVC 1.1.26