/[webpac2]/trunk/lib/WebPAC/Normalize/XML.pm
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Contents of /trunk/lib/WebPAC/Normalize/XML.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 10 - (show annotations)
Sat Jul 16 20:35:30 2005 UTC (18 years, 9 months ago) by dpavlin
File size: 7579 byte(s)
ISIS input is finished, low_mem option has code (and not only documentation :-)

1 package WebPAC::Normalise::XML;
2
3 use warnings;
4 use strict;
5
6 use base qw/WebPAC::Common/;
7 use Storable;
8
9 =head1 NAME
10
11 WebPAC::Normalise::XML - apply XML normalisaton rules
12
13 =head1 VERSION
14
15 Version 0.01
16
17 =cut
18
19 our $VERSION = '0.01';
20
21 =head1 SYNOPSIS
22
23 This module uses C<conf/normalize/*.xml> files to perform normalisation
24 from input records
25
26 use WebPAC::Normalise::XML;
27
28 my $foo = WebPAC::Normalise::XML->new();
29 ...
30
31 =cut
32
33 # mapping between data type and tag which specify
34 # format in XML file
35 my %type2tag = (
36 'isis' => 'isis',
37 # 'excel' => 'column',
38 # 'marc' => 'marc',
39 # 'feed' => 'feed'
40 );
41
42
43 =head1 EXPORT
44
45 A list of functions that can be exported. You can delete this section
46 if you don't export anything, such as for a purely object-oriented module.
47
48 =head1 FUNCTIONS
49
50 =head2 new
51
52 Create new instance of WebPAC using configuration specified by C<config_file>.
53
54 my $n = new WebPAC::Normalize::XML(
55 cache_data_structure => './cache/ds/',
56 }
57
58 Optional parameter C<cache_data_structure> defines path to directory
59 in which cache file for C<data_structure> call will be created.
60
61 =cut
62
63 sub new {
64 my $class = shift;
65 my $self = {@_};
66 bless($self, $class);
67
68 $self->setup_cache_dir( $self->{'cache_data_structure'} );
69
70 return $self;
71 }
72
73 =head2 open_import_xml
74
75 Read file from C<import_xml/> directory and parse it.
76
77 $webpac->open_import_xml(type => 'isis');
78
79 =cut
80
81 sub open_import_xml {
82 my $self = shift;
83
84 my $log = $self->_get_logger();
85
86 my $arg = {@_};
87 $log->logconfess("need type to load file from import_xml/") if (! $arg->{'type'});
88
89 $self->{'type'} = $arg->{'type'};
90
91 my $type_base = $arg->{'type'};
92 $type_base =~ s/_.*$//g;
93
94 $self->{'tag'} = $type2tag{$type_base};
95
96 $log->info("using type '",$self->{'type'},"' tag <",$self->{'tag'},">");
97
98 my $f = "./import_xml/".$self->{'type'}.".xml";
99 $log->logconfess("import_xml file '$f' doesn't exist!") if (! -e "$f");
100
101 $log->info("reading '$f'");
102
103 $self->{'import_xml_file'} = $f;
104
105 $self->{'import_xml'} = XMLin($f,
106 ForceArray => [ $self->{'tag'}, 'config', 'format' ],
107 );
108
109 $log->debug("import xml is ",sub { Dumper($self->{'import_xml'}) });
110
111 }
112
113 =head2 setup_cache_dir
114
115 Check if specified cache directory exist, and if not, disable caching.
116
117 $setup_cache_dir('./cache/ds/');
118
119 If you pass false or zero value to this function, it will disable
120 cacheing.
121
122 =cut
123
124 sub setup_cache_dir {
125 my $self = shift;
126
127 my $dir = shift;
128
129 my $log = $self->_get_logger();
130
131 if ($dir) {
132 my $msg;
133 if (! -e $dir) {
134 $msg = "doesn't exist";
135 } elsif (! -d $dir) {
136 $msg = "is not directory";
137 } elsif (! -w $dir) {
138 $msg = "not writable";
139 }
140
141 if ($msg) {
142 undef $self->{'cache_data_structure'};
143 $log->warn("cache_data_structure $dir $msg, disabling...");
144 } else {
145 $log->debug("using cache dir $dir");
146 }
147 } else {
148 $log->debug("disabling cache");
149 undef $self->{'cache_data_structure'};
150 }
151 }
152
153
154 =head2 data_structure
155
156 Create in-memory data structure which represents layout from C<import_xml>.
157 It is used later to produce output.
158
159 my @ds = $webpac->data_structure($rec);
160
161 This method will also set C<$webpac->{'currnet_filename'}> if there is
162 <filename> tag in C<import_xml> and C<$webpac->{'headline'}> if there is
163 <headline> tag.
164
165 =cut
166
167 sub data_structure {
168 my $self = shift;
169
170 my $log = $self->_get_logger();
171
172 my $rec = shift;
173 $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
174
175 my $cache_file;
176
177 if (my $cache_path = $self->{'cache_data_structure'}) {
178 my $id = $rec->{'000'};
179 $id = $rec->{'000'}->[0] if ($id =~ m/^ARRAY/o);
180 unless (defined($id)) {
181 $log->warn("Can't use cache_data_structure on records without unique identifier in field 000");
182 undef $self->{'cache_data_structure'};
183 } else {
184 $cache_file = "$cache_path/$id";
185 if (-r $cache_file) {
186 my $ds_ref = retrieve($cache_file);
187 if ($ds_ref) {
188 $log->debug("cache hit: $cache_file");
189 my $ok = 1;
190 foreach my $f (qw(current_filename headline)) {
191 if ($ds_ref->{$f}) {
192 $self->{$f} = $ds_ref->{$f};
193 } else {
194 $ok = 0;
195 }
196 };
197 if ($ok && $ds_ref->{'ds'}) {
198 return @{ $ds_ref->{'ds'} };
199 } else {
200 $log->warn("cache_data_structure $cache_path corrupt. Use rm $cache_path/* to re-create it on next run!");
201 undef $self->{'cache_data_structure'};
202 }
203 }
204 }
205 }
206 }
207
208 undef $self->{'currnet_filename'};
209 undef $self->{'headline'};
210
211 my @sorted_tags;
212 if ($self->{tags_by_order}) {
213 @sorted_tags = @{$self->{tags_by_order}};
214 } else {
215 @sorted_tags = sort { $self->_sort_by_order } keys %{$self->{'import_xml'}->{'indexer'}};
216 $self->{tags_by_order} = \@sorted_tags;
217 }
218
219 my @ds;
220
221 $log->debug("tags: ",sub { join(", ",@sorted_tags) });
222
223 foreach my $field (@sorted_tags) {
224
225 my $row;
226
227 #print "field $field [",$self->{'tag'},"] = ",Dumper($self->{'import_xml'}->{'indexer'}->{$field}->{$self->{'tag'}});
228
229 foreach my $tag (@{$self->{'import_xml'}->{'indexer'}->{$field}->{$self->{'tag'}}}) {
230 my $format = $tag->{'value'} || $tag->{'content'};
231
232 $log->debug("format: $format");
233
234 my @v;
235 # FIXME this is a cludge!
236 if ($format =~ /$WebPAC::Lookup::LOOKUP_REGEX/o) {
237 @v = $self->fill_in_to_arr($rec,$format);
238 } else {
239 @v = $self->parse_to_arr($rec,$format);
240 }
241 next if (! @v);
242
243 if ($tag->{'sort'}) {
244 @v = $self->sort_arr(@v);
245 }
246
247 # use format?
248 if ($tag->{'format_name'}) {
249 @v = map { $self->apply_format($tag->{'format_name'},$tag->{'format_delimiter'},$_) } @v;
250 }
251
252 if ($field eq 'filename') {
253 $self->{'current_filename'} = join('',@v);
254 $log->debug("filename: ",$self->{'current_filename'});
255 } elsif ($field eq 'headline') {
256 $self->{'headline'} .= join('',@v);
257 $log->debug("headline: ",$self->{'headline'});
258 next; # don't return headline in data_structure!
259 }
260
261 # delimiter will join repeatable fields
262 if ($tag->{'delimiter'}) {
263 @v = ( join($tag->{'delimiter'}, @v) );
264 }
265
266 # default types
267 my @types = qw(display swish);
268 # override by type attribute
269 @types = ( $tag->{'type'} ) if ($tag->{'type'});
270
271 foreach my $type (@types) {
272 # append to previous line?
273 $log->debug("type: $type ",sub { join(" ",@v) }, $row->{'append'} || 'no append');
274 if ($tag->{'append'}) {
275
276 # I will delimit appended part with
277 # delimiter (or ,)
278 my $d = $tag->{'delimiter'};
279 # default delimiter
280 $d ||= " ";
281
282 my $last = pop @{$row->{$type}};
283 $d = "" if (! $last);
284 $last .= $d . join($d, @v);
285 push @{$row->{$type}}, $last;
286
287 } else {
288 push @{$row->{$type}}, @v;
289 }
290 }
291
292
293 }
294
295 if ($row) {
296 $row->{'tag'} = $field;
297
298 # TODO: name_sigular, name_plural
299 my $name = $self->{'import_xml'}->{'indexer'}->{$field}->{'name'};
300 $row->{'name'} = $name ? $self->_x($name) : $field;
301
302 # post-sort all values in field
303 if ($self->{'import_xml'}->{'indexer'}->{$field}->{'sort'}) {
304 $log->warn("sort at field tag not implemented");
305 }
306
307 push @ds, $row;
308
309 $log->debug("row $field: ",sub { Dumper($row) });
310 }
311
312 }
313
314 if ($cache_file) {
315 store {
316 ds => \@ds,
317 current_filename => $self->{'current_filename'},
318 headline => $self->{'headline'},
319 }, $cache_file;
320 $log->debug("created storable cache file $cache_file");
321 }
322
323 return @ds;
324
325 }
326
327
328 =head1 AUTHOR
329
330 Dobrica Pavlinusic, C<< <dpavlin@rot13.org> >>
331
332 =head1 COPYRIGHT & LICENSE
333
334 Copyright 2005 Dobrica Pavlinusic, All Rights Reserved.
335
336 This program is free software; you can redistribute it and/or modify it
337 under the same terms as Perl itself.
338
339 =cut
340
341 1; # End of WebPAC::Normalise::XML

  ViewVC Help
Powered by ViewVC 1.1.26