/[webpac2]/trunk/lib/WebPAC/Normalize/XML.pm
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Annotation of /trunk/lib/WebPAC/Normalize/XML.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 10 - (hide annotations)
Sat Jul 16 20:35:30 2005 UTC (18 years, 9 months ago) by dpavlin
File size: 7579 byte(s)
ISIS input is finished, low_mem option has code (and not only documentation :-)

1 dpavlin 8 package WebPAC::Normalise::XML;
2    
3     use warnings;
4     use strict;
5    
6     use base qw/WebPAC::Common/;
7 dpavlin 10 use Storable;
8 dpavlin 8
9     =head1 NAME
10    
11     WebPAC::Normalise::XML - apply XML normalisaton rules
12    
13     =head1 VERSION
14    
15     Version 0.01
16    
17     =cut
18    
19     our $VERSION = '0.01';
20    
21     =head1 SYNOPSIS
22    
23     This module uses C<conf/normalize/*.xml> files to perform normalisation
24     from input records
25    
26     use WebPAC::Normalise::XML;
27    
28     my $foo = WebPAC::Normalise::XML->new();
29     ...
30    
31     =cut
32    
33     # mapping between data type and tag which specify
34     # format in XML file
35     my %type2tag = (
36     'isis' => 'isis',
37     # 'excel' => 'column',
38     # 'marc' => 'marc',
39     # 'feed' => 'feed'
40     );
41    
42    
43     =head1 EXPORT
44    
45     A list of functions that can be exported. You can delete this section
46     if you don't export anything, such as for a purely object-oriented module.
47    
48     =head1 FUNCTIONS
49    
50     =head2 new
51    
52     Create new instance of WebPAC using configuration specified by C<config_file>.
53    
54     my $n = new WebPAC::Normalize::XML(
55     cache_data_structure => './cache/ds/',
56     }
57    
58     Optional parameter C<cache_data_structure> defines path to directory
59     in which cache file for C<data_structure> call will be created.
60    
61     =cut
62    
63     sub new {
64     my $class = shift;
65     my $self = {@_};
66     bless($self, $class);
67    
68     $self->setup_cache_dir( $self->{'cache_data_structure'} );
69    
70     return $self;
71     }
72    
73     =head2 open_import_xml
74    
75     Read file from C<import_xml/> directory and parse it.
76    
77     $webpac->open_import_xml(type => 'isis');
78    
79     =cut
80    
81     sub open_import_xml {
82     my $self = shift;
83    
84     my $log = $self->_get_logger();
85    
86     my $arg = {@_};
87     $log->logconfess("need type to load file from import_xml/") if (! $arg->{'type'});
88    
89     $self->{'type'} = $arg->{'type'};
90    
91     my $type_base = $arg->{'type'};
92     $type_base =~ s/_.*$//g;
93    
94     $self->{'tag'} = $type2tag{$type_base};
95    
96     $log->info("using type '",$self->{'type'},"' tag <",$self->{'tag'},">");
97    
98     my $f = "./import_xml/".$self->{'type'}.".xml";
99     $log->logconfess("import_xml file '$f' doesn't exist!") if (! -e "$f");
100    
101     $log->info("reading '$f'");
102    
103     $self->{'import_xml_file'} = $f;
104    
105     $self->{'import_xml'} = XMLin($f,
106     ForceArray => [ $self->{'tag'}, 'config', 'format' ],
107     );
108    
109     $log->debug("import xml is ",sub { Dumper($self->{'import_xml'}) });
110    
111     }
112    
113     =head2 setup_cache_dir
114    
115     Check if specified cache directory exist, and if not, disable caching.
116    
117     $setup_cache_dir('./cache/ds/');
118    
119     If you pass false or zero value to this function, it will disable
120     cacheing.
121    
122     =cut
123    
124     sub setup_cache_dir {
125     my $self = shift;
126    
127     my $dir = shift;
128    
129     my $log = $self->_get_logger();
130    
131     if ($dir) {
132     my $msg;
133     if (! -e $dir) {
134     $msg = "doesn't exist";
135     } elsif (! -d $dir) {
136     $msg = "is not directory";
137     } elsif (! -w $dir) {
138     $msg = "not writable";
139     }
140    
141     if ($msg) {
142     undef $self->{'cache_data_structure'};
143     $log->warn("cache_data_structure $dir $msg, disabling...");
144     } else {
145     $log->debug("using cache dir $dir");
146     }
147     } else {
148     $log->debug("disabling cache");
149     undef $self->{'cache_data_structure'};
150     }
151     }
152    
153    
154     =head2 data_structure
155    
156     Create in-memory data structure which represents layout from C<import_xml>.
157     It is used later to produce output.
158    
159     my @ds = $webpac->data_structure($rec);
160    
161     This method will also set C<$webpac->{'currnet_filename'}> if there is
162     <filename> tag in C<import_xml> and C<$webpac->{'headline'}> if there is
163     <headline> tag.
164    
165     =cut
166    
167     sub data_structure {
168     my $self = shift;
169    
170     my $log = $self->_get_logger();
171    
172     my $rec = shift;
173     $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
174    
175     my $cache_file;
176    
177     if (my $cache_path = $self->{'cache_data_structure'}) {
178     my $id = $rec->{'000'};
179     $id = $rec->{'000'}->[0] if ($id =~ m/^ARRAY/o);
180     unless (defined($id)) {
181     $log->warn("Can't use cache_data_structure on records without unique identifier in field 000");
182     undef $self->{'cache_data_structure'};
183     } else {
184     $cache_file = "$cache_path/$id";
185     if (-r $cache_file) {
186     my $ds_ref = retrieve($cache_file);
187     if ($ds_ref) {
188     $log->debug("cache hit: $cache_file");
189     my $ok = 1;
190     foreach my $f (qw(current_filename headline)) {
191     if ($ds_ref->{$f}) {
192     $self->{$f} = $ds_ref->{$f};
193     } else {
194     $ok = 0;
195     }
196     };
197     if ($ok && $ds_ref->{'ds'}) {
198     return @{ $ds_ref->{'ds'} };
199     } else {
200     $log->warn("cache_data_structure $cache_path corrupt. Use rm $cache_path/* to re-create it on next run!");
201     undef $self->{'cache_data_structure'};
202     }
203     }
204     }
205     }
206     }
207    
208     undef $self->{'currnet_filename'};
209     undef $self->{'headline'};
210    
211     my @sorted_tags;
212     if ($self->{tags_by_order}) {
213     @sorted_tags = @{$self->{tags_by_order}};
214     } else {
215     @sorted_tags = sort { $self->_sort_by_order } keys %{$self->{'import_xml'}->{'indexer'}};
216     $self->{tags_by_order} = \@sorted_tags;
217     }
218    
219     my @ds;
220    
221     $log->debug("tags: ",sub { join(", ",@sorted_tags) });
222    
223     foreach my $field (@sorted_tags) {
224    
225     my $row;
226    
227     #print "field $field [",$self->{'tag'},"] = ",Dumper($self->{'import_xml'}->{'indexer'}->{$field}->{$self->{'tag'}});
228    
229     foreach my $tag (@{$self->{'import_xml'}->{'indexer'}->{$field}->{$self->{'tag'}}}) {
230     my $format = $tag->{'value'} || $tag->{'content'};
231    
232     $log->debug("format: $format");
233    
234     my @v;
235 dpavlin 10 # FIXME this is a cludge!
236     if ($format =~ /$WebPAC::Lookup::LOOKUP_REGEX/o) {
237 dpavlin 8 @v = $self->fill_in_to_arr($rec,$format);
238     } else {
239     @v = $self->parse_to_arr($rec,$format);
240     }
241     next if (! @v);
242    
243     if ($tag->{'sort'}) {
244     @v = $self->sort_arr(@v);
245     }
246    
247     # use format?
248     if ($tag->{'format_name'}) {
249     @v = map { $self->apply_format($tag->{'format_name'},$tag->{'format_delimiter'},$_) } @v;
250     }
251    
252     if ($field eq 'filename') {
253     $self->{'current_filename'} = join('',@v);
254     $log->debug("filename: ",$self->{'current_filename'});
255     } elsif ($field eq 'headline') {
256     $self->{'headline'} .= join('',@v);
257     $log->debug("headline: ",$self->{'headline'});
258     next; # don't return headline in data_structure!
259     }
260    
261     # delimiter will join repeatable fields
262     if ($tag->{'delimiter'}) {
263     @v = ( join($tag->{'delimiter'}, @v) );
264     }
265    
266     # default types
267     my @types = qw(display swish);
268     # override by type attribute
269     @types = ( $tag->{'type'} ) if ($tag->{'type'});
270    
271     foreach my $type (@types) {
272     # append to previous line?
273     $log->debug("type: $type ",sub { join(" ",@v) }, $row->{'append'} || 'no append');
274     if ($tag->{'append'}) {
275    
276     # I will delimit appended part with
277     # delimiter (or ,)
278     my $d = $tag->{'delimiter'};
279     # default delimiter
280     $d ||= " ";
281    
282     my $last = pop @{$row->{$type}};
283     $d = "" if (! $last);
284     $last .= $d . join($d, @v);
285     push @{$row->{$type}}, $last;
286    
287     } else {
288     push @{$row->{$type}}, @v;
289     }
290     }
291    
292    
293     }
294    
295     if ($row) {
296     $row->{'tag'} = $field;
297    
298     # TODO: name_sigular, name_plural
299     my $name = $self->{'import_xml'}->{'indexer'}->{$field}->{'name'};
300     $row->{'name'} = $name ? $self->_x($name) : $field;
301    
302     # post-sort all values in field
303     if ($self->{'import_xml'}->{'indexer'}->{$field}->{'sort'}) {
304     $log->warn("sort at field tag not implemented");
305     }
306    
307     push @ds, $row;
308    
309     $log->debug("row $field: ",sub { Dumper($row) });
310     }
311    
312     }
313    
314     if ($cache_file) {
315     store {
316     ds => \@ds,
317     current_filename => $self->{'current_filename'},
318     headline => $self->{'headline'},
319     }, $cache_file;
320     $log->debug("created storable cache file $cache_file");
321     }
322    
323     return @ds;
324    
325     }
326    
327    
328     =head1 AUTHOR
329    
330     Dobrica Pavlinusic, C<< <dpavlin@rot13.org> >>
331    
332     =head1 COPYRIGHT & LICENSE
333    
334     Copyright 2005 Dobrica Pavlinusic, All Rights Reserved.
335    
336     This program is free software; you can redistribute it and/or modify it
337     under the same terms as Perl itself.
338    
339     =cut
340    
341     1; # End of WebPAC::Normalise::XML

  ViewVC Help
Powered by ViewVC 1.1.26