/[webpac2]/trunk/lib/WebPAC/Normalize/XML.pm
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Annotation of /trunk/lib/WebPAC/Normalize/XML.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 8 - (hide annotations)
Sat Jul 16 16:48:35 2005 UTC (18 years, 9 months ago) by dpavlin
File size: 7520 byte(s)
little cleanup and first cut into WebPAC::Normalize::XML

1 dpavlin 8 package WebPAC::Normalise::XML;
2    
3     use warnings;
4     use strict;
5    
6     use base qw/WebPAC::Common/;
7    
8     =head1 NAME
9    
10     WebPAC::Normalise::XML - apply XML normalisaton rules
11    
12     =head1 VERSION
13    
14     Version 0.01
15    
16     =cut
17    
18     our $VERSION = '0.01';
19    
20     =head1 SYNOPSIS
21    
22     This module uses C<conf/normalize/*.xml> files to perform normalisation
23     from input records
24    
25     use WebPAC::Normalise::XML;
26    
27     my $foo = WebPAC::Normalise::XML->new();
28     ...
29    
30     =cut
31    
32     # mapping between data type and tag which specify
33     # format in XML file
34     my %type2tag = (
35     'isis' => 'isis',
36     # 'excel' => 'column',
37     # 'marc' => 'marc',
38     # 'feed' => 'feed'
39     );
40    
41    
42     =head1 EXPORT
43    
44     A list of functions that can be exported. You can delete this section
45     if you don't export anything, such as for a purely object-oriented module.
46    
47     =head1 FUNCTIONS
48    
49     =head2 new
50    
51     Create new instance of WebPAC using configuration specified by C<config_file>.
52    
53     my $n = new WebPAC::Normalize::XML(
54     cache_data_structure => './cache/ds/',
55     }
56    
57     Optional parameter C<cache_data_structure> defines path to directory
58     in which cache file for C<data_structure> call will be created.
59    
60     =cut
61    
62     sub new {
63     my $class = shift;
64     my $self = {@_};
65     bless($self, $class);
66    
67     $self->setup_cache_dir( $self->{'cache_data_structure'} );
68    
69     return $self;
70     }
71    
72     =head2 open_import_xml
73    
74     Read file from C<import_xml/> directory and parse it.
75    
76     $webpac->open_import_xml(type => 'isis');
77    
78     =cut
79    
80     sub open_import_xml {
81     my $self = shift;
82    
83     my $log = $self->_get_logger();
84    
85     my $arg = {@_};
86     $log->logconfess("need type to load file from import_xml/") if (! $arg->{'type'});
87    
88     $self->{'type'} = $arg->{'type'};
89    
90     my $type_base = $arg->{'type'};
91     $type_base =~ s/_.*$//g;
92    
93     $self->{'tag'} = $type2tag{$type_base};
94    
95     $log->info("using type '",$self->{'type'},"' tag <",$self->{'tag'},">");
96    
97     my $f = "./import_xml/".$self->{'type'}.".xml";
98     $log->logconfess("import_xml file '$f' doesn't exist!") if (! -e "$f");
99    
100     $log->info("reading '$f'");
101    
102     $self->{'import_xml_file'} = $f;
103    
104     $self->{'import_xml'} = XMLin($f,
105     ForceArray => [ $self->{'tag'}, 'config', 'format' ],
106     );
107    
108     $log->debug("import xml is ",sub { Dumper($self->{'import_xml'}) });
109    
110     }
111    
112     =head2 setup_cache_dir
113    
114     Check if specified cache directory exist, and if not, disable caching.
115    
116     $setup_cache_dir('./cache/ds/');
117    
118     If you pass false or zero value to this function, it will disable
119     cacheing.
120    
121     =cut
122    
123     sub setup_cache_dir {
124     my $self = shift;
125    
126     my $dir = shift;
127    
128     my $log = $self->_get_logger();
129    
130     if ($dir) {
131     my $msg;
132     if (! -e $dir) {
133     $msg = "doesn't exist";
134     } elsif (! -d $dir) {
135     $msg = "is not directory";
136     } elsif (! -w $dir) {
137     $msg = "not writable";
138     }
139    
140     if ($msg) {
141     undef $self->{'cache_data_structure'};
142     $log->warn("cache_data_structure $dir $msg, disabling...");
143     } else {
144     $log->debug("using cache dir $dir");
145     }
146     } else {
147     $log->debug("disabling cache");
148     undef $self->{'cache_data_structure'};
149     }
150     }
151    
152    
153     =head2 data_structure
154    
155     Create in-memory data structure which represents layout from C<import_xml>.
156     It is used later to produce output.
157    
158     my @ds = $webpac->data_structure($rec);
159    
160     This method will also set C<$webpac->{'currnet_filename'}> if there is
161     <filename> tag in C<import_xml> and C<$webpac->{'headline'}> if there is
162     <headline> tag.
163    
164     =cut
165    
166     sub data_structure {
167     my $self = shift;
168    
169     my $log = $self->_get_logger();
170    
171     my $rec = shift;
172     $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
173    
174     my $cache_file;
175    
176     if (my $cache_path = $self->{'cache_data_structure'}) {
177     my $id = $rec->{'000'};
178     $id = $rec->{'000'}->[0] if ($id =~ m/^ARRAY/o);
179     unless (defined($id)) {
180     $log->warn("Can't use cache_data_structure on records without unique identifier in field 000");
181     undef $self->{'cache_data_structure'};
182     } else {
183     $cache_file = "$cache_path/$id";
184     if (-r $cache_file) {
185     my $ds_ref = retrieve($cache_file);
186     if ($ds_ref) {
187     $log->debug("cache hit: $cache_file");
188     my $ok = 1;
189     foreach my $f (qw(current_filename headline)) {
190     if ($ds_ref->{$f}) {
191     $self->{$f} = $ds_ref->{$f};
192     } else {
193     $ok = 0;
194     }
195     };
196     if ($ok && $ds_ref->{'ds'}) {
197     return @{ $ds_ref->{'ds'} };
198     } else {
199     $log->warn("cache_data_structure $cache_path corrupt. Use rm $cache_path/* to re-create it on next run!");
200     undef $self->{'cache_data_structure'};
201     }
202     }
203     }
204     }
205     }
206    
207     undef $self->{'currnet_filename'};
208     undef $self->{'headline'};
209    
210     my @sorted_tags;
211     if ($self->{tags_by_order}) {
212     @sorted_tags = @{$self->{tags_by_order}};
213     } else {
214     @sorted_tags = sort { $self->_sort_by_order } keys %{$self->{'import_xml'}->{'indexer'}};
215     $self->{tags_by_order} = \@sorted_tags;
216     }
217    
218     my @ds;
219    
220     $log->debug("tags: ",sub { join(", ",@sorted_tags) });
221    
222     foreach my $field (@sorted_tags) {
223    
224     my $row;
225    
226     #print "field $field [",$self->{'tag'},"] = ",Dumper($self->{'import_xml'}->{'indexer'}->{$field}->{$self->{'tag'}});
227    
228     foreach my $tag (@{$self->{'import_xml'}->{'indexer'}->{$field}->{$self->{'tag'}}}) {
229     my $format = $tag->{'value'} || $tag->{'content'};
230    
231     $log->debug("format: $format");
232    
233     my @v;
234     if ($format =~ /$LOOKUP_REGEX/o) {
235     @v = $self->fill_in_to_arr($rec,$format);
236     } else {
237     @v = $self->parse_to_arr($rec,$format);
238     }
239     next if (! @v);
240    
241     if ($tag->{'sort'}) {
242     @v = $self->sort_arr(@v);
243     }
244    
245     # use format?
246     if ($tag->{'format_name'}) {
247     @v = map { $self->apply_format($tag->{'format_name'},$tag->{'format_delimiter'},$_) } @v;
248     }
249    
250     if ($field eq 'filename') {
251     $self->{'current_filename'} = join('',@v);
252     $log->debug("filename: ",$self->{'current_filename'});
253     } elsif ($field eq 'headline') {
254     $self->{'headline'} .= join('',@v);
255     $log->debug("headline: ",$self->{'headline'});
256     next; # don't return headline in data_structure!
257     }
258    
259     # delimiter will join repeatable fields
260     if ($tag->{'delimiter'}) {
261     @v = ( join($tag->{'delimiter'}, @v) );
262     }
263    
264     # default types
265     my @types = qw(display swish);
266     # override by type attribute
267     @types = ( $tag->{'type'} ) if ($tag->{'type'});
268    
269     foreach my $type (@types) {
270     # append to previous line?
271     $log->debug("type: $type ",sub { join(" ",@v) }, $row->{'append'} || 'no append');
272     if ($tag->{'append'}) {
273    
274     # I will delimit appended part with
275     # delimiter (or ,)
276     my $d = $tag->{'delimiter'};
277     # default delimiter
278     $d ||= " ";
279    
280     my $last = pop @{$row->{$type}};
281     $d = "" if (! $last);
282     $last .= $d . join($d, @v);
283     push @{$row->{$type}}, $last;
284    
285     } else {
286     push @{$row->{$type}}, @v;
287     }
288     }
289    
290    
291     }
292    
293     if ($row) {
294     $row->{'tag'} = $field;
295    
296     # TODO: name_sigular, name_plural
297     my $name = $self->{'import_xml'}->{'indexer'}->{$field}->{'name'};
298     $row->{'name'} = $name ? $self->_x($name) : $field;
299    
300     # post-sort all values in field
301     if ($self->{'import_xml'}->{'indexer'}->{$field}->{'sort'}) {
302     $log->warn("sort at field tag not implemented");
303     }
304    
305     push @ds, $row;
306    
307     $log->debug("row $field: ",sub { Dumper($row) });
308     }
309    
310     }
311    
312     if ($cache_file) {
313     store {
314     ds => \@ds,
315     current_filename => $self->{'current_filename'},
316     headline => $self->{'headline'},
317     }, $cache_file;
318     $log->debug("created storable cache file $cache_file");
319     }
320    
321     return @ds;
322    
323     }
324    
325    
326     =head1 AUTHOR
327    
328     Dobrica Pavlinusic, C<< <dpavlin@rot13.org> >>
329    
330     =head1 COPYRIGHT & LICENSE
331    
332     Copyright 2005 Dobrica Pavlinusic, All Rights Reserved.
333    
334     This program is free software; you can redistribute it and/or modify it
335     under the same terms as Perl itself.
336    
337     =cut
338    
339     1; # End of WebPAC::Normalise::XML

  ViewVC Help
Powered by ViewVC 1.1.26