1 |
package WebPAC::Normalise::XML; |
package WebPAC::Normalize::XML; |
2 |
|
|
3 |
use warnings; |
use warnings; |
4 |
use strict; |
use strict; |
5 |
|
|
6 |
use base qw/WebPAC::Common/; |
use base qw/WebPAC::Common/; |
7 |
use Storable; |
use Storable; |
8 |
|
use XML::Simple; |
9 |
|
use Data::Dumper; |
10 |
|
|
11 |
=head1 NAME |
=head1 NAME |
12 |
|
|
13 |
WebPAC::Normalise::XML - apply XML normalisaton rules |
WebPAC::Normalize::XML - apply XML normalisaton rules |
14 |
|
|
15 |
=head1 VERSION |
=head1 VERSION |
16 |
|
|
25 |
This module uses C<conf/normalize/*.xml> files to perform normalisation |
This module uses C<conf/normalize/*.xml> files to perform normalisation |
26 |
from input records |
from input records |
27 |
|
|
|
use WebPAC::Normalise::XML; |
|
|
|
|
|
my $foo = WebPAC::Normalise::XML->new(); |
|
|
... |
|
|
|
|
28 |
=cut |
=cut |
29 |
|
|
|
# mapping between data type and tag which specify |
|
|
# format in XML file |
|
|
my %type2tag = ( |
|
|
'isis' => 'isis', |
|
|
# 'excel' => 'column', |
|
|
# 'marc' => 'marc', |
|
|
# 'feed' => 'feed' |
|
|
); |
|
|
|
|
|
|
|
|
=head1 EXPORT |
|
|
|
|
|
A list of functions that can be exported. You can delete this section |
|
|
if you don't export anything, such as for a purely object-oriented module. |
|
|
|
|
30 |
=head1 FUNCTIONS |
=head1 FUNCTIONS |
31 |
|
|
32 |
=head2 new |
=head2 new |
33 |
|
|
34 |
Create new instance of WebPAC using configuration specified by C<config_file>. |
Read normalisation rules defined using XML from C<conf/normalize/*.xml> and |
35 |
|
parse it. |
36 |
|
|
37 |
my $n = new WebPAC::Normalize::XML( |
my $n = new WebPAC::Normalize::XML( |
38 |
|
tag => 'isis', |
39 |
|
xml_file => '/path/to/conf/normalize/isis.xml', |
40 |
cache_data_structure => './cache/ds/', |
cache_data_structure => './cache/ds/', |
41 |
|
lookup_regex => $lookup->regex, |
42 |
} |
} |
43 |
|
|
44 |
|
C<tag> defines tag to use within C<xml_file> |
45 |
|
|
46 |
|
C<xml_file> defines path to normalize XML. |
47 |
|
|
48 |
Optional parameter C<cache_data_structure> defines path to directory |
Optional parameter C<cache_data_structure> defines path to directory |
49 |
in which cache file for C<data_structure> call will be created. |
in which cache file for C<data_structure> call will be created. |
50 |
|
|
51 |
|
Recommended parametar C<lookup_regex> specify ... |
52 |
|
|
53 |
=cut |
=cut |
54 |
|
|
55 |
sub new { |
sub new { |
59 |
|
|
60 |
$self->setup_cache_dir( $self->{'cache_data_structure'} ); |
$self->setup_cache_dir( $self->{'cache_data_structure'} ); |
61 |
|
|
|
return $self; |
|
|
} |
|
|
|
|
|
=head2 open_import_xml |
|
|
|
|
|
Read file from C<import_xml/> directory and parse it. |
|
|
|
|
|
$webpac->open_import_xml(type => 'isis'); |
|
|
|
|
|
=cut |
|
|
|
|
|
sub open_import_xml { |
|
|
my $self = shift; |
|
|
|
|
62 |
my $log = $self->_get_logger(); |
my $log = $self->_get_logger(); |
63 |
|
|
64 |
my $arg = {@_}; |
foreach my $req (qw/tag xml_file/) { |
65 |
$log->logconfess("need type to load file from import_xml/") if (! $arg->{'type'}); |
$log->logconfess("need argument $req") unless $self->{$req}; |
66 |
|
} |
|
$self->{'type'} = $arg->{'type'}; |
|
|
|
|
|
my $type_base = $arg->{'type'}; |
|
|
$type_base =~ s/_.*$//g; |
|
67 |
|
|
68 |
$self->{'tag'} = $type2tag{$type_base}; |
my $f = |
69 |
|
|
70 |
$log->info("using type '",$self->{'type'},"' tag <",$self->{'tag'},">"); |
my $xml_file = $self->{'xml_file'}; |
71 |
|
|
72 |
my $f = "./import_xml/".$self->{'type'}.".xml"; |
$log->info("using $xml_file tag <",$self->{'tag'},">"); |
|
$log->logconfess("import_xml file '$f' doesn't exist!") if (! -e "$f"); |
|
73 |
|
|
74 |
$log->info("reading '$f'"); |
$log->logdie("normalisation xml file '$xml_file' doesn't exist!") if (! -e $xml_file); |
75 |
|
|
76 |
$self->{'import_xml_file'} = $f; |
$self->{'import_xml_file'} = $xml_file; |
77 |
|
|
78 |
$self->{'import_xml'} = XMLin($f, |
$self->{'import_xml'} = XMLin($f, |
79 |
ForceArray => [ $self->{'tag'}, 'config', 'format' ], |
ForceArray => [ $self->{'tag'}, $self->{'tags'}, 'config', 'format' ], |
80 |
); |
); |
81 |
|
|
82 |
$log->debug("import xml is ",sub { Dumper($self->{'import_xml'}) }); |
$log->debug("import xml is ",sub { Dumper($self->{'import_xml'}) }); |
83 |
|
|
84 |
|
return $self; |
85 |
} |
} |
86 |
|
|
87 |
=head2 setup_cache_dir |
=head2 setup_cache_dir |
127 |
|
|
128 |
=head2 data_structure |
=head2 data_structure |
129 |
|
|
130 |
Create in-memory data structure which represents layout from C<import_xml>. |
Create in-memory data structure which represents normalized layout from |
131 |
It is used later to produce output. |
C<conf/normalize/*.xml>. |
132 |
|
|
133 |
|
This structures are used to produce output. |
134 |
|
|
135 |
my @ds = $webpac->data_structure($rec); |
my @ds = $webpac->data_structure($rec); |
136 |
|
|
137 |
This method will also set C<$webpac->{'currnet_filename'}> if there is |
B<Note: historical oddity follows> |
138 |
<filename> tag in C<import_xml> and C<$webpac->{'headline'}> if there is |
|
139 |
<headline> tag. |
This method will also set C<< $webpac->{'currnet_filename'} >> if there is |
140 |
|
C<< <filename> >> tag and C<< $webpac->{'headline'} >> if there is |
141 |
|
C<< <headline> >> tag. |
142 |
|
|
143 |
=cut |
=cut |
144 |
|
|
210 |
$log->debug("format: $format"); |
$log->debug("format: $format"); |
211 |
|
|
212 |
my @v; |
my @v; |
213 |
# FIXME this is a cludge! |
if ($self->{'lookup_regex'} && $format =~ $self->{'lookup_regex'}) { |
|
if ($format =~ /$WebPAC::Lookup::LOOKUP_REGEX/o) { |
|
214 |
@v = $self->fill_in_to_arr($rec,$format); |
@v = $self->fill_in_to_arr($rec,$format); |
215 |
} else { |
} else { |
216 |
@v = $self->parse_to_arr($rec,$format); |
@v = $self->parse_to_arr($rec,$format); |
301 |
|
|
302 |
} |
} |
303 |
|
|
304 |
|
=head2 apply_format |
305 |
|
|
306 |
|
Apply format specified in tag with C<format_name="name"> and |
307 |
|
C<format_delimiter=";;">. |
308 |
|
|
309 |
|
my $text = $webpac->apply_format($format_name,$format_delimiter,$data); |
310 |
|
|
311 |
|
Formats can contain C<lookup{...}> if you need them. |
312 |
|
|
313 |
|
=cut |
314 |
|
|
315 |
|
sub apply_format { |
316 |
|
my $self = shift; |
317 |
|
|
318 |
|
my ($name,$delimiter,$data) = @_; |
319 |
|
|
320 |
|
my $log = $self->_get_logger(); |
321 |
|
|
322 |
|
if (! $self->{'import_xml'}->{'format'}->{$name}) { |
323 |
|
$log->warn("<format name=\"$name\"> is not defined in ",$self->{'import_xml_file'}); |
324 |
|
return $data; |
325 |
|
} |
326 |
|
|
327 |
|
$log->warn("no delimiter for format $name") if (! $delimiter); |
328 |
|
|
329 |
|
my $format = $self->_x($self->{'import_xml'}->{'format'}->{$name}->{'content'}) || $log->logdie("can't find format '$name'"); |
330 |
|
|
331 |
|
my @data = split(/\Q$delimiter\E/, $data); |
332 |
|
|
333 |
|
my $out = sprintf($format, @data); |
334 |
|
$log->debug("using format $name [$format] on $data to produce: $out"); |
335 |
|
|
336 |
|
if ($self->{'lookup_regex'} && $out =~ $self->{'lookup_regex'}) { |
337 |
|
return $self->lookup($out); |
338 |
|
} else { |
339 |
|
return $out; |
340 |
|
} |
341 |
|
|
342 |
|
} |
343 |
|
|
344 |
|
|
345 |
=head1 AUTHOR |
=head1 AUTHOR |
346 |
|
|
355 |
|
|
356 |
=cut |
=cut |
357 |
|
|
358 |
1; # End of WebPAC::Normalise::XML |
1; # End of WebPAC::Normalize::XML |