1 |
dpavlin |
968 |
package WebPAC::Input::XML; |
2 |
|
|
|
3 |
|
|
use warnings; |
4 |
|
|
use strict; |
5 |
|
|
|
6 |
|
|
use WebPAC::Input; |
7 |
|
|
use base qw/WebPAC::Common/; |
8 |
|
|
|
9 |
|
|
use XML::Simple; |
10 |
|
|
use File::Find; |
11 |
dpavlin |
989 |
use File::Slurp; |
12 |
dpavlin |
968 |
|
13 |
|
|
use Data::Dump qw/dump/; |
14 |
|
|
|
15 |
|
|
=head1 NAME |
16 |
|
|
|
17 |
|
|
WebPAC::Input::XML - support for reading XML files |
18 |
|
|
|
19 |
|
|
=cut |
20 |
|
|
|
21 |
dpavlin |
989 |
our $VERSION = '0.02'; |
22 |
dpavlin |
968 |
|
23 |
|
|
=head1 FUNCTIONS |
24 |
|
|
|
25 |
|
|
=head2 new |
26 |
|
|
|
27 |
|
|
Returns new low-level input API object |
28 |
|
|
|
29 |
|
|
my $input = new WebPAC::Input::XML( |
30 |
|
|
path => '/path/to/XML/records.txt' |
31 |
|
|
filter => sub { |
32 |
|
|
my ($l,$field_nr) = @_; |
33 |
|
|
# do something with $l which is line of input file |
34 |
|
|
return $l; |
35 |
|
|
}, |
36 |
dpavlin |
989 |
mungle => 'conf/mungle/xml-mungle.pl', |
37 |
dpavlin |
968 |
} |
38 |
|
|
|
39 |
|
|
Options: |
40 |
|
|
|
41 |
|
|
=over 4 |
42 |
|
|
|
43 |
|
|
=item path |
44 |
|
|
|
45 |
|
|
path to directory with xml files ending in C<.xml> |
46 |
|
|
|
47 |
dpavlin |
989 |
=item mungle |
48 |
|
|
|
49 |
|
|
path to perl data_structure mungler which will be called to pre-normalize |
50 |
|
|
hash produced by this module. |
51 |
|
|
|
52 |
|
|
It's ugly and souldn't be really here, but I didn't wanted to write separate |
53 |
|
|
input module for each possible XML in the face of the earth, and having perl |
54 |
|
|
power to transform hash is just... Best solution :-) |
55 |
|
|
|
56 |
dpavlin |
968 |
=back |
57 |
|
|
|
58 |
|
|
=cut |
59 |
|
|
|
60 |
|
|
sub new { |
61 |
|
|
my $class = shift; |
62 |
|
|
my $self = {@_}; |
63 |
|
|
bless($self, $class); |
64 |
|
|
|
65 |
|
|
my $arg = {@_}; |
66 |
|
|
|
67 |
dpavlin |
992 |
#warn "#### arg = ",dump( $arg ); |
68 |
dpavlin |
989 |
|
69 |
dpavlin |
968 |
my $log = $self->_get_logger(); |
70 |
|
|
|
71 |
|
|
$log->logdie("can't find path ", $arg->{path}, ": $!\n") unless -d $arg->{path}; |
72 |
|
|
|
73 |
|
|
$log->info("collecting xml files from ", $arg->{path}); |
74 |
|
|
|
75 |
|
|
my @files; |
76 |
|
|
|
77 |
|
|
find({ |
78 |
|
|
wanted => sub { |
79 |
|
|
my $path = $File::Find::name; |
80 |
|
|
return unless -f $path && $path =~ m/\.xml$/i; |
81 |
|
|
push @files, $path; |
82 |
|
|
}, |
83 |
|
|
follow => 1, |
84 |
|
|
}, $arg->{path} ); |
85 |
|
|
|
86 |
|
|
$log->info("found ", $#files + 1, " XML files in ", $arg->{path}); |
87 |
|
|
|
88 |
dpavlin |
984 |
$self->{_files} = [ sort @files ]; |
89 |
dpavlin |
968 |
|
90 |
dpavlin |
989 |
if ( my $path = $arg->{mungle} ) { |
91 |
|
|
$log->logdie("can't find $path: $!") unless -r $path; |
92 |
|
|
$log->info("using $path as mungle rules"); |
93 |
|
|
|
94 |
|
|
$self->{mungle_rules} = read_file( $path ) || $log->logdie("can't open $path: $!"); |
95 |
|
|
} |
96 |
|
|
|
97 |
dpavlin |
968 |
$self ? return $self : return undef; |
98 |
|
|
} |
99 |
|
|
|
100 |
|
|
=head2 fetch_rec |
101 |
|
|
|
102 |
|
|
Return record with ID C<$mfn> from database |
103 |
|
|
|
104 |
|
|
my $rec = $input->fetch_rec( $mfn, $filter_coderef ); |
105 |
|
|
|
106 |
|
|
=cut |
107 |
|
|
|
108 |
|
|
sub fetch_rec { |
109 |
|
|
my $self = shift; |
110 |
|
|
|
111 |
|
|
my ( $mfn, $filter_coderef ) = @_; |
112 |
|
|
|
113 |
|
|
my $path = $self->{_files}->[ $mfn - 1 ] || return; |
114 |
|
|
|
115 |
dpavlin |
970 |
my $log = $self->_get_logger(); |
116 |
dpavlin |
968 |
|
117 |
dpavlin |
989 |
our $xml = XMLin( |
118 |
dpavlin |
981 |
$path, |
119 |
|
|
# ForceArray => 1, |
120 |
|
|
# ForceContent => 1, |
121 |
dpavlin |
989 |
KeepRoot => 1, |
122 |
dpavlin |
981 |
# SuppressEmpty => 1, # '' undef |
123 |
dpavlin |
970 |
|
124 |
dpavlin |
981 |
) || $log->logdie("can't open $path: $!"); |
125 |
|
|
|
126 |
dpavlin |
984 |
$log->debug("fetch_rec( $mfn ) => $path => xml is ",sub { dump($xml) }); |
127 |
dpavlin |
970 |
|
128 |
dpavlin |
989 |
our $ds; |
129 |
|
|
|
130 |
|
|
if ( my $rules = $self->{mungle_rules} ) { |
131 |
|
|
|
132 |
|
|
sub get_ds { |
133 |
dpavlin |
992 |
# warn "### get_ds xml = ",dump($xml); |
134 |
dpavlin |
989 |
return $xml; |
135 |
|
|
} |
136 |
|
|
sub set_ds { |
137 |
|
|
my $hash = {@_}; |
138 |
dpavlin |
992 |
# warn "### set_ds hash = ",dump($hash); |
139 |
dpavlin |
989 |
foreach my $f ( keys %$hash ) { |
140 |
dpavlin |
992 |
# warn "+++ $f ", dump( $hash->{$f} ),"\n"; |
141 |
dpavlin |
989 |
$ds->{$f} = $hash->{$f}; |
142 |
|
|
} |
143 |
dpavlin |
992 |
# warn "### set_ds mungle_ds = ",dump($ds); |
144 |
dpavlin |
989 |
} |
145 |
|
|
eval "$rules"; |
146 |
|
|
$log->logdie("mungle rules $path error: $@") if $@; |
147 |
|
|
|
148 |
dpavlin |
992 |
# warn "### set_ds after mungle_rules ds = ",dump($ds); |
149 |
dpavlin |
989 |
} else { |
150 |
|
|
|
151 |
|
|
$ds = $xml; |
152 |
|
|
|
153 |
|
|
} |
154 |
|
|
|
155 |
dpavlin |
970 |
# add mfn |
156 |
dpavlin |
989 |
$ds->{'000'} = [ $mfn ]; |
157 |
dpavlin |
970 |
|
158 |
dpavlin |
989 |
return $ds; |
159 |
dpavlin |
968 |
} |
160 |
|
|
|
161 |
|
|
|
162 |
|
|
=head2 size |
163 |
|
|
|
164 |
|
|
Return number of records in database |
165 |
|
|
|
166 |
|
|
my $size = $input->size; |
167 |
|
|
|
168 |
|
|
=cut |
169 |
|
|
|
170 |
|
|
sub size { |
171 |
|
|
my $self = shift; |
172 |
|
|
return $#{$self->{_files}} + 1; |
173 |
|
|
} |
174 |
|
|
|
175 |
|
|
=head1 AUTHOR |
176 |
|
|
|
177 |
|
|
Dobrica Pavlinusic, C<< <dpavlin@rot13.org> >> |
178 |
|
|
|
179 |
|
|
=head1 COPYRIGHT & LICENSE |
180 |
|
|
|
181 |
|
|
Copyright 2007 Dobrica Pavlinusic, All Rights Reserved. |
182 |
|
|
|
183 |
|
|
This program is free software; you can redistribute it and/or modify it |
184 |
|
|
under the same terms as Perl itself. |
185 |
|
|
|
186 |
|
|
=cut |
187 |
|
|
|
188 |
|
|
1; |