1 |
dpavlin |
968 |
package WebPAC::Input::XML; |
2 |
|
|
|
3 |
|
|
use warnings; |
4 |
|
|
use strict; |
5 |
|
|
|
6 |
|
|
use WebPAC::Input; |
7 |
|
|
use base qw/WebPAC::Common/; |
8 |
|
|
|
9 |
|
|
use XML::Simple; |
10 |
|
|
use File::Find; |
11 |
dpavlin |
989 |
use File::Slurp; |
12 |
dpavlin |
968 |
|
13 |
|
|
use Data::Dump qw/dump/; |
14 |
|
|
|
15 |
|
|
=head1 NAME |
16 |
|
|
|
17 |
|
|
WebPAC::Input::XML - support for reading XML files |
18 |
|
|
|
19 |
|
|
=cut |
20 |
|
|
|
21 |
dpavlin |
989 |
our $VERSION = '0.02'; |
22 |
dpavlin |
968 |
|
23 |
|
|
=head1 FUNCTIONS |
24 |
|
|
|
25 |
|
|
=head2 new |
26 |
|
|
|
27 |
|
|
Returns new low-level input API object |
28 |
|
|
|
29 |
|
|
my $input = new WebPAC::Input::XML( |
30 |
|
|
path => '/path/to/XML/records.txt' |
31 |
|
|
filter => sub { |
32 |
|
|
my ($l,$field_nr) = @_; |
33 |
|
|
# do something with $l which is line of input file |
34 |
|
|
return $l; |
35 |
|
|
}, |
36 |
dpavlin |
989 |
mungle => 'conf/mungle/xml-mungle.pl', |
37 |
dpavlin |
968 |
} |
38 |
|
|
|
39 |
|
|
Options: |
40 |
|
|
|
41 |
|
|
=over 4 |
42 |
|
|
|
43 |
|
|
=item path |
44 |
|
|
|
45 |
|
|
path to directory with xml files ending in C<.xml> |
46 |
|
|
|
47 |
dpavlin |
989 |
=item mungle |
48 |
|
|
|
49 |
|
|
path to perl data_structure mungler which will be called to pre-normalize |
50 |
|
|
hash produced by this module. |
51 |
|
|
|
52 |
|
|
It's ugly and souldn't be really here, but I didn't wanted to write separate |
53 |
|
|
input module for each possible XML in the face of the earth, and having perl |
54 |
|
|
power to transform hash is just... Best solution :-) |
55 |
|
|
|
56 |
dpavlin |
968 |
=back |
57 |
|
|
|
58 |
|
|
=cut |
59 |
|
|
|
60 |
|
|
sub new { |
61 |
|
|
my $class = shift; |
62 |
|
|
my $self = {@_}; |
63 |
|
|
bless($self, $class); |
64 |
|
|
|
65 |
|
|
my $arg = {@_}; |
66 |
|
|
|
67 |
dpavlin |
992 |
#warn "#### arg = ",dump( $arg ); |
68 |
dpavlin |
989 |
|
69 |
dpavlin |
968 |
my $log = $self->_get_logger(); |
70 |
|
|
|
71 |
|
|
$log->logdie("can't find path ", $arg->{path}, ": $!\n") unless -d $arg->{path}; |
72 |
|
|
|
73 |
|
|
$log->info("collecting xml files from ", $arg->{path}); |
74 |
|
|
|
75 |
|
|
my @files; |
76 |
|
|
|
77 |
|
|
find({ |
78 |
|
|
wanted => sub { |
79 |
|
|
my $path = $File::Find::name; |
80 |
|
|
return unless -f $path && $path =~ m/\.xml$/i; |
81 |
|
|
push @files, $path; |
82 |
|
|
}, |
83 |
|
|
follow => 1, |
84 |
|
|
}, $arg->{path} ); |
85 |
|
|
|
86 |
|
|
$log->info("found ", $#files + 1, " XML files in ", $arg->{path}); |
87 |
|
|
|
88 |
dpavlin |
984 |
$self->{_files} = [ sort @files ]; |
89 |
dpavlin |
968 |
|
90 |
dpavlin |
989 |
if ( my $path = $arg->{mungle} ) { |
91 |
|
|
$log->logdie("can't find $path: $!") unless -r $path; |
92 |
|
|
$log->info("using $path as mungle rules"); |
93 |
|
|
|
94 |
|
|
$self->{mungle_rules} = read_file( $path ) || $log->logdie("can't open $path: $!"); |
95 |
|
|
} |
96 |
|
|
|
97 |
dpavlin |
968 |
$self ? return $self : return undef; |
98 |
|
|
} |
99 |
|
|
|
100 |
|
|
=head2 fetch_rec |
101 |
|
|
|
102 |
|
|
Return record with ID C<$mfn> from database |
103 |
|
|
|
104 |
|
|
my $rec = $input->fetch_rec( $mfn, $filter_coderef ); |
105 |
|
|
|
106 |
|
|
=cut |
107 |
|
|
|
108 |
|
|
sub fetch_rec { |
109 |
|
|
my $self = shift; |
110 |
|
|
|
111 |
|
|
my ( $mfn, $filter_coderef ) = @_; |
112 |
|
|
|
113 |
|
|
my $path = $self->{_files}->[ $mfn - 1 ] || return; |
114 |
|
|
|
115 |
dpavlin |
970 |
my $log = $self->_get_logger(); |
116 |
dpavlin |
968 |
|
117 |
dpavlin |
998 |
our $xml; |
118 |
|
|
eval { |
119 |
|
|
$xml = XMLin( |
120 |
|
|
$path, |
121 |
|
|
# ForceArray => 1, |
122 |
|
|
# ForceContent => 1, |
123 |
|
|
KeepRoot => 1, |
124 |
|
|
# SuppressEmpty => 1, # '' undef |
125 |
|
|
) || die "can't open $path: $!"; |
126 |
|
|
}; |
127 |
dpavlin |
970 |
|
128 |
dpavlin |
998 |
if ( $@ ) { |
129 |
|
|
$log->error("$@"); |
130 |
|
|
return; |
131 |
|
|
} |
132 |
dpavlin |
981 |
|
133 |
dpavlin |
984 |
$log->debug("fetch_rec( $mfn ) => $path => xml is ",sub { dump($xml) }); |
134 |
dpavlin |
970 |
|
135 |
dpavlin |
989 |
our $ds; |
136 |
|
|
|
137 |
|
|
if ( my $rules = $self->{mungle_rules} ) { |
138 |
|
|
|
139 |
|
|
sub get_ds { |
140 |
dpavlin |
992 |
# warn "### get_ds xml = ",dump($xml); |
141 |
dpavlin |
989 |
return $xml; |
142 |
|
|
} |
143 |
|
|
sub set_ds { |
144 |
|
|
my $hash = {@_}; |
145 |
dpavlin |
992 |
# warn "### set_ds hash = ",dump($hash); |
146 |
dpavlin |
989 |
foreach my $f ( keys %$hash ) { |
147 |
dpavlin |
992 |
# warn "+++ $f ", dump( $hash->{$f} ),"\n"; |
148 |
dpavlin |
989 |
$ds->{$f} = $hash->{$f}; |
149 |
|
|
} |
150 |
dpavlin |
992 |
# warn "### set_ds mungle_ds = ",dump($ds); |
151 |
dpavlin |
989 |
} |
152 |
|
|
eval "$rules"; |
153 |
|
|
$log->logdie("mungle rules $path error: $@") if $@; |
154 |
|
|
|
155 |
dpavlin |
992 |
# warn "### set_ds after mungle_rules ds = ",dump($ds); |
156 |
dpavlin |
989 |
} else { |
157 |
|
|
|
158 |
|
|
$ds = $xml; |
159 |
|
|
|
160 |
|
|
} |
161 |
|
|
|
162 |
dpavlin |
970 |
# add mfn |
163 |
dpavlin |
989 |
$ds->{'000'} = [ $mfn ]; |
164 |
dpavlin |
970 |
|
165 |
dpavlin |
989 |
return $ds; |
166 |
dpavlin |
968 |
} |
167 |
|
|
|
168 |
|
|
|
169 |
|
|
=head2 size |
170 |
|
|
|
171 |
|
|
Return number of records in database |
172 |
|
|
|
173 |
|
|
my $size = $input->size; |
174 |
|
|
|
175 |
|
|
=cut |
176 |
|
|
|
177 |
|
|
sub size { |
178 |
|
|
my $self = shift; |
179 |
|
|
return $#{$self->{_files}} + 1; |
180 |
|
|
} |
181 |
|
|
|
182 |
|
|
=head1 AUTHOR |
183 |
|
|
|
184 |
|
|
Dobrica Pavlinusic, C<< <dpavlin@rot13.org> >> |
185 |
|
|
|
186 |
|
|
=head1 COPYRIGHT & LICENSE |
187 |
|
|
|
188 |
|
|
Copyright 2007 Dobrica Pavlinusic, All Rights Reserved. |
189 |
|
|
|
190 |
|
|
This program is free software; you can redistribute it and/or modify it |
191 |
|
|
under the same terms as Perl itself. |
192 |
|
|
|
193 |
|
|
=cut |
194 |
|
|
|
195 |
|
|
1; |