1 |
dpavlin |
1054 |
package WebPAC::Input::PDF; |
2 |
|
|
|
3 |
|
|
use warnings; |
4 |
|
|
use strict; |
5 |
|
|
|
6 |
|
|
use WebPAC::Input; |
7 |
|
|
use base qw/WebPAC::Common/; |
8 |
|
|
|
9 |
|
|
use CAM::PDF; |
10 |
|
|
use Carp qw/confess/; |
11 |
|
|
|
12 |
|
|
use Data::Dump qw/dump/; |
13 |
|
|
|
14 |
|
|
=head1 NAME |
15 |
|
|
|
16 |
|
|
WebPAC::Input::PDF - try to parse PDF tabular data |
17 |
|
|
|
18 |
|
|
=head1 SYNOPSIS |
19 |
|
|
|
20 |
dpavlin |
1063 |
Open PDF file |
21 |
dpavlin |
1054 |
|
22 |
|
|
my $input = new WebPAC::Input::PDF( |
23 |
|
|
path => '/path/to/file.pdf', |
24 |
|
|
); |
25 |
|
|
|
26 |
|
|
=head1 FUNCTIONS |
27 |
|
|
|
28 |
|
|
=head2 new |
29 |
|
|
|
30 |
|
|
Returns new low-level input API object |
31 |
|
|
|
32 |
|
|
my $input = new WebPAC::Input::PDF( |
33 |
|
|
path => '/path/to/file.pdf' |
34 |
|
|
filter => sub { |
35 |
|
|
my ($l,$field_nr) = @_; |
36 |
|
|
# do something with $l which is line of input file |
37 |
|
|
return $l; |
38 |
|
|
}, |
39 |
|
|
} |
40 |
|
|
|
41 |
|
|
Options: |
42 |
|
|
|
43 |
|
|
=over 4 |
44 |
|
|
|
45 |
|
|
=item path |
46 |
|
|
|
47 |
|
|
path to PDF file |
48 |
|
|
|
49 |
|
|
=back |
50 |
|
|
|
51 |
|
|
=cut |
52 |
|
|
|
53 |
|
|
my $verbose = 1; |
54 |
|
|
|
55 |
|
|
sub new { |
56 |
|
|
my $class = shift; |
57 |
|
|
my $self = {@_}; |
58 |
|
|
bless($self, $class); |
59 |
|
|
|
60 |
|
|
my $arg = {@_}; |
61 |
|
|
|
62 |
|
|
my $log = $self->_get_logger(); |
63 |
|
|
|
64 |
|
|
my $file = $arg->{path} || $log->logide("need path"); |
65 |
|
|
|
66 |
|
|
my $doc = CAM::PDF->new($file) || $log->logdie( $CAM::PDF::errstr ); |
67 |
|
|
|
68 |
|
|
my $pages = $doc->numPages(); |
69 |
|
|
|
70 |
|
|
$log->info("opend $file with $pages pages"); |
71 |
|
|
|
72 |
dpavlin |
1057 |
my @lines = (); |
73 |
dpavlin |
1054 |
|
74 |
|
|
foreach my $p ( 1 .. $pages ) { |
75 |
|
|
my $tree = $doc->getPageContentTree($p); |
76 |
|
|
if ($tree) { |
77 |
|
|
my $out; |
78 |
|
|
|
79 |
|
|
confess "expect array for blocks" unless ref($tree->{blocks}) eq 'ARRAY'; |
80 |
|
|
|
81 |
|
|
foreach my $blocks ( @{ $tree->{blocks} } ) { |
82 |
|
|
foreach my $block ( $blocks ) { |
83 |
|
|
next unless defined $block->{value}; |
84 |
|
|
foreach my $value ( $block->{value} ) { |
85 |
|
|
confess "expect array for value" unless ref($value) eq 'ARRAY'; |
86 |
|
|
foreach my $v ( @$value ) { |
87 |
|
|
next unless defined $v->{args}; |
88 |
|
|
#warn "## v ",ref($v),dump( $v ); |
89 |
|
|
my @data; |
90 |
|
|
foreach my $args ( $v->{args} ) { |
91 |
|
|
#warn "## args ",ref($args),dump( $args ); |
92 |
|
|
confess "expect array for args" unless ref($args) eq 'ARRAY'; |
93 |
|
|
foreach my $a ( @$args ) { |
94 |
|
|
if ( $a->{type} eq 'array' ) { |
95 |
|
|
#warn "## a ",ref($a),dump( $a ); |
96 |
|
|
foreach my $av ( @{ $a->{value} } ) { |
97 |
|
|
next unless $av->{type} eq 'string'; |
98 |
|
|
#warn "## av ",ref($av),dump( $av ); |
99 |
|
|
push @data, $av->{value}; |
100 |
|
|
} |
101 |
|
|
} elsif ( $a->{type} eq 'string' ) { |
102 |
|
|
push @data, $a->{value}; |
103 |
|
|
} |
104 |
|
|
} |
105 |
|
|
next unless @data; |
106 |
|
|
warn "data $#data = ",dump(@data); |
107 |
|
|
## FIXME data specific! |
108 |
|
|
if ( $#data == 4 ) { |
109 |
|
|
push @lines, [ @data ]; |
110 |
|
|
} elsif ( $#data == 0 && $#lines >= 0 ) { |
111 |
|
|
my $v = shift @data; |
112 |
|
|
warn "add $#lines to ",dump( $lines[ $#lines ]->[4] ); |
113 |
|
|
$lines[ $#lines ]->[4] = $lines[ $#lines ]->[4] . ' ' . $v; |
114 |
|
|
warn "added to ",dump( $lines[ $#lines ] ); |
115 |
|
|
} else { |
116 |
|
|
$log->warn("ignored: ",dump( @data )); |
117 |
|
|
} |
118 |
|
|
} |
119 |
|
|
} |
120 |
|
|
} |
121 |
|
|
} |
122 |
|
|
} |
123 |
|
|
} |
124 |
|
|
} |
125 |
|
|
|
126 |
dpavlin |
1057 |
$self->{_lines} = \@lines; |
127 |
dpavlin |
1054 |
|
128 |
|
|
$log->debug("loaded ", $self->size, " records", sub { dump( @lines ) }); |
129 |
|
|
|
130 |
|
|
$self ? return $self : return undef; |
131 |
|
|
} |
132 |
|
|
|
133 |
|
|
=head2 fetch_rec |
134 |
|
|
|
135 |
|
|
Return record with ID C<$mfn> from database |
136 |
|
|
|
137 |
|
|
my $rec = $input->fetch_rec( $mfn, $filter_coderef ); |
138 |
|
|
|
139 |
dpavlin |
1057 |
Records are returned as field C<A>, C<B> and so on... |
140 |
|
|
|
141 |
|
|
Last supported column is C<ZZ>. |
142 |
|
|
|
143 |
dpavlin |
1054 |
=cut |
144 |
|
|
|
145 |
|
|
sub fetch_rec { |
146 |
|
|
my $self = shift; |
147 |
|
|
|
148 |
|
|
my ( $mfn, $filter_coderef ) = @_; |
149 |
|
|
|
150 |
dpavlin |
1057 |
my $rec = { |
151 |
|
|
'000' => [ $mfn ], |
152 |
|
|
}; |
153 |
|
|
|
154 |
|
|
my $line = $self->{_lines}->[ $mfn - 1 ] || return; |
155 |
|
|
confess "expected ARRAY for _lines $mfn" unless ref($line) eq 'ARRAY'; |
156 |
|
|
|
157 |
|
|
# warn "## line = ",dump( $line ); |
158 |
|
|
|
159 |
|
|
my $col = 'A'; |
160 |
|
|
my $c = 0; |
161 |
|
|
foreach my $e ( @$line ) { |
162 |
|
|
$rec->{$col} = $e; |
163 |
|
|
$c++; |
164 |
|
|
# FIXME what about columns > ZZ |
165 |
|
|
if ( $col eq 'Z' ) { |
166 |
|
|
$col .= 'AA'; |
167 |
|
|
} elsif ( $col eq 'ZZ' ) { |
168 |
|
|
$self->_get_logger()->logwarn("ignoring colums above ZZ (original ", $#$line + 1, " > $c max columns)"); |
169 |
|
|
last; |
170 |
|
|
} elsif ( $col =~ m/([A-Z])Z$/ ) { |
171 |
|
|
$col .= $1++ . 'A'; |
172 |
|
|
} else { |
173 |
|
|
$col++; |
174 |
|
|
} |
175 |
|
|
} |
176 |
|
|
|
177 |
|
|
# warn "## rec = ",dump( $rec ); |
178 |
|
|
|
179 |
|
|
return $rec; |
180 |
dpavlin |
1054 |
} |
181 |
|
|
|
182 |
|
|
|
183 |
|
|
=head2 size |
184 |
|
|
|
185 |
|
|
Return number of records in database |
186 |
|
|
|
187 |
|
|
my $size = $input->size; |
188 |
|
|
|
189 |
|
|
=cut |
190 |
|
|
|
191 |
|
|
sub size { |
192 |
|
|
my $self = shift; |
193 |
dpavlin |
1057 |
return $#{$self->{_lines}} + 1; |
194 |
dpavlin |
1054 |
} |
195 |
|
|
|
196 |
|
|
|
197 |
|
|
=head1 AUTHOR |
198 |
|
|
|
199 |
|
|
Dobrica Pavlinusic, C<< <dpavlin@rot13.org> >> |
200 |
|
|
|
201 |
|
|
=head1 COPYRIGHT & LICENSE |
202 |
|
|
|
203 |
|
|
Copyright 2007 Dobrica Pavlinusic, All Rights Reserved. |
204 |
|
|
|
205 |
|
|
This program is free software; you can redistribute it and/or modify it |
206 |
|
|
under the same terms as Perl itself. |
207 |
|
|
|
208 |
|
|
=cut |
209 |
|
|
|
210 |
|
|
1; # End of WebPAC::Input::PDF |