1 |
package WebPAC::Input::PDF; |
2 |
|
3 |
use warnings; |
4 |
use strict; |
5 |
|
6 |
use WebPAC::Input; |
7 |
use base qw/WebPAC::Common/; |
8 |
|
9 |
use CAM::PDF; |
10 |
use Carp qw/confess/; |
11 |
|
12 |
use Data::Dump qw/dump/; |
13 |
|
14 |
=head1 NAME |
15 |
|
16 |
WebPAC::Input::PDF - try to parse PDF tabular data |
17 |
|
18 |
=head1 SYNOPSIS |
19 |
|
20 |
Open PDF file |
21 |
|
22 |
my $input = new WebPAC::Input::PDF( |
23 |
path => '/path/to/file.pdf', |
24 |
); |
25 |
|
26 |
=head1 FUNCTIONS |
27 |
|
28 |
=head2 new |
29 |
|
30 |
Returns new low-level input API object |
31 |
|
32 |
my $input = new WebPAC::Input::PDF( |
33 |
path => '/path/to/file.pdf' |
34 |
filter => sub { |
35 |
my ($l,$field_nr) = @_; |
36 |
# do something with $l which is line of input file |
37 |
return $l; |
38 |
}, |
39 |
} |
40 |
|
41 |
Options: |
42 |
|
43 |
=over 4 |
44 |
|
45 |
=item path |
46 |
|
47 |
path to PDF file |
48 |
|
49 |
=back |
50 |
|
51 |
=cut |
52 |
|
53 |
my $verbose = 1; |
54 |
|
55 |
sub new { |
56 |
my $class = shift; |
57 |
my $self = {@_}; |
58 |
bless($self, $class); |
59 |
|
60 |
my $arg = {@_}; |
61 |
|
62 |
my $log = $self->_get_logger(); |
63 |
|
64 |
my $file = $arg->{path} || $log->logide("need path"); |
65 |
|
66 |
my $doc = CAM::PDF->new($file) || $log->logdie( $CAM::PDF::errstr ); |
67 |
|
68 |
my $pages = $doc->numPages(); |
69 |
|
70 |
$log->info("opend $file with $pages pages"); |
71 |
|
72 |
my @lines = (); |
73 |
|
74 |
foreach my $p ( 1 .. $pages ) { |
75 |
my $tree = $doc->getPageContentTree($p); |
76 |
if ($tree) { |
77 |
my $out; |
78 |
|
79 |
confess "expect array for blocks" unless ref($tree->{blocks}) eq 'ARRAY'; |
80 |
|
81 |
foreach my $blocks ( @{ $tree->{blocks} } ) { |
82 |
foreach my $block ( $blocks ) { |
83 |
next unless defined $block->{value}; |
84 |
foreach my $value ( $block->{value} ) { |
85 |
confess "expect array for value" unless ref($value) eq 'ARRAY'; |
86 |
foreach my $v ( @$value ) { |
87 |
next unless defined $v->{args}; |
88 |
#warn "## v ",ref($v),dump( $v ); |
89 |
my @data; |
90 |
foreach my $args ( $v->{args} ) { |
91 |
#warn "## args ",ref($args),dump( $args ); |
92 |
confess "expect array for args" unless ref($args) eq 'ARRAY'; |
93 |
foreach my $a ( @$args ) { |
94 |
if ( $a->{type} eq 'array' ) { |
95 |
#warn "## a ",ref($a),dump( $a ); |
96 |
foreach my $av ( @{ $a->{value} } ) { |
97 |
next unless $av->{type} eq 'string'; |
98 |
#warn "## av ",ref($av),dump( $av ); |
99 |
push @data, $av->{value}; |
100 |
} |
101 |
} elsif ( $a->{type} eq 'string' ) { |
102 |
push @data, $a->{value}; |
103 |
} |
104 |
} |
105 |
next unless @data; |
106 |
warn "data $#data = ",dump(@data); |
107 |
## FIXME data specific! |
108 |
if ( $#data == 4 ) { |
109 |
push @lines, [ @data ]; |
110 |
} elsif ( $#data == 0 && $#lines >= 0 ) { |
111 |
my $v = shift @data; |
112 |
warn "add $#lines to ",dump( $lines[ $#lines ]->[4] ); |
113 |
$lines[ $#lines ]->[4] = $lines[ $#lines ]->[4] . ' ' . $v; |
114 |
warn "added to ",dump( $lines[ $#lines ] ); |
115 |
} else { |
116 |
$log->warn("ignored: ",dump( @data )); |
117 |
} |
118 |
} |
119 |
} |
120 |
} |
121 |
} |
122 |
} |
123 |
} |
124 |
} |
125 |
|
126 |
$self->{_lines} = \@lines; |
127 |
|
128 |
$log->debug("loaded ", $self->size, " records", sub { dump( @lines ) }); |
129 |
|
130 |
$self ? return $self : return undef; |
131 |
} |
132 |
|
133 |
=head2 fetch_rec |
134 |
|
135 |
Return record with ID C<$mfn> from database |
136 |
|
137 |
my $rec = $input->fetch_rec( $mfn, $filter_coderef ); |
138 |
|
139 |
Records are returned as field C<A>, C<B> and so on... |
140 |
|
141 |
Last supported column is C<ZZ>. |
142 |
|
143 |
=cut |
144 |
|
145 |
sub fetch_rec { |
146 |
my $self = shift; |
147 |
|
148 |
my ( $mfn, $filter_coderef ) = @_; |
149 |
|
150 |
my $rec = { |
151 |
'000' => [ $mfn ], |
152 |
}; |
153 |
|
154 |
my $line = $self->{_lines}->[ $mfn - 1 ] || return; |
155 |
confess "expected ARRAY for _lines $mfn" unless ref($line) eq 'ARRAY'; |
156 |
|
157 |
# warn "## line = ",dump( $line ); |
158 |
|
159 |
my $col = 'A'; |
160 |
my $c = 0; |
161 |
foreach my $e ( @$line ) { |
162 |
$rec->{$col} = $e; |
163 |
$c++; |
164 |
# FIXME what about columns > ZZ |
165 |
if ( $col eq 'Z' ) { |
166 |
$col .= 'AA'; |
167 |
} elsif ( $col eq 'ZZ' ) { |
168 |
$self->_get_logger()->logwarn("ignoring colums above ZZ (original ", $#$line + 1, " > $c max columns)"); |
169 |
last; |
170 |
} elsif ( $col =~ m/([A-Z])Z$/ ) { |
171 |
$col .= $1++ . 'A'; |
172 |
} else { |
173 |
$col++; |
174 |
} |
175 |
} |
176 |
|
177 |
# warn "## rec = ",dump( $rec ); |
178 |
|
179 |
return $rec; |
180 |
} |
181 |
|
182 |
|
183 |
=head2 size |
184 |
|
185 |
Return number of records in database |
186 |
|
187 |
my $size = $input->size; |
188 |
|
189 |
=cut |
190 |
|
191 |
sub size { |
192 |
my $self = shift; |
193 |
return $#{$self->{_lines}} + 1; |
194 |
} |
195 |
|
196 |
|
197 |
=head1 AUTHOR |
198 |
|
199 |
Dobrica Pavlinusic, C<< <dpavlin@rot13.org> >> |
200 |
|
201 |
=head1 COPYRIGHT & LICENSE |
202 |
|
203 |
Copyright 2007 Dobrica Pavlinusic, All Rights Reserved. |
204 |
|
205 |
This program is free software; you can redistribute it and/or modify it |
206 |
under the same terms as Perl itself. |
207 |
|
208 |
=cut |
209 |
|
210 |
1; # End of WebPAC::Input::PDF |