1 |
package WebPAC::Input::PDF; |
2 |
|
3 |
use warnings; |
4 |
use strict; |
5 |
|
6 |
use WebPAC::Input; |
7 |
use base qw/WebPAC::Common/; |
8 |
|
9 |
use CAM::PDF; |
10 |
use Carp qw/confess/; |
11 |
|
12 |
use Data::Dump qw/dump/; |
13 |
|
14 |
=head1 NAME |
15 |
|
16 |
WebPAC::Input::PDF - try to parse PDF tabular data |
17 |
|
18 |
=head1 SYNOPSIS |
19 |
|
20 |
Open PBF file in PDF export fromat |
21 |
|
22 |
my $input = new WebPAC::Input::PDF( |
23 |
path => '/path/to/file.pdf', |
24 |
); |
25 |
|
26 |
=head1 FUNCTIONS |
27 |
|
28 |
=head2 new |
29 |
|
30 |
Returns new low-level input API object |
31 |
|
32 |
my $input = new WebPAC::Input::PDF( |
33 |
path => '/path/to/file.pdf' |
34 |
filter => sub { |
35 |
my ($l,$field_nr) = @_; |
36 |
# do something with $l which is line of input file |
37 |
return $l; |
38 |
}, |
39 |
} |
40 |
|
41 |
Options: |
42 |
|
43 |
=over 4 |
44 |
|
45 |
=item path |
46 |
|
47 |
path to PDF file |
48 |
|
49 |
=back |
50 |
|
51 |
=cut |
52 |
|
53 |
my $verbose = 1; |
54 |
|
55 |
sub new { |
56 |
my $class = shift; |
57 |
my $self = {@_}; |
58 |
bless($self, $class); |
59 |
|
60 |
my $arg = {@_}; |
61 |
|
62 |
my $log = $self->_get_logger(); |
63 |
|
64 |
my $file = $arg->{path} || $log->logide("need path"); |
65 |
|
66 |
my $doc = CAM::PDF->new($file) || $log->logdie( $CAM::PDF::errstr ); |
67 |
|
68 |
my $pages = $doc->numPages(); |
69 |
|
70 |
$log->info("opend $file with $pages pages"); |
71 |
|
72 |
my @lines; |
73 |
|
74 |
foreach my $p ( 1 .. $pages ) { |
75 |
my $tree = $doc->getPageContentTree($p); |
76 |
if ($tree) { |
77 |
my $out; |
78 |
|
79 |
confess "expect array for blocks" unless ref($tree->{blocks}) eq 'ARRAY'; |
80 |
|
81 |
foreach my $blocks ( @{ $tree->{blocks} } ) { |
82 |
foreach my $block ( $blocks ) { |
83 |
next unless defined $block->{value}; |
84 |
foreach my $value ( $block->{value} ) { |
85 |
confess "expect array for value" unless ref($value) eq 'ARRAY'; |
86 |
foreach my $v ( @$value ) { |
87 |
next unless defined $v->{args}; |
88 |
#warn "## v ",ref($v),dump( $v ); |
89 |
my @data; |
90 |
foreach my $args ( $v->{args} ) { |
91 |
#warn "## args ",ref($args),dump( $args ); |
92 |
confess "expect array for args" unless ref($args) eq 'ARRAY'; |
93 |
foreach my $a ( @$args ) { |
94 |
if ( $a->{type} eq 'array' ) { |
95 |
#warn "## a ",ref($a),dump( $a ); |
96 |
foreach my $av ( @{ $a->{value} } ) { |
97 |
next unless $av->{type} eq 'string'; |
98 |
#warn "## av ",ref($av),dump( $av ); |
99 |
push @data, $av->{value}; |
100 |
} |
101 |
} elsif ( $a->{type} eq 'string' ) { |
102 |
push @data, $a->{value}; |
103 |
} |
104 |
} |
105 |
next unless @data; |
106 |
warn "data $#data = ",dump(@data); |
107 |
## FIXME data specific! |
108 |
if ( $#data == 4 ) { |
109 |
push @lines, [ @data ]; |
110 |
} elsif ( $#data == 0 && $#lines >= 0 ) { |
111 |
my $v = shift @data; |
112 |
warn "add $#lines to ",dump( $lines[ $#lines ]->[4] ); |
113 |
$lines[ $#lines ]->[4] = $lines[ $#lines ]->[4] . ' ' . $v; |
114 |
warn "added to ",dump( $lines[ $#lines ] ); |
115 |
} else { |
116 |
$log->warn("ignored: ",dump( @data )); |
117 |
} |
118 |
} |
119 |
} |
120 |
} |
121 |
} |
122 |
} |
123 |
} |
124 |
} |
125 |
|
126 |
$self->size( $#lines ); |
127 |
|
128 |
$log->debug("loaded ", $self->size, " records", sub { dump( @lines ) }); |
129 |
|
130 |
$self ? return $self : return undef; |
131 |
} |
132 |
|
133 |
=head2 fetch_rec |
134 |
|
135 |
Return record with ID C<$mfn> from database |
136 |
|
137 |
my $rec = $input->fetch_rec( $mfn, $filter_coderef ); |
138 |
|
139 |
=cut |
140 |
|
141 |
sub fetch_rec { |
142 |
my $self = shift; |
143 |
|
144 |
my ( $mfn, $filter_coderef ) = @_; |
145 |
|
146 |
return $self->{_rec}->[$mfn-1]; |
147 |
} |
148 |
|
149 |
|
150 |
=head2 size |
151 |
|
152 |
Return number of records in database |
153 |
|
154 |
my $size = $input->size; |
155 |
|
156 |
=cut |
157 |
|
158 |
sub size { |
159 |
my $self = shift; |
160 |
return $#{$self->{_rec}} + 1; |
161 |
} |
162 |
|
163 |
=head1 SEE ALSO |
164 |
|
165 |
L<http://isibasic.com/help/helpprn.html> is only sane source of document format which Google could find... |
166 |
|
167 |
=head1 AUTHOR |
168 |
|
169 |
Dobrica Pavlinusic, C<< <dpavlin@rot13.org> >> |
170 |
|
171 |
=head1 COPYRIGHT & LICENSE |
172 |
|
173 |
Copyright 2007 Dobrica Pavlinusic, All Rights Reserved. |
174 |
|
175 |
This program is free software; you can redistribute it and/or modify it |
176 |
under the same terms as Perl itself. |
177 |
|
178 |
=cut |
179 |
|
180 |
1; # End of WebPAC::Input::PDF |