/[webpac2]/trunk/lib/WebPAC/Input/PDF.pm
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Contents of /trunk/lib/WebPAC/Input/PDF.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1063 - (show annotations)
Tue Nov 27 21:01:44 2007 UTC (16 years, 5 months ago) by dpavlin
File size: 4193 byte(s)
pod fixes

1 package WebPAC::Input::PDF;
2
3 use warnings;
4 use strict;
5
6 use WebPAC::Input;
7 use base qw/WebPAC::Common/;
8
9 use CAM::PDF;
10 use Carp qw/confess/;
11
12 use Data::Dump qw/dump/;
13
14 =head1 NAME
15
16 WebPAC::Input::PDF - try to parse PDF tabular data
17
18 =head1 SYNOPSIS
19
20 Open PDF file
21
22 my $input = new WebPAC::Input::PDF(
23 path => '/path/to/file.pdf',
24 );
25
26 =head1 FUNCTIONS
27
28 =head2 new
29
30 Returns new low-level input API object
31
32 my $input = new WebPAC::Input::PDF(
33 path => '/path/to/file.pdf'
34 filter => sub {
35 my ($l,$field_nr) = @_;
36 # do something with $l which is line of input file
37 return $l;
38 },
39 }
40
41 Options:
42
43 =over 4
44
45 =item path
46
47 path to PDF file
48
49 =back
50
51 =cut
52
53 my $verbose = 1;
54
55 sub new {
56 my $class = shift;
57 my $self = {@_};
58 bless($self, $class);
59
60 my $arg = {@_};
61
62 my $log = $self->_get_logger();
63
64 my $file = $arg->{path} || $log->logide("need path");
65
66 my $doc = CAM::PDF->new($file) || $log->logdie( $CAM::PDF::errstr );
67
68 my $pages = $doc->numPages();
69
70 $log->info("opend $file with $pages pages");
71
72 my @lines = ();
73
74 foreach my $p ( 1 .. $pages ) {
75 my $tree = $doc->getPageContentTree($p);
76 if ($tree) {
77 my $out;
78
79 confess "expect array for blocks" unless ref($tree->{blocks}) eq 'ARRAY';
80
81 foreach my $blocks ( @{ $tree->{blocks} } ) {
82 foreach my $block ( $blocks ) {
83 next unless defined $block->{value};
84 foreach my $value ( $block->{value} ) {
85 confess "expect array for value" unless ref($value) eq 'ARRAY';
86 foreach my $v ( @$value ) {
87 next unless defined $v->{args};
88 #warn "## v ",ref($v),dump( $v );
89 my @data;
90 foreach my $args ( $v->{args} ) {
91 #warn "## args ",ref($args),dump( $args );
92 confess "expect array for args" unless ref($args) eq 'ARRAY';
93 foreach my $a ( @$args ) {
94 if ( $a->{type} eq 'array' ) {
95 #warn "## a ",ref($a),dump( $a );
96 foreach my $av ( @{ $a->{value} } ) {
97 next unless $av->{type} eq 'string';
98 #warn "## av ",ref($av),dump( $av );
99 push @data, $av->{value};
100 }
101 } elsif ( $a->{type} eq 'string' ) {
102 push @data, $a->{value};
103 }
104 }
105 next unless @data;
106 warn "data $#data = ",dump(@data);
107 ## FIXME data specific!
108 if ( $#data == 4 ) {
109 push @lines, [ @data ];
110 } elsif ( $#data == 0 && $#lines >= 0 ) {
111 my $v = shift @data;
112 warn "add $#lines to ",dump( $lines[ $#lines ]->[4] );
113 $lines[ $#lines ]->[4] = $lines[ $#lines ]->[4] . ' ' . $v;
114 warn "added to ",dump( $lines[ $#lines ] );
115 } else {
116 $log->warn("ignored: ",dump( @data ));
117 }
118 }
119 }
120 }
121 }
122 }
123 }
124 }
125
126 $self->{_lines} = \@lines;
127
128 $log->debug("loaded ", $self->size, " records", sub { dump( @lines ) });
129
130 $self ? return $self : return undef;
131 }
132
133 =head2 fetch_rec
134
135 Return record with ID C<$mfn> from database
136
137 my $rec = $input->fetch_rec( $mfn, $filter_coderef );
138
139 Records are returned as field C<A>, C<B> and so on...
140
141 Last supported column is C<ZZ>.
142
143 =cut
144
145 sub fetch_rec {
146 my $self = shift;
147
148 my ( $mfn, $filter_coderef ) = @_;
149
150 my $rec = {
151 '000' => [ $mfn ],
152 };
153
154 my $line = $self->{_lines}->[ $mfn - 1 ] || return;
155 confess "expected ARRAY for _lines $mfn" unless ref($line) eq 'ARRAY';
156
157 # warn "## line = ",dump( $line );
158
159 my $col = 'A';
160 my $c = 0;
161 foreach my $e ( @$line ) {
162 $rec->{$col} = $e;
163 $c++;
164 # FIXME what about columns > ZZ
165 if ( $col eq 'Z' ) {
166 $col .= 'AA';
167 } elsif ( $col eq 'ZZ' ) {
168 $self->_get_logger()->logwarn("ignoring colums above ZZ (original ", $#$line + 1, " > $c max columns)");
169 last;
170 } elsif ( $col =~ m/([A-Z])Z$/ ) {
171 $col .= $1++ . 'A';
172 } else {
173 $col++;
174 }
175 }
176
177 # warn "## rec = ",dump( $rec );
178
179 return $rec;
180 }
181
182
183 =head2 size
184
185 Return number of records in database
186
187 my $size = $input->size;
188
189 =cut
190
191 sub size {
192 my $self = shift;
193 return $#{$self->{_lines}} + 1;
194 }
195
196
197 =head1 AUTHOR
198
199 Dobrica Pavlinusic, C<< <dpavlin@rot13.org> >>
200
201 =head1 COPYRIGHT & LICENSE
202
203 Copyright 2007 Dobrica Pavlinusic, All Rights Reserved.
204
205 This program is free software; you can redistribute it and/or modify it
206 under the same terms as Perl itself.
207
208 =cut
209
210 1; # End of WebPAC::Input::PDF

  ViewVC Help
Powered by ViewVC 1.1.26