/[webpac2]/trunk/lib/WebPAC/Input/PDF.pm
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Annotation of /trunk/lib/WebPAC/Input/PDF.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1057 - (hide annotations)
Tue Nov 20 10:08:02 2007 UTC (16 years, 6 months ago) by dpavlin
File size: 4337 byte(s)
 r1650@llin:  dpavlin | 2007-11-20 11:07:57 +0100
 final tweaks for WebPAC::Input::PDF, emit fields A .. ZZ

1 dpavlin 1054 package WebPAC::Input::PDF;
2    
3     use warnings;
4     use strict;
5    
6     use WebPAC::Input;
7     use base qw/WebPAC::Common/;
8    
9     use CAM::PDF;
10     use Carp qw/confess/;
11    
12     use Data::Dump qw/dump/;
13    
14     =head1 NAME
15    
16     WebPAC::Input::PDF - try to parse PDF tabular data
17    
18     =head1 SYNOPSIS
19    
20     Open PBF file in PDF export fromat
21    
22     my $input = new WebPAC::Input::PDF(
23     path => '/path/to/file.pdf',
24     );
25    
26     =head1 FUNCTIONS
27    
28     =head2 new
29    
30     Returns new low-level input API object
31    
32     my $input = new WebPAC::Input::PDF(
33     path => '/path/to/file.pdf'
34     filter => sub {
35     my ($l,$field_nr) = @_;
36     # do something with $l which is line of input file
37     return $l;
38     },
39     }
40    
41     Options:
42    
43     =over 4
44    
45     =item path
46    
47     path to PDF file
48    
49     =back
50    
51     =cut
52    
53     my $verbose = 1;
54    
55     sub new {
56     my $class = shift;
57     my $self = {@_};
58     bless($self, $class);
59    
60     my $arg = {@_};
61    
62     my $log = $self->_get_logger();
63    
64     my $file = $arg->{path} || $log->logide("need path");
65    
66     my $doc = CAM::PDF->new($file) || $log->logdie( $CAM::PDF::errstr );
67    
68     my $pages = $doc->numPages();
69    
70     $log->info("opend $file with $pages pages");
71    
72 dpavlin 1057 my @lines = ();
73 dpavlin 1054
74     foreach my $p ( 1 .. $pages ) {
75     my $tree = $doc->getPageContentTree($p);
76     if ($tree) {
77     my $out;
78    
79     confess "expect array for blocks" unless ref($tree->{blocks}) eq 'ARRAY';
80    
81     foreach my $blocks ( @{ $tree->{blocks} } ) {
82     foreach my $block ( $blocks ) {
83     next unless defined $block->{value};
84     foreach my $value ( $block->{value} ) {
85     confess "expect array for value" unless ref($value) eq 'ARRAY';
86     foreach my $v ( @$value ) {
87     next unless defined $v->{args};
88     #warn "## v ",ref($v),dump( $v );
89     my @data;
90     foreach my $args ( $v->{args} ) {
91     #warn "## args ",ref($args),dump( $args );
92     confess "expect array for args" unless ref($args) eq 'ARRAY';
93     foreach my $a ( @$args ) {
94     if ( $a->{type} eq 'array' ) {
95     #warn "## a ",ref($a),dump( $a );
96     foreach my $av ( @{ $a->{value} } ) {
97     next unless $av->{type} eq 'string';
98     #warn "## av ",ref($av),dump( $av );
99     push @data, $av->{value};
100     }
101     } elsif ( $a->{type} eq 'string' ) {
102     push @data, $a->{value};
103     }
104     }
105     next unless @data;
106     warn "data $#data = ",dump(@data);
107     ## FIXME data specific!
108     if ( $#data == 4 ) {
109     push @lines, [ @data ];
110     } elsif ( $#data == 0 && $#lines >= 0 ) {
111     my $v = shift @data;
112     warn "add $#lines to ",dump( $lines[ $#lines ]->[4] );
113     $lines[ $#lines ]->[4] = $lines[ $#lines ]->[4] . ' ' . $v;
114     warn "added to ",dump( $lines[ $#lines ] );
115     } else {
116     $log->warn("ignored: ",dump( @data ));
117     }
118     }
119     }
120     }
121     }
122     }
123     }
124     }
125    
126 dpavlin 1057 $self->{_lines} = \@lines;
127 dpavlin 1054
128     $log->debug("loaded ", $self->size, " records", sub { dump( @lines ) });
129    
130     $self ? return $self : return undef;
131     }
132    
133     =head2 fetch_rec
134    
135     Return record with ID C<$mfn> from database
136    
137     my $rec = $input->fetch_rec( $mfn, $filter_coderef );
138    
139 dpavlin 1057 Records are returned as field C<A>, C<B> and so on...
140    
141     Last supported column is C<ZZ>.
142    
143 dpavlin 1054 =cut
144    
145     sub fetch_rec {
146     my $self = shift;
147    
148     my ( $mfn, $filter_coderef ) = @_;
149    
150 dpavlin 1057 my $rec = {
151     '000' => [ $mfn ],
152     };
153    
154     my $line = $self->{_lines}->[ $mfn - 1 ] || return;
155     confess "expected ARRAY for _lines $mfn" unless ref($line) eq 'ARRAY';
156    
157     # warn "## line = ",dump( $line );
158    
159     my $col = 'A';
160     my $c = 0;
161     foreach my $e ( @$line ) {
162     $rec->{$col} = $e;
163     $c++;
164     # FIXME what about columns > ZZ
165     if ( $col eq 'Z' ) {
166     $col .= 'AA';
167     } elsif ( $col eq 'ZZ' ) {
168     $self->_get_logger()->logwarn("ignoring colums above ZZ (original ", $#$line + 1, " > $c max columns)");
169     last;
170     } elsif ( $col =~ m/([A-Z])Z$/ ) {
171     $col .= $1++ . 'A';
172     } else {
173     $col++;
174     }
175     }
176    
177     # warn "## rec = ",dump( $rec );
178    
179     return $rec;
180 dpavlin 1054 }
181    
182    
183     =head2 size
184    
185     Return number of records in database
186    
187     my $size = $input->size;
188    
189     =cut
190    
191     sub size {
192     my $self = shift;
193 dpavlin 1057 return $#{$self->{_lines}} + 1;
194 dpavlin 1054 }
195    
196     =head1 SEE ALSO
197    
198     L<http://isibasic.com/help/helpprn.html> is only sane source of document format which Google could find...
199    
200     =head1 AUTHOR
201    
202     Dobrica Pavlinusic, C<< <dpavlin@rot13.org> >>
203    
204     =head1 COPYRIGHT & LICENSE
205    
206     Copyright 2007 Dobrica Pavlinusic, All Rights Reserved.
207    
208     This program is free software; you can redistribute it and/or modify it
209     under the same terms as Perl itself.
210    
211     =cut
212    
213     1; # End of WebPAC::Input::PDF

  ViewVC Help
Powered by ViewVC 1.1.26