/[webpac2]/trunk/lib/WebPAC/Input/PDF.pm
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Contents of /trunk/lib/WebPAC/Input/PDF.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1054 - (show annotations)
Tue Nov 20 09:30:56 2007 UTC (16 years, 5 months ago) by dpavlin
File size: 3628 byte(s)
 r1645@llin:  dpavlin | 2007-11-19 23:05:23 +0100
 added experimenal (still not working) WebPAC::Input::PDF

1 package WebPAC::Input::PDF;
2
3 use warnings;
4 use strict;
5
6 use WebPAC::Input;
7 use base qw/WebPAC::Common/;
8
9 use CAM::PDF;
10 use Carp qw/confess/;
11
12 use Data::Dump qw/dump/;
13
14 =head1 NAME
15
16 WebPAC::Input::PDF - try to parse PDF tabular data
17
18 =head1 SYNOPSIS
19
20 Open PBF file in PDF export fromat
21
22 my $input = new WebPAC::Input::PDF(
23 path => '/path/to/file.pdf',
24 );
25
26 =head1 FUNCTIONS
27
28 =head2 new
29
30 Returns new low-level input API object
31
32 my $input = new WebPAC::Input::PDF(
33 path => '/path/to/file.pdf'
34 filter => sub {
35 my ($l,$field_nr) = @_;
36 # do something with $l which is line of input file
37 return $l;
38 },
39 }
40
41 Options:
42
43 =over 4
44
45 =item path
46
47 path to PDF file
48
49 =back
50
51 =cut
52
53 my $verbose = 1;
54
55 sub new {
56 my $class = shift;
57 my $self = {@_};
58 bless($self, $class);
59
60 my $arg = {@_};
61
62 my $log = $self->_get_logger();
63
64 my $file = $arg->{path} || $log->logide("need path");
65
66 my $doc = CAM::PDF->new($file) || $log->logdie( $CAM::PDF::errstr );
67
68 my $pages = $doc->numPages();
69
70 $log->info("opend $file with $pages pages");
71
72 my @lines;
73
74 foreach my $p ( 1 .. $pages ) {
75 my $tree = $doc->getPageContentTree($p);
76 if ($tree) {
77 my $out;
78
79 confess "expect array for blocks" unless ref($tree->{blocks}) eq 'ARRAY';
80
81 foreach my $blocks ( @{ $tree->{blocks} } ) {
82 foreach my $block ( $blocks ) {
83 next unless defined $block->{value};
84 foreach my $value ( $block->{value} ) {
85 confess "expect array for value" unless ref($value) eq 'ARRAY';
86 foreach my $v ( @$value ) {
87 next unless defined $v->{args};
88 #warn "## v ",ref($v),dump( $v );
89 my @data;
90 foreach my $args ( $v->{args} ) {
91 #warn "## args ",ref($args),dump( $args );
92 confess "expect array for args" unless ref($args) eq 'ARRAY';
93 foreach my $a ( @$args ) {
94 if ( $a->{type} eq 'array' ) {
95 #warn "## a ",ref($a),dump( $a );
96 foreach my $av ( @{ $a->{value} } ) {
97 next unless $av->{type} eq 'string';
98 #warn "## av ",ref($av),dump( $av );
99 push @data, $av->{value};
100 }
101 } elsif ( $a->{type} eq 'string' ) {
102 push @data, $a->{value};
103 }
104 }
105 next unless @data;
106 warn "data $#data = ",dump(@data);
107 ## FIXME data specific!
108 if ( $#data == 4 ) {
109 push @lines, [ @data ];
110 } elsif ( $#data == 0 && $#lines >= 0 ) {
111 my $v = shift @data;
112 warn "add $#lines to ",dump( $lines[ $#lines ]->[4] );
113 $lines[ $#lines ]->[4] = $lines[ $#lines ]->[4] . ' ' . $v;
114 warn "added to ",dump( $lines[ $#lines ] );
115 } else {
116 $log->warn("ignored: ",dump( @data ));
117 }
118 }
119 }
120 }
121 }
122 }
123 }
124 }
125
126 $self->size( $#lines );
127
128 $log->debug("loaded ", $self->size, " records", sub { dump( @lines ) });
129
130 $self ? return $self : return undef;
131 }
132
133 =head2 fetch_rec
134
135 Return record with ID C<$mfn> from database
136
137 my $rec = $input->fetch_rec( $mfn, $filter_coderef );
138
139 =cut
140
141 sub fetch_rec {
142 my $self = shift;
143
144 my ( $mfn, $filter_coderef ) = @_;
145
146 return $self->{_rec}->[$mfn-1];
147 }
148
149
150 =head2 size
151
152 Return number of records in database
153
154 my $size = $input->size;
155
156 =cut
157
158 sub size {
159 my $self = shift;
160 return $#{$self->{_rec}} + 1;
161 }
162
163 =head1 SEE ALSO
164
165 L<http://isibasic.com/help/helpprn.html> is only sane source of document format which Google could find...
166
167 =head1 AUTHOR
168
169 Dobrica Pavlinusic, C<< <dpavlin@rot13.org> >>
170
171 =head1 COPYRIGHT & LICENSE
172
173 Copyright 2007 Dobrica Pavlinusic, All Rights Reserved.
174
175 This program is free software; you can redistribute it and/or modify it
176 under the same terms as Perl itself.
177
178 =cut
179
180 1; # End of WebPAC::Input::PDF

  ViewVC Help
Powered by ViewVC 1.1.26