/[webpac2]/trunk/lib/WebPAC/Input/Excel.pm
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Contents of /trunk/lib/WebPAC/Input/Excel.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1244 - (show annotations)
Mon Jul 20 22:00:43 2009 UTC (14 years, 10 months ago) by dpavlin
File size: 4643 byte(s)
don't fallback to input name, but use first sheet instead

1 package WebPAC::Input::Excel;
2
3 use warnings;
4 use strict;
5
6 use Spreadsheet::ParseExcel;
7 use Spreadsheet::ParseExcel::Utility qw/int2col/;
8 use base qw/WebPAC::Common/;
9 use Text::Unaccent::PurePerl qw/unac_string/;
10 use Data::Dump qw/dump/;
11
12 =head1 NAME
13
14 WebPAC::Input::Excel - support for Microsoft Excel and compatibile files
15
16 =cut
17
18 our $VERSION = '0.06';
19
20
21 =head1 SYNOPSIS
22
23 Open Microsoft Excell, or compatibile format (for e.g. from OpenOffice.org
24 or Gnuemeric) in C<.xls> format.
25
26 =head1 FUNCTIONS
27
28 =head2 new
29
30 Returns handle to database and size
31
32 my $excel = new WebPAC::Input::Excel(
33 path => '/path/to/workbook.xls'
34 worksheet => 'name of sheet',
35 from => 42,
36 to => 9999,
37 }
38
39 C<worksheet> is case and white-space insensitive name of worksheet in Excel
40 file to use. If not specified, name of input is used. If none of those
41 methods returned sheet, first worksheet in file is used instead.
42
43 C<from> and C<to> specify row numbers to start and finish import.
44
45 =cut
46
47 sub new {
48 my $class = shift;
49 my $self = {@_};
50 bless($self, $class);
51
52 my $log = $self->_get_logger();
53
54 $log->logdie("can't open excel file $self->{path}: $!") unless (-r $self->{path} && -f $self->{path});
55
56 my $workbook = Spreadsheet::ParseExcel::Workbook->Parse($self->{path});
57
58 my $sheet;
59 my $wanted_worksheet = $self->{worksheet} ; # || $self->{name};
60
61 if ($wanted_worksheet) {
62 my $name;
63 do {
64 $sheet = shift @{ $workbook->{Worksheet} };
65 $log->logdie("can't find sheet '$wanted_worksheet' in $self->{path}\n") unless (defined($sheet));
66 $name = $sheet->{Name};
67 $name =~ s/\s\s+/ /g;
68 } until ($name =~ m/^\s*\Q$wanted_worksheet\E\s*$/i);
69
70 }
71
72 $sheet ||= shift @{ $workbook->{Worksheet} };
73
74 $self->{sheet} = $sheet;
75
76 $self->{from} ||= $sheet->{MinRow};
77 $self->{to} ||= $sheet->{MaxRow};
78
79 my $size = $self->{to} - $self->{from};
80 $self->{size} = $size;
81
82 $log->warn("opening Excel file '$self->{path}', using ",
83 $wanted_worksheet ? '' : 'first ',
84 "worksheet: $sheet->{Name} [$size rows]"
85 );
86
87 $self ? return $self : return undef;
88 }
89
90 =head2 fetch_rec
91
92 Return record with ID C<$mfn> from database
93
94 my $rec = $self->fetch_rec( $mfn );
95
96 Columns are named C<A>, C<B> and so on...
97
98 =cut
99
100 sub fetch_rec {
101 my $self = shift;
102
103 my $mfn = shift;
104
105 my $log = $self->_get_logger();
106
107 my $sheet = $self->{sheet};
108 $log->logdie("can't find sheet hash") unless (defined($sheet));
109 $log->logdie("sheet hash isn't Spreadsheet::ParseExcel::Worksheet") unless ($sheet->isa('Spreadsheet::ParseExcel::Worksheet'));
110
111 my $rec;
112
113 my $row = $self->{from} + $mfn - 1;
114
115 $log->debug("fetch_rec( $mfn ) row: $row cols: ",$sheet->{MinCol}," - ",$sheet->{MaxCol});
116
117 foreach my $col ( $sheet->{MinCol} ... $sheet->{MaxCol} ) {
118 my $v = $sheet->{Cells}->[$row]->[$col]->{_Value}; ## XXX _Value = formatted | Val = unformated !
119 $rec->{ int2col($col) } = $v if defined $v;
120 }
121
122 # add mfn only to records with data
123 $rec->{'000'} = [ $mfn ] if ($rec);
124
125 return $rec;
126 }
127
128 =head2 size
129
130 Return number of records in database
131
132 my $size = $isis->size;
133
134 =cut
135
136 sub size {
137 my $self = shift;
138 return $self->{size};
139 }
140
141 our @labels;
142 our @names;
143
144 sub normalize {
145 my ($self,$mfn) = @_;
146
147 my $log = $self->_get_logger();
148
149 my $sheet = $self->{sheet};
150
151 my $ds;
152
153 if ( ! @labels ) {
154
155 my $labels;
156
157 foreach ( $sheet->{MinCol} ... $sheet->{MaxCol} ) {
158 my $label = $sheet->{Cells}->[0]->[$_]->{_Value};
159 last if length($label) == 0;
160 push @labels, $label;
161 }
162 @names = map {
163 my $t = unac_string($_);
164 $t =~ s{[^a-z0-9]+}{_}gi;
165 $t =~ s{_+$}{};
166 $t =~ s{^_+}{};
167 $t = lc($t);
168 $labels .= "$t\t$_\n";
169 $t;
170 } @labels;
171
172 $log->info("columns = ", dump( @names ), " labels = ", dump( @labels ) );
173
174 $ds = {
175 '_labels' => [ @labels ],
176 '_names' => [ @names ],
177 };
178
179 my $path = $self->{labels} || 'var/labels.txt';
180 {
181 warn $labels;
182 open(my $fh, '>:raw', $path) || die "$path: $!";
183 print $fh $labels;
184 close $fh;
185 }
186 $log->info("created labels $path ", -s $path, " bytes");
187 }
188
189
190 my $row = $self->{from} + $mfn - 1;
191
192 my $data;
193 foreach ( $sheet->{MinCol} ... $sheet->{MaxCol} ) {
194 my $name = $names[$_];
195 next unless $name;
196 my $v = $sheet->{Cells}->[$row]->[$_]->{_Value};
197 $data->{ $name } = $v;
198 $ds->{ $name } = { search => [ $v ] } if defined $v;
199 }
200
201 $ds->{'_rows'} = { $self->{sheet}->{Name} => [ $data ] };
202
203 return $ds;
204 }
205
206 =head1 AUTHOR
207
208 Dobrica Pavlinusic, C<< <dpavlin@rot13.org> >>
209
210 =head1 COPYRIGHT & LICENSE
211
212 Copyright 2005-2006 Dobrica Pavlinusic, All Rights Reserved.
213
214 This program is free software; you can redistribute it and/or modify it
215 under the same terms as Perl itself.
216
217 =cut
218
219 1; # End of WebPAC::Input::Excel

  ViewVC Help
Powered by ViewVC 1.1.26