/[webpac2]/trunk/lib/WebPAC/Input.pm
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Contents of /trunk/lib/WebPAC/Input.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 307 - (show annotations)
Tue Dec 20 00:03:04 2005 UTC (18 years, 4 months ago) by dpavlin
File size: 9250 byte(s)
moved clean into WebPAC::Output::Estraier, cleanup

1 package WebPAC::Input;
2
3 use warnings;
4 use strict;
5
6 use WebPAC::Common;
7 use base qw/WebPAC::Common/;
8 use Text::Iconv;
9
10 =head1 NAME
11
12 WebPAC::Input - read different file formats into WebPAC
13
14 =head1 VERSION
15
16 Version 0.03
17
18 =cut
19
20 our $VERSION = '0.03';
21
22 =head1 SYNOPSIS
23
24 This module implements input as database which have fixed and known
25 I<size> while indexing and single unique numeric identifier for database
26 position ranging from 1 to I<size>.
27
28 Simply, something that is indexed by unmber from 1 .. I<size>.
29
30 Examples of such databases are CDS/ISIS files, MARC files, lines in
31 text file, and so on.
32
33 Specific file formats are implemented using low-level interface modules,
34 located in C<WebPAC::Input::*> namespace which export C<open_db>,
35 C<fetch_rec> and optional C<init> functions.
36
37 Perhaps a little code snippet.
38
39 use WebPAC::Input;
40
41 my $db = WebPAC::Input->new(
42 module => 'WebPAC::Input::ISIS',
43 config => $config,
44 lookup => $lookup_obj,
45 low_mem => 1,
46 );
47
48 $db->open('/path/to/database');
49 print "database size: ",$db->size,"\n";
50 while (my $rec = $db->fetch) {
51 }
52
53
54
55 =head1 FUNCTIONS
56
57 =head2 new
58
59 Create new input database object.
60
61 my $db = new WebPAC::Input(
62 module => 'WebPAC::Input::MARC',
63 code_page => 'ISO-8859-2',
64 low_mem => 1,
65 );
66
67 C<module> is low-level file format module. See L<WebPAC::Input::Isis> and
68 L<WebPAC::Input::MARC>.
69
70 Optional parametar C<code_page> specify application code page (which will be
71 used internally). This should probably be your terminal encoding, and by
72 default, it C<ISO-8859-2>.
73
74 Default is not to use C<low_mem> options (see L<MEMORY USAGE> below).
75
76 This function will also call low-level C<init> if it exists with same
77 parametars.
78
79 =cut
80
81 sub new {
82 my $class = shift;
83 my $self = {@_};
84 bless($self, $class);
85
86 my $log = $self->_get_logger;
87
88 $log->logconfess("specify low-level file format module") unless ($self->{module});
89 my $module = $self->{module};
90 $module =~ s#::#/#g;
91 $module .= '.pm';
92 $log->debug("require low-level module $self->{module} from $module");
93
94 require $module;
95 #eval $self->{module} .'->import';
96
97 # check if required subclasses are implemented
98 foreach my $subclass (qw/open_db fetch_rec init/) {
99 my $n = $self->{module} . '::' . $subclass;
100 if (! defined &{ $n }) {
101 my $missing = "missing $subclass in $self->{module}";
102 $self->{$subclass} = sub { $log->logwarn($missing) };
103 } else {
104 $self->{$subclass} = \&{ $n };
105 }
106 }
107
108 if ($self->{init}) {
109 $log->debug("calling init");
110 $self->{init}->($self, @_);
111 }
112
113 $self->{'code_page'} ||= 'ISO-8859-2';
114
115 # running with low_mem flag? well, use DBM::Deep then.
116 if ($self->{'low_mem'}) {
117 $log->info("running with low_mem which impacts performance (<32 Mb memory usage)");
118
119 my $db_file = "data.db";
120
121 if (-e $db_file) {
122 unlink $db_file or $log->logdie("can't remove '$db_file' from last run");
123 $log->debug("removed '$db_file' from last run");
124 }
125
126 require DBM::Deep;
127
128 my $db = new DBM::Deep $db_file;
129
130 $log->logdie("DBM::Deep error: $!") unless ($db);
131
132 if ($db->error()) {
133 $log->logdie("can't open '$db_file' under low_mem: ",$db->error());
134 } else {
135 $log->debug("using file '$db_file' for DBM::Deep");
136 }
137
138 $self->{'db'} = $db;
139 }
140
141 $self ? return $self : return undef;
142 }
143
144 =head2 open
145
146 This function will read whole database in memory and produce lookups.
147
148 $input->open(
149 path => '/path/to/database/file',
150 code_page => '852',
151 limit => 500,
152 offset => 6000,
153 lookup => $lookup_obj,
154 );
155
156 By default, C<code_page> is assumed to be C<852>.
157
158 C<offset> is optional parametar to position at some offset before reading from database.
159
160 C<limit> is optional parametar to read just C<limit> records from database
161
162 Returns size of database, regardless of C<offset> and C<limit>
163 parametars, see also C<size>.
164
165 =cut
166
167 sub open {
168 my $self = shift;
169 my $arg = {@_};
170
171 my $log = $self->_get_logger();
172
173 $log->logcroak("need path") if (! $arg->{'path'});
174 my $code_page = $arg->{'code_page'} || '852';
175
176 # store data in object
177 $self->{'input_code_page'} = $code_page;
178 foreach my $v (qw/path offset limit/) {
179 $self->{$v} = $arg->{$v} if ($arg->{$v});
180 }
181
182 # create Text::Iconv object
183 $self->{iconv} = Text::Iconv->new($code_page,$self->{'code_page'});
184
185 my ($db, $size) = $self->{open_db}->( $self,
186 path => $arg->{path},
187 );
188
189 unless ($db) {
190 $log->logwarn("can't open database $arg->{path}, skipping...");
191 return;
192 }
193
194 unless ($size) {
195 $log->logwarn("no records in database $arg->{path}, skipping...");
196 return;
197 }
198
199 my $offset = 1;
200 my $limit = $size;
201
202 if (my $s = $self->{offset}) {
203 $log->info("skipping to MFN $s");
204 $offset = $s;
205 } else {
206 $self->{offset} = $offset;
207 }
208
209 if ($self->{limit}) {
210 $log->debug("limiting to ",$self->{limit}," records");
211 $limit = $offset + $self->{limit} - 1;
212 $limit = $size if ($limit > $size);
213 }
214
215 # store size for later
216 $self->{size} = ($limit - $offset) ? ($limit - $offset + 1) : 0;
217
218 $log->info("processing $self->{size} records in $code_page, convert to $self->{code_page}");
219
220 # read database
221 for (my $pos = $offset; $pos <= $limit; $pos++) {
222
223 $log->debug("position: $pos\n");
224
225 my $rec = $self->{fetch_rec}->($self, $db, $pos );
226
227 if (! $rec) {
228 $log->warn("record $pos empty? skipping...");
229 next;
230 }
231
232 # store
233 if ($self->{low_mem}) {
234 $self->{db}->put($pos, $rec);
235 } else {
236 $self->{data}->{$pos} = $rec;
237 }
238
239 # create lookup
240 $self->{'lookup'}->add( $rec ) if ($rec && $self->{'lookup'});
241
242 $self->progress_bar($pos,$limit);
243
244 }
245
246 $self->{pos} = -1;
247 $self->{last_pcnt} = 0;
248
249 # store max mfn and return it.
250 $self->{max_pos} = $limit;
251 $log->debug("max_pos: $limit");
252
253 return $size;
254 }
255
256 =head2 fetch
257
258 Fetch next record from database. It will also displays progress bar.
259
260 my $rec = $isis->fetch;
261
262 Record from this function should probably go to C<data_structure> for
263 normalisation.
264
265 =cut
266
267 sub fetch {
268 my $self = shift;
269
270 my $log = $self->_get_logger();
271
272 $log->logconfess("it seems that you didn't load database!") unless ($self->{pos});
273
274 if ($self->{pos} == -1) {
275 $self->{pos} = $self->{offset};
276 } else {
277 $self->{pos}++;
278 }
279
280 my $mfn = $self->{pos};
281
282 if ($mfn > $self->{max_pos}) {
283 $self->{pos} = $self->{max_pos};
284 $log->debug("at EOF");
285 return;
286 }
287
288 $self->progress_bar($mfn,$self->{max_pos});
289
290 my $rec;
291
292 if ($self->{low_mem}) {
293 $rec = $self->{db}->get($mfn);
294 } else {
295 $rec = $self->{data}->{$mfn};
296 }
297
298 $rec ||= 0E0;
299 }
300
301 =head2 pos
302
303 Returns current record number (MFN).
304
305 print $isis->pos;
306
307 First record in database has position 1.
308
309 =cut
310
311 sub pos {
312 my $self = shift;
313 return $self->{pos};
314 }
315
316
317 =head2 size
318
319 Returns number of records in database
320
321 print $isis->size;
322
323 Result from this function can be used to loop through all records
324
325 foreach my $mfn ( 1 ... $isis->size ) { ... }
326
327 because it takes into account C<offset> and C<limit>.
328
329 =cut
330
331 sub size {
332 my $self = shift;
333 return $self->{size};
334 }
335
336 =head2 seek
337
338 Seek to specified MFN in file.
339
340 $isis->seek(42);
341
342 First record in database has position 1.
343
344 =cut
345
346 sub seek {
347 my $self = shift;
348 my $pos = shift || return;
349
350 my $log = $self->_get_logger();
351
352 if ($pos < 1) {
353 $log->warn("seek before first record");
354 $pos = 1;
355 } elsif ($pos > $self->{max_pos}) {
356 $log->warn("seek beyond last record");
357 $pos = $self->{max_pos};
358 }
359
360 return $self->{pos} = (($pos - 1) || -1);
361 }
362
363
364 =head1 MEMORY USAGE
365
366 C<low_mem> options is double-edged sword. If enabled, WebPAC
367 will run on memory constraint machines (which doesn't have enough
368 physical RAM to create memory structure for whole source database).
369
370 If your machine has 512Mb or more of RAM and database is around 10000 records,
371 memory shouldn't be an issue. If you don't have enough physical RAM, you
372 might consider using virtual memory (if your operating system is handling it
373 well, like on FreeBSD or Linux) instead of dropping to L<DBM::Deep> to handle
374 parsed structure of ISIS database (this is what C<low_mem> option does).
375
376 Hitting swap at end of reading source database is probably o.k. However,
377 hitting swap before 90% will dramatically decrease performance and you will
378 be better off with C<low_mem> and using rest of availble memory for
379 operating system disk cache (Linux is particuallary good about this).
380 However, every access to database record will require disk access, so
381 generation phase will be slower 10-100 times.
382
383 Parsed structures are essential - you just have option to trade RAM memory
384 (which is fast) for disk space (which is slow). Be sure to have planty of
385 disk space if you are using C<low_mem> and thus L<DBM::Deep>.
386
387 However, when WebPAC is running on desktop machines (or laptops :-), it's
388 highly undesireable for system to start swapping. Using C<low_mem> option can
389 reduce WecPAC memory usage to around 64Mb for same database with lookup
390 fields and sorted indexes which stay in RAM. Performance will suffer, but
391 memory usage will really be minimal. It might be also more confortable to
392 run WebPAC reniced on those machines.
393
394
395 =head1 AUTHOR
396
397 Dobrica Pavlinusic, C<< <dpavlin@rot13.org> >>
398
399 =head1 COPYRIGHT & LICENSE
400
401 Copyright 2005 Dobrica Pavlinusic, All Rights Reserved.
402
403 This program is free software; you can redistribute it and/or modify it
404 under the same terms as Perl itself.
405
406 =cut
407
408 1; # End of WebPAC::Input

  ViewVC Help
Powered by ViewVC 1.1.26