/[webpac2]/trunk/lib/WebPAC/Input.pm
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Annotation of /trunk/lib/WebPAC/Input.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 496 - (hide annotations)
Sun May 14 19:45:26 2006 UTC (17 years, 11 months ago) by dpavlin
File size: 10216 byte(s)
 r651@llin:  dpavlin | 2006-05-14 21:47:08 +0200
 allow 0 as valid db handle

1 dpavlin 1 package WebPAC::Input;
2    
3     use warnings;
4     use strict;
5    
6 dpavlin 487 use WebPAC::Common;
7 dpavlin 285 use base qw/WebPAC::Common/;
8     use Text::Iconv;
9 dpavlin 308 use Data::Dumper;
10 dpavlin 285
11 dpavlin 1 =head1 NAME
12    
13 dpavlin 286 WebPAC::Input - read different file formats into WebPAC
14 dpavlin 1
15     =head1 VERSION
16    
17 dpavlin 416 Version 0.04
18 dpavlin 1
19     =cut
20    
21 dpavlin 416 our $VERSION = '0.04';
22 dpavlin 1
23     =head1 SYNOPSIS
24    
25 dpavlin 286 This module implements input as database which have fixed and known
26     I<size> while indexing and single unique numeric identifier for database
27     position ranging from 1 to I<size>.
28 dpavlin 1
29 dpavlin 286 Simply, something that is indexed by unmber from 1 .. I<size>.
30    
31     Examples of such databases are CDS/ISIS files, MARC files, lines in
32     text file, and so on.
33    
34     Specific file formats are implemented using low-level interface modules,
35     located in C<WebPAC::Input::*> namespace which export C<open_db>,
36     C<fetch_rec> and optional C<init> functions.
37    
38 dpavlin 1 Perhaps a little code snippet.
39    
40     use WebPAC::Input;
41    
42 dpavlin 3 my $db = WebPAC::Input->new(
43 dpavlin 286 module => 'WebPAC::Input::ISIS',
44     config => $config,
45     lookup => $lookup_obj,
46     low_mem => 1,
47 dpavlin 3 );
48 dpavlin 1
49 dpavlin 3 $db->open('/path/to/database');
50 dpavlin 416 print "database size: ",$db->size,"\n";
51     while (my $rec = $db->fetch) {
52     # do something with $rec
53     }
54 dpavlin 1
55 dpavlin 286
56    
57 dpavlin 1 =head1 FUNCTIONS
58    
59 dpavlin 3 =head2 new
60 dpavlin 1
61 dpavlin 3 Create new input database object.
62    
63 dpavlin 9 my $db = new WebPAC::Input(
64 dpavlin 286 module => 'WebPAC::Input::MARC',
65 dpavlin 9 code_page => 'ISO-8859-2',
66 dpavlin 10 low_mem => 1,
67 dpavlin 416 recode => 'char pairs',
68 dpavlin 483 no_progress_bar => 1,
69 dpavlin 9 );
70 dpavlin 3
71 dpavlin 286 C<module> is low-level file format module. See L<WebPAC::Input::Isis> and
72     L<WebPAC::Input::MARC>.
73    
74 dpavlin 9 Optional parametar C<code_page> specify application code page (which will be
75     used internally). This should probably be your terminal encoding, and by
76     default, it C<ISO-8859-2>.
77    
78 dpavlin 10 Default is not to use C<low_mem> options (see L<MEMORY USAGE> below).
79    
80 dpavlin 483 C<recode> is optional string constisting of character or words pairs that
81     should be replaced in input stream.
82    
83     C<no_progress_bar> disables progress bar output on C<STDOUT>
84    
85 dpavlin 285 This function will also call low-level C<init> if it exists with same
86     parametars.
87    
88 dpavlin 1 =cut
89    
90 dpavlin 3 sub new {
91 dpavlin 285 my $class = shift;
92     my $self = {@_};
93 dpavlin 3 bless($self, $class);
94    
95 dpavlin 285 my $log = $self->_get_logger;
96    
97 dpavlin 286 $log->logconfess("specify low-level file format module") unless ($self->{module});
98     my $module = $self->{module};
99     $module =~ s#::#/#g;
100     $module .= '.pm';
101     $log->debug("require low-level module $self->{module} from $module");
102 dpavlin 289
103 dpavlin 286 require $module;
104 dpavlin 289 #eval $self->{module} .'->import';
105 dpavlin 286
106 dpavlin 285 # check if required subclasses are implemented
107 dpavlin 289 foreach my $subclass (qw/open_db fetch_rec init/) {
108     my $n = $self->{module} . '::' . $subclass;
109     if (! defined &{ $n }) {
110 dpavlin 290 my $missing = "missing $subclass in $self->{module}";
111 dpavlin 301 $self->{$subclass} = sub { $log->logwarn($missing) };
112 dpavlin 286 } else {
113 dpavlin 289 $self->{$subclass} = \&{ $n };
114 dpavlin 286 }
115 dpavlin 285 }
116    
117 dpavlin 289 if ($self->{init}) {
118 dpavlin 285 $log->debug("calling init");
119 dpavlin 289 $self->{init}->($self, @_);
120 dpavlin 285 }
121    
122 dpavlin 9 $self->{'code_page'} ||= 'ISO-8859-2';
123    
124 dpavlin 10 # running with low_mem flag? well, use DBM::Deep then.
125     if ($self->{'low_mem'}) {
126     $log->info("running with low_mem which impacts performance (<32 Mb memory usage)");
127    
128     my $db_file = "data.db";
129    
130     if (-e $db_file) {
131     unlink $db_file or $log->logdie("can't remove '$db_file' from last run");
132     $log->debug("removed '$db_file' from last run");
133     }
134    
135     require DBM::Deep;
136    
137     my $db = new DBM::Deep $db_file;
138    
139     $log->logdie("DBM::Deep error: $!") unless ($db);
140    
141     if ($db->error()) {
142     $log->logdie("can't open '$db_file' under low_mem: ",$db->error());
143     } else {
144     $log->debug("using file '$db_file' for DBM::Deep");
145     }
146    
147     $self->{'db'} = $db;
148     }
149    
150 dpavlin 3 $self ? return $self : return undef;
151 dpavlin 1 }
152    
153 dpavlin 285 =head2 open
154    
155     This function will read whole database in memory and produce lookups.
156    
157 dpavlin 286 $input->open(
158 dpavlin 285 path => '/path/to/database/file',
159     code_page => '852',
160 dpavlin 286 limit => 500,
161     offset => 6000,
162 dpavlin 285 lookup => $lookup_obj,
163     );
164    
165     By default, C<code_page> is assumed to be C<852>.
166    
167 dpavlin 286 C<offset> is optional parametar to position at some offset before reading from database.
168 dpavlin 285
169 dpavlin 286 C<limit> is optional parametar to read just C<limit> records from database
170 dpavlin 285
171 dpavlin 286 Returns size of database, regardless of C<offset> and C<limit>
172     parametars, see also C<size>.
173 dpavlin 285
174     =cut
175    
176     sub open {
177     my $self = shift;
178     my $arg = {@_};
179    
180     my $log = $self->_get_logger();
181    
182     $log->logcroak("need path") if (! $arg->{'path'});
183     my $code_page = $arg->{'code_page'} || '852';
184    
185     # store data in object
186 dpavlin 292 $self->{'input_code_page'} = $code_page;
187 dpavlin 286 foreach my $v (qw/path offset limit/) {
188 dpavlin 285 $self->{$v} = $arg->{$v} if ($arg->{$v});
189     }
190    
191     # create Text::Iconv object
192     $self->{iconv} = Text::Iconv->new($code_page,$self->{'code_page'});
193    
194 dpavlin 416 my $filter_ref;
195    
196     if ($self->{recode}) {
197     my @r = split(/\s/, $self->{recode});
198     if ($#r % 2 != 1) {
199     $log->logwarn("recode needs even number of elements (some number of valid pairs)");
200     } else {
201     my $recode;
202     while (@r) {
203     my $from = shift @r;
204     my $to = shift @r;
205     $recode->{$from} = $to;
206     }
207    
208     my $regex = join '|' => keys %{ $recode };
209    
210     $log->debug("using recode regex: $regex");
211    
212     $filter_ref = sub {
213     my $t = shift;
214     $t =~ s/($regex)/$recode->{$1}/g;
215     return $t;
216     };
217    
218     }
219    
220     }
221    
222 dpavlin 289 my ($db, $size) = $self->{open_db}->( $self,
223 dpavlin 285 path => $arg->{path},
224 dpavlin 416 filter => $filter_ref,
225 dpavlin 285 );
226    
227 dpavlin 496 unless (defined($db)) {
228 dpavlin 285 $log->logwarn("can't open database $arg->{path}, skipping...");
229     return;
230     }
231    
232     unless ($size) {
233     $log->logwarn("no records in database $arg->{path}, skipping...");
234     return;
235     }
236    
237 dpavlin 339 my $from_rec = 1;
238     my $to_rec = $size;
239 dpavlin 285
240 dpavlin 286 if (my $s = $self->{offset}) {
241 dpavlin 285 $log->info("skipping to MFN $s");
242 dpavlin 339 $from_rec = $s;
243 dpavlin 285 } else {
244 dpavlin 339 $self->{offset} = $from_rec;
245 dpavlin 285 }
246    
247 dpavlin 286 if ($self->{limit}) {
248 dpavlin 301 $log->debug("limiting to ",$self->{limit}," records");
249 dpavlin 339 $to_rec = $from_rec + $self->{limit} - 1;
250     $to_rec = $size if ($to_rec > $size);
251 dpavlin 285 }
252    
253     # store size for later
254 dpavlin 339 $self->{size} = ($to_rec - $from_rec) ? ($to_rec - $from_rec + 1) : 0;
255 dpavlin 285
256 dpavlin 339 $log->info("processing $self->{size}/$size records [$from_rec-$to_rec] convert $code_page -> $self->{code_page}");
257 dpavlin 285
258     # read database
259 dpavlin 339 for (my $pos = $from_rec; $pos <= $to_rec; $pos++) {
260 dpavlin 285
261 dpavlin 286 $log->debug("position: $pos\n");
262 dpavlin 285
263 dpavlin 289 my $rec = $self->{fetch_rec}->($self, $db, $pos );
264 dpavlin 285
265 dpavlin 308 $log->debug(sub { Dumper($rec) });
266    
267 dpavlin 285 if (! $rec) {
268 dpavlin 286 $log->warn("record $pos empty? skipping...");
269 dpavlin 285 next;
270     }
271    
272     # store
273 dpavlin 286 if ($self->{low_mem}) {
274     $self->{db}->put($pos, $rec);
275 dpavlin 285 } else {
276 dpavlin 286 $self->{data}->{$pos} = $rec;
277 dpavlin 285 }
278    
279     # create lookup
280     $self->{'lookup'}->add( $rec ) if ($rec && $self->{'lookup'});
281    
282 dpavlin 483 $self->progress_bar($pos,$to_rec) unless ($self->{no_progress_bar});
283 dpavlin 285
284     }
285    
286 dpavlin 286 $self->{pos} = -1;
287     $self->{last_pcnt} = 0;
288 dpavlin 285
289     # store max mfn and return it.
290 dpavlin 339 $self->{max_pos} = $to_rec;
291     $log->debug("max_pos: $to_rec");
292 dpavlin 285
293     return $size;
294     }
295    
296     =head2 fetch
297    
298     Fetch next record from database. It will also displays progress bar.
299    
300     my $rec = $isis->fetch;
301    
302     Record from this function should probably go to C<data_structure> for
303     normalisation.
304    
305     =cut
306    
307     sub fetch {
308     my $self = shift;
309    
310     my $log = $self->_get_logger();
311    
312 dpavlin 286 $log->logconfess("it seems that you didn't load database!") unless ($self->{pos});
313 dpavlin 285
314 dpavlin 286 if ($self->{pos} == -1) {
315     $self->{pos} = $self->{offset};
316 dpavlin 285 } else {
317 dpavlin 286 $self->{pos}++;
318 dpavlin 285 }
319    
320 dpavlin 286 my $mfn = $self->{pos};
321 dpavlin 285
322 dpavlin 286 if ($mfn > $self->{max_pos}) {
323     $self->{pos} = $self->{max_pos};
324 dpavlin 285 $log->debug("at EOF");
325     return;
326     }
327    
328 dpavlin 483 $self->progress_bar($mfn,$self->{max_pos}) unless ($self->{no_progress_bar});
329 dpavlin 285
330     my $rec;
331    
332 dpavlin 286 if ($self->{low_mem}) {
333     $rec = $self->{db}->get($mfn);
334 dpavlin 285 } else {
335 dpavlin 286 $rec = $self->{data}->{$mfn};
336 dpavlin 285 }
337    
338     $rec ||= 0E0;
339     }
340    
341     =head2 pos
342    
343     Returns current record number (MFN).
344    
345     print $isis->pos;
346    
347     First record in database has position 1.
348    
349     =cut
350    
351     sub pos {
352     my $self = shift;
353 dpavlin 286 return $self->{pos};
354 dpavlin 285 }
355    
356    
357     =head2 size
358    
359     Returns number of records in database
360    
361     print $isis->size;
362    
363     Result from this function can be used to loop through all records
364    
365     foreach my $mfn ( 1 ... $isis->size ) { ... }
366    
367 dpavlin 286 because it takes into account C<offset> and C<limit>.
368 dpavlin 285
369     =cut
370    
371     sub size {
372     my $self = shift;
373 dpavlin 286 return $self->{size};
374 dpavlin 285 }
375    
376     =head2 seek
377    
378     Seek to specified MFN in file.
379    
380     $isis->seek(42);
381    
382     First record in database has position 1.
383    
384     =cut
385    
386     sub seek {
387     my $self = shift;
388     my $pos = shift || return;
389    
390     my $log = $self->_get_logger();
391    
392     if ($pos < 1) {
393     $log->warn("seek before first record");
394     $pos = 1;
395 dpavlin 286 } elsif ($pos > $self->{max_pos}) {
396 dpavlin 285 $log->warn("seek beyond last record");
397 dpavlin 286 $pos = $self->{max_pos};
398 dpavlin 285 }
399    
400 dpavlin 286 return $self->{pos} = (($pos - 1) || -1);
401 dpavlin 285 }
402    
403    
404 dpavlin 3 =head1 MEMORY USAGE
405 dpavlin 1
406 dpavlin 3 C<low_mem> options is double-edged sword. If enabled, WebPAC
407     will run on memory constraint machines (which doesn't have enough
408     physical RAM to create memory structure for whole source database).
409 dpavlin 1
410 dpavlin 3 If your machine has 512Mb or more of RAM and database is around 10000 records,
411     memory shouldn't be an issue. If you don't have enough physical RAM, you
412     might consider using virtual memory (if your operating system is handling it
413     well, like on FreeBSD or Linux) instead of dropping to L<DBM::Deep> to handle
414     parsed structure of ISIS database (this is what C<low_mem> option does).
415 dpavlin 1
416 dpavlin 3 Hitting swap at end of reading source database is probably o.k. However,
417     hitting swap before 90% will dramatically decrease performance and you will
418     be better off with C<low_mem> and using rest of availble memory for
419     operating system disk cache (Linux is particuallary good about this).
420     However, every access to database record will require disk access, so
421     generation phase will be slower 10-100 times.
422    
423     Parsed structures are essential - you just have option to trade RAM memory
424     (which is fast) for disk space (which is slow). Be sure to have planty of
425     disk space if you are using C<low_mem> and thus L<DBM::Deep>.
426    
427     However, when WebPAC is running on desktop machines (or laptops :-), it's
428     highly undesireable for system to start swapping. Using C<low_mem> option can
429     reduce WecPAC memory usage to around 64Mb for same database with lookup
430     fields and sorted indexes which stay in RAM. Performance will suffer, but
431     memory usage will really be minimal. It might be also more confortable to
432     run WebPAC reniced on those machines.
433    
434    
435     =head1 AUTHOR
436    
437     Dobrica Pavlinusic, C<< <dpavlin@rot13.org> >>
438    
439 dpavlin 1 =head1 COPYRIGHT & LICENSE
440    
441     Copyright 2005 Dobrica Pavlinusic, All Rights Reserved.
442    
443     This program is free software; you can redistribute it and/or modify it
444     under the same terms as Perl itself.
445    
446     =cut
447    
448     1; # End of WebPAC::Input

  ViewVC Help
Powered by ViewVC 1.1.26