/[webpac2]/trunk/lib/WebPAC/Input.pm
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Annotation of /trunk/lib/WebPAC/Input.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 285 - (hide annotations)
Sun Dec 18 21:06:39 2005 UTC (18 years, 4 months ago) by dpavlin
File size: 8633 byte(s)
 r11777@llin:  dpavlin | 2005-12-19 00:02:47 +0100
 refactor Input::ISIS::* [0.02]

1 dpavlin 1 package WebPAC::Input;
2    
3     use warnings;
4     use strict;
5    
6 dpavlin 285 use blib;
7    
8     use WebPAC::Common;
9     use base qw/WebPAC::Common/;
10     use Text::Iconv;
11    
12 dpavlin 1 =head1 NAME
13    
14 dpavlin 3 WebPAC::Input - core module for input file format
15 dpavlin 1
16     =head1 VERSION
17    
18 dpavlin 285 Version 0.02
19 dpavlin 1
20     =cut
21    
22 dpavlin 285 our $VERSION = '0.02';
23 dpavlin 1
24     =head1 SYNOPSIS
25    
26 dpavlin 285 This module is used as base class for all database specific modules
27     (basically, files which have one handle, fixed size while indexing and some
28     kind of numeric idefinirier which goes from 1 to filesize).
29 dpavlin 1
30     Perhaps a little code snippet.
31    
32     use WebPAC::Input;
33    
34 dpavlin 3 my $db = WebPAC::Input->new(
35     format => 'NULL',
36     config => $config,
37 dpavlin 6 lookup => $lookup_obj,
38 dpavlin 10 low_mem => 1,
39 dpavlin 3 );
40 dpavlin 1
41 dpavlin 3 $db->open('/path/to/database');
42     print "database size: ",$db->size,"\n";
43     while (my $row = $db->fetch) {
44     ...
45     }
46 dpavlin 1
47     =head1 FUNCTIONS
48    
49 dpavlin 3 =head2 new
50 dpavlin 1
51 dpavlin 3 Create new input database object.
52    
53 dpavlin 9 my $db = new WebPAC::Input(
54     format => 'NULL'
55     code_page => 'ISO-8859-2',
56 dpavlin 10 low_mem => 1,
57 dpavlin 9 );
58 dpavlin 3
59 dpavlin 9 Optional parametar C<code_page> specify application code page (which will be
60     used internally). This should probably be your terminal encoding, and by
61     default, it C<ISO-8859-2>.
62    
63 dpavlin 10 Default is not to use C<low_mem> options (see L<MEMORY USAGE> below).
64    
65 dpavlin 285 This function will also call low-level C<init> if it exists with same
66     parametars.
67    
68 dpavlin 1 =cut
69    
70 dpavlin 3 sub new {
71 dpavlin 285 my $class = shift;
72     my $self = {@_};
73 dpavlin 3 bless($self, $class);
74    
75 dpavlin 285 my $log = $self->_get_logger;
76    
77     # check if required subclasses are implemented
78     foreach my $subclass (qw/open_db fetch_rec/) {
79     $log->logdie("missing implementation of $subclass") unless ($self->SUPER::can($subclass));
80     }
81    
82     if ($self->can('init')) {
83     $log->debug("calling init");
84     $self->init(@_);
85     }
86    
87 dpavlin 9 $self->{'code_page'} ||= 'ISO-8859-2';
88    
89 dpavlin 10 # running with low_mem flag? well, use DBM::Deep then.
90     if ($self->{'low_mem'}) {
91     $log->info("running with low_mem which impacts performance (<32 Mb memory usage)");
92    
93     my $db_file = "data.db";
94    
95     if (-e $db_file) {
96     unlink $db_file or $log->logdie("can't remove '$db_file' from last run");
97     $log->debug("removed '$db_file' from last run");
98     }
99    
100     require DBM::Deep;
101    
102     my $db = new DBM::Deep $db_file;
103    
104     $log->logdie("DBM::Deep error: $!") unless ($db);
105    
106     if ($db->error()) {
107     $log->logdie("can't open '$db_file' under low_mem: ",$db->error());
108     } else {
109     $log->debug("using file '$db_file' for DBM::Deep");
110     }
111    
112     $self->{'db'} = $db;
113     }
114    
115 dpavlin 3 $self ? return $self : return undef;
116 dpavlin 1 }
117    
118 dpavlin 285 =head2 open
119    
120     This function will read whole database in memory and produce lookups.
121    
122     $isis->open(
123     path => '/path/to/database/file',
124     code_page => '852',
125     limit_mfn => 500,
126     start_mfn => 6000,
127     lookup => $lookup_obj,
128     );
129    
130     By default, C<code_page> is assumed to be C<852>.
131    
132     If optional parametar C<start_mfn> is set, this will be first MFN to read
133     from database (so you can skip beginning of your database if you need to).
134    
135     If optional parametar C<limit_mfn> is set, it will read just 500 records
136     from database in example above.
137    
138     Returns size of database, regardless of C<start_mfn> and C<limit_mfn>
139     parametars, see also C<$isis->size>.
140    
141     =cut
142    
143     sub open {
144     my $self = shift;
145     my $arg = {@_};
146    
147     my $log = $self->_get_logger();
148    
149     $log->logcroak("need path") if (! $arg->{'path'});
150     my $code_page = $arg->{'code_page'} || '852';
151    
152     # store data in object
153     $self->{'code_page'} = $code_page;
154     foreach my $v (qw/path start_mfn limit_mfn/) {
155     $self->{$v} = $arg->{$v} if ($arg->{$v});
156     }
157    
158     # create Text::Iconv object
159     $self->{iconv} = Text::Iconv->new($code_page,$self->{'code_page'});
160    
161     my ($db, $size) = $self->open_db(
162     path => $arg->{path},
163     );
164    
165     unless ($db) {
166     $log->logwarn("can't open database $arg->{path}, skipping...");
167     return;
168     }
169    
170     unless ($size) {
171     $log->logwarn("no records in database $arg->{path}, skipping...");
172     return;
173     }
174    
175     my $startmfn = 1;
176     my $maxmfn = $size;
177    
178     if (my $s = $self->{start_mfn}) {
179     $log->info("skipping to MFN $s");
180     $startmfn = $s;
181     } else {
182     $self->{start_mfn} = $startmfn;
183     }
184    
185     if ($self->{limit_mfn}) {
186     $log->info("limiting to ",$self->{limit_mfn}," records");
187     $maxmfn = $startmfn + $self->{limit_mfn} - 1;
188     $maxmfn = $size if ($maxmfn > $size);
189     }
190    
191     # store size for later
192     $self->{size} = ($maxmfn - $startmfn) ? ($maxmfn - $startmfn + 1) : 0;
193    
194     $log->info("processing $self->{size} records in $code_page, convert to $self->{code_page}");
195    
196     # read database
197     for (my $mfn = $startmfn; $mfn <= $maxmfn; $mfn++) {
198    
199     $log->debug("mfn: $mfn\n");
200    
201     my $rec = $self->fetch_rec( $db, $mfn );
202    
203     if (! $rec) {
204     $log->warn("record $mfn empty? skipping...");
205     next;
206     }
207    
208     # store
209     if ($self->{'low_mem'}) {
210     $self->{'db'}->put($mfn, $rec);
211     } else {
212     $self->{'data'}->{$mfn} = $rec;
213     }
214    
215     # create lookup
216     $self->{'lookup'}->add( $rec ) if ($rec && $self->{'lookup'});
217    
218     $self->progress_bar($mfn,$maxmfn);
219    
220     }
221    
222     $self->{'current_mfn'} = -1;
223     $self->{'last_pcnt'} = 0;
224    
225     $log->debug("max mfn: $maxmfn");
226    
227     # store max mfn and return it.
228     $self->{'max_mfn'} = $maxmfn;
229    
230     return $size;
231     }
232    
233     =head2 fetch
234    
235     Fetch next record from database. It will also displays progress bar.
236    
237     my $rec = $isis->fetch;
238    
239     Record from this function should probably go to C<data_structure> for
240     normalisation.
241    
242     =cut
243    
244     sub fetch {
245     my $self = shift;
246    
247     my $log = $self->_get_logger();
248    
249     $log->logconfess("it seems that you didn't load database!") unless ($self->{'current_mfn'});
250    
251     if ($self->{'current_mfn'} == -1) {
252     $self->{'current_mfn'} = $self->{'start_mfn'};
253     } else {
254     $self->{'current_mfn'}++;
255     }
256    
257     my $mfn = $self->{'current_mfn'};
258    
259     if ($mfn > $self->{'max_mfn'}) {
260     $self->{'current_mfn'} = $self->{'max_mfn'};
261     $log->debug("at EOF");
262     return;
263     }
264    
265     $self->progress_bar($mfn,$self->{'max_mfn'});
266    
267     my $rec;
268    
269     if ($self->{'low_mem'}) {
270     $rec = $self->{'db'}->get($mfn);
271     } else {
272     $rec = $self->{'data'}->{$mfn};
273     }
274    
275     $rec ||= 0E0;
276     }
277    
278     =head2 pos
279    
280     Returns current record number (MFN).
281    
282     print $isis->pos;
283    
284     First record in database has position 1.
285    
286     =cut
287    
288     sub pos {
289     my $self = shift;
290     return $self->{'current_mfn'};
291     }
292    
293    
294     =head2 size
295    
296     Returns number of records in database
297    
298     print $isis->size;
299    
300     Result from this function can be used to loop through all records
301    
302     foreach my $mfn ( 1 ... $isis->size ) { ... }
303    
304     because it takes into account C<start_mfn> and C<limit_mfn>.
305    
306     =cut
307    
308     sub size {
309     my $self = shift;
310     return $self->{'size'};
311     }
312    
313     =head2 seek
314    
315     Seek to specified MFN in file.
316    
317     $isis->seek(42);
318    
319     First record in database has position 1.
320    
321     =cut
322    
323     sub seek {
324     my $self = shift;
325     my $pos = shift || return;
326    
327     my $log = $self->_get_logger();
328    
329     if ($pos < 1) {
330     $log->warn("seek before first record");
331     $pos = 1;
332     } elsif ($pos > $self->{'max_mfn'}) {
333     $log->warn("seek beyond last record");
334     $pos = $self->{'max_mfn'};
335     }
336    
337     return $self->{'current_mfn'} = (($pos - 1) || -1);
338     }
339    
340    
341 dpavlin 3 =head1 MEMORY USAGE
342 dpavlin 1
343 dpavlin 3 C<low_mem> options is double-edged sword. If enabled, WebPAC
344     will run on memory constraint machines (which doesn't have enough
345     physical RAM to create memory structure for whole source database).
346 dpavlin 1
347 dpavlin 3 If your machine has 512Mb or more of RAM and database is around 10000 records,
348     memory shouldn't be an issue. If you don't have enough physical RAM, you
349     might consider using virtual memory (if your operating system is handling it
350     well, like on FreeBSD or Linux) instead of dropping to L<DBM::Deep> to handle
351     parsed structure of ISIS database (this is what C<low_mem> option does).
352 dpavlin 1
353 dpavlin 3 Hitting swap at end of reading source database is probably o.k. However,
354     hitting swap before 90% will dramatically decrease performance and you will
355     be better off with C<low_mem> and using rest of availble memory for
356     operating system disk cache (Linux is particuallary good about this).
357     However, every access to database record will require disk access, so
358     generation phase will be slower 10-100 times.
359    
360     Parsed structures are essential - you just have option to trade RAM memory
361     (which is fast) for disk space (which is slow). Be sure to have planty of
362     disk space if you are using C<low_mem> and thus L<DBM::Deep>.
363    
364     However, when WebPAC is running on desktop machines (or laptops :-), it's
365     highly undesireable for system to start swapping. Using C<low_mem> option can
366     reduce WecPAC memory usage to around 64Mb for same database with lookup
367     fields and sorted indexes which stay in RAM. Performance will suffer, but
368     memory usage will really be minimal. It might be also more confortable to
369     run WebPAC reniced on those machines.
370    
371    
372     =head1 AUTHOR
373    
374     Dobrica Pavlinusic, C<< <dpavlin@rot13.org> >>
375    
376 dpavlin 1 =head1 COPYRIGHT & LICENSE
377    
378     Copyright 2005 Dobrica Pavlinusic, All Rights Reserved.
379    
380     This program is free software; you can redistribute it and/or modify it
381     under the same terms as Perl itself.
382    
383     =cut
384    
385     1; # End of WebPAC::Input

  ViewVC Help
Powered by ViewVC 1.1.26