/[webpac2]/trunk/lib/WebPAC/Input.pm
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Contents of /trunk/lib/WebPAC/Input.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 285 - (show annotations)
Sun Dec 18 21:06:39 2005 UTC (18 years, 4 months ago) by dpavlin
File size: 8633 byte(s)
 r11777@llin:  dpavlin | 2005-12-19 00:02:47 +0100
 refactor Input::ISIS::* [0.02]

1 package WebPAC::Input;
2
3 use warnings;
4 use strict;
5
6 use blib;
7
8 use WebPAC::Common;
9 use base qw/WebPAC::Common/;
10 use Text::Iconv;
11
12 =head1 NAME
13
14 WebPAC::Input - core module for input file format
15
16 =head1 VERSION
17
18 Version 0.02
19
20 =cut
21
22 our $VERSION = '0.02';
23
24 =head1 SYNOPSIS
25
26 This module is used as base class for all database specific modules
27 (basically, files which have one handle, fixed size while indexing and some
28 kind of numeric idefinirier which goes from 1 to filesize).
29
30 Perhaps a little code snippet.
31
32 use WebPAC::Input;
33
34 my $db = WebPAC::Input->new(
35 format => 'NULL',
36 config => $config,
37 lookup => $lookup_obj,
38 low_mem => 1,
39 );
40
41 $db->open('/path/to/database');
42 print "database size: ",$db->size,"\n";
43 while (my $row = $db->fetch) {
44 ...
45 }
46
47 =head1 FUNCTIONS
48
49 =head2 new
50
51 Create new input database object.
52
53 my $db = new WebPAC::Input(
54 format => 'NULL'
55 code_page => 'ISO-8859-2',
56 low_mem => 1,
57 );
58
59 Optional parametar C<code_page> specify application code page (which will be
60 used internally). This should probably be your terminal encoding, and by
61 default, it C<ISO-8859-2>.
62
63 Default is not to use C<low_mem> options (see L<MEMORY USAGE> below).
64
65 This function will also call low-level C<init> if it exists with same
66 parametars.
67
68 =cut
69
70 sub new {
71 my $class = shift;
72 my $self = {@_};
73 bless($self, $class);
74
75 my $log = $self->_get_logger;
76
77 # check if required subclasses are implemented
78 foreach my $subclass (qw/open_db fetch_rec/) {
79 $log->logdie("missing implementation of $subclass") unless ($self->SUPER::can($subclass));
80 }
81
82 if ($self->can('init')) {
83 $log->debug("calling init");
84 $self->init(@_);
85 }
86
87 $self->{'code_page'} ||= 'ISO-8859-2';
88
89 # running with low_mem flag? well, use DBM::Deep then.
90 if ($self->{'low_mem'}) {
91 $log->info("running with low_mem which impacts performance (<32 Mb memory usage)");
92
93 my $db_file = "data.db";
94
95 if (-e $db_file) {
96 unlink $db_file or $log->logdie("can't remove '$db_file' from last run");
97 $log->debug("removed '$db_file' from last run");
98 }
99
100 require DBM::Deep;
101
102 my $db = new DBM::Deep $db_file;
103
104 $log->logdie("DBM::Deep error: $!") unless ($db);
105
106 if ($db->error()) {
107 $log->logdie("can't open '$db_file' under low_mem: ",$db->error());
108 } else {
109 $log->debug("using file '$db_file' for DBM::Deep");
110 }
111
112 $self->{'db'} = $db;
113 }
114
115 $self ? return $self : return undef;
116 }
117
118 =head2 open
119
120 This function will read whole database in memory and produce lookups.
121
122 $isis->open(
123 path => '/path/to/database/file',
124 code_page => '852',
125 limit_mfn => 500,
126 start_mfn => 6000,
127 lookup => $lookup_obj,
128 );
129
130 By default, C<code_page> is assumed to be C<852>.
131
132 If optional parametar C<start_mfn> is set, this will be first MFN to read
133 from database (so you can skip beginning of your database if you need to).
134
135 If optional parametar C<limit_mfn> is set, it will read just 500 records
136 from database in example above.
137
138 Returns size of database, regardless of C<start_mfn> and C<limit_mfn>
139 parametars, see also C<$isis->size>.
140
141 =cut
142
143 sub open {
144 my $self = shift;
145 my $arg = {@_};
146
147 my $log = $self->_get_logger();
148
149 $log->logcroak("need path") if (! $arg->{'path'});
150 my $code_page = $arg->{'code_page'} || '852';
151
152 # store data in object
153 $self->{'code_page'} = $code_page;
154 foreach my $v (qw/path start_mfn limit_mfn/) {
155 $self->{$v} = $arg->{$v} if ($arg->{$v});
156 }
157
158 # create Text::Iconv object
159 $self->{iconv} = Text::Iconv->new($code_page,$self->{'code_page'});
160
161 my ($db, $size) = $self->open_db(
162 path => $arg->{path},
163 );
164
165 unless ($db) {
166 $log->logwarn("can't open database $arg->{path}, skipping...");
167 return;
168 }
169
170 unless ($size) {
171 $log->logwarn("no records in database $arg->{path}, skipping...");
172 return;
173 }
174
175 my $startmfn = 1;
176 my $maxmfn = $size;
177
178 if (my $s = $self->{start_mfn}) {
179 $log->info("skipping to MFN $s");
180 $startmfn = $s;
181 } else {
182 $self->{start_mfn} = $startmfn;
183 }
184
185 if ($self->{limit_mfn}) {
186 $log->info("limiting to ",$self->{limit_mfn}," records");
187 $maxmfn = $startmfn + $self->{limit_mfn} - 1;
188 $maxmfn = $size if ($maxmfn > $size);
189 }
190
191 # store size for later
192 $self->{size} = ($maxmfn - $startmfn) ? ($maxmfn - $startmfn + 1) : 0;
193
194 $log->info("processing $self->{size} records in $code_page, convert to $self->{code_page}");
195
196 # read database
197 for (my $mfn = $startmfn; $mfn <= $maxmfn; $mfn++) {
198
199 $log->debug("mfn: $mfn\n");
200
201 my $rec = $self->fetch_rec( $db, $mfn );
202
203 if (! $rec) {
204 $log->warn("record $mfn empty? skipping...");
205 next;
206 }
207
208 # store
209 if ($self->{'low_mem'}) {
210 $self->{'db'}->put($mfn, $rec);
211 } else {
212 $self->{'data'}->{$mfn} = $rec;
213 }
214
215 # create lookup
216 $self->{'lookup'}->add( $rec ) if ($rec && $self->{'lookup'});
217
218 $self->progress_bar($mfn,$maxmfn);
219
220 }
221
222 $self->{'current_mfn'} = -1;
223 $self->{'last_pcnt'} = 0;
224
225 $log->debug("max mfn: $maxmfn");
226
227 # store max mfn and return it.
228 $self->{'max_mfn'} = $maxmfn;
229
230 return $size;
231 }
232
233 =head2 fetch
234
235 Fetch next record from database. It will also displays progress bar.
236
237 my $rec = $isis->fetch;
238
239 Record from this function should probably go to C<data_structure> for
240 normalisation.
241
242 =cut
243
244 sub fetch {
245 my $self = shift;
246
247 my $log = $self->_get_logger();
248
249 $log->logconfess("it seems that you didn't load database!") unless ($self->{'current_mfn'});
250
251 if ($self->{'current_mfn'} == -1) {
252 $self->{'current_mfn'} = $self->{'start_mfn'};
253 } else {
254 $self->{'current_mfn'}++;
255 }
256
257 my $mfn = $self->{'current_mfn'};
258
259 if ($mfn > $self->{'max_mfn'}) {
260 $self->{'current_mfn'} = $self->{'max_mfn'};
261 $log->debug("at EOF");
262 return;
263 }
264
265 $self->progress_bar($mfn,$self->{'max_mfn'});
266
267 my $rec;
268
269 if ($self->{'low_mem'}) {
270 $rec = $self->{'db'}->get($mfn);
271 } else {
272 $rec = $self->{'data'}->{$mfn};
273 }
274
275 $rec ||= 0E0;
276 }
277
278 =head2 pos
279
280 Returns current record number (MFN).
281
282 print $isis->pos;
283
284 First record in database has position 1.
285
286 =cut
287
288 sub pos {
289 my $self = shift;
290 return $self->{'current_mfn'};
291 }
292
293
294 =head2 size
295
296 Returns number of records in database
297
298 print $isis->size;
299
300 Result from this function can be used to loop through all records
301
302 foreach my $mfn ( 1 ... $isis->size ) { ... }
303
304 because it takes into account C<start_mfn> and C<limit_mfn>.
305
306 =cut
307
308 sub size {
309 my $self = shift;
310 return $self->{'size'};
311 }
312
313 =head2 seek
314
315 Seek to specified MFN in file.
316
317 $isis->seek(42);
318
319 First record in database has position 1.
320
321 =cut
322
323 sub seek {
324 my $self = shift;
325 my $pos = shift || return;
326
327 my $log = $self->_get_logger();
328
329 if ($pos < 1) {
330 $log->warn("seek before first record");
331 $pos = 1;
332 } elsif ($pos > $self->{'max_mfn'}) {
333 $log->warn("seek beyond last record");
334 $pos = $self->{'max_mfn'};
335 }
336
337 return $self->{'current_mfn'} = (($pos - 1) || -1);
338 }
339
340
341 =head1 MEMORY USAGE
342
343 C<low_mem> options is double-edged sword. If enabled, WebPAC
344 will run on memory constraint machines (which doesn't have enough
345 physical RAM to create memory structure for whole source database).
346
347 If your machine has 512Mb or more of RAM and database is around 10000 records,
348 memory shouldn't be an issue. If you don't have enough physical RAM, you
349 might consider using virtual memory (if your operating system is handling it
350 well, like on FreeBSD or Linux) instead of dropping to L<DBM::Deep> to handle
351 parsed structure of ISIS database (this is what C<low_mem> option does).
352
353 Hitting swap at end of reading source database is probably o.k. However,
354 hitting swap before 90% will dramatically decrease performance and you will
355 be better off with C<low_mem> and using rest of availble memory for
356 operating system disk cache (Linux is particuallary good about this).
357 However, every access to database record will require disk access, so
358 generation phase will be slower 10-100 times.
359
360 Parsed structures are essential - you just have option to trade RAM memory
361 (which is fast) for disk space (which is slow). Be sure to have planty of
362 disk space if you are using C<low_mem> and thus L<DBM::Deep>.
363
364 However, when WebPAC is running on desktop machines (or laptops :-), it's
365 highly undesireable for system to start swapping. Using C<low_mem> option can
366 reduce WecPAC memory usage to around 64Mb for same database with lookup
367 fields and sorted indexes which stay in RAM. Performance will suffer, but
368 memory usage will really be minimal. It might be also more confortable to
369 run WebPAC reniced on those machines.
370
371
372 =head1 AUTHOR
373
374 Dobrica Pavlinusic, C<< <dpavlin@rot13.org> >>
375
376 =head1 COPYRIGHT & LICENSE
377
378 Copyright 2005 Dobrica Pavlinusic, All Rights Reserved.
379
380 This program is free software; you can redistribute it and/or modify it
381 under the same terms as Perl itself.
382
383 =cut
384
385 1; # End of WebPAC::Input

  ViewVC Help
Powered by ViewVC 1.1.26