/[webpac2]/trunk/lib/WebPAC/Input.pm
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Diff of /trunk/lib/WebPAC/Input.pm

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 1 by dpavlin, Sat Jun 25 20:23:23 2005 UTC revision 285 by dpavlin, Sun Dec 18 21:06:39 2005 UTC
# Line 3  package WebPAC::Input; Line 3  package WebPAC::Input;
3  use warnings;  use warnings;
4  use strict;  use strict;
5    
6    use blib;
7    
8    use WebPAC::Common;
9    use base qw/WebPAC::Common/;
10    use Text::Iconv;
11    
12  =head1 NAME  =head1 NAME
13    
14  WebPAC::Input - The great new WebPAC::Input!  WebPAC::Input - core module for input file format
15    
16  =head1 VERSION  =head1 VERSION
17    
18  Version 0.01  Version 0.02
19    
20  =cut  =cut
21    
22  our $VERSION = '0.01';  our $VERSION = '0.02';
23    
24  =head1 SYNOPSIS  =head1 SYNOPSIS
25    
26  Quick summary of what the module does.  This module is used as base class for all database specific modules
27    (basically, files which have one handle, fixed size while indexing and some
28    kind of numeric idefinirier which goes from 1 to filesize).
29    
30  Perhaps a little code snippet.  Perhaps a little code snippet.
31    
32      use WebPAC::Input;      use WebPAC::Input;
33    
34      my $foo = WebPAC::Input->new();      my $db = WebPAC::Input->new(
35      ...          format => 'NULL',
36            config => $config,
37            lookup => $lookup_obj,
38            low_mem => 1,
39        );
40    
41        $db->open('/path/to/database');
42        print "database size: ",$db->size,"\n";
43        while (my $row = $db->fetch) {
44            ...
45        }
46    
47  =head1 EXPORT  =head1 FUNCTIONS
48    
49  A list of functions that can be exported.  You can delete this section  =head2 new
 if you don't export anything, such as for a purely object-oriented module.  
50    
51  =head1 FUNCTIONS  Create new input database object.
52    
53      my $db = new WebPAC::Input(
54            format => 'NULL'
55            code_page => 'ISO-8859-2',
56            low_mem => 1,
57      );
58    
59    Optional parametar C<code_page> specify application code page (which will be
60    used internally). This should probably be your terminal encoding, and by
61    default, it C<ISO-8859-2>.
62    
63    Default is not to use C<low_mem> options (see L<MEMORY USAGE> below).
64    
65    This function will also call low-level C<init> if it exists with same
66    parametars.
67    
68    =cut
69    
70    sub new {
71            my $class = shift;
72            my $self = {@_};
73            bless($self, $class);
74    
75            my $log = $self->_get_logger;
76    
77            # check if required subclasses are implemented
78            foreach my $subclass (qw/open_db fetch_rec/) {
79                    $log->logdie("missing implementation of $subclass") unless ($self->SUPER::can($subclass));
80            }
81    
82            if ($self->can('init')) {
83                    $log->debug("calling init");
84                    $self->init(@_);
85            }
86    
87            $self->{'code_page'} ||= 'ISO-8859-2';
88    
89            # running with low_mem flag? well, use DBM::Deep then.
90            if ($self->{'low_mem'}) {
91                    $log->info("running with low_mem which impacts performance (<32 Mb memory usage)");
92    
93                    my $db_file = "data.db";
94    
95                    if (-e $db_file) {
96                            unlink $db_file or $log->logdie("can't remove '$db_file' from last run");
97                            $log->debug("removed '$db_file' from last run");
98                    }
99    
100                    require DBM::Deep;
101    
102                    my $db = new DBM::Deep $db_file;
103    
104                    $log->logdie("DBM::Deep error: $!") unless ($db);
105    
106                    if ($db->error()) {
107                            $log->logdie("can't open '$db_file' under low_mem: ",$db->error());
108                    } else {
109                            $log->debug("using file '$db_file' for DBM::Deep");
110                    }
111    
112                    $self->{'db'} = $db;
113            }
114    
115            $self ? return $self : return undef;
116    }
117    
118    =head2 open
119    
120    This function will read whole database in memory and produce lookups.
121    
122     $isis->open(
123            path => '/path/to/database/file',
124            code_page => '852',
125            limit_mfn => 500,
126            start_mfn => 6000,
127            lookup => $lookup_obj,
128     );
129    
130    By default, C<code_page> is assumed to be C<852>.
131    
132    If optional parametar C<start_mfn> is set, this will be first MFN to read
133    from database (so you can skip beginning of your database if you need to).
134    
135    If optional parametar C<limit_mfn> is set, it will read just 500 records
136    from database in example above.
137    
138    Returns size of database, regardless of C<start_mfn> and C<limit_mfn>
139    parametars, see also C<$isis->size>.
140    
141    =cut
142    
143    sub open {
144            my $self = shift;
145            my $arg = {@_};
146    
147            my $log = $self->_get_logger();
148    
149            $log->logcroak("need path") if (! $arg->{'path'});
150            my $code_page = $arg->{'code_page'} || '852';
151    
152            # store data in object
153            $self->{'code_page'} = $code_page;
154            foreach my $v (qw/path start_mfn limit_mfn/) {
155                    $self->{$v} = $arg->{$v} if ($arg->{$v});
156            }
157    
158            # create Text::Iconv object
159            $self->{iconv} = Text::Iconv->new($code_page,$self->{'code_page'});
160    
161            my ($db, $size) = $self->open_db(
162                    path => $arg->{path},
163            );
164    
165            unless ($db) {
166                    $log->logwarn("can't open database $arg->{path}, skipping...");
167                    return;
168            }
169    
170            unless ($size) {
171                    $log->logwarn("no records in database $arg->{path}, skipping...");
172                    return;
173            }
174    
175            my $startmfn = 1;
176            my $maxmfn = $size;
177    
178            if (my $s = $self->{start_mfn}) {
179                    $log->info("skipping to MFN $s");
180                    $startmfn = $s;
181            } else {
182                    $self->{start_mfn} = $startmfn;
183            }
184    
185            if ($self->{limit_mfn}) {
186                    $log->info("limiting to ",$self->{limit_mfn}," records");
187                    $maxmfn = $startmfn + $self->{limit_mfn} - 1;
188                    $maxmfn = $size if ($maxmfn > $size);
189            }
190    
191            # store size for later
192            $self->{size} = ($maxmfn - $startmfn) ? ($maxmfn - $startmfn + 1) : 0;
193    
194            $log->info("processing $self->{size} records in $code_page, convert to $self->{code_page}");
195    
196            # read database
197            for (my $mfn = $startmfn; $mfn <= $maxmfn; $mfn++) {
198    
199                    $log->debug("mfn: $mfn\n");
200    
201                    my $rec = $self->fetch_rec( $db, $mfn );
202    
203  =head2 function1                  if (! $rec) {
204                            $log->warn("record $mfn empty? skipping...");
205                            next;
206                    }
207    
208                    # store
209                    if ($self->{'low_mem'}) {
210                            $self->{'db'}->put($mfn, $rec);
211                    } else {
212                            $self->{'data'}->{$mfn} = $rec;
213                    }
214    
215                    # create lookup
216                    $self->{'lookup'}->add( $rec ) if ($rec && $self->{'lookup'});
217    
218                    $self->progress_bar($mfn,$maxmfn);
219    
220            }
221    
222            $self->{'current_mfn'} = -1;
223            $self->{'last_pcnt'} = 0;
224    
225            $log->debug("max mfn: $maxmfn");
226    
227            # store max mfn and return it.
228            $self->{'max_mfn'} = $maxmfn;
229    
230            return $size;
231    }
232    
233    =head2 fetch
234    
235    Fetch next record from database. It will also displays progress bar.
236    
237     my $rec = $isis->fetch;
238    
239    Record from this function should probably go to C<data_structure> for
240    normalisation.
241    
242    =cut
243    
244    sub fetch {
245            my $self = shift;
246    
247            my $log = $self->_get_logger();
248    
249            $log->logconfess("it seems that you didn't load database!") unless ($self->{'current_mfn'});
250    
251            if ($self->{'current_mfn'} == -1) {
252                    $self->{'current_mfn'} = $self->{'start_mfn'};
253            } else {
254                    $self->{'current_mfn'}++;
255            }
256    
257            my $mfn = $self->{'current_mfn'};
258    
259            if ($mfn > $self->{'max_mfn'}) {
260                    $self->{'current_mfn'} = $self->{'max_mfn'};
261                    $log->debug("at EOF");
262                    return;
263            }
264    
265            $self->progress_bar($mfn,$self->{'max_mfn'});
266    
267            my $rec;
268    
269            if ($self->{'low_mem'}) {
270                    $rec = $self->{'db'}->get($mfn);
271            } else {
272                    $rec = $self->{'data'}->{$mfn};
273            }
274    
275            $rec ||= 0E0;
276    }
277    
278    =head2 pos
279    
280    Returns current record number (MFN).
281    
282     print $isis->pos;
283    
284    First record in database has position 1.
285    
286  =cut  =cut
287    
288  sub function1 {  sub pos {
289            my $self = shift;
290            return $self->{'current_mfn'};
291  }  }
292    
293  =head2 function2  
294    =head2 size
295    
296    Returns number of records in database
297    
298     print $isis->size;
299    
300    Result from this function can be used to loop through all records
301    
302     foreach my $mfn ( 1 ... $isis->size ) { ... }
303    
304    because it takes into account C<start_mfn> and C<limit_mfn>.
305    
306  =cut  =cut
307    
308  sub function2 {  sub size {
309            my $self = shift;
310            return $self->{'size'};
311  }  }
312    
313  =head1 AUTHOR  =head2 seek
314    
315  Dobrica Pavlinusic, C<< <dpavlin@rot13.org> >>  Seek to specified MFN in file.
316    
317  =head1 BUGS   $isis->seek(42);
318    
319  Please report any bugs or feature requests to  First record in database has position 1.
 C<bug-webpac-input@rt.cpan.org>, or through the web interface at  
 L<http://rt.cpan.org/NoAuth/ReportBug.html?Queue=WebPAC>.  
 I will be notified, and then you'll automatically be notified of progress on  
 your bug as I make changes.  
320    
321  =head1 ACKNOWLEDGEMENTS  =cut
322    
323    sub seek {
324            my $self = shift;
325            my $pos = shift || return;
326    
327            my $log = $self->_get_logger();
328    
329            if ($pos < 1) {
330                    $log->warn("seek before first record");
331                    $pos = 1;
332            } elsif ($pos > $self->{'max_mfn'}) {
333                    $log->warn("seek beyond last record");
334                    $pos = $self->{'max_mfn'};
335            }
336    
337            return $self->{'current_mfn'} = (($pos - 1) || -1);
338    }
339    
340    
341    =head1 MEMORY USAGE
342    
343    C<low_mem> options is double-edged sword. If enabled, WebPAC
344    will run on memory constraint machines (which doesn't have enough
345    physical RAM to create memory structure for whole source database).
346    
347    If your machine has 512Mb or more of RAM and database is around 10000 records,
348    memory shouldn't be an issue. If you don't have enough physical RAM, you
349    might consider using virtual memory (if your operating system is handling it
350    well, like on FreeBSD or Linux) instead of dropping to L<DBM::Deep> to handle
351    parsed structure of ISIS database (this is what C<low_mem> option does).
352    
353    Hitting swap at end of reading source database is probably o.k. However,
354    hitting swap before 90% will dramatically decrease performance and you will
355    be better off with C<low_mem> and using rest of availble memory for
356    operating system disk cache (Linux is particuallary good about this).
357    However, every access to database record will require disk access, so
358    generation phase will be slower 10-100 times.
359    
360    Parsed structures are essential - you just have option to trade RAM memory
361    (which is fast) for disk space (which is slow). Be sure to have planty of
362    disk space if you are using C<low_mem> and thus L<DBM::Deep>.
363    
364    However, when WebPAC is running on desktop machines (or laptops :-), it's
365    highly undesireable for system to start swapping. Using C<low_mem> option can
366    reduce WecPAC memory usage to around 64Mb for same database with lookup
367    fields and sorted indexes which stay in RAM. Performance will suffer, but
368    memory usage will really be minimal. It might be also more confortable to
369    run WebPAC reniced on those machines.
370    
371    
372    =head1 AUTHOR
373    
374    Dobrica Pavlinusic, C<< <dpavlin@rot13.org> >>
375    
376  =head1 COPYRIGHT & LICENSE  =head1 COPYRIGHT & LICENSE
377    

Legend:
Removed from v.1  
changed lines
  Added in v.285

  ViewVC Help
Powered by ViewVC 1.1.26