/[webpac2]/trunk/lib/WebPAC/Input.pm
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Annotation of /trunk/lib/WebPAC/Input.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 286 - (hide annotations)
Sun Dec 18 21:06:46 2005 UTC (18 years, 4 months ago) by dpavlin
File size: 9124 byte(s)
 r11778@llin:  dpavlin | 2005-12-19 03:59:54 +0100
 move work on input

1 dpavlin 1 package WebPAC::Input;
2    
3     use warnings;
4     use strict;
5    
6 dpavlin 285 use blib;
7    
8     use WebPAC::Common;
9     use base qw/WebPAC::Common/;
10     use Text::Iconv;
11    
12 dpavlin 1 =head1 NAME
13    
14 dpavlin 286 WebPAC::Input - read different file formats into WebPAC
15 dpavlin 1
16     =head1 VERSION
17    
18 dpavlin 286 Version 0.03
19 dpavlin 1
20     =cut
21    
22 dpavlin 286 our $VERSION = '0.03';
23 dpavlin 1
24     =head1 SYNOPSIS
25    
26 dpavlin 286 This module implements input as database which have fixed and known
27     I<size> while indexing and single unique numeric identifier for database
28     position ranging from 1 to I<size>.
29 dpavlin 1
30 dpavlin 286 Simply, something that is indexed by unmber from 1 .. I<size>.
31    
32     Examples of such databases are CDS/ISIS files, MARC files, lines in
33     text file, and so on.
34    
35     Specific file formats are implemented using low-level interface modules,
36     located in C<WebPAC::Input::*> namespace which export C<open_db>,
37     C<fetch_rec> and optional C<init> functions.
38    
39 dpavlin 1 Perhaps a little code snippet.
40    
41     use WebPAC::Input;
42    
43 dpavlin 3 my $db = WebPAC::Input->new(
44 dpavlin 286 module => 'WebPAC::Input::ISIS',
45     config => $config,
46     lookup => $lookup_obj,
47     low_mem => 1,
48 dpavlin 3 );
49 dpavlin 1
50 dpavlin 3 $db->open('/path/to/database');
51     print "database size: ",$db->size,"\n";
52 dpavlin 286 while (my $rec = $db->fetch) {
53 dpavlin 3 }
54 dpavlin 1
55 dpavlin 286
56    
57 dpavlin 1 =head1 FUNCTIONS
58    
59 dpavlin 3 =head2 new
60 dpavlin 1
61 dpavlin 3 Create new input database object.
62    
63 dpavlin 9 my $db = new WebPAC::Input(
64 dpavlin 286 module => 'WebPAC::Input::MARC',
65 dpavlin 9 code_page => 'ISO-8859-2',
66 dpavlin 10 low_mem => 1,
67 dpavlin 9 );
68 dpavlin 3
69 dpavlin 286 C<module> is low-level file format module. See L<WebPAC::Input::Isis> and
70     L<WebPAC::Input::MARC>.
71    
72 dpavlin 9 Optional parametar C<code_page> specify application code page (which will be
73     used internally). This should probably be your terminal encoding, and by
74     default, it C<ISO-8859-2>.
75    
76 dpavlin 10 Default is not to use C<low_mem> options (see L<MEMORY USAGE> below).
77    
78 dpavlin 285 This function will also call low-level C<init> if it exists with same
79     parametars.
80    
81 dpavlin 1 =cut
82    
83 dpavlin 3 sub new {
84 dpavlin 285 my $class = shift;
85     my $self = {@_};
86 dpavlin 3 bless($self, $class);
87    
88 dpavlin 285 my $log = $self->_get_logger;
89    
90 dpavlin 286 $log->logconfess("specify low-level file format module") unless ($self->{module});
91     my $module = $self->{module};
92     $module =~ s#::#/#g;
93     $module .= '.pm';
94     $log->debug("require low-level module $self->{module} from $module");
95     require $module;
96     eval $self->{module} .'->import';
97    
98 dpavlin 285 # check if required subclasses are implemented
99     foreach my $subclass (qw/open_db fetch_rec/) {
100 dpavlin 286 if ( $self->can($subclass) ) {
101     $log->debug("imported $subclass");
102     } else {
103     $log->warn("missing $subclass in $self->{module}");
104     }
105 dpavlin 285 }
106    
107     if ($self->can('init')) {
108     $log->debug("calling init");
109     $self->init(@_);
110     }
111    
112 dpavlin 9 $self->{'code_page'} ||= 'ISO-8859-2';
113    
114 dpavlin 10 # running with low_mem flag? well, use DBM::Deep then.
115     if ($self->{'low_mem'}) {
116     $log->info("running with low_mem which impacts performance (<32 Mb memory usage)");
117    
118     my $db_file = "data.db";
119    
120     if (-e $db_file) {
121     unlink $db_file or $log->logdie("can't remove '$db_file' from last run");
122     $log->debug("removed '$db_file' from last run");
123     }
124    
125     require DBM::Deep;
126    
127     my $db = new DBM::Deep $db_file;
128    
129     $log->logdie("DBM::Deep error: $!") unless ($db);
130    
131     if ($db->error()) {
132     $log->logdie("can't open '$db_file' under low_mem: ",$db->error());
133     } else {
134     $log->debug("using file '$db_file' for DBM::Deep");
135     }
136    
137     $self->{'db'} = $db;
138     }
139    
140 dpavlin 3 $self ? return $self : return undef;
141 dpavlin 1 }
142    
143 dpavlin 285 =head2 open
144    
145     This function will read whole database in memory and produce lookups.
146    
147 dpavlin 286 $input->open(
148 dpavlin 285 path => '/path/to/database/file',
149     code_page => '852',
150 dpavlin 286 limit => 500,
151     offset => 6000,
152 dpavlin 285 lookup => $lookup_obj,
153     );
154    
155     By default, C<code_page> is assumed to be C<852>.
156    
157 dpavlin 286 C<offset> is optional parametar to position at some offset before reading from database.
158 dpavlin 285
159 dpavlin 286 C<limit> is optional parametar to read just C<limit> records from database
160 dpavlin 285
161 dpavlin 286 Returns size of database, regardless of C<offset> and C<limit>
162     parametars, see also C<size>.
163 dpavlin 285
164     =cut
165    
166     sub open {
167     my $self = shift;
168     my $arg = {@_};
169    
170     my $log = $self->_get_logger();
171    
172     $log->logcroak("need path") if (! $arg->{'path'});
173     my $code_page = $arg->{'code_page'} || '852';
174    
175     # store data in object
176     $self->{'code_page'} = $code_page;
177 dpavlin 286 foreach my $v (qw/path offset limit/) {
178 dpavlin 285 $self->{$v} = $arg->{$v} if ($arg->{$v});
179     }
180    
181     # create Text::Iconv object
182     $self->{iconv} = Text::Iconv->new($code_page,$self->{'code_page'});
183    
184     my ($db, $size) = $self->open_db(
185     path => $arg->{path},
186     );
187    
188     unless ($db) {
189     $log->logwarn("can't open database $arg->{path}, skipping...");
190     return;
191     }
192    
193     unless ($size) {
194     $log->logwarn("no records in database $arg->{path}, skipping...");
195     return;
196     }
197    
198 dpavlin 286 my $offset = 1;
199     my $limit = $size;
200 dpavlin 285
201 dpavlin 286 if (my $s = $self->{offset}) {
202 dpavlin 285 $log->info("skipping to MFN $s");
203 dpavlin 286 $offset = $s;
204 dpavlin 285 } else {
205 dpavlin 286 $self->{offset} = $offset;
206 dpavlin 285 }
207    
208 dpavlin 286 if ($self->{limit}) {
209     $log->info("limiting to ",$self->{limit}," records");
210     $limit = $offset + $self->{limit} - 1;
211     $limit = $size if ($limit > $size);
212 dpavlin 285 }
213    
214     # store size for later
215 dpavlin 286 $self->{size} = ($limit - $offset) ? ($limit - $offset + 1) : 0;
216 dpavlin 285
217     $log->info("processing $self->{size} records in $code_page, convert to $self->{code_page}");
218    
219     # read database
220 dpavlin 286 for (my $pos = $offset; $pos <= $limit; $mfn++) {
221 dpavlin 285
222 dpavlin 286 $log->debug("position: $pos\n");
223 dpavlin 285
224 dpavlin 286 my $rec = $self->fetch_rec( $db, $pos );
225 dpavlin 285
226     if (! $rec) {
227 dpavlin 286 $log->warn("record $pos empty? skipping...");
228 dpavlin 285 next;
229     }
230    
231     # store
232 dpavlin 286 if ($self->{low_mem}) {
233     $self->{db}->put($pos, $rec);
234 dpavlin 285 } else {
235 dpavlin 286 $self->{data}->{$pos} = $rec;
236 dpavlin 285 }
237    
238     # create lookup
239     $self->{'lookup'}->add( $rec ) if ($rec && $self->{'lookup'});
240    
241 dpavlin 286 $self->progress_bar($pos,$limit);
242 dpavlin 285
243     }
244    
245 dpavlin 286 $self->{pos} = -1;
246     $self->{last_pcnt} = 0;
247 dpavlin 285
248     # store max mfn and return it.
249 dpavlin 286 $self->{max_pos} = $limit;
250     $log->debug("max_pos: $limit");
251 dpavlin 285
252     return $size;
253     }
254    
255     =head2 fetch
256    
257     Fetch next record from database. It will also displays progress bar.
258    
259     my $rec = $isis->fetch;
260    
261     Record from this function should probably go to C<data_structure> for
262     normalisation.
263    
264     =cut
265    
266     sub fetch {
267     my $self = shift;
268    
269     my $log = $self->_get_logger();
270    
271 dpavlin 286 $log->logconfess("it seems that you didn't load database!") unless ($self->{pos});
272 dpavlin 285
273 dpavlin 286 if ($self->{pos} == -1) {
274     $self->{pos} = $self->{offset};
275 dpavlin 285 } else {
276 dpavlin 286 $self->{pos}++;
277 dpavlin 285 }
278    
279 dpavlin 286 my $mfn = $self->{pos};
280 dpavlin 285
281 dpavlin 286 if ($mfn > $self->{max_pos}) {
282     $self->{pos} = $self->{max_pos};
283 dpavlin 285 $log->debug("at EOF");
284     return;
285     }
286    
287 dpavlin 286 $self->progress_bar($mfn,$self->{max_pos});
288 dpavlin 285
289     my $rec;
290    
291 dpavlin 286 if ($self->{low_mem}) {
292     $rec = $self->{db}->get($mfn);
293 dpavlin 285 } else {
294 dpavlin 286 $rec = $self->{data}->{$mfn};
295 dpavlin 285 }
296    
297     $rec ||= 0E0;
298     }
299    
300     =head2 pos
301    
302     Returns current record number (MFN).
303    
304     print $isis->pos;
305    
306     First record in database has position 1.
307    
308     =cut
309    
310     sub pos {
311     my $self = shift;
312 dpavlin 286 return $self->{pos};
313 dpavlin 285 }
314    
315    
316     =head2 size
317    
318     Returns number of records in database
319    
320     print $isis->size;
321    
322     Result from this function can be used to loop through all records
323    
324     foreach my $mfn ( 1 ... $isis->size ) { ... }
325    
326 dpavlin 286 because it takes into account C<offset> and C<limit>.
327 dpavlin 285
328     =cut
329    
330     sub size {
331     my $self = shift;
332 dpavlin 286 return $self->{size};
333 dpavlin 285 }
334    
335     =head2 seek
336    
337     Seek to specified MFN in file.
338    
339     $isis->seek(42);
340    
341     First record in database has position 1.
342    
343     =cut
344    
345     sub seek {
346     my $self = shift;
347     my $pos = shift || return;
348    
349     my $log = $self->_get_logger();
350    
351     if ($pos < 1) {
352     $log->warn("seek before first record");
353     $pos = 1;
354 dpavlin 286 } elsif ($pos > $self->{max_pos}) {
355 dpavlin 285 $log->warn("seek beyond last record");
356 dpavlin 286 $pos = $self->{max_pos};
357 dpavlin 285 }
358    
359 dpavlin 286 return $self->{pos} = (($pos - 1) || -1);
360 dpavlin 285 }
361    
362    
363 dpavlin 3 =head1 MEMORY USAGE
364 dpavlin 1
365 dpavlin 3 C<low_mem> options is double-edged sword. If enabled, WebPAC
366     will run on memory constraint machines (which doesn't have enough
367     physical RAM to create memory structure for whole source database).
368 dpavlin 1
369 dpavlin 3 If your machine has 512Mb or more of RAM and database is around 10000 records,
370     memory shouldn't be an issue. If you don't have enough physical RAM, you
371     might consider using virtual memory (if your operating system is handling it
372     well, like on FreeBSD or Linux) instead of dropping to L<DBM::Deep> to handle
373     parsed structure of ISIS database (this is what C<low_mem> option does).
374 dpavlin 1
375 dpavlin 3 Hitting swap at end of reading source database is probably o.k. However,
376     hitting swap before 90% will dramatically decrease performance and you will
377     be better off with C<low_mem> and using rest of availble memory for
378     operating system disk cache (Linux is particuallary good about this).
379     However, every access to database record will require disk access, so
380     generation phase will be slower 10-100 times.
381    
382     Parsed structures are essential - you just have option to trade RAM memory
383     (which is fast) for disk space (which is slow). Be sure to have planty of
384     disk space if you are using C<low_mem> and thus L<DBM::Deep>.
385    
386     However, when WebPAC is running on desktop machines (or laptops :-), it's
387     highly undesireable for system to start swapping. Using C<low_mem> option can
388     reduce WecPAC memory usage to around 64Mb for same database with lookup
389     fields and sorted indexes which stay in RAM. Performance will suffer, but
390     memory usage will really be minimal. It might be also more confortable to
391     run WebPAC reniced on those machines.
392    
393    
394     =head1 AUTHOR
395    
396     Dobrica Pavlinusic, C<< <dpavlin@rot13.org> >>
397    
398 dpavlin 1 =head1 COPYRIGHT & LICENSE
399    
400     Copyright 2005 Dobrica Pavlinusic, All Rights Reserved.
401    
402     This program is free software; you can redistribute it and/or modify it
403     under the same terms as Perl itself.
404    
405     =cut
406    
407     1; # End of WebPAC::Input

  ViewVC Help
Powered by ViewVC 1.1.26