/[webpac2]/trunk/lib/WebPAC/Input.pm
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Annotation of /trunk/lib/WebPAC/Input.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 289 - (hide annotations)
Sun Dec 18 22:16:44 2005 UTC (18 years, 4 months ago) by dpavlin
File size: 9197 byte(s)
 r11784@llin:  dpavlin | 2005-12-19 05:17:24 +0100
 don't use Exporter after all

1 dpavlin 1 package WebPAC::Input;
2    
3     use warnings;
4     use strict;
5    
6 dpavlin 285 use blib;
7    
8     use WebPAC::Common;
9     use base qw/WebPAC::Common/;
10     use Text::Iconv;
11    
12 dpavlin 1 =head1 NAME
13    
14 dpavlin 286 WebPAC::Input - read different file formats into WebPAC
15 dpavlin 1
16     =head1 VERSION
17    
18 dpavlin 286 Version 0.03
19 dpavlin 1
20     =cut
21    
22 dpavlin 286 our $VERSION = '0.03';
23 dpavlin 1
24     =head1 SYNOPSIS
25    
26 dpavlin 286 This module implements input as database which have fixed and known
27     I<size> while indexing and single unique numeric identifier for database
28     position ranging from 1 to I<size>.
29 dpavlin 1
30 dpavlin 286 Simply, something that is indexed by unmber from 1 .. I<size>.
31    
32     Examples of such databases are CDS/ISIS files, MARC files, lines in
33     text file, and so on.
34    
35     Specific file formats are implemented using low-level interface modules,
36     located in C<WebPAC::Input::*> namespace which export C<open_db>,
37     C<fetch_rec> and optional C<init> functions.
38    
39 dpavlin 1 Perhaps a little code snippet.
40    
41     use WebPAC::Input;
42    
43 dpavlin 3 my $db = WebPAC::Input->new(
44 dpavlin 286 module => 'WebPAC::Input::ISIS',
45     config => $config,
46     lookup => $lookup_obj,
47     low_mem => 1,
48 dpavlin 3 );
49 dpavlin 1
50 dpavlin 3 $db->open('/path/to/database');
51     print "database size: ",$db->size,"\n";
52 dpavlin 286 while (my $rec = $db->fetch) {
53 dpavlin 3 }
54 dpavlin 1
55 dpavlin 286
56    
57 dpavlin 1 =head1 FUNCTIONS
58    
59 dpavlin 3 =head2 new
60 dpavlin 1
61 dpavlin 3 Create new input database object.
62    
63 dpavlin 9 my $db = new WebPAC::Input(
64 dpavlin 286 module => 'WebPAC::Input::MARC',
65 dpavlin 9 code_page => 'ISO-8859-2',
66 dpavlin 10 low_mem => 1,
67 dpavlin 9 );
68 dpavlin 3
69 dpavlin 286 C<module> is low-level file format module. See L<WebPAC::Input::Isis> and
70     L<WebPAC::Input::MARC>.
71    
72 dpavlin 9 Optional parametar C<code_page> specify application code page (which will be
73     used internally). This should probably be your terminal encoding, and by
74     default, it C<ISO-8859-2>.
75    
76 dpavlin 10 Default is not to use C<low_mem> options (see L<MEMORY USAGE> below).
77    
78 dpavlin 285 This function will also call low-level C<init> if it exists with same
79     parametars.
80    
81 dpavlin 1 =cut
82    
83 dpavlin 3 sub new {
84 dpavlin 285 my $class = shift;
85     my $self = {@_};
86 dpavlin 3 bless($self, $class);
87    
88 dpavlin 285 my $log = $self->_get_logger;
89    
90 dpavlin 286 $log->logconfess("specify low-level file format module") unless ($self->{module});
91     my $module = $self->{module};
92     $module =~ s#::#/#g;
93     $module .= '.pm';
94     $log->debug("require low-level module $self->{module} from $module");
95 dpavlin 289
96 dpavlin 286 require $module;
97 dpavlin 289 #eval $self->{module} .'->import';
98 dpavlin 286
99 dpavlin 285 # check if required subclasses are implemented
100 dpavlin 289 foreach my $subclass (qw/open_db fetch_rec init/) {
101     my $n = $self->{module} . '::' . $subclass;
102     if (! defined &{ $n }) {
103     $log->logwarn("missing implementation of $subclass");
104 dpavlin 286 } else {
105 dpavlin 289 $self->{$subclass} = \&{ $n };
106 dpavlin 286 }
107 dpavlin 285 }
108    
109 dpavlin 289 if ($self->{init}) {
110 dpavlin 285 $log->debug("calling init");
111 dpavlin 289 $self->{init}->($self, @_);
112 dpavlin 285 }
113    
114 dpavlin 9 $self->{'code_page'} ||= 'ISO-8859-2';
115    
116 dpavlin 10 # running with low_mem flag? well, use DBM::Deep then.
117     if ($self->{'low_mem'}) {
118     $log->info("running with low_mem which impacts performance (<32 Mb memory usage)");
119    
120     my $db_file = "data.db";
121    
122     if (-e $db_file) {
123     unlink $db_file or $log->logdie("can't remove '$db_file' from last run");
124     $log->debug("removed '$db_file' from last run");
125     }
126    
127     require DBM::Deep;
128    
129     my $db = new DBM::Deep $db_file;
130    
131     $log->logdie("DBM::Deep error: $!") unless ($db);
132    
133     if ($db->error()) {
134     $log->logdie("can't open '$db_file' under low_mem: ",$db->error());
135     } else {
136     $log->debug("using file '$db_file' for DBM::Deep");
137     }
138    
139     $self->{'db'} = $db;
140     }
141    
142 dpavlin 3 $self ? return $self : return undef;
143 dpavlin 1 }
144    
145 dpavlin 285 =head2 open
146    
147     This function will read whole database in memory and produce lookups.
148    
149 dpavlin 286 $input->open(
150 dpavlin 285 path => '/path/to/database/file',
151     code_page => '852',
152 dpavlin 286 limit => 500,
153     offset => 6000,
154 dpavlin 285 lookup => $lookup_obj,
155     );
156    
157     By default, C<code_page> is assumed to be C<852>.
158    
159 dpavlin 286 C<offset> is optional parametar to position at some offset before reading from database.
160 dpavlin 285
161 dpavlin 286 C<limit> is optional parametar to read just C<limit> records from database
162 dpavlin 285
163 dpavlin 286 Returns size of database, regardless of C<offset> and C<limit>
164     parametars, see also C<size>.
165 dpavlin 285
166     =cut
167    
168     sub open {
169     my $self = shift;
170     my $arg = {@_};
171    
172     my $log = $self->_get_logger();
173    
174     $log->logcroak("need path") if (! $arg->{'path'});
175     my $code_page = $arg->{'code_page'} || '852';
176    
177     # store data in object
178     $self->{'code_page'} = $code_page;
179 dpavlin 286 foreach my $v (qw/path offset limit/) {
180 dpavlin 285 $self->{$v} = $arg->{$v} if ($arg->{$v});
181     }
182    
183     # create Text::Iconv object
184     $self->{iconv} = Text::Iconv->new($code_page,$self->{'code_page'});
185    
186 dpavlin 289 my ($db, $size) = $self->{open_db}->( $self,
187 dpavlin 285 path => $arg->{path},
188     );
189    
190     unless ($db) {
191     $log->logwarn("can't open database $arg->{path}, skipping...");
192     return;
193     }
194    
195     unless ($size) {
196     $log->logwarn("no records in database $arg->{path}, skipping...");
197     return;
198     }
199    
200 dpavlin 286 my $offset = 1;
201     my $limit = $size;
202 dpavlin 285
203 dpavlin 286 if (my $s = $self->{offset}) {
204 dpavlin 285 $log->info("skipping to MFN $s");
205 dpavlin 286 $offset = $s;
206 dpavlin 285 } else {
207 dpavlin 286 $self->{offset} = $offset;
208 dpavlin 285 }
209    
210 dpavlin 286 if ($self->{limit}) {
211     $log->info("limiting to ",$self->{limit}," records");
212     $limit = $offset + $self->{limit} - 1;
213     $limit = $size if ($limit > $size);
214 dpavlin 285 }
215    
216     # store size for later
217 dpavlin 286 $self->{size} = ($limit - $offset) ? ($limit - $offset + 1) : 0;
218 dpavlin 285
219     $log->info("processing $self->{size} records in $code_page, convert to $self->{code_page}");
220    
221     # read database
222 dpavlin 287 for (my $pos = $offset; $pos <= $limit; $pos++) {
223 dpavlin 285
224 dpavlin 286 $log->debug("position: $pos\n");
225 dpavlin 285
226 dpavlin 289 my $rec = $self->{fetch_rec}->($self, $db, $pos );
227 dpavlin 285
228     if (! $rec) {
229 dpavlin 286 $log->warn("record $pos empty? skipping...");
230 dpavlin 285 next;
231     }
232    
233     # store
234 dpavlin 286 if ($self->{low_mem}) {
235     $self->{db}->put($pos, $rec);
236 dpavlin 285 } else {
237 dpavlin 286 $self->{data}->{$pos} = $rec;
238 dpavlin 285 }
239    
240     # create lookup
241     $self->{'lookup'}->add( $rec ) if ($rec && $self->{'lookup'});
242    
243 dpavlin 286 $self->progress_bar($pos,$limit);
244 dpavlin 285
245     }
246    
247 dpavlin 286 $self->{pos} = -1;
248     $self->{last_pcnt} = 0;
249 dpavlin 285
250     # store max mfn and return it.
251 dpavlin 286 $self->{max_pos} = $limit;
252     $log->debug("max_pos: $limit");
253 dpavlin 285
254     return $size;
255     }
256    
257     =head2 fetch
258    
259     Fetch next record from database. It will also displays progress bar.
260    
261     my $rec = $isis->fetch;
262    
263     Record from this function should probably go to C<data_structure> for
264     normalisation.
265    
266     =cut
267    
268     sub fetch {
269     my $self = shift;
270    
271     my $log = $self->_get_logger();
272    
273 dpavlin 286 $log->logconfess("it seems that you didn't load database!") unless ($self->{pos});
274 dpavlin 285
275 dpavlin 286 if ($self->{pos} == -1) {
276     $self->{pos} = $self->{offset};
277 dpavlin 285 } else {
278 dpavlin 286 $self->{pos}++;
279 dpavlin 285 }
280    
281 dpavlin 286 my $mfn = $self->{pos};
282 dpavlin 285
283 dpavlin 286 if ($mfn > $self->{max_pos}) {
284     $self->{pos} = $self->{max_pos};
285 dpavlin 285 $log->debug("at EOF");
286     return;
287     }
288    
289 dpavlin 286 $self->progress_bar($mfn,$self->{max_pos});
290 dpavlin 285
291     my $rec;
292    
293 dpavlin 286 if ($self->{low_mem}) {
294     $rec = $self->{db}->get($mfn);
295 dpavlin 285 } else {
296 dpavlin 286 $rec = $self->{data}->{$mfn};
297 dpavlin 285 }
298    
299     $rec ||= 0E0;
300     }
301    
302     =head2 pos
303    
304     Returns current record number (MFN).
305    
306     print $isis->pos;
307    
308     First record in database has position 1.
309    
310     =cut
311    
312     sub pos {
313     my $self = shift;
314 dpavlin 286 return $self->{pos};
315 dpavlin 285 }
316    
317    
318     =head2 size
319    
320     Returns number of records in database
321    
322     print $isis->size;
323    
324     Result from this function can be used to loop through all records
325    
326     foreach my $mfn ( 1 ... $isis->size ) { ... }
327    
328 dpavlin 286 because it takes into account C<offset> and C<limit>.
329 dpavlin 285
330     =cut
331    
332     sub size {
333     my $self = shift;
334 dpavlin 286 return $self->{size};
335 dpavlin 285 }
336    
337     =head2 seek
338    
339     Seek to specified MFN in file.
340    
341     $isis->seek(42);
342    
343     First record in database has position 1.
344    
345     =cut
346    
347     sub seek {
348     my $self = shift;
349     my $pos = shift || return;
350    
351     my $log = $self->_get_logger();
352    
353     if ($pos < 1) {
354     $log->warn("seek before first record");
355     $pos = 1;
356 dpavlin 286 } elsif ($pos > $self->{max_pos}) {
357 dpavlin 285 $log->warn("seek beyond last record");
358 dpavlin 286 $pos = $self->{max_pos};
359 dpavlin 285 }
360    
361 dpavlin 286 return $self->{pos} = (($pos - 1) || -1);
362 dpavlin 285 }
363    
364    
365 dpavlin 3 =head1 MEMORY USAGE
366 dpavlin 1
367 dpavlin 3 C<low_mem> options is double-edged sword. If enabled, WebPAC
368     will run on memory constraint machines (which doesn't have enough
369     physical RAM to create memory structure for whole source database).
370 dpavlin 1
371 dpavlin 3 If your machine has 512Mb or more of RAM and database is around 10000 records,
372     memory shouldn't be an issue. If you don't have enough physical RAM, you
373     might consider using virtual memory (if your operating system is handling it
374     well, like on FreeBSD or Linux) instead of dropping to L<DBM::Deep> to handle
375     parsed structure of ISIS database (this is what C<low_mem> option does).
376 dpavlin 1
377 dpavlin 3 Hitting swap at end of reading source database is probably o.k. However,
378     hitting swap before 90% will dramatically decrease performance and you will
379     be better off with C<low_mem> and using rest of availble memory for
380     operating system disk cache (Linux is particuallary good about this).
381     However, every access to database record will require disk access, so
382     generation phase will be slower 10-100 times.
383    
384     Parsed structures are essential - you just have option to trade RAM memory
385     (which is fast) for disk space (which is slow). Be sure to have planty of
386     disk space if you are using C<low_mem> and thus L<DBM::Deep>.
387    
388     However, when WebPAC is running on desktop machines (or laptops :-), it's
389     highly undesireable for system to start swapping. Using C<low_mem> option can
390     reduce WecPAC memory usage to around 64Mb for same database with lookup
391     fields and sorted indexes which stay in RAM. Performance will suffer, but
392     memory usage will really be minimal. It might be also more confortable to
393     run WebPAC reniced on those machines.
394    
395    
396     =head1 AUTHOR
397    
398     Dobrica Pavlinusic, C<< <dpavlin@rot13.org> >>
399    
400 dpavlin 1 =head1 COPYRIGHT & LICENSE
401    
402     Copyright 2005 Dobrica Pavlinusic, All Rights Reserved.
403    
404     This program is free software; you can redistribute it and/or modify it
405     under the same terms as Perl itself.
406    
407     =cut
408    
409     1; # End of WebPAC::Input

  ViewVC Help
Powered by ViewVC 1.1.26