/[webpac2]/trunk/lib/WebPAC/Input.pm
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Annotation of /trunk/lib/WebPAC/Input.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 339 - (hide annotations)
Sat Dec 31 16:50:11 2005 UTC (18 years, 4 months ago) by dpavlin
File size: 9352 byte(s)
 r346@llin:  dpavlin | 2005-12-31 17:53:29 +0100
 rename $offset and $limit variables to $from_rec and $to_rec to avoid confusion
 with parametars which have same names

1 dpavlin 1 package WebPAC::Input;
2    
3     use warnings;
4     use strict;
5    
6 dpavlin 285 use WebPAC::Common;
7     use base qw/WebPAC::Common/;
8     use Text::Iconv;
9 dpavlin 308 use Data::Dumper;
10 dpavlin 285
11 dpavlin 1 =head1 NAME
12    
13 dpavlin 286 WebPAC::Input - read different file formats into WebPAC
14 dpavlin 1
15     =head1 VERSION
16    
17 dpavlin 286 Version 0.03
18 dpavlin 1
19     =cut
20    
21 dpavlin 286 our $VERSION = '0.03';
22 dpavlin 1
23     =head1 SYNOPSIS
24    
25 dpavlin 286 This module implements input as database which have fixed and known
26     I<size> while indexing and single unique numeric identifier for database
27     position ranging from 1 to I<size>.
28 dpavlin 1
29 dpavlin 286 Simply, something that is indexed by unmber from 1 .. I<size>.
30    
31     Examples of such databases are CDS/ISIS files, MARC files, lines in
32     text file, and so on.
33    
34     Specific file formats are implemented using low-level interface modules,
35     located in C<WebPAC::Input::*> namespace which export C<open_db>,
36     C<fetch_rec> and optional C<init> functions.
37    
38 dpavlin 1 Perhaps a little code snippet.
39    
40     use WebPAC::Input;
41    
42 dpavlin 3 my $db = WebPAC::Input->new(
43 dpavlin 286 module => 'WebPAC::Input::ISIS',
44     config => $config,
45     lookup => $lookup_obj,
46     low_mem => 1,
47 dpavlin 3 );
48 dpavlin 1
49 dpavlin 3 $db->open('/path/to/database');
50     print "database size: ",$db->size,"\n";
51 dpavlin 286 while (my $rec = $db->fetch) {
52 dpavlin 3 }
53 dpavlin 1
54 dpavlin 286
55    
56 dpavlin 1 =head1 FUNCTIONS
57    
58 dpavlin 3 =head2 new
59 dpavlin 1
60 dpavlin 3 Create new input database object.
61    
62 dpavlin 9 my $db = new WebPAC::Input(
63 dpavlin 286 module => 'WebPAC::Input::MARC',
64 dpavlin 9 code_page => 'ISO-8859-2',
65 dpavlin 10 low_mem => 1,
66 dpavlin 9 );
67 dpavlin 3
68 dpavlin 286 C<module> is low-level file format module. See L<WebPAC::Input::Isis> and
69     L<WebPAC::Input::MARC>.
70    
71 dpavlin 9 Optional parametar C<code_page> specify application code page (which will be
72     used internally). This should probably be your terminal encoding, and by
73     default, it C<ISO-8859-2>.
74    
75 dpavlin 10 Default is not to use C<low_mem> options (see L<MEMORY USAGE> below).
76    
77 dpavlin 285 This function will also call low-level C<init> if it exists with same
78     parametars.
79    
80 dpavlin 1 =cut
81    
82 dpavlin 3 sub new {
83 dpavlin 285 my $class = shift;
84     my $self = {@_};
85 dpavlin 3 bless($self, $class);
86    
87 dpavlin 285 my $log = $self->_get_logger;
88    
89 dpavlin 286 $log->logconfess("specify low-level file format module") unless ($self->{module});
90     my $module = $self->{module};
91     $module =~ s#::#/#g;
92     $module .= '.pm';
93     $log->debug("require low-level module $self->{module} from $module");
94 dpavlin 289
95 dpavlin 286 require $module;
96 dpavlin 289 #eval $self->{module} .'->import';
97 dpavlin 286
98 dpavlin 285 # check if required subclasses are implemented
99 dpavlin 289 foreach my $subclass (qw/open_db fetch_rec init/) {
100     my $n = $self->{module} . '::' . $subclass;
101     if (! defined &{ $n }) {
102 dpavlin 290 my $missing = "missing $subclass in $self->{module}";
103 dpavlin 301 $self->{$subclass} = sub { $log->logwarn($missing) };
104 dpavlin 286 } else {
105 dpavlin 289 $self->{$subclass} = \&{ $n };
106 dpavlin 286 }
107 dpavlin 285 }
108    
109 dpavlin 289 if ($self->{init}) {
110 dpavlin 285 $log->debug("calling init");
111 dpavlin 289 $self->{init}->($self, @_);
112 dpavlin 285 }
113    
114 dpavlin 9 $self->{'code_page'} ||= 'ISO-8859-2';
115    
116 dpavlin 10 # running with low_mem flag? well, use DBM::Deep then.
117     if ($self->{'low_mem'}) {
118     $log->info("running with low_mem which impacts performance (<32 Mb memory usage)");
119    
120     my $db_file = "data.db";
121    
122     if (-e $db_file) {
123     unlink $db_file or $log->logdie("can't remove '$db_file' from last run");
124     $log->debug("removed '$db_file' from last run");
125     }
126    
127     require DBM::Deep;
128    
129     my $db = new DBM::Deep $db_file;
130    
131     $log->logdie("DBM::Deep error: $!") unless ($db);
132    
133     if ($db->error()) {
134     $log->logdie("can't open '$db_file' under low_mem: ",$db->error());
135     } else {
136     $log->debug("using file '$db_file' for DBM::Deep");
137     }
138    
139     $self->{'db'} = $db;
140     }
141    
142 dpavlin 3 $self ? return $self : return undef;
143 dpavlin 1 }
144    
145 dpavlin 285 =head2 open
146    
147     This function will read whole database in memory and produce lookups.
148    
149 dpavlin 286 $input->open(
150 dpavlin 285 path => '/path/to/database/file',
151     code_page => '852',
152 dpavlin 286 limit => 500,
153     offset => 6000,
154 dpavlin 285 lookup => $lookup_obj,
155     );
156    
157     By default, C<code_page> is assumed to be C<852>.
158    
159 dpavlin 286 C<offset> is optional parametar to position at some offset before reading from database.
160 dpavlin 285
161 dpavlin 286 C<limit> is optional parametar to read just C<limit> records from database
162 dpavlin 285
163 dpavlin 286 Returns size of database, regardless of C<offset> and C<limit>
164     parametars, see also C<size>.
165 dpavlin 285
166     =cut
167    
168     sub open {
169     my $self = shift;
170     my $arg = {@_};
171    
172     my $log = $self->_get_logger();
173    
174     $log->logcroak("need path") if (! $arg->{'path'});
175     my $code_page = $arg->{'code_page'} || '852';
176    
177     # store data in object
178 dpavlin 292 $self->{'input_code_page'} = $code_page;
179 dpavlin 286 foreach my $v (qw/path offset limit/) {
180 dpavlin 285 $self->{$v} = $arg->{$v} if ($arg->{$v});
181     }
182    
183     # create Text::Iconv object
184     $self->{iconv} = Text::Iconv->new($code_page,$self->{'code_page'});
185    
186 dpavlin 289 my ($db, $size) = $self->{open_db}->( $self,
187 dpavlin 285 path => $arg->{path},
188     );
189    
190     unless ($db) {
191     $log->logwarn("can't open database $arg->{path}, skipping...");
192     return;
193     }
194    
195     unless ($size) {
196     $log->logwarn("no records in database $arg->{path}, skipping...");
197     return;
198     }
199    
200 dpavlin 339 my $from_rec = 1;
201     my $to_rec = $size;
202 dpavlin 285
203 dpavlin 286 if (my $s = $self->{offset}) {
204 dpavlin 285 $log->info("skipping to MFN $s");
205 dpavlin 339 $from_rec = $s;
206 dpavlin 285 } else {
207 dpavlin 339 $self->{offset} = $from_rec;
208 dpavlin 285 }
209    
210 dpavlin 286 if ($self->{limit}) {
211 dpavlin 301 $log->debug("limiting to ",$self->{limit}," records");
212 dpavlin 339 $to_rec = $from_rec + $self->{limit} - 1;
213     $to_rec = $size if ($to_rec > $size);
214 dpavlin 285 }
215    
216     # store size for later
217 dpavlin 339 $self->{size} = ($to_rec - $from_rec) ? ($to_rec - $from_rec + 1) : 0;
218 dpavlin 285
219 dpavlin 339 $log->info("processing $self->{size}/$size records [$from_rec-$to_rec] convert $code_page -> $self->{code_page}");
220 dpavlin 285
221     # read database
222 dpavlin 339 for (my $pos = $from_rec; $pos <= $to_rec; $pos++) {
223 dpavlin 285
224 dpavlin 286 $log->debug("position: $pos\n");
225 dpavlin 285
226 dpavlin 289 my $rec = $self->{fetch_rec}->($self, $db, $pos );
227 dpavlin 285
228 dpavlin 308 $log->debug(sub { Dumper($rec) });
229    
230 dpavlin 285 if (! $rec) {
231 dpavlin 286 $log->warn("record $pos empty? skipping...");
232 dpavlin 285 next;
233     }
234    
235     # store
236 dpavlin 286 if ($self->{low_mem}) {
237     $self->{db}->put($pos, $rec);
238 dpavlin 285 } else {
239 dpavlin 286 $self->{data}->{$pos} = $rec;
240 dpavlin 285 }
241    
242     # create lookup
243     $self->{'lookup'}->add( $rec ) if ($rec && $self->{'lookup'});
244    
245 dpavlin 339 $self->progress_bar($pos,$to_rec);
246 dpavlin 285
247     }
248    
249 dpavlin 286 $self->{pos} = -1;
250     $self->{last_pcnt} = 0;
251 dpavlin 285
252     # store max mfn and return it.
253 dpavlin 339 $self->{max_pos} = $to_rec;
254     $log->debug("max_pos: $to_rec");
255 dpavlin 285
256     return $size;
257     }
258    
259     =head2 fetch
260    
261     Fetch next record from database. It will also displays progress bar.
262    
263     my $rec = $isis->fetch;
264    
265     Record from this function should probably go to C<data_structure> for
266     normalisation.
267    
268     =cut
269    
270     sub fetch {
271     my $self = shift;
272    
273     my $log = $self->_get_logger();
274    
275 dpavlin 286 $log->logconfess("it seems that you didn't load database!") unless ($self->{pos});
276 dpavlin 285
277 dpavlin 286 if ($self->{pos} == -1) {
278     $self->{pos} = $self->{offset};
279 dpavlin 285 } else {
280 dpavlin 286 $self->{pos}++;
281 dpavlin 285 }
282    
283 dpavlin 286 my $mfn = $self->{pos};
284 dpavlin 285
285 dpavlin 286 if ($mfn > $self->{max_pos}) {
286     $self->{pos} = $self->{max_pos};
287 dpavlin 285 $log->debug("at EOF");
288     return;
289     }
290    
291 dpavlin 286 $self->progress_bar($mfn,$self->{max_pos});
292 dpavlin 285
293     my $rec;
294    
295 dpavlin 286 if ($self->{low_mem}) {
296     $rec = $self->{db}->get($mfn);
297 dpavlin 285 } else {
298 dpavlin 286 $rec = $self->{data}->{$mfn};
299 dpavlin 285 }
300    
301     $rec ||= 0E0;
302     }
303    
304     =head2 pos
305    
306     Returns current record number (MFN).
307    
308     print $isis->pos;
309    
310     First record in database has position 1.
311    
312     =cut
313    
314     sub pos {
315     my $self = shift;
316 dpavlin 286 return $self->{pos};
317 dpavlin 285 }
318    
319    
320     =head2 size
321    
322     Returns number of records in database
323    
324     print $isis->size;
325    
326     Result from this function can be used to loop through all records
327    
328     foreach my $mfn ( 1 ... $isis->size ) { ... }
329    
330 dpavlin 286 because it takes into account C<offset> and C<limit>.
331 dpavlin 285
332     =cut
333    
334     sub size {
335     my $self = shift;
336 dpavlin 286 return $self->{size};
337 dpavlin 285 }
338    
339     =head2 seek
340    
341     Seek to specified MFN in file.
342    
343     $isis->seek(42);
344    
345     First record in database has position 1.
346    
347     =cut
348    
349     sub seek {
350     my $self = shift;
351     my $pos = shift || return;
352    
353     my $log = $self->_get_logger();
354    
355     if ($pos < 1) {
356     $log->warn("seek before first record");
357     $pos = 1;
358 dpavlin 286 } elsif ($pos > $self->{max_pos}) {
359 dpavlin 285 $log->warn("seek beyond last record");
360 dpavlin 286 $pos = $self->{max_pos};
361 dpavlin 285 }
362    
363 dpavlin 286 return $self->{pos} = (($pos - 1) || -1);
364 dpavlin 285 }
365    
366    
367 dpavlin 3 =head1 MEMORY USAGE
368 dpavlin 1
369 dpavlin 3 C<low_mem> options is double-edged sword. If enabled, WebPAC
370     will run on memory constraint machines (which doesn't have enough
371     physical RAM to create memory structure for whole source database).
372 dpavlin 1
373 dpavlin 3 If your machine has 512Mb or more of RAM and database is around 10000 records,
374     memory shouldn't be an issue. If you don't have enough physical RAM, you
375     might consider using virtual memory (if your operating system is handling it
376     well, like on FreeBSD or Linux) instead of dropping to L<DBM::Deep> to handle
377     parsed structure of ISIS database (this is what C<low_mem> option does).
378 dpavlin 1
379 dpavlin 3 Hitting swap at end of reading source database is probably o.k. However,
380     hitting swap before 90% will dramatically decrease performance and you will
381     be better off with C<low_mem> and using rest of availble memory for
382     operating system disk cache (Linux is particuallary good about this).
383     However, every access to database record will require disk access, so
384     generation phase will be slower 10-100 times.
385    
386     Parsed structures are essential - you just have option to trade RAM memory
387     (which is fast) for disk space (which is slow). Be sure to have planty of
388     disk space if you are using C<low_mem> and thus L<DBM::Deep>.
389    
390     However, when WebPAC is running on desktop machines (or laptops :-), it's
391     highly undesireable for system to start swapping. Using C<low_mem> option can
392     reduce WecPAC memory usage to around 64Mb for same database with lookup
393     fields and sorted indexes which stay in RAM. Performance will suffer, but
394     memory usage will really be minimal. It might be also more confortable to
395     run WebPAC reniced on those machines.
396    
397    
398     =head1 AUTHOR
399    
400     Dobrica Pavlinusic, C<< <dpavlin@rot13.org> >>
401    
402 dpavlin 1 =head1 COPYRIGHT & LICENSE
403    
404     Copyright 2005 Dobrica Pavlinusic, All Rights Reserved.
405    
406     This program is free software; you can redistribute it and/or modify it
407     under the same terms as Perl itself.
408    
409     =cut
410    
411     1; # End of WebPAC::Input

  ViewVC Help
Powered by ViewVC 1.1.26