/[webpac2]/trunk/lib/WebPAC/Input.pm
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Annotation of /trunk/lib/WebPAC/Input.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 301 - (hide annotations)
Mon Dec 19 21:26:04 2005 UTC (18 years, 4 months ago) by dpavlin
File size: 9261 byte(s)
 r322@athlon:  dpavlin | 2005-12-19 22:27:06 +0100
 make run.pl moderatly chatty (along with other modules), added command line options
 (try perldoc run.pl) new target index (to reindex all) and run (to index
 first 100 records of each database)

1 dpavlin 1 package WebPAC::Input;
2    
3     use warnings;
4     use strict;
5    
6 dpavlin 285 use blib;
7    
8     use WebPAC::Common;
9     use base qw/WebPAC::Common/;
10     use Text::Iconv;
11    
12 dpavlin 1 =head1 NAME
13    
14 dpavlin 286 WebPAC::Input - read different file formats into WebPAC
15 dpavlin 1
16     =head1 VERSION
17    
18 dpavlin 286 Version 0.03
19 dpavlin 1
20     =cut
21    
22 dpavlin 286 our $VERSION = '0.03';
23 dpavlin 1
24     =head1 SYNOPSIS
25    
26 dpavlin 286 This module implements input as database which have fixed and known
27     I<size> while indexing and single unique numeric identifier for database
28     position ranging from 1 to I<size>.
29 dpavlin 1
30 dpavlin 286 Simply, something that is indexed by unmber from 1 .. I<size>.
31    
32     Examples of such databases are CDS/ISIS files, MARC files, lines in
33     text file, and so on.
34    
35     Specific file formats are implemented using low-level interface modules,
36     located in C<WebPAC::Input::*> namespace which export C<open_db>,
37     C<fetch_rec> and optional C<init> functions.
38    
39 dpavlin 1 Perhaps a little code snippet.
40    
41     use WebPAC::Input;
42    
43 dpavlin 3 my $db = WebPAC::Input->new(
44 dpavlin 286 module => 'WebPAC::Input::ISIS',
45     config => $config,
46     lookup => $lookup_obj,
47     low_mem => 1,
48 dpavlin 3 );
49 dpavlin 1
50 dpavlin 3 $db->open('/path/to/database');
51     print "database size: ",$db->size,"\n";
52 dpavlin 286 while (my $rec = $db->fetch) {
53 dpavlin 3 }
54 dpavlin 1
55 dpavlin 286
56    
57 dpavlin 1 =head1 FUNCTIONS
58    
59 dpavlin 3 =head2 new
60 dpavlin 1
61 dpavlin 3 Create new input database object.
62    
63 dpavlin 9 my $db = new WebPAC::Input(
64 dpavlin 286 module => 'WebPAC::Input::MARC',
65 dpavlin 9 code_page => 'ISO-8859-2',
66 dpavlin 10 low_mem => 1,
67 dpavlin 9 );
68 dpavlin 3
69 dpavlin 286 C<module> is low-level file format module. See L<WebPAC::Input::Isis> and
70     L<WebPAC::Input::MARC>.
71    
72 dpavlin 9 Optional parametar C<code_page> specify application code page (which will be
73     used internally). This should probably be your terminal encoding, and by
74     default, it C<ISO-8859-2>.
75    
76 dpavlin 10 Default is not to use C<low_mem> options (see L<MEMORY USAGE> below).
77    
78 dpavlin 285 This function will also call low-level C<init> if it exists with same
79     parametars.
80    
81 dpavlin 1 =cut
82    
83 dpavlin 3 sub new {
84 dpavlin 285 my $class = shift;
85     my $self = {@_};
86 dpavlin 3 bless($self, $class);
87    
88 dpavlin 285 my $log = $self->_get_logger;
89    
90 dpavlin 286 $log->logconfess("specify low-level file format module") unless ($self->{module});
91     my $module = $self->{module};
92     $module =~ s#::#/#g;
93     $module .= '.pm';
94     $log->debug("require low-level module $self->{module} from $module");
95 dpavlin 289
96 dpavlin 286 require $module;
97 dpavlin 289 #eval $self->{module} .'->import';
98 dpavlin 286
99 dpavlin 285 # check if required subclasses are implemented
100 dpavlin 289 foreach my $subclass (qw/open_db fetch_rec init/) {
101     my $n = $self->{module} . '::' . $subclass;
102     if (! defined &{ $n }) {
103 dpavlin 290 my $missing = "missing $subclass in $self->{module}";
104 dpavlin 301 $self->{$subclass} = sub { $log->logwarn($missing) };
105 dpavlin 286 } else {
106 dpavlin 289 $self->{$subclass} = \&{ $n };
107 dpavlin 286 }
108 dpavlin 285 }
109    
110 dpavlin 289 if ($self->{init}) {
111 dpavlin 285 $log->debug("calling init");
112 dpavlin 289 $self->{init}->($self, @_);
113 dpavlin 285 }
114    
115 dpavlin 9 $self->{'code_page'} ||= 'ISO-8859-2';
116    
117 dpavlin 10 # running with low_mem flag? well, use DBM::Deep then.
118     if ($self->{'low_mem'}) {
119     $log->info("running with low_mem which impacts performance (<32 Mb memory usage)");
120    
121     my $db_file = "data.db";
122    
123     if (-e $db_file) {
124     unlink $db_file or $log->logdie("can't remove '$db_file' from last run");
125     $log->debug("removed '$db_file' from last run");
126     }
127    
128     require DBM::Deep;
129    
130     my $db = new DBM::Deep $db_file;
131    
132     $log->logdie("DBM::Deep error: $!") unless ($db);
133    
134     if ($db->error()) {
135     $log->logdie("can't open '$db_file' under low_mem: ",$db->error());
136     } else {
137     $log->debug("using file '$db_file' for DBM::Deep");
138     }
139    
140     $self->{'db'} = $db;
141     }
142    
143 dpavlin 3 $self ? return $self : return undef;
144 dpavlin 1 }
145    
146 dpavlin 285 =head2 open
147    
148     This function will read whole database in memory and produce lookups.
149    
150 dpavlin 286 $input->open(
151 dpavlin 285 path => '/path/to/database/file',
152     code_page => '852',
153 dpavlin 286 limit => 500,
154     offset => 6000,
155 dpavlin 285 lookup => $lookup_obj,
156     );
157    
158     By default, C<code_page> is assumed to be C<852>.
159    
160 dpavlin 286 C<offset> is optional parametar to position at some offset before reading from database.
161 dpavlin 285
162 dpavlin 286 C<limit> is optional parametar to read just C<limit> records from database
163 dpavlin 285
164 dpavlin 286 Returns size of database, regardless of C<offset> and C<limit>
165     parametars, see also C<size>.
166 dpavlin 285
167     =cut
168    
169     sub open {
170     my $self = shift;
171     my $arg = {@_};
172    
173     my $log = $self->_get_logger();
174    
175     $log->logcroak("need path") if (! $arg->{'path'});
176     my $code_page = $arg->{'code_page'} || '852';
177    
178     # store data in object
179 dpavlin 292 $self->{'input_code_page'} = $code_page;
180 dpavlin 286 foreach my $v (qw/path offset limit/) {
181 dpavlin 285 $self->{$v} = $arg->{$v} if ($arg->{$v});
182     }
183    
184     # create Text::Iconv object
185     $self->{iconv} = Text::Iconv->new($code_page,$self->{'code_page'});
186    
187 dpavlin 289 my ($db, $size) = $self->{open_db}->( $self,
188 dpavlin 285 path => $arg->{path},
189     );
190    
191     unless ($db) {
192     $log->logwarn("can't open database $arg->{path}, skipping...");
193     return;
194     }
195    
196     unless ($size) {
197     $log->logwarn("no records in database $arg->{path}, skipping...");
198     return;
199     }
200    
201 dpavlin 286 my $offset = 1;
202     my $limit = $size;
203 dpavlin 285
204 dpavlin 286 if (my $s = $self->{offset}) {
205 dpavlin 285 $log->info("skipping to MFN $s");
206 dpavlin 286 $offset = $s;
207 dpavlin 285 } else {
208 dpavlin 286 $self->{offset} = $offset;
209 dpavlin 285 }
210    
211 dpavlin 286 if ($self->{limit}) {
212 dpavlin 301 $log->debug("limiting to ",$self->{limit}," records");
213 dpavlin 286 $limit = $offset + $self->{limit} - 1;
214     $limit = $size if ($limit > $size);
215 dpavlin 285 }
216    
217     # store size for later
218 dpavlin 286 $self->{size} = ($limit - $offset) ? ($limit - $offset + 1) : 0;
219 dpavlin 285
220     $log->info("processing $self->{size} records in $code_page, convert to $self->{code_page}");
221    
222     # read database
223 dpavlin 287 for (my $pos = $offset; $pos <= $limit; $pos++) {
224 dpavlin 285
225 dpavlin 286 $log->debug("position: $pos\n");
226 dpavlin 285
227 dpavlin 289 my $rec = $self->{fetch_rec}->($self, $db, $pos );
228 dpavlin 285
229     if (! $rec) {
230 dpavlin 286 $log->warn("record $pos empty? skipping...");
231 dpavlin 285 next;
232     }
233    
234     # store
235 dpavlin 286 if ($self->{low_mem}) {
236     $self->{db}->put($pos, $rec);
237 dpavlin 285 } else {
238 dpavlin 286 $self->{data}->{$pos} = $rec;
239 dpavlin 285 }
240    
241     # create lookup
242     $self->{'lookup'}->add( $rec ) if ($rec && $self->{'lookup'});
243    
244 dpavlin 286 $self->progress_bar($pos,$limit);
245 dpavlin 285
246     }
247    
248 dpavlin 286 $self->{pos} = -1;
249     $self->{last_pcnt} = 0;
250 dpavlin 285
251     # store max mfn and return it.
252 dpavlin 286 $self->{max_pos} = $limit;
253     $log->debug("max_pos: $limit");
254 dpavlin 285
255     return $size;
256     }
257    
258     =head2 fetch
259    
260     Fetch next record from database. It will also displays progress bar.
261    
262     my $rec = $isis->fetch;
263    
264     Record from this function should probably go to C<data_structure> for
265     normalisation.
266    
267     =cut
268    
269     sub fetch {
270     my $self = shift;
271    
272     my $log = $self->_get_logger();
273    
274 dpavlin 286 $log->logconfess("it seems that you didn't load database!") unless ($self->{pos});
275 dpavlin 285
276 dpavlin 286 if ($self->{pos} == -1) {
277     $self->{pos} = $self->{offset};
278 dpavlin 285 } else {
279 dpavlin 286 $self->{pos}++;
280 dpavlin 285 }
281    
282 dpavlin 286 my $mfn = $self->{pos};
283 dpavlin 285
284 dpavlin 286 if ($mfn > $self->{max_pos}) {
285     $self->{pos} = $self->{max_pos};
286 dpavlin 285 $log->debug("at EOF");
287     return;
288     }
289    
290 dpavlin 286 $self->progress_bar($mfn,$self->{max_pos});
291 dpavlin 285
292     my $rec;
293    
294 dpavlin 286 if ($self->{low_mem}) {
295     $rec = $self->{db}->get($mfn);
296 dpavlin 285 } else {
297 dpavlin 286 $rec = $self->{data}->{$mfn};
298 dpavlin 285 }
299    
300     $rec ||= 0E0;
301     }
302    
303     =head2 pos
304    
305     Returns current record number (MFN).
306    
307     print $isis->pos;
308    
309     First record in database has position 1.
310    
311     =cut
312    
313     sub pos {
314     my $self = shift;
315 dpavlin 286 return $self->{pos};
316 dpavlin 285 }
317    
318    
319     =head2 size
320    
321     Returns number of records in database
322    
323     print $isis->size;
324    
325     Result from this function can be used to loop through all records
326    
327     foreach my $mfn ( 1 ... $isis->size ) { ... }
328    
329 dpavlin 286 because it takes into account C<offset> and C<limit>.
330 dpavlin 285
331     =cut
332    
333     sub size {
334     my $self = shift;
335 dpavlin 286 return $self->{size};
336 dpavlin 285 }
337    
338     =head2 seek
339    
340     Seek to specified MFN in file.
341    
342     $isis->seek(42);
343    
344     First record in database has position 1.
345    
346     =cut
347    
348     sub seek {
349     my $self = shift;
350     my $pos = shift || return;
351    
352     my $log = $self->_get_logger();
353    
354     if ($pos < 1) {
355     $log->warn("seek before first record");
356     $pos = 1;
357 dpavlin 286 } elsif ($pos > $self->{max_pos}) {
358 dpavlin 285 $log->warn("seek beyond last record");
359 dpavlin 286 $pos = $self->{max_pos};
360 dpavlin 285 }
361    
362 dpavlin 286 return $self->{pos} = (($pos - 1) || -1);
363 dpavlin 285 }
364    
365    
366 dpavlin 3 =head1 MEMORY USAGE
367 dpavlin 1
368 dpavlin 3 C<low_mem> options is double-edged sword. If enabled, WebPAC
369     will run on memory constraint machines (which doesn't have enough
370     physical RAM to create memory structure for whole source database).
371 dpavlin 1
372 dpavlin 3 If your machine has 512Mb or more of RAM and database is around 10000 records,
373     memory shouldn't be an issue. If you don't have enough physical RAM, you
374     might consider using virtual memory (if your operating system is handling it
375     well, like on FreeBSD or Linux) instead of dropping to L<DBM::Deep> to handle
376     parsed structure of ISIS database (this is what C<low_mem> option does).
377 dpavlin 1
378 dpavlin 3 Hitting swap at end of reading source database is probably o.k. However,
379     hitting swap before 90% will dramatically decrease performance and you will
380     be better off with C<low_mem> and using rest of availble memory for
381     operating system disk cache (Linux is particuallary good about this).
382     However, every access to database record will require disk access, so
383     generation phase will be slower 10-100 times.
384    
385     Parsed structures are essential - you just have option to trade RAM memory
386     (which is fast) for disk space (which is slow). Be sure to have planty of
387     disk space if you are using C<low_mem> and thus L<DBM::Deep>.
388    
389     However, when WebPAC is running on desktop machines (or laptops :-), it's
390     highly undesireable for system to start swapping. Using C<low_mem> option can
391     reduce WecPAC memory usage to around 64Mb for same database with lookup
392     fields and sorted indexes which stay in RAM. Performance will suffer, but
393     memory usage will really be minimal. It might be also more confortable to
394     run WebPAC reniced on those machines.
395    
396    
397     =head1 AUTHOR
398    
399     Dobrica Pavlinusic, C<< <dpavlin@rot13.org> >>
400    
401 dpavlin 1 =head1 COPYRIGHT & LICENSE
402    
403     Copyright 2005 Dobrica Pavlinusic, All Rights Reserved.
404    
405     This program is free software; you can redistribute it and/or modify it
406     under the same terms as Perl itself.
407    
408     =cut
409    
410     1; # End of WebPAC::Input

  ViewVC Help
Powered by ViewVC 1.1.26