/[webpac2]/trunk/lib/WebPAC/Input.pm
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Annotation of /trunk/lib/WebPAC/Input.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 307 - (hide annotations)
Tue Dec 20 00:03:04 2005 UTC (18 years, 4 months ago) by dpavlin
File size: 9250 byte(s)
moved clean into WebPAC::Output::Estraier, cleanup

1 dpavlin 1 package WebPAC::Input;
2    
3     use warnings;
4     use strict;
5    
6 dpavlin 285 use WebPAC::Common;
7     use base qw/WebPAC::Common/;
8     use Text::Iconv;
9    
10 dpavlin 1 =head1 NAME
11    
12 dpavlin 286 WebPAC::Input - read different file formats into WebPAC
13 dpavlin 1
14     =head1 VERSION
15    
16 dpavlin 286 Version 0.03
17 dpavlin 1
18     =cut
19    
20 dpavlin 286 our $VERSION = '0.03';
21 dpavlin 1
22     =head1 SYNOPSIS
23    
24 dpavlin 286 This module implements input as database which have fixed and known
25     I<size> while indexing and single unique numeric identifier for database
26     position ranging from 1 to I<size>.
27 dpavlin 1
28 dpavlin 286 Simply, something that is indexed by unmber from 1 .. I<size>.
29    
30     Examples of such databases are CDS/ISIS files, MARC files, lines in
31     text file, and so on.
32    
33     Specific file formats are implemented using low-level interface modules,
34     located in C<WebPAC::Input::*> namespace which export C<open_db>,
35     C<fetch_rec> and optional C<init> functions.
36    
37 dpavlin 1 Perhaps a little code snippet.
38    
39     use WebPAC::Input;
40    
41 dpavlin 3 my $db = WebPAC::Input->new(
42 dpavlin 286 module => 'WebPAC::Input::ISIS',
43     config => $config,
44     lookup => $lookup_obj,
45     low_mem => 1,
46 dpavlin 3 );
47 dpavlin 1
48 dpavlin 3 $db->open('/path/to/database');
49     print "database size: ",$db->size,"\n";
50 dpavlin 286 while (my $rec = $db->fetch) {
51 dpavlin 3 }
52 dpavlin 1
53 dpavlin 286
54    
55 dpavlin 1 =head1 FUNCTIONS
56    
57 dpavlin 3 =head2 new
58 dpavlin 1
59 dpavlin 3 Create new input database object.
60    
61 dpavlin 9 my $db = new WebPAC::Input(
62 dpavlin 286 module => 'WebPAC::Input::MARC',
63 dpavlin 9 code_page => 'ISO-8859-2',
64 dpavlin 10 low_mem => 1,
65 dpavlin 9 );
66 dpavlin 3
67 dpavlin 286 C<module> is low-level file format module. See L<WebPAC::Input::Isis> and
68     L<WebPAC::Input::MARC>.
69    
70 dpavlin 9 Optional parametar C<code_page> specify application code page (which will be
71     used internally). This should probably be your terminal encoding, and by
72     default, it C<ISO-8859-2>.
73    
74 dpavlin 10 Default is not to use C<low_mem> options (see L<MEMORY USAGE> below).
75    
76 dpavlin 285 This function will also call low-level C<init> if it exists with same
77     parametars.
78    
79 dpavlin 1 =cut
80    
81 dpavlin 3 sub new {
82 dpavlin 285 my $class = shift;
83     my $self = {@_};
84 dpavlin 3 bless($self, $class);
85    
86 dpavlin 285 my $log = $self->_get_logger;
87    
88 dpavlin 286 $log->logconfess("specify low-level file format module") unless ($self->{module});
89     my $module = $self->{module};
90     $module =~ s#::#/#g;
91     $module .= '.pm';
92     $log->debug("require low-level module $self->{module} from $module");
93 dpavlin 289
94 dpavlin 286 require $module;
95 dpavlin 289 #eval $self->{module} .'->import';
96 dpavlin 286
97 dpavlin 285 # check if required subclasses are implemented
98 dpavlin 289 foreach my $subclass (qw/open_db fetch_rec init/) {
99     my $n = $self->{module} . '::' . $subclass;
100     if (! defined &{ $n }) {
101 dpavlin 290 my $missing = "missing $subclass in $self->{module}";
102 dpavlin 301 $self->{$subclass} = sub { $log->logwarn($missing) };
103 dpavlin 286 } else {
104 dpavlin 289 $self->{$subclass} = \&{ $n };
105 dpavlin 286 }
106 dpavlin 285 }
107    
108 dpavlin 289 if ($self->{init}) {
109 dpavlin 285 $log->debug("calling init");
110 dpavlin 289 $self->{init}->($self, @_);
111 dpavlin 285 }
112    
113 dpavlin 9 $self->{'code_page'} ||= 'ISO-8859-2';
114    
115 dpavlin 10 # running with low_mem flag? well, use DBM::Deep then.
116     if ($self->{'low_mem'}) {
117     $log->info("running with low_mem which impacts performance (<32 Mb memory usage)");
118    
119     my $db_file = "data.db";
120    
121     if (-e $db_file) {
122     unlink $db_file or $log->logdie("can't remove '$db_file' from last run");
123     $log->debug("removed '$db_file' from last run");
124     }
125    
126     require DBM::Deep;
127    
128     my $db = new DBM::Deep $db_file;
129    
130     $log->logdie("DBM::Deep error: $!") unless ($db);
131    
132     if ($db->error()) {
133     $log->logdie("can't open '$db_file' under low_mem: ",$db->error());
134     } else {
135     $log->debug("using file '$db_file' for DBM::Deep");
136     }
137    
138     $self->{'db'} = $db;
139     }
140    
141 dpavlin 3 $self ? return $self : return undef;
142 dpavlin 1 }
143    
144 dpavlin 285 =head2 open
145    
146     This function will read whole database in memory and produce lookups.
147    
148 dpavlin 286 $input->open(
149 dpavlin 285 path => '/path/to/database/file',
150     code_page => '852',
151 dpavlin 286 limit => 500,
152     offset => 6000,
153 dpavlin 285 lookup => $lookup_obj,
154     );
155    
156     By default, C<code_page> is assumed to be C<852>.
157    
158 dpavlin 286 C<offset> is optional parametar to position at some offset before reading from database.
159 dpavlin 285
160 dpavlin 286 C<limit> is optional parametar to read just C<limit> records from database
161 dpavlin 285
162 dpavlin 286 Returns size of database, regardless of C<offset> and C<limit>
163     parametars, see also C<size>.
164 dpavlin 285
165     =cut
166    
167     sub open {
168     my $self = shift;
169     my $arg = {@_};
170    
171     my $log = $self->_get_logger();
172    
173     $log->logcroak("need path") if (! $arg->{'path'});
174     my $code_page = $arg->{'code_page'} || '852';
175    
176     # store data in object
177 dpavlin 292 $self->{'input_code_page'} = $code_page;
178 dpavlin 286 foreach my $v (qw/path offset limit/) {
179 dpavlin 285 $self->{$v} = $arg->{$v} if ($arg->{$v});
180     }
181    
182     # create Text::Iconv object
183     $self->{iconv} = Text::Iconv->new($code_page,$self->{'code_page'});
184    
185 dpavlin 289 my ($db, $size) = $self->{open_db}->( $self,
186 dpavlin 285 path => $arg->{path},
187     );
188    
189     unless ($db) {
190     $log->logwarn("can't open database $arg->{path}, skipping...");
191     return;
192     }
193    
194     unless ($size) {
195     $log->logwarn("no records in database $arg->{path}, skipping...");
196     return;
197     }
198    
199 dpavlin 286 my $offset = 1;
200     my $limit = $size;
201 dpavlin 285
202 dpavlin 286 if (my $s = $self->{offset}) {
203 dpavlin 285 $log->info("skipping to MFN $s");
204 dpavlin 286 $offset = $s;
205 dpavlin 285 } else {
206 dpavlin 286 $self->{offset} = $offset;
207 dpavlin 285 }
208    
209 dpavlin 286 if ($self->{limit}) {
210 dpavlin 301 $log->debug("limiting to ",$self->{limit}," records");
211 dpavlin 286 $limit = $offset + $self->{limit} - 1;
212     $limit = $size if ($limit > $size);
213 dpavlin 285 }
214    
215     # store size for later
216 dpavlin 286 $self->{size} = ($limit - $offset) ? ($limit - $offset + 1) : 0;
217 dpavlin 285
218     $log->info("processing $self->{size} records in $code_page, convert to $self->{code_page}");
219    
220     # read database
221 dpavlin 287 for (my $pos = $offset; $pos <= $limit; $pos++) {
222 dpavlin 285
223 dpavlin 286 $log->debug("position: $pos\n");
224 dpavlin 285
225 dpavlin 289 my $rec = $self->{fetch_rec}->($self, $db, $pos );
226 dpavlin 285
227     if (! $rec) {
228 dpavlin 286 $log->warn("record $pos empty? skipping...");
229 dpavlin 285 next;
230     }
231    
232     # store
233 dpavlin 286 if ($self->{low_mem}) {
234     $self->{db}->put($pos, $rec);
235 dpavlin 285 } else {
236 dpavlin 286 $self->{data}->{$pos} = $rec;
237 dpavlin 285 }
238    
239     # create lookup
240     $self->{'lookup'}->add( $rec ) if ($rec && $self->{'lookup'});
241    
242 dpavlin 286 $self->progress_bar($pos,$limit);
243 dpavlin 285
244     }
245    
246 dpavlin 286 $self->{pos} = -1;
247     $self->{last_pcnt} = 0;
248 dpavlin 285
249     # store max mfn and return it.
250 dpavlin 286 $self->{max_pos} = $limit;
251     $log->debug("max_pos: $limit");
252 dpavlin 285
253     return $size;
254     }
255    
256     =head2 fetch
257    
258     Fetch next record from database. It will also displays progress bar.
259    
260     my $rec = $isis->fetch;
261    
262     Record from this function should probably go to C<data_structure> for
263     normalisation.
264    
265     =cut
266    
267     sub fetch {
268     my $self = shift;
269    
270     my $log = $self->_get_logger();
271    
272 dpavlin 286 $log->logconfess("it seems that you didn't load database!") unless ($self->{pos});
273 dpavlin 285
274 dpavlin 286 if ($self->{pos} == -1) {
275     $self->{pos} = $self->{offset};
276 dpavlin 285 } else {
277 dpavlin 286 $self->{pos}++;
278 dpavlin 285 }
279    
280 dpavlin 286 my $mfn = $self->{pos};
281 dpavlin 285
282 dpavlin 286 if ($mfn > $self->{max_pos}) {
283     $self->{pos} = $self->{max_pos};
284 dpavlin 285 $log->debug("at EOF");
285     return;
286     }
287    
288 dpavlin 286 $self->progress_bar($mfn,$self->{max_pos});
289 dpavlin 285
290     my $rec;
291    
292 dpavlin 286 if ($self->{low_mem}) {
293     $rec = $self->{db}->get($mfn);
294 dpavlin 285 } else {
295 dpavlin 286 $rec = $self->{data}->{$mfn};
296 dpavlin 285 }
297    
298     $rec ||= 0E0;
299     }
300    
301     =head2 pos
302    
303     Returns current record number (MFN).
304    
305     print $isis->pos;
306    
307     First record in database has position 1.
308    
309     =cut
310    
311     sub pos {
312     my $self = shift;
313 dpavlin 286 return $self->{pos};
314 dpavlin 285 }
315    
316    
317     =head2 size
318    
319     Returns number of records in database
320    
321     print $isis->size;
322    
323     Result from this function can be used to loop through all records
324    
325     foreach my $mfn ( 1 ... $isis->size ) { ... }
326    
327 dpavlin 286 because it takes into account C<offset> and C<limit>.
328 dpavlin 285
329     =cut
330    
331     sub size {
332     my $self = shift;
333 dpavlin 286 return $self->{size};
334 dpavlin 285 }
335    
336     =head2 seek
337    
338     Seek to specified MFN in file.
339    
340     $isis->seek(42);
341    
342     First record in database has position 1.
343    
344     =cut
345    
346     sub seek {
347     my $self = shift;
348     my $pos = shift || return;
349    
350     my $log = $self->_get_logger();
351    
352     if ($pos < 1) {
353     $log->warn("seek before first record");
354     $pos = 1;
355 dpavlin 286 } elsif ($pos > $self->{max_pos}) {
356 dpavlin 285 $log->warn("seek beyond last record");
357 dpavlin 286 $pos = $self->{max_pos};
358 dpavlin 285 }
359    
360 dpavlin 286 return $self->{pos} = (($pos - 1) || -1);
361 dpavlin 285 }
362    
363    
364 dpavlin 3 =head1 MEMORY USAGE
365 dpavlin 1
366 dpavlin 3 C<low_mem> options is double-edged sword. If enabled, WebPAC
367     will run on memory constraint machines (which doesn't have enough
368     physical RAM to create memory structure for whole source database).
369 dpavlin 1
370 dpavlin 3 If your machine has 512Mb or more of RAM and database is around 10000 records,
371     memory shouldn't be an issue. If you don't have enough physical RAM, you
372     might consider using virtual memory (if your operating system is handling it
373     well, like on FreeBSD or Linux) instead of dropping to L<DBM::Deep> to handle
374     parsed structure of ISIS database (this is what C<low_mem> option does).
375 dpavlin 1
376 dpavlin 3 Hitting swap at end of reading source database is probably o.k. However,
377     hitting swap before 90% will dramatically decrease performance and you will
378     be better off with C<low_mem> and using rest of availble memory for
379     operating system disk cache (Linux is particuallary good about this).
380     However, every access to database record will require disk access, so
381     generation phase will be slower 10-100 times.
382    
383     Parsed structures are essential - you just have option to trade RAM memory
384     (which is fast) for disk space (which is slow). Be sure to have planty of
385     disk space if you are using C<low_mem> and thus L<DBM::Deep>.
386    
387     However, when WebPAC is running on desktop machines (or laptops :-), it's
388     highly undesireable for system to start swapping. Using C<low_mem> option can
389     reduce WecPAC memory usage to around 64Mb for same database with lookup
390     fields and sorted indexes which stay in RAM. Performance will suffer, but
391     memory usage will really be minimal. It might be also more confortable to
392     run WebPAC reniced on those machines.
393    
394    
395     =head1 AUTHOR
396    
397     Dobrica Pavlinusic, C<< <dpavlin@rot13.org> >>
398    
399 dpavlin 1 =head1 COPYRIGHT & LICENSE
400    
401     Copyright 2005 Dobrica Pavlinusic, All Rights Reserved.
402    
403     This program is free software; you can redistribute it and/or modify it
404     under the same terms as Perl itself.
405    
406     =cut
407    
408     1; # End of WebPAC::Input

  ViewVC Help
Powered by ViewVC 1.1.26