/[webpac2]/trunk/lib/WebPAC/Input.pm
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Annotation of /trunk/lib/WebPAC/Input.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 292 - (hide annotations)
Sun Dec 18 23:34:30 2005 UTC (18 years, 4 months ago) by dpavlin
File size: 9282 byte(s)
 r11790@llin:  dpavlin | 2005-12-19 06:35:06 +0100
 and small fix for codepage

1 dpavlin 1 package WebPAC::Input;
2    
3     use warnings;
4     use strict;
5    
6 dpavlin 285 use blib;
7    
8     use WebPAC::Common;
9     use base qw/WebPAC::Common/;
10     use Text::Iconv;
11    
12 dpavlin 1 =head1 NAME
13    
14 dpavlin 286 WebPAC::Input - read different file formats into WebPAC
15 dpavlin 1
16     =head1 VERSION
17    
18 dpavlin 286 Version 0.03
19 dpavlin 1
20     =cut
21    
22 dpavlin 286 our $VERSION = '0.03';
23 dpavlin 1
24     =head1 SYNOPSIS
25    
26 dpavlin 286 This module implements input as database which have fixed and known
27     I<size> while indexing and single unique numeric identifier for database
28     position ranging from 1 to I<size>.
29 dpavlin 1
30 dpavlin 286 Simply, something that is indexed by unmber from 1 .. I<size>.
31    
32     Examples of such databases are CDS/ISIS files, MARC files, lines in
33     text file, and so on.
34    
35     Specific file formats are implemented using low-level interface modules,
36     located in C<WebPAC::Input::*> namespace which export C<open_db>,
37     C<fetch_rec> and optional C<init> functions.
38    
39 dpavlin 1 Perhaps a little code snippet.
40    
41     use WebPAC::Input;
42    
43 dpavlin 3 my $db = WebPAC::Input->new(
44 dpavlin 286 module => 'WebPAC::Input::ISIS',
45     config => $config,
46     lookup => $lookup_obj,
47     low_mem => 1,
48 dpavlin 3 );
49 dpavlin 1
50 dpavlin 3 $db->open('/path/to/database');
51     print "database size: ",$db->size,"\n";
52 dpavlin 286 while (my $rec = $db->fetch) {
53 dpavlin 3 }
54 dpavlin 1
55 dpavlin 286
56    
57 dpavlin 1 =head1 FUNCTIONS
58    
59 dpavlin 3 =head2 new
60 dpavlin 1
61 dpavlin 3 Create new input database object.
62    
63 dpavlin 9 my $db = new WebPAC::Input(
64 dpavlin 286 module => 'WebPAC::Input::MARC',
65 dpavlin 9 code_page => 'ISO-8859-2',
66 dpavlin 10 low_mem => 1,
67 dpavlin 9 );
68 dpavlin 3
69 dpavlin 286 C<module> is low-level file format module. See L<WebPAC::Input::Isis> and
70     L<WebPAC::Input::MARC>.
71    
72 dpavlin 9 Optional parametar C<code_page> specify application code page (which will be
73     used internally). This should probably be your terminal encoding, and by
74     default, it C<ISO-8859-2>.
75    
76 dpavlin 10 Default is not to use C<low_mem> options (see L<MEMORY USAGE> below).
77    
78 dpavlin 285 This function will also call low-level C<init> if it exists with same
79     parametars.
80    
81 dpavlin 1 =cut
82    
83 dpavlin 3 sub new {
84 dpavlin 285 my $class = shift;
85     my $self = {@_};
86 dpavlin 3 bless($self, $class);
87    
88 dpavlin 285 my $log = $self->_get_logger;
89    
90 dpavlin 286 $log->logconfess("specify low-level file format module") unless ($self->{module});
91     my $module = $self->{module};
92     $module =~ s#::#/#g;
93     $module .= '.pm';
94     $log->debug("require low-level module $self->{module} from $module");
95 dpavlin 289
96 dpavlin 286 require $module;
97 dpavlin 289 #eval $self->{module} .'->import';
98 dpavlin 286
99 dpavlin 285 # check if required subclasses are implemented
100 dpavlin 289 foreach my $subclass (qw/open_db fetch_rec init/) {
101     my $n = $self->{module} . '::' . $subclass;
102     if (! defined &{ $n }) {
103 dpavlin 290 my $missing = "missing $subclass in $self->{module}";
104     $log->logwarn($missing);
105     $self->{$subclass} = sub { warn "$missing\n" };
106 dpavlin 286 } else {
107 dpavlin 289 $self->{$subclass} = \&{ $n };
108 dpavlin 286 }
109 dpavlin 285 }
110    
111 dpavlin 289 if ($self->{init}) {
112 dpavlin 285 $log->debug("calling init");
113 dpavlin 289 $self->{init}->($self, @_);
114 dpavlin 285 }
115    
116 dpavlin 9 $self->{'code_page'} ||= 'ISO-8859-2';
117    
118 dpavlin 10 # running with low_mem flag? well, use DBM::Deep then.
119     if ($self->{'low_mem'}) {
120     $log->info("running with low_mem which impacts performance (<32 Mb memory usage)");
121    
122     my $db_file = "data.db";
123    
124     if (-e $db_file) {
125     unlink $db_file or $log->logdie("can't remove '$db_file' from last run");
126     $log->debug("removed '$db_file' from last run");
127     }
128    
129     require DBM::Deep;
130    
131     my $db = new DBM::Deep $db_file;
132    
133     $log->logdie("DBM::Deep error: $!") unless ($db);
134    
135     if ($db->error()) {
136     $log->logdie("can't open '$db_file' under low_mem: ",$db->error());
137     } else {
138     $log->debug("using file '$db_file' for DBM::Deep");
139     }
140    
141     $self->{'db'} = $db;
142     }
143    
144 dpavlin 3 $self ? return $self : return undef;
145 dpavlin 1 }
146    
147 dpavlin 285 =head2 open
148    
149     This function will read whole database in memory and produce lookups.
150    
151 dpavlin 286 $input->open(
152 dpavlin 285 path => '/path/to/database/file',
153     code_page => '852',
154 dpavlin 286 limit => 500,
155     offset => 6000,
156 dpavlin 285 lookup => $lookup_obj,
157     );
158    
159     By default, C<code_page> is assumed to be C<852>.
160    
161 dpavlin 286 C<offset> is optional parametar to position at some offset before reading from database.
162 dpavlin 285
163 dpavlin 286 C<limit> is optional parametar to read just C<limit> records from database
164 dpavlin 285
165 dpavlin 286 Returns size of database, regardless of C<offset> and C<limit>
166     parametars, see also C<size>.
167 dpavlin 285
168     =cut
169    
170     sub open {
171     my $self = shift;
172     my $arg = {@_};
173    
174     my $log = $self->_get_logger();
175    
176     $log->logcroak("need path") if (! $arg->{'path'});
177     my $code_page = $arg->{'code_page'} || '852';
178    
179     # store data in object
180 dpavlin 292 $self->{'input_code_page'} = $code_page;
181 dpavlin 286 foreach my $v (qw/path offset limit/) {
182 dpavlin 285 $self->{$v} = $arg->{$v} if ($arg->{$v});
183     }
184    
185     # create Text::Iconv object
186     $self->{iconv} = Text::Iconv->new($code_page,$self->{'code_page'});
187    
188 dpavlin 289 my ($db, $size) = $self->{open_db}->( $self,
189 dpavlin 285 path => $arg->{path},
190     );
191    
192     unless ($db) {
193     $log->logwarn("can't open database $arg->{path}, skipping...");
194     return;
195     }
196    
197     unless ($size) {
198     $log->logwarn("no records in database $arg->{path}, skipping...");
199     return;
200     }
201    
202 dpavlin 286 my $offset = 1;
203     my $limit = $size;
204 dpavlin 285
205 dpavlin 286 if (my $s = $self->{offset}) {
206 dpavlin 285 $log->info("skipping to MFN $s");
207 dpavlin 286 $offset = $s;
208 dpavlin 285 } else {
209 dpavlin 286 $self->{offset} = $offset;
210 dpavlin 285 }
211    
212 dpavlin 286 if ($self->{limit}) {
213     $log->info("limiting to ",$self->{limit}," records");
214     $limit = $offset + $self->{limit} - 1;
215     $limit = $size if ($limit > $size);
216 dpavlin 285 }
217    
218     # store size for later
219 dpavlin 286 $self->{size} = ($limit - $offset) ? ($limit - $offset + 1) : 0;
220 dpavlin 285
221     $log->info("processing $self->{size} records in $code_page, convert to $self->{code_page}");
222    
223     # read database
224 dpavlin 287 for (my $pos = $offset; $pos <= $limit; $pos++) {
225 dpavlin 285
226 dpavlin 286 $log->debug("position: $pos\n");
227 dpavlin 285
228 dpavlin 289 my $rec = $self->{fetch_rec}->($self, $db, $pos );
229 dpavlin 285
230     if (! $rec) {
231 dpavlin 286 $log->warn("record $pos empty? skipping...");
232 dpavlin 285 next;
233     }
234    
235     # store
236 dpavlin 286 if ($self->{low_mem}) {
237     $self->{db}->put($pos, $rec);
238 dpavlin 285 } else {
239 dpavlin 286 $self->{data}->{$pos} = $rec;
240 dpavlin 285 }
241    
242     # create lookup
243     $self->{'lookup'}->add( $rec ) if ($rec && $self->{'lookup'});
244    
245 dpavlin 286 $self->progress_bar($pos,$limit);
246 dpavlin 285
247     }
248    
249 dpavlin 286 $self->{pos} = -1;
250     $self->{last_pcnt} = 0;
251 dpavlin 285
252     # store max mfn and return it.
253 dpavlin 286 $self->{max_pos} = $limit;
254     $log->debug("max_pos: $limit");
255 dpavlin 285
256     return $size;
257     }
258    
259     =head2 fetch
260    
261     Fetch next record from database. It will also displays progress bar.
262    
263     my $rec = $isis->fetch;
264    
265     Record from this function should probably go to C<data_structure> for
266     normalisation.
267    
268     =cut
269    
270     sub fetch {
271     my $self = shift;
272    
273     my $log = $self->_get_logger();
274    
275 dpavlin 286 $log->logconfess("it seems that you didn't load database!") unless ($self->{pos});
276 dpavlin 285
277 dpavlin 286 if ($self->{pos} == -1) {
278     $self->{pos} = $self->{offset};
279 dpavlin 285 } else {
280 dpavlin 286 $self->{pos}++;
281 dpavlin 285 }
282    
283 dpavlin 286 my $mfn = $self->{pos};
284 dpavlin 285
285 dpavlin 286 if ($mfn > $self->{max_pos}) {
286     $self->{pos} = $self->{max_pos};
287 dpavlin 285 $log->debug("at EOF");
288     return;
289     }
290    
291 dpavlin 286 $self->progress_bar($mfn,$self->{max_pos});
292 dpavlin 285
293     my $rec;
294    
295 dpavlin 286 if ($self->{low_mem}) {
296     $rec = $self->{db}->get($mfn);
297 dpavlin 285 } else {
298 dpavlin 286 $rec = $self->{data}->{$mfn};
299 dpavlin 285 }
300    
301     $rec ||= 0E0;
302     }
303    
304     =head2 pos
305    
306     Returns current record number (MFN).
307    
308     print $isis->pos;
309    
310     First record in database has position 1.
311    
312     =cut
313    
314     sub pos {
315     my $self = shift;
316 dpavlin 286 return $self->{pos};
317 dpavlin 285 }
318    
319    
320     =head2 size
321    
322     Returns number of records in database
323    
324     print $isis->size;
325    
326     Result from this function can be used to loop through all records
327    
328     foreach my $mfn ( 1 ... $isis->size ) { ... }
329    
330 dpavlin 286 because it takes into account C<offset> and C<limit>.
331 dpavlin 285
332     =cut
333    
334     sub size {
335     my $self = shift;
336 dpavlin 286 return $self->{size};
337 dpavlin 285 }
338    
339     =head2 seek
340    
341     Seek to specified MFN in file.
342    
343     $isis->seek(42);
344    
345     First record in database has position 1.
346    
347     =cut
348    
349     sub seek {
350     my $self = shift;
351     my $pos = shift || return;
352    
353     my $log = $self->_get_logger();
354    
355     if ($pos < 1) {
356     $log->warn("seek before first record");
357     $pos = 1;
358 dpavlin 286 } elsif ($pos > $self->{max_pos}) {
359 dpavlin 285 $log->warn("seek beyond last record");
360 dpavlin 286 $pos = $self->{max_pos};
361 dpavlin 285 }
362    
363 dpavlin 286 return $self->{pos} = (($pos - 1) || -1);
364 dpavlin 285 }
365    
366    
367 dpavlin 3 =head1 MEMORY USAGE
368 dpavlin 1
369 dpavlin 3 C<low_mem> options is double-edged sword. If enabled, WebPAC
370     will run on memory constraint machines (which doesn't have enough
371     physical RAM to create memory structure for whole source database).
372 dpavlin 1
373 dpavlin 3 If your machine has 512Mb or more of RAM and database is around 10000 records,
374     memory shouldn't be an issue. If you don't have enough physical RAM, you
375     might consider using virtual memory (if your operating system is handling it
376     well, like on FreeBSD or Linux) instead of dropping to L<DBM::Deep> to handle
377     parsed structure of ISIS database (this is what C<low_mem> option does).
378 dpavlin 1
379 dpavlin 3 Hitting swap at end of reading source database is probably o.k. However,
380     hitting swap before 90% will dramatically decrease performance and you will
381     be better off with C<low_mem> and using rest of availble memory for
382     operating system disk cache (Linux is particuallary good about this).
383     However, every access to database record will require disk access, so
384     generation phase will be slower 10-100 times.
385    
386     Parsed structures are essential - you just have option to trade RAM memory
387     (which is fast) for disk space (which is slow). Be sure to have planty of
388     disk space if you are using C<low_mem> and thus L<DBM::Deep>.
389    
390     However, when WebPAC is running on desktop machines (or laptops :-), it's
391     highly undesireable for system to start swapping. Using C<low_mem> option can
392     reduce WecPAC memory usage to around 64Mb for same database with lookup
393     fields and sorted indexes which stay in RAM. Performance will suffer, but
394     memory usage will really be minimal. It might be also more confortable to
395     run WebPAC reniced on those machines.
396    
397    
398     =head1 AUTHOR
399    
400     Dobrica Pavlinusic, C<< <dpavlin@rot13.org> >>
401    
402 dpavlin 1 =head1 COPYRIGHT & LICENSE
403    
404     Copyright 2005 Dobrica Pavlinusic, All Rights Reserved.
405    
406     This program is free software; you can redistribute it and/or modify it
407     under the same terms as Perl itself.
408    
409     =cut
410    
411     1; # End of WebPAC::Input

  ViewVC Help
Powered by ViewVC 1.1.26