/[webpac2]/trunk/lib/WebPAC/Input.pm
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Annotation of /trunk/lib/WebPAC/Input.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 416 - (hide annotations)
Sun Feb 26 23:21:50 2006 UTC (18 years, 2 months ago) by dpavlin
File size: 9943 byte(s)
 r494@llin:  dpavlin | 2006-02-27 00:22:59 +0100
 implemented recode option to input (for now, just for MARC)

1 dpavlin 1 package WebPAC::Input;
2    
3     use warnings;
4     use strict;
5    
6 dpavlin 285 use WebPAC::Common;
7     use base qw/WebPAC::Common/;
8     use Text::Iconv;
9 dpavlin 308 use Data::Dumper;
10 dpavlin 285
11 dpavlin 1 =head1 NAME
12    
13 dpavlin 286 WebPAC::Input - read different file formats into WebPAC
14 dpavlin 1
15     =head1 VERSION
16    
17 dpavlin 416 Version 0.04
18 dpavlin 1
19     =cut
20    
21 dpavlin 416 our $VERSION = '0.04';
22 dpavlin 1
23     =head1 SYNOPSIS
24    
25 dpavlin 286 This module implements input as database which have fixed and known
26     I<size> while indexing and single unique numeric identifier for database
27     position ranging from 1 to I<size>.
28 dpavlin 1
29 dpavlin 286 Simply, something that is indexed by unmber from 1 .. I<size>.
30    
31     Examples of such databases are CDS/ISIS files, MARC files, lines in
32     text file, and so on.
33    
34     Specific file formats are implemented using low-level interface modules,
35     located in C<WebPAC::Input::*> namespace which export C<open_db>,
36     C<fetch_rec> and optional C<init> functions.
37    
38 dpavlin 1 Perhaps a little code snippet.
39    
40     use WebPAC::Input;
41    
42 dpavlin 3 my $db = WebPAC::Input->new(
43 dpavlin 286 module => 'WebPAC::Input::ISIS',
44     config => $config,
45     lookup => $lookup_obj,
46     low_mem => 1,
47 dpavlin 3 );
48 dpavlin 1
49 dpavlin 3 $db->open('/path/to/database');
50 dpavlin 416 print "database size: ",$db->size,"\n";
51     while (my $rec = $db->fetch) {
52     # do something with $rec
53     }
54 dpavlin 1
55 dpavlin 286
56    
57 dpavlin 1 =head1 FUNCTIONS
58    
59 dpavlin 3 =head2 new
60 dpavlin 1
61 dpavlin 3 Create new input database object.
62    
63 dpavlin 9 my $db = new WebPAC::Input(
64 dpavlin 286 module => 'WebPAC::Input::MARC',
65 dpavlin 9 code_page => 'ISO-8859-2',
66 dpavlin 10 low_mem => 1,
67 dpavlin 416 recode => 'char pairs',
68 dpavlin 9 );
69 dpavlin 3
70 dpavlin 286 C<module> is low-level file format module. See L<WebPAC::Input::Isis> and
71     L<WebPAC::Input::MARC>.
72    
73 dpavlin 9 Optional parametar C<code_page> specify application code page (which will be
74     used internally). This should probably be your terminal encoding, and by
75     default, it C<ISO-8859-2>.
76    
77 dpavlin 10 Default is not to use C<low_mem> options (see L<MEMORY USAGE> below).
78    
79 dpavlin 285 This function will also call low-level C<init> if it exists with same
80     parametars.
81    
82 dpavlin 1 =cut
83    
84 dpavlin 3 sub new {
85 dpavlin 285 my $class = shift;
86     my $self = {@_};
87 dpavlin 3 bless($self, $class);
88    
89 dpavlin 285 my $log = $self->_get_logger;
90    
91 dpavlin 286 $log->logconfess("specify low-level file format module") unless ($self->{module});
92     my $module = $self->{module};
93     $module =~ s#::#/#g;
94     $module .= '.pm';
95     $log->debug("require low-level module $self->{module} from $module");
96 dpavlin 289
97 dpavlin 286 require $module;
98 dpavlin 289 #eval $self->{module} .'->import';
99 dpavlin 286
100 dpavlin 285 # check if required subclasses are implemented
101 dpavlin 289 foreach my $subclass (qw/open_db fetch_rec init/) {
102     my $n = $self->{module} . '::' . $subclass;
103     if (! defined &{ $n }) {
104 dpavlin 290 my $missing = "missing $subclass in $self->{module}";
105 dpavlin 301 $self->{$subclass} = sub { $log->logwarn($missing) };
106 dpavlin 286 } else {
107 dpavlin 289 $self->{$subclass} = \&{ $n };
108 dpavlin 286 }
109 dpavlin 285 }
110    
111 dpavlin 289 if ($self->{init}) {
112 dpavlin 285 $log->debug("calling init");
113 dpavlin 289 $self->{init}->($self, @_);
114 dpavlin 285 }
115    
116 dpavlin 9 $self->{'code_page'} ||= 'ISO-8859-2';
117    
118 dpavlin 10 # running with low_mem flag? well, use DBM::Deep then.
119     if ($self->{'low_mem'}) {
120     $log->info("running with low_mem which impacts performance (<32 Mb memory usage)");
121    
122     my $db_file = "data.db";
123    
124     if (-e $db_file) {
125     unlink $db_file or $log->logdie("can't remove '$db_file' from last run");
126     $log->debug("removed '$db_file' from last run");
127     }
128    
129     require DBM::Deep;
130    
131     my $db = new DBM::Deep $db_file;
132    
133     $log->logdie("DBM::Deep error: $!") unless ($db);
134    
135     if ($db->error()) {
136     $log->logdie("can't open '$db_file' under low_mem: ",$db->error());
137     } else {
138     $log->debug("using file '$db_file' for DBM::Deep");
139     }
140    
141     $self->{'db'} = $db;
142     }
143    
144 dpavlin 3 $self ? return $self : return undef;
145 dpavlin 1 }
146    
147 dpavlin 285 =head2 open
148    
149     This function will read whole database in memory and produce lookups.
150    
151 dpavlin 286 $input->open(
152 dpavlin 285 path => '/path/to/database/file',
153     code_page => '852',
154 dpavlin 286 limit => 500,
155     offset => 6000,
156 dpavlin 285 lookup => $lookup_obj,
157     );
158    
159     By default, C<code_page> is assumed to be C<852>.
160    
161 dpavlin 286 C<offset> is optional parametar to position at some offset before reading from database.
162 dpavlin 285
163 dpavlin 286 C<limit> is optional parametar to read just C<limit> records from database
164 dpavlin 285
165 dpavlin 286 Returns size of database, regardless of C<offset> and C<limit>
166     parametars, see also C<size>.
167 dpavlin 285
168     =cut
169    
170     sub open {
171     my $self = shift;
172     my $arg = {@_};
173    
174     my $log = $self->_get_logger();
175    
176     $log->logcroak("need path") if (! $arg->{'path'});
177     my $code_page = $arg->{'code_page'} || '852';
178    
179     # store data in object
180 dpavlin 292 $self->{'input_code_page'} = $code_page;
181 dpavlin 286 foreach my $v (qw/path offset limit/) {
182 dpavlin 285 $self->{$v} = $arg->{$v} if ($arg->{$v});
183     }
184    
185     # create Text::Iconv object
186     $self->{iconv} = Text::Iconv->new($code_page,$self->{'code_page'});
187    
188 dpavlin 416 my $filter_ref;
189    
190     if ($self->{recode}) {
191     my @r = split(/\s/, $self->{recode});
192     if ($#r % 2 != 1) {
193     $log->logwarn("recode needs even number of elements (some number of valid pairs)");
194     } else {
195     my $recode;
196     while (@r) {
197     my $from = shift @r;
198     my $to = shift @r;
199     $recode->{$from} = $to;
200     }
201    
202     my $regex = join '|' => keys %{ $recode };
203    
204     $log->debug("using recode regex: $regex");
205    
206     $filter_ref = sub {
207     my $t = shift;
208     $t =~ s/($regex)/$recode->{$1}/g;
209     return $t;
210     };
211    
212     }
213    
214     }
215    
216 dpavlin 289 my ($db, $size) = $self->{open_db}->( $self,
217 dpavlin 285 path => $arg->{path},
218 dpavlin 416 filter => $filter_ref,
219 dpavlin 285 );
220    
221     unless ($db) {
222     $log->logwarn("can't open database $arg->{path}, skipping...");
223     return;
224     }
225    
226     unless ($size) {
227     $log->logwarn("no records in database $arg->{path}, skipping...");
228     return;
229     }
230    
231 dpavlin 339 my $from_rec = 1;
232     my $to_rec = $size;
233 dpavlin 285
234 dpavlin 286 if (my $s = $self->{offset}) {
235 dpavlin 285 $log->info("skipping to MFN $s");
236 dpavlin 339 $from_rec = $s;
237 dpavlin 285 } else {
238 dpavlin 339 $self->{offset} = $from_rec;
239 dpavlin 285 }
240    
241 dpavlin 286 if ($self->{limit}) {
242 dpavlin 301 $log->debug("limiting to ",$self->{limit}," records");
243 dpavlin 339 $to_rec = $from_rec + $self->{limit} - 1;
244     $to_rec = $size if ($to_rec > $size);
245 dpavlin 285 }
246    
247     # store size for later
248 dpavlin 339 $self->{size} = ($to_rec - $from_rec) ? ($to_rec - $from_rec + 1) : 0;
249 dpavlin 285
250 dpavlin 339 $log->info("processing $self->{size}/$size records [$from_rec-$to_rec] convert $code_page -> $self->{code_page}");
251 dpavlin 285
252     # read database
253 dpavlin 339 for (my $pos = $from_rec; $pos <= $to_rec; $pos++) {
254 dpavlin 285
255 dpavlin 286 $log->debug("position: $pos\n");
256 dpavlin 285
257 dpavlin 289 my $rec = $self->{fetch_rec}->($self, $db, $pos );
258 dpavlin 285
259 dpavlin 308 $log->debug(sub { Dumper($rec) });
260    
261 dpavlin 285 if (! $rec) {
262 dpavlin 286 $log->warn("record $pos empty? skipping...");
263 dpavlin 285 next;
264     }
265    
266     # store
267 dpavlin 286 if ($self->{low_mem}) {
268     $self->{db}->put($pos, $rec);
269 dpavlin 285 } else {
270 dpavlin 286 $self->{data}->{$pos} = $rec;
271 dpavlin 285 }
272    
273     # create lookup
274     $self->{'lookup'}->add( $rec ) if ($rec && $self->{'lookup'});
275    
276 dpavlin 339 $self->progress_bar($pos,$to_rec);
277 dpavlin 285
278     }
279    
280 dpavlin 286 $self->{pos} = -1;
281     $self->{last_pcnt} = 0;
282 dpavlin 285
283     # store max mfn and return it.
284 dpavlin 339 $self->{max_pos} = $to_rec;
285     $log->debug("max_pos: $to_rec");
286 dpavlin 285
287     return $size;
288     }
289    
290     =head2 fetch
291    
292     Fetch next record from database. It will also displays progress bar.
293    
294     my $rec = $isis->fetch;
295    
296     Record from this function should probably go to C<data_structure> for
297     normalisation.
298    
299     =cut
300    
301     sub fetch {
302     my $self = shift;
303    
304     my $log = $self->_get_logger();
305    
306 dpavlin 286 $log->logconfess("it seems that you didn't load database!") unless ($self->{pos});
307 dpavlin 285
308 dpavlin 286 if ($self->{pos} == -1) {
309     $self->{pos} = $self->{offset};
310 dpavlin 285 } else {
311 dpavlin 286 $self->{pos}++;
312 dpavlin 285 }
313    
314 dpavlin 286 my $mfn = $self->{pos};
315 dpavlin 285
316 dpavlin 286 if ($mfn > $self->{max_pos}) {
317     $self->{pos} = $self->{max_pos};
318 dpavlin 285 $log->debug("at EOF");
319     return;
320     }
321    
322 dpavlin 286 $self->progress_bar($mfn,$self->{max_pos});
323 dpavlin 285
324     my $rec;
325    
326 dpavlin 286 if ($self->{low_mem}) {
327     $rec = $self->{db}->get($mfn);
328 dpavlin 285 } else {
329 dpavlin 286 $rec = $self->{data}->{$mfn};
330 dpavlin 285 }
331    
332     $rec ||= 0E0;
333     }
334    
335     =head2 pos
336    
337     Returns current record number (MFN).
338    
339     print $isis->pos;
340    
341     First record in database has position 1.
342    
343     =cut
344    
345     sub pos {
346     my $self = shift;
347 dpavlin 286 return $self->{pos};
348 dpavlin 285 }
349    
350    
351     =head2 size
352    
353     Returns number of records in database
354    
355     print $isis->size;
356    
357     Result from this function can be used to loop through all records
358    
359     foreach my $mfn ( 1 ... $isis->size ) { ... }
360    
361 dpavlin 286 because it takes into account C<offset> and C<limit>.
362 dpavlin 285
363     =cut
364    
365     sub size {
366     my $self = shift;
367 dpavlin 286 return $self->{size};
368 dpavlin 285 }
369    
370     =head2 seek
371    
372     Seek to specified MFN in file.
373    
374     $isis->seek(42);
375    
376     First record in database has position 1.
377    
378     =cut
379    
380     sub seek {
381     my $self = shift;
382     my $pos = shift || return;
383    
384     my $log = $self->_get_logger();
385    
386     if ($pos < 1) {
387     $log->warn("seek before first record");
388     $pos = 1;
389 dpavlin 286 } elsif ($pos > $self->{max_pos}) {
390 dpavlin 285 $log->warn("seek beyond last record");
391 dpavlin 286 $pos = $self->{max_pos};
392 dpavlin 285 }
393    
394 dpavlin 286 return $self->{pos} = (($pos - 1) || -1);
395 dpavlin 285 }
396    
397    
398 dpavlin 3 =head1 MEMORY USAGE
399 dpavlin 1
400 dpavlin 3 C<low_mem> options is double-edged sword. If enabled, WebPAC
401     will run on memory constraint machines (which doesn't have enough
402     physical RAM to create memory structure for whole source database).
403 dpavlin 1
404 dpavlin 3 If your machine has 512Mb or more of RAM and database is around 10000 records,
405     memory shouldn't be an issue. If you don't have enough physical RAM, you
406     might consider using virtual memory (if your operating system is handling it
407     well, like on FreeBSD or Linux) instead of dropping to L<DBM::Deep> to handle
408     parsed structure of ISIS database (this is what C<low_mem> option does).
409 dpavlin 1
410 dpavlin 3 Hitting swap at end of reading source database is probably o.k. However,
411     hitting swap before 90% will dramatically decrease performance and you will
412     be better off with C<low_mem> and using rest of availble memory for
413     operating system disk cache (Linux is particuallary good about this).
414     However, every access to database record will require disk access, so
415     generation phase will be slower 10-100 times.
416    
417     Parsed structures are essential - you just have option to trade RAM memory
418     (which is fast) for disk space (which is slow). Be sure to have planty of
419     disk space if you are using C<low_mem> and thus L<DBM::Deep>.
420    
421     However, when WebPAC is running on desktop machines (or laptops :-), it's
422     highly undesireable for system to start swapping. Using C<low_mem> option can
423     reduce WecPAC memory usage to around 64Mb for same database with lookup
424     fields and sorted indexes which stay in RAM. Performance will suffer, but
425     memory usage will really be minimal. It might be also more confortable to
426     run WebPAC reniced on those machines.
427    
428    
429     =head1 AUTHOR
430    
431     Dobrica Pavlinusic, C<< <dpavlin@rot13.org> >>
432    
433 dpavlin 1 =head1 COPYRIGHT & LICENSE
434    
435     Copyright 2005 Dobrica Pavlinusic, All Rights Reserved.
436    
437     This program is free software; you can redistribute it and/or modify it
438     under the same terms as Perl itself.
439    
440     =cut
441    
442     1; # End of WebPAC::Input

  ViewVC Help
Powered by ViewVC 1.1.26