/[webpac2]/trunk/lib/WebPAC/Input.pm
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Annotation of /trunk/lib/WebPAC/Input.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 523 - (hide annotations)
Sun May 21 19:29:26 2006 UTC (17 years, 11 months ago) by dpavlin
File size: 11617 byte(s)
transfer all input variables to open_db in input module

1 dpavlin 1 package WebPAC::Input;
2    
3     use warnings;
4     use strict;
5    
6 dpavlin 507 use blib;
7    
8 dpavlin 487 use WebPAC::Common;
9 dpavlin 285 use base qw/WebPAC::Common/;
10     use Text::Iconv;
11 dpavlin 308 use Data::Dumper;
12 dpavlin 285
13 dpavlin 1 =head1 NAME
14    
15 dpavlin 286 WebPAC::Input - read different file formats into WebPAC
16 dpavlin 1
17     =head1 VERSION
18    
19 dpavlin 506 Version 0.05
20 dpavlin 1
21     =cut
22    
23 dpavlin 506 our $VERSION = '0.05';
24 dpavlin 1
25     =head1 SYNOPSIS
26    
27 dpavlin 286 This module implements input as database which have fixed and known
28     I<size> while indexing and single unique numeric identifier for database
29     position ranging from 1 to I<size>.
30 dpavlin 1
31 dpavlin 286 Simply, something that is indexed by unmber from 1 .. I<size>.
32    
33     Examples of such databases are CDS/ISIS files, MARC files, lines in
34     text file, and so on.
35    
36     Specific file formats are implemented using low-level interface modules,
37     located in C<WebPAC::Input::*> namespace which export C<open_db>,
38     C<fetch_rec> and optional C<init> functions.
39    
40 dpavlin 1 Perhaps a little code snippet.
41    
42     use WebPAC::Input;
43    
44 dpavlin 3 my $db = WebPAC::Input->new(
45 dpavlin 286 module => 'WebPAC::Input::ISIS',
46     config => $config,
47     lookup => $lookup_obj,
48     low_mem => 1,
49 dpavlin 3 );
50 dpavlin 1
51 dpavlin 3 $db->open('/path/to/database');
52 dpavlin 416 print "database size: ",$db->size,"\n";
53     while (my $rec = $db->fetch) {
54     # do something with $rec
55     }
56 dpavlin 1
57 dpavlin 286
58    
59 dpavlin 1 =head1 FUNCTIONS
60    
61 dpavlin 3 =head2 new
62 dpavlin 1
63 dpavlin 3 Create new input database object.
64    
65 dpavlin 9 my $db = new WebPAC::Input(
66 dpavlin 286 module => 'WebPAC::Input::MARC',
67 dpavlin 9 code_page => 'ISO-8859-2',
68 dpavlin 10 low_mem => 1,
69 dpavlin 416 recode => 'char pairs',
70 dpavlin 483 no_progress_bar => 1,
71 dpavlin 9 );
72 dpavlin 3
73 dpavlin 286 C<module> is low-level file format module. See L<WebPAC::Input::Isis> and
74     L<WebPAC::Input::MARC>.
75    
76 dpavlin 9 Optional parametar C<code_page> specify application code page (which will be
77     used internally). This should probably be your terminal encoding, and by
78     default, it C<ISO-8859-2>.
79    
80 dpavlin 10 Default is not to use C<low_mem> options (see L<MEMORY USAGE> below).
81    
82 dpavlin 483 C<recode> is optional string constisting of character or words pairs that
83     should be replaced in input stream.
84    
85     C<no_progress_bar> disables progress bar output on C<STDOUT>
86    
87 dpavlin 285 This function will also call low-level C<init> if it exists with same
88     parametars.
89    
90 dpavlin 1 =cut
91    
92 dpavlin 3 sub new {
93 dpavlin 285 my $class = shift;
94     my $self = {@_};
95 dpavlin 3 bless($self, $class);
96    
97 dpavlin 285 my $log = $self->_get_logger;
98    
99 dpavlin 286 $log->logconfess("specify low-level file format module") unless ($self->{module});
100     my $module = $self->{module};
101     $module =~ s#::#/#g;
102     $module .= '.pm';
103     $log->debug("require low-level module $self->{module} from $module");
104 dpavlin 289
105 dpavlin 286 require $module;
106 dpavlin 289 #eval $self->{module} .'->import';
107 dpavlin 286
108 dpavlin 285 # check if required subclasses are implemented
109 dpavlin 289 foreach my $subclass (qw/open_db fetch_rec init/) {
110     my $n = $self->{module} . '::' . $subclass;
111     if (! defined &{ $n }) {
112 dpavlin 290 my $missing = "missing $subclass in $self->{module}";
113 dpavlin 301 $self->{$subclass} = sub { $log->logwarn($missing) };
114 dpavlin 286 } else {
115 dpavlin 289 $self->{$subclass} = \&{ $n };
116 dpavlin 286 }
117 dpavlin 285 }
118    
119 dpavlin 289 if ($self->{init}) {
120 dpavlin 285 $log->debug("calling init");
121 dpavlin 289 $self->{init}->($self, @_);
122 dpavlin 285 }
123    
124 dpavlin 9 $self->{'code_page'} ||= 'ISO-8859-2';
125    
126 dpavlin 10 # running with low_mem flag? well, use DBM::Deep then.
127     if ($self->{'low_mem'}) {
128     $log->info("running with low_mem which impacts performance (<32 Mb memory usage)");
129    
130     my $db_file = "data.db";
131    
132     if (-e $db_file) {
133     unlink $db_file or $log->logdie("can't remove '$db_file' from last run");
134     $log->debug("removed '$db_file' from last run");
135     }
136    
137     require DBM::Deep;
138    
139     my $db = new DBM::Deep $db_file;
140    
141     $log->logdie("DBM::Deep error: $!") unless ($db);
142    
143     if ($db->error()) {
144     $log->logdie("can't open '$db_file' under low_mem: ",$db->error());
145     } else {
146     $log->debug("using file '$db_file' for DBM::Deep");
147     }
148    
149     $self->{'db'} = $db;
150     }
151    
152 dpavlin 3 $self ? return $self : return undef;
153 dpavlin 1 }
154    
155 dpavlin 285 =head2 open
156    
157     This function will read whole database in memory and produce lookups.
158    
159 dpavlin 286 $input->open(
160 dpavlin 285 path => '/path/to/database/file',
161     code_page => '852',
162 dpavlin 286 limit => 500,
163     offset => 6000,
164 dpavlin 285 lookup => $lookup_obj,
165 dpavlin 506 stats => 1,
166 dpavlin 285 );
167    
168     By default, C<code_page> is assumed to be C<852>.
169    
170 dpavlin 286 C<offset> is optional parametar to position at some offset before reading from database.
171 dpavlin 285
172 dpavlin 286 C<limit> is optional parametar to read just C<limit> records from database
173 dpavlin 285
174 dpavlin 506 C<stats> create optional report about usage of fields and subfields
175    
176 dpavlin 286 Returns size of database, regardless of C<offset> and C<limit>
177     parametars, see also C<size>.
178 dpavlin 285
179     =cut
180    
181     sub open {
182     my $self = shift;
183     my $arg = {@_};
184    
185     my $log = $self->_get_logger();
186    
187     $log->logcroak("need path") if (! $arg->{'path'});
188     my $code_page = $arg->{'code_page'} || '852';
189    
190     # store data in object
191 dpavlin 292 $self->{'input_code_page'} = $code_page;
192 dpavlin 286 foreach my $v (qw/path offset limit/) {
193 dpavlin 285 $self->{$v} = $arg->{$v} if ($arg->{$v});
194     }
195    
196     # create Text::Iconv object
197     $self->{iconv} = Text::Iconv->new($code_page,$self->{'code_page'});
198    
199 dpavlin 416 my $filter_ref;
200    
201     if ($self->{recode}) {
202     my @r = split(/\s/, $self->{recode});
203     if ($#r % 2 != 1) {
204     $log->logwarn("recode needs even number of elements (some number of valid pairs)");
205     } else {
206     my $recode;
207     while (@r) {
208     my $from = shift @r;
209     my $to = shift @r;
210     $recode->{$from} = $to;
211     }
212    
213     my $regex = join '|' => keys %{ $recode };
214    
215     $log->debug("using recode regex: $regex");
216    
217     $filter_ref = sub {
218     my $t = shift;
219     $t =~ s/($regex)/$recode->{$1}/g;
220     return $t;
221     };
222    
223     }
224    
225     }
226    
227 dpavlin 289 my ($db, $size) = $self->{open_db}->( $self,
228 dpavlin 285 path => $arg->{path},
229 dpavlin 416 filter => $filter_ref,
230 dpavlin 523 %{ $arg },
231 dpavlin 285 );
232    
233 dpavlin 496 unless (defined($db)) {
234 dpavlin 285 $log->logwarn("can't open database $arg->{path}, skipping...");
235     return;
236     }
237    
238     unless ($size) {
239     $log->logwarn("no records in database $arg->{path}, skipping...");
240     return;
241     }
242    
243 dpavlin 339 my $from_rec = 1;
244     my $to_rec = $size;
245 dpavlin 285
246 dpavlin 286 if (my $s = $self->{offset}) {
247 dpavlin 513 $log->debug("skipping to MFN $s");
248 dpavlin 339 $from_rec = $s;
249 dpavlin 285 } else {
250 dpavlin 339 $self->{offset} = $from_rec;
251 dpavlin 285 }
252    
253 dpavlin 286 if ($self->{limit}) {
254 dpavlin 301 $log->debug("limiting to ",$self->{limit}," records");
255 dpavlin 339 $to_rec = $from_rec + $self->{limit} - 1;
256     $to_rec = $size if ($to_rec > $size);
257 dpavlin 285 }
258    
259     # store size for later
260 dpavlin 339 $self->{size} = ($to_rec - $from_rec) ? ($to_rec - $from_rec + 1) : 0;
261 dpavlin 285
262 dpavlin 506 $log->info("processing $self->{size}/$size records [$from_rec-$to_rec] convert $code_page -> $self->{code_page}", $self->{stats} ? ' [stats]' : '');
263 dpavlin 285
264     # read database
265 dpavlin 339 for (my $pos = $from_rec; $pos <= $to_rec; $pos++) {
266 dpavlin 285
267 dpavlin 286 $log->debug("position: $pos\n");
268 dpavlin 285
269 dpavlin 289 my $rec = $self->{fetch_rec}->($self, $db, $pos );
270 dpavlin 285
271 dpavlin 308 $log->debug(sub { Dumper($rec) });
272    
273 dpavlin 285 if (! $rec) {
274 dpavlin 286 $log->warn("record $pos empty? skipping...");
275 dpavlin 285 next;
276     }
277    
278     # store
279 dpavlin 286 if ($self->{low_mem}) {
280     $self->{db}->put($pos, $rec);
281 dpavlin 285 } else {
282 dpavlin 286 $self->{data}->{$pos} = $rec;
283 dpavlin 285 }
284    
285     # create lookup
286     $self->{'lookup'}->add( $rec ) if ($rec && $self->{'lookup'});
287    
288 dpavlin 506 # update counters for statistics
289     if ($self->{stats}) {
290     map {
291     my $fld = $_;
292     $self->{_stats}->{fld}->{ $fld }++;
293     if (ref($rec->{ $fld }) eq 'ARRAY') {
294     map {
295     if (ref($_) eq 'HASH') {
296     map {
297     $self->{_stats}->{sf}->{ $fld }->{ $_ }++;
298     } keys %{ $_ };
299     } else {
300     $self->{_stats}->{repeatable}->{ $fld }++;
301     }
302     } @{ $rec->{$fld} };
303     }
304     } keys %{ $rec };
305     }
306    
307 dpavlin 483 $self->progress_bar($pos,$to_rec) unless ($self->{no_progress_bar});
308 dpavlin 285
309     }
310    
311 dpavlin 286 $self->{pos} = -1;
312     $self->{last_pcnt} = 0;
313 dpavlin 285
314     # store max mfn and return it.
315 dpavlin 339 $self->{max_pos} = $to_rec;
316     $log->debug("max_pos: $to_rec");
317 dpavlin 285
318     return $size;
319     }
320    
321     =head2 fetch
322    
323     Fetch next record from database. It will also displays progress bar.
324    
325     my $rec = $isis->fetch;
326    
327     Record from this function should probably go to C<data_structure> for
328     normalisation.
329    
330     =cut
331    
332     sub fetch {
333     my $self = shift;
334    
335     my $log = $self->_get_logger();
336    
337 dpavlin 286 $log->logconfess("it seems that you didn't load database!") unless ($self->{pos});
338 dpavlin 285
339 dpavlin 286 if ($self->{pos} == -1) {
340     $self->{pos} = $self->{offset};
341 dpavlin 285 } else {
342 dpavlin 286 $self->{pos}++;
343 dpavlin 285 }
344    
345 dpavlin 286 my $mfn = $self->{pos};
346 dpavlin 285
347 dpavlin 286 if ($mfn > $self->{max_pos}) {
348     $self->{pos} = $self->{max_pos};
349 dpavlin 285 $log->debug("at EOF");
350     return;
351     }
352    
353 dpavlin 483 $self->progress_bar($mfn,$self->{max_pos}) unless ($self->{no_progress_bar});
354 dpavlin 285
355     my $rec;
356    
357 dpavlin 286 if ($self->{low_mem}) {
358     $rec = $self->{db}->get($mfn);
359 dpavlin 285 } else {
360 dpavlin 286 $rec = $self->{data}->{$mfn};
361 dpavlin 285 }
362    
363     $rec ||= 0E0;
364     }
365    
366     =head2 pos
367    
368     Returns current record number (MFN).
369    
370     print $isis->pos;
371    
372     First record in database has position 1.
373    
374     =cut
375    
376     sub pos {
377     my $self = shift;
378 dpavlin 286 return $self->{pos};
379 dpavlin 285 }
380    
381    
382     =head2 size
383    
384     Returns number of records in database
385    
386     print $isis->size;
387    
388     Result from this function can be used to loop through all records
389    
390     foreach my $mfn ( 1 ... $isis->size ) { ... }
391    
392 dpavlin 286 because it takes into account C<offset> and C<limit>.
393 dpavlin 285
394     =cut
395    
396     sub size {
397     my $self = shift;
398 dpavlin 286 return $self->{size};
399 dpavlin 285 }
400    
401     =head2 seek
402    
403     Seek to specified MFN in file.
404    
405     $isis->seek(42);
406    
407     First record in database has position 1.
408    
409     =cut
410    
411     sub seek {
412     my $self = shift;
413     my $pos = shift || return;
414    
415     my $log = $self->_get_logger();
416    
417     if ($pos < 1) {
418     $log->warn("seek before first record");
419     $pos = 1;
420 dpavlin 286 } elsif ($pos > $self->{max_pos}) {
421 dpavlin 285 $log->warn("seek beyond last record");
422 dpavlin 286 $pos = $self->{max_pos};
423 dpavlin 285 }
424    
425 dpavlin 286 return $self->{pos} = (($pos - 1) || -1);
426 dpavlin 285 }
427    
428 dpavlin 506 =head2 stats
429 dpavlin 285
430 dpavlin 506 Dump statistics about field and subfield usage
431    
432 dpavlin 507 print $input->stats;
433 dpavlin 506
434     =cut
435    
436     sub stats {
437     my $self = shift;
438 dpavlin 507
439     my $log = $self->_get_logger();
440    
441     my $s = $self->{_stats};
442     if (! $s) {
443     $log->warn("called stats, but there is no statistics collected");
444     return;
445     }
446    
447     my $max_fld = 0;
448    
449     my $out = join("\n",
450     map {
451     my $f = $_ || die "no field";
452     my $v = $s->{fld}->{$f} || die "no s->{fld}->{$f}";
453     $max_fld = $v if ($v > $max_fld);
454    
455 dpavlin 519 my $o = sprintf("%4s %d ~", $f, $v);
456 dpavlin 507
457     if (defined($s->{sf}->{$f})) {
458     map {
459     $o .= sprintf(" %s:%d", $_, $s->{sf}->{$f}->{$_});
460     } sort keys %{ $s->{sf}->{$f} };
461     }
462    
463     if (my $v_r = $s->{repeatable}->{$f}) {
464     $o .= " ($v_r)" if ($v_r != $v);
465     }
466    
467     $o;
468 dpavlin 519 } sort { $a cmp $b } keys %{ $s->{fld} }
469 dpavlin 507 );
470    
471     $log->debug( sub { Dumper($s) } );
472    
473     return $out;
474 dpavlin 506 }
475    
476 dpavlin 3 =head1 MEMORY USAGE
477 dpavlin 1
478 dpavlin 3 C<low_mem> options is double-edged sword. If enabled, WebPAC
479     will run on memory constraint machines (which doesn't have enough
480     physical RAM to create memory structure for whole source database).
481 dpavlin 1
482 dpavlin 3 If your machine has 512Mb or more of RAM and database is around 10000 records,
483     memory shouldn't be an issue. If you don't have enough physical RAM, you
484     might consider using virtual memory (if your operating system is handling it
485     well, like on FreeBSD or Linux) instead of dropping to L<DBM::Deep> to handle
486     parsed structure of ISIS database (this is what C<low_mem> option does).
487 dpavlin 1
488 dpavlin 3 Hitting swap at end of reading source database is probably o.k. However,
489     hitting swap before 90% will dramatically decrease performance and you will
490     be better off with C<low_mem> and using rest of availble memory for
491     operating system disk cache (Linux is particuallary good about this).
492     However, every access to database record will require disk access, so
493     generation phase will be slower 10-100 times.
494    
495     Parsed structures are essential - you just have option to trade RAM memory
496     (which is fast) for disk space (which is slow). Be sure to have planty of
497     disk space if you are using C<low_mem> and thus L<DBM::Deep>.
498    
499     However, when WebPAC is running on desktop machines (or laptops :-), it's
500     highly undesireable for system to start swapping. Using C<low_mem> option can
501     reduce WecPAC memory usage to around 64Mb for same database with lookup
502     fields and sorted indexes which stay in RAM. Performance will suffer, but
503     memory usage will really be minimal. It might be also more confortable to
504     run WebPAC reniced on those machines.
505    
506    
507     =head1 AUTHOR
508    
509     Dobrica Pavlinusic, C<< <dpavlin@rot13.org> >>
510    
511 dpavlin 1 =head1 COPYRIGHT & LICENSE
512    
513     Copyright 2005 Dobrica Pavlinusic, All Rights Reserved.
514    
515     This program is free software; you can redistribute it and/or modify it
516     under the same terms as Perl itself.
517    
518     =cut
519    
520     1; # End of WebPAC::Input

  ViewVC Help
Powered by ViewVC 1.1.26