/[webpac2]/trunk/lib/WebPAC/Input.pm
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Annotation of /trunk/lib/WebPAC/Input.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 506 - (hide annotations)
Mon May 15 09:59:05 2006 UTC (17 years, 11 months ago) by dpavlin
File size: 10920 byte(s)
 r663@llin:  dpavlin | 2006-05-15 12:02:43 +0200
 added stats gathering

1 dpavlin 1 package WebPAC::Input;
2    
3     use warnings;
4     use strict;
5    
6 dpavlin 487 use WebPAC::Common;
7 dpavlin 285 use base qw/WebPAC::Common/;
8     use Text::Iconv;
9 dpavlin 308 use Data::Dumper;
10 dpavlin 285
11 dpavlin 1 =head1 NAME
12    
13 dpavlin 286 WebPAC::Input - read different file formats into WebPAC
14 dpavlin 1
15     =head1 VERSION
16    
17 dpavlin 506 Version 0.05
18 dpavlin 1
19     =cut
20    
21 dpavlin 506 our $VERSION = '0.05';
22 dpavlin 1
23     =head1 SYNOPSIS
24    
25 dpavlin 286 This module implements input as database which have fixed and known
26     I<size> while indexing and single unique numeric identifier for database
27     position ranging from 1 to I<size>.
28 dpavlin 1
29 dpavlin 286 Simply, something that is indexed by unmber from 1 .. I<size>.
30    
31     Examples of such databases are CDS/ISIS files, MARC files, lines in
32     text file, and so on.
33    
34     Specific file formats are implemented using low-level interface modules,
35     located in C<WebPAC::Input::*> namespace which export C<open_db>,
36     C<fetch_rec> and optional C<init> functions.
37    
38 dpavlin 1 Perhaps a little code snippet.
39    
40     use WebPAC::Input;
41    
42 dpavlin 3 my $db = WebPAC::Input->new(
43 dpavlin 286 module => 'WebPAC::Input::ISIS',
44     config => $config,
45     lookup => $lookup_obj,
46     low_mem => 1,
47 dpavlin 3 );
48 dpavlin 1
49 dpavlin 3 $db->open('/path/to/database');
50 dpavlin 416 print "database size: ",$db->size,"\n";
51     while (my $rec = $db->fetch) {
52     # do something with $rec
53     }
54 dpavlin 1
55 dpavlin 286
56    
57 dpavlin 1 =head1 FUNCTIONS
58    
59 dpavlin 3 =head2 new
60 dpavlin 1
61 dpavlin 3 Create new input database object.
62    
63 dpavlin 9 my $db = new WebPAC::Input(
64 dpavlin 286 module => 'WebPAC::Input::MARC',
65 dpavlin 9 code_page => 'ISO-8859-2',
66 dpavlin 10 low_mem => 1,
67 dpavlin 416 recode => 'char pairs',
68 dpavlin 483 no_progress_bar => 1,
69 dpavlin 9 );
70 dpavlin 3
71 dpavlin 286 C<module> is low-level file format module. See L<WebPAC::Input::Isis> and
72     L<WebPAC::Input::MARC>.
73    
74 dpavlin 9 Optional parametar C<code_page> specify application code page (which will be
75     used internally). This should probably be your terminal encoding, and by
76     default, it C<ISO-8859-2>.
77    
78 dpavlin 10 Default is not to use C<low_mem> options (see L<MEMORY USAGE> below).
79    
80 dpavlin 483 C<recode> is optional string constisting of character or words pairs that
81     should be replaced in input stream.
82    
83     C<no_progress_bar> disables progress bar output on C<STDOUT>
84    
85 dpavlin 285 This function will also call low-level C<init> if it exists with same
86     parametars.
87    
88 dpavlin 1 =cut
89    
90 dpavlin 3 sub new {
91 dpavlin 285 my $class = shift;
92     my $self = {@_};
93 dpavlin 3 bless($self, $class);
94    
95 dpavlin 285 my $log = $self->_get_logger;
96    
97 dpavlin 286 $log->logconfess("specify low-level file format module") unless ($self->{module});
98     my $module = $self->{module};
99     $module =~ s#::#/#g;
100     $module .= '.pm';
101     $log->debug("require low-level module $self->{module} from $module");
102 dpavlin 289
103 dpavlin 286 require $module;
104 dpavlin 289 #eval $self->{module} .'->import';
105 dpavlin 286
106 dpavlin 285 # check if required subclasses are implemented
107 dpavlin 289 foreach my $subclass (qw/open_db fetch_rec init/) {
108     my $n = $self->{module} . '::' . $subclass;
109     if (! defined &{ $n }) {
110 dpavlin 290 my $missing = "missing $subclass in $self->{module}";
111 dpavlin 301 $self->{$subclass} = sub { $log->logwarn($missing) };
112 dpavlin 286 } else {
113 dpavlin 289 $self->{$subclass} = \&{ $n };
114 dpavlin 286 }
115 dpavlin 285 }
116    
117 dpavlin 289 if ($self->{init}) {
118 dpavlin 285 $log->debug("calling init");
119 dpavlin 289 $self->{init}->($self, @_);
120 dpavlin 285 }
121    
122 dpavlin 9 $self->{'code_page'} ||= 'ISO-8859-2';
123    
124 dpavlin 10 # running with low_mem flag? well, use DBM::Deep then.
125     if ($self->{'low_mem'}) {
126     $log->info("running with low_mem which impacts performance (<32 Mb memory usage)");
127    
128     my $db_file = "data.db";
129    
130     if (-e $db_file) {
131     unlink $db_file or $log->logdie("can't remove '$db_file' from last run");
132     $log->debug("removed '$db_file' from last run");
133     }
134    
135     require DBM::Deep;
136    
137     my $db = new DBM::Deep $db_file;
138    
139     $log->logdie("DBM::Deep error: $!") unless ($db);
140    
141     if ($db->error()) {
142     $log->logdie("can't open '$db_file' under low_mem: ",$db->error());
143     } else {
144     $log->debug("using file '$db_file' for DBM::Deep");
145     }
146    
147     $self->{'db'} = $db;
148     }
149    
150 dpavlin 3 $self ? return $self : return undef;
151 dpavlin 1 }
152    
153 dpavlin 285 =head2 open
154    
155     This function will read whole database in memory and produce lookups.
156    
157 dpavlin 286 $input->open(
158 dpavlin 285 path => '/path/to/database/file',
159     code_page => '852',
160 dpavlin 286 limit => 500,
161     offset => 6000,
162 dpavlin 285 lookup => $lookup_obj,
163 dpavlin 506 stats => 1,
164 dpavlin 285 );
165    
166     By default, C<code_page> is assumed to be C<852>.
167    
168 dpavlin 286 C<offset> is optional parametar to position at some offset before reading from database.
169 dpavlin 285
170 dpavlin 286 C<limit> is optional parametar to read just C<limit> records from database
171 dpavlin 285
172 dpavlin 506 C<stats> create optional report about usage of fields and subfields
173    
174 dpavlin 286 Returns size of database, regardless of C<offset> and C<limit>
175     parametars, see also C<size>.
176 dpavlin 285
177     =cut
178    
179     sub open {
180     my $self = shift;
181     my $arg = {@_};
182    
183     my $log = $self->_get_logger();
184    
185     $log->logcroak("need path") if (! $arg->{'path'});
186     my $code_page = $arg->{'code_page'} || '852';
187    
188     # store data in object
189 dpavlin 292 $self->{'input_code_page'} = $code_page;
190 dpavlin 286 foreach my $v (qw/path offset limit/) {
191 dpavlin 285 $self->{$v} = $arg->{$v} if ($arg->{$v});
192     }
193    
194     # create Text::Iconv object
195     $self->{iconv} = Text::Iconv->new($code_page,$self->{'code_page'});
196    
197 dpavlin 416 my $filter_ref;
198    
199     if ($self->{recode}) {
200     my @r = split(/\s/, $self->{recode});
201     if ($#r % 2 != 1) {
202     $log->logwarn("recode needs even number of elements (some number of valid pairs)");
203     } else {
204     my $recode;
205     while (@r) {
206     my $from = shift @r;
207     my $to = shift @r;
208     $recode->{$from} = $to;
209     }
210    
211     my $regex = join '|' => keys %{ $recode };
212    
213     $log->debug("using recode regex: $regex");
214    
215     $filter_ref = sub {
216     my $t = shift;
217     $t =~ s/($regex)/$recode->{$1}/g;
218     return $t;
219     };
220    
221     }
222    
223     }
224    
225 dpavlin 289 my ($db, $size) = $self->{open_db}->( $self,
226 dpavlin 285 path => $arg->{path},
227 dpavlin 416 filter => $filter_ref,
228 dpavlin 285 );
229    
230 dpavlin 496 unless (defined($db)) {
231 dpavlin 285 $log->logwarn("can't open database $arg->{path}, skipping...");
232     return;
233     }
234    
235     unless ($size) {
236     $log->logwarn("no records in database $arg->{path}, skipping...");
237     return;
238     }
239    
240 dpavlin 339 my $from_rec = 1;
241     my $to_rec = $size;
242 dpavlin 285
243 dpavlin 286 if (my $s = $self->{offset}) {
244 dpavlin 285 $log->info("skipping to MFN $s");
245 dpavlin 339 $from_rec = $s;
246 dpavlin 285 } else {
247 dpavlin 339 $self->{offset} = $from_rec;
248 dpavlin 285 }
249    
250 dpavlin 286 if ($self->{limit}) {
251 dpavlin 301 $log->debug("limiting to ",$self->{limit}," records");
252 dpavlin 339 $to_rec = $from_rec + $self->{limit} - 1;
253     $to_rec = $size if ($to_rec > $size);
254 dpavlin 285 }
255    
256     # store size for later
257 dpavlin 339 $self->{size} = ($to_rec - $from_rec) ? ($to_rec - $from_rec + 1) : 0;
258 dpavlin 285
259 dpavlin 506 $log->info("processing $self->{size}/$size records [$from_rec-$to_rec] convert $code_page -> $self->{code_page}", $self->{stats} ? ' [stats]' : '');
260 dpavlin 285
261     # read database
262 dpavlin 339 for (my $pos = $from_rec; $pos <= $to_rec; $pos++) {
263 dpavlin 285
264 dpavlin 286 $log->debug("position: $pos\n");
265 dpavlin 285
266 dpavlin 289 my $rec = $self->{fetch_rec}->($self, $db, $pos );
267 dpavlin 285
268 dpavlin 308 $log->debug(sub { Dumper($rec) });
269    
270 dpavlin 285 if (! $rec) {
271 dpavlin 286 $log->warn("record $pos empty? skipping...");
272 dpavlin 285 next;
273     }
274    
275     # store
276 dpavlin 286 if ($self->{low_mem}) {
277     $self->{db}->put($pos, $rec);
278 dpavlin 285 } else {
279 dpavlin 286 $self->{data}->{$pos} = $rec;
280 dpavlin 285 }
281    
282     # create lookup
283     $self->{'lookup'}->add( $rec ) if ($rec && $self->{'lookup'});
284    
285 dpavlin 506 # update counters for statistics
286     if ($self->{stats}) {
287     map {
288     my $fld = $_;
289     $self->{_stats}->{fld}->{ $fld }++;
290     if (ref($rec->{ $fld }) eq 'ARRAY') {
291     map {
292     if (ref($_) eq 'HASH') {
293     map {
294     $self->{_stats}->{sf}->{ $fld }->{ $_ }++;
295     } keys %{ $_ };
296     } else {
297     $self->{_stats}->{repeatable}->{ $fld }++;
298     }
299     } @{ $rec->{$fld} };
300     }
301     } keys %{ $rec };
302     }
303    
304 dpavlin 483 $self->progress_bar($pos,$to_rec) unless ($self->{no_progress_bar});
305 dpavlin 285
306     }
307    
308 dpavlin 286 $self->{pos} = -1;
309     $self->{last_pcnt} = 0;
310 dpavlin 285
311     # store max mfn and return it.
312 dpavlin 339 $self->{max_pos} = $to_rec;
313     $log->debug("max_pos: $to_rec");
314 dpavlin 285
315     return $size;
316     }
317    
318     =head2 fetch
319    
320     Fetch next record from database. It will also displays progress bar.
321    
322     my $rec = $isis->fetch;
323    
324     Record from this function should probably go to C<data_structure> for
325     normalisation.
326    
327     =cut
328    
329     sub fetch {
330     my $self = shift;
331    
332     my $log = $self->_get_logger();
333    
334 dpavlin 286 $log->logconfess("it seems that you didn't load database!") unless ($self->{pos});
335 dpavlin 285
336 dpavlin 286 if ($self->{pos} == -1) {
337     $self->{pos} = $self->{offset};
338 dpavlin 285 } else {
339 dpavlin 286 $self->{pos}++;
340 dpavlin 285 }
341    
342 dpavlin 286 my $mfn = $self->{pos};
343 dpavlin 285
344 dpavlin 286 if ($mfn > $self->{max_pos}) {
345     $self->{pos} = $self->{max_pos};
346 dpavlin 285 $log->debug("at EOF");
347     return;
348     }
349    
350 dpavlin 483 $self->progress_bar($mfn,$self->{max_pos}) unless ($self->{no_progress_bar});
351 dpavlin 285
352     my $rec;
353    
354 dpavlin 286 if ($self->{low_mem}) {
355     $rec = $self->{db}->get($mfn);
356 dpavlin 285 } else {
357 dpavlin 286 $rec = $self->{data}->{$mfn};
358 dpavlin 285 }
359    
360     $rec ||= 0E0;
361     }
362    
363     =head2 pos
364    
365     Returns current record number (MFN).
366    
367     print $isis->pos;
368    
369     First record in database has position 1.
370    
371     =cut
372    
373     sub pos {
374     my $self = shift;
375 dpavlin 286 return $self->{pos};
376 dpavlin 285 }
377    
378    
379     =head2 size
380    
381     Returns number of records in database
382    
383     print $isis->size;
384    
385     Result from this function can be used to loop through all records
386    
387     foreach my $mfn ( 1 ... $isis->size ) { ... }
388    
389 dpavlin 286 because it takes into account C<offset> and C<limit>.
390 dpavlin 285
391     =cut
392    
393     sub size {
394     my $self = shift;
395 dpavlin 286 return $self->{size};
396 dpavlin 285 }
397    
398     =head2 seek
399    
400     Seek to specified MFN in file.
401    
402     $isis->seek(42);
403    
404     First record in database has position 1.
405    
406     =cut
407    
408     sub seek {
409     my $self = shift;
410     my $pos = shift || return;
411    
412     my $log = $self->_get_logger();
413    
414     if ($pos < 1) {
415     $log->warn("seek before first record");
416     $pos = 1;
417 dpavlin 286 } elsif ($pos > $self->{max_pos}) {
418 dpavlin 285 $log->warn("seek beyond last record");
419 dpavlin 286 $pos = $self->{max_pos};
420 dpavlin 285 }
421    
422 dpavlin 286 return $self->{pos} = (($pos - 1) || -1);
423 dpavlin 285 }
424    
425 dpavlin 506 =head2 stats
426 dpavlin 285
427 dpavlin 506 Dump statistics about field and subfield usage
428    
429     print Dumper( $input->stats );
430    
431     =cut
432    
433     sub stats {
434     my $self = shift;
435     return $self->{_stats};
436     }
437    
438 dpavlin 3 =head1 MEMORY USAGE
439 dpavlin 1
440 dpavlin 3 C<low_mem> options is double-edged sword. If enabled, WebPAC
441     will run on memory constraint machines (which doesn't have enough
442     physical RAM to create memory structure for whole source database).
443 dpavlin 1
444 dpavlin 3 If your machine has 512Mb or more of RAM and database is around 10000 records,
445     memory shouldn't be an issue. If you don't have enough physical RAM, you
446     might consider using virtual memory (if your operating system is handling it
447     well, like on FreeBSD or Linux) instead of dropping to L<DBM::Deep> to handle
448     parsed structure of ISIS database (this is what C<low_mem> option does).
449 dpavlin 1
450 dpavlin 3 Hitting swap at end of reading source database is probably o.k. However,
451     hitting swap before 90% will dramatically decrease performance and you will
452     be better off with C<low_mem> and using rest of availble memory for
453     operating system disk cache (Linux is particuallary good about this).
454     However, every access to database record will require disk access, so
455     generation phase will be slower 10-100 times.
456    
457     Parsed structures are essential - you just have option to trade RAM memory
458     (which is fast) for disk space (which is slow). Be sure to have planty of
459     disk space if you are using C<low_mem> and thus L<DBM::Deep>.
460    
461     However, when WebPAC is running on desktop machines (or laptops :-), it's
462     highly undesireable for system to start swapping. Using C<low_mem> option can
463     reduce WecPAC memory usage to around 64Mb for same database with lookup
464     fields and sorted indexes which stay in RAM. Performance will suffer, but
465     memory usage will really be minimal. It might be also more confortable to
466     run WebPAC reniced on those machines.
467    
468    
469     =head1 AUTHOR
470    
471     Dobrica Pavlinusic, C<< <dpavlin@rot13.org> >>
472    
473 dpavlin 1 =head1 COPYRIGHT & LICENSE
474    
475     Copyright 2005 Dobrica Pavlinusic, All Rights Reserved.
476    
477     This program is free software; you can redistribute it and/or modify it
478     under the same terms as Perl itself.
479    
480     =cut
481    
482     1; # End of WebPAC::Input

  ViewVC Help
Powered by ViewVC 1.1.26