/[webpac2]/trunk/lib/WebPAC/Input.pm
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Annotation of /trunk/lib/WebPAC/Input.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 585 - (hide annotations)
Wed Jul 5 19:52:45 2006 UTC (17 years, 9 months ago) by dpavlin
File size: 12267 byte(s)
 r810@llin:  dpavlin | 2006-07-05 21:53:01 +0200
 change of parametars to WebPAC::Input

1 dpavlin 1 package WebPAC::Input;
2    
3     use warnings;
4     use strict;
5    
6 dpavlin 507 use blib;
7    
8 dpavlin 487 use WebPAC::Common;
9 dpavlin 285 use base qw/WebPAC::Common/;
10     use Text::Iconv;
11 dpavlin 308 use Data::Dumper;
12 dpavlin 285
13 dpavlin 1 =head1 NAME
14    
15 dpavlin 286 WebPAC::Input - read different file formats into WebPAC
16 dpavlin 1
17     =head1 VERSION
18    
19 dpavlin 585 Version 0.07
20 dpavlin 1
21     =cut
22    
23 dpavlin 585 our $VERSION = '0.07';
24 dpavlin 1
25     =head1 SYNOPSIS
26    
27 dpavlin 286 This module implements input as database which have fixed and known
28     I<size> while indexing and single unique numeric identifier for database
29     position ranging from 1 to I<size>.
30 dpavlin 1
31 dpavlin 286 Simply, something that is indexed by unmber from 1 .. I<size>.
32    
33     Examples of such databases are CDS/ISIS files, MARC files, lines in
34     text file, and so on.
35    
36     Specific file formats are implemented using low-level interface modules,
37     located in C<WebPAC::Input::*> namespace which export C<open_db>,
38     C<fetch_rec> and optional C<init> functions.
39    
40 dpavlin 1 Perhaps a little code snippet.
41    
42     use WebPAC::Input;
43    
44 dpavlin 3 my $db = WebPAC::Input->new(
45 dpavlin 286 module => 'WebPAC::Input::ISIS',
46     config => $config,
47     low_mem => 1,
48 dpavlin 3 );
49 dpavlin 1
50 dpavlin 3 $db->open('/path/to/database');
51 dpavlin 416 print "database size: ",$db->size,"\n";
52     while (my $rec = $db->fetch) {
53     # do something with $rec
54     }
55 dpavlin 1
56 dpavlin 286
57    
58 dpavlin 1 =head1 FUNCTIONS
59    
60 dpavlin 3 =head2 new
61 dpavlin 1
62 dpavlin 3 Create new input database object.
63    
64 dpavlin 9 my $db = new WebPAC::Input(
65 dpavlin 286 module => 'WebPAC::Input::MARC',
66 dpavlin 585 encoding => 'ISO-8859-2',
67 dpavlin 10 low_mem => 1,
68 dpavlin 416 recode => 'char pairs',
69 dpavlin 483 no_progress_bar => 1,
70 dpavlin 9 );
71 dpavlin 3
72 dpavlin 286 C<module> is low-level file format module. See L<WebPAC::Input::Isis> and
73     L<WebPAC::Input::MARC>.
74    
75 dpavlin 585 Optional parametar C<encoding> specify application code page (which will be
76 dpavlin 9 used internally). This should probably be your terminal encoding, and by
77     default, it C<ISO-8859-2>.
78    
79 dpavlin 10 Default is not to use C<low_mem> options (see L<MEMORY USAGE> below).
80    
81 dpavlin 483 C<recode> is optional string constisting of character or words pairs that
82     should be replaced in input stream.
83    
84     C<no_progress_bar> disables progress bar output on C<STDOUT>
85    
86 dpavlin 285 This function will also call low-level C<init> if it exists with same
87     parametars.
88    
89 dpavlin 1 =cut
90    
91 dpavlin 3 sub new {
92 dpavlin 285 my $class = shift;
93     my $self = {@_};
94 dpavlin 3 bless($self, $class);
95    
96 dpavlin 285 my $log = $self->_get_logger;
97    
98 dpavlin 585 $log->logconfess("code_page argument is not suppored any more. change it to encoding") if ($self->{lookup});
99     $log->logconfess("lookup argument is not suppored any more. rewrite call to lookup_ref") if ($self->{lookup});
100    
101 dpavlin 286 $log->logconfess("specify low-level file format module") unless ($self->{module});
102     my $module = $self->{module};
103     $module =~ s#::#/#g;
104     $module .= '.pm';
105     $log->debug("require low-level module $self->{module} from $module");
106 dpavlin 289
107 dpavlin 286 require $module;
108 dpavlin 289 #eval $self->{module} .'->import';
109 dpavlin 286
110 dpavlin 285 # check if required subclasses are implemented
111 dpavlin 289 foreach my $subclass (qw/open_db fetch_rec init/) {
112     my $n = $self->{module} . '::' . $subclass;
113     if (! defined &{ $n }) {
114 dpavlin 290 my $missing = "missing $subclass in $self->{module}";
115 dpavlin 301 $self->{$subclass} = sub { $log->logwarn($missing) };
116 dpavlin 286 } else {
117 dpavlin 289 $self->{$subclass} = \&{ $n };
118 dpavlin 286 }
119 dpavlin 285 }
120    
121 dpavlin 289 if ($self->{init}) {
122 dpavlin 285 $log->debug("calling init");
123 dpavlin 289 $self->{init}->($self, @_);
124 dpavlin 285 }
125    
126 dpavlin 585 $self->{'encoding'} ||= 'ISO-8859-2';
127 dpavlin 9
128 dpavlin 10 # running with low_mem flag? well, use DBM::Deep then.
129     if ($self->{'low_mem'}) {
130     $log->info("running with low_mem which impacts performance (<32 Mb memory usage)");
131    
132     my $db_file = "data.db";
133    
134     if (-e $db_file) {
135     unlink $db_file or $log->logdie("can't remove '$db_file' from last run");
136     $log->debug("removed '$db_file' from last run");
137     }
138    
139     require DBM::Deep;
140    
141     my $db = new DBM::Deep $db_file;
142    
143     $log->logdie("DBM::Deep error: $!") unless ($db);
144    
145     if ($db->error()) {
146     $log->logdie("can't open '$db_file' under low_mem: ",$db->error());
147     } else {
148     $log->debug("using file '$db_file' for DBM::Deep");
149     }
150    
151     $self->{'db'} = $db;
152     }
153    
154 dpavlin 3 $self ? return $self : return undef;
155 dpavlin 1 }
156    
157 dpavlin 285 =head2 open
158    
159     This function will read whole database in memory and produce lookups.
160    
161 dpavlin 286 $input->open(
162 dpavlin 285 path => '/path/to/database/file',
163     code_page => '852',
164 dpavlin 286 limit => 500,
165     offset => 6000,
166 dpavlin 285 lookup => $lookup_obj,
167 dpavlin 506 stats => 1,
168 dpavlin 585 lookup_ref => sub {
169     my ($k,$v) = @_;
170     # store lookup $k => $v
171     },
172 dpavlin 285 );
173    
174     By default, C<code_page> is assumed to be C<852>.
175    
176 dpavlin 286 C<offset> is optional parametar to position at some offset before reading from database.
177 dpavlin 285
178 dpavlin 286 C<limit> is optional parametar to read just C<limit> records from database
179 dpavlin 285
180 dpavlin 506 C<stats> create optional report about usage of fields and subfields
181    
182 dpavlin 585 C<lookup_coderef> is closure to call when adding key => $value combinations to
183     lookup.
184    
185 dpavlin 286 Returns size of database, regardless of C<offset> and C<limit>
186     parametars, see also C<size>.
187 dpavlin 285
188     =cut
189    
190     sub open {
191     my $self = shift;
192     my $arg = {@_};
193    
194     my $log = $self->_get_logger();
195    
196 dpavlin 585 $log->logconfess("lookup argument is not suppored any more. rewrite call to lookup_coderef") if ($arg->{lookup});
197     $log->logconfess("lookup_coderef must be CODE, not ",ref($arg->{lookup_coderef}))
198     if ($arg->{lookup_coderef} && ref($arg->{lookup_coderef}) ne 'CODE');
199    
200 dpavlin 285 $log->logcroak("need path") if (! $arg->{'path'});
201     my $code_page = $arg->{'code_page'} || '852';
202    
203     # store data in object
204 dpavlin 292 $self->{'input_code_page'} = $code_page;
205 dpavlin 286 foreach my $v (qw/path offset limit/) {
206 dpavlin 285 $self->{$v} = $arg->{$v} if ($arg->{$v});
207     }
208    
209     # create Text::Iconv object
210 dpavlin 585 $self->{iconv} = Text::Iconv->new($code_page,$self->{'encoding'}); ## FIXME remove!
211 dpavlin 285
212 dpavlin 416 my $filter_ref;
213    
214     if ($self->{recode}) {
215     my @r = split(/\s/, $self->{recode});
216     if ($#r % 2 != 1) {
217     $log->logwarn("recode needs even number of elements (some number of valid pairs)");
218     } else {
219     my $recode;
220     while (@r) {
221     my $from = shift @r;
222     my $to = shift @r;
223     $recode->{$from} = $to;
224     }
225    
226     my $regex = join '|' => keys %{ $recode };
227    
228     $log->debug("using recode regex: $regex");
229    
230     $filter_ref = sub {
231     my $t = shift;
232     $t =~ s/($regex)/$recode->{$1}/g;
233     return $t;
234     };
235    
236     }
237    
238     }
239    
240 dpavlin 289 my ($db, $size) = $self->{open_db}->( $self,
241 dpavlin 285 path => $arg->{path},
242 dpavlin 416 filter => $filter_ref,
243 dpavlin 523 %{ $arg },
244 dpavlin 285 );
245    
246 dpavlin 496 unless (defined($db)) {
247 dpavlin 285 $log->logwarn("can't open database $arg->{path}, skipping...");
248     return;
249     }
250    
251     unless ($size) {
252     $log->logwarn("no records in database $arg->{path}, skipping...");
253     return;
254     }
255    
256 dpavlin 339 my $from_rec = 1;
257     my $to_rec = $size;
258 dpavlin 285
259 dpavlin 286 if (my $s = $self->{offset}) {
260 dpavlin 513 $log->debug("skipping to MFN $s");
261 dpavlin 339 $from_rec = $s;
262 dpavlin 285 } else {
263 dpavlin 339 $self->{offset} = $from_rec;
264 dpavlin 285 }
265    
266 dpavlin 286 if ($self->{limit}) {
267 dpavlin 301 $log->debug("limiting to ",$self->{limit}," records");
268 dpavlin 339 $to_rec = $from_rec + $self->{limit} - 1;
269     $to_rec = $size if ($to_rec > $size);
270 dpavlin 285 }
271    
272     # store size for later
273 dpavlin 339 $self->{size} = ($to_rec - $from_rec) ? ($to_rec - $from_rec + 1) : 0;
274 dpavlin 285
275 dpavlin 585 $log->info("processing $self->{size}/$size records [$from_rec-$to_rec] convert $code_page -> $self->{encoding}", $self->{stats} ? ' [stats]' : '');
276 dpavlin 285
277     # read database
278 dpavlin 339 for (my $pos = $from_rec; $pos <= $to_rec; $pos++) {
279 dpavlin 285
280 dpavlin 286 $log->debug("position: $pos\n");
281 dpavlin 285
282 dpavlin 289 my $rec = $self->{fetch_rec}->($self, $db, $pos );
283 dpavlin 285
284 dpavlin 308 $log->debug(sub { Dumper($rec) });
285    
286 dpavlin 285 if (! $rec) {
287 dpavlin 286 $log->warn("record $pos empty? skipping...");
288 dpavlin 285 next;
289     }
290    
291     # store
292 dpavlin 286 if ($self->{low_mem}) {
293     $self->{db}->put($pos, $rec);
294 dpavlin 285 } else {
295 dpavlin 286 $self->{data}->{$pos} = $rec;
296 dpavlin 285 }
297    
298     # create lookup
299 dpavlin 585 $arg->{'lookup_coderef'}->( $rec ) if ($rec && $arg->{'lookup_coderef'});
300 dpavlin 285
301 dpavlin 506 # update counters for statistics
302     if ($self->{stats}) {
303     map {
304     my $fld = $_;
305     $self->{_stats}->{fld}->{ $fld }++;
306     if (ref($rec->{ $fld }) eq 'ARRAY') {
307     map {
308     if (ref($_) eq 'HASH') {
309     map {
310     $self->{_stats}->{sf}->{ $fld }->{ $_ }++;
311     } keys %{ $_ };
312     } else {
313     $self->{_stats}->{repeatable}->{ $fld }++;
314     }
315     } @{ $rec->{$fld} };
316     }
317     } keys %{ $rec };
318     }
319    
320 dpavlin 483 $self->progress_bar($pos,$to_rec) unless ($self->{no_progress_bar});
321 dpavlin 285
322     }
323    
324 dpavlin 286 $self->{pos} = -1;
325     $self->{last_pcnt} = 0;
326 dpavlin 285
327     # store max mfn and return it.
328 dpavlin 339 $self->{max_pos} = $to_rec;
329     $log->debug("max_pos: $to_rec");
330 dpavlin 285
331     return $size;
332     }
333    
334     =head2 fetch
335    
336     Fetch next record from database. It will also displays progress bar.
337    
338     my $rec = $isis->fetch;
339    
340     Record from this function should probably go to C<data_structure> for
341     normalisation.
342    
343     =cut
344    
345     sub fetch {
346     my $self = shift;
347    
348     my $log = $self->_get_logger();
349    
350 dpavlin 286 $log->logconfess("it seems that you didn't load database!") unless ($self->{pos});
351 dpavlin 285
352 dpavlin 286 if ($self->{pos} == -1) {
353     $self->{pos} = $self->{offset};
354 dpavlin 285 } else {
355 dpavlin 286 $self->{pos}++;
356 dpavlin 285 }
357    
358 dpavlin 286 my $mfn = $self->{pos};
359 dpavlin 285
360 dpavlin 286 if ($mfn > $self->{max_pos}) {
361     $self->{pos} = $self->{max_pos};
362 dpavlin 285 $log->debug("at EOF");
363     return;
364     }
365    
366 dpavlin 483 $self->progress_bar($mfn,$self->{max_pos}) unless ($self->{no_progress_bar});
367 dpavlin 285
368     my $rec;
369    
370 dpavlin 286 if ($self->{low_mem}) {
371     $rec = $self->{db}->get($mfn);
372 dpavlin 285 } else {
373 dpavlin 286 $rec = $self->{data}->{$mfn};
374 dpavlin 285 }
375    
376     $rec ||= 0E0;
377     }
378    
379     =head2 pos
380    
381     Returns current record number (MFN).
382    
383     print $isis->pos;
384    
385     First record in database has position 1.
386    
387     =cut
388    
389     sub pos {
390     my $self = shift;
391 dpavlin 286 return $self->{pos};
392 dpavlin 285 }
393    
394    
395     =head2 size
396    
397     Returns number of records in database
398    
399     print $isis->size;
400    
401     Result from this function can be used to loop through all records
402    
403     foreach my $mfn ( 1 ... $isis->size ) { ... }
404    
405 dpavlin 286 because it takes into account C<offset> and C<limit>.
406 dpavlin 285
407     =cut
408    
409     sub size {
410     my $self = shift;
411 dpavlin 286 return $self->{size};
412 dpavlin 285 }
413    
414     =head2 seek
415    
416     Seek to specified MFN in file.
417    
418     $isis->seek(42);
419    
420     First record in database has position 1.
421    
422     =cut
423    
424     sub seek {
425     my $self = shift;
426     my $pos = shift || return;
427    
428     my $log = $self->_get_logger();
429    
430     if ($pos < 1) {
431     $log->warn("seek before first record");
432     $pos = 1;
433 dpavlin 286 } elsif ($pos > $self->{max_pos}) {
434 dpavlin 285 $log->warn("seek beyond last record");
435 dpavlin 286 $pos = $self->{max_pos};
436 dpavlin 285 }
437    
438 dpavlin 286 return $self->{pos} = (($pos - 1) || -1);
439 dpavlin 285 }
440    
441 dpavlin 506 =head2 stats
442 dpavlin 285
443 dpavlin 506 Dump statistics about field and subfield usage
444    
445 dpavlin 507 print $input->stats;
446 dpavlin 506
447     =cut
448    
449     sub stats {
450     my $self = shift;
451 dpavlin 507
452     my $log = $self->_get_logger();
453    
454     my $s = $self->{_stats};
455     if (! $s) {
456     $log->warn("called stats, but there is no statistics collected");
457     return;
458     }
459    
460     my $max_fld = 0;
461    
462     my $out = join("\n",
463     map {
464     my $f = $_ || die "no field";
465     my $v = $s->{fld}->{$f} || die "no s->{fld}->{$f}";
466     $max_fld = $v if ($v > $max_fld);
467    
468 dpavlin 519 my $o = sprintf("%4s %d ~", $f, $v);
469 dpavlin 507
470     if (defined($s->{sf}->{$f})) {
471     map {
472     $o .= sprintf(" %s:%d", $_, $s->{sf}->{$f}->{$_});
473     } sort keys %{ $s->{sf}->{$f} };
474     }
475    
476     if (my $v_r = $s->{repeatable}->{$f}) {
477     $o .= " ($v_r)" if ($v_r != $v);
478     }
479    
480     $o;
481 dpavlin 519 } sort { $a cmp $b } keys %{ $s->{fld} }
482 dpavlin 507 );
483    
484     $log->debug( sub { Dumper($s) } );
485    
486     return $out;
487 dpavlin 506 }
488    
489 dpavlin 3 =head1 MEMORY USAGE
490 dpavlin 1
491 dpavlin 3 C<low_mem> options is double-edged sword. If enabled, WebPAC
492     will run on memory constraint machines (which doesn't have enough
493     physical RAM to create memory structure for whole source database).
494 dpavlin 1
495 dpavlin 3 If your machine has 512Mb or more of RAM and database is around 10000 records,
496     memory shouldn't be an issue. If you don't have enough physical RAM, you
497     might consider using virtual memory (if your operating system is handling it
498     well, like on FreeBSD or Linux) instead of dropping to L<DBM::Deep> to handle
499     parsed structure of ISIS database (this is what C<low_mem> option does).
500 dpavlin 1
501 dpavlin 3 Hitting swap at end of reading source database is probably o.k. However,
502     hitting swap before 90% will dramatically decrease performance and you will
503     be better off with C<low_mem> and using rest of availble memory for
504     operating system disk cache (Linux is particuallary good about this).
505     However, every access to database record will require disk access, so
506     generation phase will be slower 10-100 times.
507    
508     Parsed structures are essential - you just have option to trade RAM memory
509     (which is fast) for disk space (which is slow). Be sure to have planty of
510     disk space if you are using C<low_mem> and thus L<DBM::Deep>.
511    
512     However, when WebPAC is running on desktop machines (or laptops :-), it's
513     highly undesireable for system to start swapping. Using C<low_mem> option can
514     reduce WecPAC memory usage to around 64Mb for same database with lookup
515     fields and sorted indexes which stay in RAM. Performance will suffer, but
516     memory usage will really be minimal. It might be also more confortable to
517     run WebPAC reniced on those machines.
518    
519    
520     =head1 AUTHOR
521    
522     Dobrica Pavlinusic, C<< <dpavlin@rot13.org> >>
523    
524 dpavlin 1 =head1 COPYRIGHT & LICENSE
525    
526     Copyright 2005 Dobrica Pavlinusic, All Rights Reserved.
527    
528     This program is free software; you can redistribute it and/or modify it
529     under the same terms as Perl itself.
530    
531     =cut
532    
533     1; # End of WebPAC::Input

  ViewVC Help
Powered by ViewVC 1.1.26