/[webpac2]/trunk/lib/WebPAC/Input.pm
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Annotation of /trunk/lib/WebPAC/Input.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 593 - (hide annotations)
Sun Jul 9 15:22:39 2006 UTC (17 years, 9 months ago) by dpavlin
File size: 12538 byte(s)
 r823@llin:  dpavlin | 2006-07-09 17:23:28 +0200
 stats not report repeatable subfields

1 dpavlin 1 package WebPAC::Input;
2    
3     use warnings;
4     use strict;
5    
6 dpavlin 507 use blib;
7    
8 dpavlin 487 use WebPAC::Common;
9 dpavlin 285 use base qw/WebPAC::Common/;
10     use Text::Iconv;
11 dpavlin 308 use Data::Dumper;
12 dpavlin 285
13 dpavlin 1 =head1 NAME
14    
15 dpavlin 286 WebPAC::Input - read different file formats into WebPAC
16 dpavlin 1
17     =head1 VERSION
18    
19 dpavlin 593 Version 0.08
20 dpavlin 1
21     =cut
22    
23 dpavlin 593 our $VERSION = '0.08';
24 dpavlin 1
25     =head1 SYNOPSIS
26    
27 dpavlin 286 This module implements input as database which have fixed and known
28     I<size> while indexing and single unique numeric identifier for database
29     position ranging from 1 to I<size>.
30 dpavlin 1
31 dpavlin 286 Simply, something that is indexed by unmber from 1 .. I<size>.
32    
33     Examples of such databases are CDS/ISIS files, MARC files, lines in
34     text file, and so on.
35    
36     Specific file formats are implemented using low-level interface modules,
37     located in C<WebPAC::Input::*> namespace which export C<open_db>,
38     C<fetch_rec> and optional C<init> functions.
39    
40 dpavlin 1 Perhaps a little code snippet.
41    
42     use WebPAC::Input;
43    
44 dpavlin 3 my $db = WebPAC::Input->new(
45 dpavlin 286 module => 'WebPAC::Input::ISIS',
46     config => $config,
47     low_mem => 1,
48 dpavlin 3 );
49 dpavlin 1
50 dpavlin 3 $db->open('/path/to/database');
51 dpavlin 416 print "database size: ",$db->size,"\n";
52     while (my $rec = $db->fetch) {
53     # do something with $rec
54     }
55 dpavlin 1
56 dpavlin 286
57    
58 dpavlin 1 =head1 FUNCTIONS
59    
60 dpavlin 3 =head2 new
61 dpavlin 1
62 dpavlin 3 Create new input database object.
63    
64 dpavlin 9 my $db = new WebPAC::Input(
65 dpavlin 286 module => 'WebPAC::Input::MARC',
66 dpavlin 585 encoding => 'ISO-8859-2',
67 dpavlin 10 low_mem => 1,
68 dpavlin 416 recode => 'char pairs',
69 dpavlin 483 no_progress_bar => 1,
70 dpavlin 9 );
71 dpavlin 3
72 dpavlin 286 C<module> is low-level file format module. See L<WebPAC::Input::Isis> and
73     L<WebPAC::Input::MARC>.
74    
75 dpavlin 585 Optional parametar C<encoding> specify application code page (which will be
76 dpavlin 9 used internally). This should probably be your terminal encoding, and by
77     default, it C<ISO-8859-2>.
78    
79 dpavlin 10 Default is not to use C<low_mem> options (see L<MEMORY USAGE> below).
80    
81 dpavlin 483 C<recode> is optional string constisting of character or words pairs that
82     should be replaced in input stream.
83    
84     C<no_progress_bar> disables progress bar output on C<STDOUT>
85    
86 dpavlin 285 This function will also call low-level C<init> if it exists with same
87     parametars.
88    
89 dpavlin 1 =cut
90    
91 dpavlin 3 sub new {
92 dpavlin 285 my $class = shift;
93     my $self = {@_};
94 dpavlin 3 bless($self, $class);
95    
96 dpavlin 285 my $log = $self->_get_logger;
97    
98 dpavlin 585 $log->logconfess("code_page argument is not suppored any more. change it to encoding") if ($self->{lookup});
99     $log->logconfess("lookup argument is not suppored any more. rewrite call to lookup_ref") if ($self->{lookup});
100    
101 dpavlin 286 $log->logconfess("specify low-level file format module") unless ($self->{module});
102     my $module = $self->{module};
103     $module =~ s#::#/#g;
104     $module .= '.pm';
105     $log->debug("require low-level module $self->{module} from $module");
106 dpavlin 289
107 dpavlin 286 require $module;
108 dpavlin 289 #eval $self->{module} .'->import';
109 dpavlin 286
110 dpavlin 285 # check if required subclasses are implemented
111 dpavlin 289 foreach my $subclass (qw/open_db fetch_rec init/) {
112     my $n = $self->{module} . '::' . $subclass;
113     if (! defined &{ $n }) {
114 dpavlin 290 my $missing = "missing $subclass in $self->{module}";
115 dpavlin 301 $self->{$subclass} = sub { $log->logwarn($missing) };
116 dpavlin 286 } else {
117 dpavlin 289 $self->{$subclass} = \&{ $n };
118 dpavlin 286 }
119 dpavlin 285 }
120    
121 dpavlin 289 if ($self->{init}) {
122 dpavlin 285 $log->debug("calling init");
123 dpavlin 289 $self->{init}->($self, @_);
124 dpavlin 285 }
125    
126 dpavlin 585 $self->{'encoding'} ||= 'ISO-8859-2';
127 dpavlin 9
128 dpavlin 10 # running with low_mem flag? well, use DBM::Deep then.
129     if ($self->{'low_mem'}) {
130     $log->info("running with low_mem which impacts performance (<32 Mb memory usage)");
131    
132     my $db_file = "data.db";
133    
134     if (-e $db_file) {
135     unlink $db_file or $log->logdie("can't remove '$db_file' from last run");
136     $log->debug("removed '$db_file' from last run");
137     }
138    
139     require DBM::Deep;
140    
141     my $db = new DBM::Deep $db_file;
142    
143     $log->logdie("DBM::Deep error: $!") unless ($db);
144    
145     if ($db->error()) {
146     $log->logdie("can't open '$db_file' under low_mem: ",$db->error());
147     } else {
148     $log->debug("using file '$db_file' for DBM::Deep");
149     }
150    
151     $self->{'db'} = $db;
152     }
153    
154 dpavlin 3 $self ? return $self : return undef;
155 dpavlin 1 }
156    
157 dpavlin 285 =head2 open
158    
159     This function will read whole database in memory and produce lookups.
160    
161 dpavlin 286 $input->open(
162 dpavlin 285 path => '/path/to/database/file',
163     code_page => '852',
164 dpavlin 286 limit => 500,
165     offset => 6000,
166 dpavlin 285 lookup => $lookup_obj,
167 dpavlin 506 stats => 1,
168 dpavlin 585 lookup_ref => sub {
169     my ($k,$v) = @_;
170     # store lookup $k => $v
171     },
172 dpavlin 285 );
173    
174     By default, C<code_page> is assumed to be C<852>.
175    
176 dpavlin 286 C<offset> is optional parametar to position at some offset before reading from database.
177 dpavlin 285
178 dpavlin 286 C<limit> is optional parametar to read just C<limit> records from database
179 dpavlin 285
180 dpavlin 506 C<stats> create optional report about usage of fields and subfields
181    
182 dpavlin 585 C<lookup_coderef> is closure to call when adding key => $value combinations to
183     lookup.
184    
185 dpavlin 286 Returns size of database, regardless of C<offset> and C<limit>
186     parametars, see also C<size>.
187 dpavlin 285
188     =cut
189    
190     sub open {
191     my $self = shift;
192     my $arg = {@_};
193    
194     my $log = $self->_get_logger();
195    
196 dpavlin 585 $log->logconfess("lookup argument is not suppored any more. rewrite call to lookup_coderef") if ($arg->{lookup});
197     $log->logconfess("lookup_coderef must be CODE, not ",ref($arg->{lookup_coderef}))
198     if ($arg->{lookup_coderef} && ref($arg->{lookup_coderef}) ne 'CODE');
199    
200 dpavlin 285 $log->logcroak("need path") if (! $arg->{'path'});
201     my $code_page = $arg->{'code_page'} || '852';
202    
203     # store data in object
204 dpavlin 292 $self->{'input_code_page'} = $code_page;
205 dpavlin 286 foreach my $v (qw/path offset limit/) {
206 dpavlin 285 $self->{$v} = $arg->{$v} if ($arg->{$v});
207     }
208    
209     # create Text::Iconv object
210 dpavlin 585 $self->{iconv} = Text::Iconv->new($code_page,$self->{'encoding'}); ## FIXME remove!
211 dpavlin 285
212 dpavlin 416 my $filter_ref;
213    
214     if ($self->{recode}) {
215     my @r = split(/\s/, $self->{recode});
216     if ($#r % 2 != 1) {
217     $log->logwarn("recode needs even number of elements (some number of valid pairs)");
218     } else {
219     my $recode;
220     while (@r) {
221     my $from = shift @r;
222     my $to = shift @r;
223     $recode->{$from} = $to;
224     }
225    
226     my $regex = join '|' => keys %{ $recode };
227    
228     $log->debug("using recode regex: $regex");
229    
230     $filter_ref = sub {
231     my $t = shift;
232     $t =~ s/($regex)/$recode->{$1}/g;
233     return $t;
234     };
235    
236     }
237    
238     }
239    
240 dpavlin 289 my ($db, $size) = $self->{open_db}->( $self,
241 dpavlin 285 path => $arg->{path},
242 dpavlin 416 filter => $filter_ref,
243 dpavlin 523 %{ $arg },
244 dpavlin 285 );
245    
246 dpavlin 496 unless (defined($db)) {
247 dpavlin 285 $log->logwarn("can't open database $arg->{path}, skipping...");
248     return;
249     }
250    
251     unless ($size) {
252     $log->logwarn("no records in database $arg->{path}, skipping...");
253     return;
254     }
255    
256 dpavlin 339 my $from_rec = 1;
257     my $to_rec = $size;
258 dpavlin 285
259 dpavlin 286 if (my $s = $self->{offset}) {
260 dpavlin 513 $log->debug("skipping to MFN $s");
261 dpavlin 339 $from_rec = $s;
262 dpavlin 285 } else {
263 dpavlin 339 $self->{offset} = $from_rec;
264 dpavlin 285 }
265    
266 dpavlin 286 if ($self->{limit}) {
267 dpavlin 301 $log->debug("limiting to ",$self->{limit}," records");
268 dpavlin 339 $to_rec = $from_rec + $self->{limit} - 1;
269     $to_rec = $size if ($to_rec > $size);
270 dpavlin 285 }
271    
272     # store size for later
273 dpavlin 339 $self->{size} = ($to_rec - $from_rec) ? ($to_rec - $from_rec + 1) : 0;
274 dpavlin 285
275 dpavlin 585 $log->info("processing $self->{size}/$size records [$from_rec-$to_rec] convert $code_page -> $self->{encoding}", $self->{stats} ? ' [stats]' : '');
276 dpavlin 285
277     # read database
278 dpavlin 339 for (my $pos = $from_rec; $pos <= $to_rec; $pos++) {
279 dpavlin 285
280 dpavlin 286 $log->debug("position: $pos\n");
281 dpavlin 285
282 dpavlin 289 my $rec = $self->{fetch_rec}->($self, $db, $pos );
283 dpavlin 285
284 dpavlin 308 $log->debug(sub { Dumper($rec) });
285    
286 dpavlin 285 if (! $rec) {
287 dpavlin 286 $log->warn("record $pos empty? skipping...");
288 dpavlin 285 next;
289     }
290    
291     # store
292 dpavlin 286 if ($self->{low_mem}) {
293     $self->{db}->put($pos, $rec);
294 dpavlin 285 } else {
295 dpavlin 286 $self->{data}->{$pos} = $rec;
296 dpavlin 285 }
297    
298     # create lookup
299 dpavlin 585 $arg->{'lookup_coderef'}->( $rec ) if ($rec && $arg->{'lookup_coderef'});
300 dpavlin 285
301 dpavlin 506 # update counters for statistics
302     if ($self->{stats}) {
303 dpavlin 593
304     foreach my $fld (keys %{ $rec }) {
305 dpavlin 506 $self->{_stats}->{fld}->{ $fld }++;
306 dpavlin 593
307     $log->logdie("invalid record fild $fld, not ARRAY")
308     unless (ref($rec->{ $fld }) eq 'ARRAY');
309    
310     foreach my $row (@{ $rec->{$fld} }) {
311    
312     if (ref($row) eq 'HASH') {
313    
314     foreach my $sf (keys %{ $row }) {
315     $self->{_stats}->{sf}->{ $fld }->{ $sf }->{count}++;
316     $self->{_stats}->{sf}->{ $fld }->{ $sf }->{repeatable}++
317     if (ref($row->{$sf}) eq 'ARRAY');
318 dpavlin 506 }
319 dpavlin 593
320     } else {
321     $self->{_stats}->{repeatable}->{ $fld }++;
322     }
323 dpavlin 506 }
324 dpavlin 593 }
325 dpavlin 506 }
326    
327 dpavlin 483 $self->progress_bar($pos,$to_rec) unless ($self->{no_progress_bar});
328 dpavlin 285
329     }
330    
331 dpavlin 286 $self->{pos} = -1;
332     $self->{last_pcnt} = 0;
333 dpavlin 285
334     # store max mfn and return it.
335 dpavlin 339 $self->{max_pos} = $to_rec;
336     $log->debug("max_pos: $to_rec");
337 dpavlin 285
338     return $size;
339     }
340    
341     =head2 fetch
342    
343     Fetch next record from database. It will also displays progress bar.
344    
345     my $rec = $isis->fetch;
346    
347     Record from this function should probably go to C<data_structure> for
348     normalisation.
349    
350     =cut
351    
352     sub fetch {
353     my $self = shift;
354    
355     my $log = $self->_get_logger();
356    
357 dpavlin 286 $log->logconfess("it seems that you didn't load database!") unless ($self->{pos});
358 dpavlin 285
359 dpavlin 286 if ($self->{pos} == -1) {
360     $self->{pos} = $self->{offset};
361 dpavlin 285 } else {
362 dpavlin 286 $self->{pos}++;
363 dpavlin 285 }
364    
365 dpavlin 286 my $mfn = $self->{pos};
366 dpavlin 285
367 dpavlin 286 if ($mfn > $self->{max_pos}) {
368     $self->{pos} = $self->{max_pos};
369 dpavlin 285 $log->debug("at EOF");
370     return;
371     }
372    
373 dpavlin 483 $self->progress_bar($mfn,$self->{max_pos}) unless ($self->{no_progress_bar});
374 dpavlin 285
375     my $rec;
376    
377 dpavlin 286 if ($self->{low_mem}) {
378     $rec = $self->{db}->get($mfn);
379 dpavlin 285 } else {
380 dpavlin 286 $rec = $self->{data}->{$mfn};
381 dpavlin 285 }
382    
383     $rec ||= 0E0;
384     }
385    
386     =head2 pos
387    
388     Returns current record number (MFN).
389    
390     print $isis->pos;
391    
392     First record in database has position 1.
393    
394     =cut
395    
396     sub pos {
397     my $self = shift;
398 dpavlin 286 return $self->{pos};
399 dpavlin 285 }
400    
401    
402     =head2 size
403    
404     Returns number of records in database
405    
406     print $isis->size;
407    
408     Result from this function can be used to loop through all records
409    
410     foreach my $mfn ( 1 ... $isis->size ) { ... }
411    
412 dpavlin 286 because it takes into account C<offset> and C<limit>.
413 dpavlin 285
414     =cut
415    
416     sub size {
417     my $self = shift;
418 dpavlin 286 return $self->{size};
419 dpavlin 285 }
420    
421     =head2 seek
422    
423     Seek to specified MFN in file.
424    
425     $isis->seek(42);
426    
427     First record in database has position 1.
428    
429     =cut
430    
431     sub seek {
432     my $self = shift;
433     my $pos = shift || return;
434    
435     my $log = $self->_get_logger();
436    
437     if ($pos < 1) {
438     $log->warn("seek before first record");
439     $pos = 1;
440 dpavlin 286 } elsif ($pos > $self->{max_pos}) {
441 dpavlin 285 $log->warn("seek beyond last record");
442 dpavlin 286 $pos = $self->{max_pos};
443 dpavlin 285 }
444    
445 dpavlin 286 return $self->{pos} = (($pos - 1) || -1);
446 dpavlin 285 }
447    
448 dpavlin 506 =head2 stats
449 dpavlin 285
450 dpavlin 506 Dump statistics about field and subfield usage
451    
452 dpavlin 507 print $input->stats;
453 dpavlin 506
454     =cut
455    
456     sub stats {
457     my $self = shift;
458 dpavlin 507
459     my $log = $self->_get_logger();
460    
461     my $s = $self->{_stats};
462     if (! $s) {
463     $log->warn("called stats, but there is no statistics collected");
464     return;
465     }
466    
467     my $max_fld = 0;
468    
469     my $out = join("\n",
470     map {
471     my $f = $_ || die "no field";
472     my $v = $s->{fld}->{$f} || die "no s->{fld}->{$f}";
473     $max_fld = $v if ($v > $max_fld);
474    
475 dpavlin 519 my $o = sprintf("%4s %d ~", $f, $v);
476 dpavlin 507
477     if (defined($s->{sf}->{$f})) {
478     map {
479 dpavlin 593 $o .= sprintf(" %s:%d%s", $_,
480     $s->{sf}->{$f}->{$_}->{count},
481     $s->{sf}->{$f}->{$_}->{repeatable} ? '*' : '',
482     );
483 dpavlin 507 } sort keys %{ $s->{sf}->{$f} };
484     }
485    
486     if (my $v_r = $s->{repeatable}->{$f}) {
487     $o .= " ($v_r)" if ($v_r != $v);
488     }
489    
490     $o;
491 dpavlin 519 } sort { $a cmp $b } keys %{ $s->{fld} }
492 dpavlin 507 );
493    
494     $log->debug( sub { Dumper($s) } );
495    
496     return $out;
497 dpavlin 506 }
498    
499 dpavlin 3 =head1 MEMORY USAGE
500 dpavlin 1
501 dpavlin 3 C<low_mem> options is double-edged sword. If enabled, WebPAC
502     will run on memory constraint machines (which doesn't have enough
503     physical RAM to create memory structure for whole source database).
504 dpavlin 1
505 dpavlin 3 If your machine has 512Mb or more of RAM and database is around 10000 records,
506     memory shouldn't be an issue. If you don't have enough physical RAM, you
507     might consider using virtual memory (if your operating system is handling it
508     well, like on FreeBSD or Linux) instead of dropping to L<DBM::Deep> to handle
509     parsed structure of ISIS database (this is what C<low_mem> option does).
510 dpavlin 1
511 dpavlin 3 Hitting swap at end of reading source database is probably o.k. However,
512     hitting swap before 90% will dramatically decrease performance and you will
513     be better off with C<low_mem> and using rest of availble memory for
514     operating system disk cache (Linux is particuallary good about this).
515     However, every access to database record will require disk access, so
516     generation phase will be slower 10-100 times.
517    
518     Parsed structures are essential - you just have option to trade RAM memory
519     (which is fast) for disk space (which is slow). Be sure to have planty of
520     disk space if you are using C<low_mem> and thus L<DBM::Deep>.
521    
522     However, when WebPAC is running on desktop machines (or laptops :-), it's
523     highly undesireable for system to start swapping. Using C<low_mem> option can
524     reduce WecPAC memory usage to around 64Mb for same database with lookup
525     fields and sorted indexes which stay in RAM. Performance will suffer, but
526     memory usage will really be minimal. It might be also more confortable to
527     run WebPAC reniced on those machines.
528    
529    
530     =head1 AUTHOR
531    
532     Dobrica Pavlinusic, C<< <dpavlin@rot13.org> >>
533    
534 dpavlin 1 =head1 COPYRIGHT & LICENSE
535    
536     Copyright 2005 Dobrica Pavlinusic, All Rights Reserved.
537    
538     This program is free software; you can redistribute it and/or modify it
539     under the same terms as Perl itself.
540    
541     =cut
542    
543     1; # End of WebPAC::Input

  ViewVC Help
Powered by ViewVC 1.1.26