/[webpac2]/trunk/lib/WebPAC/Input.pm
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Annotation of /trunk/lib/WebPAC/Input.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 519 - (hide annotations)
Thu May 18 13:48:51 2006 UTC (17 years, 11 months ago) by dpavlin
File size: 11604 byte(s)
 r689@llin:  dpavlin | 2006-05-18 15:45:23 +0200
 treat field names as strings, not numbers (Excel field names are chars, not numbers)

1 dpavlin 1 package WebPAC::Input;
2    
3     use warnings;
4     use strict;
5    
6 dpavlin 507 use blib;
7    
8 dpavlin 487 use WebPAC::Common;
9 dpavlin 285 use base qw/WebPAC::Common/;
10     use Text::Iconv;
11 dpavlin 308 use Data::Dumper;
12 dpavlin 285
13 dpavlin 1 =head1 NAME
14    
15 dpavlin 286 WebPAC::Input - read different file formats into WebPAC
16 dpavlin 1
17     =head1 VERSION
18    
19 dpavlin 506 Version 0.05
20 dpavlin 1
21     =cut
22    
23 dpavlin 506 our $VERSION = '0.05';
24 dpavlin 1
25     =head1 SYNOPSIS
26    
27 dpavlin 286 This module implements input as database which have fixed and known
28     I<size> while indexing and single unique numeric identifier for database
29     position ranging from 1 to I<size>.
30 dpavlin 1
31 dpavlin 286 Simply, something that is indexed by unmber from 1 .. I<size>.
32    
33     Examples of such databases are CDS/ISIS files, MARC files, lines in
34     text file, and so on.
35    
36     Specific file formats are implemented using low-level interface modules,
37     located in C<WebPAC::Input::*> namespace which export C<open_db>,
38     C<fetch_rec> and optional C<init> functions.
39    
40 dpavlin 1 Perhaps a little code snippet.
41    
42     use WebPAC::Input;
43    
44 dpavlin 3 my $db = WebPAC::Input->new(
45 dpavlin 286 module => 'WebPAC::Input::ISIS',
46     config => $config,
47     lookup => $lookup_obj,
48     low_mem => 1,
49 dpavlin 3 );
50 dpavlin 1
51 dpavlin 3 $db->open('/path/to/database');
52 dpavlin 416 print "database size: ",$db->size,"\n";
53     while (my $rec = $db->fetch) {
54     # do something with $rec
55     }
56 dpavlin 1
57 dpavlin 286
58    
59 dpavlin 1 =head1 FUNCTIONS
60    
61 dpavlin 3 =head2 new
62 dpavlin 1
63 dpavlin 3 Create new input database object.
64    
65 dpavlin 9 my $db = new WebPAC::Input(
66 dpavlin 286 module => 'WebPAC::Input::MARC',
67 dpavlin 9 code_page => 'ISO-8859-2',
68 dpavlin 10 low_mem => 1,
69 dpavlin 416 recode => 'char pairs',
70 dpavlin 483 no_progress_bar => 1,
71 dpavlin 9 );
72 dpavlin 3
73 dpavlin 286 C<module> is low-level file format module. See L<WebPAC::Input::Isis> and
74     L<WebPAC::Input::MARC>.
75    
76 dpavlin 9 Optional parametar C<code_page> specify application code page (which will be
77     used internally). This should probably be your terminal encoding, and by
78     default, it C<ISO-8859-2>.
79    
80 dpavlin 10 Default is not to use C<low_mem> options (see L<MEMORY USAGE> below).
81    
82 dpavlin 483 C<recode> is optional string constisting of character or words pairs that
83     should be replaced in input stream.
84    
85     C<no_progress_bar> disables progress bar output on C<STDOUT>
86    
87 dpavlin 285 This function will also call low-level C<init> if it exists with same
88     parametars.
89    
90 dpavlin 1 =cut
91    
92 dpavlin 3 sub new {
93 dpavlin 285 my $class = shift;
94     my $self = {@_};
95 dpavlin 3 bless($self, $class);
96    
97 dpavlin 285 my $log = $self->_get_logger;
98    
99 dpavlin 286 $log->logconfess("specify low-level file format module") unless ($self->{module});
100     my $module = $self->{module};
101     $module =~ s#::#/#g;
102     $module .= '.pm';
103     $log->debug("require low-level module $self->{module} from $module");
104 dpavlin 289
105 dpavlin 286 require $module;
106 dpavlin 289 #eval $self->{module} .'->import';
107 dpavlin 286
108 dpavlin 285 # check if required subclasses are implemented
109 dpavlin 289 foreach my $subclass (qw/open_db fetch_rec init/) {
110     my $n = $self->{module} . '::' . $subclass;
111     if (! defined &{ $n }) {
112 dpavlin 290 my $missing = "missing $subclass in $self->{module}";
113 dpavlin 301 $self->{$subclass} = sub { $log->logwarn($missing) };
114 dpavlin 286 } else {
115 dpavlin 289 $self->{$subclass} = \&{ $n };
116 dpavlin 286 }
117 dpavlin 285 }
118    
119 dpavlin 289 if ($self->{init}) {
120 dpavlin 285 $log->debug("calling init");
121 dpavlin 289 $self->{init}->($self, @_);
122 dpavlin 285 }
123    
124 dpavlin 9 $self->{'code_page'} ||= 'ISO-8859-2';
125    
126 dpavlin 10 # running with low_mem flag? well, use DBM::Deep then.
127     if ($self->{'low_mem'}) {
128     $log->info("running with low_mem which impacts performance (<32 Mb memory usage)");
129    
130     my $db_file = "data.db";
131    
132     if (-e $db_file) {
133     unlink $db_file or $log->logdie("can't remove '$db_file' from last run");
134     $log->debug("removed '$db_file' from last run");
135     }
136    
137     require DBM::Deep;
138    
139     my $db = new DBM::Deep $db_file;
140    
141     $log->logdie("DBM::Deep error: $!") unless ($db);
142    
143     if ($db->error()) {
144     $log->logdie("can't open '$db_file' under low_mem: ",$db->error());
145     } else {
146     $log->debug("using file '$db_file' for DBM::Deep");
147     }
148    
149     $self->{'db'} = $db;
150     }
151    
152 dpavlin 3 $self ? return $self : return undef;
153 dpavlin 1 }
154    
155 dpavlin 285 =head2 open
156    
157     This function will read whole database in memory and produce lookups.
158    
159 dpavlin 286 $input->open(
160 dpavlin 285 path => '/path/to/database/file',
161     code_page => '852',
162 dpavlin 286 limit => 500,
163     offset => 6000,
164 dpavlin 285 lookup => $lookup_obj,
165 dpavlin 506 stats => 1,
166 dpavlin 285 );
167    
168     By default, C<code_page> is assumed to be C<852>.
169    
170 dpavlin 286 C<offset> is optional parametar to position at some offset before reading from database.
171 dpavlin 285
172 dpavlin 286 C<limit> is optional parametar to read just C<limit> records from database
173 dpavlin 285
174 dpavlin 506 C<stats> create optional report about usage of fields and subfields
175    
176 dpavlin 286 Returns size of database, regardless of C<offset> and C<limit>
177     parametars, see also C<size>.
178 dpavlin 285
179     =cut
180    
181     sub open {
182     my $self = shift;
183     my $arg = {@_};
184    
185     my $log = $self->_get_logger();
186    
187     $log->logcroak("need path") if (! $arg->{'path'});
188     my $code_page = $arg->{'code_page'} || '852';
189    
190     # store data in object
191 dpavlin 292 $self->{'input_code_page'} = $code_page;
192 dpavlin 286 foreach my $v (qw/path offset limit/) {
193 dpavlin 285 $self->{$v} = $arg->{$v} if ($arg->{$v});
194     }
195    
196     # create Text::Iconv object
197     $self->{iconv} = Text::Iconv->new($code_page,$self->{'code_page'});
198    
199 dpavlin 416 my $filter_ref;
200    
201     if ($self->{recode}) {
202     my @r = split(/\s/, $self->{recode});
203     if ($#r % 2 != 1) {
204     $log->logwarn("recode needs even number of elements (some number of valid pairs)");
205     } else {
206     my $recode;
207     while (@r) {
208     my $from = shift @r;
209     my $to = shift @r;
210     $recode->{$from} = $to;
211     }
212    
213     my $regex = join '|' => keys %{ $recode };
214    
215     $log->debug("using recode regex: $regex");
216    
217     $filter_ref = sub {
218     my $t = shift;
219     $t =~ s/($regex)/$recode->{$1}/g;
220     return $t;
221     };
222    
223     }
224    
225     }
226    
227 dpavlin 289 my ($db, $size) = $self->{open_db}->( $self,
228 dpavlin 285 path => $arg->{path},
229 dpavlin 416 filter => $filter_ref,
230 dpavlin 285 );
231    
232 dpavlin 496 unless (defined($db)) {
233 dpavlin 285 $log->logwarn("can't open database $arg->{path}, skipping...");
234     return;
235     }
236    
237     unless ($size) {
238     $log->logwarn("no records in database $arg->{path}, skipping...");
239     return;
240     }
241    
242 dpavlin 339 my $from_rec = 1;
243     my $to_rec = $size;
244 dpavlin 285
245 dpavlin 286 if (my $s = $self->{offset}) {
246 dpavlin 513 $log->debug("skipping to MFN $s");
247 dpavlin 339 $from_rec = $s;
248 dpavlin 285 } else {
249 dpavlin 339 $self->{offset} = $from_rec;
250 dpavlin 285 }
251    
252 dpavlin 286 if ($self->{limit}) {
253 dpavlin 301 $log->debug("limiting to ",$self->{limit}," records");
254 dpavlin 339 $to_rec = $from_rec + $self->{limit} - 1;
255     $to_rec = $size if ($to_rec > $size);
256 dpavlin 285 }
257    
258     # store size for later
259 dpavlin 339 $self->{size} = ($to_rec - $from_rec) ? ($to_rec - $from_rec + 1) : 0;
260 dpavlin 285
261 dpavlin 506 $log->info("processing $self->{size}/$size records [$from_rec-$to_rec] convert $code_page -> $self->{code_page}", $self->{stats} ? ' [stats]' : '');
262 dpavlin 285
263     # read database
264 dpavlin 339 for (my $pos = $from_rec; $pos <= $to_rec; $pos++) {
265 dpavlin 285
266 dpavlin 286 $log->debug("position: $pos\n");
267 dpavlin 285
268 dpavlin 289 my $rec = $self->{fetch_rec}->($self, $db, $pos );
269 dpavlin 285
270 dpavlin 308 $log->debug(sub { Dumper($rec) });
271    
272 dpavlin 285 if (! $rec) {
273 dpavlin 286 $log->warn("record $pos empty? skipping...");
274 dpavlin 285 next;
275     }
276    
277     # store
278 dpavlin 286 if ($self->{low_mem}) {
279     $self->{db}->put($pos, $rec);
280 dpavlin 285 } else {
281 dpavlin 286 $self->{data}->{$pos} = $rec;
282 dpavlin 285 }
283    
284     # create lookup
285     $self->{'lookup'}->add( $rec ) if ($rec && $self->{'lookup'});
286    
287 dpavlin 506 # update counters for statistics
288     if ($self->{stats}) {
289     map {
290     my $fld = $_;
291     $self->{_stats}->{fld}->{ $fld }++;
292     if (ref($rec->{ $fld }) eq 'ARRAY') {
293     map {
294     if (ref($_) eq 'HASH') {
295     map {
296     $self->{_stats}->{sf}->{ $fld }->{ $_ }++;
297     } keys %{ $_ };
298     } else {
299     $self->{_stats}->{repeatable}->{ $fld }++;
300     }
301     } @{ $rec->{$fld} };
302     }
303     } keys %{ $rec };
304     }
305    
306 dpavlin 483 $self->progress_bar($pos,$to_rec) unless ($self->{no_progress_bar});
307 dpavlin 285
308     }
309    
310 dpavlin 286 $self->{pos} = -1;
311     $self->{last_pcnt} = 0;
312 dpavlin 285
313     # store max mfn and return it.
314 dpavlin 339 $self->{max_pos} = $to_rec;
315     $log->debug("max_pos: $to_rec");
316 dpavlin 285
317     return $size;
318     }
319    
320     =head2 fetch
321    
322     Fetch next record from database. It will also displays progress bar.
323    
324     my $rec = $isis->fetch;
325    
326     Record from this function should probably go to C<data_structure> for
327     normalisation.
328    
329     =cut
330    
331     sub fetch {
332     my $self = shift;
333    
334     my $log = $self->_get_logger();
335    
336 dpavlin 286 $log->logconfess("it seems that you didn't load database!") unless ($self->{pos});
337 dpavlin 285
338 dpavlin 286 if ($self->{pos} == -1) {
339     $self->{pos} = $self->{offset};
340 dpavlin 285 } else {
341 dpavlin 286 $self->{pos}++;
342 dpavlin 285 }
343    
344 dpavlin 286 my $mfn = $self->{pos};
345 dpavlin 285
346 dpavlin 286 if ($mfn > $self->{max_pos}) {
347     $self->{pos} = $self->{max_pos};
348 dpavlin 285 $log->debug("at EOF");
349     return;
350     }
351    
352 dpavlin 483 $self->progress_bar($mfn,$self->{max_pos}) unless ($self->{no_progress_bar});
353 dpavlin 285
354     my $rec;
355    
356 dpavlin 286 if ($self->{low_mem}) {
357     $rec = $self->{db}->get($mfn);
358 dpavlin 285 } else {
359 dpavlin 286 $rec = $self->{data}->{$mfn};
360 dpavlin 285 }
361    
362     $rec ||= 0E0;
363     }
364    
365     =head2 pos
366    
367     Returns current record number (MFN).
368    
369     print $isis->pos;
370    
371     First record in database has position 1.
372    
373     =cut
374    
375     sub pos {
376     my $self = shift;
377 dpavlin 286 return $self->{pos};
378 dpavlin 285 }
379    
380    
381     =head2 size
382    
383     Returns number of records in database
384    
385     print $isis->size;
386    
387     Result from this function can be used to loop through all records
388    
389     foreach my $mfn ( 1 ... $isis->size ) { ... }
390    
391 dpavlin 286 because it takes into account C<offset> and C<limit>.
392 dpavlin 285
393     =cut
394    
395     sub size {
396     my $self = shift;
397 dpavlin 286 return $self->{size};
398 dpavlin 285 }
399    
400     =head2 seek
401    
402     Seek to specified MFN in file.
403    
404     $isis->seek(42);
405    
406     First record in database has position 1.
407    
408     =cut
409    
410     sub seek {
411     my $self = shift;
412     my $pos = shift || return;
413    
414     my $log = $self->_get_logger();
415    
416     if ($pos < 1) {
417     $log->warn("seek before first record");
418     $pos = 1;
419 dpavlin 286 } elsif ($pos > $self->{max_pos}) {
420 dpavlin 285 $log->warn("seek beyond last record");
421 dpavlin 286 $pos = $self->{max_pos};
422 dpavlin 285 }
423    
424 dpavlin 286 return $self->{pos} = (($pos - 1) || -1);
425 dpavlin 285 }
426    
427 dpavlin 506 =head2 stats
428 dpavlin 285
429 dpavlin 506 Dump statistics about field and subfield usage
430    
431 dpavlin 507 print $input->stats;
432 dpavlin 506
433     =cut
434    
435     sub stats {
436     my $self = shift;
437 dpavlin 507
438     my $log = $self->_get_logger();
439    
440     my $s = $self->{_stats};
441     if (! $s) {
442     $log->warn("called stats, but there is no statistics collected");
443     return;
444     }
445    
446     my $max_fld = 0;
447    
448     my $out = join("\n",
449     map {
450     my $f = $_ || die "no field";
451     my $v = $s->{fld}->{$f} || die "no s->{fld}->{$f}";
452     $max_fld = $v if ($v > $max_fld);
453    
454 dpavlin 519 my $o = sprintf("%4s %d ~", $f, $v);
455 dpavlin 507
456     if (defined($s->{sf}->{$f})) {
457     map {
458     $o .= sprintf(" %s:%d", $_, $s->{sf}->{$f}->{$_});
459     } sort keys %{ $s->{sf}->{$f} };
460     }
461    
462     if (my $v_r = $s->{repeatable}->{$f}) {
463     $o .= " ($v_r)" if ($v_r != $v);
464     }
465    
466     $o;
467 dpavlin 519 } sort { $a cmp $b } keys %{ $s->{fld} }
468 dpavlin 507 );
469    
470     $log->debug( sub { Dumper($s) } );
471    
472     return $out;
473 dpavlin 506 }
474    
475 dpavlin 3 =head1 MEMORY USAGE
476 dpavlin 1
477 dpavlin 3 C<low_mem> options is double-edged sword. If enabled, WebPAC
478     will run on memory constraint machines (which doesn't have enough
479     physical RAM to create memory structure for whole source database).
480 dpavlin 1
481 dpavlin 3 If your machine has 512Mb or more of RAM and database is around 10000 records,
482     memory shouldn't be an issue. If you don't have enough physical RAM, you
483     might consider using virtual memory (if your operating system is handling it
484     well, like on FreeBSD or Linux) instead of dropping to L<DBM::Deep> to handle
485     parsed structure of ISIS database (this is what C<low_mem> option does).
486 dpavlin 1
487 dpavlin 3 Hitting swap at end of reading source database is probably o.k. However,
488     hitting swap before 90% will dramatically decrease performance and you will
489     be better off with C<low_mem> and using rest of availble memory for
490     operating system disk cache (Linux is particuallary good about this).
491     However, every access to database record will require disk access, so
492     generation phase will be slower 10-100 times.
493    
494     Parsed structures are essential - you just have option to trade RAM memory
495     (which is fast) for disk space (which is slow). Be sure to have planty of
496     disk space if you are using C<low_mem> and thus L<DBM::Deep>.
497    
498     However, when WebPAC is running on desktop machines (or laptops :-), it's
499     highly undesireable for system to start swapping. Using C<low_mem> option can
500     reduce WecPAC memory usage to around 64Mb for same database with lookup
501     fields and sorted indexes which stay in RAM. Performance will suffer, but
502     memory usage will really be minimal. It might be also more confortable to
503     run WebPAC reniced on those machines.
504    
505    
506     =head1 AUTHOR
507    
508     Dobrica Pavlinusic, C<< <dpavlin@rot13.org> >>
509    
510 dpavlin 1 =head1 COPYRIGHT & LICENSE
511    
512     Copyright 2005 Dobrica Pavlinusic, All Rights Reserved.
513    
514     This program is free software; you can redistribute it and/or modify it
515     under the same terms as Perl itself.
516    
517     =cut
518    
519     1; # End of WebPAC::Input

  ViewVC Help
Powered by ViewVC 1.1.26