/[webpac2]/trunk/lib/WebPAC/Input.pm
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Contents of /trunk/lib/WebPAC/Input.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 506 - (show annotations)
Mon May 15 09:59:05 2006 UTC (17 years, 11 months ago) by dpavlin
File size: 10920 byte(s)
 r663@llin:  dpavlin | 2006-05-15 12:02:43 +0200
 added stats gathering

1 package WebPAC::Input;
2
3 use warnings;
4 use strict;
5
6 use WebPAC::Common;
7 use base qw/WebPAC::Common/;
8 use Text::Iconv;
9 use Data::Dumper;
10
11 =head1 NAME
12
13 WebPAC::Input - read different file formats into WebPAC
14
15 =head1 VERSION
16
17 Version 0.05
18
19 =cut
20
21 our $VERSION = '0.05';
22
23 =head1 SYNOPSIS
24
25 This module implements input as database which have fixed and known
26 I<size> while indexing and single unique numeric identifier for database
27 position ranging from 1 to I<size>.
28
29 Simply, something that is indexed by unmber from 1 .. I<size>.
30
31 Examples of such databases are CDS/ISIS files, MARC files, lines in
32 text file, and so on.
33
34 Specific file formats are implemented using low-level interface modules,
35 located in C<WebPAC::Input::*> namespace which export C<open_db>,
36 C<fetch_rec> and optional C<init> functions.
37
38 Perhaps a little code snippet.
39
40 use WebPAC::Input;
41
42 my $db = WebPAC::Input->new(
43 module => 'WebPAC::Input::ISIS',
44 config => $config,
45 lookup => $lookup_obj,
46 low_mem => 1,
47 );
48
49 $db->open('/path/to/database');
50 print "database size: ",$db->size,"\n";
51 while (my $rec = $db->fetch) {
52 # do something with $rec
53 }
54
55
56
57 =head1 FUNCTIONS
58
59 =head2 new
60
61 Create new input database object.
62
63 my $db = new WebPAC::Input(
64 module => 'WebPAC::Input::MARC',
65 code_page => 'ISO-8859-2',
66 low_mem => 1,
67 recode => 'char pairs',
68 no_progress_bar => 1,
69 );
70
71 C<module> is low-level file format module. See L<WebPAC::Input::Isis> and
72 L<WebPAC::Input::MARC>.
73
74 Optional parametar C<code_page> specify application code page (which will be
75 used internally). This should probably be your terminal encoding, and by
76 default, it C<ISO-8859-2>.
77
78 Default is not to use C<low_mem> options (see L<MEMORY USAGE> below).
79
80 C<recode> is optional string constisting of character or words pairs that
81 should be replaced in input stream.
82
83 C<no_progress_bar> disables progress bar output on C<STDOUT>
84
85 This function will also call low-level C<init> if it exists with same
86 parametars.
87
88 =cut
89
90 sub new {
91 my $class = shift;
92 my $self = {@_};
93 bless($self, $class);
94
95 my $log = $self->_get_logger;
96
97 $log->logconfess("specify low-level file format module") unless ($self->{module});
98 my $module = $self->{module};
99 $module =~ s#::#/#g;
100 $module .= '.pm';
101 $log->debug("require low-level module $self->{module} from $module");
102
103 require $module;
104 #eval $self->{module} .'->import';
105
106 # check if required subclasses are implemented
107 foreach my $subclass (qw/open_db fetch_rec init/) {
108 my $n = $self->{module} . '::' . $subclass;
109 if (! defined &{ $n }) {
110 my $missing = "missing $subclass in $self->{module}";
111 $self->{$subclass} = sub { $log->logwarn($missing) };
112 } else {
113 $self->{$subclass} = \&{ $n };
114 }
115 }
116
117 if ($self->{init}) {
118 $log->debug("calling init");
119 $self->{init}->($self, @_);
120 }
121
122 $self->{'code_page'} ||= 'ISO-8859-2';
123
124 # running with low_mem flag? well, use DBM::Deep then.
125 if ($self->{'low_mem'}) {
126 $log->info("running with low_mem which impacts performance (<32 Mb memory usage)");
127
128 my $db_file = "data.db";
129
130 if (-e $db_file) {
131 unlink $db_file or $log->logdie("can't remove '$db_file' from last run");
132 $log->debug("removed '$db_file' from last run");
133 }
134
135 require DBM::Deep;
136
137 my $db = new DBM::Deep $db_file;
138
139 $log->logdie("DBM::Deep error: $!") unless ($db);
140
141 if ($db->error()) {
142 $log->logdie("can't open '$db_file' under low_mem: ",$db->error());
143 } else {
144 $log->debug("using file '$db_file' for DBM::Deep");
145 }
146
147 $self->{'db'} = $db;
148 }
149
150 $self ? return $self : return undef;
151 }
152
153 =head2 open
154
155 This function will read whole database in memory and produce lookups.
156
157 $input->open(
158 path => '/path/to/database/file',
159 code_page => '852',
160 limit => 500,
161 offset => 6000,
162 lookup => $lookup_obj,
163 stats => 1,
164 );
165
166 By default, C<code_page> is assumed to be C<852>.
167
168 C<offset> is optional parametar to position at some offset before reading from database.
169
170 C<limit> is optional parametar to read just C<limit> records from database
171
172 C<stats> create optional report about usage of fields and subfields
173
174 Returns size of database, regardless of C<offset> and C<limit>
175 parametars, see also C<size>.
176
177 =cut
178
179 sub open {
180 my $self = shift;
181 my $arg = {@_};
182
183 my $log = $self->_get_logger();
184
185 $log->logcroak("need path") if (! $arg->{'path'});
186 my $code_page = $arg->{'code_page'} || '852';
187
188 # store data in object
189 $self->{'input_code_page'} = $code_page;
190 foreach my $v (qw/path offset limit/) {
191 $self->{$v} = $arg->{$v} if ($arg->{$v});
192 }
193
194 # create Text::Iconv object
195 $self->{iconv} = Text::Iconv->new($code_page,$self->{'code_page'});
196
197 my $filter_ref;
198
199 if ($self->{recode}) {
200 my @r = split(/\s/, $self->{recode});
201 if ($#r % 2 != 1) {
202 $log->logwarn("recode needs even number of elements (some number of valid pairs)");
203 } else {
204 my $recode;
205 while (@r) {
206 my $from = shift @r;
207 my $to = shift @r;
208 $recode->{$from} = $to;
209 }
210
211 my $regex = join '|' => keys %{ $recode };
212
213 $log->debug("using recode regex: $regex");
214
215 $filter_ref = sub {
216 my $t = shift;
217 $t =~ s/($regex)/$recode->{$1}/g;
218 return $t;
219 };
220
221 }
222
223 }
224
225 my ($db, $size) = $self->{open_db}->( $self,
226 path => $arg->{path},
227 filter => $filter_ref,
228 );
229
230 unless (defined($db)) {
231 $log->logwarn("can't open database $arg->{path}, skipping...");
232 return;
233 }
234
235 unless ($size) {
236 $log->logwarn("no records in database $arg->{path}, skipping...");
237 return;
238 }
239
240 my $from_rec = 1;
241 my $to_rec = $size;
242
243 if (my $s = $self->{offset}) {
244 $log->info("skipping to MFN $s");
245 $from_rec = $s;
246 } else {
247 $self->{offset} = $from_rec;
248 }
249
250 if ($self->{limit}) {
251 $log->debug("limiting to ",$self->{limit}," records");
252 $to_rec = $from_rec + $self->{limit} - 1;
253 $to_rec = $size if ($to_rec > $size);
254 }
255
256 # store size for later
257 $self->{size} = ($to_rec - $from_rec) ? ($to_rec - $from_rec + 1) : 0;
258
259 $log->info("processing $self->{size}/$size records [$from_rec-$to_rec] convert $code_page -> $self->{code_page}", $self->{stats} ? ' [stats]' : '');
260
261 # read database
262 for (my $pos = $from_rec; $pos <= $to_rec; $pos++) {
263
264 $log->debug("position: $pos\n");
265
266 my $rec = $self->{fetch_rec}->($self, $db, $pos );
267
268 $log->debug(sub { Dumper($rec) });
269
270 if (! $rec) {
271 $log->warn("record $pos empty? skipping...");
272 next;
273 }
274
275 # store
276 if ($self->{low_mem}) {
277 $self->{db}->put($pos, $rec);
278 } else {
279 $self->{data}->{$pos} = $rec;
280 }
281
282 # create lookup
283 $self->{'lookup'}->add( $rec ) if ($rec && $self->{'lookup'});
284
285 # update counters for statistics
286 if ($self->{stats}) {
287 map {
288 my $fld = $_;
289 $self->{_stats}->{fld}->{ $fld }++;
290 if (ref($rec->{ $fld }) eq 'ARRAY') {
291 map {
292 if (ref($_) eq 'HASH') {
293 map {
294 $self->{_stats}->{sf}->{ $fld }->{ $_ }++;
295 } keys %{ $_ };
296 } else {
297 $self->{_stats}->{repeatable}->{ $fld }++;
298 }
299 } @{ $rec->{$fld} };
300 }
301 } keys %{ $rec };
302 }
303
304 $self->progress_bar($pos,$to_rec) unless ($self->{no_progress_bar});
305
306 }
307
308 $self->{pos} = -1;
309 $self->{last_pcnt} = 0;
310
311 # store max mfn and return it.
312 $self->{max_pos} = $to_rec;
313 $log->debug("max_pos: $to_rec");
314
315 return $size;
316 }
317
318 =head2 fetch
319
320 Fetch next record from database. It will also displays progress bar.
321
322 my $rec = $isis->fetch;
323
324 Record from this function should probably go to C<data_structure> for
325 normalisation.
326
327 =cut
328
329 sub fetch {
330 my $self = shift;
331
332 my $log = $self->_get_logger();
333
334 $log->logconfess("it seems that you didn't load database!") unless ($self->{pos});
335
336 if ($self->{pos} == -1) {
337 $self->{pos} = $self->{offset};
338 } else {
339 $self->{pos}++;
340 }
341
342 my $mfn = $self->{pos};
343
344 if ($mfn > $self->{max_pos}) {
345 $self->{pos} = $self->{max_pos};
346 $log->debug("at EOF");
347 return;
348 }
349
350 $self->progress_bar($mfn,$self->{max_pos}) unless ($self->{no_progress_bar});
351
352 my $rec;
353
354 if ($self->{low_mem}) {
355 $rec = $self->{db}->get($mfn);
356 } else {
357 $rec = $self->{data}->{$mfn};
358 }
359
360 $rec ||= 0E0;
361 }
362
363 =head2 pos
364
365 Returns current record number (MFN).
366
367 print $isis->pos;
368
369 First record in database has position 1.
370
371 =cut
372
373 sub pos {
374 my $self = shift;
375 return $self->{pos};
376 }
377
378
379 =head2 size
380
381 Returns number of records in database
382
383 print $isis->size;
384
385 Result from this function can be used to loop through all records
386
387 foreach my $mfn ( 1 ... $isis->size ) { ... }
388
389 because it takes into account C<offset> and C<limit>.
390
391 =cut
392
393 sub size {
394 my $self = shift;
395 return $self->{size};
396 }
397
398 =head2 seek
399
400 Seek to specified MFN in file.
401
402 $isis->seek(42);
403
404 First record in database has position 1.
405
406 =cut
407
408 sub seek {
409 my $self = shift;
410 my $pos = shift || return;
411
412 my $log = $self->_get_logger();
413
414 if ($pos < 1) {
415 $log->warn("seek before first record");
416 $pos = 1;
417 } elsif ($pos > $self->{max_pos}) {
418 $log->warn("seek beyond last record");
419 $pos = $self->{max_pos};
420 }
421
422 return $self->{pos} = (($pos - 1) || -1);
423 }
424
425 =head2 stats
426
427 Dump statistics about field and subfield usage
428
429 print Dumper( $input->stats );
430
431 =cut
432
433 sub stats {
434 my $self = shift;
435 return $self->{_stats};
436 }
437
438 =head1 MEMORY USAGE
439
440 C<low_mem> options is double-edged sword. If enabled, WebPAC
441 will run on memory constraint machines (which doesn't have enough
442 physical RAM to create memory structure for whole source database).
443
444 If your machine has 512Mb or more of RAM and database is around 10000 records,
445 memory shouldn't be an issue. If you don't have enough physical RAM, you
446 might consider using virtual memory (if your operating system is handling it
447 well, like on FreeBSD or Linux) instead of dropping to L<DBM::Deep> to handle
448 parsed structure of ISIS database (this is what C<low_mem> option does).
449
450 Hitting swap at end of reading source database is probably o.k. However,
451 hitting swap before 90% will dramatically decrease performance and you will
452 be better off with C<low_mem> and using rest of availble memory for
453 operating system disk cache (Linux is particuallary good about this).
454 However, every access to database record will require disk access, so
455 generation phase will be slower 10-100 times.
456
457 Parsed structures are essential - you just have option to trade RAM memory
458 (which is fast) for disk space (which is slow). Be sure to have planty of
459 disk space if you are using C<low_mem> and thus L<DBM::Deep>.
460
461 However, when WebPAC is running on desktop machines (or laptops :-), it's
462 highly undesireable for system to start swapping. Using C<low_mem> option can
463 reduce WecPAC memory usage to around 64Mb for same database with lookup
464 fields and sorted indexes which stay in RAM. Performance will suffer, but
465 memory usage will really be minimal. It might be also more confortable to
466 run WebPAC reniced on those machines.
467
468
469 =head1 AUTHOR
470
471 Dobrica Pavlinusic, C<< <dpavlin@rot13.org> >>
472
473 =head1 COPYRIGHT & LICENSE
474
475 Copyright 2005 Dobrica Pavlinusic, All Rights Reserved.
476
477 This program is free software; you can redistribute it and/or modify it
478 under the same terms as Perl itself.
479
480 =cut
481
482 1; # End of WebPAC::Input

  ViewVC Help
Powered by ViewVC 1.1.26