/[webpac2]/trunk/lib/WebPAC/Input.pm
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Annotation of /trunk/lib/WebPAC/Input.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 598 - (hide annotations)
Thu Jul 13 13:55:15 2006 UTC (17 years, 9 months ago) by dpavlin
File size: 14523 byte(s)
 r834@llin:  dpavlin | 2006-07-13 14:49:23 +0200
 fix pod

1 dpavlin 1 package WebPAC::Input;
2    
3     use warnings;
4     use strict;
5    
6 dpavlin 507 use blib;
7    
8 dpavlin 487 use WebPAC::Common;
9 dpavlin 285 use base qw/WebPAC::Common/;
10     use Text::Iconv;
11 dpavlin 308 use Data::Dumper;
12 dpavlin 285
13 dpavlin 1 =head1 NAME
14    
15 dpavlin 286 WebPAC::Input - read different file formats into WebPAC
16 dpavlin 1
17     =head1 VERSION
18    
19 dpavlin 593 Version 0.08
20 dpavlin 1
21     =cut
22    
23 dpavlin 593 our $VERSION = '0.08';
24 dpavlin 1
25     =head1 SYNOPSIS
26    
27 dpavlin 286 This module implements input as database which have fixed and known
28     I<size> while indexing and single unique numeric identifier for database
29     position ranging from 1 to I<size>.
30 dpavlin 1
31 dpavlin 286 Simply, something that is indexed by unmber from 1 .. I<size>.
32    
33     Examples of such databases are CDS/ISIS files, MARC files, lines in
34     text file, and so on.
35    
36     Specific file formats are implemented using low-level interface modules,
37     located in C<WebPAC::Input::*> namespace which export C<open_db>,
38     C<fetch_rec> and optional C<init> functions.
39    
40 dpavlin 1 Perhaps a little code snippet.
41    
42 dpavlin 597 use WebPAC::Input;
43 dpavlin 1
44 dpavlin 597 my $db = WebPAC::Input->new(
45     module => 'WebPAC::Input::ISIS',
46 dpavlin 286 low_mem => 1,
47 dpavlin 597 );
48 dpavlin 1
49 dpavlin 597 $db->open( path => '/path/to/database' );
50 dpavlin 416 print "database size: ",$db->size,"\n";
51     while (my $rec = $db->fetch) {
52     # do something with $rec
53     }
54 dpavlin 1
55 dpavlin 286
56    
57 dpavlin 1 =head1 FUNCTIONS
58    
59 dpavlin 3 =head2 new
60 dpavlin 1
61 dpavlin 3 Create new input database object.
62    
63 dpavlin 9 my $db = new WebPAC::Input(
64 dpavlin 286 module => 'WebPAC::Input::MARC',
65 dpavlin 585 encoding => 'ISO-8859-2',
66 dpavlin 10 low_mem => 1,
67 dpavlin 416 recode => 'char pairs',
68 dpavlin 483 no_progress_bar => 1,
69 dpavlin 9 );
70 dpavlin 3
71 dpavlin 597 C<module> is low-level file format module. See L<WebPAC::Input::ISIS> and
72 dpavlin 286 L<WebPAC::Input::MARC>.
73    
74 dpavlin 585 Optional parametar C<encoding> specify application code page (which will be
75 dpavlin 9 used internally). This should probably be your terminal encoding, and by
76     default, it C<ISO-8859-2>.
77    
78 dpavlin 10 Default is not to use C<low_mem> options (see L<MEMORY USAGE> below).
79    
80 dpavlin 483 C<recode> is optional string constisting of character or words pairs that
81     should be replaced in input stream.
82    
83     C<no_progress_bar> disables progress bar output on C<STDOUT>
84    
85 dpavlin 285 This function will also call low-level C<init> if it exists with same
86     parametars.
87    
88 dpavlin 1 =cut
89    
90 dpavlin 3 sub new {
91 dpavlin 285 my $class = shift;
92     my $self = {@_};
93 dpavlin 3 bless($self, $class);
94    
95 dpavlin 285 my $log = $self->_get_logger;
96    
97 dpavlin 585 $log->logconfess("code_page argument is not suppored any more. change it to encoding") if ($self->{lookup});
98     $log->logconfess("lookup argument is not suppored any more. rewrite call to lookup_ref") if ($self->{lookup});
99    
100 dpavlin 286 $log->logconfess("specify low-level file format module") unless ($self->{module});
101     my $module = $self->{module};
102     $module =~ s#::#/#g;
103     $module .= '.pm';
104     $log->debug("require low-level module $self->{module} from $module");
105 dpavlin 289
106 dpavlin 286 require $module;
107 dpavlin 289 #eval $self->{module} .'->import';
108 dpavlin 286
109 dpavlin 285 # check if required subclasses are implemented
110 dpavlin 289 foreach my $subclass (qw/open_db fetch_rec init/) {
111     my $n = $self->{module} . '::' . $subclass;
112     if (! defined &{ $n }) {
113 dpavlin 290 my $missing = "missing $subclass in $self->{module}";
114 dpavlin 301 $self->{$subclass} = sub { $log->logwarn($missing) };
115 dpavlin 286 } else {
116 dpavlin 289 $self->{$subclass} = \&{ $n };
117 dpavlin 286 }
118 dpavlin 285 }
119    
120 dpavlin 289 if ($self->{init}) {
121 dpavlin 285 $log->debug("calling init");
122 dpavlin 289 $self->{init}->($self, @_);
123 dpavlin 285 }
124    
125 dpavlin 585 $self->{'encoding'} ||= 'ISO-8859-2';
126 dpavlin 9
127 dpavlin 10 # running with low_mem flag? well, use DBM::Deep then.
128     if ($self->{'low_mem'}) {
129     $log->info("running with low_mem which impacts performance (<32 Mb memory usage)");
130    
131     my $db_file = "data.db";
132    
133     if (-e $db_file) {
134     unlink $db_file or $log->logdie("can't remove '$db_file' from last run");
135     $log->debug("removed '$db_file' from last run");
136     }
137    
138     require DBM::Deep;
139    
140     my $db = new DBM::Deep $db_file;
141    
142     $log->logdie("DBM::Deep error: $!") unless ($db);
143    
144     if ($db->error()) {
145     $log->logdie("can't open '$db_file' under low_mem: ",$db->error());
146     } else {
147     $log->debug("using file '$db_file' for DBM::Deep");
148     }
149    
150     $self->{'db'} = $db;
151     }
152    
153 dpavlin 3 $self ? return $self : return undef;
154 dpavlin 1 }
155    
156 dpavlin 285 =head2 open
157    
158     This function will read whole database in memory and produce lookups.
159    
160 dpavlin 286 $input->open(
161 dpavlin 285 path => '/path/to/database/file',
162     code_page => '852',
163 dpavlin 286 limit => 500,
164     offset => 6000,
165 dpavlin 285 lookup => $lookup_obj,
166 dpavlin 506 stats => 1,
167 dpavlin 585 lookup_ref => sub {
168     my ($k,$v) = @_;
169     # store lookup $k => $v
170     },
171 dpavlin 597 modify_records => {
172     900 => { '^a' => { ' : ' => '^b' } },
173     901 => { '*' => { '^b' => ' ; ' } },
174     },
175 dpavlin 285 );
176    
177     By default, C<code_page> is assumed to be C<852>.
178    
179 dpavlin 286 C<offset> is optional parametar to position at some offset before reading from database.
180 dpavlin 285
181 dpavlin 286 C<limit> is optional parametar to read just C<limit> records from database
182 dpavlin 285
183 dpavlin 506 C<stats> create optional report about usage of fields and subfields
184    
185 dpavlin 597 C<lookup_coderef> is closure to call when adding C<< key => 'value' >> combinations to
186 dpavlin 585 lookup.
187    
188 dpavlin 597 C<modify_records> specify mapping from subfields to delimiters or from
189     delimiters to subfields, as well as oprations on fields (if subfield is
190     defined as C<*>.
191    
192 dpavlin 286 Returns size of database, regardless of C<offset> and C<limit>
193     parametars, see also C<size>.
194 dpavlin 285
195     =cut
196    
197     sub open {
198     my $self = shift;
199     my $arg = {@_};
200    
201     my $log = $self->_get_logger();
202    
203 dpavlin 585 $log->logconfess("lookup argument is not suppored any more. rewrite call to lookup_coderef") if ($arg->{lookup});
204     $log->logconfess("lookup_coderef must be CODE, not ",ref($arg->{lookup_coderef}))
205     if ($arg->{lookup_coderef} && ref($arg->{lookup_coderef}) ne 'CODE');
206    
207 dpavlin 285 $log->logcroak("need path") if (! $arg->{'path'});
208     my $code_page = $arg->{'code_page'} || '852';
209    
210     # store data in object
211 dpavlin 292 $self->{'input_code_page'} = $code_page;
212 dpavlin 286 foreach my $v (qw/path offset limit/) {
213 dpavlin 285 $self->{$v} = $arg->{$v} if ($arg->{$v});
214     }
215    
216     # create Text::Iconv object
217 dpavlin 585 $self->{iconv} = Text::Iconv->new($code_page,$self->{'encoding'}); ## FIXME remove!
218 dpavlin 285
219 dpavlin 416 my $filter_ref;
220 dpavlin 597 my $recode_regex;
221     my $recode_map;
222 dpavlin 416
223     if ($self->{recode}) {
224     my @r = split(/\s/, $self->{recode});
225     if ($#r % 2 != 1) {
226     $log->logwarn("recode needs even number of elements (some number of valid pairs)");
227     } else {
228     while (@r) {
229     my $from = shift @r;
230     my $to = shift @r;
231 dpavlin 597 $recode_map->{$from} = $to;
232 dpavlin 416 }
233    
234 dpavlin 597 $recode_regex = join '|' => keys %{ $recode_map };
235 dpavlin 416
236 dpavlin 597 $log->debug("using recode regex: $recode_regex");
237 dpavlin 416 }
238    
239     }
240    
241 dpavlin 597 my $rec_regex = $self->modify_record_regexps(%{ $arg->{modify_records} });
242     $log->debug("rec_regex: ", Dumper($rec_regex));
243    
244 dpavlin 289 my ($db, $size) = $self->{open_db}->( $self,
245 dpavlin 285 path => $arg->{path},
246 dpavlin 597 filter => sub {
247     my ($l,$f_nr) = @_;
248     return unless defined($l);
249    
250     ## FIXME remove iconv!
251     $l = $self->{iconv}->convert($l) if ($self->{iconv});
252    
253     $l =~ s/($recode_regex)/$recode_map->{$1}/g if ($recode_regex && $recode_map);
254    
255     return $l unless ($rec_regex);
256    
257     # apply regexps
258     if ($rec_regex && defined($rec_regex->{$f_nr})) {
259     $log->logconfess("regexps->{$f_nr} must be ARRAY") if (ref($rec_regex->{$f_nr}) ne 'ARRAY');
260     my $c = 0;
261     foreach my $r (@{ $rec_regex->{$f_nr} }) {
262     while ( eval '$l =~ ' . $r ) { $c++ };
263     }
264     warn "## field $f_nr triggered $c regexpes\n" if ($c && $self->{debug});
265     }
266    
267     return $l;
268     },
269 dpavlin 523 %{ $arg },
270 dpavlin 285 );
271    
272 dpavlin 496 unless (defined($db)) {
273 dpavlin 285 $log->logwarn("can't open database $arg->{path}, skipping...");
274     return;
275     }
276    
277     unless ($size) {
278     $log->logwarn("no records in database $arg->{path}, skipping...");
279     return;
280     }
281    
282 dpavlin 339 my $from_rec = 1;
283     my $to_rec = $size;
284 dpavlin 285
285 dpavlin 286 if (my $s = $self->{offset}) {
286 dpavlin 513 $log->debug("skipping to MFN $s");
287 dpavlin 339 $from_rec = $s;
288 dpavlin 285 } else {
289 dpavlin 339 $self->{offset} = $from_rec;
290 dpavlin 285 }
291    
292 dpavlin 286 if ($self->{limit}) {
293 dpavlin 301 $log->debug("limiting to ",$self->{limit}," records");
294 dpavlin 339 $to_rec = $from_rec + $self->{limit} - 1;
295     $to_rec = $size if ($to_rec > $size);
296 dpavlin 285 }
297    
298     # store size for later
299 dpavlin 339 $self->{size} = ($to_rec - $from_rec) ? ($to_rec - $from_rec + 1) : 0;
300 dpavlin 285
301 dpavlin 585 $log->info("processing $self->{size}/$size records [$from_rec-$to_rec] convert $code_page -> $self->{encoding}", $self->{stats} ? ' [stats]' : '');
302 dpavlin 285
303     # read database
304 dpavlin 339 for (my $pos = $from_rec; $pos <= $to_rec; $pos++) {
305 dpavlin 285
306 dpavlin 286 $log->debug("position: $pos\n");
307 dpavlin 285
308 dpavlin 289 my $rec = $self->{fetch_rec}->($self, $db, $pos );
309 dpavlin 285
310 dpavlin 308 $log->debug(sub { Dumper($rec) });
311    
312 dpavlin 285 if (! $rec) {
313 dpavlin 286 $log->warn("record $pos empty? skipping...");
314 dpavlin 285 next;
315     }
316    
317     # store
318 dpavlin 286 if ($self->{low_mem}) {
319     $self->{db}->put($pos, $rec);
320 dpavlin 285 } else {
321 dpavlin 286 $self->{data}->{$pos} = $rec;
322 dpavlin 285 }
323    
324     # create lookup
325 dpavlin 585 $arg->{'lookup_coderef'}->( $rec ) if ($rec && $arg->{'lookup_coderef'});
326 dpavlin 285
327 dpavlin 506 # update counters for statistics
328     if ($self->{stats}) {
329 dpavlin 593
330     foreach my $fld (keys %{ $rec }) {
331 dpavlin 506 $self->{_stats}->{fld}->{ $fld }++;
332 dpavlin 593
333     $log->logdie("invalid record fild $fld, not ARRAY")
334     unless (ref($rec->{ $fld }) eq 'ARRAY');
335    
336     foreach my $row (@{ $rec->{$fld} }) {
337    
338     if (ref($row) eq 'HASH') {
339    
340     foreach my $sf (keys %{ $row }) {
341     $self->{_stats}->{sf}->{ $fld }->{ $sf }->{count}++;
342     $self->{_stats}->{sf}->{ $fld }->{ $sf }->{repeatable}++
343     if (ref($row->{$sf}) eq 'ARRAY');
344 dpavlin 506 }
345 dpavlin 593
346     } else {
347     $self->{_stats}->{repeatable}->{ $fld }++;
348     }
349 dpavlin 506 }
350 dpavlin 593 }
351 dpavlin 506 }
352    
353 dpavlin 483 $self->progress_bar($pos,$to_rec) unless ($self->{no_progress_bar});
354 dpavlin 285
355     }
356    
357 dpavlin 286 $self->{pos} = -1;
358     $self->{last_pcnt} = 0;
359 dpavlin 285
360     # store max mfn and return it.
361 dpavlin 339 $self->{max_pos} = $to_rec;
362     $log->debug("max_pos: $to_rec");
363 dpavlin 285
364     return $size;
365     }
366    
367     =head2 fetch
368    
369     Fetch next record from database. It will also displays progress bar.
370    
371     my $rec = $isis->fetch;
372    
373     Record from this function should probably go to C<data_structure> for
374     normalisation.
375    
376     =cut
377    
378     sub fetch {
379     my $self = shift;
380    
381     my $log = $self->_get_logger();
382    
383 dpavlin 286 $log->logconfess("it seems that you didn't load database!") unless ($self->{pos});
384 dpavlin 285
385 dpavlin 286 if ($self->{pos} == -1) {
386     $self->{pos} = $self->{offset};
387 dpavlin 285 } else {
388 dpavlin 286 $self->{pos}++;
389 dpavlin 285 }
390    
391 dpavlin 286 my $mfn = $self->{pos};
392 dpavlin 285
393 dpavlin 286 if ($mfn > $self->{max_pos}) {
394     $self->{pos} = $self->{max_pos};
395 dpavlin 285 $log->debug("at EOF");
396     return;
397     }
398    
399 dpavlin 483 $self->progress_bar($mfn,$self->{max_pos}) unless ($self->{no_progress_bar});
400 dpavlin 285
401     my $rec;
402    
403 dpavlin 286 if ($self->{low_mem}) {
404     $rec = $self->{db}->get($mfn);
405 dpavlin 285 } else {
406 dpavlin 286 $rec = $self->{data}->{$mfn};
407 dpavlin 285 }
408    
409     $rec ||= 0E0;
410     }
411    
412     =head2 pos
413    
414     Returns current record number (MFN).
415    
416     print $isis->pos;
417    
418     First record in database has position 1.
419    
420     =cut
421    
422     sub pos {
423     my $self = shift;
424 dpavlin 286 return $self->{pos};
425 dpavlin 285 }
426    
427    
428     =head2 size
429    
430     Returns number of records in database
431    
432     print $isis->size;
433    
434     Result from this function can be used to loop through all records
435    
436     foreach my $mfn ( 1 ... $isis->size ) { ... }
437    
438 dpavlin 286 because it takes into account C<offset> and C<limit>.
439 dpavlin 285
440     =cut
441    
442     sub size {
443     my $self = shift;
444 dpavlin 286 return $self->{size};
445 dpavlin 285 }
446    
447     =head2 seek
448    
449     Seek to specified MFN in file.
450    
451     $isis->seek(42);
452    
453     First record in database has position 1.
454    
455     =cut
456    
457     sub seek {
458     my $self = shift;
459     my $pos = shift || return;
460    
461     my $log = $self->_get_logger();
462    
463     if ($pos < 1) {
464     $log->warn("seek before first record");
465     $pos = 1;
466 dpavlin 286 } elsif ($pos > $self->{max_pos}) {
467 dpavlin 285 $log->warn("seek beyond last record");
468 dpavlin 286 $pos = $self->{max_pos};
469 dpavlin 285 }
470    
471 dpavlin 286 return $self->{pos} = (($pos - 1) || -1);
472 dpavlin 285 }
473    
474 dpavlin 506 =head2 stats
475 dpavlin 285
476 dpavlin 506 Dump statistics about field and subfield usage
477    
478 dpavlin 507 print $input->stats;
479 dpavlin 506
480     =cut
481    
482     sub stats {
483     my $self = shift;
484 dpavlin 507
485     my $log = $self->_get_logger();
486    
487     my $s = $self->{_stats};
488     if (! $s) {
489     $log->warn("called stats, but there is no statistics collected");
490     return;
491     }
492    
493     my $max_fld = 0;
494    
495     my $out = join("\n",
496     map {
497     my $f = $_ || die "no field";
498     my $v = $s->{fld}->{$f} || die "no s->{fld}->{$f}";
499     $max_fld = $v if ($v > $max_fld);
500    
501 dpavlin 519 my $o = sprintf("%4s %d ~", $f, $v);
502 dpavlin 507
503     if (defined($s->{sf}->{$f})) {
504     map {
505 dpavlin 593 $o .= sprintf(" %s:%d%s", $_,
506     $s->{sf}->{$f}->{$_}->{count},
507     $s->{sf}->{$f}->{$_}->{repeatable} ? '*' : '',
508     );
509 dpavlin 507 } sort keys %{ $s->{sf}->{$f} };
510     }
511    
512     if (my $v_r = $s->{repeatable}->{$f}) {
513     $o .= " ($v_r)" if ($v_r != $v);
514     }
515    
516     $o;
517 dpavlin 519 } sort { $a cmp $b } keys %{ $s->{fld} }
518 dpavlin 507 );
519    
520     $log->debug( sub { Dumper($s) } );
521    
522     return $out;
523 dpavlin 506 }
524    
525 dpavlin 598 =head2 modify_record_regexps
526 dpavlin 597
527     Generate hash with regexpes to be applied using L<filter>.
528    
529     my $regexpes = $input->modify_record_regexps(
530     900 => { '^a' => { ' : ' => '^b' } },
531     901 => { '*' => { '^b' => ' ; ' } },
532     );
533    
534     =cut
535    
536     sub modify_record_regexps {
537     my $self = shift;
538     my $modify_record = {@_};
539    
540     my $regexpes;
541    
542     foreach my $f (keys %$modify_record) {
543     warn "--- f: $f\n";
544     foreach my $sf (keys %{ $modify_record->{$f} }) {
545     warn "---- sf: $sf\n";
546     foreach my $from (keys %{ $modify_record->{$f}->{$sf} }) {
547     my $to = $modify_record->{$f}->{$sf}->{$from};
548     #die "no field?" unless defined($to);
549     warn "----- transform: |$from| -> |$to|\n";
550    
551     if ($sf =~ /^\^/) {
552     my $regex =
553     's/\Q'. $sf .'\E([^\^]+)\Q'. $from .'\E([^\^]+)/'. $sf .'$1'. $to .'$2/g';
554     push @{ $regexpes->{$f} }, $regex;
555     warn ">>>>> $regex [sf]\n";
556     } else {
557     my $regex =
558     's/\Q'. $from .'\E/'. $to .'/g';
559     push @{ $regexpes->{$f} }, $regex;
560     warn ">>>>> $regex [global]\n";
561     }
562    
563     }
564     }
565     }
566    
567     return $regexpes;
568     }
569    
570 dpavlin 3 =head1 MEMORY USAGE
571 dpavlin 1
572 dpavlin 3 C<low_mem> options is double-edged sword. If enabled, WebPAC
573     will run on memory constraint machines (which doesn't have enough
574     physical RAM to create memory structure for whole source database).
575 dpavlin 1
576 dpavlin 3 If your machine has 512Mb or more of RAM and database is around 10000 records,
577     memory shouldn't be an issue. If you don't have enough physical RAM, you
578     might consider using virtual memory (if your operating system is handling it
579     well, like on FreeBSD or Linux) instead of dropping to L<DBM::Deep> to handle
580     parsed structure of ISIS database (this is what C<low_mem> option does).
581 dpavlin 1
582 dpavlin 3 Hitting swap at end of reading source database is probably o.k. However,
583     hitting swap before 90% will dramatically decrease performance and you will
584     be better off with C<low_mem> and using rest of availble memory for
585     operating system disk cache (Linux is particuallary good about this).
586     However, every access to database record will require disk access, so
587     generation phase will be slower 10-100 times.
588    
589     Parsed structures are essential - you just have option to trade RAM memory
590     (which is fast) for disk space (which is slow). Be sure to have planty of
591     disk space if you are using C<low_mem> and thus L<DBM::Deep>.
592    
593     However, when WebPAC is running on desktop machines (or laptops :-), it's
594     highly undesireable for system to start swapping. Using C<low_mem> option can
595     reduce WecPAC memory usage to around 64Mb for same database with lookup
596     fields and sorted indexes which stay in RAM. Performance will suffer, but
597     memory usage will really be minimal. It might be also more confortable to
598     run WebPAC reniced on those machines.
599    
600    
601     =head1 AUTHOR
602    
603     Dobrica Pavlinusic, C<< <dpavlin@rot13.org> >>
604    
605 dpavlin 1 =head1 COPYRIGHT & LICENSE
606    
607     Copyright 2005 Dobrica Pavlinusic, All Rights Reserved.
608    
609     This program is free software; you can redistribute it and/or modify it
610     under the same terms as Perl itself.
611    
612     =cut
613    
614     1; # End of WebPAC::Input

  ViewVC Help
Powered by ViewVC 1.1.26