/[webpac2]/trunk/lib/WebPAC/Input.pm
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Annotation of /trunk/lib/WebPAC/Input.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 619 - (hide annotations)
Fri Aug 25 12:31:06 2006 UTC (17 years, 8 months ago) by dpavlin
File size: 14936 byte(s)
 r867@llin:  dpavlin | 2006-08-25 14:32:05 +0200
 statistics now show data before modify_records

1 dpavlin 1 package WebPAC::Input;
2    
3     use warnings;
4     use strict;
5    
6 dpavlin 507 use blib;
7    
8 dpavlin 487 use WebPAC::Common;
9 dpavlin 285 use base qw/WebPAC::Common/;
10     use Text::Iconv;
11 dpavlin 308 use Data::Dumper;
12 dpavlin 285
13 dpavlin 1 =head1 NAME
14    
15 dpavlin 286 WebPAC::Input - read different file formats into WebPAC
16 dpavlin 1
17     =head1 VERSION
18    
19 dpavlin 619 Version 0.11
20 dpavlin 1
21     =cut
22    
23 dpavlin 619 our $VERSION = '0.11';
24 dpavlin 1
25     =head1 SYNOPSIS
26    
27 dpavlin 286 This module implements input as database which have fixed and known
28     I<size> while indexing and single unique numeric identifier for database
29     position ranging from 1 to I<size>.
30 dpavlin 1
31 dpavlin 286 Simply, something that is indexed by unmber from 1 .. I<size>.
32    
33     Examples of such databases are CDS/ISIS files, MARC files, lines in
34     text file, and so on.
35    
36     Specific file formats are implemented using low-level interface modules,
37     located in C<WebPAC::Input::*> namespace which export C<open_db>,
38     C<fetch_rec> and optional C<init> functions.
39    
40 dpavlin 1 Perhaps a little code snippet.
41    
42 dpavlin 597 use WebPAC::Input;
43 dpavlin 1
44 dpavlin 597 my $db = WebPAC::Input->new(
45     module => 'WebPAC::Input::ISIS',
46 dpavlin 286 low_mem => 1,
47 dpavlin 597 );
48 dpavlin 1
49 dpavlin 597 $db->open( path => '/path/to/database' );
50 dpavlin 416 print "database size: ",$db->size,"\n";
51     while (my $rec = $db->fetch) {
52     # do something with $rec
53     }
54 dpavlin 1
55 dpavlin 286
56    
57 dpavlin 1 =head1 FUNCTIONS
58    
59 dpavlin 3 =head2 new
60 dpavlin 1
61 dpavlin 3 Create new input database object.
62    
63 dpavlin 9 my $db = new WebPAC::Input(
64 dpavlin 286 module => 'WebPAC::Input::MARC',
65 dpavlin 585 encoding => 'ISO-8859-2',
66 dpavlin 10 low_mem => 1,
67 dpavlin 416 recode => 'char pairs',
68 dpavlin 483 no_progress_bar => 1,
69 dpavlin 9 );
70 dpavlin 3
71 dpavlin 597 C<module> is low-level file format module. See L<WebPAC::Input::ISIS> and
72 dpavlin 286 L<WebPAC::Input::MARC>.
73    
74 dpavlin 585 Optional parametar C<encoding> specify application code page (which will be
75 dpavlin 9 used internally). This should probably be your terminal encoding, and by
76     default, it C<ISO-8859-2>.
77    
78 dpavlin 10 Default is not to use C<low_mem> options (see L<MEMORY USAGE> below).
79    
80 dpavlin 483 C<recode> is optional string constisting of character or words pairs that
81     should be replaced in input stream.
82    
83     C<no_progress_bar> disables progress bar output on C<STDOUT>
84    
85 dpavlin 285 This function will also call low-level C<init> if it exists with same
86     parametars.
87    
88 dpavlin 1 =cut
89    
90 dpavlin 3 sub new {
91 dpavlin 285 my $class = shift;
92     my $self = {@_};
93 dpavlin 3 bless($self, $class);
94    
95 dpavlin 285 my $log = $self->_get_logger;
96    
97 dpavlin 585 $log->logconfess("code_page argument is not suppored any more. change it to encoding") if ($self->{lookup});
98     $log->logconfess("lookup argument is not suppored any more. rewrite call to lookup_ref") if ($self->{lookup});
99    
100 dpavlin 286 $log->logconfess("specify low-level file format module") unless ($self->{module});
101     my $module = $self->{module};
102     $module =~ s#::#/#g;
103     $module .= '.pm';
104     $log->debug("require low-level module $self->{module} from $module");
105 dpavlin 289
106 dpavlin 286 require $module;
107 dpavlin 289 #eval $self->{module} .'->import';
108 dpavlin 286
109 dpavlin 285 # check if required subclasses are implemented
110 dpavlin 289 foreach my $subclass (qw/open_db fetch_rec init/) {
111     my $n = $self->{module} . '::' . $subclass;
112     if (! defined &{ $n }) {
113 dpavlin 290 my $missing = "missing $subclass in $self->{module}";
114 dpavlin 301 $self->{$subclass} = sub { $log->logwarn($missing) };
115 dpavlin 286 } else {
116 dpavlin 289 $self->{$subclass} = \&{ $n };
117 dpavlin 286 }
118 dpavlin 285 }
119    
120 dpavlin 289 if ($self->{init}) {
121 dpavlin 285 $log->debug("calling init");
122 dpavlin 289 $self->{init}->($self, @_);
123 dpavlin 285 }
124    
125 dpavlin 585 $self->{'encoding'} ||= 'ISO-8859-2';
126 dpavlin 9
127 dpavlin 10 # running with low_mem flag? well, use DBM::Deep then.
128     if ($self->{'low_mem'}) {
129     $log->info("running with low_mem which impacts performance (<32 Mb memory usage)");
130    
131     my $db_file = "data.db";
132    
133     if (-e $db_file) {
134     unlink $db_file or $log->logdie("can't remove '$db_file' from last run");
135     $log->debug("removed '$db_file' from last run");
136     }
137    
138     require DBM::Deep;
139    
140     my $db = new DBM::Deep $db_file;
141    
142     $log->logdie("DBM::Deep error: $!") unless ($db);
143    
144     if ($db->error()) {
145     $log->logdie("can't open '$db_file' under low_mem: ",$db->error());
146     } else {
147     $log->debug("using file '$db_file' for DBM::Deep");
148     }
149    
150     $self->{'db'} = $db;
151     }
152    
153 dpavlin 3 $self ? return $self : return undef;
154 dpavlin 1 }
155    
156 dpavlin 285 =head2 open
157    
158     This function will read whole database in memory and produce lookups.
159    
160 dpavlin 286 $input->open(
161 dpavlin 285 path => '/path/to/database/file',
162     code_page => '852',
163 dpavlin 286 limit => 500,
164     offset => 6000,
165 dpavlin 285 lookup => $lookup_obj,
166 dpavlin 506 stats => 1,
167 dpavlin 585 lookup_ref => sub {
168     my ($k,$v) = @_;
169     # store lookup $k => $v
170     },
171 dpavlin 597 modify_records => {
172     900 => { '^a' => { ' : ' => '^b' } },
173     901 => { '*' => { '^b' => ' ; ' } },
174     },
175 dpavlin 285 );
176    
177     By default, C<code_page> is assumed to be C<852>.
178    
179 dpavlin 286 C<offset> is optional parametar to position at some offset before reading from database.
180 dpavlin 285
181 dpavlin 286 C<limit> is optional parametar to read just C<limit> records from database
182 dpavlin 285
183 dpavlin 506 C<stats> create optional report about usage of fields and subfields
184    
185 dpavlin 597 C<lookup_coderef> is closure to call when adding C<< key => 'value' >> combinations to
186 dpavlin 585 lookup.
187    
188 dpavlin 597 C<modify_records> specify mapping from subfields to delimiters or from
189     delimiters to subfields, as well as oprations on fields (if subfield is
190     defined as C<*>.
191    
192 dpavlin 286 Returns size of database, regardless of C<offset> and C<limit>
193     parametars, see also C<size>.
194 dpavlin 285
195     =cut
196    
197     sub open {
198     my $self = shift;
199     my $arg = {@_};
200    
201     my $log = $self->_get_logger();
202    
203 dpavlin 585 $log->logconfess("lookup argument is not suppored any more. rewrite call to lookup_coderef") if ($arg->{lookup});
204     $log->logconfess("lookup_coderef must be CODE, not ",ref($arg->{lookup_coderef}))
205     if ($arg->{lookup_coderef} && ref($arg->{lookup_coderef}) ne 'CODE');
206    
207 dpavlin 285 $log->logcroak("need path") if (! $arg->{'path'});
208     my $code_page = $arg->{'code_page'} || '852';
209    
210     # store data in object
211 dpavlin 292 $self->{'input_code_page'} = $code_page;
212 dpavlin 286 foreach my $v (qw/path offset limit/) {
213 dpavlin 285 $self->{$v} = $arg->{$v} if ($arg->{$v});
214     }
215    
216     # create Text::Iconv object
217 dpavlin 585 $self->{iconv} = Text::Iconv->new($code_page,$self->{'encoding'}); ## FIXME remove!
218 dpavlin 285
219 dpavlin 416 my $filter_ref;
220 dpavlin 597 my $recode_regex;
221     my $recode_map;
222 dpavlin 416
223     if ($self->{recode}) {
224     my @r = split(/\s/, $self->{recode});
225     if ($#r % 2 != 1) {
226     $log->logwarn("recode needs even number of elements (some number of valid pairs)");
227     } else {
228     while (@r) {
229     my $from = shift @r;
230     my $to = shift @r;
231 dpavlin 597 $recode_map->{$from} = $to;
232 dpavlin 416 }
233    
234 dpavlin 597 $recode_regex = join '|' => keys %{ $recode_map };
235 dpavlin 416
236 dpavlin 597 $log->debug("using recode regex: $recode_regex");
237 dpavlin 416 }
238    
239     }
240    
241 dpavlin 597 my $rec_regex = $self->modify_record_regexps(%{ $arg->{modify_records} });
242     $log->debug("rec_regex: ", Dumper($rec_regex));
243    
244 dpavlin 289 my ($db, $size) = $self->{open_db}->( $self,
245 dpavlin 285 path => $arg->{path},
246 dpavlin 597 filter => sub {
247     my ($l,$f_nr) = @_;
248     return unless defined($l);
249    
250     ## FIXME remove iconv!
251     $l = $self->{iconv}->convert($l) if ($self->{iconv});
252    
253     $l =~ s/($recode_regex)/$recode_map->{$1}/g if ($recode_regex && $recode_map);
254    
255 dpavlin 599 ## FIXME remove this warning when we are sure that none of API is calling
256     ## this wrongly
257 dpavlin 619 warn "filter called without field number" unless ($f_nr);
258 dpavlin 597
259     return $l;
260     },
261 dpavlin 523 %{ $arg },
262 dpavlin 285 );
263    
264 dpavlin 496 unless (defined($db)) {
265 dpavlin 285 $log->logwarn("can't open database $arg->{path}, skipping...");
266     return;
267     }
268    
269     unless ($size) {
270     $log->logwarn("no records in database $arg->{path}, skipping...");
271     return;
272     }
273    
274 dpavlin 339 my $from_rec = 1;
275     my $to_rec = $size;
276 dpavlin 285
277 dpavlin 286 if (my $s = $self->{offset}) {
278 dpavlin 513 $log->debug("skipping to MFN $s");
279 dpavlin 339 $from_rec = $s;
280 dpavlin 285 } else {
281 dpavlin 339 $self->{offset} = $from_rec;
282 dpavlin 285 }
283    
284 dpavlin 286 if ($self->{limit}) {
285 dpavlin 301 $log->debug("limiting to ",$self->{limit}," records");
286 dpavlin 339 $to_rec = $from_rec + $self->{limit} - 1;
287     $to_rec = $size if ($to_rec > $size);
288 dpavlin 285 }
289    
290     # store size for later
291 dpavlin 339 $self->{size} = ($to_rec - $from_rec) ? ($to_rec - $from_rec + 1) : 0;
292 dpavlin 285
293 dpavlin 585 $log->info("processing $self->{size}/$size records [$from_rec-$to_rec] convert $code_page -> $self->{encoding}", $self->{stats} ? ' [stats]' : '');
294 dpavlin 285
295     # read database
296 dpavlin 339 for (my $pos = $from_rec; $pos <= $to_rec; $pos++) {
297 dpavlin 285
298 dpavlin 286 $log->debug("position: $pos\n");
299 dpavlin 285
300 dpavlin 619 my $rec = $self->{fetch_rec}->($self, $db, $pos, sub {
301     my ($l,$f_nr) = @_;
302     return unless defined($l);
303     return $l unless ($rec_regex && $f_nr);
304 dpavlin 285
305 dpavlin 619 # apply regexps
306     if ($rec_regex && defined($rec_regex->{$f_nr})) {
307     $log->logconfess("regexps->{$f_nr} must be ARRAY") if (ref($rec_regex->{$f_nr}) ne 'ARRAY');
308     my $c = 0;
309     foreach my $r (@{ $rec_regex->{$f_nr} }) {
310     #$log->debug("\$l = $l\neval \$l =~ $r");
311     eval '$l =~ ' . $r;
312     $log->error("error applying regex: $r") if ($@);
313     }
314     }
315    
316     return $l;
317     });
318    
319 dpavlin 308 $log->debug(sub { Dumper($rec) });
320    
321 dpavlin 285 if (! $rec) {
322 dpavlin 286 $log->warn("record $pos empty? skipping...");
323 dpavlin 285 next;
324     }
325    
326     # store
327 dpavlin 286 if ($self->{low_mem}) {
328     $self->{db}->put($pos, $rec);
329 dpavlin 285 } else {
330 dpavlin 286 $self->{data}->{$pos} = $rec;
331 dpavlin 285 }
332    
333     # create lookup
334 dpavlin 585 $arg->{'lookup_coderef'}->( $rec ) if ($rec && $arg->{'lookup_coderef'});
335 dpavlin 285
336 dpavlin 506 # update counters for statistics
337     if ($self->{stats}) {
338 dpavlin 593
339 dpavlin 619 # fetch clean record with regexpes applied for statistics
340     my $rec = $self->{fetch_rec}->($self, $db, $pos);
341    
342 dpavlin 593 foreach my $fld (keys %{ $rec }) {
343 dpavlin 506 $self->{_stats}->{fld}->{ $fld }++;
344 dpavlin 593
345     $log->logdie("invalid record fild $fld, not ARRAY")
346     unless (ref($rec->{ $fld }) eq 'ARRAY');
347    
348     foreach my $row (@{ $rec->{$fld} }) {
349    
350     if (ref($row) eq 'HASH') {
351    
352     foreach my $sf (keys %{ $row }) {
353 dpavlin 619 next if ($sf eq 'subfields');
354 dpavlin 593 $self->{_stats}->{sf}->{ $fld }->{ $sf }->{count}++;
355     $self->{_stats}->{sf}->{ $fld }->{ $sf }->{repeatable}++
356     if (ref($row->{$sf}) eq 'ARRAY');
357 dpavlin 506 }
358 dpavlin 593
359     } else {
360     $self->{_stats}->{repeatable}->{ $fld }++;
361     }
362 dpavlin 506 }
363 dpavlin 593 }
364 dpavlin 506 }
365    
366 dpavlin 483 $self->progress_bar($pos,$to_rec) unless ($self->{no_progress_bar});
367 dpavlin 285
368     }
369    
370 dpavlin 286 $self->{pos} = -1;
371     $self->{last_pcnt} = 0;
372 dpavlin 285
373     # store max mfn and return it.
374 dpavlin 339 $self->{max_pos} = $to_rec;
375     $log->debug("max_pos: $to_rec");
376 dpavlin 285
377     return $size;
378     }
379    
380     =head2 fetch
381    
382     Fetch next record from database. It will also displays progress bar.
383    
384     my $rec = $isis->fetch;
385    
386     Record from this function should probably go to C<data_structure> for
387     normalisation.
388    
389     =cut
390    
391     sub fetch {
392     my $self = shift;
393    
394     my $log = $self->_get_logger();
395    
396 dpavlin 286 $log->logconfess("it seems that you didn't load database!") unless ($self->{pos});
397 dpavlin 285
398 dpavlin 286 if ($self->{pos} == -1) {
399     $self->{pos} = $self->{offset};
400 dpavlin 285 } else {
401 dpavlin 286 $self->{pos}++;
402 dpavlin 285 }
403    
404 dpavlin 286 my $mfn = $self->{pos};
405 dpavlin 285
406 dpavlin 286 if ($mfn > $self->{max_pos}) {
407     $self->{pos} = $self->{max_pos};
408 dpavlin 285 $log->debug("at EOF");
409     return;
410     }
411    
412 dpavlin 483 $self->progress_bar($mfn,$self->{max_pos}) unless ($self->{no_progress_bar});
413 dpavlin 285
414     my $rec;
415    
416 dpavlin 286 if ($self->{low_mem}) {
417     $rec = $self->{db}->get($mfn);
418 dpavlin 285 } else {
419 dpavlin 286 $rec = $self->{data}->{$mfn};
420 dpavlin 285 }
421    
422     $rec ||= 0E0;
423     }
424    
425     =head2 pos
426    
427     Returns current record number (MFN).
428    
429     print $isis->pos;
430    
431     First record in database has position 1.
432    
433     =cut
434    
435     sub pos {
436     my $self = shift;
437 dpavlin 286 return $self->{pos};
438 dpavlin 285 }
439    
440    
441     =head2 size
442    
443     Returns number of records in database
444    
445     print $isis->size;
446    
447     Result from this function can be used to loop through all records
448    
449     foreach my $mfn ( 1 ... $isis->size ) { ... }
450    
451 dpavlin 286 because it takes into account C<offset> and C<limit>.
452 dpavlin 285
453     =cut
454    
455     sub size {
456     my $self = shift;
457 dpavlin 286 return $self->{size};
458 dpavlin 285 }
459    
460     =head2 seek
461    
462     Seek to specified MFN in file.
463    
464     $isis->seek(42);
465    
466     First record in database has position 1.
467    
468     =cut
469    
470     sub seek {
471     my $self = shift;
472     my $pos = shift || return;
473    
474     my $log = $self->_get_logger();
475    
476     if ($pos < 1) {
477     $log->warn("seek before first record");
478     $pos = 1;
479 dpavlin 286 } elsif ($pos > $self->{max_pos}) {
480 dpavlin 285 $log->warn("seek beyond last record");
481 dpavlin 286 $pos = $self->{max_pos};
482 dpavlin 285 }
483    
484 dpavlin 286 return $self->{pos} = (($pos - 1) || -1);
485 dpavlin 285 }
486    
487 dpavlin 506 =head2 stats
488 dpavlin 285
489 dpavlin 506 Dump statistics about field and subfield usage
490    
491 dpavlin 507 print $input->stats;
492 dpavlin 506
493     =cut
494    
495     sub stats {
496     my $self = shift;
497 dpavlin 507
498     my $log = $self->_get_logger();
499    
500     my $s = $self->{_stats};
501     if (! $s) {
502     $log->warn("called stats, but there is no statistics collected");
503     return;
504     }
505    
506     my $max_fld = 0;
507    
508     my $out = join("\n",
509     map {
510     my $f = $_ || die "no field";
511     my $v = $s->{fld}->{$f} || die "no s->{fld}->{$f}";
512     $max_fld = $v if ($v > $max_fld);
513    
514 dpavlin 519 my $o = sprintf("%4s %d ~", $f, $v);
515 dpavlin 507
516     if (defined($s->{sf}->{$f})) {
517     map {
518 dpavlin 593 $o .= sprintf(" %s:%d%s", $_,
519     $s->{sf}->{$f}->{$_}->{count},
520     $s->{sf}->{$f}->{$_}->{repeatable} ? '*' : '',
521     );
522 dpavlin 507 } sort keys %{ $s->{sf}->{$f} };
523     }
524    
525     if (my $v_r = $s->{repeatable}->{$f}) {
526     $o .= " ($v_r)" if ($v_r != $v);
527     }
528    
529     $o;
530 dpavlin 519 } sort { $a cmp $b } keys %{ $s->{fld} }
531 dpavlin 507 );
532    
533     $log->debug( sub { Dumper($s) } );
534    
535     return $out;
536 dpavlin 506 }
537    
538 dpavlin 598 =head2 modify_record_regexps
539 dpavlin 597
540     Generate hash with regexpes to be applied using L<filter>.
541    
542     my $regexpes = $input->modify_record_regexps(
543     900 => { '^a' => { ' : ' => '^b' } },
544     901 => { '*' => { '^b' => ' ; ' } },
545     );
546    
547     =cut
548    
549     sub modify_record_regexps {
550     my $self = shift;
551     my $modify_record = {@_};
552    
553     my $regexpes;
554    
555     foreach my $f (keys %$modify_record) {
556     warn "--- f: $f\n";
557     foreach my $sf (keys %{ $modify_record->{$f} }) {
558     warn "---- sf: $sf\n";
559     foreach my $from (keys %{ $modify_record->{$f}->{$sf} }) {
560     my $to = $modify_record->{$f}->{$sf}->{$from};
561     #die "no field?" unless defined($to);
562     warn "----- transform: |$from| -> |$to|\n";
563    
564     if ($sf =~ /^\^/) {
565     my $regex =
566     's/\Q'. $sf .'\E([^\^]+)\Q'. $from .'\E([^\^]+)/'. $sf .'$1'. $to .'$2/g';
567     push @{ $regexpes->{$f} }, $regex;
568     warn ">>>>> $regex [sf]\n";
569     } else {
570     my $regex =
571     's/\Q'. $from .'\E/'. $to .'/g';
572     push @{ $regexpes->{$f} }, $regex;
573     warn ">>>>> $regex [global]\n";
574     }
575    
576     }
577     }
578     }
579    
580     return $regexpes;
581     }
582    
583 dpavlin 3 =head1 MEMORY USAGE
584 dpavlin 1
585 dpavlin 3 C<low_mem> options is double-edged sword. If enabled, WebPAC
586     will run on memory constraint machines (which doesn't have enough
587     physical RAM to create memory structure for whole source database).
588 dpavlin 1
589 dpavlin 3 If your machine has 512Mb or more of RAM and database is around 10000 records,
590     memory shouldn't be an issue. If you don't have enough physical RAM, you
591     might consider using virtual memory (if your operating system is handling it
592     well, like on FreeBSD or Linux) instead of dropping to L<DBM::Deep> to handle
593     parsed structure of ISIS database (this is what C<low_mem> option does).
594 dpavlin 1
595 dpavlin 3 Hitting swap at end of reading source database is probably o.k. However,
596     hitting swap before 90% will dramatically decrease performance and you will
597     be better off with C<low_mem> and using rest of availble memory for
598     operating system disk cache (Linux is particuallary good about this).
599     However, every access to database record will require disk access, so
600     generation phase will be slower 10-100 times.
601    
602     Parsed structures are essential - you just have option to trade RAM memory
603     (which is fast) for disk space (which is slow). Be sure to have planty of
604     disk space if you are using C<low_mem> and thus L<DBM::Deep>.
605    
606     However, when WebPAC is running on desktop machines (or laptops :-), it's
607     highly undesireable for system to start swapping. Using C<low_mem> option can
608     reduce WecPAC memory usage to around 64Mb for same database with lookup
609     fields and sorted indexes which stay in RAM. Performance will suffer, but
610     memory usage will really be minimal. It might be also more confortable to
611     run WebPAC reniced on those machines.
612    
613    
614     =head1 AUTHOR
615    
616     Dobrica Pavlinusic, C<< <dpavlin@rot13.org> >>
617    
618 dpavlin 1 =head1 COPYRIGHT & LICENSE
619    
620 dpavlin 599 Copyright 2005-2006 Dobrica Pavlinusic, All Rights Reserved.
621 dpavlin 1
622     This program is free software; you can redistribute it and/or modify it
623     under the same terms as Perl itself.
624    
625     =cut
626    
627     1; # End of WebPAC::Input

  ViewVC Help
Powered by ViewVC 1.1.26