/[webpac2]/trunk/lib/WebPAC/Input.pm
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Annotation of /trunk/lib/WebPAC/Input.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 624 - (hide annotations)
Sat Aug 26 12:00:31 2006 UTC (17 years, 8 months ago) by dpavlin
File size: 14912 byte(s)
 r877@llin:  dpavlin | 2006-08-25 21:55:05 +0200
 removed traces of Text::Iconv and replaced them with Encode,
 code page 852 is now cp852 (instead of just 852) because Encode
 likes it that way, record encoding is now hard-coded to utf-8

1 dpavlin 1 package WebPAC::Input;
2    
3     use warnings;
4     use strict;
5    
6 dpavlin 507 use blib;
7    
8 dpavlin 487 use WebPAC::Common;
9 dpavlin 285 use base qw/WebPAC::Common/;
10 dpavlin 308 use Data::Dumper;
11 dpavlin 624 use Encode qw/from_to/;
12 dpavlin 285
13 dpavlin 1 =head1 NAME
14    
15 dpavlin 286 WebPAC::Input - read different file formats into WebPAC
16 dpavlin 1
17     =head1 VERSION
18    
19 dpavlin 619 Version 0.11
20 dpavlin 1
21     =cut
22    
23 dpavlin 619 our $VERSION = '0.11';
24 dpavlin 1
25     =head1 SYNOPSIS
26    
27 dpavlin 286 This module implements input as database which have fixed and known
28     I<size> while indexing and single unique numeric identifier for database
29     position ranging from 1 to I<size>.
30 dpavlin 1
31 dpavlin 286 Simply, something that is indexed by unmber from 1 .. I<size>.
32    
33     Examples of such databases are CDS/ISIS files, MARC files, lines in
34     text file, and so on.
35    
36     Specific file formats are implemented using low-level interface modules,
37     located in C<WebPAC::Input::*> namespace which export C<open_db>,
38     C<fetch_rec> and optional C<init> functions.
39    
40 dpavlin 1 Perhaps a little code snippet.
41    
42 dpavlin 597 use WebPAC::Input;
43 dpavlin 1
44 dpavlin 597 my $db = WebPAC::Input->new(
45     module => 'WebPAC::Input::ISIS',
46 dpavlin 286 low_mem => 1,
47 dpavlin 597 );
48 dpavlin 1
49 dpavlin 597 $db->open( path => '/path/to/database' );
50 dpavlin 416 print "database size: ",$db->size,"\n";
51     while (my $rec = $db->fetch) {
52     # do something with $rec
53     }
54 dpavlin 1
55 dpavlin 286
56    
57 dpavlin 1 =head1 FUNCTIONS
58    
59 dpavlin 3 =head2 new
60 dpavlin 1
61 dpavlin 3 Create new input database object.
62    
63 dpavlin 9 my $db = new WebPAC::Input(
64 dpavlin 286 module => 'WebPAC::Input::MARC',
65 dpavlin 585 encoding => 'ISO-8859-2',
66 dpavlin 10 low_mem => 1,
67 dpavlin 416 recode => 'char pairs',
68 dpavlin 483 no_progress_bar => 1,
69 dpavlin 9 );
70 dpavlin 3
71 dpavlin 597 C<module> is low-level file format module. See L<WebPAC::Input::ISIS> and
72 dpavlin 286 L<WebPAC::Input::MARC>.
73    
74 dpavlin 585 Optional parametar C<encoding> specify application code page (which will be
75 dpavlin 9 used internally). This should probably be your terminal encoding, and by
76     default, it C<ISO-8859-2>.
77    
78 dpavlin 10 Default is not to use C<low_mem> options (see L<MEMORY USAGE> below).
79    
80 dpavlin 483 C<recode> is optional string constisting of character or words pairs that
81     should be replaced in input stream.
82    
83     C<no_progress_bar> disables progress bar output on C<STDOUT>
84    
85 dpavlin 285 This function will also call low-level C<init> if it exists with same
86     parametars.
87    
88 dpavlin 1 =cut
89    
90 dpavlin 3 sub new {
91 dpavlin 285 my $class = shift;
92     my $self = {@_};
93 dpavlin 3 bless($self, $class);
94    
95 dpavlin 285 my $log = $self->_get_logger;
96    
97 dpavlin 585 $log->logconfess("code_page argument is not suppored any more. change it to encoding") if ($self->{lookup});
98     $log->logconfess("lookup argument is not suppored any more. rewrite call to lookup_ref") if ($self->{lookup});
99    
100 dpavlin 286 $log->logconfess("specify low-level file format module") unless ($self->{module});
101     my $module = $self->{module};
102     $module =~ s#::#/#g;
103     $module .= '.pm';
104     $log->debug("require low-level module $self->{module} from $module");
105 dpavlin 289
106 dpavlin 286 require $module;
107 dpavlin 289 #eval $self->{module} .'->import';
108 dpavlin 286
109 dpavlin 285 # check if required subclasses are implemented
110 dpavlin 289 foreach my $subclass (qw/open_db fetch_rec init/) {
111     my $n = $self->{module} . '::' . $subclass;
112     if (! defined &{ $n }) {
113 dpavlin 290 my $missing = "missing $subclass in $self->{module}";
114 dpavlin 301 $self->{$subclass} = sub { $log->logwarn($missing) };
115 dpavlin 286 } else {
116 dpavlin 289 $self->{$subclass} = \&{ $n };
117 dpavlin 286 }
118 dpavlin 285 }
119    
120 dpavlin 289 if ($self->{init}) {
121 dpavlin 285 $log->debug("calling init");
122 dpavlin 289 $self->{init}->($self, @_);
123 dpavlin 285 }
124    
125 dpavlin 585 $self->{'encoding'} ||= 'ISO-8859-2';
126 dpavlin 9
127 dpavlin 10 # running with low_mem flag? well, use DBM::Deep then.
128     if ($self->{'low_mem'}) {
129     $log->info("running with low_mem which impacts performance (<32 Mb memory usage)");
130    
131     my $db_file = "data.db";
132    
133     if (-e $db_file) {
134     unlink $db_file or $log->logdie("can't remove '$db_file' from last run");
135     $log->debug("removed '$db_file' from last run");
136     }
137    
138     require DBM::Deep;
139    
140     my $db = new DBM::Deep $db_file;
141    
142     $log->logdie("DBM::Deep error: $!") unless ($db);
143    
144     if ($db->error()) {
145     $log->logdie("can't open '$db_file' under low_mem: ",$db->error());
146     } else {
147     $log->debug("using file '$db_file' for DBM::Deep");
148     }
149    
150     $self->{'db'} = $db;
151     }
152    
153 dpavlin 3 $self ? return $self : return undef;
154 dpavlin 1 }
155    
156 dpavlin 285 =head2 open
157    
158     This function will read whole database in memory and produce lookups.
159    
160 dpavlin 286 $input->open(
161 dpavlin 285 path => '/path/to/database/file',
162 dpavlin 624 code_page => 'cp852',
163 dpavlin 286 limit => 500,
164     offset => 6000,
165 dpavlin 285 lookup => $lookup_obj,
166 dpavlin 506 stats => 1,
167 dpavlin 585 lookup_ref => sub {
168     my ($k,$v) = @_;
169     # store lookup $k => $v
170     },
171 dpavlin 597 modify_records => {
172     900 => { '^a' => { ' : ' => '^b' } },
173     901 => { '*' => { '^b' => ' ; ' } },
174     },
175 dpavlin 285 );
176    
177 dpavlin 624 By default, C<code_page> is assumed to be C<cp852>.
178 dpavlin 285
179 dpavlin 286 C<offset> is optional parametar to position at some offset before reading from database.
180 dpavlin 285
181 dpavlin 286 C<limit> is optional parametar to read just C<limit> records from database
182 dpavlin 285
183 dpavlin 506 C<stats> create optional report about usage of fields and subfields
184    
185 dpavlin 597 C<lookup_coderef> is closure to call when adding C<< key => 'value' >> combinations to
186 dpavlin 585 lookup.
187    
188 dpavlin 597 C<modify_records> specify mapping from subfields to delimiters or from
189     delimiters to subfields, as well as oprations on fields (if subfield is
190     defined as C<*>.
191    
192 dpavlin 286 Returns size of database, regardless of C<offset> and C<limit>
193     parametars, see also C<size>.
194 dpavlin 285
195     =cut
196    
197     sub open {
198     my $self = shift;
199     my $arg = {@_};
200    
201     my $log = $self->_get_logger();
202    
203 dpavlin 585 $log->logconfess("lookup argument is not suppored any more. rewrite call to lookup_coderef") if ($arg->{lookup});
204     $log->logconfess("lookup_coderef must be CODE, not ",ref($arg->{lookup_coderef}))
205     if ($arg->{lookup_coderef} && ref($arg->{lookup_coderef}) ne 'CODE');
206    
207 dpavlin 285 $log->logcroak("need path") if (! $arg->{'path'});
208 dpavlin 624 my $code_page = $arg->{'code_page'} || 'cp852';
209 dpavlin 285
210     # store data in object
211 dpavlin 292 $self->{'input_code_page'} = $code_page;
212 dpavlin 286 foreach my $v (qw/path offset limit/) {
213 dpavlin 285 $self->{$v} = $arg->{$v} if ($arg->{$v});
214     }
215    
216 dpavlin 416 my $filter_ref;
217 dpavlin 597 my $recode_regex;
218     my $recode_map;
219 dpavlin 416
220     if ($self->{recode}) {
221     my @r = split(/\s/, $self->{recode});
222     if ($#r % 2 != 1) {
223     $log->logwarn("recode needs even number of elements (some number of valid pairs)");
224     } else {
225     while (@r) {
226     my $from = shift @r;
227     my $to = shift @r;
228 dpavlin 597 $recode_map->{$from} = $to;
229 dpavlin 416 }
230    
231 dpavlin 597 $recode_regex = join '|' => keys %{ $recode_map };
232 dpavlin 416
233 dpavlin 597 $log->debug("using recode regex: $recode_regex");
234 dpavlin 416 }
235    
236     }
237    
238 dpavlin 597 my $rec_regex = $self->modify_record_regexps(%{ $arg->{modify_records} });
239     $log->debug("rec_regex: ", Dumper($rec_regex));
240    
241 dpavlin 289 my ($db, $size) = $self->{open_db}->( $self,
242 dpavlin 285 path => $arg->{path},
243 dpavlin 624 # filter => sub {
244     # my ($l,$f_nr) = @_;
245     # return unless defined($l);
246     # from_to($l, $code_page, $self->{'encoding'});
247     # $l =~ s/($recode_regex)/$recode_map->{$1}/g if ($recode_regex && $recode_map);
248     # return $l;
249     # },
250 dpavlin 523 %{ $arg },
251 dpavlin 285 );
252    
253 dpavlin 496 unless (defined($db)) {
254 dpavlin 285 $log->logwarn("can't open database $arg->{path}, skipping...");
255     return;
256     }
257    
258     unless ($size) {
259     $log->logwarn("no records in database $arg->{path}, skipping...");
260     return;
261     }
262    
263 dpavlin 339 my $from_rec = 1;
264     my $to_rec = $size;
265 dpavlin 285
266 dpavlin 286 if (my $s = $self->{offset}) {
267 dpavlin 513 $log->debug("skipping to MFN $s");
268 dpavlin 339 $from_rec = $s;
269 dpavlin 285 } else {
270 dpavlin 339 $self->{offset} = $from_rec;
271 dpavlin 285 }
272    
273 dpavlin 286 if ($self->{limit}) {
274 dpavlin 301 $log->debug("limiting to ",$self->{limit}," records");
275 dpavlin 339 $to_rec = $from_rec + $self->{limit} - 1;
276     $to_rec = $size if ($to_rec > $size);
277 dpavlin 285 }
278    
279     # store size for later
280 dpavlin 339 $self->{size} = ($to_rec - $from_rec) ? ($to_rec - $from_rec + 1) : 0;
281 dpavlin 285
282 dpavlin 585 $log->info("processing $self->{size}/$size records [$from_rec-$to_rec] convert $code_page -> $self->{encoding}", $self->{stats} ? ' [stats]' : '');
283 dpavlin 285
284     # read database
285 dpavlin 339 for (my $pos = $from_rec; $pos <= $to_rec; $pos++) {
286 dpavlin 285
287 dpavlin 286 $log->debug("position: $pos\n");
288 dpavlin 285
289 dpavlin 619 my $rec = $self->{fetch_rec}->($self, $db, $pos, sub {
290     my ($l,$f_nr) = @_;
291 dpavlin 624 # return unless defined($l);
292     # return $l unless ($rec_regex && $f_nr);
293 dpavlin 285
294 dpavlin 624 warn "## --> $f_nr ## $l\n";
295     # codepage conversion and recode_regex
296     # from_to($l, $code_page, $self->{'encoding'});
297     from_to($l, $code_page, 'utf-8');
298     $l =~ s/($recode_regex)/$recode_map->{$1}/g if ($recode_regex && $recode_map);
299    
300 dpavlin 619 # apply regexps
301     if ($rec_regex && defined($rec_regex->{$f_nr})) {
302     $log->logconfess("regexps->{$f_nr} must be ARRAY") if (ref($rec_regex->{$f_nr}) ne 'ARRAY');
303     my $c = 0;
304     foreach my $r (@{ $rec_regex->{$f_nr} }) {
305     #$log->debug("\$l = $l\neval \$l =~ $r");
306     eval '$l =~ ' . $r;
307     $log->error("error applying regex: $r") if ($@);
308     }
309     }
310    
311 dpavlin 624 warn "## <-- $f_nr ## $l\n";
312 dpavlin 619 return $l;
313     });
314    
315 dpavlin 308 $log->debug(sub { Dumper($rec) });
316    
317 dpavlin 285 if (! $rec) {
318 dpavlin 286 $log->warn("record $pos empty? skipping...");
319 dpavlin 285 next;
320     }
321    
322     # store
323 dpavlin 286 if ($self->{low_mem}) {
324     $self->{db}->put($pos, $rec);
325 dpavlin 285 } else {
326 dpavlin 286 $self->{data}->{$pos} = $rec;
327 dpavlin 285 }
328    
329     # create lookup
330 dpavlin 585 $arg->{'lookup_coderef'}->( $rec ) if ($rec && $arg->{'lookup_coderef'});
331 dpavlin 285
332 dpavlin 506 # update counters for statistics
333     if ($self->{stats}) {
334 dpavlin 593
335 dpavlin 619 # fetch clean record with regexpes applied for statistics
336     my $rec = $self->{fetch_rec}->($self, $db, $pos);
337    
338 dpavlin 593 foreach my $fld (keys %{ $rec }) {
339 dpavlin 506 $self->{_stats}->{fld}->{ $fld }++;
340 dpavlin 593
341     $log->logdie("invalid record fild $fld, not ARRAY")
342     unless (ref($rec->{ $fld }) eq 'ARRAY');
343    
344     foreach my $row (@{ $rec->{$fld} }) {
345    
346     if (ref($row) eq 'HASH') {
347    
348     foreach my $sf (keys %{ $row }) {
349 dpavlin 619 next if ($sf eq 'subfields');
350 dpavlin 593 $self->{_stats}->{sf}->{ $fld }->{ $sf }->{count}++;
351     $self->{_stats}->{sf}->{ $fld }->{ $sf }->{repeatable}++
352     if (ref($row->{$sf}) eq 'ARRAY');
353 dpavlin 506 }
354 dpavlin 593
355     } else {
356     $self->{_stats}->{repeatable}->{ $fld }++;
357     }
358 dpavlin 506 }
359 dpavlin 593 }
360 dpavlin 506 }
361    
362 dpavlin 483 $self->progress_bar($pos,$to_rec) unless ($self->{no_progress_bar});
363 dpavlin 285
364     }
365    
366 dpavlin 286 $self->{pos} = -1;
367     $self->{last_pcnt} = 0;
368 dpavlin 285
369     # store max mfn and return it.
370 dpavlin 339 $self->{max_pos} = $to_rec;
371     $log->debug("max_pos: $to_rec");
372 dpavlin 285
373     return $size;
374     }
375    
376     =head2 fetch
377    
378     Fetch next record from database. It will also displays progress bar.
379    
380     my $rec = $isis->fetch;
381    
382     Record from this function should probably go to C<data_structure> for
383     normalisation.
384    
385     =cut
386    
387     sub fetch {
388     my $self = shift;
389    
390     my $log = $self->_get_logger();
391    
392 dpavlin 286 $log->logconfess("it seems that you didn't load database!") unless ($self->{pos});
393 dpavlin 285
394 dpavlin 286 if ($self->{pos} == -1) {
395     $self->{pos} = $self->{offset};
396 dpavlin 285 } else {
397 dpavlin 286 $self->{pos}++;
398 dpavlin 285 }
399    
400 dpavlin 286 my $mfn = $self->{pos};
401 dpavlin 285
402 dpavlin 286 if ($mfn > $self->{max_pos}) {
403     $self->{pos} = $self->{max_pos};
404 dpavlin 285 $log->debug("at EOF");
405     return;
406     }
407    
408 dpavlin 483 $self->progress_bar($mfn,$self->{max_pos}) unless ($self->{no_progress_bar});
409 dpavlin 285
410     my $rec;
411    
412 dpavlin 286 if ($self->{low_mem}) {
413     $rec = $self->{db}->get($mfn);
414 dpavlin 285 } else {
415 dpavlin 286 $rec = $self->{data}->{$mfn};
416 dpavlin 285 }
417    
418     $rec ||= 0E0;
419     }
420    
421     =head2 pos
422    
423     Returns current record number (MFN).
424    
425     print $isis->pos;
426    
427     First record in database has position 1.
428    
429     =cut
430    
431     sub pos {
432     my $self = shift;
433 dpavlin 286 return $self->{pos};
434 dpavlin 285 }
435    
436    
437     =head2 size
438    
439     Returns number of records in database
440    
441     print $isis->size;
442    
443     Result from this function can be used to loop through all records
444    
445     foreach my $mfn ( 1 ... $isis->size ) { ... }
446    
447 dpavlin 286 because it takes into account C<offset> and C<limit>.
448 dpavlin 285
449     =cut
450    
451     sub size {
452     my $self = shift;
453 dpavlin 286 return $self->{size};
454 dpavlin 285 }
455    
456     =head2 seek
457    
458     Seek to specified MFN in file.
459    
460     $isis->seek(42);
461    
462     First record in database has position 1.
463    
464     =cut
465    
466     sub seek {
467     my $self = shift;
468     my $pos = shift || return;
469    
470     my $log = $self->_get_logger();
471    
472     if ($pos < 1) {
473     $log->warn("seek before first record");
474     $pos = 1;
475 dpavlin 286 } elsif ($pos > $self->{max_pos}) {
476 dpavlin 285 $log->warn("seek beyond last record");
477 dpavlin 286 $pos = $self->{max_pos};
478 dpavlin 285 }
479    
480 dpavlin 286 return $self->{pos} = (($pos - 1) || -1);
481 dpavlin 285 }
482    
483 dpavlin 506 =head2 stats
484 dpavlin 285
485 dpavlin 506 Dump statistics about field and subfield usage
486    
487 dpavlin 507 print $input->stats;
488 dpavlin 506
489     =cut
490    
491     sub stats {
492     my $self = shift;
493 dpavlin 507
494     my $log = $self->_get_logger();
495    
496     my $s = $self->{_stats};
497     if (! $s) {
498     $log->warn("called stats, but there is no statistics collected");
499     return;
500     }
501    
502     my $max_fld = 0;
503    
504     my $out = join("\n",
505     map {
506     my $f = $_ || die "no field";
507     my $v = $s->{fld}->{$f} || die "no s->{fld}->{$f}";
508     $max_fld = $v if ($v > $max_fld);
509    
510 dpavlin 519 my $o = sprintf("%4s %d ~", $f, $v);
511 dpavlin 507
512     if (defined($s->{sf}->{$f})) {
513     map {
514 dpavlin 593 $o .= sprintf(" %s:%d%s", $_,
515     $s->{sf}->{$f}->{$_}->{count},
516     $s->{sf}->{$f}->{$_}->{repeatable} ? '*' : '',
517     );
518 dpavlin 507 } sort keys %{ $s->{sf}->{$f} };
519     }
520    
521     if (my $v_r = $s->{repeatable}->{$f}) {
522     $o .= " ($v_r)" if ($v_r != $v);
523     }
524    
525     $o;
526 dpavlin 519 } sort { $a cmp $b } keys %{ $s->{fld} }
527 dpavlin 507 );
528    
529     $log->debug( sub { Dumper($s) } );
530    
531     return $out;
532 dpavlin 506 }
533    
534 dpavlin 598 =head2 modify_record_regexps
535 dpavlin 597
536     Generate hash with regexpes to be applied using L<filter>.
537    
538     my $regexpes = $input->modify_record_regexps(
539     900 => { '^a' => { ' : ' => '^b' } },
540     901 => { '*' => { '^b' => ' ; ' } },
541     );
542    
543     =cut
544    
545     sub modify_record_regexps {
546     my $self = shift;
547     my $modify_record = {@_};
548    
549     my $regexpes;
550    
551     foreach my $f (keys %$modify_record) {
552     warn "--- f: $f\n";
553     foreach my $sf (keys %{ $modify_record->{$f} }) {
554     warn "---- sf: $sf\n";
555     foreach my $from (keys %{ $modify_record->{$f}->{$sf} }) {
556     my $to = $modify_record->{$f}->{$sf}->{$from};
557     #die "no field?" unless defined($to);
558     warn "----- transform: |$from| -> |$to|\n";
559    
560     if ($sf =~ /^\^/) {
561     my $regex =
562     's/\Q'. $sf .'\E([^\^]+)\Q'. $from .'\E([^\^]+)/'. $sf .'$1'. $to .'$2/g';
563     push @{ $regexpes->{$f} }, $regex;
564     warn ">>>>> $regex [sf]\n";
565     } else {
566     my $regex =
567     's/\Q'. $from .'\E/'. $to .'/g';
568     push @{ $regexpes->{$f} }, $regex;
569     warn ">>>>> $regex [global]\n";
570     }
571    
572     }
573     }
574     }
575    
576     return $regexpes;
577     }
578    
579 dpavlin 3 =head1 MEMORY USAGE
580 dpavlin 1
581 dpavlin 3 C<low_mem> options is double-edged sword. If enabled, WebPAC
582     will run on memory constraint machines (which doesn't have enough
583     physical RAM to create memory structure for whole source database).
584 dpavlin 1
585 dpavlin 3 If your machine has 512Mb or more of RAM and database is around 10000 records,
586     memory shouldn't be an issue. If you don't have enough physical RAM, you
587     might consider using virtual memory (if your operating system is handling it
588     well, like on FreeBSD or Linux) instead of dropping to L<DBM::Deep> to handle
589     parsed structure of ISIS database (this is what C<low_mem> option does).
590 dpavlin 1
591 dpavlin 3 Hitting swap at end of reading source database is probably o.k. However,
592     hitting swap before 90% will dramatically decrease performance and you will
593     be better off with C<low_mem> and using rest of availble memory for
594     operating system disk cache (Linux is particuallary good about this).
595     However, every access to database record will require disk access, so
596     generation phase will be slower 10-100 times.
597    
598     Parsed structures are essential - you just have option to trade RAM memory
599     (which is fast) for disk space (which is slow). Be sure to have planty of
600     disk space if you are using C<low_mem> and thus L<DBM::Deep>.
601    
602     However, when WebPAC is running on desktop machines (or laptops :-), it's
603     highly undesireable for system to start swapping. Using C<low_mem> option can
604     reduce WecPAC memory usage to around 64Mb for same database with lookup
605     fields and sorted indexes which stay in RAM. Performance will suffer, but
606     memory usage will really be minimal. It might be also more confortable to
607     run WebPAC reniced on those machines.
608    
609    
610     =head1 AUTHOR
611    
612     Dobrica Pavlinusic, C<< <dpavlin@rot13.org> >>
613    
614 dpavlin 1 =head1 COPYRIGHT & LICENSE
615    
616 dpavlin 599 Copyright 2005-2006 Dobrica Pavlinusic, All Rights Reserved.
617 dpavlin 1
618     This program is free software; you can redistribute it and/or modify it
619     under the same terms as Perl itself.
620    
621     =cut
622    
623     1; # End of WebPAC::Input

  ViewVC Help
Powered by ViewVC 1.1.26