/[webpac2]/trunk/lib/WebPAC/Input.pm
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Diff of /trunk/lib/WebPAC/Input.pm

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 285 by dpavlin, Sun Dec 18 21:06:39 2005 UTC revision 483 by dpavlin, Sun May 14 09:34:05 2006 UTC
# Line 3  package WebPAC::Input; Line 3  package WebPAC::Input;
3  use warnings;  use warnings;
4  use strict;  use strict;
5    
6  use blib;  use WebPAC::Common 0.03;
   
 use WebPAC::Common;  
7  use base qw/WebPAC::Common/;  use base qw/WebPAC::Common/;
8  use Text::Iconv;  use Text::Iconv;
9    use Data::Dumper;
10    
11  =head1 NAME  =head1 NAME
12    
13  WebPAC::Input - core module for input file format  WebPAC::Input - read different file formats into WebPAC
14    
15  =head1 VERSION  =head1 VERSION
16    
17  Version 0.02  Version 0.04
18    
19  =cut  =cut
20    
21  our $VERSION = '0.02';  our $VERSION = '0.04';
22    
23  =head1 SYNOPSIS  =head1 SYNOPSIS
24    
25  This module is used as base class for all database specific modules  This module implements input as database which have fixed and known
26  (basically, files which have one handle, fixed size while indexing and some  I<size> while indexing and single unique numeric identifier for database
27  kind of numeric idefinirier which goes from 1 to filesize).  position ranging from 1 to I<size>.
28    
29    Simply, something that is indexed by unmber from 1 .. I<size>.
30    
31    Examples of such databases are CDS/ISIS files, MARC files, lines in
32    text file, and so on.
33    
34    Specific file formats are implemented using low-level interface modules,
35    located in C<WebPAC::Input::*> namespace which export C<open_db>,
36    C<fetch_rec> and optional C<init> functions.
37    
38  Perhaps a little code snippet.  Perhaps a little code snippet.
39    
40      use WebPAC::Input;      use WebPAC::Input;
41    
42      my $db = WebPAC::Input->new(      my $db = WebPAC::Input->new(
43          format => 'NULL',          module => 'WebPAC::Input::ISIS',
44          config => $config,                  config => $config,
45          lookup => $lookup_obj,                  lookup => $lookup_obj,
46          low_mem => 1,                  low_mem => 1,
47      );      );
48    
49      $db->open('/path/to/database');      $db->open('/path/to/database');
50      print "database size: ",$db->size,"\n";          print "database size: ",$db->size,"\n";
51      while (my $row = $db->fetch) {          while (my $rec = $db->fetch) {
52          ...                  # do something with $rec
53      }          }
54    
55    
56    
57  =head1 FUNCTIONS  =head1 FUNCTIONS
58    
# Line 51  Perhaps a little code snippet. Line 61  Perhaps a little code snippet.
61  Create new input database object.  Create new input database object.
62    
63    my $db = new WebPAC::Input(    my $db = new WebPAC::Input(
64          format => 'NULL'          module => 'WebPAC::Input::MARC',
65          code_page => 'ISO-8859-2',          code_page => 'ISO-8859-2',
66          low_mem => 1,          low_mem => 1,
67            recode => 'char pairs',
68            no_progress_bar => 1,
69    );    );
70    
71    C<module> is low-level file format module. See L<WebPAC::Input::Isis> and
72    L<WebPAC::Input::MARC>.
73    
74  Optional parametar C<code_page> specify application code page (which will be  Optional parametar C<code_page> specify application code page (which will be
75  used internally). This should probably be your terminal encoding, and by  used internally). This should probably be your terminal encoding, and by
76  default, it C<ISO-8859-2>.  default, it C<ISO-8859-2>.
77    
78  Default is not to use C<low_mem> options (see L<MEMORY USAGE> below).  Default is not to use C<low_mem> options (see L<MEMORY USAGE> below).
79    
80    C<recode> is optional string constisting of character or words pairs that
81    should be replaced in input stream.
82    
83    C<no_progress_bar> disables progress bar output on C<STDOUT>
84    
85  This function will also call low-level C<init> if it exists with same  This function will also call low-level C<init> if it exists with same
86  parametars.  parametars.
87    
# Line 74  sub new { Line 94  sub new {
94    
95          my $log = $self->_get_logger;          my $log = $self->_get_logger;
96    
97            $log->logconfess("specify low-level file format module") unless ($self->{module});
98            my $module = $self->{module};
99            $module =~ s#::#/#g;
100            $module .= '.pm';
101            $log->debug("require low-level module $self->{module} from $module");
102    
103            require $module;
104            #eval $self->{module} .'->import';
105    
106          # check if required subclasses are implemented          # check if required subclasses are implemented
107          foreach my $subclass (qw/open_db fetch_rec/) {          foreach my $subclass (qw/open_db fetch_rec init/) {
108                  $log->logdie("missing implementation of $subclass") unless ($self->SUPER::can($subclass));                  my $n = $self->{module} . '::' . $subclass;
109                    if (! defined &{ $n }) {
110                            my $missing = "missing $subclass in $self->{module}";
111                            $self->{$subclass} = sub { $log->logwarn($missing) };
112                    } else {
113                            $self->{$subclass} = \&{ $n };
114                    }
115          }          }
116    
117          if ($self->can('init')) {          if ($self->{init}) {
118                  $log->debug("calling init");                  $log->debug("calling init");
119                  $self->init(@_);                  $self->{init}->($self, @_);
120          }          }
121    
122          $self->{'code_page'} ||= 'ISO-8859-2';          $self->{'code_page'} ||= 'ISO-8859-2';
# Line 119  sub new { Line 154  sub new {
154    
155  This function will read whole database in memory and produce lookups.  This function will read whole database in memory and produce lookups.
156    
157   $isis->open(   $input->open(
158          path => '/path/to/database/file',          path => '/path/to/database/file',
159          code_page => '852',          code_page => '852',
160          limit_mfn => 500,          limit => 500,
161          start_mfn => 6000,          offset => 6000,
162          lookup => $lookup_obj,          lookup => $lookup_obj,
163   );   );
164    
165  By default, C<code_page> is assumed to be C<852>.  By default, C<code_page> is assumed to be C<852>.
166    
167  If optional parametar C<start_mfn> is set, this will be first MFN to read  C<offset> is optional parametar to position at some offset before reading from database.
 from database (so you can skip beginning of your database if you need to).  
168    
169  If optional parametar C<limit_mfn> is set, it will read just 500 records  C<limit> is optional parametar to read just C<limit> records from database
 from database in example above.  
170    
171  Returns size of database, regardless of C<start_mfn> and C<limit_mfn>  Returns size of database, regardless of C<offset> and C<limit>
172  parametars, see also C<$isis->size>.  parametars, see also C<size>.
173    
174  =cut  =cut
175    
# Line 150  sub open { Line 183  sub open {
183          my $code_page = $arg->{'code_page'} || '852';          my $code_page = $arg->{'code_page'} || '852';
184    
185          # store data in object          # store data in object
186          $self->{'code_page'} = $code_page;          $self->{'input_code_page'} = $code_page;
187          foreach my $v (qw/path start_mfn limit_mfn/) {          foreach my $v (qw/path offset limit/) {
188                  $self->{$v} = $arg->{$v} if ($arg->{$v});                  $self->{$v} = $arg->{$v} if ($arg->{$v});
189          }          }
190    
191          # create Text::Iconv object          # create Text::Iconv object
192          $self->{iconv} = Text::Iconv->new($code_page,$self->{'code_page'});          $self->{iconv} = Text::Iconv->new($code_page,$self->{'code_page'});
193    
194          my ($db, $size) = $self->open_db(          my $filter_ref;
195    
196            if ($self->{recode}) {
197                    my @r = split(/\s/, $self->{recode});
198                    if ($#r % 2 != 1) {
199                            $log->logwarn("recode needs even number of elements (some number of valid pairs)");
200                    } else {
201                            my $recode;
202                            while (@r) {
203                                    my $from = shift @r;
204                                    my $to = shift @r;
205                                    $recode->{$from} = $to;
206                            }
207    
208                            my $regex = join '|' => keys %{ $recode };
209    
210                            $log->debug("using recode regex: $regex");
211                            
212                            $filter_ref = sub {
213                                    my $t = shift;
214                                    $t =~ s/($regex)/$recode->{$1}/g;
215                                    return $t;
216                            };
217    
218                    }
219    
220            }
221    
222            my ($db, $size) = $self->{open_db}->( $self,
223                  path => $arg->{path},                  path => $arg->{path},
224                    filter => $filter_ref,
225          );          );
226    
227          unless ($db) {          unless ($db) {
# Line 172  sub open { Line 234  sub open {
234                  return;                  return;
235          }          }
236    
237          my $startmfn = 1;          my $from_rec = 1;
238          my $maxmfn = $size;          my $to_rec = $size;
239    
240          if (my $s = $self->{start_mfn}) {          if (my $s = $self->{offset}) {
241                  $log->info("skipping to MFN $s");                  $log->info("skipping to MFN $s");
242                  $startmfn = $s;                  $from_rec = $s;
243          } else {          } else {
244                  $self->{start_mfn} = $startmfn;                  $self->{offset} = $from_rec;
245          }          }
246    
247          if ($self->{limit_mfn}) {          if ($self->{limit}) {
248                  $log->info("limiting to ",$self->{limit_mfn}," records");                  $log->debug("limiting to ",$self->{limit}," records");
249                  $maxmfn = $startmfn + $self->{limit_mfn} - 1;                  $to_rec = $from_rec + $self->{limit} - 1;
250                  $maxmfn = $size if ($maxmfn > $size);                  $to_rec = $size if ($to_rec > $size);
251          }          }
252    
253          # store size for later          # store size for later
254          $self->{size} = ($maxmfn - $startmfn) ? ($maxmfn - $startmfn + 1) : 0;          $self->{size} = ($to_rec - $from_rec) ? ($to_rec - $from_rec + 1) : 0;
255    
256          $log->info("processing $self->{size} records in $code_page, convert to $self->{code_page}");          $log->info("processing $self->{size}/$size records [$from_rec-$to_rec] convert $code_page -> $self->{code_page}");
257    
258          # read database          # read database
259          for (my $mfn = $startmfn; $mfn <= $maxmfn; $mfn++) {          for (my $pos = $from_rec; $pos <= $to_rec; $pos++) {
260    
261                    $log->debug("position: $pos\n");
262    
263                  $log->debug("mfn: $mfn\n");                  my $rec = $self->{fetch_rec}->($self, $db, $pos );
264    
265                  my $rec = $self->fetch_rec( $db, $mfn );                  $log->debug(sub { Dumper($rec) });
266    
267                  if (! $rec) {                  if (! $rec) {
268                          $log->warn("record $mfn empty? skipping...");                          $log->warn("record $pos empty? skipping...");
269                          next;                          next;
270                  }                  }
271    
272                  # store                  # store
273                  if ($self->{'low_mem'}) {                  if ($self->{low_mem}) {
274                          $self->{'db'}->put($mfn, $rec);                          $self->{db}->put($pos, $rec);
275                  } else {                  } else {
276                          $self->{'data'}->{$mfn} = $rec;                          $self->{data}->{$pos} = $rec;
277                  }                  }
278    
279                  # create lookup                  # create lookup
280                  $self->{'lookup'}->add( $rec ) if ($rec && $self->{'lookup'});                  $self->{'lookup'}->add( $rec ) if ($rec && $self->{'lookup'});
281    
282                  $self->progress_bar($mfn,$maxmfn);                  $self->progress_bar($pos,$to_rec) unless ($self->{no_progress_bar});
283    
284          }          }
285    
286          $self->{'current_mfn'} = -1;          $self->{pos} = -1;
287          $self->{'last_pcnt'} = 0;          $self->{last_pcnt} = 0;
   
         $log->debug("max mfn: $maxmfn");  
288    
289          # store max mfn and return it.          # store max mfn and return it.
290          $self->{'max_mfn'} = $maxmfn;          $self->{max_pos} = $to_rec;
291            $log->debug("max_pos: $to_rec");
292    
293          return $size;          return $size;
294  }  }
# Line 246  sub fetch { Line 309  sub fetch {
309    
310          my $log = $self->_get_logger();          my $log = $self->_get_logger();
311    
312          $log->logconfess("it seems that you didn't load database!") unless ($self->{'current_mfn'});          $log->logconfess("it seems that you didn't load database!") unless ($self->{pos});
313    
314          if ($self->{'current_mfn'} == -1) {          if ($self->{pos} == -1) {
315                  $self->{'current_mfn'} = $self->{'start_mfn'};                  $self->{pos} = $self->{offset};
316          } else {          } else {
317                  $self->{'current_mfn'}++;                  $self->{pos}++;
318          }          }
319    
320          my $mfn = $self->{'current_mfn'};          my $mfn = $self->{pos};
321    
322          if ($mfn > $self->{'max_mfn'}) {          if ($mfn > $self->{max_pos}) {
323                  $self->{'current_mfn'} = $self->{'max_mfn'};                  $self->{pos} = $self->{max_pos};
324                  $log->debug("at EOF");                  $log->debug("at EOF");
325                  return;                  return;
326          }          }
327    
328          $self->progress_bar($mfn,$self->{'max_mfn'});          $self->progress_bar($mfn,$self->{max_pos}) unless ($self->{no_progress_bar});
329    
330          my $rec;          my $rec;
331    
332          if ($self->{'low_mem'}) {          if ($self->{low_mem}) {
333                  $rec = $self->{'db'}->get($mfn);                  $rec = $self->{db}->get($mfn);
334          } else {          } else {
335                  $rec = $self->{'data'}->{$mfn};                  $rec = $self->{data}->{$mfn};
336          }          }
337    
338          $rec ||= 0E0;          $rec ||= 0E0;
# Line 287  First record in database has position 1. Line 350  First record in database has position 1.
350    
351  sub pos {  sub pos {
352          my $self = shift;          my $self = shift;
353          return $self->{'current_mfn'};          return $self->{pos};
354  }  }
355    
356    
# Line 301  Result from this function can be used to Line 364  Result from this function can be used to
364    
365   foreach my $mfn ( 1 ... $isis->size ) { ... }   foreach my $mfn ( 1 ... $isis->size ) { ... }
366    
367  because it takes into account C<start_mfn> and C<limit_mfn>.  because it takes into account C<offset> and C<limit>.
368    
369  =cut  =cut
370    
371  sub size {  sub size {
372          my $self = shift;          my $self = shift;
373          return $self->{'size'};          return $self->{size};
374  }  }
375    
376  =head2 seek  =head2 seek
# Line 329  sub seek { Line 392  sub seek {
392          if ($pos < 1) {          if ($pos < 1) {
393                  $log->warn("seek before first record");                  $log->warn("seek before first record");
394                  $pos = 1;                  $pos = 1;
395          } elsif ($pos > $self->{'max_mfn'}) {          } elsif ($pos > $self->{max_pos}) {
396                  $log->warn("seek beyond last record");                  $log->warn("seek beyond last record");
397                  $pos = $self->{'max_mfn'};                  $pos = $self->{max_pos};
398          }          }
399    
400          return $self->{'current_mfn'} = (($pos - 1) || -1);          return $self->{pos} = (($pos - 1) || -1);
401  }  }
402    
403    

Legend:
Removed from v.285  
changed lines
  Added in v.483

  ViewVC Help
Powered by ViewVC 1.1.26