/[webpac2]/trunk/lib/WebPAC/Input.pm
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Diff of /trunk/lib/WebPAC/Input.pm

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 285 by dpavlin, Sun Dec 18 21:06:39 2005 UTC revision 416 by dpavlin, Sun Feb 26 23:21:50 2006 UTC
# Line 3  package WebPAC::Input; Line 3  package WebPAC::Input;
3  use warnings;  use warnings;
4  use strict;  use strict;
5    
 use blib;  
   
6  use WebPAC::Common;  use WebPAC::Common;
7  use base qw/WebPAC::Common/;  use base qw/WebPAC::Common/;
8  use Text::Iconv;  use Text::Iconv;
9    use Data::Dumper;
10    
11  =head1 NAME  =head1 NAME
12    
13  WebPAC::Input - core module for input file format  WebPAC::Input - read different file formats into WebPAC
14    
15  =head1 VERSION  =head1 VERSION
16    
17  Version 0.02  Version 0.04
18    
19  =cut  =cut
20    
21  our $VERSION = '0.02';  our $VERSION = '0.04';
22    
23  =head1 SYNOPSIS  =head1 SYNOPSIS
24    
25  This module is used as base class for all database specific modules  This module implements input as database which have fixed and known
26  (basically, files which have one handle, fixed size while indexing and some  I<size> while indexing and single unique numeric identifier for database
27  kind of numeric idefinirier which goes from 1 to filesize).  position ranging from 1 to I<size>.
28    
29    Simply, something that is indexed by unmber from 1 .. I<size>.
30    
31    Examples of such databases are CDS/ISIS files, MARC files, lines in
32    text file, and so on.
33    
34    Specific file formats are implemented using low-level interface modules,
35    located in C<WebPAC::Input::*> namespace which export C<open_db>,
36    C<fetch_rec> and optional C<init> functions.
37    
38  Perhaps a little code snippet.  Perhaps a little code snippet.
39    
40      use WebPAC::Input;      use WebPAC::Input;
41    
42      my $db = WebPAC::Input->new(      my $db = WebPAC::Input->new(
43          format => 'NULL',          module => 'WebPAC::Input::ISIS',
44          config => $config,                  config => $config,
45          lookup => $lookup_obj,                  lookup => $lookup_obj,
46          low_mem => 1,                  low_mem => 1,
47      );      );
48    
49      $db->open('/path/to/database');      $db->open('/path/to/database');
50      print "database size: ",$db->size,"\n";          print "database size: ",$db->size,"\n";
51      while (my $row = $db->fetch) {          while (my $rec = $db->fetch) {
52          ...                  # do something with $rec
53      }          }
54    
55    
56    
57  =head1 FUNCTIONS  =head1 FUNCTIONS
58    
# Line 51  Perhaps a little code snippet. Line 61  Perhaps a little code snippet.
61  Create new input database object.  Create new input database object.
62    
63    my $db = new WebPAC::Input(    my $db = new WebPAC::Input(
64          format => 'NULL'          module => 'WebPAC::Input::MARC',
65          code_page => 'ISO-8859-2',          code_page => 'ISO-8859-2',
66          low_mem => 1,          low_mem => 1,
67            recode => 'char pairs',
68    );    );
69    
70    C<module> is low-level file format module. See L<WebPAC::Input::Isis> and
71    L<WebPAC::Input::MARC>.
72    
73  Optional parametar C<code_page> specify application code page (which will be  Optional parametar C<code_page> specify application code page (which will be
74  used internally). This should probably be your terminal encoding, and by  used internally). This should probably be your terminal encoding, and by
75  default, it C<ISO-8859-2>.  default, it C<ISO-8859-2>.
# Line 74  sub new { Line 88  sub new {
88    
89          my $log = $self->_get_logger;          my $log = $self->_get_logger;
90    
91            $log->logconfess("specify low-level file format module") unless ($self->{module});
92            my $module = $self->{module};
93            $module =~ s#::#/#g;
94            $module .= '.pm';
95            $log->debug("require low-level module $self->{module} from $module");
96    
97            require $module;
98            #eval $self->{module} .'->import';
99    
100          # check if required subclasses are implemented          # check if required subclasses are implemented
101          foreach my $subclass (qw/open_db fetch_rec/) {          foreach my $subclass (qw/open_db fetch_rec init/) {
102                  $log->logdie("missing implementation of $subclass") unless ($self->SUPER::can($subclass));                  my $n = $self->{module} . '::' . $subclass;
103                    if (! defined &{ $n }) {
104                            my $missing = "missing $subclass in $self->{module}";
105                            $self->{$subclass} = sub { $log->logwarn($missing) };
106                    } else {
107                            $self->{$subclass} = \&{ $n };
108                    }
109          }          }
110    
111          if ($self->can('init')) {          if ($self->{init}) {
112                  $log->debug("calling init");                  $log->debug("calling init");
113                  $self->init(@_);                  $self->{init}->($self, @_);
114          }          }
115    
116          $self->{'code_page'} ||= 'ISO-8859-2';          $self->{'code_page'} ||= 'ISO-8859-2';
# Line 119  sub new { Line 148  sub new {
148    
149  This function will read whole database in memory and produce lookups.  This function will read whole database in memory and produce lookups.
150    
151   $isis->open(   $input->open(
152          path => '/path/to/database/file',          path => '/path/to/database/file',
153          code_page => '852',          code_page => '852',
154          limit_mfn => 500,          limit => 500,
155          start_mfn => 6000,          offset => 6000,
156          lookup => $lookup_obj,          lookup => $lookup_obj,
157   );   );
158    
159  By default, C<code_page> is assumed to be C<852>.  By default, C<code_page> is assumed to be C<852>.
160    
161  If optional parametar C<start_mfn> is set, this will be first MFN to read  C<offset> is optional parametar to position at some offset before reading from database.
 from database (so you can skip beginning of your database if you need to).  
162    
163  If optional parametar C<limit_mfn> is set, it will read just 500 records  C<limit> is optional parametar to read just C<limit> records from database
 from database in example above.  
164    
165  Returns size of database, regardless of C<start_mfn> and C<limit_mfn>  Returns size of database, regardless of C<offset> and C<limit>
166  parametars, see also C<$isis->size>.  parametars, see also C<size>.
167    
168  =cut  =cut
169    
# Line 150  sub open { Line 177  sub open {
177          my $code_page = $arg->{'code_page'} || '852';          my $code_page = $arg->{'code_page'} || '852';
178    
179          # store data in object          # store data in object
180          $self->{'code_page'} = $code_page;          $self->{'input_code_page'} = $code_page;
181          foreach my $v (qw/path start_mfn limit_mfn/) {          foreach my $v (qw/path offset limit/) {
182                  $self->{$v} = $arg->{$v} if ($arg->{$v});                  $self->{$v} = $arg->{$v} if ($arg->{$v});
183          }          }
184    
185          # create Text::Iconv object          # create Text::Iconv object
186          $self->{iconv} = Text::Iconv->new($code_page,$self->{'code_page'});          $self->{iconv} = Text::Iconv->new($code_page,$self->{'code_page'});
187    
188          my ($db, $size) = $self->open_db(          my $filter_ref;
189    
190            if ($self->{recode}) {
191                    my @r = split(/\s/, $self->{recode});
192                    if ($#r % 2 != 1) {
193                            $log->logwarn("recode needs even number of elements (some number of valid pairs)");
194                    } else {
195                            my $recode;
196                            while (@r) {
197                                    my $from = shift @r;
198                                    my $to = shift @r;
199                                    $recode->{$from} = $to;
200                            }
201    
202                            my $regex = join '|' => keys %{ $recode };
203    
204                            $log->debug("using recode regex: $regex");
205                            
206                            $filter_ref = sub {
207                                    my $t = shift;
208                                    $t =~ s/($regex)/$recode->{$1}/g;
209                                    return $t;
210                            };
211    
212                    }
213    
214            }
215    
216            my ($db, $size) = $self->{open_db}->( $self,
217                  path => $arg->{path},                  path => $arg->{path},
218                    filter => $filter_ref,
219          );          );
220    
221          unless ($db) {          unless ($db) {
# Line 172  sub open { Line 228  sub open {
228                  return;                  return;
229          }          }
230    
231          my $startmfn = 1;          my $from_rec = 1;
232          my $maxmfn = $size;          my $to_rec = $size;
233    
234          if (my $s = $self->{start_mfn}) {          if (my $s = $self->{offset}) {
235                  $log->info("skipping to MFN $s");                  $log->info("skipping to MFN $s");
236                  $startmfn = $s;                  $from_rec = $s;
237          } else {          } else {
238                  $self->{start_mfn} = $startmfn;                  $self->{offset} = $from_rec;
239          }          }
240    
241          if ($self->{limit_mfn}) {          if ($self->{limit}) {
242                  $log->info("limiting to ",$self->{limit_mfn}," records");                  $log->debug("limiting to ",$self->{limit}," records");
243                  $maxmfn = $startmfn + $self->{limit_mfn} - 1;                  $to_rec = $from_rec + $self->{limit} - 1;
244                  $maxmfn = $size if ($maxmfn > $size);                  $to_rec = $size if ($to_rec > $size);
245          }          }
246    
247          # store size for later          # store size for later
248          $self->{size} = ($maxmfn - $startmfn) ? ($maxmfn - $startmfn + 1) : 0;          $self->{size} = ($to_rec - $from_rec) ? ($to_rec - $from_rec + 1) : 0;
249    
250          $log->info("processing $self->{size} records in $code_page, convert to $self->{code_page}");          $log->info("processing $self->{size}/$size records [$from_rec-$to_rec] convert $code_page -> $self->{code_page}");
251    
252          # read database          # read database
253          for (my $mfn = $startmfn; $mfn <= $maxmfn; $mfn++) {          for (my $pos = $from_rec; $pos <= $to_rec; $pos++) {
254    
255                    $log->debug("position: $pos\n");
256    
257                  $log->debug("mfn: $mfn\n");                  my $rec = $self->{fetch_rec}->($self, $db, $pos );
258    
259                  my $rec = $self->fetch_rec( $db, $mfn );                  $log->debug(sub { Dumper($rec) });
260    
261                  if (! $rec) {                  if (! $rec) {
262                          $log->warn("record $mfn empty? skipping...");                          $log->warn("record $pos empty? skipping...");
263                          next;                          next;
264                  }                  }
265    
266                  # store                  # store
267                  if ($self->{'low_mem'}) {                  if ($self->{low_mem}) {
268                          $self->{'db'}->put($mfn, $rec);                          $self->{db}->put($pos, $rec);
269                  } else {                  } else {
270                          $self->{'data'}->{$mfn} = $rec;                          $self->{data}->{$pos} = $rec;
271                  }                  }
272    
273                  # create lookup                  # create lookup
274                  $self->{'lookup'}->add( $rec ) if ($rec && $self->{'lookup'});                  $self->{'lookup'}->add( $rec ) if ($rec && $self->{'lookup'});
275    
276                  $self->progress_bar($mfn,$maxmfn);                  $self->progress_bar($pos,$to_rec);
277    
278          }          }
279    
280          $self->{'current_mfn'} = -1;          $self->{pos} = -1;
281          $self->{'last_pcnt'} = 0;          $self->{last_pcnt} = 0;
   
         $log->debug("max mfn: $maxmfn");  
282    
283          # store max mfn and return it.          # store max mfn and return it.
284          $self->{'max_mfn'} = $maxmfn;          $self->{max_pos} = $to_rec;
285            $log->debug("max_pos: $to_rec");
286    
287          return $size;          return $size;
288  }  }
# Line 246  sub fetch { Line 303  sub fetch {
303    
304          my $log = $self->_get_logger();          my $log = $self->_get_logger();
305    
306          $log->logconfess("it seems that you didn't load database!") unless ($self->{'current_mfn'});          $log->logconfess("it seems that you didn't load database!") unless ($self->{pos});
307    
308          if ($self->{'current_mfn'} == -1) {          if ($self->{pos} == -1) {
309                  $self->{'current_mfn'} = $self->{'start_mfn'};                  $self->{pos} = $self->{offset};
310          } else {          } else {
311                  $self->{'current_mfn'}++;                  $self->{pos}++;
312          }          }
313    
314          my $mfn = $self->{'current_mfn'};          my $mfn = $self->{pos};
315    
316          if ($mfn > $self->{'max_mfn'}) {          if ($mfn > $self->{max_pos}) {
317                  $self->{'current_mfn'} = $self->{'max_mfn'};                  $self->{pos} = $self->{max_pos};
318                  $log->debug("at EOF");                  $log->debug("at EOF");
319                  return;                  return;
320          }          }
321    
322          $self->progress_bar($mfn,$self->{'max_mfn'});          $self->progress_bar($mfn,$self->{max_pos});
323    
324          my $rec;          my $rec;
325    
326          if ($self->{'low_mem'}) {          if ($self->{low_mem}) {
327                  $rec = $self->{'db'}->get($mfn);                  $rec = $self->{db}->get($mfn);
328          } else {          } else {
329                  $rec = $self->{'data'}->{$mfn};                  $rec = $self->{data}->{$mfn};
330          }          }
331    
332          $rec ||= 0E0;          $rec ||= 0E0;
# Line 287  First record in database has position 1. Line 344  First record in database has position 1.
344    
345  sub pos {  sub pos {
346          my $self = shift;          my $self = shift;
347          return $self->{'current_mfn'};          return $self->{pos};
348  }  }
349    
350    
# Line 301  Result from this function can be used to Line 358  Result from this function can be used to
358    
359   foreach my $mfn ( 1 ... $isis->size ) { ... }   foreach my $mfn ( 1 ... $isis->size ) { ... }
360    
361  because it takes into account C<start_mfn> and C<limit_mfn>.  because it takes into account C<offset> and C<limit>.
362    
363  =cut  =cut
364    
365  sub size {  sub size {
366          my $self = shift;          my $self = shift;
367          return $self->{'size'};          return $self->{size};
368  }  }
369    
370  =head2 seek  =head2 seek
# Line 329  sub seek { Line 386  sub seek {
386          if ($pos < 1) {          if ($pos < 1) {
387                  $log->warn("seek before first record");                  $log->warn("seek before first record");
388                  $pos = 1;                  $pos = 1;
389          } elsif ($pos > $self->{'max_mfn'}) {          } elsif ($pos > $self->{max_pos}) {
390                  $log->warn("seek beyond last record");                  $log->warn("seek beyond last record");
391                  $pos = $self->{'max_mfn'};                  $pos = $self->{max_pos};
392          }          }
393    
394          return $self->{'current_mfn'} = (($pos - 1) || -1);          return $self->{pos} = (($pos - 1) || -1);
395  }  }
396    
397    

Legend:
Removed from v.285  
changed lines
  Added in v.416

  ViewVC Help
Powered by ViewVC 1.1.26