/[webpac2]/trunk/lib/WebPAC/Input.pm
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Diff of /trunk/lib/WebPAC/Input.pm

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 11 by dpavlin, Sat Jul 16 20:54:28 2005 UTC revision 287 by dpavlin, Sun Dec 18 21:06:51 2005 UTC
# Line 3  package WebPAC::Input; Line 3  package WebPAC::Input;
3  use warnings;  use warnings;
4  use strict;  use strict;
5    
6    use blib;
7    
8    use WebPAC::Common;
9    use base qw/WebPAC::Common/;
10    use Text::Iconv;
11    
12  =head1 NAME  =head1 NAME
13    
14  WebPAC::Input - core module for input file format  WebPAC::Input - read different file formats into WebPAC
15    
16  =head1 VERSION  =head1 VERSION
17    
18  Version 0.01  Version 0.03
19    
20  =cut  =cut
21    
22  our $VERSION = '0.01';  our $VERSION = '0.03';
23    
24  =head1 SYNOPSIS  =head1 SYNOPSIS
25    
26  This module will load particular loader module and execute it's functions.  This module implements input as database which have fixed and known
27    I<size> while indexing and single unique numeric identifier for database
28    position ranging from 1 to I<size>.
29    
30    Simply, something that is indexed by unmber from 1 .. I<size>.
31    
32    Examples of such databases are CDS/ISIS files, MARC files, lines in
33    text file, and so on.
34    
35    Specific file formats are implemented using low-level interface modules,
36    located in C<WebPAC::Input::*> namespace which export C<open_db>,
37    C<fetch_rec> and optional C<init> functions.
38    
39  Perhaps a little code snippet.  Perhaps a little code snippet.
40    
41      use WebPAC::Input;      use WebPAC::Input;
42    
43      my $db = WebPAC::Input->new(      my $db = WebPAC::Input->new(
44          format => 'NULL',          module => 'WebPAC::Input::ISIS',
45          config => $config,                  config => $config,
46          lookup => $lookup_obj,                  lookup => $lookup_obj,
47          low_mem => 1,                  low_mem => 1,
48      );      );
49    
50      $db->open('/path/to/database');      $db->open('/path/to/database');
51      print "database size: ",$db->size,"\n";      print "database size: ",$db->size,"\n";
52      while (my $row = $db->fetch) {      while (my $rec = $db->fetch) {
         ...  
53      }      }
54    
55    
56    
57  =head1 FUNCTIONS  =head1 FUNCTIONS
58    
59  =head2 new  =head2 new
# Line 43  Perhaps a little code snippet. Line 61  Perhaps a little code snippet.
61  Create new input database object.  Create new input database object.
62    
63    my $db = new WebPAC::Input(    my $db = new WebPAC::Input(
64          format => 'NULL'          module => 'WebPAC::Input::MARC',
65          code_page => 'ISO-8859-2',          code_page => 'ISO-8859-2',
66          low_mem => 1,          low_mem => 1,
67    );    );
68    
69    C<module> is low-level file format module. See L<WebPAC::Input::Isis> and
70    L<WebPAC::Input::MARC>.
71    
72  Optional parametar C<code_page> specify application code page (which will be  Optional parametar C<code_page> specify application code page (which will be
73  used internally). This should probably be your terminal encoding, and by  used internally). This should probably be your terminal encoding, and by
74  default, it C<ISO-8859-2>.  default, it C<ISO-8859-2>.
75    
76  Default is not to use C<low_mem> options (see L<MEMORY USAGE> below).  Default is not to use C<low_mem> options (see L<MEMORY USAGE> below).
77    
78    This function will also call low-level C<init> if it exists with same
79    parametars.
80    
81  =cut  =cut
82    
83  sub new {  sub new {
84          my $class = shift;          my $class = shift;
85          my $self = {@_};          my $self = {@_};
86          bless($self, $class);          bless($self, $class);
87    
         $self->{'code_page'} ||= 'ISO-8859-2';  
   
88          my $log = $self->_get_logger;          my $log = $self->_get_logger;
89    
90            $log->logconfess("specify low-level file format module") unless ($self->{module});
91            my $module = $self->{module};
92            $module =~ s#::#/#g;
93            $module .= '.pm';
94            $log->debug("require low-level module $self->{module} from $module");
95            require $module;
96            eval $self->{module} .'->import';
97    
98            # check if required subclasses are implemented
99            foreach my $subclass (qw/open_db fetch_rec/) {
100                    if ( $self->can($subclass) ) {
101                            $log->debug("imported $subclass");
102                    } else {
103                            $log->warn("missing $subclass in $self->{module}");
104                    }
105            }
106    
107            if ($self->can('init')) {
108                    $log->debug("calling init");
109                    $self->init(@_);
110            }
111    
112            $self->{'code_page'} ||= 'ISO-8859-2';
113    
114          # running with low_mem flag? well, use DBM::Deep then.          # running with low_mem flag? well, use DBM::Deep then.
115          if ($self->{'low_mem'}) {          if ($self->{'low_mem'}) {
116                  $log->info("running with low_mem which impacts performance (<32 Mb memory usage)");                  $log->info("running with low_mem which impacts performance (<32 Mb memory usage)");
# Line 94  sub new { Line 140  sub new {
140          $self ? return $self : return undef;          $self ? return $self : return undef;
141  }  }
142    
143    =head2 open
144    
145    This function will read whole database in memory and produce lookups.
146    
147     $input->open(
148            path => '/path/to/database/file',
149            code_page => '852',
150            limit => 500,
151            offset => 6000,
152            lookup => $lookup_obj,
153     );
154    
155    By default, C<code_page> is assumed to be C<852>.
156    
157    C<offset> is optional parametar to position at some offset before reading from database.
158    
159    C<limit> is optional parametar to read just C<limit> records from database
160    
161    Returns size of database, regardless of C<offset> and C<limit>
162    parametars, see also C<size>.
163    
164    =cut
165    
166    sub open {
167            my $self = shift;
168            my $arg = {@_};
169    
170            my $log = $self->_get_logger();
171    
172            $log->logcroak("need path") if (! $arg->{'path'});
173            my $code_page = $arg->{'code_page'} || '852';
174    
175            # store data in object
176            $self->{'code_page'} = $code_page;
177            foreach my $v (qw/path offset limit/) {
178                    $self->{$v} = $arg->{$v} if ($arg->{$v});
179            }
180    
181            # create Text::Iconv object
182            $self->{iconv} = Text::Iconv->new($code_page,$self->{'code_page'});
183    
184            my ($db, $size) = $self->open_db(
185                    path => $arg->{path},
186            );
187    
188            unless ($db) {
189                    $log->logwarn("can't open database $arg->{path}, skipping...");
190                    return;
191            }
192    
193            unless ($size) {
194                    $log->logwarn("no records in database $arg->{path}, skipping...");
195                    return;
196            }
197    
198            my $offset = 1;
199            my $limit = $size;
200    
201            if (my $s = $self->{offset}) {
202                    $log->info("skipping to MFN $s");
203                    $offset = $s;
204            } else {
205                    $self->{offset} = $offset;
206            }
207    
208            if ($self->{limit}) {
209                    $log->info("limiting to ",$self->{limit}," records");
210                    $limit = $offset + $self->{limit} - 1;
211                    $limit = $size if ($limit > $size);
212            }
213    
214            # store size for later
215            $self->{size} = ($limit - $offset) ? ($limit - $offset + 1) : 0;
216    
217            $log->info("processing $self->{size} records in $code_page, convert to $self->{code_page}");
218    
219            # read database
220            for (my $pos = $offset; $pos <= $limit; $pos++) {
221    
222                    $log->debug("position: $pos\n");
223    
224                    my $rec = $self->fetch_rec( $db, $pos );
225    
226                    if (! $rec) {
227                            $log->warn("record $pos empty? skipping...");
228                            next;
229                    }
230    
231                    # store
232                    if ($self->{low_mem}) {
233                            $self->{db}->put($pos, $rec);
234                    } else {
235                            $self->{data}->{$pos} = $rec;
236                    }
237    
238                    # create lookup
239                    $self->{'lookup'}->add( $rec ) if ($rec && $self->{'lookup'});
240    
241                    $self->progress_bar($pos,$limit);
242    
243            }
244    
245            $self->{pos} = -1;
246            $self->{last_pcnt} = 0;
247    
248            # store max mfn and return it.
249            $self->{max_pos} = $limit;
250            $log->debug("max_pos: $limit");
251    
252            return $size;
253    }
254    
255    =head2 fetch
256    
257    Fetch next record from database. It will also displays progress bar.
258    
259     my $rec = $isis->fetch;
260    
261    Record from this function should probably go to C<data_structure> for
262    normalisation.
263    
264    =cut
265    
266    sub fetch {
267            my $self = shift;
268    
269            my $log = $self->_get_logger();
270    
271            $log->logconfess("it seems that you didn't load database!") unless ($self->{pos});
272    
273            if ($self->{pos} == -1) {
274                    $self->{pos} = $self->{offset};
275            } else {
276                    $self->{pos}++;
277            }
278    
279            my $mfn = $self->{pos};
280    
281            if ($mfn > $self->{max_pos}) {
282                    $self->{pos} = $self->{max_pos};
283                    $log->debug("at EOF");
284                    return;
285            }
286    
287            $self->progress_bar($mfn,$self->{max_pos});
288    
289            my $rec;
290    
291            if ($self->{low_mem}) {
292                    $rec = $self->{db}->get($mfn);
293            } else {
294                    $rec = $self->{data}->{$mfn};
295            }
296    
297            $rec ||= 0E0;
298    }
299    
300    =head2 pos
301    
302    Returns current record number (MFN).
303    
304     print $isis->pos;
305    
306    First record in database has position 1.
307    
308    =cut
309    
310    sub pos {
311            my $self = shift;
312            return $self->{pos};
313    }
314    
315    
316    =head2 size
317    
318    Returns number of records in database
319    
320     print $isis->size;
321    
322    Result from this function can be used to loop through all records
323    
324     foreach my $mfn ( 1 ... $isis->size ) { ... }
325    
326    because it takes into account C<offset> and C<limit>.
327    
328    =cut
329    
330    sub size {
331            my $self = shift;
332            return $self->{size};
333    }
334    
335    =head2 seek
336    
337    Seek to specified MFN in file.
338    
339     $isis->seek(42);
340    
341    First record in database has position 1.
342    
343    =cut
344    
345    sub seek {
346            my $self = shift;
347            my $pos = shift || return;
348    
349            my $log = $self->_get_logger();
350    
351            if ($pos < 1) {
352                    $log->warn("seek before first record");
353                    $pos = 1;
354            } elsif ($pos > $self->{max_pos}) {
355                    $log->warn("seek beyond last record");
356                    $pos = $self->{max_pos};
357            }
358    
359            return $self->{pos} = (($pos - 1) || -1);
360    }
361    
362    
363  =head1 MEMORY USAGE  =head1 MEMORY USAGE
364    
365  C<low_mem> options is double-edged sword. If enabled, WebPAC  C<low_mem> options is double-edged sword. If enabled, WebPAC

Legend:
Removed from v.11  
changed lines
  Added in v.287

  ViewVC Help
Powered by ViewVC 1.1.26