/[webpac2]/trunk/lib/WebPAC/Input.pm
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Diff of /trunk/lib/WebPAC/Input.pm

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 11 by dpavlin, Sat Jul 16 20:54:28 2005 UTC revision 289 by dpavlin, Sun Dec 18 22:16:44 2005 UTC
# Line 3  package WebPAC::Input; Line 3  package WebPAC::Input;
3  use warnings;  use warnings;
4  use strict;  use strict;
5    
6    use blib;
7    
8    use WebPAC::Common;
9    use base qw/WebPAC::Common/;
10    use Text::Iconv;
11    
12  =head1 NAME  =head1 NAME
13    
14  WebPAC::Input - core module for input file format  WebPAC::Input - read different file formats into WebPAC
15    
16  =head1 VERSION  =head1 VERSION
17    
18  Version 0.01  Version 0.03
19    
20  =cut  =cut
21    
22  our $VERSION = '0.01';  our $VERSION = '0.03';
23    
24  =head1 SYNOPSIS  =head1 SYNOPSIS
25    
26  This module will load particular loader module and execute it's functions.  This module implements input as database which have fixed and known
27    I<size> while indexing and single unique numeric identifier for database
28    position ranging from 1 to I<size>.
29    
30    Simply, something that is indexed by unmber from 1 .. I<size>.
31    
32    Examples of such databases are CDS/ISIS files, MARC files, lines in
33    text file, and so on.
34    
35    Specific file formats are implemented using low-level interface modules,
36    located in C<WebPAC::Input::*> namespace which export C<open_db>,
37    C<fetch_rec> and optional C<init> functions.
38    
39  Perhaps a little code snippet.  Perhaps a little code snippet.
40    
41      use WebPAC::Input;      use WebPAC::Input;
42    
43      my $db = WebPAC::Input->new(      my $db = WebPAC::Input->new(
44          format => 'NULL',          module => 'WebPAC::Input::ISIS',
45          config => $config,                  config => $config,
46          lookup => $lookup_obj,                  lookup => $lookup_obj,
47          low_mem => 1,                  low_mem => 1,
48      );      );
49    
50      $db->open('/path/to/database');      $db->open('/path/to/database');
51      print "database size: ",$db->size,"\n";      print "database size: ",$db->size,"\n";
52      while (my $row = $db->fetch) {      while (my $rec = $db->fetch) {
         ...  
53      }      }
54    
55    
56    
57  =head1 FUNCTIONS  =head1 FUNCTIONS
58    
59  =head2 new  =head2 new
# Line 43  Perhaps a little code snippet. Line 61  Perhaps a little code snippet.
61  Create new input database object.  Create new input database object.
62    
63    my $db = new WebPAC::Input(    my $db = new WebPAC::Input(
64          format => 'NULL'          module => 'WebPAC::Input::MARC',
65          code_page => 'ISO-8859-2',          code_page => 'ISO-8859-2',
66          low_mem => 1,          low_mem => 1,
67    );    );
68    
69    C<module> is low-level file format module. See L<WebPAC::Input::Isis> and
70    L<WebPAC::Input::MARC>.
71    
72  Optional parametar C<code_page> specify application code page (which will be  Optional parametar C<code_page> specify application code page (which will be
73  used internally). This should probably be your terminal encoding, and by  used internally). This should probably be your terminal encoding, and by
74  default, it C<ISO-8859-2>.  default, it C<ISO-8859-2>.
75    
76  Default is not to use C<low_mem> options (see L<MEMORY USAGE> below).  Default is not to use C<low_mem> options (see L<MEMORY USAGE> below).
77    
78    This function will also call low-level C<init> if it exists with same
79    parametars.
80    
81  =cut  =cut
82    
83  sub new {  sub new {
84          my $class = shift;          my $class = shift;
85          my $self = {@_};          my $self = {@_};
86          bless($self, $class);          bless($self, $class);
87    
         $self->{'code_page'} ||= 'ISO-8859-2';  
   
88          my $log = $self->_get_logger;          my $log = $self->_get_logger;
89    
90            $log->logconfess("specify low-level file format module") unless ($self->{module});
91            my $module = $self->{module};
92            $module =~ s#::#/#g;
93            $module .= '.pm';
94            $log->debug("require low-level module $self->{module} from $module");
95    
96            require $module;
97            #eval $self->{module} .'->import';
98    
99            # check if required subclasses are implemented
100            foreach my $subclass (qw/open_db fetch_rec init/) {
101                    my $n = $self->{module} . '::' . $subclass;
102                    if (! defined &{ $n }) {
103                            $log->logwarn("missing implementation of $subclass");
104                    } else {
105                            $self->{$subclass} = \&{ $n };
106                    }
107            }
108    
109            if ($self->{init}) {
110                    $log->debug("calling init");
111                    $self->{init}->($self, @_);
112            }
113    
114            $self->{'code_page'} ||= 'ISO-8859-2';
115    
116          # running with low_mem flag? well, use DBM::Deep then.          # running with low_mem flag? well, use DBM::Deep then.
117          if ($self->{'low_mem'}) {          if ($self->{'low_mem'}) {
118                  $log->info("running with low_mem which impacts performance (<32 Mb memory usage)");                  $log->info("running with low_mem which impacts performance (<32 Mb memory usage)");
# Line 94  sub new { Line 142  sub new {
142          $self ? return $self : return undef;          $self ? return $self : return undef;
143  }  }
144    
145    =head2 open
146    
147    This function will read whole database in memory and produce lookups.
148    
149     $input->open(
150            path => '/path/to/database/file',
151            code_page => '852',
152            limit => 500,
153            offset => 6000,
154            lookup => $lookup_obj,
155     );
156    
157    By default, C<code_page> is assumed to be C<852>.
158    
159    C<offset> is optional parametar to position at some offset before reading from database.
160    
161    C<limit> is optional parametar to read just C<limit> records from database
162    
163    Returns size of database, regardless of C<offset> and C<limit>
164    parametars, see also C<size>.
165    
166    =cut
167    
168    sub open {
169            my $self = shift;
170            my $arg = {@_};
171    
172            my $log = $self->_get_logger();
173    
174            $log->logcroak("need path") if (! $arg->{'path'});
175            my $code_page = $arg->{'code_page'} || '852';
176    
177            # store data in object
178            $self->{'code_page'} = $code_page;
179            foreach my $v (qw/path offset limit/) {
180                    $self->{$v} = $arg->{$v} if ($arg->{$v});
181            }
182    
183            # create Text::Iconv object
184            $self->{iconv} = Text::Iconv->new($code_page,$self->{'code_page'});
185    
186            my ($db, $size) = $self->{open_db}->( $self,
187                    path => $arg->{path},
188            );
189    
190            unless ($db) {
191                    $log->logwarn("can't open database $arg->{path}, skipping...");
192                    return;
193            }
194    
195            unless ($size) {
196                    $log->logwarn("no records in database $arg->{path}, skipping...");
197                    return;
198            }
199    
200            my $offset = 1;
201            my $limit = $size;
202    
203            if (my $s = $self->{offset}) {
204                    $log->info("skipping to MFN $s");
205                    $offset = $s;
206            } else {
207                    $self->{offset} = $offset;
208            }
209    
210            if ($self->{limit}) {
211                    $log->info("limiting to ",$self->{limit}," records");
212                    $limit = $offset + $self->{limit} - 1;
213                    $limit = $size if ($limit > $size);
214            }
215    
216            # store size for later
217            $self->{size} = ($limit - $offset) ? ($limit - $offset + 1) : 0;
218    
219            $log->info("processing $self->{size} records in $code_page, convert to $self->{code_page}");
220    
221            # read database
222            for (my $pos = $offset; $pos <= $limit; $pos++) {
223    
224                    $log->debug("position: $pos\n");
225    
226                    my $rec = $self->{fetch_rec}->($self, $db, $pos );
227    
228                    if (! $rec) {
229                            $log->warn("record $pos empty? skipping...");
230                            next;
231                    }
232    
233                    # store
234                    if ($self->{low_mem}) {
235                            $self->{db}->put($pos, $rec);
236                    } else {
237                            $self->{data}->{$pos} = $rec;
238                    }
239    
240                    # create lookup
241                    $self->{'lookup'}->add( $rec ) if ($rec && $self->{'lookup'});
242    
243                    $self->progress_bar($pos,$limit);
244    
245            }
246    
247            $self->{pos} = -1;
248            $self->{last_pcnt} = 0;
249    
250            # store max mfn and return it.
251            $self->{max_pos} = $limit;
252            $log->debug("max_pos: $limit");
253    
254            return $size;
255    }
256    
257    =head2 fetch
258    
259    Fetch next record from database. It will also displays progress bar.
260    
261     my $rec = $isis->fetch;
262    
263    Record from this function should probably go to C<data_structure> for
264    normalisation.
265    
266    =cut
267    
268    sub fetch {
269            my $self = shift;
270    
271            my $log = $self->_get_logger();
272    
273            $log->logconfess("it seems that you didn't load database!") unless ($self->{pos});
274    
275            if ($self->{pos} == -1) {
276                    $self->{pos} = $self->{offset};
277            } else {
278                    $self->{pos}++;
279            }
280    
281            my $mfn = $self->{pos};
282    
283            if ($mfn > $self->{max_pos}) {
284                    $self->{pos} = $self->{max_pos};
285                    $log->debug("at EOF");
286                    return;
287            }
288    
289            $self->progress_bar($mfn,$self->{max_pos});
290    
291            my $rec;
292    
293            if ($self->{low_mem}) {
294                    $rec = $self->{db}->get($mfn);
295            } else {
296                    $rec = $self->{data}->{$mfn};
297            }
298    
299            $rec ||= 0E0;
300    }
301    
302    =head2 pos
303    
304    Returns current record number (MFN).
305    
306     print $isis->pos;
307    
308    First record in database has position 1.
309    
310    =cut
311    
312    sub pos {
313            my $self = shift;
314            return $self->{pos};
315    }
316    
317    
318    =head2 size
319    
320    Returns number of records in database
321    
322     print $isis->size;
323    
324    Result from this function can be used to loop through all records
325    
326     foreach my $mfn ( 1 ... $isis->size ) { ... }
327    
328    because it takes into account C<offset> and C<limit>.
329    
330    =cut
331    
332    sub size {
333            my $self = shift;
334            return $self->{size};
335    }
336    
337    =head2 seek
338    
339    Seek to specified MFN in file.
340    
341     $isis->seek(42);
342    
343    First record in database has position 1.
344    
345    =cut
346    
347    sub seek {
348            my $self = shift;
349            my $pos = shift || return;
350    
351            my $log = $self->_get_logger();
352    
353            if ($pos < 1) {
354                    $log->warn("seek before first record");
355                    $pos = 1;
356            } elsif ($pos > $self->{max_pos}) {
357                    $log->warn("seek beyond last record");
358                    $pos = $self->{max_pos};
359            }
360    
361            return $self->{pos} = (($pos - 1) || -1);
362    }
363    
364    
365  =head1 MEMORY USAGE  =head1 MEMORY USAGE
366    
367  C<low_mem> options is double-edged sword. If enabled, WebPAC  C<low_mem> options is double-edged sword. If enabled, WebPAC

Legend:
Removed from v.11  
changed lines
  Added in v.289

  ViewVC Help
Powered by ViewVC 1.1.26