/[webpac2]/trunk/lib/WebPAC/Input.pm
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Diff of /trunk/lib/WebPAC/Input.pm

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 10 by dpavlin, Sat Jul 16 20:35:30 2005 UTC revision 301 by dpavlin, Mon Dec 19 21:26:04 2005 UTC
# Line 3  package WebPAC::Input; Line 3  package WebPAC::Input;
3  use warnings;  use warnings;
4  use strict;  use strict;
5    
6    use blib;
7    
8    use WebPAC::Common;
9    use base qw/WebPAC::Common/;
10    use Text::Iconv;
11    
12  =head1 NAME  =head1 NAME
13    
14  WebPAC::Input - core module for input file format  WebPAC::Input - read different file formats into WebPAC
15    
16  =head1 VERSION  =head1 VERSION
17    
18  Version 0.01  Version 0.03
19    
20  =cut  =cut
21    
22  our $VERSION = '0.01';  our $VERSION = '0.03';
23    
24  =head1 SYNOPSIS  =head1 SYNOPSIS
25    
26  This module will load particular loader module and execute it's functions.  This module implements input as database which have fixed and known
27    I<size> while indexing and single unique numeric identifier for database
28    position ranging from 1 to I<size>.
29    
30    Simply, something that is indexed by unmber from 1 .. I<size>.
31    
32    Examples of such databases are CDS/ISIS files, MARC files, lines in
33    text file, and so on.
34    
35    Specific file formats are implemented using low-level interface modules,
36    located in C<WebPAC::Input::*> namespace which export C<open_db>,
37    C<fetch_rec> and optional C<init> functions.
38    
39  Perhaps a little code snippet.  Perhaps a little code snippet.
40    
41      use WebPAC::Input;      use WebPAC::Input;
42    
43      my $db = WebPAC::Input->new(      my $db = WebPAC::Input->new(
44          format => 'NULL',          module => 'WebPAC::Input::ISIS',
45          config => $config,                  config => $config,
46          lookup => $lookup_obj,                  lookup => $lookup_obj,
47          low_mem => 1,                  low_mem => 1,
48      );      );
49    
50      $db->open('/path/to/database');      $db->open('/path/to/database');
51      print "database size: ",$db->size,"\n";      print "database size: ",$db->size,"\n";
52      while (my $row = $db->fetch) {      while (my $rec = $db->fetch) {
         ...  
53      }      }
54      $db->close;  
55    
56    
57  =head1 FUNCTIONS  =head1 FUNCTIONS
58    
# Line 44  Perhaps a little code snippet. Line 61  Perhaps a little code snippet.
61  Create new input database object.  Create new input database object.
62    
63    my $db = new WebPAC::Input(    my $db = new WebPAC::Input(
64          format => 'NULL'          module => 'WebPAC::Input::MARC',
65          code_page => 'ISO-8859-2',          code_page => 'ISO-8859-2',
66          low_mem => 1,          low_mem => 1,
67    );    );
68    
69    C<module> is low-level file format module. See L<WebPAC::Input::Isis> and
70    L<WebPAC::Input::MARC>.
71    
72  Optional parametar C<code_page> specify application code page (which will be  Optional parametar C<code_page> specify application code page (which will be
73  used internally). This should probably be your terminal encoding, and by  used internally). This should probably be your terminal encoding, and by
74  default, it C<ISO-8859-2>.  default, it C<ISO-8859-2>.
75    
76  Default is not to use C<low_mem> options (see L<MEMORY USAGE> below).  Default is not to use C<low_mem> options (see L<MEMORY USAGE> below).
77    
78    This function will also call low-level C<init> if it exists with same
79    parametars.
80    
81  =cut  =cut
82    
83  sub new {  sub new {
84          my $class = shift;          my $class = shift;
85          my $self = {@_};          my $self = {@_};
86          bless($self, $class);          bless($self, $class);
87    
         $self->{'code_page'} ||= 'ISO-8859-2';  
   
88          my $log = $self->_get_logger;          my $log = $self->_get_logger;
89    
90            $log->logconfess("specify low-level file format module") unless ($self->{module});
91            my $module = $self->{module};
92            $module =~ s#::#/#g;
93            $module .= '.pm';
94            $log->debug("require low-level module $self->{module} from $module");
95    
96            require $module;
97            #eval $self->{module} .'->import';
98    
99            # check if required subclasses are implemented
100            foreach my $subclass (qw/open_db fetch_rec init/) {
101                    my $n = $self->{module} . '::' . $subclass;
102                    if (! defined &{ $n }) {
103                            my $missing = "missing $subclass in $self->{module}";
104                            $self->{$subclass} = sub { $log->logwarn($missing) };
105                    } else {
106                            $self->{$subclass} = \&{ $n };
107                    }
108            }
109    
110            if ($self->{init}) {
111                    $log->debug("calling init");
112                    $self->{init}->($self, @_);
113            }
114    
115            $self->{'code_page'} ||= 'ISO-8859-2';
116    
117          # running with low_mem flag? well, use DBM::Deep then.          # running with low_mem flag? well, use DBM::Deep then.
118          if ($self->{'low_mem'}) {          if ($self->{'low_mem'}) {
119                  $log->info("running with low_mem which impacts performance (<32 Mb memory usage)");                  $log->info("running with low_mem which impacts performance (<32 Mb memory usage)");
# Line 95  sub new { Line 143  sub new {
143          $self ? return $self : return undef;          $self ? return $self : return undef;
144  }  }
145    
146    =head2 open
147    
148    This function will read whole database in memory and produce lookups.
149    
150     $input->open(
151            path => '/path/to/database/file',
152            code_page => '852',
153            limit => 500,
154            offset => 6000,
155            lookup => $lookup_obj,
156     );
157    
158    By default, C<code_page> is assumed to be C<852>.
159    
160    C<offset> is optional parametar to position at some offset before reading from database.
161    
162    C<limit> is optional parametar to read just C<limit> records from database
163    
164    Returns size of database, regardless of C<offset> and C<limit>
165    parametars, see also C<size>.
166    
167    =cut
168    
169    sub open {
170            my $self = shift;
171            my $arg = {@_};
172    
173            my $log = $self->_get_logger();
174    
175            $log->logcroak("need path") if (! $arg->{'path'});
176            my $code_page = $arg->{'code_page'} || '852';
177    
178            # store data in object
179            $self->{'input_code_page'} = $code_page;
180            foreach my $v (qw/path offset limit/) {
181                    $self->{$v} = $arg->{$v} if ($arg->{$v});
182            }
183    
184            # create Text::Iconv object
185            $self->{iconv} = Text::Iconv->new($code_page,$self->{'code_page'});
186    
187            my ($db, $size) = $self->{open_db}->( $self,
188                    path => $arg->{path},
189            );
190    
191            unless ($db) {
192                    $log->logwarn("can't open database $arg->{path}, skipping...");
193                    return;
194            }
195    
196            unless ($size) {
197                    $log->logwarn("no records in database $arg->{path}, skipping...");
198                    return;
199            }
200    
201            my $offset = 1;
202            my $limit = $size;
203    
204            if (my $s = $self->{offset}) {
205                    $log->info("skipping to MFN $s");
206                    $offset = $s;
207            } else {
208                    $self->{offset} = $offset;
209            }
210    
211            if ($self->{limit}) {
212                    $log->debug("limiting to ",$self->{limit}," records");
213                    $limit = $offset + $self->{limit} - 1;
214                    $limit = $size if ($limit > $size);
215            }
216    
217            # store size for later
218            $self->{size} = ($limit - $offset) ? ($limit - $offset + 1) : 0;
219    
220            $log->info("processing $self->{size} records in $code_page, convert to $self->{code_page}");
221    
222            # read database
223            for (my $pos = $offset; $pos <= $limit; $pos++) {
224    
225                    $log->debug("position: $pos\n");
226    
227                    my $rec = $self->{fetch_rec}->($self, $db, $pos );
228    
229                    if (! $rec) {
230                            $log->warn("record $pos empty? skipping...");
231                            next;
232                    }
233    
234                    # store
235                    if ($self->{low_mem}) {
236                            $self->{db}->put($pos, $rec);
237                    } else {
238                            $self->{data}->{$pos} = $rec;
239                    }
240    
241                    # create lookup
242                    $self->{'lookup'}->add( $rec ) if ($rec && $self->{'lookup'});
243    
244                    $self->progress_bar($pos,$limit);
245    
246            }
247    
248            $self->{pos} = -1;
249            $self->{last_pcnt} = 0;
250    
251            # store max mfn and return it.
252            $self->{max_pos} = $limit;
253            $log->debug("max_pos: $limit");
254    
255            return $size;
256    }
257    
258    =head2 fetch
259    
260    Fetch next record from database. It will also displays progress bar.
261    
262     my $rec = $isis->fetch;
263    
264    Record from this function should probably go to C<data_structure> for
265    normalisation.
266    
267    =cut
268    
269    sub fetch {
270            my $self = shift;
271    
272            my $log = $self->_get_logger();
273    
274            $log->logconfess("it seems that you didn't load database!") unless ($self->{pos});
275    
276            if ($self->{pos} == -1) {
277                    $self->{pos} = $self->{offset};
278            } else {
279                    $self->{pos}++;
280            }
281    
282            my $mfn = $self->{pos};
283    
284            if ($mfn > $self->{max_pos}) {
285                    $self->{pos} = $self->{max_pos};
286                    $log->debug("at EOF");
287                    return;
288            }
289    
290            $self->progress_bar($mfn,$self->{max_pos});
291    
292            my $rec;
293    
294            if ($self->{low_mem}) {
295                    $rec = $self->{db}->get($mfn);
296            } else {
297                    $rec = $self->{data}->{$mfn};
298            }
299    
300            $rec ||= 0E0;
301    }
302    
303    =head2 pos
304    
305    Returns current record number (MFN).
306    
307     print $isis->pos;
308    
309    First record in database has position 1.
310    
311    =cut
312    
313    sub pos {
314            my $self = shift;
315            return $self->{pos};
316    }
317    
318    
319    =head2 size
320    
321    Returns number of records in database
322    
323     print $isis->size;
324    
325    Result from this function can be used to loop through all records
326    
327     foreach my $mfn ( 1 ... $isis->size ) { ... }
328    
329    because it takes into account C<offset> and C<limit>.
330    
331    =cut
332    
333    sub size {
334            my $self = shift;
335            return $self->{size};
336    }
337    
338    =head2 seek
339    
340    Seek to specified MFN in file.
341    
342     $isis->seek(42);
343    
344    First record in database has position 1.
345    
346    =cut
347    
348    sub seek {
349            my $self = shift;
350            my $pos = shift || return;
351    
352            my $log = $self->_get_logger();
353    
354            if ($pos < 1) {
355                    $log->warn("seek before first record");
356                    $pos = 1;
357            } elsif ($pos > $self->{max_pos}) {
358                    $log->warn("seek beyond last record");
359                    $pos = $self->{max_pos};
360            }
361    
362            return $self->{pos} = (($pos - 1) || -1);
363    }
364    
365    
366  =head1 MEMORY USAGE  =head1 MEMORY USAGE
367    
368  C<low_mem> options is double-edged sword. If enabled, WebPAC  C<low_mem> options is double-edged sword. If enabled, WebPAC

Legend:
Removed from v.10  
changed lines
  Added in v.301

  ViewVC Help
Powered by ViewVC 1.1.26