/[webpac2]/trunk/lib/WebPAC/Input.pm
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Diff of /trunk/lib/WebPAC/Input.pm

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 10 by dpavlin, Sat Jul 16 20:35:30 2005 UTC revision 292 by dpavlin, Sun Dec 18 23:34:30 2005 UTC
# Line 3  package WebPAC::Input; Line 3  package WebPAC::Input;
3  use warnings;  use warnings;
4  use strict;  use strict;
5    
6    use blib;
7    
8    use WebPAC::Common;
9    use base qw/WebPAC::Common/;
10    use Text::Iconv;
11    
12  =head1 NAME  =head1 NAME
13    
14  WebPAC::Input - core module for input file format  WebPAC::Input - read different file formats into WebPAC
15    
16  =head1 VERSION  =head1 VERSION
17    
18  Version 0.01  Version 0.03
19    
20  =cut  =cut
21    
22  our $VERSION = '0.01';  our $VERSION = '0.03';
23    
24  =head1 SYNOPSIS  =head1 SYNOPSIS
25    
26  This module will load particular loader module and execute it's functions.  This module implements input as database which have fixed and known
27    I<size> while indexing and single unique numeric identifier for database
28    position ranging from 1 to I<size>.
29    
30    Simply, something that is indexed by unmber from 1 .. I<size>.
31    
32    Examples of such databases are CDS/ISIS files, MARC files, lines in
33    text file, and so on.
34    
35    Specific file formats are implemented using low-level interface modules,
36    located in C<WebPAC::Input::*> namespace which export C<open_db>,
37    C<fetch_rec> and optional C<init> functions.
38    
39  Perhaps a little code snippet.  Perhaps a little code snippet.
40    
41      use WebPAC::Input;      use WebPAC::Input;
42    
43      my $db = WebPAC::Input->new(      my $db = WebPAC::Input->new(
44          format => 'NULL',          module => 'WebPAC::Input::ISIS',
45          config => $config,                  config => $config,
46          lookup => $lookup_obj,                  lookup => $lookup_obj,
47          low_mem => 1,                  low_mem => 1,
48      );      );
49    
50      $db->open('/path/to/database');      $db->open('/path/to/database');
51      print "database size: ",$db->size,"\n";      print "database size: ",$db->size,"\n";
52      while (my $row = $db->fetch) {      while (my $rec = $db->fetch) {
         ...  
53      }      }
54      $db->close;  
55    
56    
57  =head1 FUNCTIONS  =head1 FUNCTIONS
58    
# Line 44  Perhaps a little code snippet. Line 61  Perhaps a little code snippet.
61  Create new input database object.  Create new input database object.
62    
63    my $db = new WebPAC::Input(    my $db = new WebPAC::Input(
64          format => 'NULL'          module => 'WebPAC::Input::MARC',
65          code_page => 'ISO-8859-2',          code_page => 'ISO-8859-2',
66          low_mem => 1,          low_mem => 1,
67    );    );
68    
69    C<module> is low-level file format module. See L<WebPAC::Input::Isis> and
70    L<WebPAC::Input::MARC>.
71    
72  Optional parametar C<code_page> specify application code page (which will be  Optional parametar C<code_page> specify application code page (which will be
73  used internally). This should probably be your terminal encoding, and by  used internally). This should probably be your terminal encoding, and by
74  default, it C<ISO-8859-2>.  default, it C<ISO-8859-2>.
75    
76  Default is not to use C<low_mem> options (see L<MEMORY USAGE> below).  Default is not to use C<low_mem> options (see L<MEMORY USAGE> below).
77    
78    This function will also call low-level C<init> if it exists with same
79    parametars.
80    
81  =cut  =cut
82    
83  sub new {  sub new {
84          my $class = shift;          my $class = shift;
85          my $self = {@_};          my $self = {@_};
86          bless($self, $class);          bless($self, $class);
87    
         $self->{'code_page'} ||= 'ISO-8859-2';  
   
88          my $log = $self->_get_logger;          my $log = $self->_get_logger;
89    
90            $log->logconfess("specify low-level file format module") unless ($self->{module});
91            my $module = $self->{module};
92            $module =~ s#::#/#g;
93            $module .= '.pm';
94            $log->debug("require low-level module $self->{module} from $module");
95    
96            require $module;
97            #eval $self->{module} .'->import';
98    
99            # check if required subclasses are implemented
100            foreach my $subclass (qw/open_db fetch_rec init/) {
101                    my $n = $self->{module} . '::' . $subclass;
102                    if (! defined &{ $n }) {
103                            my $missing = "missing $subclass in $self->{module}";
104                            $log->logwarn($missing);
105                            $self->{$subclass} = sub { warn "$missing\n" };
106                    } else {
107                            $self->{$subclass} = \&{ $n };
108                    }
109            }
110    
111            if ($self->{init}) {
112                    $log->debug("calling init");
113                    $self->{init}->($self, @_);
114            }
115    
116            $self->{'code_page'} ||= 'ISO-8859-2';
117    
118          # running with low_mem flag? well, use DBM::Deep then.          # running with low_mem flag? well, use DBM::Deep then.
119          if ($self->{'low_mem'}) {          if ($self->{'low_mem'}) {
120                  $log->info("running with low_mem which impacts performance (<32 Mb memory usage)");                  $log->info("running with low_mem which impacts performance (<32 Mb memory usage)");
# Line 95  sub new { Line 144  sub new {
144          $self ? return $self : return undef;          $self ? return $self : return undef;
145  }  }
146    
147    =head2 open
148    
149    This function will read whole database in memory and produce lookups.
150    
151     $input->open(
152            path => '/path/to/database/file',
153            code_page => '852',
154            limit => 500,
155            offset => 6000,
156            lookup => $lookup_obj,
157     );
158    
159    By default, C<code_page> is assumed to be C<852>.
160    
161    C<offset> is optional parametar to position at some offset before reading from database.
162    
163    C<limit> is optional parametar to read just C<limit> records from database
164    
165    Returns size of database, regardless of C<offset> and C<limit>
166    parametars, see also C<size>.
167    
168    =cut
169    
170    sub open {
171            my $self = shift;
172            my $arg = {@_};
173    
174            my $log = $self->_get_logger();
175    
176            $log->logcroak("need path") if (! $arg->{'path'});
177            my $code_page = $arg->{'code_page'} || '852';
178    
179            # store data in object
180            $self->{'input_code_page'} = $code_page;
181            foreach my $v (qw/path offset limit/) {
182                    $self->{$v} = $arg->{$v} if ($arg->{$v});
183            }
184    
185            # create Text::Iconv object
186            $self->{iconv} = Text::Iconv->new($code_page,$self->{'code_page'});
187    
188            my ($db, $size) = $self->{open_db}->( $self,
189                    path => $arg->{path},
190            );
191    
192            unless ($db) {
193                    $log->logwarn("can't open database $arg->{path}, skipping...");
194                    return;
195            }
196    
197            unless ($size) {
198                    $log->logwarn("no records in database $arg->{path}, skipping...");
199                    return;
200            }
201    
202            my $offset = 1;
203            my $limit = $size;
204    
205            if (my $s = $self->{offset}) {
206                    $log->info("skipping to MFN $s");
207                    $offset = $s;
208            } else {
209                    $self->{offset} = $offset;
210            }
211    
212            if ($self->{limit}) {
213                    $log->info("limiting to ",$self->{limit}," records");
214                    $limit = $offset + $self->{limit} - 1;
215                    $limit = $size if ($limit > $size);
216            }
217    
218            # store size for later
219            $self->{size} = ($limit - $offset) ? ($limit - $offset + 1) : 0;
220    
221            $log->info("processing $self->{size} records in $code_page, convert to $self->{code_page}");
222    
223            # read database
224            for (my $pos = $offset; $pos <= $limit; $pos++) {
225    
226                    $log->debug("position: $pos\n");
227    
228                    my $rec = $self->{fetch_rec}->($self, $db, $pos );
229    
230                    if (! $rec) {
231                            $log->warn("record $pos empty? skipping...");
232                            next;
233                    }
234    
235                    # store
236                    if ($self->{low_mem}) {
237                            $self->{db}->put($pos, $rec);
238                    } else {
239                            $self->{data}->{$pos} = $rec;
240                    }
241    
242                    # create lookup
243                    $self->{'lookup'}->add( $rec ) if ($rec && $self->{'lookup'});
244    
245                    $self->progress_bar($pos,$limit);
246    
247            }
248    
249            $self->{pos} = -1;
250            $self->{last_pcnt} = 0;
251    
252            # store max mfn and return it.
253            $self->{max_pos} = $limit;
254            $log->debug("max_pos: $limit");
255    
256            return $size;
257    }
258    
259    =head2 fetch
260    
261    Fetch next record from database. It will also displays progress bar.
262    
263     my $rec = $isis->fetch;
264    
265    Record from this function should probably go to C<data_structure> for
266    normalisation.
267    
268    =cut
269    
270    sub fetch {
271            my $self = shift;
272    
273            my $log = $self->_get_logger();
274    
275            $log->logconfess("it seems that you didn't load database!") unless ($self->{pos});
276    
277            if ($self->{pos} == -1) {
278                    $self->{pos} = $self->{offset};
279            } else {
280                    $self->{pos}++;
281            }
282    
283            my $mfn = $self->{pos};
284    
285            if ($mfn > $self->{max_pos}) {
286                    $self->{pos} = $self->{max_pos};
287                    $log->debug("at EOF");
288                    return;
289            }
290    
291            $self->progress_bar($mfn,$self->{max_pos});
292    
293            my $rec;
294    
295            if ($self->{low_mem}) {
296                    $rec = $self->{db}->get($mfn);
297            } else {
298                    $rec = $self->{data}->{$mfn};
299            }
300    
301            $rec ||= 0E0;
302    }
303    
304    =head2 pos
305    
306    Returns current record number (MFN).
307    
308     print $isis->pos;
309    
310    First record in database has position 1.
311    
312    =cut
313    
314    sub pos {
315            my $self = shift;
316            return $self->{pos};
317    }
318    
319    
320    =head2 size
321    
322    Returns number of records in database
323    
324     print $isis->size;
325    
326    Result from this function can be used to loop through all records
327    
328     foreach my $mfn ( 1 ... $isis->size ) { ... }
329    
330    because it takes into account C<offset> and C<limit>.
331    
332    =cut
333    
334    sub size {
335            my $self = shift;
336            return $self->{size};
337    }
338    
339    =head2 seek
340    
341    Seek to specified MFN in file.
342    
343     $isis->seek(42);
344    
345    First record in database has position 1.
346    
347    =cut
348    
349    sub seek {
350            my $self = shift;
351            my $pos = shift || return;
352    
353            my $log = $self->_get_logger();
354    
355            if ($pos < 1) {
356                    $log->warn("seek before first record");
357                    $pos = 1;
358            } elsif ($pos > $self->{max_pos}) {
359                    $log->warn("seek beyond last record");
360                    $pos = $self->{max_pos};
361            }
362    
363            return $self->{pos} = (($pos - 1) || -1);
364    }
365    
366    
367  =head1 MEMORY USAGE  =head1 MEMORY USAGE
368    
369  C<low_mem> options is double-edged sword. If enabled, WebPAC  C<low_mem> options is double-edged sword. If enabled, WebPAC

Legend:
Removed from v.10  
changed lines
  Added in v.292

  ViewVC Help
Powered by ViewVC 1.1.26