/[webpac2]/trunk/lib/WebPAC/Input.pm
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Diff of /trunk/lib/WebPAC/Input.pm

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 9 by dpavlin, Sat Jul 16 17:14:43 2005 UTC revision 307 by dpavlin, Tue Dec 20 00:03:04 2005 UTC
# Line 3  package WebPAC::Input; Line 3  package WebPAC::Input;
3  use warnings;  use warnings;
4  use strict;  use strict;
5    
6    use WebPAC::Common;
7    use base qw/WebPAC::Common/;
8    use Text::Iconv;
9    
10  =head1 NAME  =head1 NAME
11    
12  WebPAC::Input - core module for input file format  WebPAC::Input - read different file formats into WebPAC
13    
14  =head1 VERSION  =head1 VERSION
15    
16  Version 0.01  Version 0.03
17    
18  =cut  =cut
19    
20  our $VERSION = '0.01';  our $VERSION = '0.03';
21    
22  =head1 SYNOPSIS  =head1 SYNOPSIS
23    
24  This module will load particular loader module and execute it's functions.  This module implements input as database which have fixed and known
25    I<size> while indexing and single unique numeric identifier for database
26    position ranging from 1 to I<size>.
27    
28    Simply, something that is indexed by unmber from 1 .. I<size>.
29    
30    Examples of such databases are CDS/ISIS files, MARC files, lines in
31    text file, and so on.
32    
33    Specific file formats are implemented using low-level interface modules,
34    located in C<WebPAC::Input::*> namespace which export C<open_db>,
35    C<fetch_rec> and optional C<init> functions.
36    
37  Perhaps a little code snippet.  Perhaps a little code snippet.
38    
39      use WebPAC::Input;      use WebPAC::Input;
40    
41      my $db = WebPAC::Input->new(      my $db = WebPAC::Input->new(
42          format => 'NULL',          module => 'WebPAC::Input::ISIS',
43          config => $config,                  config => $config,
44          lookup => $lookup_obj,                  lookup => $lookup_obj,
45                    low_mem => 1,
46      );      );
47    
48      $db->open('/path/to/database');      $db->open('/path/to/database');
49      print "database size: ",$db->size,"\n";      print "database size: ",$db->size,"\n";
50      while (my $row = $db->fetch) {      while (my $rec = $db->fetch) {
         ...  
51      }      }
52      $db->close;  
53    
54    
55  =head1 FUNCTIONS  =head1 FUNCTIONS
56    
# Line 43  Perhaps a little code snippet. Line 59  Perhaps a little code snippet.
59  Create new input database object.  Create new input database object.
60    
61    my $db = new WebPAC::Input(    my $db = new WebPAC::Input(
62          format => 'NULL'          module => 'WebPAC::Input::MARC',
63          code_page => 'ISO-8859-2',          code_page => 'ISO-8859-2',
64            low_mem => 1,
65    );    );
66    
67    C<module> is low-level file format module. See L<WebPAC::Input::Isis> and
68    L<WebPAC::Input::MARC>.
69    
70  Optional parametar C<code_page> specify application code page (which will be  Optional parametar C<code_page> specify application code page (which will be
71  used internally). This should probably be your terminal encoding, and by  used internally). This should probably be your terminal encoding, and by
72  default, it C<ISO-8859-2>.  default, it C<ISO-8859-2>.
73    
74    Default is not to use C<low_mem> options (see L<MEMORY USAGE> below).
75    
76    This function will also call low-level C<init> if it exists with same
77    parametars.
78    
79  =cut  =cut
80    
81  sub new {  sub new {
82          my $class = shift;          my $class = shift;
83          my $self = {@_};          my $self = {@_};
84          bless($self, $class);          bless($self, $class);
85    
86            my $log = $self->_get_logger;
87    
88            $log->logconfess("specify low-level file format module") unless ($self->{module});
89            my $module = $self->{module};
90            $module =~ s#::#/#g;
91            $module .= '.pm';
92            $log->debug("require low-level module $self->{module} from $module");
93    
94            require $module;
95            #eval $self->{module} .'->import';
96    
97            # check if required subclasses are implemented
98            foreach my $subclass (qw/open_db fetch_rec init/) {
99                    my $n = $self->{module} . '::' . $subclass;
100                    if (! defined &{ $n }) {
101                            my $missing = "missing $subclass in $self->{module}";
102                            $self->{$subclass} = sub { $log->logwarn($missing) };
103                    } else {
104                            $self->{$subclass} = \&{ $n };
105                    }
106            }
107    
108            if ($self->{init}) {
109                    $log->debug("calling init");
110                    $self->{init}->($self, @_);
111            }
112    
113          $self->{'code_page'} ||= 'ISO-8859-2';          $self->{'code_page'} ||= 'ISO-8859-2';
114    
115            # running with low_mem flag? well, use DBM::Deep then.
116            if ($self->{'low_mem'}) {
117                    $log->info("running with low_mem which impacts performance (<32 Mb memory usage)");
118    
119                    my $db_file = "data.db";
120    
121                    if (-e $db_file) {
122                            unlink $db_file or $log->logdie("can't remove '$db_file' from last run");
123                            $log->debug("removed '$db_file' from last run");
124                    }
125    
126                    require DBM::Deep;
127    
128                    my $db = new DBM::Deep $db_file;
129    
130                    $log->logdie("DBM::Deep error: $!") unless ($db);
131    
132                    if ($db->error()) {
133                            $log->logdie("can't open '$db_file' under low_mem: ",$db->error());
134                    } else {
135                            $log->debug("using file '$db_file' for DBM::Deep");
136                    }
137    
138                    $self->{'db'} = $db;
139            }
140    
141          $self ? return $self : return undef;          $self ? return $self : return undef;
142  }  }
143    
144    =head2 open
145    
146    This function will read whole database in memory and produce lookups.
147    
148     $input->open(
149            path => '/path/to/database/file',
150            code_page => '852',
151            limit => 500,
152            offset => 6000,
153            lookup => $lookup_obj,
154     );
155    
156    By default, C<code_page> is assumed to be C<852>.
157    
158    C<offset> is optional parametar to position at some offset before reading from database.
159    
160    C<limit> is optional parametar to read just C<limit> records from database
161    
162    Returns size of database, regardless of C<offset> and C<limit>
163    parametars, see also C<size>.
164    
165    =cut
166    
167    sub open {
168            my $self = shift;
169            my $arg = {@_};
170    
171            my $log = $self->_get_logger();
172    
173            $log->logcroak("need path") if (! $arg->{'path'});
174            my $code_page = $arg->{'code_page'} || '852';
175    
176            # store data in object
177            $self->{'input_code_page'} = $code_page;
178            foreach my $v (qw/path offset limit/) {
179                    $self->{$v} = $arg->{$v} if ($arg->{$v});
180            }
181    
182            # create Text::Iconv object
183            $self->{iconv} = Text::Iconv->new($code_page,$self->{'code_page'});
184    
185            my ($db, $size) = $self->{open_db}->( $self,
186                    path => $arg->{path},
187            );
188    
189            unless ($db) {
190                    $log->logwarn("can't open database $arg->{path}, skipping...");
191                    return;
192            }
193    
194            unless ($size) {
195                    $log->logwarn("no records in database $arg->{path}, skipping...");
196                    return;
197            }
198    
199            my $offset = 1;
200            my $limit = $size;
201    
202            if (my $s = $self->{offset}) {
203                    $log->info("skipping to MFN $s");
204                    $offset = $s;
205            } else {
206                    $self->{offset} = $offset;
207            }
208    
209            if ($self->{limit}) {
210                    $log->debug("limiting to ",$self->{limit}," records");
211                    $limit = $offset + $self->{limit} - 1;
212                    $limit = $size if ($limit > $size);
213            }
214    
215            # store size for later
216            $self->{size} = ($limit - $offset) ? ($limit - $offset + 1) : 0;
217    
218            $log->info("processing $self->{size} records in $code_page, convert to $self->{code_page}");
219    
220            # read database
221            for (my $pos = $offset; $pos <= $limit; $pos++) {
222    
223                    $log->debug("position: $pos\n");
224    
225                    my $rec = $self->{fetch_rec}->($self, $db, $pos );
226    
227                    if (! $rec) {
228                            $log->warn("record $pos empty? skipping...");
229                            next;
230                    }
231    
232                    # store
233                    if ($self->{low_mem}) {
234                            $self->{db}->put($pos, $rec);
235                    } else {
236                            $self->{data}->{$pos} = $rec;
237                    }
238    
239                    # create lookup
240                    $self->{'lookup'}->add( $rec ) if ($rec && $self->{'lookup'});
241    
242                    $self->progress_bar($pos,$limit);
243    
244            }
245    
246            $self->{pos} = -1;
247            $self->{last_pcnt} = 0;
248    
249            # store max mfn and return it.
250            $self->{max_pos} = $limit;
251            $log->debug("max_pos: $limit");
252    
253            return $size;
254    }
255    
256    =head2 fetch
257    
258    Fetch next record from database. It will also displays progress bar.
259    
260     my $rec = $isis->fetch;
261    
262    Record from this function should probably go to C<data_structure> for
263    normalisation.
264    
265    =cut
266    
267    sub fetch {
268            my $self = shift;
269    
270            my $log = $self->_get_logger();
271    
272            $log->logconfess("it seems that you didn't load database!") unless ($self->{pos});
273    
274            if ($self->{pos} == -1) {
275                    $self->{pos} = $self->{offset};
276            } else {
277                    $self->{pos}++;
278            }
279    
280            my $mfn = $self->{pos};
281    
282            if ($mfn > $self->{max_pos}) {
283                    $self->{pos} = $self->{max_pos};
284                    $log->debug("at EOF");
285                    return;
286            }
287    
288            $self->progress_bar($mfn,$self->{max_pos});
289    
290            my $rec;
291    
292            if ($self->{low_mem}) {
293                    $rec = $self->{db}->get($mfn);
294            } else {
295                    $rec = $self->{data}->{$mfn};
296            }
297    
298            $rec ||= 0E0;
299    }
300    
301    =head2 pos
302    
303    Returns current record number (MFN).
304    
305     print $isis->pos;
306    
307    First record in database has position 1.
308    
309    =cut
310    
311    sub pos {
312            my $self = shift;
313            return $self->{pos};
314    }
315    
316    
317    =head2 size
318    
319    Returns number of records in database
320    
321     print $isis->size;
322    
323    Result from this function can be used to loop through all records
324    
325     foreach my $mfn ( 1 ... $isis->size ) { ... }
326    
327    because it takes into account C<offset> and C<limit>.
328    
329    =cut
330    
331    sub size {
332            my $self = shift;
333            return $self->{size};
334    }
335    
336    =head2 seek
337    
338    Seek to specified MFN in file.
339    
340     $isis->seek(42);
341    
342    First record in database has position 1.
343    
344    =cut
345    
346    sub seek {
347            my $self = shift;
348            my $pos = shift || return;
349    
350            my $log = $self->_get_logger();
351    
352            if ($pos < 1) {
353                    $log->warn("seek before first record");
354                    $pos = 1;
355            } elsif ($pos > $self->{max_pos}) {
356                    $log->warn("seek beyond last record");
357                    $pos = $self->{max_pos};
358            }
359    
360            return $self->{pos} = (($pos - 1) || -1);
361    }
362    
363    
364  =head1 MEMORY USAGE  =head1 MEMORY USAGE
365    
366  C<low_mem> options is double-edged sword. If enabled, WebPAC  C<low_mem> options is double-edged sword. If enabled, WebPAC

Legend:
Removed from v.9  
changed lines
  Added in v.307

  ViewVC Help
Powered by ViewVC 1.1.26