/[webpac2]/trunk/lib/WebPAC/Input.pm
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Diff of /trunk/lib/WebPAC/Input.pm

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 11 by dpavlin, Sat Jul 16 20:54:28 2005 UTC revision 416 by dpavlin, Sun Feb 26 23:21:50 2006 UTC
# Line 3  package WebPAC::Input; Line 3  package WebPAC::Input;
3  use warnings;  use warnings;
4  use strict;  use strict;
5    
6    use WebPAC::Common;
7    use base qw/WebPAC::Common/;
8    use Text::Iconv;
9    use Data::Dumper;
10    
11  =head1 NAME  =head1 NAME
12    
13  WebPAC::Input - core module for input file format  WebPAC::Input - read different file formats into WebPAC
14    
15  =head1 VERSION  =head1 VERSION
16    
17  Version 0.01  Version 0.04
18    
19  =cut  =cut
20    
21  our $VERSION = '0.01';  our $VERSION = '0.04';
22    
23  =head1 SYNOPSIS  =head1 SYNOPSIS
24    
25  This module will load particular loader module and execute it's functions.  This module implements input as database which have fixed and known
26    I<size> while indexing and single unique numeric identifier for database
27    position ranging from 1 to I<size>.
28    
29    Simply, something that is indexed by unmber from 1 .. I<size>.
30    
31    Examples of such databases are CDS/ISIS files, MARC files, lines in
32    text file, and so on.
33    
34    Specific file formats are implemented using low-level interface modules,
35    located in C<WebPAC::Input::*> namespace which export C<open_db>,
36    C<fetch_rec> and optional C<init> functions.
37    
38  Perhaps a little code snippet.  Perhaps a little code snippet.
39    
40      use WebPAC::Input;      use WebPAC::Input;
41    
42      my $db = WebPAC::Input->new(      my $db = WebPAC::Input->new(
43          format => 'NULL',          module => 'WebPAC::Input::ISIS',
44          config => $config,                  config => $config,
45          lookup => $lookup_obj,                  lookup => $lookup_obj,
46          low_mem => 1,                  low_mem => 1,
47      );      );
48    
49      $db->open('/path/to/database');      $db->open('/path/to/database');
50      print "database size: ",$db->size,"\n";          print "database size: ",$db->size,"\n";
51      while (my $row = $db->fetch) {          while (my $rec = $db->fetch) {
52          ...                  # do something with $rec
53      }          }
54    
55    
56    
57  =head1 FUNCTIONS  =head1 FUNCTIONS
58    
# Line 43  Perhaps a little code snippet. Line 61  Perhaps a little code snippet.
61  Create new input database object.  Create new input database object.
62    
63    my $db = new WebPAC::Input(    my $db = new WebPAC::Input(
64          format => 'NULL'          module => 'WebPAC::Input::MARC',
65          code_page => 'ISO-8859-2',          code_page => 'ISO-8859-2',
66          low_mem => 1,          low_mem => 1,
67            recode => 'char pairs',
68    );    );
69    
70    C<module> is low-level file format module. See L<WebPAC::Input::Isis> and
71    L<WebPAC::Input::MARC>.
72    
73  Optional parametar C<code_page> specify application code page (which will be  Optional parametar C<code_page> specify application code page (which will be
74  used internally). This should probably be your terminal encoding, and by  used internally). This should probably be your terminal encoding, and by
75  default, it C<ISO-8859-2>.  default, it C<ISO-8859-2>.
76    
77  Default is not to use C<low_mem> options (see L<MEMORY USAGE> below).  Default is not to use C<low_mem> options (see L<MEMORY USAGE> below).
78    
79    This function will also call low-level C<init> if it exists with same
80    parametars.
81    
82  =cut  =cut
83    
84  sub new {  sub new {
85          my $class = shift;          my $class = shift;
86          my $self = {@_};          my $self = {@_};
87          bless($self, $class);          bless($self, $class);
88    
         $self->{'code_page'} ||= 'ISO-8859-2';  
   
89          my $log = $self->_get_logger;          my $log = $self->_get_logger;
90    
91            $log->logconfess("specify low-level file format module") unless ($self->{module});
92            my $module = $self->{module};
93            $module =~ s#::#/#g;
94            $module .= '.pm';
95            $log->debug("require low-level module $self->{module} from $module");
96    
97            require $module;
98            #eval $self->{module} .'->import';
99    
100            # check if required subclasses are implemented
101            foreach my $subclass (qw/open_db fetch_rec init/) {
102                    my $n = $self->{module} . '::' . $subclass;
103                    if (! defined &{ $n }) {
104                            my $missing = "missing $subclass in $self->{module}";
105                            $self->{$subclass} = sub { $log->logwarn($missing) };
106                    } else {
107                            $self->{$subclass} = \&{ $n };
108                    }
109            }
110    
111            if ($self->{init}) {
112                    $log->debug("calling init");
113                    $self->{init}->($self, @_);
114            }
115    
116            $self->{'code_page'} ||= 'ISO-8859-2';
117    
118          # running with low_mem flag? well, use DBM::Deep then.          # running with low_mem flag? well, use DBM::Deep then.
119          if ($self->{'low_mem'}) {          if ($self->{'low_mem'}) {
120                  $log->info("running with low_mem which impacts performance (<32 Mb memory usage)");                  $log->info("running with low_mem which impacts performance (<32 Mb memory usage)");
# Line 94  sub new { Line 144  sub new {
144          $self ? return $self : return undef;          $self ? return $self : return undef;
145  }  }
146    
147    =head2 open
148    
149    This function will read whole database in memory and produce lookups.
150    
151     $input->open(
152            path => '/path/to/database/file',
153            code_page => '852',
154            limit => 500,
155            offset => 6000,
156            lookup => $lookup_obj,
157     );
158    
159    By default, C<code_page> is assumed to be C<852>.
160    
161    C<offset> is optional parametar to position at some offset before reading from database.
162    
163    C<limit> is optional parametar to read just C<limit> records from database
164    
165    Returns size of database, regardless of C<offset> and C<limit>
166    parametars, see also C<size>.
167    
168    =cut
169    
170    sub open {
171            my $self = shift;
172            my $arg = {@_};
173    
174            my $log = $self->_get_logger();
175    
176            $log->logcroak("need path") if (! $arg->{'path'});
177            my $code_page = $arg->{'code_page'} || '852';
178    
179            # store data in object
180            $self->{'input_code_page'} = $code_page;
181            foreach my $v (qw/path offset limit/) {
182                    $self->{$v} = $arg->{$v} if ($arg->{$v});
183            }
184    
185            # create Text::Iconv object
186            $self->{iconv} = Text::Iconv->new($code_page,$self->{'code_page'});
187    
188            my $filter_ref;
189    
190            if ($self->{recode}) {
191                    my @r = split(/\s/, $self->{recode});
192                    if ($#r % 2 != 1) {
193                            $log->logwarn("recode needs even number of elements (some number of valid pairs)");
194                    } else {
195                            my $recode;
196                            while (@r) {
197                                    my $from = shift @r;
198                                    my $to = shift @r;
199                                    $recode->{$from} = $to;
200                            }
201    
202                            my $regex = join '|' => keys %{ $recode };
203    
204                            $log->debug("using recode regex: $regex");
205                            
206                            $filter_ref = sub {
207                                    my $t = shift;
208                                    $t =~ s/($regex)/$recode->{$1}/g;
209                                    return $t;
210                            };
211    
212                    }
213    
214            }
215    
216            my ($db, $size) = $self->{open_db}->( $self,
217                    path => $arg->{path},
218                    filter => $filter_ref,
219            );
220    
221            unless ($db) {
222                    $log->logwarn("can't open database $arg->{path}, skipping...");
223                    return;
224            }
225    
226            unless ($size) {
227                    $log->logwarn("no records in database $arg->{path}, skipping...");
228                    return;
229            }
230    
231            my $from_rec = 1;
232            my $to_rec = $size;
233    
234            if (my $s = $self->{offset}) {
235                    $log->info("skipping to MFN $s");
236                    $from_rec = $s;
237            } else {
238                    $self->{offset} = $from_rec;
239            }
240    
241            if ($self->{limit}) {
242                    $log->debug("limiting to ",$self->{limit}," records");
243                    $to_rec = $from_rec + $self->{limit} - 1;
244                    $to_rec = $size if ($to_rec > $size);
245            }
246    
247            # store size for later
248            $self->{size} = ($to_rec - $from_rec) ? ($to_rec - $from_rec + 1) : 0;
249    
250            $log->info("processing $self->{size}/$size records [$from_rec-$to_rec] convert $code_page -> $self->{code_page}");
251    
252            # read database
253            for (my $pos = $from_rec; $pos <= $to_rec; $pos++) {
254    
255                    $log->debug("position: $pos\n");
256    
257                    my $rec = $self->{fetch_rec}->($self, $db, $pos );
258    
259                    $log->debug(sub { Dumper($rec) });
260    
261                    if (! $rec) {
262                            $log->warn("record $pos empty? skipping...");
263                            next;
264                    }
265    
266                    # store
267                    if ($self->{low_mem}) {
268                            $self->{db}->put($pos, $rec);
269                    } else {
270                            $self->{data}->{$pos} = $rec;
271                    }
272    
273                    # create lookup
274                    $self->{'lookup'}->add( $rec ) if ($rec && $self->{'lookup'});
275    
276                    $self->progress_bar($pos,$to_rec);
277    
278            }
279    
280            $self->{pos} = -1;
281            $self->{last_pcnt} = 0;
282    
283            # store max mfn and return it.
284            $self->{max_pos} = $to_rec;
285            $log->debug("max_pos: $to_rec");
286    
287            return $size;
288    }
289    
290    =head2 fetch
291    
292    Fetch next record from database. It will also displays progress bar.
293    
294     my $rec = $isis->fetch;
295    
296    Record from this function should probably go to C<data_structure> for
297    normalisation.
298    
299    =cut
300    
301    sub fetch {
302            my $self = shift;
303    
304            my $log = $self->_get_logger();
305    
306            $log->logconfess("it seems that you didn't load database!") unless ($self->{pos});
307    
308            if ($self->{pos} == -1) {
309                    $self->{pos} = $self->{offset};
310            } else {
311                    $self->{pos}++;
312            }
313    
314            my $mfn = $self->{pos};
315    
316            if ($mfn > $self->{max_pos}) {
317                    $self->{pos} = $self->{max_pos};
318                    $log->debug("at EOF");
319                    return;
320            }
321    
322            $self->progress_bar($mfn,$self->{max_pos});
323    
324            my $rec;
325    
326            if ($self->{low_mem}) {
327                    $rec = $self->{db}->get($mfn);
328            } else {
329                    $rec = $self->{data}->{$mfn};
330            }
331    
332            $rec ||= 0E0;
333    }
334    
335    =head2 pos
336    
337    Returns current record number (MFN).
338    
339     print $isis->pos;
340    
341    First record in database has position 1.
342    
343    =cut
344    
345    sub pos {
346            my $self = shift;
347            return $self->{pos};
348    }
349    
350    
351    =head2 size
352    
353    Returns number of records in database
354    
355     print $isis->size;
356    
357    Result from this function can be used to loop through all records
358    
359     foreach my $mfn ( 1 ... $isis->size ) { ... }
360    
361    because it takes into account C<offset> and C<limit>.
362    
363    =cut
364    
365    sub size {
366            my $self = shift;
367            return $self->{size};
368    }
369    
370    =head2 seek
371    
372    Seek to specified MFN in file.
373    
374     $isis->seek(42);
375    
376    First record in database has position 1.
377    
378    =cut
379    
380    sub seek {
381            my $self = shift;
382            my $pos = shift || return;
383    
384            my $log = $self->_get_logger();
385    
386            if ($pos < 1) {
387                    $log->warn("seek before first record");
388                    $pos = 1;
389            } elsif ($pos > $self->{max_pos}) {
390                    $log->warn("seek beyond last record");
391                    $pos = $self->{max_pos};
392            }
393    
394            return $self->{pos} = (($pos - 1) || -1);
395    }
396    
397    
398  =head1 MEMORY USAGE  =head1 MEMORY USAGE
399    
400  C<low_mem> options is double-edged sword. If enabled, WebPAC  C<low_mem> options is double-edged sword. If enabled, WebPAC

Legend:
Removed from v.11  
changed lines
  Added in v.416

  ViewVC Help
Powered by ViewVC 1.1.26