/[webpac2]/trunk/lib/WebPAC/Input.pm
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Diff of /trunk/lib/WebPAC/Input.pm

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 1 by dpavlin, Sat Jun 25 20:23:23 2005 UTC revision 287 by dpavlin, Sun Dec 18 21:06:51 2005 UTC
# Line 3  package WebPAC::Input; Line 3  package WebPAC::Input;
3  use warnings;  use warnings;
4  use strict;  use strict;
5    
6    use blib;
7    
8    use WebPAC::Common;
9    use base qw/WebPAC::Common/;
10    use Text::Iconv;
11    
12  =head1 NAME  =head1 NAME
13    
14  WebPAC::Input - The great new WebPAC::Input!  WebPAC::Input - read different file formats into WebPAC
15    
16  =head1 VERSION  =head1 VERSION
17    
18  Version 0.01  Version 0.03
19    
20  =cut  =cut
21    
22  our $VERSION = '0.01';  our $VERSION = '0.03';
23    
24  =head1 SYNOPSIS  =head1 SYNOPSIS
25    
26  Quick summary of what the module does.  This module implements input as database which have fixed and known
27    I<size> while indexing and single unique numeric identifier for database
28    position ranging from 1 to I<size>.
29    
30    Simply, something that is indexed by unmber from 1 .. I<size>.
31    
32    Examples of such databases are CDS/ISIS files, MARC files, lines in
33    text file, and so on.
34    
35    Specific file formats are implemented using low-level interface modules,
36    located in C<WebPAC::Input::*> namespace which export C<open_db>,
37    C<fetch_rec> and optional C<init> functions.
38    
39  Perhaps a little code snippet.  Perhaps a little code snippet.
40    
41      use WebPAC::Input;      use WebPAC::Input;
42    
43      my $foo = WebPAC::Input->new();      my $db = WebPAC::Input->new(
44      ...          module => 'WebPAC::Input::ISIS',
45                    config => $config,
46                    lookup => $lookup_obj,
47                    low_mem => 1,
48        );
49    
50        $db->open('/path/to/database');
51        print "database size: ",$db->size,"\n";
52        while (my $rec = $db->fetch) {
53        }
54    
 =head1 EXPORT  
55    
 A list of functions that can be exported.  You can delete this section  
 if you don't export anything, such as for a purely object-oriented module.  
56    
57  =head1 FUNCTIONS  =head1 FUNCTIONS
58    
59  =head2 function1  =head2 new
60    
61    Create new input database object.
62    
63      my $db = new WebPAC::Input(
64            module => 'WebPAC::Input::MARC',
65            code_page => 'ISO-8859-2',
66            low_mem => 1,
67      );
68    
69    C<module> is low-level file format module. See L<WebPAC::Input::Isis> and
70    L<WebPAC::Input::MARC>.
71    
72    Optional parametar C<code_page> specify application code page (which will be
73    used internally). This should probably be your terminal encoding, and by
74    default, it C<ISO-8859-2>.
75    
76    Default is not to use C<low_mem> options (see L<MEMORY USAGE> below).
77    
78    This function will also call low-level C<init> if it exists with same
79    parametars.
80    
81  =cut  =cut
82    
83  sub function1 {  sub new {
84            my $class = shift;
85            my $self = {@_};
86            bless($self, $class);
87    
88            my $log = $self->_get_logger;
89    
90            $log->logconfess("specify low-level file format module") unless ($self->{module});
91            my $module = $self->{module};
92            $module =~ s#::#/#g;
93            $module .= '.pm';
94            $log->debug("require low-level module $self->{module} from $module");
95            require $module;
96            eval $self->{module} .'->import';
97    
98            # check if required subclasses are implemented
99            foreach my $subclass (qw/open_db fetch_rec/) {
100                    if ( $self->can($subclass) ) {
101                            $log->debug("imported $subclass");
102                    } else {
103                            $log->warn("missing $subclass in $self->{module}");
104                    }
105            }
106    
107            if ($self->can('init')) {
108                    $log->debug("calling init");
109                    $self->init(@_);
110            }
111    
112            $self->{'code_page'} ||= 'ISO-8859-2';
113    
114            # running with low_mem flag? well, use DBM::Deep then.
115            if ($self->{'low_mem'}) {
116                    $log->info("running with low_mem which impacts performance (<32 Mb memory usage)");
117    
118                    my $db_file = "data.db";
119    
120                    if (-e $db_file) {
121                            unlink $db_file or $log->logdie("can't remove '$db_file' from last run");
122                            $log->debug("removed '$db_file' from last run");
123                    }
124    
125                    require DBM::Deep;
126    
127                    my $db = new DBM::Deep $db_file;
128    
129                    $log->logdie("DBM::Deep error: $!") unless ($db);
130    
131                    if ($db->error()) {
132                            $log->logdie("can't open '$db_file' under low_mem: ",$db->error());
133                    } else {
134                            $log->debug("using file '$db_file' for DBM::Deep");
135                    }
136    
137                    $self->{'db'} = $db;
138            }
139    
140            $self ? return $self : return undef;
141  }  }
142    
143  =head2 function2  =head2 open
144    
145    This function will read whole database in memory and produce lookups.
146    
147     $input->open(
148            path => '/path/to/database/file',
149            code_page => '852',
150            limit => 500,
151            offset => 6000,
152            lookup => $lookup_obj,
153     );
154    
155    By default, C<code_page> is assumed to be C<852>.
156    
157    C<offset> is optional parametar to position at some offset before reading from database.
158    
159    C<limit> is optional parametar to read just C<limit> records from database
160    
161    Returns size of database, regardless of C<offset> and C<limit>
162    parametars, see also C<size>.
163    
164  =cut  =cut
165    
166  sub function2 {  sub open {
167            my $self = shift;
168            my $arg = {@_};
169    
170            my $log = $self->_get_logger();
171    
172            $log->logcroak("need path") if (! $arg->{'path'});
173            my $code_page = $arg->{'code_page'} || '852';
174    
175            # store data in object
176            $self->{'code_page'} = $code_page;
177            foreach my $v (qw/path offset limit/) {
178                    $self->{$v} = $arg->{$v} if ($arg->{$v});
179            }
180    
181            # create Text::Iconv object
182            $self->{iconv} = Text::Iconv->new($code_page,$self->{'code_page'});
183    
184            my ($db, $size) = $self->open_db(
185                    path => $arg->{path},
186            );
187    
188            unless ($db) {
189                    $log->logwarn("can't open database $arg->{path}, skipping...");
190                    return;
191            }
192    
193            unless ($size) {
194                    $log->logwarn("no records in database $arg->{path}, skipping...");
195                    return;
196            }
197    
198            my $offset = 1;
199            my $limit = $size;
200    
201            if (my $s = $self->{offset}) {
202                    $log->info("skipping to MFN $s");
203                    $offset = $s;
204            } else {
205                    $self->{offset} = $offset;
206            }
207    
208            if ($self->{limit}) {
209                    $log->info("limiting to ",$self->{limit}," records");
210                    $limit = $offset + $self->{limit} - 1;
211                    $limit = $size if ($limit > $size);
212            }
213    
214            # store size for later
215            $self->{size} = ($limit - $offset) ? ($limit - $offset + 1) : 0;
216    
217            $log->info("processing $self->{size} records in $code_page, convert to $self->{code_page}");
218    
219            # read database
220            for (my $pos = $offset; $pos <= $limit; $pos++) {
221    
222                    $log->debug("position: $pos\n");
223    
224                    my $rec = $self->fetch_rec( $db, $pos );
225    
226                    if (! $rec) {
227                            $log->warn("record $pos empty? skipping...");
228                            next;
229                    }
230    
231                    # store
232                    if ($self->{low_mem}) {
233                            $self->{db}->put($pos, $rec);
234                    } else {
235                            $self->{data}->{$pos} = $rec;
236                    }
237    
238                    # create lookup
239                    $self->{'lookup'}->add( $rec ) if ($rec && $self->{'lookup'});
240    
241                    $self->progress_bar($pos,$limit);
242    
243            }
244    
245            $self->{pos} = -1;
246            $self->{last_pcnt} = 0;
247    
248            # store max mfn and return it.
249            $self->{max_pos} = $limit;
250            $log->debug("max_pos: $limit");
251    
252            return $size;
253  }  }
254    
255  =head1 AUTHOR  =head2 fetch
256    
257  Dobrica Pavlinusic, C<< <dpavlin@rot13.org> >>  Fetch next record from database. It will also displays progress bar.
258    
259     my $rec = $isis->fetch;
260    
261    Record from this function should probably go to C<data_structure> for
262    normalisation.
263    
264    =cut
265    
266  =head1 BUGS  sub fetch {
267            my $self = shift;
268    
269  Please report any bugs or feature requests to          my $log = $self->_get_logger();
 C<bug-webpac-input@rt.cpan.org>, or through the web interface at  
 L<http://rt.cpan.org/NoAuth/ReportBug.html?Queue=WebPAC>.  
 I will be notified, and then you'll automatically be notified of progress on  
 your bug as I make changes.  
270    
271  =head1 ACKNOWLEDGEMENTS          $log->logconfess("it seems that you didn't load database!") unless ($self->{pos});
272    
273            if ($self->{pos} == -1) {
274                    $self->{pos} = $self->{offset};
275            } else {
276                    $self->{pos}++;
277            }
278    
279            my $mfn = $self->{pos};
280    
281            if ($mfn > $self->{max_pos}) {
282                    $self->{pos} = $self->{max_pos};
283                    $log->debug("at EOF");
284                    return;
285            }
286    
287            $self->progress_bar($mfn,$self->{max_pos});
288    
289            my $rec;
290    
291            if ($self->{low_mem}) {
292                    $rec = $self->{db}->get($mfn);
293            } else {
294                    $rec = $self->{data}->{$mfn};
295            }
296    
297            $rec ||= 0E0;
298    }
299    
300    =head2 pos
301    
302    Returns current record number (MFN).
303    
304     print $isis->pos;
305    
306    First record in database has position 1.
307    
308    =cut
309    
310    sub pos {
311            my $self = shift;
312            return $self->{pos};
313    }
314    
315    
316    =head2 size
317    
318    Returns number of records in database
319    
320     print $isis->size;
321    
322    Result from this function can be used to loop through all records
323    
324     foreach my $mfn ( 1 ... $isis->size ) { ... }
325    
326    because it takes into account C<offset> and C<limit>.
327    
328    =cut
329    
330    sub size {
331            my $self = shift;
332            return $self->{size};
333    }
334    
335    =head2 seek
336    
337    Seek to specified MFN in file.
338    
339     $isis->seek(42);
340    
341    First record in database has position 1.
342    
343    =cut
344    
345    sub seek {
346            my $self = shift;
347            my $pos = shift || return;
348    
349            my $log = $self->_get_logger();
350    
351            if ($pos < 1) {
352                    $log->warn("seek before first record");
353                    $pos = 1;
354            } elsif ($pos > $self->{max_pos}) {
355                    $log->warn("seek beyond last record");
356                    $pos = $self->{max_pos};
357            }
358    
359            return $self->{pos} = (($pos - 1) || -1);
360    }
361    
362    
363    =head1 MEMORY USAGE
364    
365    C<low_mem> options is double-edged sword. If enabled, WebPAC
366    will run on memory constraint machines (which doesn't have enough
367    physical RAM to create memory structure for whole source database).
368    
369    If your machine has 512Mb or more of RAM and database is around 10000 records,
370    memory shouldn't be an issue. If you don't have enough physical RAM, you
371    might consider using virtual memory (if your operating system is handling it
372    well, like on FreeBSD or Linux) instead of dropping to L<DBM::Deep> to handle
373    parsed structure of ISIS database (this is what C<low_mem> option does).
374    
375    Hitting swap at end of reading source database is probably o.k. However,
376    hitting swap before 90% will dramatically decrease performance and you will
377    be better off with C<low_mem> and using rest of availble memory for
378    operating system disk cache (Linux is particuallary good about this).
379    However, every access to database record will require disk access, so
380    generation phase will be slower 10-100 times.
381    
382    Parsed structures are essential - you just have option to trade RAM memory
383    (which is fast) for disk space (which is slow). Be sure to have planty of
384    disk space if you are using C<low_mem> and thus L<DBM::Deep>.
385    
386    However, when WebPAC is running on desktop machines (or laptops :-), it's
387    highly undesireable for system to start swapping. Using C<low_mem> option can
388    reduce WecPAC memory usage to around 64Mb for same database with lookup
389    fields and sorted indexes which stay in RAM. Performance will suffer, but
390    memory usage will really be minimal. It might be also more confortable to
391    run WebPAC reniced on those machines.
392    
393    
394    =head1 AUTHOR
395    
396    Dobrica Pavlinusic, C<< <dpavlin@rot13.org> >>
397    
398  =head1 COPYRIGHT & LICENSE  =head1 COPYRIGHT & LICENSE
399    

Legend:
Removed from v.1  
changed lines
  Added in v.287

  ViewVC Help
Powered by ViewVC 1.1.26