/[webpac2]/trunk/lib/WebPAC/Input.pm
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Diff of /trunk/lib/WebPAC/Input.pm

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 1 by dpavlin, Sat Jun 25 20:23:23 2005 UTC revision 307 by dpavlin, Tue Dec 20 00:03:04 2005 UTC
# Line 3  package WebPAC::Input; Line 3  package WebPAC::Input;
3  use warnings;  use warnings;
4  use strict;  use strict;
5    
6    use WebPAC::Common;
7    use base qw/WebPAC::Common/;
8    use Text::Iconv;
9    
10  =head1 NAME  =head1 NAME
11    
12  WebPAC::Input - The great new WebPAC::Input!  WebPAC::Input - read different file formats into WebPAC
13    
14  =head1 VERSION  =head1 VERSION
15    
16  Version 0.01  Version 0.03
17    
18  =cut  =cut
19    
20  our $VERSION = '0.01';  our $VERSION = '0.03';
21    
22  =head1 SYNOPSIS  =head1 SYNOPSIS
23    
24  Quick summary of what the module does.  This module implements input as database which have fixed and known
25    I<size> while indexing and single unique numeric identifier for database
26    position ranging from 1 to I<size>.
27    
28    Simply, something that is indexed by unmber from 1 .. I<size>.
29    
30    Examples of such databases are CDS/ISIS files, MARC files, lines in
31    text file, and so on.
32    
33    Specific file formats are implemented using low-level interface modules,
34    located in C<WebPAC::Input::*> namespace which export C<open_db>,
35    C<fetch_rec> and optional C<init> functions.
36    
37  Perhaps a little code snippet.  Perhaps a little code snippet.
38    
39      use WebPAC::Input;      use WebPAC::Input;
40    
41      my $foo = WebPAC::Input->new();      my $db = WebPAC::Input->new(
42      ...          module => 'WebPAC::Input::ISIS',
43                    config => $config,
44                    lookup => $lookup_obj,
45                    low_mem => 1,
46        );
47    
48        $db->open('/path/to/database');
49        print "database size: ",$db->size,"\n";
50        while (my $rec = $db->fetch) {
51        }
52    
 =head1 EXPORT  
53    
 A list of functions that can be exported.  You can delete this section  
 if you don't export anything, such as for a purely object-oriented module.  
54    
55  =head1 FUNCTIONS  =head1 FUNCTIONS
56    
57  =head2 function1  =head2 new
58    
59    Create new input database object.
60    
61      my $db = new WebPAC::Input(
62            module => 'WebPAC::Input::MARC',
63            code_page => 'ISO-8859-2',
64            low_mem => 1,
65      );
66    
67    C<module> is low-level file format module. See L<WebPAC::Input::Isis> and
68    L<WebPAC::Input::MARC>.
69    
70    Optional parametar C<code_page> specify application code page (which will be
71    used internally). This should probably be your terminal encoding, and by
72    default, it C<ISO-8859-2>.
73    
74    Default is not to use C<low_mem> options (see L<MEMORY USAGE> below).
75    
76    This function will also call low-level C<init> if it exists with same
77    parametars.
78    
79  =cut  =cut
80    
81  sub function1 {  sub new {
82            my $class = shift;
83            my $self = {@_};
84            bless($self, $class);
85    
86            my $log = $self->_get_logger;
87    
88            $log->logconfess("specify low-level file format module") unless ($self->{module});
89            my $module = $self->{module};
90            $module =~ s#::#/#g;
91            $module .= '.pm';
92            $log->debug("require low-level module $self->{module} from $module");
93    
94            require $module;
95            #eval $self->{module} .'->import';
96    
97            # check if required subclasses are implemented
98            foreach my $subclass (qw/open_db fetch_rec init/) {
99                    my $n = $self->{module} . '::' . $subclass;
100                    if (! defined &{ $n }) {
101                            my $missing = "missing $subclass in $self->{module}";
102                            $self->{$subclass} = sub { $log->logwarn($missing) };
103                    } else {
104                            $self->{$subclass} = \&{ $n };
105                    }
106            }
107    
108            if ($self->{init}) {
109                    $log->debug("calling init");
110                    $self->{init}->($self, @_);
111            }
112    
113            $self->{'code_page'} ||= 'ISO-8859-2';
114    
115            # running with low_mem flag? well, use DBM::Deep then.
116            if ($self->{'low_mem'}) {
117                    $log->info("running with low_mem which impacts performance (<32 Mb memory usage)");
118    
119                    my $db_file = "data.db";
120    
121                    if (-e $db_file) {
122                            unlink $db_file or $log->logdie("can't remove '$db_file' from last run");
123                            $log->debug("removed '$db_file' from last run");
124                    }
125    
126                    require DBM::Deep;
127    
128                    my $db = new DBM::Deep $db_file;
129    
130                    $log->logdie("DBM::Deep error: $!") unless ($db);
131    
132                    if ($db->error()) {
133                            $log->logdie("can't open '$db_file' under low_mem: ",$db->error());
134                    } else {
135                            $log->debug("using file '$db_file' for DBM::Deep");
136                    }
137    
138                    $self->{'db'} = $db;
139            }
140    
141            $self ? return $self : return undef;
142  }  }
143    
144  =head2 function2  =head2 open
145    
146    This function will read whole database in memory and produce lookups.
147    
148     $input->open(
149            path => '/path/to/database/file',
150            code_page => '852',
151            limit => 500,
152            offset => 6000,
153            lookup => $lookup_obj,
154     );
155    
156    By default, C<code_page> is assumed to be C<852>.
157    
158    C<offset> is optional parametar to position at some offset before reading from database.
159    
160    C<limit> is optional parametar to read just C<limit> records from database
161    
162    Returns size of database, regardless of C<offset> and C<limit>
163    parametars, see also C<size>.
164    
165  =cut  =cut
166    
167  sub function2 {  sub open {
168            my $self = shift;
169            my $arg = {@_};
170    
171            my $log = $self->_get_logger();
172    
173            $log->logcroak("need path") if (! $arg->{'path'});
174            my $code_page = $arg->{'code_page'} || '852';
175    
176            # store data in object
177            $self->{'input_code_page'} = $code_page;
178            foreach my $v (qw/path offset limit/) {
179                    $self->{$v} = $arg->{$v} if ($arg->{$v});
180            }
181    
182            # create Text::Iconv object
183            $self->{iconv} = Text::Iconv->new($code_page,$self->{'code_page'});
184    
185            my ($db, $size) = $self->{open_db}->( $self,
186                    path => $arg->{path},
187            );
188    
189            unless ($db) {
190                    $log->logwarn("can't open database $arg->{path}, skipping...");
191                    return;
192            }
193    
194            unless ($size) {
195                    $log->logwarn("no records in database $arg->{path}, skipping...");
196                    return;
197            }
198    
199            my $offset = 1;
200            my $limit = $size;
201    
202            if (my $s = $self->{offset}) {
203                    $log->info("skipping to MFN $s");
204                    $offset = $s;
205            } else {
206                    $self->{offset} = $offset;
207            }
208    
209            if ($self->{limit}) {
210                    $log->debug("limiting to ",$self->{limit}," records");
211                    $limit = $offset + $self->{limit} - 1;
212                    $limit = $size if ($limit > $size);
213            }
214    
215            # store size for later
216            $self->{size} = ($limit - $offset) ? ($limit - $offset + 1) : 0;
217    
218            $log->info("processing $self->{size} records in $code_page, convert to $self->{code_page}");
219    
220            # read database
221            for (my $pos = $offset; $pos <= $limit; $pos++) {
222    
223                    $log->debug("position: $pos\n");
224    
225                    my $rec = $self->{fetch_rec}->($self, $db, $pos );
226    
227                    if (! $rec) {
228                            $log->warn("record $pos empty? skipping...");
229                            next;
230                    }
231    
232                    # store
233                    if ($self->{low_mem}) {
234                            $self->{db}->put($pos, $rec);
235                    } else {
236                            $self->{data}->{$pos} = $rec;
237                    }
238    
239                    # create lookup
240                    $self->{'lookup'}->add( $rec ) if ($rec && $self->{'lookup'});
241    
242                    $self->progress_bar($pos,$limit);
243    
244            }
245    
246            $self->{pos} = -1;
247            $self->{last_pcnt} = 0;
248    
249            # store max mfn and return it.
250            $self->{max_pos} = $limit;
251            $log->debug("max_pos: $limit");
252    
253            return $size;
254  }  }
255    
256  =head1 AUTHOR  =head2 fetch
257    
258  Dobrica Pavlinusic, C<< <dpavlin@rot13.org> >>  Fetch next record from database. It will also displays progress bar.
259    
260     my $rec = $isis->fetch;
261    
262    Record from this function should probably go to C<data_structure> for
263    normalisation.
264    
265    =cut
266    
267  =head1 BUGS  sub fetch {
268            my $self = shift;
269    
270  Please report any bugs or feature requests to          my $log = $self->_get_logger();
 C<bug-webpac-input@rt.cpan.org>, or through the web interface at  
 L<http://rt.cpan.org/NoAuth/ReportBug.html?Queue=WebPAC>.  
 I will be notified, and then you'll automatically be notified of progress on  
 your bug as I make changes.  
271    
272  =head1 ACKNOWLEDGEMENTS          $log->logconfess("it seems that you didn't load database!") unless ($self->{pos});
273    
274            if ($self->{pos} == -1) {
275                    $self->{pos} = $self->{offset};
276            } else {
277                    $self->{pos}++;
278            }
279    
280            my $mfn = $self->{pos};
281    
282            if ($mfn > $self->{max_pos}) {
283                    $self->{pos} = $self->{max_pos};
284                    $log->debug("at EOF");
285                    return;
286            }
287    
288            $self->progress_bar($mfn,$self->{max_pos});
289    
290            my $rec;
291    
292            if ($self->{low_mem}) {
293                    $rec = $self->{db}->get($mfn);
294            } else {
295                    $rec = $self->{data}->{$mfn};
296            }
297    
298            $rec ||= 0E0;
299    }
300    
301    =head2 pos
302    
303    Returns current record number (MFN).
304    
305     print $isis->pos;
306    
307    First record in database has position 1.
308    
309    =cut
310    
311    sub pos {
312            my $self = shift;
313            return $self->{pos};
314    }
315    
316    
317    =head2 size
318    
319    Returns number of records in database
320    
321     print $isis->size;
322    
323    Result from this function can be used to loop through all records
324    
325     foreach my $mfn ( 1 ... $isis->size ) { ... }
326    
327    because it takes into account C<offset> and C<limit>.
328    
329    =cut
330    
331    sub size {
332            my $self = shift;
333            return $self->{size};
334    }
335    
336    =head2 seek
337    
338    Seek to specified MFN in file.
339    
340     $isis->seek(42);
341    
342    First record in database has position 1.
343    
344    =cut
345    
346    sub seek {
347            my $self = shift;
348            my $pos = shift || return;
349    
350            my $log = $self->_get_logger();
351    
352            if ($pos < 1) {
353                    $log->warn("seek before first record");
354                    $pos = 1;
355            } elsif ($pos > $self->{max_pos}) {
356                    $log->warn("seek beyond last record");
357                    $pos = $self->{max_pos};
358            }
359    
360            return $self->{pos} = (($pos - 1) || -1);
361    }
362    
363    
364    =head1 MEMORY USAGE
365    
366    C<low_mem> options is double-edged sword. If enabled, WebPAC
367    will run on memory constraint machines (which doesn't have enough
368    physical RAM to create memory structure for whole source database).
369    
370    If your machine has 512Mb or more of RAM and database is around 10000 records,
371    memory shouldn't be an issue. If you don't have enough physical RAM, you
372    might consider using virtual memory (if your operating system is handling it
373    well, like on FreeBSD or Linux) instead of dropping to L<DBM::Deep> to handle
374    parsed structure of ISIS database (this is what C<low_mem> option does).
375    
376    Hitting swap at end of reading source database is probably o.k. However,
377    hitting swap before 90% will dramatically decrease performance and you will
378    be better off with C<low_mem> and using rest of availble memory for
379    operating system disk cache (Linux is particuallary good about this).
380    However, every access to database record will require disk access, so
381    generation phase will be slower 10-100 times.
382    
383    Parsed structures are essential - you just have option to trade RAM memory
384    (which is fast) for disk space (which is slow). Be sure to have planty of
385    disk space if you are using C<low_mem> and thus L<DBM::Deep>.
386    
387    However, when WebPAC is running on desktop machines (or laptops :-), it's
388    highly undesireable for system to start swapping. Using C<low_mem> option can
389    reduce WecPAC memory usage to around 64Mb for same database with lookup
390    fields and sorted indexes which stay in RAM. Performance will suffer, but
391    memory usage will really be minimal. It might be also more confortable to
392    run WebPAC reniced on those machines.
393    
394    
395    =head1 AUTHOR
396    
397    Dobrica Pavlinusic, C<< <dpavlin@rot13.org> >>
398    
399  =head1 COPYRIGHT & LICENSE  =head1 COPYRIGHT & LICENSE
400    

Legend:
Removed from v.1  
changed lines
  Added in v.307

  ViewVC Help
Powered by ViewVC 1.1.26