/[webpac2]/trunk/lib/WebPAC/Input.pm
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Diff of /trunk/lib/WebPAC/Input.pm

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 1 by dpavlin, Sat Jun 25 20:23:23 2005 UTC revision 339 by dpavlin, Sat Dec 31 16:50:11 2005 UTC
# Line 3  package WebPAC::Input; Line 3  package WebPAC::Input;
3  use warnings;  use warnings;
4  use strict;  use strict;
5    
6    use WebPAC::Common;
7    use base qw/WebPAC::Common/;
8    use Text::Iconv;
9    use Data::Dumper;
10    
11  =head1 NAME  =head1 NAME
12    
13  WebPAC::Input - The great new WebPAC::Input!  WebPAC::Input - read different file formats into WebPAC
14    
15  =head1 VERSION  =head1 VERSION
16    
17  Version 0.01  Version 0.03
18    
19  =cut  =cut
20    
21  our $VERSION = '0.01';  our $VERSION = '0.03';
22    
23  =head1 SYNOPSIS  =head1 SYNOPSIS
24    
25  Quick summary of what the module does.  This module implements input as database which have fixed and known
26    I<size> while indexing and single unique numeric identifier for database
27    position ranging from 1 to I<size>.
28    
29    Simply, something that is indexed by unmber from 1 .. I<size>.
30    
31    Examples of such databases are CDS/ISIS files, MARC files, lines in
32    text file, and so on.
33    
34    Specific file formats are implemented using low-level interface modules,
35    located in C<WebPAC::Input::*> namespace which export C<open_db>,
36    C<fetch_rec> and optional C<init> functions.
37    
38  Perhaps a little code snippet.  Perhaps a little code snippet.
39    
40      use WebPAC::Input;      use WebPAC::Input;
41    
42      my $foo = WebPAC::Input->new();      my $db = WebPAC::Input->new(
43      ...          module => 'WebPAC::Input::ISIS',
44                    config => $config,
45                    lookup => $lookup_obj,
46                    low_mem => 1,
47        );
48    
49        $db->open('/path/to/database');
50        print "database size: ",$db->size,"\n";
51        while (my $rec = $db->fetch) {
52        }
53    
 =head1 EXPORT  
54    
 A list of functions that can be exported.  You can delete this section  
 if you don't export anything, such as for a purely object-oriented module.  
55    
56  =head1 FUNCTIONS  =head1 FUNCTIONS
57    
58  =head2 function1  =head2 new
59    
60    Create new input database object.
61    
62      my $db = new WebPAC::Input(
63            module => 'WebPAC::Input::MARC',
64            code_page => 'ISO-8859-2',
65            low_mem => 1,
66      );
67    
68    C<module> is low-level file format module. See L<WebPAC::Input::Isis> and
69    L<WebPAC::Input::MARC>.
70    
71    Optional parametar C<code_page> specify application code page (which will be
72    used internally). This should probably be your terminal encoding, and by
73    default, it C<ISO-8859-2>.
74    
75    Default is not to use C<low_mem> options (see L<MEMORY USAGE> below).
76    
77    This function will also call low-level C<init> if it exists with same
78    parametars.
79    
80  =cut  =cut
81    
82  sub function1 {  sub new {
83            my $class = shift;
84            my $self = {@_};
85            bless($self, $class);
86    
87            my $log = $self->_get_logger;
88    
89            $log->logconfess("specify low-level file format module") unless ($self->{module});
90            my $module = $self->{module};
91            $module =~ s#::#/#g;
92            $module .= '.pm';
93            $log->debug("require low-level module $self->{module} from $module");
94    
95            require $module;
96            #eval $self->{module} .'->import';
97    
98            # check if required subclasses are implemented
99            foreach my $subclass (qw/open_db fetch_rec init/) {
100                    my $n = $self->{module} . '::' . $subclass;
101                    if (! defined &{ $n }) {
102                            my $missing = "missing $subclass in $self->{module}";
103                            $self->{$subclass} = sub { $log->logwarn($missing) };
104                    } else {
105                            $self->{$subclass} = \&{ $n };
106                    }
107            }
108    
109            if ($self->{init}) {
110                    $log->debug("calling init");
111                    $self->{init}->($self, @_);
112            }
113    
114            $self->{'code_page'} ||= 'ISO-8859-2';
115    
116            # running with low_mem flag? well, use DBM::Deep then.
117            if ($self->{'low_mem'}) {
118                    $log->info("running with low_mem which impacts performance (<32 Mb memory usage)");
119    
120                    my $db_file = "data.db";
121    
122                    if (-e $db_file) {
123                            unlink $db_file or $log->logdie("can't remove '$db_file' from last run");
124                            $log->debug("removed '$db_file' from last run");
125                    }
126    
127                    require DBM::Deep;
128    
129                    my $db = new DBM::Deep $db_file;
130    
131                    $log->logdie("DBM::Deep error: $!") unless ($db);
132    
133                    if ($db->error()) {
134                            $log->logdie("can't open '$db_file' under low_mem: ",$db->error());
135                    } else {
136                            $log->debug("using file '$db_file' for DBM::Deep");
137                    }
138    
139                    $self->{'db'} = $db;
140            }
141    
142            $self ? return $self : return undef;
143  }  }
144    
145  =head2 function2  =head2 open
146    
147    This function will read whole database in memory and produce lookups.
148    
149     $input->open(
150            path => '/path/to/database/file',
151            code_page => '852',
152            limit => 500,
153            offset => 6000,
154            lookup => $lookup_obj,
155     );
156    
157    By default, C<code_page> is assumed to be C<852>.
158    
159    C<offset> is optional parametar to position at some offset before reading from database.
160    
161    C<limit> is optional parametar to read just C<limit> records from database
162    
163    Returns size of database, regardless of C<offset> and C<limit>
164    parametars, see also C<size>.
165    
166  =cut  =cut
167    
168  sub function2 {  sub open {
169            my $self = shift;
170            my $arg = {@_};
171    
172            my $log = $self->_get_logger();
173    
174            $log->logcroak("need path") if (! $arg->{'path'});
175            my $code_page = $arg->{'code_page'} || '852';
176    
177            # store data in object
178            $self->{'input_code_page'} = $code_page;
179            foreach my $v (qw/path offset limit/) {
180                    $self->{$v} = $arg->{$v} if ($arg->{$v});
181            }
182    
183            # create Text::Iconv object
184            $self->{iconv} = Text::Iconv->new($code_page,$self->{'code_page'});
185    
186            my ($db, $size) = $self->{open_db}->( $self,
187                    path => $arg->{path},
188            );
189    
190            unless ($db) {
191                    $log->logwarn("can't open database $arg->{path}, skipping...");
192                    return;
193            }
194    
195            unless ($size) {
196                    $log->logwarn("no records in database $arg->{path}, skipping...");
197                    return;
198            }
199    
200            my $from_rec = 1;
201            my $to_rec = $size;
202    
203            if (my $s = $self->{offset}) {
204                    $log->info("skipping to MFN $s");
205                    $from_rec = $s;
206            } else {
207                    $self->{offset} = $from_rec;
208            }
209    
210            if ($self->{limit}) {
211                    $log->debug("limiting to ",$self->{limit}," records");
212                    $to_rec = $from_rec + $self->{limit} - 1;
213                    $to_rec = $size if ($to_rec > $size);
214            }
215    
216            # store size for later
217            $self->{size} = ($to_rec - $from_rec) ? ($to_rec - $from_rec + 1) : 0;
218    
219            $log->info("processing $self->{size}/$size records [$from_rec-$to_rec] convert $code_page -> $self->{code_page}");
220    
221            # read database
222            for (my $pos = $from_rec; $pos <= $to_rec; $pos++) {
223    
224                    $log->debug("position: $pos\n");
225    
226                    my $rec = $self->{fetch_rec}->($self, $db, $pos );
227    
228                    $log->debug(sub { Dumper($rec) });
229    
230                    if (! $rec) {
231                            $log->warn("record $pos empty? skipping...");
232                            next;
233                    }
234    
235                    # store
236                    if ($self->{low_mem}) {
237                            $self->{db}->put($pos, $rec);
238                    } else {
239                            $self->{data}->{$pos} = $rec;
240                    }
241    
242                    # create lookup
243                    $self->{'lookup'}->add( $rec ) if ($rec && $self->{'lookup'});
244    
245                    $self->progress_bar($pos,$to_rec);
246    
247            }
248    
249            $self->{pos} = -1;
250            $self->{last_pcnt} = 0;
251    
252            # store max mfn and return it.
253            $self->{max_pos} = $to_rec;
254            $log->debug("max_pos: $to_rec");
255    
256            return $size;
257  }  }
258    
259  =head1 AUTHOR  =head2 fetch
260    
261  Dobrica Pavlinusic, C<< <dpavlin@rot13.org> >>  Fetch next record from database. It will also displays progress bar.
262    
263     my $rec = $isis->fetch;
264    
265    Record from this function should probably go to C<data_structure> for
266    normalisation.
267    
268    =cut
269    
270  =head1 BUGS  sub fetch {
271            my $self = shift;
272    
273  Please report any bugs or feature requests to          my $log = $self->_get_logger();
 C<bug-webpac-input@rt.cpan.org>, or through the web interface at  
 L<http://rt.cpan.org/NoAuth/ReportBug.html?Queue=WebPAC>.  
 I will be notified, and then you'll automatically be notified of progress on  
 your bug as I make changes.  
274    
275  =head1 ACKNOWLEDGEMENTS          $log->logconfess("it seems that you didn't load database!") unless ($self->{pos});
276    
277            if ($self->{pos} == -1) {
278                    $self->{pos} = $self->{offset};
279            } else {
280                    $self->{pos}++;
281            }
282    
283            my $mfn = $self->{pos};
284    
285            if ($mfn > $self->{max_pos}) {
286                    $self->{pos} = $self->{max_pos};
287                    $log->debug("at EOF");
288                    return;
289            }
290    
291            $self->progress_bar($mfn,$self->{max_pos});
292    
293            my $rec;
294    
295            if ($self->{low_mem}) {
296                    $rec = $self->{db}->get($mfn);
297            } else {
298                    $rec = $self->{data}->{$mfn};
299            }
300    
301            $rec ||= 0E0;
302    }
303    
304    =head2 pos
305    
306    Returns current record number (MFN).
307    
308     print $isis->pos;
309    
310    First record in database has position 1.
311    
312    =cut
313    
314    sub pos {
315            my $self = shift;
316            return $self->{pos};
317    }
318    
319    
320    =head2 size
321    
322    Returns number of records in database
323    
324     print $isis->size;
325    
326    Result from this function can be used to loop through all records
327    
328     foreach my $mfn ( 1 ... $isis->size ) { ... }
329    
330    because it takes into account C<offset> and C<limit>.
331    
332    =cut
333    
334    sub size {
335            my $self = shift;
336            return $self->{size};
337    }
338    
339    =head2 seek
340    
341    Seek to specified MFN in file.
342    
343     $isis->seek(42);
344    
345    First record in database has position 1.
346    
347    =cut
348    
349    sub seek {
350            my $self = shift;
351            my $pos = shift || return;
352    
353            my $log = $self->_get_logger();
354    
355            if ($pos < 1) {
356                    $log->warn("seek before first record");
357                    $pos = 1;
358            } elsif ($pos > $self->{max_pos}) {
359                    $log->warn("seek beyond last record");
360                    $pos = $self->{max_pos};
361            }
362    
363            return $self->{pos} = (($pos - 1) || -1);
364    }
365    
366    
367    =head1 MEMORY USAGE
368    
369    C<low_mem> options is double-edged sword. If enabled, WebPAC
370    will run on memory constraint machines (which doesn't have enough
371    physical RAM to create memory structure for whole source database).
372    
373    If your machine has 512Mb or more of RAM and database is around 10000 records,
374    memory shouldn't be an issue. If you don't have enough physical RAM, you
375    might consider using virtual memory (if your operating system is handling it
376    well, like on FreeBSD or Linux) instead of dropping to L<DBM::Deep> to handle
377    parsed structure of ISIS database (this is what C<low_mem> option does).
378    
379    Hitting swap at end of reading source database is probably o.k. However,
380    hitting swap before 90% will dramatically decrease performance and you will
381    be better off with C<low_mem> and using rest of availble memory for
382    operating system disk cache (Linux is particuallary good about this).
383    However, every access to database record will require disk access, so
384    generation phase will be slower 10-100 times.
385    
386    Parsed structures are essential - you just have option to trade RAM memory
387    (which is fast) for disk space (which is slow). Be sure to have planty of
388    disk space if you are using C<low_mem> and thus L<DBM::Deep>.
389    
390    However, when WebPAC is running on desktop machines (or laptops :-), it's
391    highly undesireable for system to start swapping. Using C<low_mem> option can
392    reduce WecPAC memory usage to around 64Mb for same database with lookup
393    fields and sorted indexes which stay in RAM. Performance will suffer, but
394    memory usage will really be minimal. It might be also more confortable to
395    run WebPAC reniced on those machines.
396    
397    
398    =head1 AUTHOR
399    
400    Dobrica Pavlinusic, C<< <dpavlin@rot13.org> >>
401    
402  =head1 COPYRIGHT & LICENSE  =head1 COPYRIGHT & LICENSE
403    

Legend:
Removed from v.1  
changed lines
  Added in v.339

  ViewVC Help
Powered by ViewVC 1.1.26