/[webpac2]/trunk/lib/WebPAC/Input.pm
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Diff of /trunk/lib/WebPAC/Input.pm

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 1 by dpavlin, Sat Jun 25 20:23:23 2005 UTC revision 487 by dpavlin, Sun May 14 12:34:50 2006 UTC
# Line 3  package WebPAC::Input; Line 3  package WebPAC::Input;
3  use warnings;  use warnings;
4  use strict;  use strict;
5    
6    use WebPAC::Common;
7    use base qw/WebPAC::Common/;
8    use Text::Iconv;
9    use Data::Dumper;
10    
11  =head1 NAME  =head1 NAME
12    
13  WebPAC::Input - The great new WebPAC::Input!  WebPAC::Input - read different file formats into WebPAC
14    
15  =head1 VERSION  =head1 VERSION
16    
17  Version 0.01  Version 0.04
18    
19  =cut  =cut
20    
21  our $VERSION = '0.01';  our $VERSION = '0.04';
22    
23  =head1 SYNOPSIS  =head1 SYNOPSIS
24    
25  Quick summary of what the module does.  This module implements input as database which have fixed and known
26    I<size> while indexing and single unique numeric identifier for database
27    position ranging from 1 to I<size>.
28    
29    Simply, something that is indexed by unmber from 1 .. I<size>.
30    
31    Examples of such databases are CDS/ISIS files, MARC files, lines in
32    text file, and so on.
33    
34    Specific file formats are implemented using low-level interface modules,
35    located in C<WebPAC::Input::*> namespace which export C<open_db>,
36    C<fetch_rec> and optional C<init> functions.
37    
38  Perhaps a little code snippet.  Perhaps a little code snippet.
39    
40      use WebPAC::Input;      use WebPAC::Input;
41    
42      my $foo = WebPAC::Input->new();      my $db = WebPAC::Input->new(
43      ...          module => 'WebPAC::Input::ISIS',
44                    config => $config,
45                    lookup => $lookup_obj,
46                    low_mem => 1,
47        );
48    
49        $db->open('/path/to/database');
50            print "database size: ",$db->size,"\n";
51            while (my $rec = $db->fetch) {
52                    # do something with $rec
53            }
54    
 =head1 EXPORT  
55    
 A list of functions that can be exported.  You can delete this section  
 if you don't export anything, such as for a purely object-oriented module.  
56    
57  =head1 FUNCTIONS  =head1 FUNCTIONS
58    
59  =head2 function1  =head2 new
60    
61    Create new input database object.
62    
63      my $db = new WebPAC::Input(
64            module => 'WebPAC::Input::MARC',
65            code_page => 'ISO-8859-2',
66            low_mem => 1,
67            recode => 'char pairs',
68            no_progress_bar => 1,
69      );
70    
71    C<module> is low-level file format module. See L<WebPAC::Input::Isis> and
72    L<WebPAC::Input::MARC>.
73    
74    Optional parametar C<code_page> specify application code page (which will be
75    used internally). This should probably be your terminal encoding, and by
76    default, it C<ISO-8859-2>.
77    
78    Default is not to use C<low_mem> options (see L<MEMORY USAGE> below).
79    
80    C<recode> is optional string constisting of character or words pairs that
81    should be replaced in input stream.
82    
83    C<no_progress_bar> disables progress bar output on C<STDOUT>
84    
85    This function will also call low-level C<init> if it exists with same
86    parametars.
87    
88  =cut  =cut
89    
90  sub function1 {  sub new {
91            my $class = shift;
92            my $self = {@_};
93            bless($self, $class);
94    
95            my $log = $self->_get_logger;
96    
97            $log->logconfess("specify low-level file format module") unless ($self->{module});
98            my $module = $self->{module};
99            $module =~ s#::#/#g;
100            $module .= '.pm';
101            $log->debug("require low-level module $self->{module} from $module");
102    
103            require $module;
104            #eval $self->{module} .'->import';
105    
106            # check if required subclasses are implemented
107            foreach my $subclass (qw/open_db fetch_rec init/) {
108                    my $n = $self->{module} . '::' . $subclass;
109                    if (! defined &{ $n }) {
110                            my $missing = "missing $subclass in $self->{module}";
111                            $self->{$subclass} = sub { $log->logwarn($missing) };
112                    } else {
113                            $self->{$subclass} = \&{ $n };
114                    }
115            }
116    
117            if ($self->{init}) {
118                    $log->debug("calling init");
119                    $self->{init}->($self, @_);
120            }
121    
122            $self->{'code_page'} ||= 'ISO-8859-2';
123    
124            # running with low_mem flag? well, use DBM::Deep then.
125            if ($self->{'low_mem'}) {
126                    $log->info("running with low_mem which impacts performance (<32 Mb memory usage)");
127    
128                    my $db_file = "data.db";
129    
130                    if (-e $db_file) {
131                            unlink $db_file or $log->logdie("can't remove '$db_file' from last run");
132                            $log->debug("removed '$db_file' from last run");
133                    }
134    
135                    require DBM::Deep;
136    
137                    my $db = new DBM::Deep $db_file;
138    
139                    $log->logdie("DBM::Deep error: $!") unless ($db);
140    
141                    if ($db->error()) {
142                            $log->logdie("can't open '$db_file' under low_mem: ",$db->error());
143                    } else {
144                            $log->debug("using file '$db_file' for DBM::Deep");
145                    }
146    
147                    $self->{'db'} = $db;
148            }
149    
150            $self ? return $self : return undef;
151  }  }
152    
153  =head2 function2  =head2 open
154    
155    This function will read whole database in memory and produce lookups.
156    
157     $input->open(
158            path => '/path/to/database/file',
159            code_page => '852',
160            limit => 500,
161            offset => 6000,
162            lookup => $lookup_obj,
163     );
164    
165    By default, C<code_page> is assumed to be C<852>.
166    
167    C<offset> is optional parametar to position at some offset before reading from database.
168    
169    C<limit> is optional parametar to read just C<limit> records from database
170    
171    Returns size of database, regardless of C<offset> and C<limit>
172    parametars, see also C<size>.
173    
174  =cut  =cut
175    
176  sub function2 {  sub open {
177            my $self = shift;
178            my $arg = {@_};
179    
180            my $log = $self->_get_logger();
181    
182            $log->logcroak("need path") if (! $arg->{'path'});
183            my $code_page = $arg->{'code_page'} || '852';
184    
185            # store data in object
186            $self->{'input_code_page'} = $code_page;
187            foreach my $v (qw/path offset limit/) {
188                    $self->{$v} = $arg->{$v} if ($arg->{$v});
189            }
190    
191            # create Text::Iconv object
192            $self->{iconv} = Text::Iconv->new($code_page,$self->{'code_page'});
193    
194            my $filter_ref;
195    
196            if ($self->{recode}) {
197                    my @r = split(/\s/, $self->{recode});
198                    if ($#r % 2 != 1) {
199                            $log->logwarn("recode needs even number of elements (some number of valid pairs)");
200                    } else {
201                            my $recode;
202                            while (@r) {
203                                    my $from = shift @r;
204                                    my $to = shift @r;
205                                    $recode->{$from} = $to;
206                            }
207    
208                            my $regex = join '|' => keys %{ $recode };
209    
210                            $log->debug("using recode regex: $regex");
211                            
212                            $filter_ref = sub {
213                                    my $t = shift;
214                                    $t =~ s/($regex)/$recode->{$1}/g;
215                                    return $t;
216                            };
217    
218                    }
219    
220            }
221    
222            my ($db, $size) = $self->{open_db}->( $self,
223                    path => $arg->{path},
224                    filter => $filter_ref,
225            );
226    
227            unless ($db) {
228                    $log->logwarn("can't open database $arg->{path}, skipping...");
229                    return;
230            }
231    
232            unless ($size) {
233                    $log->logwarn("no records in database $arg->{path}, skipping...");
234                    return;
235            }
236    
237            my $from_rec = 1;
238            my $to_rec = $size;
239    
240            if (my $s = $self->{offset}) {
241                    $log->info("skipping to MFN $s");
242                    $from_rec = $s;
243            } else {
244                    $self->{offset} = $from_rec;
245            }
246    
247            if ($self->{limit}) {
248                    $log->debug("limiting to ",$self->{limit}," records");
249                    $to_rec = $from_rec + $self->{limit} - 1;
250                    $to_rec = $size if ($to_rec > $size);
251            }
252    
253            # store size for later
254            $self->{size} = ($to_rec - $from_rec) ? ($to_rec - $from_rec + 1) : 0;
255    
256            $log->info("processing $self->{size}/$size records [$from_rec-$to_rec] convert $code_page -> $self->{code_page}");
257    
258            # read database
259            for (my $pos = $from_rec; $pos <= $to_rec; $pos++) {
260    
261                    $log->debug("position: $pos\n");
262    
263                    my $rec = $self->{fetch_rec}->($self, $db, $pos );
264    
265                    $log->debug(sub { Dumper($rec) });
266    
267                    if (! $rec) {
268                            $log->warn("record $pos empty? skipping...");
269                            next;
270                    }
271    
272                    # store
273                    if ($self->{low_mem}) {
274                            $self->{db}->put($pos, $rec);
275                    } else {
276                            $self->{data}->{$pos} = $rec;
277                    }
278    
279                    # create lookup
280                    $self->{'lookup'}->add( $rec ) if ($rec && $self->{'lookup'});
281    
282                    $self->progress_bar($pos,$to_rec) unless ($self->{no_progress_bar});
283    
284            }
285    
286            $self->{pos} = -1;
287            $self->{last_pcnt} = 0;
288    
289            # store max mfn and return it.
290            $self->{max_pos} = $to_rec;
291            $log->debug("max_pos: $to_rec");
292    
293            return $size;
294  }  }
295    
296  =head1 AUTHOR  =head2 fetch
297    
298  Dobrica Pavlinusic, C<< <dpavlin@rot13.org> >>  Fetch next record from database. It will also displays progress bar.
299    
300  =head1 BUGS   my $rec = $isis->fetch;
301    
302  Please report any bugs or feature requests to  Record from this function should probably go to C<data_structure> for
303  C<bug-webpac-input@rt.cpan.org>, or through the web interface at  normalisation.
 L<http://rt.cpan.org/NoAuth/ReportBug.html?Queue=WebPAC>.  
 I will be notified, and then you'll automatically be notified of progress on  
 your bug as I make changes.  
304    
305  =head1 ACKNOWLEDGEMENTS  =cut
306    
307    sub fetch {
308            my $self = shift;
309    
310            my $log = $self->_get_logger();
311    
312            $log->logconfess("it seems that you didn't load database!") unless ($self->{pos});
313    
314            if ($self->{pos} == -1) {
315                    $self->{pos} = $self->{offset};
316            } else {
317                    $self->{pos}++;
318            }
319    
320            my $mfn = $self->{pos};
321    
322            if ($mfn > $self->{max_pos}) {
323                    $self->{pos} = $self->{max_pos};
324                    $log->debug("at EOF");
325                    return;
326            }
327    
328            $self->progress_bar($mfn,$self->{max_pos}) unless ($self->{no_progress_bar});
329    
330            my $rec;
331    
332            if ($self->{low_mem}) {
333                    $rec = $self->{db}->get($mfn);
334            } else {
335                    $rec = $self->{data}->{$mfn};
336            }
337    
338            $rec ||= 0E0;
339    }
340    
341    =head2 pos
342    
343    Returns current record number (MFN).
344    
345     print $isis->pos;
346    
347    First record in database has position 1.
348    
349    =cut
350    
351    sub pos {
352            my $self = shift;
353            return $self->{pos};
354    }
355    
356    
357    =head2 size
358    
359    Returns number of records in database
360    
361     print $isis->size;
362    
363    Result from this function can be used to loop through all records
364    
365     foreach my $mfn ( 1 ... $isis->size ) { ... }
366    
367    because it takes into account C<offset> and C<limit>.
368    
369    =cut
370    
371    sub size {
372            my $self = shift;
373            return $self->{size};
374    }
375    
376    =head2 seek
377    
378    Seek to specified MFN in file.
379    
380     $isis->seek(42);
381    
382    First record in database has position 1.
383    
384    =cut
385    
386    sub seek {
387            my $self = shift;
388            my $pos = shift || return;
389    
390            my $log = $self->_get_logger();
391    
392            if ($pos < 1) {
393                    $log->warn("seek before first record");
394                    $pos = 1;
395            } elsif ($pos > $self->{max_pos}) {
396                    $log->warn("seek beyond last record");
397                    $pos = $self->{max_pos};
398            }
399    
400            return $self->{pos} = (($pos - 1) || -1);
401    }
402    
403    
404    =head1 MEMORY USAGE
405    
406    C<low_mem> options is double-edged sword. If enabled, WebPAC
407    will run on memory constraint machines (which doesn't have enough
408    physical RAM to create memory structure for whole source database).
409    
410    If your machine has 512Mb or more of RAM and database is around 10000 records,
411    memory shouldn't be an issue. If you don't have enough physical RAM, you
412    might consider using virtual memory (if your operating system is handling it
413    well, like on FreeBSD or Linux) instead of dropping to L<DBM::Deep> to handle
414    parsed structure of ISIS database (this is what C<low_mem> option does).
415    
416    Hitting swap at end of reading source database is probably o.k. However,
417    hitting swap before 90% will dramatically decrease performance and you will
418    be better off with C<low_mem> and using rest of availble memory for
419    operating system disk cache (Linux is particuallary good about this).
420    However, every access to database record will require disk access, so
421    generation phase will be slower 10-100 times.
422    
423    Parsed structures are essential - you just have option to trade RAM memory
424    (which is fast) for disk space (which is slow). Be sure to have planty of
425    disk space if you are using C<low_mem> and thus L<DBM::Deep>.
426    
427    However, when WebPAC is running on desktop machines (or laptops :-), it's
428    highly undesireable for system to start swapping. Using C<low_mem> option can
429    reduce WecPAC memory usage to around 64Mb for same database with lookup
430    fields and sorted indexes which stay in RAM. Performance will suffer, but
431    memory usage will really be minimal. It might be also more confortable to
432    run WebPAC reniced on those machines.
433    
434    
435    =head1 AUTHOR
436    
437    Dobrica Pavlinusic, C<< <dpavlin@rot13.org> >>
438    
439  =head1 COPYRIGHT & LICENSE  =head1 COPYRIGHT & LICENSE
440    

Legend:
Removed from v.1  
changed lines
  Added in v.487

  ViewVC Help
Powered by ViewVC 1.1.26