/[webpac2]/trunk/lib/WebPAC/Input.pm
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Diff of /trunk/lib/WebPAC/Input.pm

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 1 by dpavlin, Sat Jun 25 20:23:23 2005 UTC revision 506 by dpavlin, Mon May 15 09:59:05 2006 UTC
# Line 3  package WebPAC::Input; Line 3  package WebPAC::Input;
3  use warnings;  use warnings;
4  use strict;  use strict;
5    
6    use WebPAC::Common;
7    use base qw/WebPAC::Common/;
8    use Text::Iconv;
9    use Data::Dumper;
10    
11  =head1 NAME  =head1 NAME
12    
13  WebPAC::Input - The great new WebPAC::Input!  WebPAC::Input - read different file formats into WebPAC
14    
15  =head1 VERSION  =head1 VERSION
16    
17  Version 0.01  Version 0.05
18    
19  =cut  =cut
20    
21  our $VERSION = '0.01';  our $VERSION = '0.05';
22    
23  =head1 SYNOPSIS  =head1 SYNOPSIS
24    
25  Quick summary of what the module does.  This module implements input as database which have fixed and known
26    I<size> while indexing and single unique numeric identifier for database
27    position ranging from 1 to I<size>.
28    
29    Simply, something that is indexed by unmber from 1 .. I<size>.
30    
31    Examples of such databases are CDS/ISIS files, MARC files, lines in
32    text file, and so on.
33    
34    Specific file formats are implemented using low-level interface modules,
35    located in C<WebPAC::Input::*> namespace which export C<open_db>,
36    C<fetch_rec> and optional C<init> functions.
37    
38  Perhaps a little code snippet.  Perhaps a little code snippet.
39    
40      use WebPAC::Input;      use WebPAC::Input;
41    
42      my $foo = WebPAC::Input->new();      my $db = WebPAC::Input->new(
43      ...          module => 'WebPAC::Input::ISIS',
44                    config => $config,
45                    lookup => $lookup_obj,
46                    low_mem => 1,
47        );
48    
49        $db->open('/path/to/database');
50            print "database size: ",$db->size,"\n";
51            while (my $rec = $db->fetch) {
52                    # do something with $rec
53            }
54    
 =head1 EXPORT  
55    
 A list of functions that can be exported.  You can delete this section  
 if you don't export anything, such as for a purely object-oriented module.  
56    
57  =head1 FUNCTIONS  =head1 FUNCTIONS
58    
59  =head2 function1  =head2 new
60    
61    Create new input database object.
62    
63      my $db = new WebPAC::Input(
64            module => 'WebPAC::Input::MARC',
65            code_page => 'ISO-8859-2',
66            low_mem => 1,
67            recode => 'char pairs',
68            no_progress_bar => 1,
69      );
70    
71    C<module> is low-level file format module. See L<WebPAC::Input::Isis> and
72    L<WebPAC::Input::MARC>.
73    
74    Optional parametar C<code_page> specify application code page (which will be
75    used internally). This should probably be your terminal encoding, and by
76    default, it C<ISO-8859-2>.
77    
78    Default is not to use C<low_mem> options (see L<MEMORY USAGE> below).
79    
80    C<recode> is optional string constisting of character or words pairs that
81    should be replaced in input stream.
82    
83    C<no_progress_bar> disables progress bar output on C<STDOUT>
84    
85    This function will also call low-level C<init> if it exists with same
86    parametars.
87    
88    =cut
89    
90    sub new {
91            my $class = shift;
92            my $self = {@_};
93            bless($self, $class);
94    
95            my $log = $self->_get_logger;
96    
97            $log->logconfess("specify low-level file format module") unless ($self->{module});
98            my $module = $self->{module};
99            $module =~ s#::#/#g;
100            $module .= '.pm';
101            $log->debug("require low-level module $self->{module} from $module");
102    
103            require $module;
104            #eval $self->{module} .'->import';
105    
106            # check if required subclasses are implemented
107            foreach my $subclass (qw/open_db fetch_rec init/) {
108                    my $n = $self->{module} . '::' . $subclass;
109                    if (! defined &{ $n }) {
110                            my $missing = "missing $subclass in $self->{module}";
111                            $self->{$subclass} = sub { $log->logwarn($missing) };
112                    } else {
113                            $self->{$subclass} = \&{ $n };
114                    }
115            }
116    
117            if ($self->{init}) {
118                    $log->debug("calling init");
119                    $self->{init}->($self, @_);
120            }
121    
122            $self->{'code_page'} ||= 'ISO-8859-2';
123    
124            # running with low_mem flag? well, use DBM::Deep then.
125            if ($self->{'low_mem'}) {
126                    $log->info("running with low_mem which impacts performance (<32 Mb memory usage)");
127    
128                    my $db_file = "data.db";
129    
130                    if (-e $db_file) {
131                            unlink $db_file or $log->logdie("can't remove '$db_file' from last run");
132                            $log->debug("removed '$db_file' from last run");
133                    }
134    
135                    require DBM::Deep;
136    
137                    my $db = new DBM::Deep $db_file;
138    
139                    $log->logdie("DBM::Deep error: $!") unless ($db);
140    
141                    if ($db->error()) {
142                            $log->logdie("can't open '$db_file' under low_mem: ",$db->error());
143                    } else {
144                            $log->debug("using file '$db_file' for DBM::Deep");
145                    }
146    
147                    $self->{'db'} = $db;
148            }
149    
150            $self ? return $self : return undef;
151    }
152    
153    =head2 open
154    
155    This function will read whole database in memory and produce lookups.
156    
157     $input->open(
158            path => '/path/to/database/file',
159            code_page => '852',
160            limit => 500,
161            offset => 6000,
162            lookup => $lookup_obj,
163            stats => 1,
164     );
165    
166    By default, C<code_page> is assumed to be C<852>.
167    
168    C<offset> is optional parametar to position at some offset before reading from database.
169    
170    C<limit> is optional parametar to read just C<limit> records from database
171    
172    C<stats> create optional report about usage of fields and subfields
173    
174    Returns size of database, regardless of C<offset> and C<limit>
175    parametars, see also C<size>.
176    
177  =cut  =cut
178    
179  sub function1 {  sub open {
180            my $self = shift;
181            my $arg = {@_};
182    
183            my $log = $self->_get_logger();
184    
185            $log->logcroak("need path") if (! $arg->{'path'});
186            my $code_page = $arg->{'code_page'} || '852';
187    
188            # store data in object
189            $self->{'input_code_page'} = $code_page;
190            foreach my $v (qw/path offset limit/) {
191                    $self->{$v} = $arg->{$v} if ($arg->{$v});
192            }
193    
194            # create Text::Iconv object
195            $self->{iconv} = Text::Iconv->new($code_page,$self->{'code_page'});
196    
197            my $filter_ref;
198    
199            if ($self->{recode}) {
200                    my @r = split(/\s/, $self->{recode});
201                    if ($#r % 2 != 1) {
202                            $log->logwarn("recode needs even number of elements (some number of valid pairs)");
203                    } else {
204                            my $recode;
205                            while (@r) {
206                                    my $from = shift @r;
207                                    my $to = shift @r;
208                                    $recode->{$from} = $to;
209                            }
210    
211                            my $regex = join '|' => keys %{ $recode };
212    
213                            $log->debug("using recode regex: $regex");
214                            
215                            $filter_ref = sub {
216                                    my $t = shift;
217                                    $t =~ s/($regex)/$recode->{$1}/g;
218                                    return $t;
219                            };
220    
221                    }
222    
223            }
224    
225            my ($db, $size) = $self->{open_db}->( $self,
226                    path => $arg->{path},
227                    filter => $filter_ref,
228            );
229    
230            unless (defined($db)) {
231                    $log->logwarn("can't open database $arg->{path}, skipping...");
232                    return;
233            }
234    
235            unless ($size) {
236                    $log->logwarn("no records in database $arg->{path}, skipping...");
237                    return;
238            }
239    
240            my $from_rec = 1;
241            my $to_rec = $size;
242    
243            if (my $s = $self->{offset}) {
244                    $log->info("skipping to MFN $s");
245                    $from_rec = $s;
246            } else {
247                    $self->{offset} = $from_rec;
248            }
249    
250            if ($self->{limit}) {
251                    $log->debug("limiting to ",$self->{limit}," records");
252                    $to_rec = $from_rec + $self->{limit} - 1;
253                    $to_rec = $size if ($to_rec > $size);
254            }
255    
256            # store size for later
257            $self->{size} = ($to_rec - $from_rec) ? ($to_rec - $from_rec + 1) : 0;
258    
259            $log->info("processing $self->{size}/$size records [$from_rec-$to_rec] convert $code_page -> $self->{code_page}", $self->{stats} ? ' [stats]' : '');
260    
261            # read database
262            for (my $pos = $from_rec; $pos <= $to_rec; $pos++) {
263    
264                    $log->debug("position: $pos\n");
265    
266                    my $rec = $self->{fetch_rec}->($self, $db, $pos );
267    
268                    $log->debug(sub { Dumper($rec) });
269    
270                    if (! $rec) {
271                            $log->warn("record $pos empty? skipping...");
272                            next;
273                    }
274    
275                    # store
276                    if ($self->{low_mem}) {
277                            $self->{db}->put($pos, $rec);
278                    } else {
279                            $self->{data}->{$pos} = $rec;
280                    }
281    
282                    # create lookup
283                    $self->{'lookup'}->add( $rec ) if ($rec && $self->{'lookup'});
284    
285                    # update counters for statistics
286                    if ($self->{stats}) {
287                            map {
288                                    my $fld = $_;
289                                    $self->{_stats}->{fld}->{ $fld }++;
290                                    if (ref($rec->{ $fld }) eq 'ARRAY') {
291                                            map {
292                                                    if (ref($_) eq 'HASH') {
293                                                            map {
294                                                                    $self->{_stats}->{sf}->{ $fld }->{ $_ }++;
295                                                            } keys %{ $_ };
296                                                    } else {
297                                                            $self->{_stats}->{repeatable}->{ $fld }++;
298                                                    }
299                                            } @{ $rec->{$fld} };
300                                    }
301                            } keys %{ $rec };
302                    }
303    
304                    $self->progress_bar($pos,$to_rec) unless ($self->{no_progress_bar});
305    
306            }
307    
308            $self->{pos} = -1;
309            $self->{last_pcnt} = 0;
310    
311            # store max mfn and return it.
312            $self->{max_pos} = $to_rec;
313            $log->debug("max_pos: $to_rec");
314    
315            return $size;
316  }  }
317    
318  =head2 function2  =head2 fetch
319    
320    Fetch next record from database. It will also displays progress bar.
321    
322     my $rec = $isis->fetch;
323    
324    Record from this function should probably go to C<data_structure> for
325    normalisation.
326    
327  =cut  =cut
328    
329  sub function2 {  sub fetch {
330            my $self = shift;
331    
332            my $log = $self->_get_logger();
333    
334            $log->logconfess("it seems that you didn't load database!") unless ($self->{pos});
335    
336            if ($self->{pos} == -1) {
337                    $self->{pos} = $self->{offset};
338            } else {
339                    $self->{pos}++;
340            }
341    
342            my $mfn = $self->{pos};
343    
344            if ($mfn > $self->{max_pos}) {
345                    $self->{pos} = $self->{max_pos};
346                    $log->debug("at EOF");
347                    return;
348            }
349    
350            $self->progress_bar($mfn,$self->{max_pos}) unless ($self->{no_progress_bar});
351    
352            my $rec;
353    
354            if ($self->{low_mem}) {
355                    $rec = $self->{db}->get($mfn);
356            } else {
357                    $rec = $self->{data}->{$mfn};
358            }
359    
360            $rec ||= 0E0;
361  }  }
362    
363  =head1 AUTHOR  =head2 pos
364    
365  Dobrica Pavlinusic, C<< <dpavlin@rot13.org> >>  Returns current record number (MFN).
366    
367  =head1 BUGS   print $isis->pos;
368    
369  Please report any bugs or feature requests to  First record in database has position 1.
 C<bug-webpac-input@rt.cpan.org>, or through the web interface at  
 L<http://rt.cpan.org/NoAuth/ReportBug.html?Queue=WebPAC>.  
 I will be notified, and then you'll automatically be notified of progress on  
 your bug as I make changes.  
370    
371  =head1 ACKNOWLEDGEMENTS  =cut
372    
373    sub pos {
374            my $self = shift;
375            return $self->{pos};
376    }
377    
378    
379    =head2 size
380    
381    Returns number of records in database
382    
383     print $isis->size;
384    
385    Result from this function can be used to loop through all records
386    
387     foreach my $mfn ( 1 ... $isis->size ) { ... }
388    
389    because it takes into account C<offset> and C<limit>.
390    
391    =cut
392    
393    sub size {
394            my $self = shift;
395            return $self->{size};
396    }
397    
398    =head2 seek
399    
400    Seek to specified MFN in file.
401    
402     $isis->seek(42);
403    
404    First record in database has position 1.
405    
406    =cut
407    
408    sub seek {
409            my $self = shift;
410            my $pos = shift || return;
411    
412            my $log = $self->_get_logger();
413    
414            if ($pos < 1) {
415                    $log->warn("seek before first record");
416                    $pos = 1;
417            } elsif ($pos > $self->{max_pos}) {
418                    $log->warn("seek beyond last record");
419                    $pos = $self->{max_pos};
420            }
421    
422            return $self->{pos} = (($pos - 1) || -1);
423    }
424    
425    =head2 stats
426    
427    Dump statistics about field and subfield usage
428    
429      print Dumper( $input->stats );
430    
431    =cut
432    
433    sub stats {
434            my $self = shift;
435            return $self->{_stats};
436    }
437    
438    =head1 MEMORY USAGE
439    
440    C<low_mem> options is double-edged sword. If enabled, WebPAC
441    will run on memory constraint machines (which doesn't have enough
442    physical RAM to create memory structure for whole source database).
443    
444    If your machine has 512Mb or more of RAM and database is around 10000 records,
445    memory shouldn't be an issue. If you don't have enough physical RAM, you
446    might consider using virtual memory (if your operating system is handling it
447    well, like on FreeBSD or Linux) instead of dropping to L<DBM::Deep> to handle
448    parsed structure of ISIS database (this is what C<low_mem> option does).
449    
450    Hitting swap at end of reading source database is probably o.k. However,
451    hitting swap before 90% will dramatically decrease performance and you will
452    be better off with C<low_mem> and using rest of availble memory for
453    operating system disk cache (Linux is particuallary good about this).
454    However, every access to database record will require disk access, so
455    generation phase will be slower 10-100 times.
456    
457    Parsed structures are essential - you just have option to trade RAM memory
458    (which is fast) for disk space (which is slow). Be sure to have planty of
459    disk space if you are using C<low_mem> and thus L<DBM::Deep>.
460    
461    However, when WebPAC is running on desktop machines (or laptops :-), it's
462    highly undesireable for system to start swapping. Using C<low_mem> option can
463    reduce WecPAC memory usage to around 64Mb for same database with lookup
464    fields and sorted indexes which stay in RAM. Performance will suffer, but
465    memory usage will really be minimal. It might be also more confortable to
466    run WebPAC reniced on those machines.
467    
468    
469    =head1 AUTHOR
470    
471    Dobrica Pavlinusic, C<< <dpavlin@rot13.org> >>
472    
473  =head1 COPYRIGHT & LICENSE  =head1 COPYRIGHT & LICENSE
474    

Legend:
Removed from v.1  
changed lines
  Added in v.506

  ViewVC Help
Powered by ViewVC 1.1.26