/[webpac2]/trunk/lib/WebPAC/Input.pm
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Diff of /trunk/lib/WebPAC/Input.pm

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 1 by dpavlin, Sat Jun 25 20:23:23 2005 UTC revision 416 by dpavlin, Sun Feb 26 23:21:50 2006 UTC
# Line 3  package WebPAC::Input; Line 3  package WebPAC::Input;
3  use warnings;  use warnings;
4  use strict;  use strict;
5    
6    use WebPAC::Common;
7    use base qw/WebPAC::Common/;
8    use Text::Iconv;
9    use Data::Dumper;
10    
11  =head1 NAME  =head1 NAME
12    
13  WebPAC::Input - The great new WebPAC::Input!  WebPAC::Input - read different file formats into WebPAC
14    
15  =head1 VERSION  =head1 VERSION
16    
17  Version 0.01  Version 0.04
18    
19  =cut  =cut
20    
21  our $VERSION = '0.01';  our $VERSION = '0.04';
22    
23  =head1 SYNOPSIS  =head1 SYNOPSIS
24    
25  Quick summary of what the module does.  This module implements input as database which have fixed and known
26    I<size> while indexing and single unique numeric identifier for database
27    position ranging from 1 to I<size>.
28    
29    Simply, something that is indexed by unmber from 1 .. I<size>.
30    
31    Examples of such databases are CDS/ISIS files, MARC files, lines in
32    text file, and so on.
33    
34    Specific file formats are implemented using low-level interface modules,
35    located in C<WebPAC::Input::*> namespace which export C<open_db>,
36    C<fetch_rec> and optional C<init> functions.
37    
38  Perhaps a little code snippet.  Perhaps a little code snippet.
39    
40      use WebPAC::Input;      use WebPAC::Input;
41    
42      my $foo = WebPAC::Input->new();      my $db = WebPAC::Input->new(
43      ...          module => 'WebPAC::Input::ISIS',
44                    config => $config,
45                    lookup => $lookup_obj,
46                    low_mem => 1,
47        );
48    
49        $db->open('/path/to/database');
50            print "database size: ",$db->size,"\n";
51            while (my $rec = $db->fetch) {
52                    # do something with $rec
53            }
54    
 =head1 EXPORT  
55    
 A list of functions that can be exported.  You can delete this section  
 if you don't export anything, such as for a purely object-oriented module.  
56    
57  =head1 FUNCTIONS  =head1 FUNCTIONS
58    
59  =head2 function1  =head2 new
60    
61    Create new input database object.
62    
63      my $db = new WebPAC::Input(
64            module => 'WebPAC::Input::MARC',
65            code_page => 'ISO-8859-2',
66            low_mem => 1,
67            recode => 'char pairs',
68      );
69    
70    C<module> is low-level file format module. See L<WebPAC::Input::Isis> and
71    L<WebPAC::Input::MARC>.
72    
73    Optional parametar C<code_page> specify application code page (which will be
74    used internally). This should probably be your terminal encoding, and by
75    default, it C<ISO-8859-2>.
76    
77    Default is not to use C<low_mem> options (see L<MEMORY USAGE> below).
78    
79    This function will also call low-level C<init> if it exists with same
80    parametars.
81    
82  =cut  =cut
83    
84  sub function1 {  sub new {
85            my $class = shift;
86            my $self = {@_};
87            bless($self, $class);
88    
89            my $log = $self->_get_logger;
90    
91            $log->logconfess("specify low-level file format module") unless ($self->{module});
92            my $module = $self->{module};
93            $module =~ s#::#/#g;
94            $module .= '.pm';
95            $log->debug("require low-level module $self->{module} from $module");
96    
97            require $module;
98            #eval $self->{module} .'->import';
99    
100            # check if required subclasses are implemented
101            foreach my $subclass (qw/open_db fetch_rec init/) {
102                    my $n = $self->{module} . '::' . $subclass;
103                    if (! defined &{ $n }) {
104                            my $missing = "missing $subclass in $self->{module}";
105                            $self->{$subclass} = sub { $log->logwarn($missing) };
106                    } else {
107                            $self->{$subclass} = \&{ $n };
108                    }
109            }
110    
111            if ($self->{init}) {
112                    $log->debug("calling init");
113                    $self->{init}->($self, @_);
114            }
115    
116            $self->{'code_page'} ||= 'ISO-8859-2';
117    
118            # running with low_mem flag? well, use DBM::Deep then.
119            if ($self->{'low_mem'}) {
120                    $log->info("running with low_mem which impacts performance (<32 Mb memory usage)");
121    
122                    my $db_file = "data.db";
123    
124                    if (-e $db_file) {
125                            unlink $db_file or $log->logdie("can't remove '$db_file' from last run");
126                            $log->debug("removed '$db_file' from last run");
127                    }
128    
129                    require DBM::Deep;
130    
131                    my $db = new DBM::Deep $db_file;
132    
133                    $log->logdie("DBM::Deep error: $!") unless ($db);
134    
135                    if ($db->error()) {
136                            $log->logdie("can't open '$db_file' under low_mem: ",$db->error());
137                    } else {
138                            $log->debug("using file '$db_file' for DBM::Deep");
139                    }
140    
141                    $self->{'db'} = $db;
142            }
143    
144            $self ? return $self : return undef;
145  }  }
146    
147  =head2 function2  =head2 open
148    
149    This function will read whole database in memory and produce lookups.
150    
151     $input->open(
152            path => '/path/to/database/file',
153            code_page => '852',
154            limit => 500,
155            offset => 6000,
156            lookup => $lookup_obj,
157     );
158    
159    By default, C<code_page> is assumed to be C<852>.
160    
161    C<offset> is optional parametar to position at some offset before reading from database.
162    
163    C<limit> is optional parametar to read just C<limit> records from database
164    
165    Returns size of database, regardless of C<offset> and C<limit>
166    parametars, see also C<size>.
167    
168  =cut  =cut
169    
170  sub function2 {  sub open {
171            my $self = shift;
172            my $arg = {@_};
173    
174            my $log = $self->_get_logger();
175    
176            $log->logcroak("need path") if (! $arg->{'path'});
177            my $code_page = $arg->{'code_page'} || '852';
178    
179            # store data in object
180            $self->{'input_code_page'} = $code_page;
181            foreach my $v (qw/path offset limit/) {
182                    $self->{$v} = $arg->{$v} if ($arg->{$v});
183            }
184    
185            # create Text::Iconv object
186            $self->{iconv} = Text::Iconv->new($code_page,$self->{'code_page'});
187    
188            my $filter_ref;
189    
190            if ($self->{recode}) {
191                    my @r = split(/\s/, $self->{recode});
192                    if ($#r % 2 != 1) {
193                            $log->logwarn("recode needs even number of elements (some number of valid pairs)");
194                    } else {
195                            my $recode;
196                            while (@r) {
197                                    my $from = shift @r;
198                                    my $to = shift @r;
199                                    $recode->{$from} = $to;
200                            }
201    
202                            my $regex = join '|' => keys %{ $recode };
203    
204                            $log->debug("using recode regex: $regex");
205                            
206                            $filter_ref = sub {
207                                    my $t = shift;
208                                    $t =~ s/($regex)/$recode->{$1}/g;
209                                    return $t;
210                            };
211    
212                    }
213    
214            }
215    
216            my ($db, $size) = $self->{open_db}->( $self,
217                    path => $arg->{path},
218                    filter => $filter_ref,
219            );
220    
221            unless ($db) {
222                    $log->logwarn("can't open database $arg->{path}, skipping...");
223                    return;
224            }
225    
226            unless ($size) {
227                    $log->logwarn("no records in database $arg->{path}, skipping...");
228                    return;
229            }
230    
231            my $from_rec = 1;
232            my $to_rec = $size;
233    
234            if (my $s = $self->{offset}) {
235                    $log->info("skipping to MFN $s");
236                    $from_rec = $s;
237            } else {
238                    $self->{offset} = $from_rec;
239            }
240    
241            if ($self->{limit}) {
242                    $log->debug("limiting to ",$self->{limit}," records");
243                    $to_rec = $from_rec + $self->{limit} - 1;
244                    $to_rec = $size if ($to_rec > $size);
245            }
246    
247            # store size for later
248            $self->{size} = ($to_rec - $from_rec) ? ($to_rec - $from_rec + 1) : 0;
249    
250            $log->info("processing $self->{size}/$size records [$from_rec-$to_rec] convert $code_page -> $self->{code_page}");
251    
252            # read database
253            for (my $pos = $from_rec; $pos <= $to_rec; $pos++) {
254    
255                    $log->debug("position: $pos\n");
256    
257                    my $rec = $self->{fetch_rec}->($self, $db, $pos );
258    
259                    $log->debug(sub { Dumper($rec) });
260    
261                    if (! $rec) {
262                            $log->warn("record $pos empty? skipping...");
263                            next;
264                    }
265    
266                    # store
267                    if ($self->{low_mem}) {
268                            $self->{db}->put($pos, $rec);
269                    } else {
270                            $self->{data}->{$pos} = $rec;
271                    }
272    
273                    # create lookup
274                    $self->{'lookup'}->add( $rec ) if ($rec && $self->{'lookup'});
275    
276                    $self->progress_bar($pos,$to_rec);
277    
278            }
279    
280            $self->{pos} = -1;
281            $self->{last_pcnt} = 0;
282    
283            # store max mfn and return it.
284            $self->{max_pos} = $to_rec;
285            $log->debug("max_pos: $to_rec");
286    
287            return $size;
288  }  }
289    
290  =head1 AUTHOR  =head2 fetch
291    
292  Dobrica Pavlinusic, C<< <dpavlin@rot13.org> >>  Fetch next record from database. It will also displays progress bar.
293    
294  =head1 BUGS   my $rec = $isis->fetch;
295    
296  Please report any bugs or feature requests to  Record from this function should probably go to C<data_structure> for
297  C<bug-webpac-input@rt.cpan.org>, or through the web interface at  normalisation.
 L<http://rt.cpan.org/NoAuth/ReportBug.html?Queue=WebPAC>.  
 I will be notified, and then you'll automatically be notified of progress on  
 your bug as I make changes.  
298    
299  =head1 ACKNOWLEDGEMENTS  =cut
300    
301    sub fetch {
302            my $self = shift;
303    
304            my $log = $self->_get_logger();
305    
306            $log->logconfess("it seems that you didn't load database!") unless ($self->{pos});
307    
308            if ($self->{pos} == -1) {
309                    $self->{pos} = $self->{offset};
310            } else {
311                    $self->{pos}++;
312            }
313    
314            my $mfn = $self->{pos};
315    
316            if ($mfn > $self->{max_pos}) {
317                    $self->{pos} = $self->{max_pos};
318                    $log->debug("at EOF");
319                    return;
320            }
321    
322            $self->progress_bar($mfn,$self->{max_pos});
323    
324            my $rec;
325    
326            if ($self->{low_mem}) {
327                    $rec = $self->{db}->get($mfn);
328            } else {
329                    $rec = $self->{data}->{$mfn};
330            }
331    
332            $rec ||= 0E0;
333    }
334    
335    =head2 pos
336    
337    Returns current record number (MFN).
338    
339     print $isis->pos;
340    
341    First record in database has position 1.
342    
343    =cut
344    
345    sub pos {
346            my $self = shift;
347            return $self->{pos};
348    }
349    
350    
351    =head2 size
352    
353    Returns number of records in database
354    
355     print $isis->size;
356    
357    Result from this function can be used to loop through all records
358    
359     foreach my $mfn ( 1 ... $isis->size ) { ... }
360    
361    because it takes into account C<offset> and C<limit>.
362    
363    =cut
364    
365    sub size {
366            my $self = shift;
367            return $self->{size};
368    }
369    
370    =head2 seek
371    
372    Seek to specified MFN in file.
373    
374     $isis->seek(42);
375    
376    First record in database has position 1.
377    
378    =cut
379    
380    sub seek {
381            my $self = shift;
382            my $pos = shift || return;
383    
384            my $log = $self->_get_logger();
385    
386            if ($pos < 1) {
387                    $log->warn("seek before first record");
388                    $pos = 1;
389            } elsif ($pos > $self->{max_pos}) {
390                    $log->warn("seek beyond last record");
391                    $pos = $self->{max_pos};
392            }
393    
394            return $self->{pos} = (($pos - 1) || -1);
395    }
396    
397    
398    =head1 MEMORY USAGE
399    
400    C<low_mem> options is double-edged sword. If enabled, WebPAC
401    will run on memory constraint machines (which doesn't have enough
402    physical RAM to create memory structure for whole source database).
403    
404    If your machine has 512Mb or more of RAM and database is around 10000 records,
405    memory shouldn't be an issue. If you don't have enough physical RAM, you
406    might consider using virtual memory (if your operating system is handling it
407    well, like on FreeBSD or Linux) instead of dropping to L<DBM::Deep> to handle
408    parsed structure of ISIS database (this is what C<low_mem> option does).
409    
410    Hitting swap at end of reading source database is probably o.k. However,
411    hitting swap before 90% will dramatically decrease performance and you will
412    be better off with C<low_mem> and using rest of availble memory for
413    operating system disk cache (Linux is particuallary good about this).
414    However, every access to database record will require disk access, so
415    generation phase will be slower 10-100 times.
416    
417    Parsed structures are essential - you just have option to trade RAM memory
418    (which is fast) for disk space (which is slow). Be sure to have planty of
419    disk space if you are using C<low_mem> and thus L<DBM::Deep>.
420    
421    However, when WebPAC is running on desktop machines (or laptops :-), it's
422    highly undesireable for system to start swapping. Using C<low_mem> option can
423    reduce WecPAC memory usage to around 64Mb for same database with lookup
424    fields and sorted indexes which stay in RAM. Performance will suffer, but
425    memory usage will really be minimal. It might be also more confortable to
426    run WebPAC reniced on those machines.
427    
428    
429    =head1 AUTHOR
430    
431    Dobrica Pavlinusic, C<< <dpavlin@rot13.org> >>
432    
433  =head1 COPYRIGHT & LICENSE  =head1 COPYRIGHT & LICENSE
434    

Legend:
Removed from v.1  
changed lines
  Added in v.416

  ViewVC Help
Powered by ViewVC 1.1.26