/[webpac2]/trunk/lib/WebPAC/Input.pm
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Annotation of /trunk/lib/WebPAC/Input.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 726 - (hide annotations)
Fri Sep 29 19:52:17 2006 UTC (17 years, 7 months ago) by dpavlin
File size: 16744 byte(s)
 r1045@llin:  dpavlin | 2006-09-29 21:38:42 +0200
 change low-level API to be OO (and remove various ugly cludges).

1 dpavlin 1 package WebPAC::Input;
2    
3     use warnings;
4     use strict;
5    
6 dpavlin 507 use blib;
7    
8 dpavlin 487 use WebPAC::Common;
9 dpavlin 285 use base qw/WebPAC::Common/;
10 dpavlin 308 use Data::Dumper;
11 dpavlin 624 use Encode qw/from_to/;
12 dpavlin 285
13 dpavlin 1 =head1 NAME
14    
15 dpavlin 286 WebPAC::Input - read different file formats into WebPAC
16 dpavlin 1
17     =head1 VERSION
18    
19 dpavlin 707 Version 0.13
20 dpavlin 1
21     =cut
22    
23 dpavlin 707 our $VERSION = '0.13';
24 dpavlin 1
25     =head1 SYNOPSIS
26    
27 dpavlin 286 This module implements input as database which have fixed and known
28     I<size> while indexing and single unique numeric identifier for database
29     position ranging from 1 to I<size>.
30 dpavlin 1
31 dpavlin 286 Simply, something that is indexed by unmber from 1 .. I<size>.
32    
33     Examples of such databases are CDS/ISIS files, MARC files, lines in
34     text file, and so on.
35    
36     Specific file formats are implemented using low-level interface modules,
37     located in C<WebPAC::Input::*> namespace which export C<open_db>,
38     C<fetch_rec> and optional C<init> functions.
39    
40 dpavlin 1 Perhaps a little code snippet.
41    
42 dpavlin 597 use WebPAC::Input;
43 dpavlin 1
44 dpavlin 597 my $db = WebPAC::Input->new(
45     module => 'WebPAC::Input::ISIS',
46 dpavlin 286 low_mem => 1,
47 dpavlin 597 );
48 dpavlin 1
49 dpavlin 597 $db->open( path => '/path/to/database' );
50 dpavlin 416 print "database size: ",$db->size,"\n";
51     while (my $rec = $db->fetch) {
52     # do something with $rec
53     }
54 dpavlin 1
55 dpavlin 286
56    
57 dpavlin 1 =head1 FUNCTIONS
58    
59 dpavlin 3 =head2 new
60 dpavlin 1
61 dpavlin 3 Create new input database object.
62    
63 dpavlin 9 my $db = new WebPAC::Input(
64 dpavlin 286 module => 'WebPAC::Input::MARC',
65 dpavlin 585 encoding => 'ISO-8859-2',
66 dpavlin 10 low_mem => 1,
67 dpavlin 416 recode => 'char pairs',
68 dpavlin 483 no_progress_bar => 1,
69 dpavlin 9 );
70 dpavlin 3
71 dpavlin 597 C<module> is low-level file format module. See L<WebPAC::Input::ISIS> and
72 dpavlin 286 L<WebPAC::Input::MARC>.
73    
74 dpavlin 585 Optional parametar C<encoding> specify application code page (which will be
75 dpavlin 9 used internally). This should probably be your terminal encoding, and by
76     default, it C<ISO-8859-2>.
77    
78 dpavlin 10 Default is not to use C<low_mem> options (see L<MEMORY USAGE> below).
79    
80 dpavlin 483 C<recode> is optional string constisting of character or words pairs that
81     should be replaced in input stream.
82    
83     C<no_progress_bar> disables progress bar output on C<STDOUT>
84    
85 dpavlin 285 This function will also call low-level C<init> if it exists with same
86     parametars.
87    
88 dpavlin 1 =cut
89    
90 dpavlin 3 sub new {
91 dpavlin 285 my $class = shift;
92     my $self = {@_};
93 dpavlin 3 bless($self, $class);
94    
95 dpavlin 285 my $log = $self->_get_logger;
96    
97 dpavlin 585 $log->logconfess("code_page argument is not suppored any more. change it to encoding") if ($self->{lookup});
98     $log->logconfess("lookup argument is not suppored any more. rewrite call to lookup_ref") if ($self->{lookup});
99    
100 dpavlin 286 $log->logconfess("specify low-level file format module") unless ($self->{module});
101 dpavlin 726 my $module_path = $self->{module};
102     $module_path =~ s#::#/#g;
103     $module_path .= '.pm';
104     $log->debug("require low-level module $self->{module} from $module_path");
105 dpavlin 289
106 dpavlin 726 require $module_path;
107 dpavlin 286
108 dpavlin 285 # check if required subclasses are implemented
109 dpavlin 652 foreach my $subclass (qw/open_db fetch_rec init dump_rec/) {
110 dpavlin 726 # FIXME
111 dpavlin 285 }
112    
113 dpavlin 585 $self->{'encoding'} ||= 'ISO-8859-2';
114 dpavlin 9
115 dpavlin 10 # running with low_mem flag? well, use DBM::Deep then.
116     if ($self->{'low_mem'}) {
117     $log->info("running with low_mem which impacts performance (<32 Mb memory usage)");
118    
119     my $db_file = "data.db";
120    
121     if (-e $db_file) {
122     unlink $db_file or $log->logdie("can't remove '$db_file' from last run");
123     $log->debug("removed '$db_file' from last run");
124     }
125    
126     require DBM::Deep;
127    
128     my $db = new DBM::Deep $db_file;
129    
130     $log->logdie("DBM::Deep error: $!") unless ($db);
131    
132     if ($db->error()) {
133     $log->logdie("can't open '$db_file' under low_mem: ",$db->error());
134     } else {
135     $log->debug("using file '$db_file' for DBM::Deep");
136     }
137    
138     $self->{'db'} = $db;
139     }
140    
141 dpavlin 3 $self ? return $self : return undef;
142 dpavlin 1 }
143    
144 dpavlin 285 =head2 open
145    
146     This function will read whole database in memory and produce lookups.
147    
148 dpavlin 286 $input->open(
149 dpavlin 285 path => '/path/to/database/file',
150 dpavlin 624 code_page => 'cp852',
151 dpavlin 286 limit => 500,
152     offset => 6000,
153 dpavlin 506 stats => 1,
154 dpavlin 707 lookup_coderef => sub {
155     my $rec = shift;
156     # store lookups
157 dpavlin 585 },
158 dpavlin 597 modify_records => {
159     900 => { '^a' => { ' : ' => '^b' } },
160     901 => { '*' => { '^b' => ' ; ' } },
161     },
162 dpavlin 636 modify_file => 'conf/modify/mapping.map',
163 dpavlin 285 );
164    
165 dpavlin 624 By default, C<code_page> is assumed to be C<cp852>.
166 dpavlin 285
167 dpavlin 286 C<offset> is optional parametar to position at some offset before reading from database.
168 dpavlin 285
169 dpavlin 286 C<limit> is optional parametar to read just C<limit> records from database
170 dpavlin 285
171 dpavlin 506 C<stats> create optional report about usage of fields and subfields
172    
173 dpavlin 707 C<lookup_coderef> is closure to called to save data into lookups
174 dpavlin 585
175 dpavlin 597 C<modify_records> specify mapping from subfields to delimiters or from
176     delimiters to subfields, as well as oprations on fields (if subfield is
177     defined as C<*>.
178    
179 dpavlin 636 C<modify_file> is alternative for C<modify_records> above which preserves order and offers
180     (hopefully) simplier sintax than YAML or perl (see L</modify_file_regex>). This option
181     overrides C<modify_records> if both exists for same input.
182    
183 dpavlin 286 Returns size of database, regardless of C<offset> and C<limit>
184     parametars, see also C<size>.
185 dpavlin 285
186     =cut
187    
188     sub open {
189     my $self = shift;
190     my $arg = {@_};
191    
192     my $log = $self->_get_logger();
193    
194 dpavlin 585 $log->logconfess("lookup argument is not suppored any more. rewrite call to lookup_coderef") if ($arg->{lookup});
195     $log->logconfess("lookup_coderef must be CODE, not ",ref($arg->{lookup_coderef}))
196     if ($arg->{lookup_coderef} && ref($arg->{lookup_coderef}) ne 'CODE');
197    
198 dpavlin 707 $log->debug( $arg->{lookup_coderef} ? '' : 'not ', "using lookup_coderef");
199    
200 dpavlin 285 $log->logcroak("need path") if (! $arg->{'path'});
201 dpavlin 624 my $code_page = $arg->{'code_page'} || 'cp852';
202 dpavlin 285
203     # store data in object
204 dpavlin 292 $self->{'input_code_page'} = $code_page;
205 dpavlin 286 foreach my $v (qw/path offset limit/) {
206 dpavlin 285 $self->{$v} = $arg->{$v} if ($arg->{$v});
207     }
208    
209 dpavlin 416 my $filter_ref;
210 dpavlin 597 my $recode_regex;
211     my $recode_map;
212 dpavlin 416
213     if ($self->{recode}) {
214     my @r = split(/\s/, $self->{recode});
215     if ($#r % 2 != 1) {
216     $log->logwarn("recode needs even number of elements (some number of valid pairs)");
217     } else {
218     while (@r) {
219     my $from = shift @r;
220     my $to = shift @r;
221 dpavlin 597 $recode_map->{$from} = $to;
222 dpavlin 416 }
223    
224 dpavlin 597 $recode_regex = join '|' => keys %{ $recode_map };
225 dpavlin 416
226 dpavlin 597 $log->debug("using recode regex: $recode_regex");
227 dpavlin 416 }
228    
229     }
230    
231 dpavlin 636 my $rec_regex;
232     if (my $p = $arg->{modify_file}) {
233     $log->debug("using modify_file $p");
234     $rec_regex = $self->modify_file_regexps( $p );
235     } elsif (my $h = $arg->{modify_records}) {
236     $log->debug("using modify_records ", Dumper( $h ));
237     $rec_regex = $self->modify_record_regexps(%{ $h });
238     }
239     $log->debug("rec_regex: ", Dumper($rec_regex)) if ($rec_regex);
240 dpavlin 597
241 dpavlin 726 my $class = $self->{module} || $log->logconfess("can't get low-level module name!");
242    
243     my $ll_db = $class->new(
244 dpavlin 285 path => $arg->{path},
245 dpavlin 624 # filter => sub {
246     # my ($l,$f_nr) = @_;
247     # return unless defined($l);
248     # from_to($l, $code_page, $self->{'encoding'});
249     # $l =~ s/($recode_regex)/$recode_map->{$1}/g if ($recode_regex && $recode_map);
250     # return $l;
251     # },
252 dpavlin 523 %{ $arg },
253 dpavlin 285 );
254    
255 dpavlin 726 unless (defined($ll_db)) {
256 dpavlin 285 $log->logwarn("can't open database $arg->{path}, skipping...");
257     return;
258     }
259    
260 dpavlin 726 my $size = $ll_db->size;
261    
262 dpavlin 285 unless ($size) {
263     $log->logwarn("no records in database $arg->{path}, skipping...");
264     return;
265     }
266    
267 dpavlin 339 my $from_rec = 1;
268     my $to_rec = $size;
269 dpavlin 285
270 dpavlin 286 if (my $s = $self->{offset}) {
271 dpavlin 513 $log->debug("skipping to MFN $s");
272 dpavlin 339 $from_rec = $s;
273 dpavlin 285 } else {
274 dpavlin 339 $self->{offset} = $from_rec;
275 dpavlin 285 }
276    
277 dpavlin 286 if ($self->{limit}) {
278 dpavlin 301 $log->debug("limiting to ",$self->{limit}," records");
279 dpavlin 339 $to_rec = $from_rec + $self->{limit} - 1;
280     $to_rec = $size if ($to_rec > $size);
281 dpavlin 285 }
282    
283     # store size for later
284 dpavlin 339 $self->{size} = ($to_rec - $from_rec) ? ($to_rec - $from_rec + 1) : 0;
285 dpavlin 285
286 dpavlin 585 $log->info("processing $self->{size}/$size records [$from_rec-$to_rec] convert $code_page -> $self->{encoding}", $self->{stats} ? ' [stats]' : '');
287 dpavlin 285
288     # read database
289 dpavlin 339 for (my $pos = $from_rec; $pos <= $to_rec; $pos++) {
290 dpavlin 285
291 dpavlin 286 $log->debug("position: $pos\n");
292 dpavlin 285
293 dpavlin 726 my $rec = $ll_db->fetch_rec($pos, sub {
294 dpavlin 619 my ($l,$f_nr) = @_;
295 dpavlin 624 # return unless defined($l);
296     # return $l unless ($rec_regex && $f_nr);
297 dpavlin 285
298 dpavlin 625 $log->debug("-=> $f_nr ## $l");
299    
300 dpavlin 624 # codepage conversion and recode_regex
301 dpavlin 626 from_to($l, $code_page, $self->{'encoding'});
302 dpavlin 624 $l =~ s/($recode_regex)/$recode_map->{$1}/g if ($recode_regex && $recode_map);
303    
304 dpavlin 619 # apply regexps
305     if ($rec_regex && defined($rec_regex->{$f_nr})) {
306     $log->logconfess("regexps->{$f_nr} must be ARRAY") if (ref($rec_regex->{$f_nr}) ne 'ARRAY');
307     my $c = 0;
308     foreach my $r (@{ $rec_regex->{$f_nr} }) {
309 dpavlin 626 my $old_l = $l;
310 dpavlin 619 eval '$l =~ ' . $r;
311 dpavlin 626 if ($old_l ne $l) {
312     $log->debug("REGEX on $f_nr eval \$l =~ $r\n## old l: [$old_l]\n## new l: [$l]");
313     }
314 dpavlin 619 $log->error("error applying regex: $r") if ($@);
315     }
316     }
317    
318 dpavlin 625 $log->debug("<=- $f_nr ## $l");
319 dpavlin 619 return $l;
320     });
321    
322 dpavlin 308 $log->debug(sub { Dumper($rec) });
323    
324 dpavlin 285 if (! $rec) {
325 dpavlin 286 $log->warn("record $pos empty? skipping...");
326 dpavlin 285 next;
327     }
328    
329     # store
330 dpavlin 286 if ($self->{low_mem}) {
331     $self->{db}->put($pos, $rec);
332 dpavlin 285 } else {
333 dpavlin 286 $self->{data}->{$pos} = $rec;
334 dpavlin 285 }
335    
336     # create lookup
337 dpavlin 585 $arg->{'lookup_coderef'}->( $rec ) if ($rec && $arg->{'lookup_coderef'});
338 dpavlin 285
339 dpavlin 506 # update counters for statistics
340     if ($self->{stats}) {
341 dpavlin 593
342 dpavlin 619 # fetch clean record with regexpes applied for statistics
343 dpavlin 726 my $rec = $ll_db->fetch_rec($pos);
344 dpavlin 619
345 dpavlin 593 foreach my $fld (keys %{ $rec }) {
346 dpavlin 506 $self->{_stats}->{fld}->{ $fld }++;
347 dpavlin 593
348     $log->logdie("invalid record fild $fld, not ARRAY")
349     unless (ref($rec->{ $fld }) eq 'ARRAY');
350    
351     foreach my $row (@{ $rec->{$fld} }) {
352    
353     if (ref($row) eq 'HASH') {
354    
355     foreach my $sf (keys %{ $row }) {
356 dpavlin 619 next if ($sf eq 'subfields');
357 dpavlin 593 $self->{_stats}->{sf}->{ $fld }->{ $sf }->{count}++;
358     $self->{_stats}->{sf}->{ $fld }->{ $sf }->{repeatable}++
359     if (ref($row->{$sf}) eq 'ARRAY');
360 dpavlin 506 }
361 dpavlin 593
362     } else {
363     $self->{_stats}->{repeatable}->{ $fld }++;
364     }
365 dpavlin 506 }
366 dpavlin 593 }
367 dpavlin 506 }
368    
369 dpavlin 483 $self->progress_bar($pos,$to_rec) unless ($self->{no_progress_bar});
370 dpavlin 285
371     }
372    
373 dpavlin 286 $self->{pos} = -1;
374     $self->{last_pcnt} = 0;
375 dpavlin 285
376     # store max mfn and return it.
377 dpavlin 339 $self->{max_pos} = $to_rec;
378     $log->debug("max_pos: $to_rec");
379 dpavlin 285
380     return $size;
381     }
382    
383     =head2 fetch
384    
385     Fetch next record from database. It will also displays progress bar.
386    
387     my $rec = $isis->fetch;
388    
389     Record from this function should probably go to C<data_structure> for
390     normalisation.
391    
392     =cut
393    
394     sub fetch {
395     my $self = shift;
396    
397     my $log = $self->_get_logger();
398    
399 dpavlin 286 $log->logconfess("it seems that you didn't load database!") unless ($self->{pos});
400 dpavlin 285
401 dpavlin 286 if ($self->{pos} == -1) {
402     $self->{pos} = $self->{offset};
403 dpavlin 285 } else {
404 dpavlin 286 $self->{pos}++;
405 dpavlin 285 }
406    
407 dpavlin 286 my $mfn = $self->{pos};
408 dpavlin 285
409 dpavlin 286 if ($mfn > $self->{max_pos}) {
410     $self->{pos} = $self->{max_pos};
411 dpavlin 285 $log->debug("at EOF");
412     return;
413     }
414    
415 dpavlin 483 $self->progress_bar($mfn,$self->{max_pos}) unless ($self->{no_progress_bar});
416 dpavlin 285
417     my $rec;
418    
419 dpavlin 286 if ($self->{low_mem}) {
420     $rec = $self->{db}->get($mfn);
421 dpavlin 285 } else {
422 dpavlin 286 $rec = $self->{data}->{$mfn};
423 dpavlin 285 }
424    
425     $rec ||= 0E0;
426     }
427    
428     =head2 pos
429    
430     Returns current record number (MFN).
431    
432     print $isis->pos;
433    
434     First record in database has position 1.
435    
436     =cut
437    
438     sub pos {
439     my $self = shift;
440 dpavlin 286 return $self->{pos};
441 dpavlin 285 }
442    
443    
444     =head2 size
445    
446     Returns number of records in database
447    
448     print $isis->size;
449    
450     Result from this function can be used to loop through all records
451    
452     foreach my $mfn ( 1 ... $isis->size ) { ... }
453    
454 dpavlin 286 because it takes into account C<offset> and C<limit>.
455 dpavlin 285
456     =cut
457    
458     sub size {
459     my $self = shift;
460 dpavlin 286 return $self->{size};
461 dpavlin 285 }
462    
463     =head2 seek
464    
465     Seek to specified MFN in file.
466    
467     $isis->seek(42);
468    
469     First record in database has position 1.
470    
471     =cut
472    
473     sub seek {
474     my $self = shift;
475     my $pos = shift || return;
476    
477     my $log = $self->_get_logger();
478    
479     if ($pos < 1) {
480     $log->warn("seek before first record");
481     $pos = 1;
482 dpavlin 286 } elsif ($pos > $self->{max_pos}) {
483 dpavlin 285 $log->warn("seek beyond last record");
484 dpavlin 286 $pos = $self->{max_pos};
485 dpavlin 285 }
486    
487 dpavlin 286 return $self->{pos} = (($pos - 1) || -1);
488 dpavlin 285 }
489    
490 dpavlin 506 =head2 stats
491 dpavlin 285
492 dpavlin 506 Dump statistics about field and subfield usage
493    
494 dpavlin 507 print $input->stats;
495 dpavlin 506
496     =cut
497    
498     sub stats {
499     my $self = shift;
500 dpavlin 507
501     my $log = $self->_get_logger();
502    
503     my $s = $self->{_stats};
504     if (! $s) {
505     $log->warn("called stats, but there is no statistics collected");
506     return;
507     }
508    
509     my $max_fld = 0;
510    
511     my $out = join("\n",
512     map {
513     my $f = $_ || die "no field";
514     my $v = $s->{fld}->{$f} || die "no s->{fld}->{$f}";
515     $max_fld = $v if ($v > $max_fld);
516    
517 dpavlin 519 my $o = sprintf("%4s %d ~", $f, $v);
518 dpavlin 507
519     if (defined($s->{sf}->{$f})) {
520     map {
521 dpavlin 593 $o .= sprintf(" %s:%d%s", $_,
522     $s->{sf}->{$f}->{$_}->{count},
523     $s->{sf}->{$f}->{$_}->{repeatable} ? '*' : '',
524     );
525 dpavlin 507 } sort keys %{ $s->{sf}->{$f} };
526     }
527    
528     if (my $v_r = $s->{repeatable}->{$f}) {
529     $o .= " ($v_r)" if ($v_r != $v);
530     }
531    
532     $o;
533 dpavlin 519 } sort { $a cmp $b } keys %{ $s->{fld} }
534 dpavlin 507 );
535    
536     $log->debug( sub { Dumper($s) } );
537    
538     return $out;
539 dpavlin 506 }
540    
541 dpavlin 652 =head2 dump
542    
543     Display humanly readable dump of record
544    
545     =cut
546    
547     sub dump {
548     my $self = shift;
549    
550     return $self->{dump_rec}->($self, $self->{pos});
551    
552     }
553    
554 dpavlin 598 =head2 modify_record_regexps
555 dpavlin 597
556 dpavlin 636 Generate hash with regexpes to be applied using l<filter>.
557 dpavlin 597
558     my $regexpes = $input->modify_record_regexps(
559     900 => { '^a' => { ' : ' => '^b' } },
560     901 => { '*' => { '^b' => ' ; ' } },
561     );
562    
563     =cut
564    
565 dpavlin 636 sub _get_regex {
566     my ($sf,$from,$to) = @_;
567     if ($sf =~ /^\^/) {
568     return
569     's/\Q'. $sf .'\E([^\^]*?)\Q'. $from .'\E([^\^]*?)/'. $sf .'$1'. $to .'$2/';
570     } else {
571     return
572     's/\Q'. $from .'\E/'. $to .'/g';
573     }
574     }
575    
576 dpavlin 597 sub modify_record_regexps {
577     my $self = shift;
578     my $modify_record = {@_};
579    
580     my $regexpes;
581    
582 dpavlin 634 my $log = $self->_get_logger();
583    
584 dpavlin 597 foreach my $f (keys %$modify_record) {
585 dpavlin 634 $log->debug("field: $f");
586    
587 dpavlin 597 foreach my $sf (keys %{ $modify_record->{$f} }) {
588 dpavlin 634 $log->debug("subfield: $sf");
589    
590 dpavlin 597 foreach my $from (keys %{ $modify_record->{$f}->{$sf} }) {
591     my $to = $modify_record->{$f}->{$sf}->{$from};
592     #die "no field?" unless defined($to);
593 dpavlin 634 $log->debug("transform: |$from| -> |$to|");
594 dpavlin 597
595 dpavlin 636 my $regex = _get_regex($sf,$from,$to);
596     push @{ $regexpes->{$f} }, $regex;
597     $log->debug("regex: $regex");
598 dpavlin 597 }
599     }
600     }
601    
602     return $regexpes;
603     }
604    
605 dpavlin 636 =head2 modify_file_regexps
606    
607     Generate hash with regexpes to be applied using l<filter> from
608     pseudo hash/yaml format for regex mappings.
609    
610     It should be obvious:
611    
612     200
613     '^a'
614     ' : ' => '^e'
615     ' = ' => '^d'
616    
617     In field I<200> find C<'^a'> and then C<' : '>, and replace it with C<'^e'>.
618     In field I<200> find C<'^a'> and then C<' = '>, and replace it with C<'^d'>.
619    
620     my $regexpes = $input->modify_file_regexps( 'conf/modify/common.pl' );
621    
622     On undef path it will just return.
623    
624     =cut
625    
626     sub modify_file_regexps {
627     my $self = shift;
628    
629     my $modify_path = shift || return;
630    
631     my $log = $self->_get_logger();
632    
633     my $regexpes;
634    
635 dpavlin 697 CORE::open(my $fh, $modify_path) || $log->logdie("can't open modify file $modify_path: $!");
636 dpavlin 636
637     my ($f,$sf);
638    
639     while(<$fh>) {
640     chomp;
641     next if (/^#/ || /^\s*$/);
642    
643     if (/^\s*(\d+)\s*$/) {
644     $f = $1;
645     $log->debug("field: $f");
646     next;
647     } elsif (/^\s*'([^']*)'\s*$/) {
648     $sf = $1;
649     $log->die("can't define subfiled before field in: $_") unless ($f);
650     $log->debug("subfield: $sf");
651     } elsif (/^\s*'([^']*)'\s*=>\s*'([^']*)'\s*$/) {
652     my ($from,$to) = ($1, $2);
653    
654     $log->debug("transform: |$from| -> |$to|");
655    
656     my $regex = _get_regex($sf,$from,$to);
657     push @{ $regexpes->{$f} }, $regex;
658     $log->debug("regex: $regex");
659     }
660     }
661    
662     return $regexpes;
663     }
664    
665 dpavlin 3 =head1 MEMORY USAGE
666 dpavlin 1
667 dpavlin 3 C<low_mem> options is double-edged sword. If enabled, WebPAC
668     will run on memory constraint machines (which doesn't have enough
669     physical RAM to create memory structure for whole source database).
670 dpavlin 1
671 dpavlin 3 If your machine has 512Mb or more of RAM and database is around 10000 records,
672     memory shouldn't be an issue. If you don't have enough physical RAM, you
673     might consider using virtual memory (if your operating system is handling it
674     well, like on FreeBSD or Linux) instead of dropping to L<DBM::Deep> to handle
675     parsed structure of ISIS database (this is what C<low_mem> option does).
676 dpavlin 1
677 dpavlin 3 Hitting swap at end of reading source database is probably o.k. However,
678     hitting swap before 90% will dramatically decrease performance and you will
679     be better off with C<low_mem> and using rest of availble memory for
680     operating system disk cache (Linux is particuallary good about this).
681     However, every access to database record will require disk access, so
682     generation phase will be slower 10-100 times.
683    
684     Parsed structures are essential - you just have option to trade RAM memory
685     (which is fast) for disk space (which is slow). Be sure to have planty of
686     disk space if you are using C<low_mem> and thus L<DBM::Deep>.
687    
688     However, when WebPAC is running on desktop machines (or laptops :-), it's
689     highly undesireable for system to start swapping. Using C<low_mem> option can
690     reduce WecPAC memory usage to around 64Mb for same database with lookup
691     fields and sorted indexes which stay in RAM. Performance will suffer, but
692     memory usage will really be minimal. It might be also more confortable to
693     run WebPAC reniced on those machines.
694    
695    
696     =head1 AUTHOR
697    
698     Dobrica Pavlinusic, C<< <dpavlin@rot13.org> >>
699    
700 dpavlin 1 =head1 COPYRIGHT & LICENSE
701    
702 dpavlin 599 Copyright 2005-2006 Dobrica Pavlinusic, All Rights Reserved.
703 dpavlin 1
704     This program is free software; you can redistribute it and/or modify it
705     under the same terms as Perl itself.
706    
707     =cut
708    
709     1; # End of WebPAC::Input

  ViewVC Help
Powered by ViewVC 1.1.26