/[webpac2]/trunk/lib/WebPAC/Input.pm
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Annotation of /trunk/lib/WebPAC/Input.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 757 - (hide annotations)
Tue Oct 10 10:57:59 2006 UTC (17 years, 6 months ago) by dpavlin
File size: 16788 byte(s)
fix dump (ugly, needs re-visiting)

1 dpavlin 1 package WebPAC::Input;
2    
3     use warnings;
4     use strict;
5    
6 dpavlin 507 use blib;
7    
8 dpavlin 487 use WebPAC::Common;
9 dpavlin 285 use base qw/WebPAC::Common/;
10 dpavlin 308 use Data::Dumper;
11 dpavlin 624 use Encode qw/from_to/;
12 dpavlin 285
13 dpavlin 1 =head1 NAME
14    
15 dpavlin 286 WebPAC::Input - read different file formats into WebPAC
16 dpavlin 1
17     =head1 VERSION
18    
19 dpavlin 707 Version 0.13
20 dpavlin 1
21     =cut
22    
23 dpavlin 707 our $VERSION = '0.13';
24 dpavlin 1
25     =head1 SYNOPSIS
26    
27 dpavlin 286 This module implements input as database which have fixed and known
28     I<size> while indexing and single unique numeric identifier for database
29     position ranging from 1 to I<size>.
30 dpavlin 1
31 dpavlin 286 Simply, something that is indexed by unmber from 1 .. I<size>.
32    
33     Examples of such databases are CDS/ISIS files, MARC files, lines in
34     text file, and so on.
35    
36     Specific file formats are implemented using low-level interface modules,
37     located in C<WebPAC::Input::*> namespace which export C<open_db>,
38     C<fetch_rec> and optional C<init> functions.
39    
40 dpavlin 1 Perhaps a little code snippet.
41    
42 dpavlin 597 use WebPAC::Input;
43 dpavlin 1
44 dpavlin 597 my $db = WebPAC::Input->new(
45     module => 'WebPAC::Input::ISIS',
46 dpavlin 286 low_mem => 1,
47 dpavlin 597 );
48 dpavlin 1
49 dpavlin 597 $db->open( path => '/path/to/database' );
50 dpavlin 416 print "database size: ",$db->size,"\n";
51     while (my $rec = $db->fetch) {
52     # do something with $rec
53     }
54 dpavlin 1
55 dpavlin 286
56    
57 dpavlin 1 =head1 FUNCTIONS
58    
59 dpavlin 3 =head2 new
60 dpavlin 1
61 dpavlin 3 Create new input database object.
62    
63 dpavlin 9 my $db = new WebPAC::Input(
64 dpavlin 286 module => 'WebPAC::Input::MARC',
65 dpavlin 585 encoding => 'ISO-8859-2',
66 dpavlin 10 low_mem => 1,
67 dpavlin 416 recode => 'char pairs',
68 dpavlin 483 no_progress_bar => 1,
69 dpavlin 9 );
70 dpavlin 3
71 dpavlin 597 C<module> is low-level file format module. See L<WebPAC::Input::ISIS> and
72 dpavlin 286 L<WebPAC::Input::MARC>.
73    
74 dpavlin 585 Optional parametar C<encoding> specify application code page (which will be
75 dpavlin 9 used internally). This should probably be your terminal encoding, and by
76     default, it C<ISO-8859-2>.
77    
78 dpavlin 10 Default is not to use C<low_mem> options (see L<MEMORY USAGE> below).
79    
80 dpavlin 483 C<recode> is optional string constisting of character or words pairs that
81     should be replaced in input stream.
82    
83     C<no_progress_bar> disables progress bar output on C<STDOUT>
84    
85 dpavlin 285 This function will also call low-level C<init> if it exists with same
86     parametars.
87    
88 dpavlin 1 =cut
89    
90 dpavlin 3 sub new {
91 dpavlin 285 my $class = shift;
92     my $self = {@_};
93 dpavlin 3 bless($self, $class);
94    
95 dpavlin 285 my $log = $self->_get_logger;
96    
97 dpavlin 585 $log->logconfess("code_page argument is not suppored any more. change it to encoding") if ($self->{lookup});
98     $log->logconfess("lookup argument is not suppored any more. rewrite call to lookup_ref") if ($self->{lookup});
99    
100 dpavlin 286 $log->logconfess("specify low-level file format module") unless ($self->{module});
101 dpavlin 726 my $module_path = $self->{module};
102     $module_path =~ s#::#/#g;
103     $module_path .= '.pm';
104     $log->debug("require low-level module $self->{module} from $module_path");
105 dpavlin 289
106 dpavlin 726 require $module_path;
107 dpavlin 286
108 dpavlin 285 # check if required subclasses are implemented
109 dpavlin 652 foreach my $subclass (qw/open_db fetch_rec init dump_rec/) {
110 dpavlin 726 # FIXME
111 dpavlin 285 }
112    
113 dpavlin 585 $self->{'encoding'} ||= 'ISO-8859-2';
114 dpavlin 9
115 dpavlin 10 # running with low_mem flag? well, use DBM::Deep then.
116     if ($self->{'low_mem'}) {
117     $log->info("running with low_mem which impacts performance (<32 Mb memory usage)");
118    
119     my $db_file = "data.db";
120    
121     if (-e $db_file) {
122     unlink $db_file or $log->logdie("can't remove '$db_file' from last run");
123     $log->debug("removed '$db_file' from last run");
124     }
125    
126     require DBM::Deep;
127    
128     my $db = new DBM::Deep $db_file;
129    
130     $log->logdie("DBM::Deep error: $!") unless ($db);
131    
132     if ($db->error()) {
133     $log->logdie("can't open '$db_file' under low_mem: ",$db->error());
134     } else {
135     $log->debug("using file '$db_file' for DBM::Deep");
136     }
137    
138     $self->{'db'} = $db;
139     }
140    
141 dpavlin 3 $self ? return $self : return undef;
142 dpavlin 1 }
143    
144 dpavlin 285 =head2 open
145    
146     This function will read whole database in memory and produce lookups.
147    
148 dpavlin 286 $input->open(
149 dpavlin 285 path => '/path/to/database/file',
150 dpavlin 624 code_page => 'cp852',
151 dpavlin 286 limit => 500,
152     offset => 6000,
153 dpavlin 506 stats => 1,
154 dpavlin 707 lookup_coderef => sub {
155     my $rec = shift;
156     # store lookups
157 dpavlin 585 },
158 dpavlin 597 modify_records => {
159     900 => { '^a' => { ' : ' => '^b' } },
160     901 => { '*' => { '^b' => ' ; ' } },
161     },
162 dpavlin 636 modify_file => 'conf/modify/mapping.map',
163 dpavlin 285 );
164    
165 dpavlin 624 By default, C<code_page> is assumed to be C<cp852>.
166 dpavlin 285
167 dpavlin 286 C<offset> is optional parametar to position at some offset before reading from database.
168 dpavlin 285
169 dpavlin 286 C<limit> is optional parametar to read just C<limit> records from database
170 dpavlin 285
171 dpavlin 506 C<stats> create optional report about usage of fields and subfields
172    
173 dpavlin 707 C<lookup_coderef> is closure to called to save data into lookups
174 dpavlin 585
175 dpavlin 597 C<modify_records> specify mapping from subfields to delimiters or from
176     delimiters to subfields, as well as oprations on fields (if subfield is
177     defined as C<*>.
178    
179 dpavlin 636 C<modify_file> is alternative for C<modify_records> above which preserves order and offers
180     (hopefully) simplier sintax than YAML or perl (see L</modify_file_regex>). This option
181     overrides C<modify_records> if both exists for same input.
182    
183 dpavlin 286 Returns size of database, regardless of C<offset> and C<limit>
184     parametars, see also C<size>.
185 dpavlin 285
186     =cut
187    
188     sub open {
189     my $self = shift;
190     my $arg = {@_};
191    
192     my $log = $self->_get_logger();
193    
194 dpavlin 585 $log->logconfess("lookup argument is not suppored any more. rewrite call to lookup_coderef") if ($arg->{lookup});
195     $log->logconfess("lookup_coderef must be CODE, not ",ref($arg->{lookup_coderef}))
196     if ($arg->{lookup_coderef} && ref($arg->{lookup_coderef}) ne 'CODE');
197    
198 dpavlin 707 $log->debug( $arg->{lookup_coderef} ? '' : 'not ', "using lookup_coderef");
199    
200 dpavlin 285 $log->logcroak("need path") if (! $arg->{'path'});
201 dpavlin 624 my $code_page = $arg->{'code_page'} || 'cp852';
202 dpavlin 285
203     # store data in object
204 dpavlin 292 $self->{'input_code_page'} = $code_page;
205 dpavlin 286 foreach my $v (qw/path offset limit/) {
206 dpavlin 285 $self->{$v} = $arg->{$v} if ($arg->{$v});
207     }
208    
209 dpavlin 416 my $filter_ref;
210 dpavlin 597 my $recode_regex;
211     my $recode_map;
212 dpavlin 416
213     if ($self->{recode}) {
214     my @r = split(/\s/, $self->{recode});
215     if ($#r % 2 != 1) {
216     $log->logwarn("recode needs even number of elements (some number of valid pairs)");
217     } else {
218     while (@r) {
219     my $from = shift @r;
220     my $to = shift @r;
221 dpavlin 597 $recode_map->{$from} = $to;
222 dpavlin 416 }
223    
224 dpavlin 597 $recode_regex = join '|' => keys %{ $recode_map };
225 dpavlin 416
226 dpavlin 597 $log->debug("using recode regex: $recode_regex");
227 dpavlin 416 }
228    
229     }
230    
231 dpavlin 636 my $rec_regex;
232     if (my $p = $arg->{modify_file}) {
233     $log->debug("using modify_file $p");
234     $rec_regex = $self->modify_file_regexps( $p );
235     } elsif (my $h = $arg->{modify_records}) {
236     $log->debug("using modify_records ", Dumper( $h ));
237     $rec_regex = $self->modify_record_regexps(%{ $h });
238     }
239     $log->debug("rec_regex: ", Dumper($rec_regex)) if ($rec_regex);
240 dpavlin 597
241 dpavlin 726 my $class = $self->{module} || $log->logconfess("can't get low-level module name!");
242    
243     my $ll_db = $class->new(
244 dpavlin 285 path => $arg->{path},
245 dpavlin 624 # filter => sub {
246     # my ($l,$f_nr) = @_;
247     # return unless defined($l);
248     # from_to($l, $code_page, $self->{'encoding'});
249     # $l =~ s/($recode_regex)/$recode_map->{$1}/g if ($recode_regex && $recode_map);
250     # return $l;
251     # },
252 dpavlin 523 %{ $arg },
253 dpavlin 285 );
254    
255 dpavlin 726 unless (defined($ll_db)) {
256 dpavlin 285 $log->logwarn("can't open database $arg->{path}, skipping...");
257     return;
258     }
259    
260 dpavlin 726 my $size = $ll_db->size;
261    
262 dpavlin 285 unless ($size) {
263     $log->logwarn("no records in database $arg->{path}, skipping...");
264     return;
265     }
266    
267 dpavlin 339 my $from_rec = 1;
268     my $to_rec = $size;
269 dpavlin 285
270 dpavlin 286 if (my $s = $self->{offset}) {
271 dpavlin 513 $log->debug("skipping to MFN $s");
272 dpavlin 339 $from_rec = $s;
273 dpavlin 285 } else {
274 dpavlin 339 $self->{offset} = $from_rec;
275 dpavlin 285 }
276    
277 dpavlin 286 if ($self->{limit}) {
278 dpavlin 301 $log->debug("limiting to ",$self->{limit}," records");
279 dpavlin 339 $to_rec = $from_rec + $self->{limit} - 1;
280     $to_rec = $size if ($to_rec > $size);
281 dpavlin 285 }
282    
283     # store size for later
284 dpavlin 339 $self->{size} = ($to_rec - $from_rec) ? ($to_rec - $from_rec + 1) : 0;
285 dpavlin 285
286 dpavlin 585 $log->info("processing $self->{size}/$size records [$from_rec-$to_rec] convert $code_page -> $self->{encoding}", $self->{stats} ? ' [stats]' : '');
287 dpavlin 285
288     # read database
289 dpavlin 339 for (my $pos = $from_rec; $pos <= $to_rec; $pos++) {
290 dpavlin 285
291 dpavlin 286 $log->debug("position: $pos\n");
292 dpavlin 285
293 dpavlin 726 my $rec = $ll_db->fetch_rec($pos, sub {
294 dpavlin 619 my ($l,$f_nr) = @_;
295 dpavlin 624 # return unless defined($l);
296     # return $l unless ($rec_regex && $f_nr);
297 dpavlin 285
298 dpavlin 625 $log->debug("-=> $f_nr ## $l");
299    
300 dpavlin 624 # codepage conversion and recode_regex
301 dpavlin 626 from_to($l, $code_page, $self->{'encoding'});
302 dpavlin 624 $l =~ s/($recode_regex)/$recode_map->{$1}/g if ($recode_regex && $recode_map);
303    
304 dpavlin 619 # apply regexps
305     if ($rec_regex && defined($rec_regex->{$f_nr})) {
306     $log->logconfess("regexps->{$f_nr} must be ARRAY") if (ref($rec_regex->{$f_nr}) ne 'ARRAY');
307     my $c = 0;
308     foreach my $r (@{ $rec_regex->{$f_nr} }) {
309 dpavlin 626 my $old_l = $l;
310 dpavlin 619 eval '$l =~ ' . $r;
311 dpavlin 626 if ($old_l ne $l) {
312     $log->debug("REGEX on $f_nr eval \$l =~ $r\n## old l: [$old_l]\n## new l: [$l]");
313     }
314 dpavlin 619 $log->error("error applying regex: $r") if ($@);
315     }
316     }
317    
318 dpavlin 625 $log->debug("<=- $f_nr ## $l");
319 dpavlin 619 return $l;
320     });
321    
322 dpavlin 308 $log->debug(sub { Dumper($rec) });
323    
324 dpavlin 285 if (! $rec) {
325 dpavlin 286 $log->warn("record $pos empty? skipping...");
326 dpavlin 285 next;
327     }
328    
329     # store
330 dpavlin 286 if ($self->{low_mem}) {
331     $self->{db}->put($pos, $rec);
332 dpavlin 285 } else {
333 dpavlin 286 $self->{data}->{$pos} = $rec;
334 dpavlin 285 }
335    
336     # create lookup
337 dpavlin 585 $arg->{'lookup_coderef'}->( $rec ) if ($rec && $arg->{'lookup_coderef'});
338 dpavlin 285
339 dpavlin 506 # update counters for statistics
340     if ($self->{stats}) {
341 dpavlin 593
342 dpavlin 619 # fetch clean record with regexpes applied for statistics
343 dpavlin 726 my $rec = $ll_db->fetch_rec($pos);
344 dpavlin 619
345 dpavlin 593 foreach my $fld (keys %{ $rec }) {
346 dpavlin 506 $self->{_stats}->{fld}->{ $fld }++;
347 dpavlin 593
348     $log->logdie("invalid record fild $fld, not ARRAY")
349     unless (ref($rec->{ $fld }) eq 'ARRAY');
350    
351     foreach my $row (@{ $rec->{$fld} }) {
352    
353     if (ref($row) eq 'HASH') {
354    
355     foreach my $sf (keys %{ $row }) {
356 dpavlin 619 next if ($sf eq 'subfields');
357 dpavlin 593 $self->{_stats}->{sf}->{ $fld }->{ $sf }->{count}++;
358     $self->{_stats}->{sf}->{ $fld }->{ $sf }->{repeatable}++
359     if (ref($row->{$sf}) eq 'ARRAY');
360 dpavlin 506 }
361 dpavlin 593
362     } else {
363     $self->{_stats}->{repeatable}->{ $fld }++;
364     }
365 dpavlin 506 }
366 dpavlin 593 }
367 dpavlin 506 }
368    
369 dpavlin 483 $self->progress_bar($pos,$to_rec) unless ($self->{no_progress_bar});
370 dpavlin 285
371     }
372    
373 dpavlin 286 $self->{pos} = -1;
374     $self->{last_pcnt} = 0;
375 dpavlin 285
376     # store max mfn and return it.
377 dpavlin 339 $self->{max_pos} = $to_rec;
378     $log->debug("max_pos: $to_rec");
379 dpavlin 285
380 dpavlin 757 # save for dump
381     $self->{ll_db} = $ll_db;
382    
383 dpavlin 285 return $size;
384     }
385    
386     =head2 fetch
387    
388     Fetch next record from database. It will also displays progress bar.
389    
390     my $rec = $isis->fetch;
391    
392     Record from this function should probably go to C<data_structure> for
393     normalisation.
394    
395     =cut
396    
397     sub fetch {
398     my $self = shift;
399    
400     my $log = $self->_get_logger();
401    
402 dpavlin 286 $log->logconfess("it seems that you didn't load database!") unless ($self->{pos});
403 dpavlin 285
404 dpavlin 286 if ($self->{pos} == -1) {
405     $self->{pos} = $self->{offset};
406 dpavlin 285 } else {
407 dpavlin 286 $self->{pos}++;
408 dpavlin 285 }
409    
410 dpavlin 286 my $mfn = $self->{pos};
411 dpavlin 285
412 dpavlin 286 if ($mfn > $self->{max_pos}) {
413     $self->{pos} = $self->{max_pos};
414 dpavlin 285 $log->debug("at EOF");
415     return;
416     }
417    
418 dpavlin 483 $self->progress_bar($mfn,$self->{max_pos}) unless ($self->{no_progress_bar});
419 dpavlin 285
420     my $rec;
421    
422 dpavlin 286 if ($self->{low_mem}) {
423     $rec = $self->{db}->get($mfn);
424 dpavlin 285 } else {
425 dpavlin 286 $rec = $self->{data}->{$mfn};
426 dpavlin 285 }
427    
428     $rec ||= 0E0;
429     }
430    
431     =head2 pos
432    
433     Returns current record number (MFN).
434    
435     print $isis->pos;
436    
437     First record in database has position 1.
438    
439     =cut
440    
441     sub pos {
442     my $self = shift;
443 dpavlin 286 return $self->{pos};
444 dpavlin 285 }
445    
446    
447     =head2 size
448    
449     Returns number of records in database
450    
451     print $isis->size;
452    
453     Result from this function can be used to loop through all records
454    
455     foreach my $mfn ( 1 ... $isis->size ) { ... }
456    
457 dpavlin 286 because it takes into account C<offset> and C<limit>.
458 dpavlin 285
459     =cut
460    
461     sub size {
462     my $self = shift;
463 dpavlin 286 return $self->{size};
464 dpavlin 285 }
465    
466     =head2 seek
467    
468     Seek to specified MFN in file.
469    
470     $isis->seek(42);
471    
472     First record in database has position 1.
473    
474     =cut
475    
476     sub seek {
477     my $self = shift;
478     my $pos = shift || return;
479    
480     my $log = $self->_get_logger();
481    
482     if ($pos < 1) {
483     $log->warn("seek before first record");
484     $pos = 1;
485 dpavlin 286 } elsif ($pos > $self->{max_pos}) {
486 dpavlin 285 $log->warn("seek beyond last record");
487 dpavlin 286 $pos = $self->{max_pos};
488 dpavlin 285 }
489    
490 dpavlin 286 return $self->{pos} = (($pos - 1) || -1);
491 dpavlin 285 }
492    
493 dpavlin 506 =head2 stats
494 dpavlin 285
495 dpavlin 506 Dump statistics about field and subfield usage
496    
497 dpavlin 507 print $input->stats;
498 dpavlin 506
499     =cut
500    
501     sub stats {
502     my $self = shift;
503 dpavlin 507
504     my $log = $self->_get_logger();
505    
506     my $s = $self->{_stats};
507     if (! $s) {
508     $log->warn("called stats, but there is no statistics collected");
509     return;
510     }
511    
512     my $max_fld = 0;
513    
514     my $out = join("\n",
515     map {
516     my $f = $_ || die "no field";
517     my $v = $s->{fld}->{$f} || die "no s->{fld}->{$f}";
518     $max_fld = $v if ($v > $max_fld);
519    
520 dpavlin 519 my $o = sprintf("%4s %d ~", $f, $v);
521 dpavlin 507
522     if (defined($s->{sf}->{$f})) {
523     map {
524 dpavlin 593 $o .= sprintf(" %s:%d%s", $_,
525     $s->{sf}->{$f}->{$_}->{count},
526     $s->{sf}->{$f}->{$_}->{repeatable} ? '*' : '',
527     );
528 dpavlin 507 } sort keys %{ $s->{sf}->{$f} };
529     }
530    
531     if (my $v_r = $s->{repeatable}->{$f}) {
532     $o .= " ($v_r)" if ($v_r != $v);
533     }
534    
535     $o;
536 dpavlin 519 } sort { $a cmp $b } keys %{ $s->{fld} }
537 dpavlin 507 );
538    
539     $log->debug( sub { Dumper($s) } );
540    
541     return $out;
542 dpavlin 506 }
543    
544 dpavlin 652 =head2 dump
545    
546     Display humanly readable dump of record
547    
548     =cut
549    
550     sub dump {
551     my $self = shift;
552    
553 dpavlin 757 return $self->{ll_db}->dump_rec( $self->{pos} );
554 dpavlin 652
555     }
556    
557 dpavlin 598 =head2 modify_record_regexps
558 dpavlin 597
559 dpavlin 636 Generate hash with regexpes to be applied using l<filter>.
560 dpavlin 597
561     my $regexpes = $input->modify_record_regexps(
562     900 => { '^a' => { ' : ' => '^b' } },
563     901 => { '*' => { '^b' => ' ; ' } },
564     );
565    
566     =cut
567    
568 dpavlin 636 sub _get_regex {
569     my ($sf,$from,$to) = @_;
570     if ($sf =~ /^\^/) {
571     return
572     's/\Q'. $sf .'\E([^\^]*?)\Q'. $from .'\E([^\^]*?)/'. $sf .'$1'. $to .'$2/';
573     } else {
574     return
575     's/\Q'. $from .'\E/'. $to .'/g';
576     }
577     }
578    
579 dpavlin 597 sub modify_record_regexps {
580     my $self = shift;
581     my $modify_record = {@_};
582    
583     my $regexpes;
584    
585 dpavlin 634 my $log = $self->_get_logger();
586    
587 dpavlin 597 foreach my $f (keys %$modify_record) {
588 dpavlin 634 $log->debug("field: $f");
589    
590 dpavlin 597 foreach my $sf (keys %{ $modify_record->{$f} }) {
591 dpavlin 634 $log->debug("subfield: $sf");
592    
593 dpavlin 597 foreach my $from (keys %{ $modify_record->{$f}->{$sf} }) {
594     my $to = $modify_record->{$f}->{$sf}->{$from};
595     #die "no field?" unless defined($to);
596 dpavlin 634 $log->debug("transform: |$from| -> |$to|");
597 dpavlin 597
598 dpavlin 636 my $regex = _get_regex($sf,$from,$to);
599     push @{ $regexpes->{$f} }, $regex;
600     $log->debug("regex: $regex");
601 dpavlin 597 }
602     }
603     }
604    
605     return $regexpes;
606     }
607    
608 dpavlin 636 =head2 modify_file_regexps
609    
610     Generate hash with regexpes to be applied using l<filter> from
611     pseudo hash/yaml format for regex mappings.
612    
613     It should be obvious:
614    
615     200
616     '^a'
617     ' : ' => '^e'
618     ' = ' => '^d'
619    
620     In field I<200> find C<'^a'> and then C<' : '>, and replace it with C<'^e'>.
621     In field I<200> find C<'^a'> and then C<' = '>, and replace it with C<'^d'>.
622    
623     my $regexpes = $input->modify_file_regexps( 'conf/modify/common.pl' );
624    
625     On undef path it will just return.
626    
627     =cut
628    
629     sub modify_file_regexps {
630     my $self = shift;
631    
632     my $modify_path = shift || return;
633    
634     my $log = $self->_get_logger();
635    
636     my $regexpes;
637    
638 dpavlin 697 CORE::open(my $fh, $modify_path) || $log->logdie("can't open modify file $modify_path: $!");
639 dpavlin 636
640     my ($f,$sf);
641    
642     while(<$fh>) {
643     chomp;
644     next if (/^#/ || /^\s*$/);
645    
646     if (/^\s*(\d+)\s*$/) {
647     $f = $1;
648     $log->debug("field: $f");
649     next;
650     } elsif (/^\s*'([^']*)'\s*$/) {
651     $sf = $1;
652     $log->die("can't define subfiled before field in: $_") unless ($f);
653     $log->debug("subfield: $sf");
654     } elsif (/^\s*'([^']*)'\s*=>\s*'([^']*)'\s*$/) {
655     my ($from,$to) = ($1, $2);
656    
657     $log->debug("transform: |$from| -> |$to|");
658    
659     my $regex = _get_regex($sf,$from,$to);
660     push @{ $regexpes->{$f} }, $regex;
661     $log->debug("regex: $regex");
662     }
663     }
664    
665     return $regexpes;
666     }
667    
668 dpavlin 3 =head1 MEMORY USAGE
669 dpavlin 1
670 dpavlin 3 C<low_mem> options is double-edged sword. If enabled, WebPAC
671     will run on memory constraint machines (which doesn't have enough
672     physical RAM to create memory structure for whole source database).
673 dpavlin 1
674 dpavlin 3 If your machine has 512Mb or more of RAM and database is around 10000 records,
675     memory shouldn't be an issue. If you don't have enough physical RAM, you
676     might consider using virtual memory (if your operating system is handling it
677     well, like on FreeBSD or Linux) instead of dropping to L<DBM::Deep> to handle
678     parsed structure of ISIS database (this is what C<low_mem> option does).
679 dpavlin 1
680 dpavlin 3 Hitting swap at end of reading source database is probably o.k. However,
681     hitting swap before 90% will dramatically decrease performance and you will
682     be better off with C<low_mem> and using rest of availble memory for
683     operating system disk cache (Linux is particuallary good about this).
684     However, every access to database record will require disk access, so
685     generation phase will be slower 10-100 times.
686    
687     Parsed structures are essential - you just have option to trade RAM memory
688     (which is fast) for disk space (which is slow). Be sure to have planty of
689     disk space if you are using C<low_mem> and thus L<DBM::Deep>.
690    
691     However, when WebPAC is running on desktop machines (or laptops :-), it's
692     highly undesireable for system to start swapping. Using C<low_mem> option can
693     reduce WecPAC memory usage to around 64Mb for same database with lookup
694     fields and sorted indexes which stay in RAM. Performance will suffer, but
695     memory usage will really be minimal. It might be also more confortable to
696     run WebPAC reniced on those machines.
697    
698    
699     =head1 AUTHOR
700    
701     Dobrica Pavlinusic, C<< <dpavlin@rot13.org> >>
702    
703 dpavlin 1 =head1 COPYRIGHT & LICENSE
704    
705 dpavlin 599 Copyright 2005-2006 Dobrica Pavlinusic, All Rights Reserved.
706 dpavlin 1
707     This program is free software; you can redistribute it and/or modify it
708     under the same terms as Perl itself.
709    
710     =cut
711    
712     1; # End of WebPAC::Input

  ViewVC Help
Powered by ViewVC 1.1.26