/[webpac2]/trunk/lib/WebPAC/Input.pm
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Annotation of /trunk/lib/WebPAC/Input.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 697 - (hide annotations)
Mon Sep 25 09:49:28 2006 UTC (17 years, 7 months ago) by dpavlin
File size: 16972 byte(s)
 r988@llin:  dpavlin | 2006-09-25 11:47:07 +0200
 fix die

1 dpavlin 1 package WebPAC::Input;
2    
3     use warnings;
4     use strict;
5    
6 dpavlin 507 use blib;
7    
8 dpavlin 487 use WebPAC::Common;
9 dpavlin 285 use base qw/WebPAC::Common/;
10 dpavlin 308 use Data::Dumper;
11 dpavlin 624 use Encode qw/from_to/;
12 dpavlin 285
13 dpavlin 1 =head1 NAME
14    
15 dpavlin 286 WebPAC::Input - read different file formats into WebPAC
16 dpavlin 1
17     =head1 VERSION
18    
19 dpavlin 636 Version 0.12
20 dpavlin 1
21     =cut
22    
23 dpavlin 636 our $VERSION = '0.12';
24 dpavlin 1
25     =head1 SYNOPSIS
26    
27 dpavlin 286 This module implements input as database which have fixed and known
28     I<size> while indexing and single unique numeric identifier for database
29     position ranging from 1 to I<size>.
30 dpavlin 1
31 dpavlin 286 Simply, something that is indexed by unmber from 1 .. I<size>.
32    
33     Examples of such databases are CDS/ISIS files, MARC files, lines in
34     text file, and so on.
35    
36     Specific file formats are implemented using low-level interface modules,
37     located in C<WebPAC::Input::*> namespace which export C<open_db>,
38     C<fetch_rec> and optional C<init> functions.
39    
40 dpavlin 1 Perhaps a little code snippet.
41    
42 dpavlin 597 use WebPAC::Input;
43 dpavlin 1
44 dpavlin 597 my $db = WebPAC::Input->new(
45     module => 'WebPAC::Input::ISIS',
46 dpavlin 286 low_mem => 1,
47 dpavlin 597 );
48 dpavlin 1
49 dpavlin 597 $db->open( path => '/path/to/database' );
50 dpavlin 416 print "database size: ",$db->size,"\n";
51     while (my $rec = $db->fetch) {
52     # do something with $rec
53     }
54 dpavlin 1
55 dpavlin 286
56    
57 dpavlin 1 =head1 FUNCTIONS
58    
59 dpavlin 3 =head2 new
60 dpavlin 1
61 dpavlin 3 Create new input database object.
62    
63 dpavlin 9 my $db = new WebPAC::Input(
64 dpavlin 286 module => 'WebPAC::Input::MARC',
65 dpavlin 585 encoding => 'ISO-8859-2',
66 dpavlin 10 low_mem => 1,
67 dpavlin 416 recode => 'char pairs',
68 dpavlin 483 no_progress_bar => 1,
69 dpavlin 9 );
70 dpavlin 3
71 dpavlin 597 C<module> is low-level file format module. See L<WebPAC::Input::ISIS> and
72 dpavlin 286 L<WebPAC::Input::MARC>.
73    
74 dpavlin 585 Optional parametar C<encoding> specify application code page (which will be
75 dpavlin 9 used internally). This should probably be your terminal encoding, and by
76     default, it C<ISO-8859-2>.
77    
78 dpavlin 10 Default is not to use C<low_mem> options (see L<MEMORY USAGE> below).
79    
80 dpavlin 483 C<recode> is optional string constisting of character or words pairs that
81     should be replaced in input stream.
82    
83     C<no_progress_bar> disables progress bar output on C<STDOUT>
84    
85 dpavlin 285 This function will also call low-level C<init> if it exists with same
86     parametars.
87    
88 dpavlin 1 =cut
89    
90 dpavlin 3 sub new {
91 dpavlin 285 my $class = shift;
92     my $self = {@_};
93 dpavlin 3 bless($self, $class);
94    
95 dpavlin 285 my $log = $self->_get_logger;
96    
97 dpavlin 585 $log->logconfess("code_page argument is not suppored any more. change it to encoding") if ($self->{lookup});
98     $log->logconfess("lookup argument is not suppored any more. rewrite call to lookup_ref") if ($self->{lookup});
99    
100 dpavlin 286 $log->logconfess("specify low-level file format module") unless ($self->{module});
101     my $module = $self->{module};
102     $module =~ s#::#/#g;
103     $module .= '.pm';
104     $log->debug("require low-level module $self->{module} from $module");
105 dpavlin 289
106 dpavlin 286 require $module;
107 dpavlin 289 #eval $self->{module} .'->import';
108 dpavlin 286
109 dpavlin 285 # check if required subclasses are implemented
110 dpavlin 652 foreach my $subclass (qw/open_db fetch_rec init dump_rec/) {
111 dpavlin 289 my $n = $self->{module} . '::' . $subclass;
112     if (! defined &{ $n }) {
113 dpavlin 290 my $missing = "missing $subclass in $self->{module}";
114 dpavlin 301 $self->{$subclass} = sub { $log->logwarn($missing) };
115 dpavlin 286 } else {
116 dpavlin 289 $self->{$subclass} = \&{ $n };
117 dpavlin 286 }
118 dpavlin 285 }
119    
120 dpavlin 289 if ($self->{init}) {
121 dpavlin 285 $log->debug("calling init");
122 dpavlin 289 $self->{init}->($self, @_);
123 dpavlin 285 }
124    
125 dpavlin 585 $self->{'encoding'} ||= 'ISO-8859-2';
126 dpavlin 9
127 dpavlin 10 # running with low_mem flag? well, use DBM::Deep then.
128     if ($self->{'low_mem'}) {
129     $log->info("running with low_mem which impacts performance (<32 Mb memory usage)");
130    
131     my $db_file = "data.db";
132    
133     if (-e $db_file) {
134     unlink $db_file or $log->logdie("can't remove '$db_file' from last run");
135     $log->debug("removed '$db_file' from last run");
136     }
137    
138     require DBM::Deep;
139    
140     my $db = new DBM::Deep $db_file;
141    
142     $log->logdie("DBM::Deep error: $!") unless ($db);
143    
144     if ($db->error()) {
145     $log->logdie("can't open '$db_file' under low_mem: ",$db->error());
146     } else {
147     $log->debug("using file '$db_file' for DBM::Deep");
148     }
149    
150     $self->{'db'} = $db;
151     }
152    
153 dpavlin 3 $self ? return $self : return undef;
154 dpavlin 1 }
155    
156 dpavlin 285 =head2 open
157    
158     This function will read whole database in memory and produce lookups.
159    
160 dpavlin 286 $input->open(
161 dpavlin 285 path => '/path/to/database/file',
162 dpavlin 624 code_page => 'cp852',
163 dpavlin 286 limit => 500,
164     offset => 6000,
165 dpavlin 285 lookup => $lookup_obj,
166 dpavlin 506 stats => 1,
167 dpavlin 585 lookup_ref => sub {
168     my ($k,$v) = @_;
169     # store lookup $k => $v
170     },
171 dpavlin 597 modify_records => {
172     900 => { '^a' => { ' : ' => '^b' } },
173     901 => { '*' => { '^b' => ' ; ' } },
174     },
175 dpavlin 636 modify_file => 'conf/modify/mapping.map',
176 dpavlin 285 );
177    
178 dpavlin 624 By default, C<code_page> is assumed to be C<cp852>.
179 dpavlin 285
180 dpavlin 286 C<offset> is optional parametar to position at some offset before reading from database.
181 dpavlin 285
182 dpavlin 286 C<limit> is optional parametar to read just C<limit> records from database
183 dpavlin 285
184 dpavlin 506 C<stats> create optional report about usage of fields and subfields
185    
186 dpavlin 597 C<lookup_coderef> is closure to call when adding C<< key => 'value' >> combinations to
187 dpavlin 585 lookup.
188    
189 dpavlin 597 C<modify_records> specify mapping from subfields to delimiters or from
190     delimiters to subfields, as well as oprations on fields (if subfield is
191     defined as C<*>.
192    
193 dpavlin 636 C<modify_file> is alternative for C<modify_records> above which preserves order and offers
194     (hopefully) simplier sintax than YAML or perl (see L</modify_file_regex>). This option
195     overrides C<modify_records> if both exists for same input.
196    
197 dpavlin 286 Returns size of database, regardless of C<offset> and C<limit>
198     parametars, see also C<size>.
199 dpavlin 285
200     =cut
201    
202     sub open {
203     my $self = shift;
204     my $arg = {@_};
205    
206     my $log = $self->_get_logger();
207    
208 dpavlin 585 $log->logconfess("lookup argument is not suppored any more. rewrite call to lookup_coderef") if ($arg->{lookup});
209     $log->logconfess("lookup_coderef must be CODE, not ",ref($arg->{lookup_coderef}))
210     if ($arg->{lookup_coderef} && ref($arg->{lookup_coderef}) ne 'CODE');
211    
212 dpavlin 285 $log->logcroak("need path") if (! $arg->{'path'});
213 dpavlin 624 my $code_page = $arg->{'code_page'} || 'cp852';
214 dpavlin 285
215     # store data in object
216 dpavlin 292 $self->{'input_code_page'} = $code_page;
217 dpavlin 286 foreach my $v (qw/path offset limit/) {
218 dpavlin 285 $self->{$v} = $arg->{$v} if ($arg->{$v});
219     }
220    
221 dpavlin 416 my $filter_ref;
222 dpavlin 597 my $recode_regex;
223     my $recode_map;
224 dpavlin 416
225     if ($self->{recode}) {
226     my @r = split(/\s/, $self->{recode});
227     if ($#r % 2 != 1) {
228     $log->logwarn("recode needs even number of elements (some number of valid pairs)");
229     } else {
230     while (@r) {
231     my $from = shift @r;
232     my $to = shift @r;
233 dpavlin 597 $recode_map->{$from} = $to;
234 dpavlin 416 }
235    
236 dpavlin 597 $recode_regex = join '|' => keys %{ $recode_map };
237 dpavlin 416
238 dpavlin 597 $log->debug("using recode regex: $recode_regex");
239 dpavlin 416 }
240    
241     }
242    
243 dpavlin 636 my $rec_regex;
244     if (my $p = $arg->{modify_file}) {
245     $log->debug("using modify_file $p");
246     $rec_regex = $self->modify_file_regexps( $p );
247     } elsif (my $h = $arg->{modify_records}) {
248     $log->debug("using modify_records ", Dumper( $h ));
249     $rec_regex = $self->modify_record_regexps(%{ $h });
250     }
251     $log->debug("rec_regex: ", Dumper($rec_regex)) if ($rec_regex);
252 dpavlin 597
253 dpavlin 289 my ($db, $size) = $self->{open_db}->( $self,
254 dpavlin 285 path => $arg->{path},
255 dpavlin 624 # filter => sub {
256     # my ($l,$f_nr) = @_;
257     # return unless defined($l);
258     # from_to($l, $code_page, $self->{'encoding'});
259     # $l =~ s/($recode_regex)/$recode_map->{$1}/g if ($recode_regex && $recode_map);
260     # return $l;
261     # },
262 dpavlin 523 %{ $arg },
263 dpavlin 285 );
264    
265 dpavlin 496 unless (defined($db)) {
266 dpavlin 285 $log->logwarn("can't open database $arg->{path}, skipping...");
267     return;
268     }
269    
270     unless ($size) {
271     $log->logwarn("no records in database $arg->{path}, skipping...");
272     return;
273     }
274    
275 dpavlin 339 my $from_rec = 1;
276     my $to_rec = $size;
277 dpavlin 285
278 dpavlin 286 if (my $s = $self->{offset}) {
279 dpavlin 513 $log->debug("skipping to MFN $s");
280 dpavlin 339 $from_rec = $s;
281 dpavlin 285 } else {
282 dpavlin 339 $self->{offset} = $from_rec;
283 dpavlin 285 }
284    
285 dpavlin 286 if ($self->{limit}) {
286 dpavlin 301 $log->debug("limiting to ",$self->{limit}," records");
287 dpavlin 339 $to_rec = $from_rec + $self->{limit} - 1;
288     $to_rec = $size if ($to_rec > $size);
289 dpavlin 285 }
290    
291     # store size for later
292 dpavlin 339 $self->{size} = ($to_rec - $from_rec) ? ($to_rec - $from_rec + 1) : 0;
293 dpavlin 285
294 dpavlin 585 $log->info("processing $self->{size}/$size records [$from_rec-$to_rec] convert $code_page -> $self->{encoding}", $self->{stats} ? ' [stats]' : '');
295 dpavlin 285
296     # read database
297 dpavlin 339 for (my $pos = $from_rec; $pos <= $to_rec; $pos++) {
298 dpavlin 285
299 dpavlin 286 $log->debug("position: $pos\n");
300 dpavlin 285
301 dpavlin 652 my $rec = $self->{fetch_rec}->($self, $pos, sub {
302 dpavlin 619 my ($l,$f_nr) = @_;
303 dpavlin 624 # return unless defined($l);
304     # return $l unless ($rec_regex && $f_nr);
305 dpavlin 285
306 dpavlin 625 $log->debug("-=> $f_nr ## $l");
307    
308 dpavlin 624 # codepage conversion and recode_regex
309 dpavlin 626 from_to($l, $code_page, $self->{'encoding'});
310 dpavlin 624 $l =~ s/($recode_regex)/$recode_map->{$1}/g if ($recode_regex && $recode_map);
311    
312 dpavlin 619 # apply regexps
313     if ($rec_regex && defined($rec_regex->{$f_nr})) {
314     $log->logconfess("regexps->{$f_nr} must be ARRAY") if (ref($rec_regex->{$f_nr}) ne 'ARRAY');
315     my $c = 0;
316     foreach my $r (@{ $rec_regex->{$f_nr} }) {
317 dpavlin 626 my $old_l = $l;
318 dpavlin 619 eval '$l =~ ' . $r;
319 dpavlin 626 if ($old_l ne $l) {
320     $log->debug("REGEX on $f_nr eval \$l =~ $r\n## old l: [$old_l]\n## new l: [$l]");
321     }
322 dpavlin 619 $log->error("error applying regex: $r") if ($@);
323     }
324     }
325    
326 dpavlin 625 $log->debug("<=- $f_nr ## $l");
327 dpavlin 619 return $l;
328     });
329    
330 dpavlin 308 $log->debug(sub { Dumper($rec) });
331    
332 dpavlin 285 if (! $rec) {
333 dpavlin 286 $log->warn("record $pos empty? skipping...");
334 dpavlin 285 next;
335     }
336    
337     # store
338 dpavlin 286 if ($self->{low_mem}) {
339     $self->{db}->put($pos, $rec);
340 dpavlin 285 } else {
341 dpavlin 286 $self->{data}->{$pos} = $rec;
342 dpavlin 285 }
343    
344     # create lookup
345 dpavlin 585 $arg->{'lookup_coderef'}->( $rec ) if ($rec && $arg->{'lookup_coderef'});
346 dpavlin 285
347 dpavlin 506 # update counters for statistics
348     if ($self->{stats}) {
349 dpavlin 593
350 dpavlin 619 # fetch clean record with regexpes applied for statistics
351 dpavlin 652 my $rec = $self->{fetch_rec}->($self, $pos);
352 dpavlin 619
353 dpavlin 593 foreach my $fld (keys %{ $rec }) {
354 dpavlin 506 $self->{_stats}->{fld}->{ $fld }++;
355 dpavlin 593
356     $log->logdie("invalid record fild $fld, not ARRAY")
357     unless (ref($rec->{ $fld }) eq 'ARRAY');
358    
359     foreach my $row (@{ $rec->{$fld} }) {
360    
361     if (ref($row) eq 'HASH') {
362    
363     foreach my $sf (keys %{ $row }) {
364 dpavlin 619 next if ($sf eq 'subfields');
365 dpavlin 593 $self->{_stats}->{sf}->{ $fld }->{ $sf }->{count}++;
366     $self->{_stats}->{sf}->{ $fld }->{ $sf }->{repeatable}++
367     if (ref($row->{$sf}) eq 'ARRAY');
368 dpavlin 506 }
369 dpavlin 593
370     } else {
371     $self->{_stats}->{repeatable}->{ $fld }++;
372     }
373 dpavlin 506 }
374 dpavlin 593 }
375 dpavlin 506 }
376    
377 dpavlin 483 $self->progress_bar($pos,$to_rec) unless ($self->{no_progress_bar});
378 dpavlin 285
379     }
380    
381 dpavlin 286 $self->{pos} = -1;
382     $self->{last_pcnt} = 0;
383 dpavlin 285
384     # store max mfn and return it.
385 dpavlin 339 $self->{max_pos} = $to_rec;
386     $log->debug("max_pos: $to_rec");
387 dpavlin 285
388     return $size;
389     }
390    
391     =head2 fetch
392    
393     Fetch next record from database. It will also displays progress bar.
394    
395     my $rec = $isis->fetch;
396    
397     Record from this function should probably go to C<data_structure> for
398     normalisation.
399    
400     =cut
401    
402     sub fetch {
403     my $self = shift;
404    
405     my $log = $self->_get_logger();
406    
407 dpavlin 286 $log->logconfess("it seems that you didn't load database!") unless ($self->{pos});
408 dpavlin 285
409 dpavlin 286 if ($self->{pos} == -1) {
410     $self->{pos} = $self->{offset};
411 dpavlin 285 } else {
412 dpavlin 286 $self->{pos}++;
413 dpavlin 285 }
414    
415 dpavlin 286 my $mfn = $self->{pos};
416 dpavlin 285
417 dpavlin 286 if ($mfn > $self->{max_pos}) {
418     $self->{pos} = $self->{max_pos};
419 dpavlin 285 $log->debug("at EOF");
420     return;
421     }
422    
423 dpavlin 483 $self->progress_bar($mfn,$self->{max_pos}) unless ($self->{no_progress_bar});
424 dpavlin 285
425     my $rec;
426    
427 dpavlin 286 if ($self->{low_mem}) {
428     $rec = $self->{db}->get($mfn);
429 dpavlin 285 } else {
430 dpavlin 286 $rec = $self->{data}->{$mfn};
431 dpavlin 285 }
432    
433     $rec ||= 0E0;
434     }
435    
436     =head2 pos
437    
438     Returns current record number (MFN).
439    
440     print $isis->pos;
441    
442     First record in database has position 1.
443    
444     =cut
445    
446     sub pos {
447     my $self = shift;
448 dpavlin 286 return $self->{pos};
449 dpavlin 285 }
450    
451    
452     =head2 size
453    
454     Returns number of records in database
455    
456     print $isis->size;
457    
458     Result from this function can be used to loop through all records
459    
460     foreach my $mfn ( 1 ... $isis->size ) { ... }
461    
462 dpavlin 286 because it takes into account C<offset> and C<limit>.
463 dpavlin 285
464     =cut
465    
466     sub size {
467     my $self = shift;
468 dpavlin 286 return $self->{size};
469 dpavlin 285 }
470    
471     =head2 seek
472    
473     Seek to specified MFN in file.
474    
475     $isis->seek(42);
476    
477     First record in database has position 1.
478    
479     =cut
480    
481     sub seek {
482     my $self = shift;
483     my $pos = shift || return;
484    
485     my $log = $self->_get_logger();
486    
487     if ($pos < 1) {
488     $log->warn("seek before first record");
489     $pos = 1;
490 dpavlin 286 } elsif ($pos > $self->{max_pos}) {
491 dpavlin 285 $log->warn("seek beyond last record");
492 dpavlin 286 $pos = $self->{max_pos};
493 dpavlin 285 }
494    
495 dpavlin 286 return $self->{pos} = (($pos - 1) || -1);
496 dpavlin 285 }
497    
498 dpavlin 506 =head2 stats
499 dpavlin 285
500 dpavlin 506 Dump statistics about field and subfield usage
501    
502 dpavlin 507 print $input->stats;
503 dpavlin 506
504     =cut
505    
506     sub stats {
507     my $self = shift;
508 dpavlin 507
509     my $log = $self->_get_logger();
510    
511     my $s = $self->{_stats};
512     if (! $s) {
513     $log->warn("called stats, but there is no statistics collected");
514     return;
515     }
516    
517     my $max_fld = 0;
518    
519     my $out = join("\n",
520     map {
521     my $f = $_ || die "no field";
522     my $v = $s->{fld}->{$f} || die "no s->{fld}->{$f}";
523     $max_fld = $v if ($v > $max_fld);
524    
525 dpavlin 519 my $o = sprintf("%4s %d ~", $f, $v);
526 dpavlin 507
527     if (defined($s->{sf}->{$f})) {
528     map {
529 dpavlin 593 $o .= sprintf(" %s:%d%s", $_,
530     $s->{sf}->{$f}->{$_}->{count},
531     $s->{sf}->{$f}->{$_}->{repeatable} ? '*' : '',
532     );
533 dpavlin 507 } sort keys %{ $s->{sf}->{$f} };
534     }
535    
536     if (my $v_r = $s->{repeatable}->{$f}) {
537     $o .= " ($v_r)" if ($v_r != $v);
538     }
539    
540     $o;
541 dpavlin 519 } sort { $a cmp $b } keys %{ $s->{fld} }
542 dpavlin 507 );
543    
544     $log->debug( sub { Dumper($s) } );
545    
546     return $out;
547 dpavlin 506 }
548    
549 dpavlin 652 =head2 dump
550    
551     Display humanly readable dump of record
552    
553     =cut
554    
555     sub dump {
556     my $self = shift;
557    
558     return $self->{dump_rec}->($self, $self->{pos});
559    
560     }
561    
562 dpavlin 598 =head2 modify_record_regexps
563 dpavlin 597
564 dpavlin 636 Generate hash with regexpes to be applied using l<filter>.
565 dpavlin 597
566     my $regexpes = $input->modify_record_regexps(
567     900 => { '^a' => { ' : ' => '^b' } },
568     901 => { '*' => { '^b' => ' ; ' } },
569     );
570    
571     =cut
572    
573 dpavlin 636 sub _get_regex {
574     my ($sf,$from,$to) = @_;
575     if ($sf =~ /^\^/) {
576     return
577     's/\Q'. $sf .'\E([^\^]*?)\Q'. $from .'\E([^\^]*?)/'. $sf .'$1'. $to .'$2/';
578     } else {
579     return
580     's/\Q'. $from .'\E/'. $to .'/g';
581     }
582     }
583    
584 dpavlin 597 sub modify_record_regexps {
585     my $self = shift;
586     my $modify_record = {@_};
587    
588     my $regexpes;
589    
590 dpavlin 634 my $log = $self->_get_logger();
591    
592 dpavlin 597 foreach my $f (keys %$modify_record) {
593 dpavlin 634 $log->debug("field: $f");
594    
595 dpavlin 597 foreach my $sf (keys %{ $modify_record->{$f} }) {
596 dpavlin 634 $log->debug("subfield: $sf");
597    
598 dpavlin 597 foreach my $from (keys %{ $modify_record->{$f}->{$sf} }) {
599     my $to = $modify_record->{$f}->{$sf}->{$from};
600     #die "no field?" unless defined($to);
601 dpavlin 634 $log->debug("transform: |$from| -> |$to|");
602 dpavlin 597
603 dpavlin 636 my $regex = _get_regex($sf,$from,$to);
604     push @{ $regexpes->{$f} }, $regex;
605     $log->debug("regex: $regex");
606 dpavlin 597 }
607     }
608     }
609    
610     return $regexpes;
611     }
612    
613 dpavlin 636 =head2 modify_file_regexps
614    
615     Generate hash with regexpes to be applied using l<filter> from
616     pseudo hash/yaml format for regex mappings.
617    
618     It should be obvious:
619    
620     200
621     '^a'
622     ' : ' => '^e'
623     ' = ' => '^d'
624    
625     In field I<200> find C<'^a'> and then C<' : '>, and replace it with C<'^e'>.
626     In field I<200> find C<'^a'> and then C<' = '>, and replace it with C<'^d'>.
627    
628     my $regexpes = $input->modify_file_regexps( 'conf/modify/common.pl' );
629    
630     On undef path it will just return.
631    
632     =cut
633    
634     sub modify_file_regexps {
635     my $self = shift;
636    
637     my $modify_path = shift || return;
638    
639     my $log = $self->_get_logger();
640    
641     my $regexpes;
642    
643 dpavlin 697 CORE::open(my $fh, $modify_path) || $log->logdie("can't open modify file $modify_path: $!");
644 dpavlin 636
645     my ($f,$sf);
646    
647     while(<$fh>) {
648     chomp;
649     next if (/^#/ || /^\s*$/);
650    
651     if (/^\s*(\d+)\s*$/) {
652     $f = $1;
653     $log->debug("field: $f");
654     next;
655     } elsif (/^\s*'([^']*)'\s*$/) {
656     $sf = $1;
657     $log->die("can't define subfiled before field in: $_") unless ($f);
658     $log->debug("subfield: $sf");
659     } elsif (/^\s*'([^']*)'\s*=>\s*'([^']*)'\s*$/) {
660     my ($from,$to) = ($1, $2);
661    
662     $log->debug("transform: |$from| -> |$to|");
663    
664     my $regex = _get_regex($sf,$from,$to);
665     push @{ $regexpes->{$f} }, $regex;
666     $log->debug("regex: $regex");
667     }
668     }
669    
670     return $regexpes;
671     }
672    
673 dpavlin 3 =head1 MEMORY USAGE
674 dpavlin 1
675 dpavlin 3 C<low_mem> options is double-edged sword. If enabled, WebPAC
676     will run on memory constraint machines (which doesn't have enough
677     physical RAM to create memory structure for whole source database).
678 dpavlin 1
679 dpavlin 3 If your machine has 512Mb or more of RAM and database is around 10000 records,
680     memory shouldn't be an issue. If you don't have enough physical RAM, you
681     might consider using virtual memory (if your operating system is handling it
682     well, like on FreeBSD or Linux) instead of dropping to L<DBM::Deep> to handle
683     parsed structure of ISIS database (this is what C<low_mem> option does).
684 dpavlin 1
685 dpavlin 3 Hitting swap at end of reading source database is probably o.k. However,
686     hitting swap before 90% will dramatically decrease performance and you will
687     be better off with C<low_mem> and using rest of availble memory for
688     operating system disk cache (Linux is particuallary good about this).
689     However, every access to database record will require disk access, so
690     generation phase will be slower 10-100 times.
691    
692     Parsed structures are essential - you just have option to trade RAM memory
693     (which is fast) for disk space (which is slow). Be sure to have planty of
694     disk space if you are using C<low_mem> and thus L<DBM::Deep>.
695    
696     However, when WebPAC is running on desktop machines (or laptops :-), it's
697     highly undesireable for system to start swapping. Using C<low_mem> option can
698     reduce WecPAC memory usage to around 64Mb for same database with lookup
699     fields and sorted indexes which stay in RAM. Performance will suffer, but
700     memory usage will really be minimal. It might be also more confortable to
701     run WebPAC reniced on those machines.
702    
703    
704     =head1 AUTHOR
705    
706     Dobrica Pavlinusic, C<< <dpavlin@rot13.org> >>
707    
708 dpavlin 1 =head1 COPYRIGHT & LICENSE
709    
710 dpavlin 599 Copyright 2005-2006 Dobrica Pavlinusic, All Rights Reserved.
711 dpavlin 1
712     This program is free software; you can redistribute it and/or modify it
713     under the same terms as Perl itself.
714    
715     =cut
716    
717     1; # End of WebPAC::Input

  ViewVC Help
Powered by ViewVC 1.1.26