/[webpac]/trunk2/lib/WebPAC.pm

This is a repository of my old source code, which isn't updated any more. Go to git.rot13.org for current projects!

Diff of /trunk2/lib/WebPAC.pm

Parent Directory | Revision Log | View Patch Patch

-revision 398 by dpavlin,
Sat Jul 24 13:48:08 2004 UTC
+revision 555 by dpavlin,
Fri Oct 29 22:09:04 2004 UTC
 Line 9 
 use Config::IniFiles;
  use XML::Simple;
  use Template;
  use Log::Log4perl qw(get_logger :levels);
+ use Time::HiRes qw(time);
  use Data::Dumper;
-Line 29 
 This module implements methods used by W
+Line 30 
 This module implements methods used by W
  =head2 new
- This will create new instance of WebPAC using configuration specified by C<config_file>.
+ Create new instance of WebPAC using configuration specified by C<config_file>.
   my $webpac = new WebPAC(
          config_file => 'name.conf',
-         [code_page => 'ISO-8859-2',]
+         code_page => 'ISO-8859-2',
+         low_mem => 1,
+         filter => {
+                 'lower' => sub { lc($_[0]) },
+         },
   );
  Default C<code_page> is C<ISO-8859-2>.
- It will also read configuration files
+ Default is not to use the C<low_mem> option (see L</MEMORY USAGE> below).
+ There is an optional parameter C<filter> which specifies named filters that
+ can be applied using C<filter{name}> notation.
+ This method will also read configuration files
  C<global.conf> (used by indexer and Web front-end)
  and configuration file specified by C<config_file>
  which describes databases to be indexed.
-Line 59 
 sub new {
+Line 69 
 sub new {
          my $self = {@_};
          bless($self, $class);
+         $self->{'start_t'} = time();
          my $log_file = $self->{'log'} || "log.conf";
          Log::Log4perl->init($log_file);
-Line 106 
 sub new {
+Line 118 
 sub new {
                  EVAL_PERL => 1,
          );
+         # running with low_mem flag? well, use DBM::Deep then.
+         if ($self->{'low_mem'}) {
+                 $log->info("running with low_mem which impacts performance (<32 Mb memory usage)");
+                 my $db_file = "data.db";
+                 if (-e $db_file) {
+                         unlink $db_file or $log->logdie("can't remove '$db_file' from last run");
+                         $log->debug("removed '$db_file' from last run");
+                 }
+                 require DBM::Deep;
+                 my $db = new DBM::Deep $db_file;
+                 $log->logdie("DBM::Deep error: $!") unless ($db);
+                 if ($db->error()) {
+                         $log->logdie("can't open '$db_file' under low_mem: ",$db->error());
+                 } else {
+                         $log->debug("using file '$db_file' for DBM::Deep");
+                 }
+                 $self->{'db'} = $db;
+         }
+         $log->debug("filters defined: ",Dumper($self->{'filter'}));
          return $self;
  }
-Line 116 
 Open CDS/ISIS database using OpenIsis mo
+Line 156 
 Open CDS/ISIS database using OpenIsis mo
   $webpac->open_isis(
          filename => '/data/ISIS/ISIS',
          code_page => '852',
-         limit_mfn => '500',
+         limit_mfn => 500,
+         start_mfn => 6000,
          lookup => [ ... ],
   );
  By default, ISIS code page is assumed to be C<852>.
+ If optional parameter C<start_mfn> is set, this will be the first MFN to read
+ from the database (so you can skip the beginning of your database if you need to).
  If optional parameter C<limit_mfn> is set, it will read just 500 records
  from the database in the example above.
-Line 149 
 sub open_isis {
+Line 193 
 sub open_isis {
          $log->logcroak("need filename") if (! $arg->{'filename'});
          my $code_page = $arg->{'code_page'} || '852';
+         $log->logdie("can't find database ",$arg->{'filename'}) unless (glob($arg->{'filename'}.'.*'));
+         # store data in object
+         $self->{'isis_filename'} = $arg->{'filename'};
+         $self->{'isis_code_page'} = $code_page;
          use OpenIsis;
          #$self->{'isis_code_page'} = $code_page;
-Line 157 
 sub open_isis {
+Line 207 
 sub open_isis {
          my $cp = Text::Iconv->new($code_page,$self->{'code_page'});
          $log->info("reading ISIS database '",$arg->{'filename'},"'");
+         $log->debug("isis code page: $code_page");
          my $isis_db = OpenIsis::open($arg->{'filename'});
          my $maxmfn = OpenIsis::maxRowid( $isis_db ) || 1;
+         my $startmfn = 1;
-         $maxmfn = $self->{limit_mfn} if ($self->{limit_mfn});
+         if (my $s = $self->{'start_mfn'}) {
+                 $log->info("skipping to MFN $s");
+                 $startmfn = $s;
+         } else {
+                 $self->{'start_mfn'} = $startmfn;
+         }
+         $maxmfn = $startmfn + $self->{limit_mfn} if ($self->{limit_mfn});
-         $log->info("processing $maxmfn records...");
+         $log->info("processing ",($maxmfn-$startmfn)." records...");
          # read database
-         for (my $mfn = 1; $mfn <= $maxmfn; $mfn++) {
+         for (my $mfn = $startmfn; $mfn <= $maxmfn; $mfn++) {
+                 $log->debug("mfn: $mfn\n");
+                 my $rec;
                  # read record
                  my $row = OpenIsis::read( $isis_db, $mfn );
-Line 186 
 sub open_isis {
+Line 250 
 sub open_isis {
                                                  $val = $l;
                                          }
-                                         push @{$self->{'data'}->{$mfn}->{$k}}, $val;
+                                         push @{$rec->{$k}}, $val;
                                  }
                          } else {
-                                 push @{$self->{'data'}->{$mfn}->{'000'}}, $mfn;
+                                 push @{$rec->{'000'}}, $mfn;
                          }
                  }
+                 $log->confess("record $mfn empty?") unless ($rec);
+                 # store
+                 if ($self->{'low_mem'}) {
+                         $self->{'db'}->put($mfn, $rec);
+                 } else {
+                         $self->{'data'}->{$mfn} = $rec;
+                 }
                  # create lookup
-                 my $rec = $self->{'data'}->{$mfn};
                  $self->create_lookup($rec, @{$arg->{'lookup'}});
                  $self->progress_bar($mfn,$maxmfn);
          }
-         $self->{'current_mfn'} = 1;
+         $self->{'current_mfn'} = -1;
          $self->{'last_pcnt'} = 0;
+         $log->debug("max mfn: $maxmfn");
          # store max mfn and return it.
          return $self->{'max_mfn'} = $maxmfn;
  }
-Line 223 
 sub fetch_rec {
+Line 297 
 sub fetch_rec {
          my $log = $self->_get_logger();
-         my $mfn = $self->{'current_mfn'}++ || $log->logconfess("it seems that you didn't load database!");
+         $log->logconfess("it seems that you didn't load database!") unless ($self->{'current_mfn'});
+         if ($self->{'current_mfn'} == -1) {
+                 $self->{'current_mfn'} = $self->{'start_mfn'};
+         } else {
+                 $self->{'current_mfn'}++;
+         }
+         my $mfn = $self->{'current_mfn'};
          if ($mfn > $self->{'max_mfn'}) {
                  $self->{'current_mfn'} = $self->{'max_mfn'};
-Line 233 
 sub fetch_rec {
+Line 315 
 sub fetch_rec {
          $self->progress_bar($mfn,$self->{'max_mfn'});
-         return $self->{'data'}->{$mfn};
+         if ($self->{'low_mem'}) {
+                 return $self->{'db'}->get($mfn);
+         } else {
+                 return $self->{'data'}->{$mfn};
+         }
+ }
+ =head2 mfn
+ Returns current record number (MFN).
+  print $webpac->mfn;
+ =cut
+ sub mfn {
+         my $self = shift;
+         return $self->{'current_mfn'};
  }
  =head2 progress_bar
-Line 261 
 sub progress_bar {
+Line 360 
 sub progress_bar {
          $self->{'last_pcnt'} ||= 1;
-         $self->{'last_pcnt'} = $curr if ($curr < $self->{'last_pcnt'});
+         my $p = int($curr * 100 / $max) || 1;
+         # reset on re-run
+         if ($p < $self->{'last_pcnt'}) {
+                 $self->{'last_pcnt'} = $p;
+                 $self->{'last_t'} = time();
+                 $self->{'last_curr'} = undef;
+         }
+         $self->{'last_t'} ||= time();
-         my $p = int($curr * 100 / $max);
          if ($p != $self->{'last_pcnt'}) {
-                 printf STDERR ("%5d / %5d [%-51s] %-2d %% \r",$curr,$max,"=" x ($p/2).">", $p );
+                 my $last_curr = $self->{'last_curr'} || $curr;
+                 my $t = time();
+                 my $rate = ($curr - $last_curr) / (($t - $self->{'last_t'} || 1));
+                 my $eta = ($max-$curr) / ($rate || 1);
+                 printf STDERR ("%5d [%-38s] %-5d %0.1f/s %s\r",$curr,"=" x ($p/3)."$p%>", $max, $rate, $self->fmt_time($eta));
                  $self->{'last_pcnt'} = $p;
+                 $self->{'last_t'} = time();
+                 $self->{'last_curr'} = $curr;
          }
+         print STDERR "\n" if ($p == 100);
+ }
+ =head2 fmt_time
+ Format time (in seconds) for display.
+  print $webpac->fmt_time(time());
+ This method is called by L<progress_bar> to display remaining time.
+ =cut
+ sub fmt_time {
+         my $self = shift;
+         my $t = shift || 0;
+         my $out = "";
+         my ($ss,$mm,$hh) = gmtime($t);
+         $out .= "${hh}h" if ($hh);
+         $out .= sprintf("%02d:%02d", $mm,$ss);
+         $out .= "  " if ($hh == 0);
+         return $out;
  }
  =head2 open_import_xml
-Line 329 
 sub create_lookup {
+Line 467 
 sub create_lookup {
          $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
          foreach my $i (@_) {
-                 if ($i->{'eval'}) {
+                 $log->logconfess("need key") unless defined($i->{'key'});
-                         my $eval = $self->fill_in($rec,$i->{'eval'});
+                 $log->logconfess("need val") unless defined($i->{'val'});
-                         my $key = $self->fill_in($rec,$i->{'key'});
-                         my @val = $self->fill_in($rec,$i->{'val'});
+                 if (defined($i->{'eval'})) {
-                         if ($key && @val && eval $eval) {
+                         # eval first, so we can skip fill_in for key and val
+                         my $eval = $self->fill_in($rec,$i->{'eval'}) || next;
+                         if ($self->_eval($eval)) {
+                                 my $key = $self->fill_in($rec,$i->{'key'}) || next;
+                                 my @val = $self->fill_in($rec,$i->{'val'}) || next;
                                  $log->debug("stored $key = ",sub { join(" | ",@val) });
                                  push @{$self->{'lookup'}->{$key}}, @val;
                          }
                  } else {
-                         my $key = $self->fill_in($rec,$i->{'key'});
+                         my $key = $self->fill_in($rec,$i->{'key'}) || next;
-                         my @val = $self->fill_in($rec,$i->{'val'});
+                         my @val = $self->fill_in($rec,$i->{'val'}) || next;
-                         if ($key && @val) {
+                         $log->debug("stored $key = ",sub { join(" | ",@val) });
-                                 $log->debug("stored $key = ",sub { join(" | ",@val) });
+                         push @{$self->{'lookup'}->{$key}}, @val;
-                                 push @{$self->{'lookup'}->{$key}}, @val;
-                         }
                  }
          }
  }
-Line 430 
 sub fill_in {
+Line 570 
 sub fill_in {
          # iteration (for repeatable fields)
          my $i = shift || 0;
+         $log->logdie("infitite loop in format $format") if ($i > ($self->{'max_mfn'} || 9999));
          # FIXME remove for speedup?
          $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
-Line 443 
 sub fill_in {
+Line 585 
 sub fill_in {
          # remove eval{...} from beginning
          $eval_code = $1 if ($format =~ s/^eval{([^}]+)}//s);
+         my $filter_name;
+         # remove filter{...} from beginning
+         $filter_name = $1 if ($format =~ s/^filter{([^}]+)}//s);
          # do actual replacement of placeholders
+         # repeatable fields
          $format =~ s/v(\d+)(?:\^(\w))?/$self->get_data(\$rec,$1,$2,$i,\$found)/ges;
+         # non-repeatable fields
+         $format =~ s/s(\d+)(?:\^(\w))?/$self->get_data(\$rec,$1,$2,0,\$found)/ges;
          if ($found) {
                  $log->debug("format: $format");
-Line 452 
 sub fill_in {
+Line 601 
 sub fill_in {
                          my $eval = $self->fill_in($rec,$eval_code,$i);
                          return if (! $self->_eval($eval));
                  }
+                 if ($filter_name && $self->{'filter'}->{$filter_name}) {
+                         $log->debug("filter '$filter_name' for $format");
+                         $format = $self->{'filter'}->{$filter_name}->($format);
+                         return unless(defined($format));
+                         $log->debug("filter result: $format");
+                 }
                  # do we have lookups?
                  if ($format =~ /$LOOKUP_REGEX/o) {
                          $log->debug("format '$format' has lookup");
-Line 544 
 sub parse {
+Line 699 
 sub parse {
          # remove eval{...} from beginning
          $eval_code = $1 if ($format =~ s/^eval{([^}]+)}//s);
+         my $filter_name;
+         # remove filter{...} from beginning
+         $filter_name = $1 if ($format =~ s/^filter{([^}]+)}//s);
          my $prefix;
          my $all_found=0;
-         while ($format =~ s/^(.*?)v(\d+)(?:\^(\w))?//s) {
+         while ($format =~ s/^(.*?)(v|s)(\d+)(?:\^(\w))?//s) {
                  my $del = $1 || '';
                  $prefix ||= $del if ($all_found == 0);
+                 # repeatable index
+                 my $r = $i;
+                 $r = 0 if (lc("$2") eq 's');
                  my $found = 0;
-                 my $tmp = $self->get_data(\$rec,$2,$3,$i,\$found);
+                 my $tmp = $self->get_data(\$rec,$3,$4,$r,\$found);
                  if ($found) {
                          push @out, $del;
-Line 577 
 sub parse {
+Line 740 
 sub parse {
          }
          if ($eval_code) {
-                 my $eval = $self->fill_in($rec,$eval_code,$i);
+                 my $eval = $self->fill_in($rec,$eval_code,$i) || return;
-                 $log->debug("about to eval{",$eval,"} format: $out");
+                 $log->debug("about to eval{$eval} format: $out");
                  return if (! $self->_eval($eval));
          }
+         if ($filter_name && $self->{'filter'}->{$filter_name}) {
+                 $log->debug("about to filter{$filter_name} format: $out");
+                 $out = $self->{'filter'}->{$filter_name}->($out);
+                 return unless(defined($out));
+                 $log->debug("filter result: $out");
+         }
          return $out;
  }
-Line 647 
 sub fill_in_to_arr {
+Line 817 
 sub fill_in_to_arr {
          return @arr;
  }
+ =head2 sort_arr
+ Sort array ignoring case and html in data
+  my @sorted = $webpac->sort_arr(@unsorted);
+ =cut
+ sub sort_arr {
+         my $self = shift;
+         my $log = $self->_get_logger();
+         # FIXME add Schwartzian Transformation?
+         my @sorted = sort {
+                 $a =~ s#<[^>]+/*>##;
+                 $b =~ s#<[^>]+/*>##;
+                 lc($b) cmp lc($a)
+         } @_;
+         $log->debug("sorted values: ",sub { join(", ",@sorted) });
+         return @sorted;
+ }
  =head2 data_structure
-Line 703 
 sub data_structure {
+Line 898 
 sub data_structure {
                          }
                          next if (! @v);
+                         if ($tag->{'sort'}) {
+                                 @v = $self->sort_arr(@v);
+                                 $log->warn("sort within tag is usually not what you want!");
+                         }
                          # use format?
                          if ($tag->{'format_name'}) {
                                  @v = map { $self->apply_format($tag->{'format_name'},$tag->{'format_delimiter'},$_) } @v;
-Line 717 
 sub data_structure {
+Line 917 
 sub data_structure {
                                  next; # don't return headline in data_structure!
                          }
-                         # does tag have type?
+                         # delimiter will join repeatable fields
-                         if ($tag->{'type'}) {
+                         if ($tag->{'delimiter'}) {
-                                 push @{$row->{$tag->{'type'}}}, @v;
+                                 @v = ( join($tag->{'delimiter'}, @v) );
-                         } else {
+                         }
-                                 push @{$row->{'display'}}, @v;
-                                 push @{$row->{'swish'}}, @v;
+                         # default types
+                         my @types = qw(display swish);
+                         # override by type attribute
+                         @types = ( $tag->{'type'} ) if ($tag->{'type'});
+                         foreach my $type (@types) {
+                                 # append to previous line?
+                                 $log->debug("type: $type ",sub { join(" ",@v) }, $row->{'append'} || 'no append');
+                                 if ($tag->{'append'}) {
+                                         # I will delimit appended part with
+                                         # delimiter (or ,)
+                                         my $d = $tag->{'delimiter'};
+                                         # default delimiter
+                                         $d ||= " ";
+                                         my $last = pop @{$row->{$type}};
+                                         $d = "" if (! $last);
+                                         $last .= $d . join($d, @v);
+                                         push @{$row->{$type}}, $last;
+                                 } else {
+                                         push @{$row->{$type}}, @v;
+                                 }
                          }
-Line 735 
 sub data_structure {
+Line 958 
 sub data_structure {
                          my $name = $self->{'import_xml'}->{'indexer'}->{$field}->{'name'};
                          $row->{'name'} = $name ? $self->_x($name) : $field;
+                         # post-sort all values in field
+                         if ($self->{'import_xml'}->{'indexer'}->{$field}->{'sort'}) {
+                                 $log->warn("sort at field tag not implemented");
+                         }
                          push @ds, $row;
                          $log->debug("row $field: ",sub { Dumper($row) });
-Line 775 
 sub output {
+Line 1003 
 sub output {
          return $out;
  }
+ =head2 output_file
+ Create output from in-memory data structure using Template Toolkit template
+ to a file.
+  $webpac->output_file(
+         file => 'out.txt',
+         template => 'text.tt',
+         data => @ds
+  );
+ =cut
+ sub output_file {
+         my $self = shift;
+         my $args = {@_};
+         my $log = $self->_get_logger();
+         my $file = $args->{'file'} || $log->logconfess("need file name");
+         $log->debug("creating file ",$file);
+         open(my $fh, ">", $file) || $log->logdie("can't open output file '$file': $!");
+         print $fh $self->output(
+                 template => $args->{'template'},
+                 data => $args->{'data'},
+         ) || $log->logdie("print: $!");
+         close($fh) || $log->logdie("close: $!");
+ }
  =head2 apply_format
  Apply format specified in tag with C<format_name="name"> and
-Line 848 
 sub _eval {
+Line 1108 
 sub _eval {
          $log->debug("eval: ",$code," [",$ret,"]");
-         return $ret || 0;
+         return $ret || undef;
  }
  =head2 _sort_by_order
-Line 918 
 B<This is different from normal Log4perl
+Line 1178 
 B<This is different from normal Log4perl
  also use method names, and not only classes (which are just few)
  to filter logging.
+ =head1 MEMORY USAGE
+ The C<low_mem> option is a double-edged sword. If enabled, WebPAC
+ will run on memory-constrained machines (which don't have enough
+ physical RAM to create the memory structure for the whole source database).
+ If your machine has 512Mb or more of RAM and the database is around 10000 records,
+ memory shouldn't be an issue. If you don't have enough physical RAM, you
+ might consider using virtual memory (if your operating system handles it
+ well, like FreeBSD or Linux) instead of dropping to L<DBM::Deep> to handle
+ the parsed structure of the ISIS database (this is what the C<low_mem> option does).
+ Hitting swap at the end of reading the source database is probably o.k. However,
+ hitting swap before 90% will dramatically decrease performance and you will
+ be better off with C<low_mem> and using the rest of the available memory for
+ the operating system disk cache (Linux is particularly good about this).
+ However, every access to a database record will require disk access, so
+ the generation phase will be 10-100 times slower.
+ Parsed structures are essential - you just have the option to trade RAM
+ (which is fast) for disk space (which is slow). Be sure to have plenty of
+ disk space if you are using C<low_mem> and thus L<DBM::Deep>.
+ However, when WebPAC is running on desktop machines (or laptops :-), it's
+ highly undesirable for the system to start swapping. Using the C<low_mem> option can
+ reduce WebPAC memory usage to around 64Mb for the same database with lookup
+ fields and sorted indexes which stay in RAM. Performance will suffer, but
+ memory usage will really be minimal. It might also be more comfortable to
+ run WebPAC reniced on those machines.
  =cut
 ;

 Legend:



Removed from v.398
 


changed lines


 
Added in v.555
 Legend:



Removed from v.398
 


changed lines


 
Added in v.555
-Removed from v.398
+Added in v.555

	ViewVC Help
Powered by ViewVC 1.1.26