Add a --stats option to run.pl. With --stats the indexing engine is not used
($use_indexer is cleared, so no indexer object is built), the flag is passed
to the input constructor, save_ds is skipped, and the field-usage statistics
reported by the input are logged.
Review fix on top of the committed change: the relocated sanity check calls
$log->logdie; the line it replaces spelled the method "logide".
Indentation and blank context lines were reconstructed from the hunk counts.

--- trunk/run.pl	2006/05/14 22:24:18	504
+++ trunk/run.pl	2006/05/15 17:23:38	509
@@ -52,9 +52,14 @@
 
 =item --force-set
 
-force conversion C<normalize->path> in C<config.yml> from
+force conversion C<< normalize->path >> in C<config.yml> from
 C<.xml> to C<.pl>
 
+=item --stats
+
+disable indexing and dump statistics about field and subfield
+usage for each input
+
 =back
 
 =cut
@@ -67,6 +72,7 @@
 my $debug = 0;
 my $only_db_name;
 my $force_set = 0;
+my $stats = 0;
 
 GetOptions(
 	"limit=i" => \$limit,
@@ -77,6 +83,7 @@
 	"config" => \$config,
 	"debug" => \$debug,
 	"force-set" => \$force_set,
+	"stats" => \$stats,
 );
 
 $config = LoadFile($config);
@@ -86,9 +93,15 @@
 die "no databases in config file!\n" unless ($config->{databases});
 
 my $log = _new WebPAC::Common()->_get_logger();
+$log->info( "-" x 79 );
 
 my $use_indexer = $config->{use_indexer} || 'hyperestraier';
-$log->info("using $use_indexer indexing engine...");
+if ($stats) {
+	$log->debug("option --stats disables update of indexing engine...");
+	$use_indexer = undef;
+} else {
+	$log->info("using $use_indexer indexing engine...");
+}
 
 my $total_rows = 0;
 my $start_t = time();
@@ -99,29 +112,32 @@
 
 	my $indexer;
 
-	my $indexer_config = $config->{$use_indexer} || $log->logdie("can't find '$use_indexer' part in confguration");
-	$indexer_config->{database} = $database;
-	$indexer_config->{clean} = $clean;
-	$indexer_config->{label} = $db_config->{name};
-
-	if ($use_indexer eq 'hyperestraier') {
-
-		# open Hyper Estraier database
-		use WebPAC::Output::Estraier '0.10';
-		$indexer = new WebPAC::Output::Estraier( %{ $indexer_config } );
-
-	} elsif ($use_indexer eq 'kinosearch') {
-
-		# open KinoSearch
-		use WebPAC::Output::KinoSearch;
-		$indexer_config->{clean} = 1 unless (-e $indexer_config->{index_path});
-		$indexer = new WebPAC::Output::KinoSearch( %{ $indexer_config } );
+	if ($use_indexer) {
+		my $indexer_config = $config->{$use_indexer} || $log->logdie("can't find '$use_indexer' part in confguration");
+		$indexer_config->{database} = $database;
+		$indexer_config->{clean} = $clean;
+		$indexer_config->{label} = $db_config->{name};
+
+		if ($use_indexer eq 'hyperestraier') {
+
+			# open Hyper Estraier database
+			use WebPAC::Output::Estraier '0.10';
+			$indexer = new WebPAC::Output::Estraier( %{ $indexer_config } );
+
+		} elsif ($use_indexer eq 'kinosearch') {
+
+			# open KinoSearch
+			use WebPAC::Output::KinoSearch;
+			$indexer_config->{clean} = 1 unless (-e $indexer_config->{index_path});
+			$indexer = new WebPAC::Output::KinoSearch( %{ $indexer_config } );
 
-	} else {
-		$log->logdie("unknown use_indexer: $use_indexer");
+		} else {
+			$log->logdie("unknown use_indexer: $use_indexer");
+		}
+
+		$log->logdie("can't continue without valid indexer") unless ($indexer);
 	}
 
-	$log->logide("can't continue without valid indexer") unless ($indexer);
 
 	#
 	# now WebPAC::Store
@@ -181,6 +197,7 @@
 			offset => $offset,
 			lookup => $lookup,
 			recode => $input->{recode},
+			stats => $stats,
 		);
 		$log->logdie("can't create input using $input_module") unless ($input);
 
@@ -239,25 +256,38 @@
 				push @{ $row->{'000'} }, $pos;
 			}
 
-			my $ds = $n ? $n->data_structure($row) :
-				WebPAC::Normalize::Set::data_structure(
+
+			my $ds;
+			if ($n) {
+				$ds = $n->data_structure($row);
+			} else {
+				$ds = WebPAC::Normalize::Set::data_structure(
 					row => $row,
 					rules => $rules,
 					lookup => $lookup->lookup_hash,
 				);
 
+				$db->save_ds(
+					id => $mfn,
+					ds => $ds,
+					prefix => $input->{name},
+				) if ($ds && !$stats);
+			}
+
 			$indexer->add(
 				id => $input->{name} . "/" . $mfn,
 				ds => $ds,
 				type => $config->{$use_indexer}->{type},
-			);
+			) if ($indexer);
 
 			$total_rows++;
 		}
 
+		$log->info("statistics of fields usage:\n", $input_db->stats) if ($stats);
+
 	};
 
-	eval { $indexer->finish } if ($indexer->can('finish'));
+	eval { $indexer->finish } if ($indexer && $indexer->can('finish'));
 
 	my $dt = time() - $start_t;
 	$log->info("$total_rows records indexed in " .