--- cvs-head/script/index_ora 2002/01/27 15:27:38 71 +++ cvs-head/script/index_ora 2002/03/14 17:27:22 75 @@ -1,14 +1,14 @@ #!/usr/bin/perl -w # -*- Mode: Perl -*- # $Basename$ -# $Revision: 1.8 $ +# $Revision: 1.12 $ # Author : Ulrich Pfeifer # Created On : Mon Dec 31 13:57:11 2001 # Last Modified By: Ulrich Pfeifer # Last Modified On: Fri Jan 4 15:59:20 2002 # Language : CPerl # -# (C) Copyright 2001, UUNET Deutschland GmbH, Germany +# (C) Copyright 2001, Ulrich Pfeifer # use 5.007; @@ -26,16 +26,24 @@ use WAIT::Parse::Ora; use WAIT::Document::Ora; use WAIT::InvertedIndex; +use Data::Dumper; $DB_BTREE->{'cachesize'} = 200_000 ; +use lib "/usr/local/apache/lib"; +use lib "/online/www/sites/ora/catalogsearch/run/lib"; +use oreilly_de_catalog::config; +use oreilly_de_catalog::wait_filter; + my %OPT = ( - database => 'DB', - dir => $WAIT::Config->{WAIT_home} || '/tmp', + database => 'oreilly_de_catalog', + dir => oreilly_de_catalog::config::WAITDIR, table => 'ora', ); +my $droot = oreilly_de_catalog::config::CATALOG; + GetOptions(\%OPT, 'database=s', 'dir=s', @@ -52,23 +60,21 @@ my $layout = new WAIT::Parse::Ora; -use lib "/usr/local/apache/lib"; -use oreilly_de_catalog::wait_handler; - -my $stem = ['OR_tr_20020124', 'OR_lc_20020124', 'split2', 'stop', 'Stem']; -my $text = [{ - 'prefix' => ['OR_tr_20020124', 'OR_lc_20020124'], - 'intervall' => ['OR_tr_20020124', 'OR_lc_20020124'], - }, - 'OR_tr_20020124', 'OR_lc_20020124', 'split2', 'stop']; -my $sound = ['OR_tr_20020124', 'OR_lc_20020124', 'split2', 'Soundex']; -my $trigr = ['OR_lc_20020124', 'OR_trigrams_20020125']; +my $stem = ['OR_tr_20020124', 'OR_lc_20020125', 'split2', 'stop', 'Stem']; +# my $text = ['OR_tr_20020124', 'split2', 'OR_minus_20020311', 'OR_lc_20020125', 'split2', 'stop']; +my $text = ['OR_tr_20020124', 'split', 'OR_minus_20020311', 'OR_lc_20020125']; +my $wplus = ['split', 'OR_lc_20020125', 'OR_mixedonly_20020221']; +my $sound = ['OR_tr_20020124', 'OR_lc_20020125', 'split2', 'Soundex']; +my $trigr = ['OR_lc_20020125', 'OR_trigrams_20020125']; +# split6 is better than split13 or split10: it allows them to enter +# shorter sequences when searching. +my $isbn = ['split6', 'OR_isbn_20020127']; my $cwd = cwd; my %D; -my $access = tie %D, 'WAIT::Document::Ora', @ARGV, - or die "Couldn't tie to file: $!\n"; +my $access = tie %D, 'WAIT::Document::Ora', $droot, + or die "Couldn't tie to dir $droot: $!\n"; my $tb = $db->create_table(name => $OPT{table}, attr => ['author', 'isbn', 'title', @@ -77,39 +83,90 @@ access => $access, invindex => [ - 'title' => $stem, - 'about' => $stem, - 'text' => $text, + 'aboutauthor' => $text, + 'aboutauthor' => $wplus, + 'abouttranslator' => $text, + 'abouttranslator' => $wplus, + 'abstract' => $text, + 'abstract' => $wplus, 'author' => $text, 'colophon' => $text, - 'author' => $sound, - 'isbn' => $text, + 'colophon' => $wplus, + 'desc' => $text, + 'desc' => $wplus, + 'inx' => $text, + 'inx' => $wplus, + 'isbn' => $isbn, + 'subtitle' => $text, + 'subtitle' => $wplus, + 'title' => $text, + 'title' => $wplus, + 'title_orig' => $text, + 'title_orig' => $wplus, + 'toc' => $text, + 'toc' => $wplus, + 'translator' => $text, + 'translator' => $wplus, ] ); die "Couldn't create table $OPT{table}: $@\n" unless $tb; my ($did, $value); binmode STDOUT, ":utf8"; -while (($did, $value) = each %D) { +my $ALL; +my $traceALL = 0; # expensive +my $done = 0; +my $todo = keys %D; +my $lasttimeround = my $starttime = time; + +DOC: while (($did, $value) = each %D) { + # next unless $did eq "jscook"; my $record = $layout->split($value); my $headline = $record->{title}; $headline =~ s/\s+/ /sg; - printf "%15s %s\n", $record->{isbn}, substr($headline,0,60); + # printf "%15s %s\n", $record->{isbn}, substr($headline,0,60); + printf "%15s %s...\n", $did, substr($headline,0,60); $tb->insert('docid' => $did, headline => $headline, %{$record}); + $done++; + my $spenttime = time - $starttime; + my $averagetime = $spenttime/$done; + my $left = $todo-$done; + printf("%2d secs, %3d done, %3d left, %4d s done, %5.1f s avg, %4d s left\n", + time - $lasttimeround, + $done, + $left, + $spenttime, + $averagetime, + $left*$averagetime, + ); + $lasttimeround = time; + if ($traceALL) { # costs a lot when reaching the 100th file or so + $ALL->{$did} = $record; + open F, ">:utf8", "$OPT{dir}/$OPT{database}-$jobid/debug.dump" or die; + print F Data::Dumper::Dumper($ALL); + close F or die "Couldn't close debug.dump: $!";; + } } +undef $ALL; $tb->set(top=>1); my $tritb = $db->create_table( name => "$OPT{table}_fallback", - attr => [qw(docid headline)], + attr => [qw(docid headline)], # name + # "headline" + # only for + # sman invindex => [ headline => $trigr ], ); my %dict; for my $f ($tb->fields) { my(@idx) = @{$tb->table->{inverted}{$f} || []}; for my $idx (@idx) { + my $name = $idx->name; + next if $name =~ /(_|\b)(mixedonly|Stem|Soundex)(\b|_)/; + # irrelevant for alternatives my @keys = $idx->keys; @dict{@keys} = (); } @@ -121,23 +178,40 @@ Dump $headline; $maxdebug--; } + # printf "%s\n", substr($headline,0,60); $tritb->insert(docid => $headline, headline => $headline); } $tritb->set(top=>1); -$tritb->close; -$tb->close(); -$db->close(); - -# Now we have a new database with a very long name and we want that -# database to be accessible with the $OPT{database} name +$tritb->close or die "Couldn't close table: $!"; +$tb->close() or die "Couldn't close table: $!"; +$db->close() or die "Couldn't close database: $!"; + +# Atomically relinking symlink: now we have a new database with a very +# long name "$OPT{database}-$jobid" (e.g. +# oreilly_de_catalog-2002-01-28_16:12_16467) and we want that database +# to be accessible with the oreilly_de_catalog name. use File::Spec; -my $long_dir = "$OPT{database}-$jobid"; -my $want_dir = File::Spec->catdir($OPT{dir}, $OPT{database}); -my $prel_slink = File::Spec->catdir($OPT{dir}, "$OPT{database}-$$"); -unlink $prel_slink; # may fail -symlink $long_dir, $prel_slink or die "Could not symlink $long_dir, $prel_slink: $!"; -rename $prel_slink, $want_dir or die "Could not rename $prel_slink, $want_dir: $!"; +chdir $OPT{dir} or die; +my $dir = "$OPT{database}-$jobid"; +my $slwant = File::Spec->catdir($OPT{dir}, $OPT{database}); +my $sltmp = File::Spec->catdir($OPT{dir}, "$OPT{database}-$$"); +unlink $sltmp; # may fail +symlink $dir, $sltmp or die "Couldn't symlink $dir, $sltmp: $!"; +rename $sltmp, $slwant or die "Couldn't rename $sltmp, $slwant: $!"; +warn "$slwant now points to $dir"; +system("chmod 777 $slwant/*/read")==0 or die; + +opendir DIR, "." or die "Could not opendir .: $!"; +for my $dirent (readdir DIR) { + next if $dirent =~ /^\./; + next unless $dirent =~ /^$OPT{database}(.*)/; + my $ext = $1 or next; + next unless -M $dirent > 4; + warn "removing old index $dirent"; + File::Path::rmtree($dirent); +} +closedir DIR; $WAIT::Config = $WAIT::Config; # make perl -w happy