/[wait]/trunk/script/index_ora

This is a repository of my old source code, which isn't updated any more. Go to git.rot13.org for current projects!

Diff of /trunk/script/index_ora

Parent Directory | Revision Log | View Patch

-cvs-head/script/index_ora
revision 65 by laperla,
Wed Jan 23 12:22:54 2002 UTC
+trunk/script/index_ora
revision 88 by dpavlin,
Mon May 24 13:44:01 2004 UTC
 Line 1
  #!/usr/bin/perl -w
  #                              -*- Mode: Perl -*-
  # $Basename$
- # $Revision: 1.4 $
+ # $Revision: 1.14 $
  # Author          : Ulrich Pfeifer
  # Created On      : Mon Dec 31 13:57:11 2001
  # Last Modified By: Ulrich Pfeifer
  # Last Modified On: Fri Jan  4 15:59:20 2002
  # Language        : CPerl
  #
- # (C) Copyright 2001, UUNET Deutschland GmbH, Germany
+ # (C) Copyright 2001, Ulrich Pfeifer
  #
+ use 5.007;
  use strict;
+ use Devel::Peek qw(Dump);
  use File::Path;
  use DB_File;
  use Getopt::Long;
  use Cwd;
- require WAIT::Config;
+ BEGIN {require WAIT::Config;}
- require WAIT::Database;
+ use WAIT::Database;
- require WAIT::Parse::Ora;
+ use WAIT::Parse::Ora;
- require WAIT::Document::Ora;
+ use WAIT::Document::Ora;
- require WAIT::InvertedIndex;
+ use WAIT::InvertedIndex;
+ use Data::Dumper;
  $DB_BTREE->{'cachesize'} = 200_000 ;
- my %OPT = (clean    => 0,
+ use lib "/usr/local/apache/lib";
-            database => 'DB',
+ use lib "/online/www/sites/ora/catalogsearch/run/lib";
-            dir      => $WAIT::Config->{WAIT_home} || '/tmp',
+ use oreilly_de_catalog::config;
+ use oreilly_de_catalog::wait_filter;
+ my %OPT = (
+            database => 'oreilly_de_catalog',
+            dir      => oreilly_de_catalog::config::WAITDIR,
             table    => 'ora',
            );
+ my $droot = oreilly_de_catalog::config::CATALOG;
  GetOptions(\%OPT,
-            'clean!',
             'database=s',
             'dir=s',
             'table=s',
+            'verbose!',
+            'debug!',
            ) || die "Usage: ...\n";
- if ($OPT{clean} and -d "$OPT{dir}/$OPT{database}") {
+ my @localtime = localtime;
-   my $tmp = WAIT::Database->open(name        => $OPT{database},
+ $localtime[5] += 1900;
-                                  'directory' => $OPT{dir})
+ $localtime[4]++;
-     or die "Could not open table $OPT{table}: $@\n";
+ my $jobid = sprintf "%04s-%02s-%02s_%02s:%02s_%d", @localtime[5,4,3,2,1], $$;
-   my $tbl = $tmp->table(name => $OPT{table});
+ my $db = WAIT::Database->create(name      => "$OPT{database}-$jobid",
-   $tbl->drop if $tbl;
+                                 directory => $OPT{dir})
-   rmtree("$OPT{dir}/$OPT{database}/$OPT{table}", 1, 1)
+     or die "Could not create database $OPT{database}: $@\n";
-     if -d "$OPT{dir}/$OPT{database}/$OPT{table}";
-   $tmp->close;
- }
- my $db;
- unless (-d "$OPT{dir}/$OPT{database}") {
-   $db = WAIT::Database->create(name       => $OPT{database},
-                               'directory' => $OPT{dir})
-     or die "Could not open database $OPT{database}: $@\n";
- }
- else {
-   $db = WAIT::Database->open(name        => $OPT{database},
-                              'directory' => $OPT{dir})
-     or die "Could not open table $OPT{table}: $@\n";
- }
  my $layout = new WAIT::Parse::Ora;
- my $stem  = ['isotr', 'isolc', 'split2', 'stop', 'Stem'];
+ # my $stem  = ['OR_tr_20020124', 'OR_lc_20020125', 'split2', 'stop', 'Stem'];
- my $text  = [{
+ # my $text  = ['OR_tr_20020124', 'split2', 'OR_minus_20020311', 'OR_lc_20020125', 'split2', 'stop'];
-               'prefix'    => ['isotr', 'isolc'],
+ my $text  = ['OR_tr_20020124', 'split', 'OR_minus_20020311', 'OR_lc_20020125'];
-               'intervall' => ['isotr', 'isolc'],
+ my $wplus = ['OR_split_20020401', 'OR_lc_20020125', 'OR_mixedonly_20020221'];
-              },
+ # my $sound = ['OR_tr_20020124', 'OR_lc_20020125', 'split2', 'Soundex'];
-              'isotr', 'isolc', 'split2', 'stop'];
+ my $trigr = ['OR_lc_20020125', 'OR_trigrams_20020125'];
- my $sound = ['isotr', 'isolc', 'split2', 'Soundex'],;
+ # split6 is better than split13 or split10: it allows users to enter
+ # shorter sequences when searching.
+ my $isbn  = ['split6', 'OR_isbn_20020127'];
  my $cwd = cwd;
  my %D;
- my $access = tie %D, 'WAIT::Document::Ora', @ARGV,
+ my $access = tie %D, 'WAIT::Document::Ora', $droot,
-   or die "Couldn't tie to file: $!\n";
+   or die "Couldn't tie to dir $droot: $!\n";
  my $tb = $db->create_table(name     => $OPT{table},
                             attr     => ['author', 'isbn', 'title',
-Line 85 
 my $tb = $db->create_table(name     => $
+Line 86 
 my $tb = $db->create_table(name     => $
                             access   => $access,
                             invindex =>
                             [
-                             'title'  => $stem,
+                             'aboutauthor'  => $text,
-                             'about'  => $stem,
+                             'aboutauthor'  => $wplus,
-                             'text'   => $text,
+                             'abouttranslator'  => $text,
+                             'abouttranslator'  => $wplus,
+                             'abstract' => $text,
+                             'abstract' => $wplus,
                              'author' => $text,
+                             'chapter' => $text,
+                             'chapter' => $wplus,
                              'colophon' => $text,
-                             'author' => $sound,
+                             'colophon' => $wplus,
-                             'isbn'   => $text,
+                             'desc'   => $text,
+                             'desc'   => $wplus,
+                             'inx'   => $text,
+                             'inx'   => $wplus,
+                             'isbn'   => $isbn,
+                             'subtitle'  => $text,
+                             'subtitle'  => $wplus,
+                             'title'  => $text,
+                             'title'  => $wplus,
+                             'title_orig'  => $text,
+                             'title_orig'  => $wplus,
+                             'toc'   => $text,
+                             'toc'   => $wplus,
+                             'translator'  => $text,
+                             'translator'  => $wplus,
                             ]
                            );
  die "Couldn't create table $OPT{table}: $@\n" unless $tb;
  my ($did, $value);
- while (($did, $value) = each %D) {
+ binmode STDOUT, ":utf8";
+ my $ALL;
+ my $traceALL = 0;
+ $traceALL = 1 if $OPT{debug}; # expensive
+ my $done = 0;
+ my $todo = keys %D;
+ my $lasttimeround = my $starttime = time;
+ DOC: while (($did, $value) = each %D) {
+   my $superdebug = 0;
+   if ($superdebug && $OPT{debug}) {
+     next unless $did =~ /perltb/;
+   }
+   printf "%15s...(%d/%d)\n", $did, $done, $todo if $OPT{verbose};
    my $record   = $layout->split($value);
    my $headline = $record->{title};
    $headline =~ s/\s+/ /sg;
-   printf "%15s %s\n", $record->{isbn}, substr($headline,0,60);
+   # printf "%15s %s\n", $record->{isbn}, substr($headline,0,60);
+   printf "%15s %s...\n", $did, substr($headline,0,60) if $OPT{verbose};
+   if ($superdebug && $OPT{debug}) {
+     # $record = { chapter => $record->{chapter}};
+   }
+   $done++;
+   next if oreilly_de_catalog::config::DONT_INDEX_JUST_PARSE() ;
    $tb->insert('docid'  => $did,
                headline => $headline,
                %{$record});
+   my $spenttime = time - $starttime;
+   my $averagetime = $spenttime/$done;
+   my $left = $todo-$done;
+   printf("%2d secs, %3d done, %3d left, %4d s done, %5.1f s avg, %4d s left\n",
+                time - $lasttimeround,
+                $done,
+                $left,
+                $spenttime,
+                $averagetime,
+                $left*$averagetime,
+               ) if $OPT{verbose};
+   $lasttimeround = time;
+   if ($traceALL) { # costs a lot when reaching the 100th file or so
+     $ALL->{$did} = $record;
+     open F, ">:utf8", "$OPT{dir}/$OPT{database}-$jobid/debug.dump" or die;
+     print F Data::Dumper::Dumper($ALL);
+     close F  or die "Couldn't close debug.dump: $!";;
+   }
  }
+ undef $ALL;
  $tb->set(top=>1);
- $tb->close();
- $db->close();
+ my $tritb = $db->create_table(
+                               name => "$OPT{table}_fallback",
+                               attr => [qw(docid headline)], # name
+                                                             # "headline"
+                                                             # only for
+                                                             # sman
+                               invindex => [ headline => $trigr ],
+                              );
+ my %dict;
+ for my $f ($tb->fields) {
+   my(@idx) = @{$tb->table->{inverted}{$f} || []};
+   for my $idx (@idx) {
+     my $name = $idx->name;
+     next if $name =~ /(_|\b)(mixedonly|Stem|Soundex)(\b|_)/;
+               # irrelevant for alternatives
+     my @keys = $idx->keys;
+     @dict{@keys} = ();
+   }
+ }
+ my @dictkeys = grep s/^p//, keys %dict;
+ my $maxdebug = 5;
+ for my $headline (@dictkeys) {
+   if ($maxdebug && $headline =~ /[^\040-\177]/) {
+     Dump $headline;
+     $maxdebug--;
+   }
+   # printf "%s\n", substr($headline,0,60);
+   $tritb->insert(docid => $headline, headline => $headline);
+ }
+ $tritb->set(top=>1);
+ $tritb->close or die "Couldn't close table: $!";
+ $tb->close() or die "Couldn't close table: $!";
+ $db->close() or die "Couldn't close database: $!";
+ # Atomically relinking symlink: now we have a new database with a very
+ # long name "$OPT{database}-$jobid" (e.g.
+ # oreilly_de_catalog-2002-01-28_16:12_16467) and we want that database
+ # to be accessible with the oreilly_de_catalog name.
+ use File::Spec;
+ chdir $OPT{dir} or die;
+ my $dir    = "$OPT{database}-$jobid";
+ my $slwant = File::Spec->catdir($OPT{dir}, $OPT{database});
+ my $sltmp  = File::Spec->catdir($OPT{dir}, "$OPT{database}-$$");
+ unlink $sltmp; # may fail
+ symlink $dir, $sltmp or die "Couldn't symlink $dir, $sltmp: $!";
+ rename $sltmp, $slwant or die "Couldn't rename $sltmp, $slwant: $!";
+ print "$slwant now points to $dir\n" if $OPT{verbose};
+ system("chmod 777 $slwant/*/read")==0 or die;
+ opendir DIR, "." or die "Could not opendir .: $!";
+ for my $dirent (readdir DIR) {
+   next if $dirent =~ /^\./;
+   next unless $dirent =~ /^$OPT{database}(.*)/;
+   my $ext = $1 or next;
+   next unless -M $dirent > 4;
+   warn "removing old index $dirent";
+   File::Path::rmtree($dirent);
+ }
+ closedir DIR;
  $WAIT::Config = $WAIT::Config; # make perl -w happy
-Line 125 
 index_ora - generate an WAIT index for O
+Line 242 
 index_ora - generate an WAIT index for O
  =head1 SYNOPSIS
  B<index_ora>
- [B<-clean>] [B<-noclean>]
  [B<-database> I<dbname>]
  [B<-dir> I<directory>]
  [B<-table> I<table name>]
-Line 137 
 I<directory>
+Line 253 
 I<directory>
  =over 5
- =item B<-clean> / B<-noclean>
- Clean the table before indexing. Default is B<off>.
  =item B<-database> I<dbname>
  Specify database name. Default is F<DB>.

Legend:
-Removed from v.65
 changed lines
+Added in v.88

	ViewVC Help
Powered by ViewVC 1.1.26