/[wait]/trunk/script/index_ora
This is a repository of my old source code, which is no longer updated. Go to git.rot13.org for current projects!
ViewVC logotype

Diff of /trunk/script/index_ora

Parent Directory | Revision Log | View Patch

cvs-head/script/index_ora revision 74 by laperla, Fri Mar 8 21:18:51 2002 UTC | trunk/script/index_ora revision 88 by dpavlin, Mon May 24 13:44:01 2004 UTC
# Line 1  Line 1 
1  #!/usr/bin/perl -w  #!/usr/bin/perl -w
2  #                              -*- Mode: Perl -*-  #                              -*- Mode: Perl -*-
3  # $Basename$  # $Basename$
4  # $Revision: 1.11 $  # $Revision: 1.14 $
5  # Author          : Ulrich Pfeifer  # Author          : Ulrich Pfeifer
6  # Created On      : Mon Dec 31 13:57:11 2001  # Created On      : Mon Dec 31 13:57:11 2001
7  # Last Modified By: Ulrich Pfeifer  # Last Modified By: Ulrich Pfeifer
# Line 31  use Data::Dumper; Line 31  use Data::Dumper;
31    
32  $DB_BTREE->{'cachesize'} = 200_000 ;  $DB_BTREE->{'cachesize'} = 200_000 ;
33    
34    use lib "/usr/local/apache/lib";
35    use lib "/online/www/sites/ora/catalogsearch/run/lib";
36    use oreilly_de_catalog::config;
37    use oreilly_de_catalog::wait_filter;
38    
39  my %OPT = (  my %OPT = (
40             database => 'oreilly_de_catalog',             database => 'oreilly_de_catalog',
41             dir      => '/usr/local/apache/data',             dir      => oreilly_de_catalog::config::WAITDIR,
42             table    => 'ora',             table    => 'ora',
43            );            );
44    
45    my $droot = oreilly_de_catalog::config::CATALOG;
46    
47  GetOptions(\%OPT,  GetOptions(\%OPT,
48             'database=s',             'database=s',
49             'dir=s',             'dir=s',
50             'table=s',             'table=s',
51               'verbose!',
52               'debug!',
53            ) || die "Usage: ...\n";            ) || die "Usage: ...\n";
54    
55  my @localtime = localtime;  my @localtime = localtime;
# Line 53  my $db = WAIT::Database->create(name Line 62  my $db = WAIT::Database->create(name
62    
63  my $layout = new WAIT::Parse::Ora;  my $layout = new WAIT::Parse::Ora;
64    
65  use lib "/usr/local/apache/lib";  # my $stem  = ['OR_tr_20020124', 'OR_lc_20020125', 'split2', 'stop', 'Stem'];
66  use oreilly_de_catalog::wait_filter;  # my $text  = ['OR_tr_20020124', 'split2', 'OR_minus_20020311', 'OR_lc_20020125', 'split2', 'stop'];
67    my $text  = ['OR_tr_20020124', 'split', 'OR_minus_20020311', 'OR_lc_20020125'];
68  my $stem  = ['OR_tr_20020124', 'OR_lc_20020125', 'split2', 'stop', 'Stem'];  my $wplus = ['OR_split_20020401', 'OR_lc_20020125', 'OR_mixedonly_20020221'];
69  my $text  = ['OR_tr_20020124', 'OR_lc_20020125', 'split2', 'stop'];  # my $sound = ['OR_tr_20020124', 'OR_lc_20020125', 'split2', 'Soundex'];
 my $wplus = ['split2', 'OR_lc_20020125', 'OR_mixedonly_20020221'];  
 my $sound = ['OR_tr_20020124', 'OR_lc_20020125', 'split2', 'Soundex'];  
70  my $trigr = ['OR_lc_20020125', 'OR_trigrams_20020125'];  my $trigr = ['OR_lc_20020125', 'OR_trigrams_20020125'];
71    
72  # split6 is better than split13 or split10: it allows them to enter  # split6 is better than split13 or split10: it allows them to enter
73  # shorter sequences when searching.  # shorter sequences when searching.
74  my $isbn  = ['split6', 'OR_isbn_20020127'];  my $isbn  = ['split6', 'OR_isbn_20020127'];
75    
76  my $cwd = cwd;  my $cwd = cwd;
77    
 my $droot = shift or die "Usage: $0 <options> Document-Rootdirectories";  
   
78  my %D;  my %D;
79  my $access = tie %D, 'WAIT::Document::Ora', $droot,  my $access = tie %D, 'WAIT::Document::Ora', $droot,
80    or die "Couldn't tie to dir $droot: $!\n";    or die "Couldn't tie to dir $droot: $!\n";
# Line 87  my $tb = $db->create_table(name     => $ Line 93  my $tb = $db->create_table(name     => $
93                              'abstract' => $text,                              'abstract' => $text,
94                              'abstract' => $wplus,                              'abstract' => $wplus,
95                              'author' => $text,                              'author' => $text,
96                                'chapter' => $text,
97                                'chapter' => $wplus,
98                              'colophon' => $text,                              'colophon' => $text,
99                              'colophon' => $wplus,                              'colophon' => $wplus,
100                              'desc'   => $text,                              'desc'   => $text,
# Line 111  die "Couldn't create table $OPT{table}: Line 119  die "Couldn't create table $OPT{table}:
119  my ($did, $value);  my ($did, $value);
120  binmode STDOUT, ":utf8";  binmode STDOUT, ":utf8";
121  my $ALL;  my $ALL;
122  my $traceALL = 0; # expensive  my $traceALL = 0;
123    $traceALL = 1 if $OPT{debug}; # expensive
124  my $done = 0;  my $done = 0;
125  my $todo = keys %D;  my $todo = keys %D;
126  my $lasttimeround = my $starttime = time;  my $lasttimeround = my $starttime = time;
127    
128  DOC: while (($did, $value) = each %D) {  DOC: while (($did, $value) = each %D) {
129      my $superdebug = 0;
130      if ($superdebug && $OPT{debug}) {
131        next unless $did =~ /perltb/;
132      }
133      printf "%15s...(%d/%d)\n", $did, $done, $todo if $OPT{verbose};
134    my $record   = $layout->split($value);    my $record   = $layout->split($value);
135    my $headline = $record->{title};    my $headline = $record->{title};
136    $headline =~ s/\s+/ /sg;    $headline =~ s/\s+/ /sg;
137    # printf "%15s %s\n", $record->{isbn}, substr($headline,0,60);    # printf "%15s %s\n", $record->{isbn}, substr($headline,0,60);
138    printf "%15s %s...\n", $did, substr($headline,0,60);    printf "%15s %s...\n", $did, substr($headline,0,60) if $OPT{verbose};
139      if ($superdebug && $OPT{debug}) {
140        # $record = { chapter => $record->{chapter}};
141      }
142      $done++;
143      next if oreilly_de_catalog::config::DONT_INDEX_JUST_PARSE() ;
144    $tb->insert('docid'  => $did,    $tb->insert('docid'  => $did,
145                headline => $headline,                headline => $headline,
146                %{$record});                %{$record});
   $done++;  
147    my $spenttime = time - $starttime;    my $spenttime = time - $starttime;
148    my $averagetime = $spenttime/$done;    my $averagetime = $spenttime/$done;
149    my $left = $todo-$done;    my $left = $todo-$done;
# Line 136  DOC: while (($did, $value) = each %D) { Line 154  DOC: while (($did, $value) = each %D) {
154                 $spenttime,                 $spenttime,
155                 $averagetime,                 $averagetime,
156                 $left*$averagetime,                 $left*$averagetime,
157                );                ) if $OPT{verbose};
158    $lasttimeround = time;    $lasttimeround = time;
159    if ($traceALL) { # costs a lot when reaching the 100th file or so    if ($traceALL) { # costs a lot when reaching the 100th file or so
160      $ALL->{$did} = $record;      $ALL->{$did} = $record;
161      open F, ">:utf8", "$OPT{dir}/$OPT{database}-$jobid/debug.dump" or die;      open F, ">:utf8", "$OPT{dir}/$OPT{database}-$jobid/debug.dump" or die;
162      print F Data::Dumper::Dumper($ALL);      print F Data::Dumper::Dumper($ALL);
163      close F;      close F  or die "Couldn't close debug.dump: $!";;
164    }    }
165  }  }
166  undef $ALL;  undef $ALL;
# Line 178  for my $headline (@dictkeys) { Line 196  for my $headline (@dictkeys) {
196    $tritb->insert(docid => $headline, headline => $headline);    $tritb->insert(docid => $headline, headline => $headline);
197  }  }
198  $tritb->set(top=>1);  $tritb->set(top=>1);
199  $tritb->close;  $tritb->close or die "Couldn't close table: $!";
200  $tb->close();  $tb->close() or die "Couldn't close table: $!";
201  $db->close();  $db->close() or die "Couldn't close database: $!";
202    
203  # Atomically relinking symlink: now we have a new database with a very  # Atomically relinking symlink: now we have a new database with a very
204  # long name "$OPT{database}-$jobid" (e.g.  # long name "$OPT{database}-$jobid" (e.g.
# Line 195  my $sltmp  = File::Spec->catdir($OPT{dir Line 213  my $sltmp  = File::Spec->catdir($OPT{dir
213  unlink $sltmp; # may fail  unlink $sltmp; # may fail
214  symlink $dir, $sltmp or die "Couldn't symlink $dir, $sltmp: $!";  symlink $dir, $sltmp or die "Couldn't symlink $dir, $sltmp: $!";
215  rename $sltmp, $slwant or die "Couldn't rename $sltmp, $slwant: $!";  rename $sltmp, $slwant or die "Couldn't rename $sltmp, $slwant: $!";
216  warn "$slwant now points to $dir";  print "$slwant now points to $dir\n" if $OPT{verbose};
217  system("chmod 777 $slwant/*/read")==0 or die;  system("chmod 777 $slwant/*/read")==0 or die;
218    
219    opendir DIR, "." or die "Could not opendir .: $!";
220    for my $dirent (readdir DIR) {
221      next if $dirent =~ /^\./;
222      next unless $dirent =~ /^$OPT{database}(.*)/;
223      my $ext = $1 or next;
224      next unless -M $dirent > 4;
225      warn "removing old index $dirent";
226      File::Path::rmtree($dirent);
227    }
228    closedir DIR;
229    
230  $WAIT::Config = $WAIT::Config; # make perl -w happy  $WAIT::Config = $WAIT::Config; # make perl -w happy
231    
232    

Legend:
Removed from v.74  
changed lines
  Added in v.88

  ViewVC Help
Powered by ViewVC 1.1.26