/[wait]/trunk/script/index_ora
This is a repository of my old source code, which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Contents of /trunk/script/index_ora

Parent Directory Parent Directory | Revision Log Revision Log


Revision 109 - (show annotations)
Tue Jul 13 17:50:27 2004 UTC (19 years, 9 months ago) by dpavlin
File size: 8662 byte(s)
pod fixes

1 #!/usr/bin/perl -w
2 # -*- Mode: Perl -*-
3 # $Basename$
4 # $Revision: 1.14 $
5 # Author : Ulrich Pfeifer
6 # Created On : Mon Dec 31 13:57:11 2001
7 # Last Modified By: Ulrich Pfeifer
8 # Last Modified On: Fri Jan 4 15:59:20 2002
9 # Language : CPerl
10 #
11 # (C) Copyright 2001, Ulrich Pfeifer
12 #
13
14 use 5.007;
15
16 use strict;
17 use Devel::Peek qw(Dump);
18
19 use File::Path;
20 use DB_File;
21 use Getopt::Long;
22 use Cwd;
23
24 BEGIN {require WAIT::Config;}
25 use WAIT::Database;
26 use WAIT::Parse::Ora;
27 use WAIT::Document::Ora;
28 use WAIT::InvertedIndex;
29 use Data::Dumper;
30
31
# Set the `cachesize` field on DB_File's shared $DB_BTREE settings object.
# NOTE(review): presumably this is picked up by the BTREE files the WAIT
# modules open later; nothing in this script opens a DB_File directly.
$DB_BTREE->{cachesize} = 200_000;
33
34 use lib "/usr/local/apache/lib";
35 use lib "/online/www/sites/ora/catalogsearch/run/lib";
36 use oreilly_de_catalog::config;
37 use oreilly_de_catalog::wait_filter;
38
# Defaults for the command-line options.  GetOptions() below stores the
# parsed values into this same hash, so an option given on the command
# line replaces the default, and `verbose` / `debug` appear only when set.
my %OPT = (
    database => 'oreilly_de_catalog',
    dir      => oreilly_de_catalog::config::WAITDIR,
    table    => 'ora',
);

# Root of the catalog documents to index.  Taken from the site
# configuration; there is no command-line option for it.
my $droot = oreilly_de_catalog::config::CATALOG;

# `or` (not `||`) so the die clearly belongs to the GetOptions() call, and
# a usage line that actually lists the accepted options.
GetOptions(
    \%OPT,
    'database=s',
    'dir=s',
    'table=s',
    'verbose!',
    'debug!',
) or die "Usage: $0 [-database NAME] [-dir DIRECTORY] [-table NAME] "
       . "[-[no]verbose] [-[no]debug]\n";
54
# Build a job id of the form YYYY-MM-DD_HH:MM_PID from the current local
# time and this process's id; it makes the new database's name unique.
my @localtime = localtime;
$localtime[4] += 1;       # month is 0-based
$localtime[5] += 1900;    # year is counted from 1900
my $jobid = sprintf "%04s-%02s-%02s_%02s:%02s_%d",
    reverse(@localtime[1 .. 5]),    # year, month, day, hour, minute
    $$;

# The index is built under the job-specific name; the script switches the
# plain database name over to it only after the build has finished.
my $db = WAIT::Database->create(
    name      => "$OPT{database}-$jobid",
    directory => $OPT{dir},
) or die "Could not create database $OPT{database}: $@\n";
62
# Parser that splits a catalog document into named fields.  Written as a
# direct method call instead of the indirect-object form `new CLASS`,
# which Perl can mis-parse.
my $layout = WAIT::Parse::Ora->new;

# Filter pipelines handed to the inverted indexes below.  Each one is a
# list of filter names.
# NOTE(review): presumably applied left to right by WAIT::InvertedIndex --
# confirm against that module.

# my $stem = ['OR_tr_20020124', 'OR_lc_20020125', 'split2', 'stop', 'Stem'];
# my $text = ['OR_tr_20020124', 'split2', 'OR_minus_20020311', 'OR_lc_20020125', 'split2', 'stop'];
my $text  = ['OR_tr_20020124', 'split', 'OR_minus_20020311', 'OR_lc_20020125'];
my $wplus = ['OR_split_20020401', 'OR_lc_20020125', 'OR_mixedonly_20020221'];
# my $sound = ['OR_tr_20020124', 'OR_lc_20020125', 'split2', 'Soundex'];
my $trigr = ['OR_lc_20020125', 'OR_trigrams_20020125'];

# split6 is better than split13 or split10: it allows them to enter
# shorter sequences when searching.
my $isbn = ['split6', 'OR_isbn_20020127'];
75
# Directory the script was started from.
my $cwd = cwd();

# %D is the tied view of the catalog under $droot: the main loop reads
# (document id, document) pairs from it.  The object returned by tie() is
# kept so it can be passed to create_table() as the table's `access`.
my %D;
my $access = tie(%D, 'WAIT::Document::Ora', $droot)
    or die "Couldn't tie to dir $droot: $!\n";
81
# Fields that get an inverted index, in the order the indexes are declared.
my @indexed_fields = qw(
    aboutauthor abouttranslator abstract author chapter colophon desc
    inx isbn subtitle title title_orig toc translator
);

# Every field is indexed once with the $text pipeline and once with the
# $wplus pipeline, except for the two fields listed here.
my %pipelines_for = (
    author => [$text],
    isbn   => [$isbn],
);

# Flatten to the (field => pipeline, field => pipeline, ...) list that
# create_table() takes as `invindex`.
my @invindex;
for my $field (@indexed_fields) {
    my $pipelines = $pipelines_for{$field} || [ $text, $wplus ];
    push @invindex, $field => $_ for @{$pipelines};
}

# The main table: documents are parsed with $layout and read through
# $access (the object behind the tied %D).
my $tb = $db->create_table(
    name     => $OPT{table},
    attr     => [qw(author isbn title headline docid)],
    layout   => $layout,
    access   => $access,
    invindex => [@invindex],
);
$tb or die "Couldn't create table $OPT{table}: $@\n";
118
# Main indexing loop: parse every document in %D and insert it into $tb,
# reporting progress when -verbose is given.
my ($did, $value);
binmode STDOUT, ":utf8";
my $ALL;                          # all parsed records, filled only with -debug
my $traceALL = 0;
$traceALL = 1 if $OPT{debug};     # expensive
my $done = 0;
my $todo = keys %D;               # number of documents, for progress output
my $lasttimeround = my $starttime = time;

DOC: while (($did, $value) = each %D) {
    # Developer switch: set to 1 (together with -debug) to restrict the
    # run to documents whose id matches /perltb/.
    my $superdebug = 0;
    if ($superdebug && $OPT{debug}) {
        next unless $did =~ /perltb/;
    }
    printf "%15s...(%d/%d)\n", $did, $done, $todo if $OPT{verbose};
    my $record   = $layout->split($value);
    my $headline = $record->{title};
    $headline =~ s/\s+/ /sg;      # collapse whitespace runs, newlines included
    # printf "%15s %s\n", $record->{isbn}, substr($headline,0,60);
    printf "%15s %s...\n", $did, substr($headline,0,60) if $OPT{verbose};
    if ($superdebug && $OPT{debug}) {
        # $record = { chapter => $record->{chapter}};
    }
    $done++;
    # Configuration switch for parse-only runs: the document is counted
    # as done but not inserted.
    next if oreilly_de_catalog::config::DONT_INDEX_JUST_PARSE() ;
    $tb->insert('docid' => $did,
                headline => $headline,
                %{$record});
    my $spenttime   = time - $starttime;
    my $averagetime = $spenttime/$done;    # $done >= 1 here
    my $left        = $todo-$done;
    printf("%2d secs, %3d done, %3d left, %4d s done, %5.1f s avg, %4d s left\n",
           time - $lasttimeround,
           $done,
           $left,
           $spenttime,
           $averagetime,
           $left*$averagetime,
          ) if $OPT{verbose};
    $lasttimeround = time;
    if ($traceALL) { # costs a lot when reaching the 100th file or so
        # The complete dump is rewritten after every document, so it is
        # current even if a later document kills the run.
        $ALL->{$did} = $record;
        my $dumpfile = "$OPT{dir}/$OPT{database}-$jobid/debug.dump";
        # Lexical handle, and every failure reports the file and $!.
        open my $dump_fh, ">:utf8", $dumpfile
            or die "Couldn't open $dumpfile for writing: $!";
        print {$dump_fh} Data::Dumper::Dumper($ALL)
            or die "Couldn't write $dumpfile: $!";
        close $dump_fh or die "Couldn't close debug.dump: $!";
    }
}
undef $ALL;
$tb->set(top=>1);
168
# Second table "<table>_fallback": one row per term taken from the main
# table's inverted indexes, itself indexed with the trigram pipeline.
my $tritb = $db->create_table(
    name     => "$OPT{table}_fallback",
    attr     => [qw(docid headline)],    # name "headline" only for sman
    invindex => [ headline => $trigr ],
);
# Checked like the main table, so a failed create is reported here rather
# than as a method call on an undefined value further down.
die "Couldn't create table $OPT{table}_fallback: $@\n" unless $tritb;

# Collect the distinct keys of the main table's inverted indexes.
my %dict;
for my $f ($tb->fields) {
    my(@idx) = @{$tb->table->{inverted}{$f} || []};
    for my $idx (@idx) {
        my $name = $idx->name;
        next if $name =~ /(_|\b)(mixedonly|Stem|Soundex)(\b|_)/;
        # irrelevant for alternatives
        my @keys = $idx->keys;
        @dict{@keys} = ();
    }
}

# Keep only the keys that start with "p", with that leading "p" removed.
# NOTE(review): the "p" looks like a key-type prefix used inside the WAIT
# index files -- confirm in WAIT::InvertedIndex.
my @dictkeys = grep s/^p//, keys %dict;

my $maxdebug = 5;    # dump at most this many non-ASCII terms
for my $headline (@dictkeys) {
    if ($maxdebug && $headline =~ /[^\040-\177]/) {
        Dump $headline;    # Devel::Peek dump of the scalar's internals
        $maxdebug--;
    }
    # printf "%s\n", substr($headline,0,60);
    $tritb->insert(docid => $headline, headline => $headline);
}
$tritb->set(top=>1);
$tritb->close or die "Couldn't close table: $!";
$tb->close() or die "Couldn't close table: $!";
$db->close() or die "Couldn't close database: $!";
202
# Atomically relinking symlink: now we have a new database with a very
# long name "$OPT{database}-$jobid" (e.g.
# oreilly_de_catalog-2002-01-28_16:12_16467) and we want that database
# to be accessible with the oreilly_de_catalog name.
#
# A temporary symlink is created and then rename()d over the wanted name.

use File::Spec;

# Make the directory absolute *before* changing into it.  With a relative
# -dir the paths below would otherwise be resolved relative to the new
# working directory and point to the wrong place.
my $basedir = File::Spec->rel2abs($OPT{dir});
chdir $basedir or die "Couldn't chdir to $basedir: $!";

my $dir    = "$OPT{database}-$jobid";    # relative link target
my $slwant = File::Spec->catdir($basedir, $OPT{database});
my $sltmp  = File::Spec->catdir($basedir, "$OPT{database}-$$");
unlink $sltmp; # may fail
symlink $dir, $sltmp or die "Couldn't symlink $dir, $sltmp: $!";
rename $sltmp, $slwant or die "Couldn't rename $sltmp, $slwant: $!";
print "$slwant now points to $dir\n" if $OPT{verbose};

# chmod 777 on <database>/*/read.  Done with opendir and the chmod
# builtin rather than a shell command, so unusual characters in the
# directory name cannot be interpreted by a shell.  As before, it is
# fatal when nothing matches or any chmod fails.
# NOTE(review): mode 777 is kept from the original; confirm that
# world-writable is really required for these entries.
opendir my $dbdir, $slwant or die "Couldn't opendir $slwant: $!";
my @read_files = grep { -e $_ }
                 map  { File::Spec->catfile($slwant, $_, 'read') }
                 grep { !/^\./ } readdir $dbdir;
closedir $dbdir;
@read_files or die "No $slwant/*/read found to chmod";
chmod(0777, @read_files) == @read_files
    or die "Couldn't chmod 777 $slwant/*/read: $!";
218
# Remove indexes left over from earlier runs.  This scans the current
# directory, so it relies on the chdir into the database directory done
# further up.
opendir my $dh, "." or die "Could not opendir .: $!";
for my $dirent (readdir $dh) {
    next if $dirent =~ /^\./;
    # Only entries this script creates: the database name followed by "-"
    # and a suffix.  The name is quoted so it matches literally, and the
    # "-" keeps the plain <database> link and unrelated entries that
    # merely share the prefix out of reach of rmtree.
    next unless $dirent =~ /^\Q$OPT{database}\E-./;
    # -M is the age in days, measured at script start; it is undef when
    # the entry cannot be stat'ed (e.g. a dangling symlink).
    my $age = -M $dirent;
    next unless defined $age && $age > 4;
    warn "removing old index $dirent";
    File::Path::rmtree($dirent);
}
closedir $dh;

$WAIT::Config = $WAIT::Config; # make perl -w happy
231
232
233 __END__
234 ## ###################################################################
235 ## pod
236 ## ###################################################################
237
238 =head1 NAME
239
240 index_ora - generate a WAIT index for the O'Reilly catalog
241
242 =head1 SYNOPSIS
243
244 B<index_ora>
245 [B<-database> I<dbname>]
246 [B<-dir> I<directory>]
247 [B<-table> I<table name>]
248 I<directory>
249
250 =head1 DESCRIPTION
251
252 =head1 OPTIONS
253
254 =over 5
255
256 =item B<-database> I<dbname>
257
258 Specify the database name. Default is C<oreilly_de_catalog>.
259
260 =item B<-dir> I<directory>
261
262 Alternate directory where databases are located. Default is the
263 directory given by C<oreilly_de_catalog::config::WAITDIR>.
264
265 =item B<-table> I<table name>
266
267 Specify an alternate table name. Default is C<ora>.
268
269 =back
270
271 =head1 AUTHOR
272
273 Ulrich Pfeifer E<lt>F<pfeifer@wait.de>E<gt>
274

Properties

Name Value
cvs2svn:cvs-rev 1.14

  ViewVC Help
Powered by ViewVC 1.1.26