/[wait]/cvs-head/script/index_ora
This is a repository of my old source code, which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Contents of /cvs-head/script/index_ora

Parent Directory Parent Directory | Revision Log Revision Log


Revision 74 - (show annotations)
Fri Mar 8 21:18:51 2002 UTC (22 years, 1 month ago) by laperla
File size: 7468 byte(s)
- much better markup in the docs makes parsing so much easier and more
  reliable.

- New documents added: inx and toc.

- Output of index_ora more helpful and additional option of setting
  $traceALL that allows us to debug what the parser passes on to WAIT.

1 #!/usr/bin/perl -w
2 # -*- Mode: Perl -*-
3 # $Basename$
4 # $Revision: 1.11 $
5 # Author : Ulrich Pfeifer
6 # Created On : Mon Dec 31 13:57:11 2001
7 # Last Modified By: Ulrich Pfeifer
8 # Last Modified On: Fri Jan 4 15:59:20 2002
9 # Language : CPerl
10 #
11 # (C) Copyright 2001, Ulrich Pfeifer
12 #
13
14 use 5.007;
15
16 use strict;
17 use Devel::Peek qw(Dump);
18
19 use File::Path;
20 use DB_File;
21 use Getopt::Long;
22 use Cwd;
23
24 BEGIN {require WAIT::Config;}
25 use WAIT::Database;
26 use WAIT::Parse::Ora;
27 use WAIT::Document::Ora;
28 use WAIT::InvertedIndex;
29 use Data::Dumper;
30
31
# Set the cachesize hint on DB_File's shared BTREE settings object before
# any database is opened.
$DB_BTREE->{'cachesize'} = 200_000;

# Defaults for the command line options; GetOptions() overwrites the
# entries the user supplies.
my %OPT = (
    database => 'oreilly_de_catalog',
    dir      => '/usr/local/apache/data',
    table    => 'ora',
);

# Every option takes a string value.  On a parse error, show a usage line
# that actually names the accepted options instead of a placeholder.
GetOptions(\%OPT,
    'database=s',
    'dir=s',
    'table=s',
) or die "Usage: $0 [--database dbname] [--dir directory] "
       . "[--table name] Document-Rootdirectory\n";
45
# Build a job id of the form YYYY-MM-DD_HH:MM_<pid> from the current local
# time and the process id.
my @localtime = localtime;
$localtime[4] += 1;       # localtime() months are 0-based
$localtime[5] += 1900;    # localtime() years count from 1900
my $jobid = sprintf "%04s-%02s-%02s_%02s:%02s_%d",
    reverse(@localtime[1 .. 5]),    # year, month, day, hour, minute
    $$;
# Create the new database under its job-specific name inside $OPT{dir}.
# The error message reports the name and directory that were actually
# attempted, not just the base name.
my $db = WAIT::Database->create(
    name      => "$OPT{database}-$jobid",
    directory => $OPT{dir},
) or die "Could not create database $OPT{database}-$jobid in $OPT{dir}: $@\n";
53
# Parser object handed to create_table() as "layout" and used to split
# every document into a record.  Direct method call instead of the
# ambiguous indirect-object form "new WAIT::Parse::Ora".
my $layout = WAIT::Parse::Ora->new;

use lib "/usr/local/apache/lib";
use oreilly_de_catalog::wait_filter;

# Filter chains, one array of filter names per kind of inverted index.
# NOTE(review): the OR_* names presumably come from
# oreilly_de_catalog::wait_filter loaded above, and the chain is presumably
# applied left to right -- confirm against WAIT::InvertedIndex.
# $stem and $sound are defined here but not used by any index that this
# script creates.
my $stem  = ['OR_tr_20020124', 'OR_lc_20020125', 'split2', 'stop', 'Stem'];
my $text  = ['OR_tr_20020124', 'OR_lc_20020125', 'split2', 'stop'];
my $wplus = ['split2', 'OR_lc_20020125', 'OR_mixedonly_20020221'];
my $sound = ['OR_tr_20020124', 'OR_lc_20020125', 'split2', 'Soundex'];
my $trigr = ['OR_lc_20020125', 'OR_trigrams_20020125'];
# split6 is better than split13 or split10: it allows them to enter
# shorter sequences when searching.
my $isbn  = ['split6', 'OR_isbn_20020127'];
67
# Working directory at startup; nothing else in this script reads $cwd.
my $cwd = cwd();

# The first non-option argument is the document root directory.
my $droot = shift @ARGV;
$droot or die "Usage: $0 <options> Document-Rootdirectories";

# %D is tied to the document collection under $droot; the tie object is
# kept in $access and later handed to create_table().
my %D;
my $access = tie(%D, 'WAIT::Document::Ora', $droot)
    or die "Couldn't tie to dir $droot: $!\n";
75
# Index plan: one row per field, naming the filter chain(s) that get an
# inverted index on that field.  Row order and chain order are significant
# because they determine the order of the flat invindex list built below.
my @index_plan = (
    [ aboutauthor     => $text, $wplus ],
    [ abouttranslator => $text, $wplus ],
    [ abstract        => $text, $wplus ],
    [ author          => $text         ],
    [ colophon        => $text, $wplus ],
    [ desc            => $text, $wplus ],
    [ inx             => $text, $wplus ],
    [ isbn            => $isbn         ],
    [ subtitle        => $text, $wplus ],
    [ title           => $text, $wplus ],
    [ title_orig      => $text, $wplus ],
    [ toc             => $text, $wplus ],
    [ translator      => $text, $wplus ],
);

# Flatten the plan into the (field => chain, field => chain, ...) list
# that create_table() receives as "invindex".
my @invindex = map {
    my ($field, @chains) = @{$_};
    map { ($field => $_) } @chains;
} @index_plan;

# Main table: the listed attributes plus the inverted indexes above.
my $tb = $db->create_table(
    name     => $OPT{table},
    attr     => [qw(author isbn title headline docid)],
    layout   => $layout,
    access   => $access,
    invindex => \@invindex,
);
$tb or die "Couldn't create table $OPT{table}: $@\n";
110
# Main indexing pass: walk every document in the tied hash %D, parse it
# with $layout, insert the resulting record into $tb and print a progress
# line with timing estimates after each document.
my ($did, $value);
binmode STDOUT, ":utf8";
my $ALL;                # every parsed record so far; filled only when tracing
my $traceALL = 0; # expensive
my $done = 0;
my $todo = keys %D;     # scalar context: number of documents to process
my $lasttimeround = my $starttime = time;

DOC: while (($did, $value) = each %D) {
    my $record = $layout->split($value);

    # The headline is the title with every whitespace run collapsed to a
    # single blank.
    # NOTE(review): a record without a title makes the next two statements
    # warn under -w -- confirm the parser always supplies one.
    my $headline = $record->{title};
    $headline =~ s/\s+/ /sg;
    # printf "%15s %s\n", $record->{isbn}, substr($headline,0,60);
    printf "%15s %s...\n", $did, substr($headline,0,60);

    # The record's own fields come last, so they win over the docid and
    # headline given here if the same key occurs twice.
    $tb->insert('docid' => $did,
                headline => $headline,
                %{$record});
    $done++;

    my $spenttime = time - $starttime;
    my $averagetime = $spenttime/$done;
    my $left = $todo-$done;
    printf("%2d secs, %3d done, %3d left, %4d s done, %5.1f s avg, %4d s left\n",
           time - $lasttimeround,
           $done,
           $left,
           $spenttime,
           $averagetime,
           $left*$averagetime,
          );
    $lasttimeround = time;

    if ($traceALL) { # costs a lot when reaching the 100th file or so
        # Rewrite the complete dump after every document, so the file on
        # disk always reflects everything parsed so far.
        $ALL->{$did} = $record;
        my $dumpfile = "$OPT{dir}/$OPT{database}-$jobid/debug.dump";
        # Lexical handle with checked open/print/close: buffered write
        # errors would otherwise go unnoticed.
        open my $dump_fh, ">:utf8", $dumpfile
            or die "Couldn't open $dumpfile for writing: $!";
        print {$dump_fh} Data::Dumper::Dumper($ALL)
            or die "Couldn't write to $dumpfile: $!";
        close $dump_fh
            or die "Couldn't close $dumpfile: $!";
    }
}
undef $ALL;
# NOTE(review): presumably finalises the table's indexes -- confirm what
# set(top => 1) does in WAIT::Table.
$tb->set(top=>1);
150
# Fallback table: one row per distinct key found in the main table's
# inverted indexes, indexed with the trigram chain.  The creation result
# is checked in the same way as for the main table.
my $tritb = $db->create_table(
    name     => "$OPT{table}_fallback",
    attr     => [qw(docid headline)],   # name "headline" only for sman
    invindex => [ headline => $trigr ],
) or die "Couldn't create table $OPT{table}_fallback: $@\n";

# Collect the distinct keys of every inverted index on every field.
my %dict;
for my $f ($tb->fields) {
    my(@idx) = @{$tb->table->{inverted}{$f} || []};
    for my $idx (@idx) {
        my $name = $idx->name;
        next if $name =~ /(_|\b)(mixedonly|Stem|Soundex)(\b|_)/;
        # irrelevant for alternatives
        my @keys = $idx->keys;
        @dict{@keys} = ();
    }
}

# Keep only the keys that start with "p" and strip that prefix; grep works
# on the copies returned by keys(), so %dict itself is not modified.
# NOTE(review): presumably "p" marks one kind of index entry -- confirm
# against WAIT::InvertedIndex.
my @dictkeys = grep s/^p//, keys %dict;

my $maxdebug = 5;       # dump at most this many non-ASCII keys
for my $headline (@dictkeys) {
    if ($maxdebug && $headline =~ /[^\040-\177]/) {
        Dump($headline);
        $maxdebug--;
    }
    # printf "%s\n", substr($headline,0,60);
    # Each key serves as both the document id and the indexed headline.
    $tritb->insert(docid => $headline, headline => $headline);
}
$tritb->set(top=>1);
$tritb->close;
$tb->close();
$db->close();
184
# Atomically relinking symlink: now we have a new database with a very
# long name "$OPT{database}-$jobid" (e.g.
# oreilly_de_catalog-2002-01-28_16:12_16467) and we want that database
# to be accessible with the oreilly_de_catalog name.
#
# The new link is first created under a temporary, pid-specific name and
# then rename()d over the wanted name, so the old link is replaced in a
# single step.

use File::Spec;
use File::Glob ();

chdir $OPT{dir} or die "Couldn't chdir to $OPT{dir}: $!";
my $dir = "$OPT{database}-$jobid";
my $slwant = File::Spec->catdir($OPT{dir}, $OPT{database});
my $sltmp = File::Spec->catdir($OPT{dir}, "$OPT{database}-$$");
unlink $sltmp; # may fail
symlink $dir, $sltmp or die "Couldn't symlink $dir, $sltmp: $!";
rename $sltmp, $slwant or die "Couldn't rename $sltmp, $slwant: $!";
warn "$slwant now points to $dir";

# Make every */read file below the database world-accessible.  This is done
# in-process rather than through a shell command line, so blanks or shell
# metacharacters in --dir / --database cannot alter the command.
# NOTE(review): mode 0777 is kept from the original; consider whether
# world-writable is really required.
my @readfiles = File::Glob::bsd_glob("$slwant/*/read");
@readfiles or die "No files match $slwant/*/read";
chmod(0777, @readfiles) == @readfiles
    or die "Couldn't chmod 777 $slwant/*/read: $!";

$WAIT::Config = $WAIT::Config; # make perl -w happy
202
203
204 __END__
205 ## ###################################################################
206 ## pod
207 ## ###################################################################
208
209 =head1 NAME
210
211 index_ora - generate a WAIT index for the O'Reilly catalog
212
213 =head1 SYNOPSIS
214
215 B<index_ora>
216 [B<-database> I<dbname>]
217 [B<-dir> I<directory>]
218 [B<-table> I<table name>]
219 I<directory>
220
221 =head1 DESCRIPTION
222
223 =head1 OPTIONS
224
225 =over 5
226
227 =item B<-database> I<dbname>
228
229 Specify database name. Default is C<oreilly_de_catalog>.
230
231 =item B<-dir> I<directory>
232
233 Alternate directory where databases are located. Default is
234 F</usr/local/apache/data>.
235
236 =item B<-table> I<table name>
237
238 Specify an alternate table name. Default is C<ora>.
239
=back

240 =head1 AUTHOR
241
242 Ulrich Pfeifer E<lt>F<pfeifer@wait.de>E<gt>
243

Properties

Name Value
cvs2svn:cvs-rev 1.11

  ViewVC Help
Powered by ViewVC 1.1.26