/[wait]/cvs-head/script/index_ora
This is a repository of my old source code, which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Annotation of /cvs-head/script/index_ora

Parent Directory Parent Directory | Revision Log Revision Log


Revision 75 - (hide annotations)
Thu Mar 14 17:27:22 2002 UTC (22 years, 2 months ago) by laperla
File size: 8131 byte(s)
- no stop, s/split2/split;

1 ulpfr 55 #!/usr/bin/perl -w
2     # -*- Mode: Perl -*-
3     # $Basename$
4 laperla 75 # $Revision: 1.12 $
5 ulpfr 55 # Author : Ulrich Pfeifer
6     # Created On : Mon Dec 31 13:57:11 2001
7     # Last Modified By: Ulrich Pfeifer
8 ulpfr 62 # Last Modified On: Fri Jan 4 15:59:20 2002
9 ulpfr 55 # Language : CPerl
10     #
11 laperla 73 # (C) Copyright 2001, Ulrich Pfeifer
12 ulpfr 55 #
13    
14 laperla 69 use 5.007;
15    
16 ulpfr 55 use strict;
17 laperla 71 use Devel::Peek qw(Dump);
18 laperla 69
19 ulpfr 55 use File::Path;
20     use DB_File;
21     use Getopt::Long;
22     use Cwd;
23    
24 laperla 69 BEGIN {require WAIT::Config;}
25     use WAIT::Database;
26     use WAIT::Parse::Ora;
27     use WAIT::Document::Ora;
28     use WAIT::InvertedIndex;
29 laperla 74 use Data::Dumper;
30 ulpfr 55
31    
32     $DB_BTREE->{'cachesize'} = 200_000 ;
33    
34 laperla 75 use lib "/usr/local/apache/lib";
35     use lib "/online/www/sites/ora/catalogsearch/run/lib";
36     use oreilly_de_catalog::config;
37     use oreilly_de_catalog::wait_filter;
38    
# Defaults for the command-line options.  Each entry can be overridden with
# the switch of the same name (see the GetOptions call below).
my %OPT = (
    database => 'oreilly_de_catalog',
    dir      => oreilly_de_catalog::config::WAITDIR,
    table    => 'ora',
);

# Root of the catalog documents; handed to WAIT::Document::Ora further down.
my $droot = oreilly_de_catalog::config::CATALOG;

GetOptions(\%OPT,
           'database=s',
           'dir=s',
           'table=s',
          )
    or die "Usage: $0 [-database dbname] [-dir directory] [-table tablename]\n";

# Every run builds a brand-new database.  Its name is the requested database
# name plus a job id made of the start time (YYYY-MM-DD_HH:MM) and the
# process id, e.g. oreilly_de_catalog-2002-01-28_16:12_16467.
my @localtime = localtime;
$localtime[5] += 1900;    # localtime counts years from 1900
$localtime[4]++;          # localtime months are zero-based
my $jobid = sprintf "%04s-%02s-%02s_%02s:%02s_%d", @localtime[5,4,3,2,1], $$;

my $db = WAIT::Database->create(name      => "$OPT{database}-$jobid",
                                directory => $OPT{dir})
    # Report the name that was actually passed to create(), job id included.
    or die "Could not create database $OPT{database}-$jobid: $@\n";
60 ulpfr 55
# Layout object: used as the table layout and, in the indexing loop, to
# split each document into a record.
my $layout = WAIT::Parse::Ora->new;

# Filter pipelines.  Each array lists the filter names handed to WAIT for
# one kind of inverted index.
my $stem  = [qw(OR_tr_20020124 OR_lc_20020125 split2 stop Stem)];
# Earlier variant of $text, kept for reference:
#   OR_tr_20020124 split2 OR_minus_20020311 OR_lc_20020125 split2 stop
my $text  = [qw(OR_tr_20020124 split OR_minus_20020311 OR_lc_20020125)];
my $wplus = [qw(split OR_lc_20020125 OR_mixedonly_20020221)];
my $sound = [qw(OR_tr_20020124 OR_lc_20020125 split2 Soundex)];
my $trigr = [qw(OR_lc_20020125 OR_trigrams_20020125)];
# split6 is better than split13 or split10: it allows them to enter
# shorter sequences when searching.
my $isbn  = [qw(split6 OR_isbn_20020127)];

my $cwd = cwd;

# %D gives hash-style access to the catalog documents below $droot; the
# object returned by tie is also passed to the table as its access object.
my %D;
my $access = tie(%D, 'WAIT::Document::Ora', $droot)
    or die "Couldn't tie to dir $droot: $!\n";

# One row per field: the field name followed by the pipelines to index it
# with.  Row order and pipeline order define the order of the invindex list.
my @index_plan = (
    [aboutauthor     => $text, $wplus],
    [abouttranslator => $text, $wplus],
    [abstract        => $text, $wplus],
    [author          => $text],
    [colophon        => $text, $wplus],
    [desc            => $text, $wplus],
    [inx             => $text, $wplus],
    [isbn            => $isbn],
    [subtitle        => $text, $wplus],
    [title           => $text, $wplus],
    [title_orig      => $text, $wplus],
    [toc             => $text, $wplus],
    [translator      => $text, $wplus],
);

my $tb = $db->create_table(
    name     => $OPT{table},
    attr     => [qw(author isbn title headline docid)],
    layout   => $layout,
    access   => $access,
    invindex => [
        # Flatten the plan into (field, pipeline, field, pipeline, ...).
        map {
            my ($field, @pipelines) = @{$_};
            map { ($field, $_) } @pipelines;
        } @index_plan
    ],
);
die "Couldn't create table $OPT{table}: $@\n" unless $tb;
113    
# Index every document of the catalog into the main table, printing a
# progress line with timing estimates after each one.
binmode STDOUT, ":utf8";
my $ALL;                  # accumulates all records when $traceALL is on
my $traceALL = 0;         # expensive
my $done = 0;
my $todo = keys %D;       # number of documents, for the "left" estimate
my $lasttimeround = my $starttime = time;

DOC: while (my ($did, $value) = each %D) {
    # next unless $did eq "jscook";
    my $record = $layout->split($value);

    # The headline is the title with runs of whitespace collapsed.  A
    # record without a title keeps an undefined headline, but no longer
    # triggers "uninitialized value" warnings here.
    my $headline = $record->{title};
    $headline =~ s/\s+/ /sg if defined $headline;

    # printf "%15s %s\n", $record->{isbn}, substr($headline,0,60);
    printf "%15s %s...\n", $did,
        defined $headline ? substr($headline, 0, 60) : '';

    # %{$record} comes last, so keys in the record override docid/headline.
    $tb->insert('docid' => $did,
                headline => $headline,
                %{$record});

    $done++;
    my $spenttime   = time - $starttime;
    my $averagetime = $spenttime/$done;
    my $left        = $todo-$done;
    printf("%2d secs, %3d done, %3d left, %4d s done, %5.1f s avg, %4d s left\n",
           time - $lasttimeround,
           $done,
           $left,
           $spenttime,
           $averagetime,
           $left*$averagetime,
          );
    $lasttimeround = time;

    if ($traceALL) { # costs a lot when reaching the 100th file or so
        # The complete dump is rewritten after every document.
        $ALL->{$did} = $record;
        my $dumpfile = "$OPT{dir}/$OPT{database}-$jobid/debug.dump";
        open my $dumpfh, ">:utf8", $dumpfile
            or die "Couldn't open $dumpfile: $!";
        print $dumpfh Data::Dumper::Dumper($ALL);
        close $dumpfh or die "Couldn't close debug.dump: $!";
    }
}
undef $ALL;
$tb->set(top=>1);
154 laperla 71
# Build the "<table>_fallback" table: one row per dictionary key of the main
# table, indexed with the trigram pipeline.
my $tritb = $db->create_table(
    name     => "$OPT{table}_fallback",
    attr     => [qw(docid headline)],    # name "headline" only for sman
    invindex => [ headline => $trigr ],
);
# Same check as for the main table; without it a failed create_table only
# shows up later as a method call on an undefined value.
die "Couldn't create table $OPT{table}_fallback: $@\n" unless $tritb;

# Collect the keys of all inverted indexes of the main table, except those
# whose name marks them as mixedonly, Stem or Soundex indexes.
my %dict;
for my $f ($tb->fields) {
    my(@idx) = @{$tb->table->{inverted}{$f} || []};
    for my $idx (@idx) {
        my $name = $idx->name;
        next if $name =~ /(_|\b)(mixedonly|Stem|Soundex)(\b|_)/;
        # irrelevant for alternatives
        my @keys = $idx->keys;
        @dict{@keys} = ();
    }
}

# Keep only keys that start with "p" and strip that prefix.
# NOTE(review): presumably "p" marks real dictionary entries in
# WAIT::InvertedIndex -- confirm against that module.
my @dictkeys = grep s/^p//, keys %dict;

# Debugging aid: dump the internals of at most five keys that contain
# characters outside the \040-\177 range.
my $maxdebug = 5;
for my $headline (@dictkeys) {
    if ($maxdebug && $headline =~ /[^\040-\177]/) {
        Dump $headline;
        $maxdebug--;
    }
    # printf "%s\n", substr($headline,0,60);
    $tritb->insert(docid => $headline, headline => $headline);
}
$tritb->set(top=>1);
$tritb->close or die "Couldn't close table: $!";
$tb->close() or die "Couldn't close table: $!";
$db->close() or die "Couldn't close database: $!";
188 ulpfr 55
# Atomically relinking symlink: now we have a new database with a very
# long name "$OPT{database}-$jobid" (e.g.
# oreilly_de_catalog-2002-01-28_16:12_16467) and we want that database
# to be accessible with the oreilly_de_catalog name.

use File::Spec;

my $dir = "$OPT{database}-$jobid";

# Resolve the index directory to an absolute path *before* changing into
# it.  With a relative -dir, paths built from $OPT{dir} after the chdir
# would point to the wrong place.
my $basedir = File::Spec->rel2abs($OPT{dir});
chdir $basedir or die "Couldn't chdir to $basedir: $!";

# Create the symlink under a temporary name, then rename it over the wanted
# name, so the wanted name never disappears.
my $slwant = File::Spec->catdir($basedir, $OPT{database});
my $sltmp  = File::Spec->catdir($basedir, "$OPT{database}-$$");
unlink $sltmp; # may fail
symlink $dir, $sltmp or die "Couldn't symlink $dir, $sltmp: $!";
rename $sltmp, $slwant or die "Couldn't rename $sltmp, $slwant: $!";
warn "$slwant now points to $dir";

# Equivalent of "chmod 777 $slwant/*/read", done without a shell so that
# unusual characters in the path cannot be misinterpreted.
# NOTE(review): mode 777 is kept from the original; confirm it is needed.
opendir my $dbdh, $slwant or die "Couldn't opendir $slwant: $!";
my @readfiles;
for my $entry (readdir $dbdh) {
    next if $entry =~ /^\./;
    my $readfile = File::Spec->catfile($slwant, $entry, 'read');
    push @readfiles, $readfile if -e $readfile;
}
closedir $dbdh;
@readfiles or die "No read files found below $slwant";
chmod(0777, @readfiles) == @readfiles
    or die "Couldn't chmod 777 the read files below $slwant: $!";

# Remove indexes older than four days.  Only names this script generates
# are candidates: "<database>-<jobid>" directories and leftover
# "<database>-<pid>" temporary symlinks.  Other entries that merely share
# the database name as a prefix are left alone.
my $stale = qr/\A\Q$OPT{database}\E-(?:\d{4}-\d\d-\d\d_\d\d:\d\d_)?\d+\z/;
opendir my $dh, "." or die "Could not opendir .: $!";
for my $dirent (readdir $dh) {
    next unless $dirent =~ $stale;
    next if $dirent eq $dir;            # never remove the index just built
    my $age = -M $dirent;               # undef for e.g. a dangling symlink
    next unless defined $age && $age > 4;
    warn "removing old index $dirent";
    File::Path::rmtree($dirent);
}
closedir $dh;

$WAIT::Config = $WAIT::Config; # make perl -w happy
217    
218    
219     __END__
220     ## ###################################################################
221     ## pod
222     ## ###################################################################
223    
224     =head1 NAME
225    
226     index_ora - generate a WAIT index for the O'Reilly catalog
227    
228     =head1 SYNOPSIS
229    
230     B<index_ora>
231     [B<-database> I<dbname>]
232     [B<-dir> I<directory>]
233     [B<-table> I<table name>]
234     I<directory>
235    
236     =head1 DESCRIPTION
237    
238     =head1 OPTIONS
239    
240     =over 5
241    
242     =item B<-database> I<dbname>
243    
244     Specify the database name. Default is C<oreilly_de_catalog>.
245    
246     =item B<-dir> I<directory>
247    
248     Alternate directory where databases are located. Default is the
249     directory given by C<oreilly_de_catalog::config::WAITDIR>.
250    
251     =item B<-table> I<table name>
252    
253     Specify an alternate table name. Default is C<ora>.
254    

        =back

255     =head1 AUTHOR
256    
257     Ulrich Pfeifer E<lt>F<pfeifer@wait.de>E<gt>
258    

Properties

Name Value
cvs2svn:cvs-rev 1.12

  ViewVC Help
Powered by ViewVC 1.1.26