/[wait]/trunk/script/index_ora
This is a repository of my old source code, which is no longer updated. Go to git.rot13.org for current projects!
ViewVC logotype

Annotation of /trunk/script/index_ora

Parent Directory Parent Directory | Revision Log Revision Log


Revision 109 - (hide annotations)
Tue Jul 13 17:50:27 2004 UTC (19 years, 10 months ago) by dpavlin
File size: 8662 byte(s)
pod fixes

1 ulpfr 55 #!/usr/bin/perl -w
2     # -*- Mode: Perl -*-
3     # $Basename$
4 laperla 77 # $Revision: 1.14 $
5 ulpfr 55 # Author : Ulrich Pfeifer
6     # Created On : Mon Dec 31 13:57:11 2001
7     # Last Modified By: Ulrich Pfeifer
8 ulpfr 62 # Last Modified On: Fri Jan 4 15:59:20 2002
9 ulpfr 55 # Language : CPerl
10     #
11 laperla 73 # (C) Copyright 2001, Ulrich Pfeifer
12 ulpfr 55 #
13    
14 laperla 69 use 5.007;
15    
16 ulpfr 55 use strict;
17 laperla 71 use Devel::Peek qw(Dump);
18 laperla 69
19 ulpfr 55 use File::Path;
20     use DB_File;
21     use Getopt::Long;
22     use Cwd;
23    
24 laperla 69 BEGIN {require WAIT::Config;}
25     use WAIT::Database;
26     use WAIT::Parse::Ora;
27     use WAIT::Document::Ora;
28     use WAIT::InvertedIndex;
29 laperla 74 use Data::Dumper;
30 ulpfr 55
31    
32     $DB_BTREE->{'cachesize'} = 200_000 ;
33    
34 laperla 75 use lib "/usr/local/apache/lib";
35     use lib "/online/www/sites/ora/catalogsearch/run/lib";
36     use oreilly_de_catalog::config;
37     use oreilly_de_catalog::wait_filter;
38    
# Default settings.  Each key can be overridden by the command-line
# option of the same name (see GetOptions below).
my %OPT = (
           database => 'oreilly_de_catalog',
           dir      => oreilly_de_catalog::config::WAITDIR,
           table    => 'ora',
          );

# Location handed to WAIT::Document::Ora when the catalog is tied.
my $droot = oreilly_de_catalog::config::CATALOG;

# Parse the command line into %OPT.  "verbose" and "debug" are negatable
# boolean switches.  On a parse error, print a usage line that actually
# lists the accepted options instead of a placeholder.
GetOptions(\%OPT,
           'database=s',
           'dir=s',
           'table=s',
           'verbose!',
           'debug!',
          )
  or die "Usage: $0 [--database NAME] [--dir DIRECTORY] [--table NAME] "
       . "[--[no]verbose] [--[no]debug]\n";
54    
# Build a job id of the form YYYY-MM-DD_HH:MM_PID so that every run
# writes into its own, freshly named database.
my @localtime = localtime;
$localtime[5] += 1900;          # localtime counts years from 1900
$localtime[4]++;                # localtime months are zero-based
my $jobid = sprintf "%04d-%02d-%02d_%02d:%02d_%d", @localtime[5,4,3,2,1], $$;

# Create the database under its job-specific name.  The error message
# reports the name and directory that were actually used.
# NOTE(review): assumes WAIT::Database->create leaves its reason in $@ -- confirm.
my $dbname = "$OPT{database}-$jobid";
my $db = WAIT::Database->create(name      => $dbname,
                                directory => $OPT{dir})
  or die "Could not create database $dbname in $OPT{dir}: $@\n";
# Parser object; its split() method turns a raw document into a record
# (used in the indexing loop) and it is passed to create_table as layout.
my $layout = WAIT::Parse::Ora->new;

# Filter pipelines: each array lists the filter names applied, in order,
# to a field before it is stored in an inverted index.
# my $stem = ['OR_tr_20020124', 'OR_lc_20020125', 'split2', 'stop', 'Stem'];
# my $text = ['OR_tr_20020124', 'split2', 'OR_minus_20020311', 'OR_lc_20020125', 'split2', 'stop'];
my $text  = ['OR_tr_20020124', 'split', 'OR_minus_20020311', 'OR_lc_20020125'];
my $wplus = ['OR_split_20020401', 'OR_lc_20020125', 'OR_mixedonly_20020221'];
# my $sound = ['OR_tr_20020124', 'OR_lc_20020125', 'split2', 'Soundex'];
my $trigr = ['OR_lc_20020125', 'OR_trigrams_20020125'];

# split6 is better than split13 or split10: it allows them to enter
# shorter sequences when searching.
my $isbn = ['split6', 'OR_isbn_20020127'];

# Working directory at startup (not referenced again in the visible script).
my $cwd = cwd;

# Tie %D to the catalog; the tie object is kept because create_table
# receives it as the "access" argument.
my %D;
my $access = tie(%D, 'WAIT::Document::Ora', $droot)
  or die "Couldn't tie to dir $droot: $!\n";
# Inverted-index specification for the main table.  Each entry names a
# field followed by the filter pipelines it is indexed with; the entries
# are flattened into the (field => pipeline, ...) list that create_table
# receives, preserving order.
my $invindex = [
  map {
    my ($field, @pipelines) = @$_;
    map { ($field, $_) } @pipelines;
  } (
     [aboutauthor     => $text, $wplus],
     [abouttranslator => $text, $wplus],
     [abstract        => $text, $wplus],
     [author          => $text],
     [chapter         => $text, $wplus],
     [colophon        => $text, $wplus],
     [desc            => $text, $wplus],
     [inx             => $text, $wplus],
     [isbn            => $isbn],
     [subtitle        => $text, $wplus],
     [title           => $text, $wplus],
     [title_orig      => $text, $wplus],
     [toc             => $text, $wplus],
     [translator      => $text, $wplus],
    )
];

# Create the main table of the new database.
my $tb = $db->create_table(
                           name     => $OPT{table},
                           attr     => [qw(author isbn title headline docid)],
                           layout   => $layout,
                           access   => $access,
                           invindex => $invindex,
                          );
$tb or die "Couldn't create table $OPT{table}: $@\n";
118    
# Insert every document of the tied catalog into the main table.  With
# --verbose, progress and timing figures are printed per document; with
# --debug, all parsed records are additionally dumped to debug.dump.
my ($did, $value);
binmode STDOUT, ":utf8";
my $ALL;                        # accumulates every record when tracing
my $traceALL = 0;
$traceALL = 1 if $OPT{debug};   # expensive
my $done = 0;
my $todo = keys %D;             # number of documents in the catalog
my $lasttimeround = my $starttime = time;

DOC: while (($did, $value) = each %D) {
  # Developer switch: when set to 1 (and --debug is given) only documents
  # whose id matches /perltb/ are processed.
  my $superdebug = 0;
  if ($superdebug && $OPT{debug}) {
    next unless $did =~ /perltb/;
  }
  printf "%15s...(%d/%d)\n", $did, $done, $todo if $OPT{verbose};
  my $record = $layout->split($value);
  my $headline = $record->{title};
  # A record without a title must not raise "uninitialized value"
  # warnings in the substitution and substr below (script runs under -w).
  $headline = '' unless defined $headline;
  $headline =~ s/\s+/ /sg;
  # printf "%15s %s\n", $record->{isbn}, substr($headline,0,60);
  printf "%15s %s...\n", $did, substr($headline,0,60) if $OPT{verbose};
  if ($superdebug && $OPT{debug}) {
    # $record = { chapter => $record->{chapter}};
  }
  $done++;
  next if oreilly_de_catalog::config::DONT_INDEX_JUST_PARSE() ;
  $tb->insert('docid'  => $did,
              headline => $headline,
              %{$record});

  # Timing statistics; $done is at least 1 here, so the division is safe.
  my $spenttime = time - $starttime;
  my $averagetime = $spenttime/$done;
  my $left = $todo-$done;
  printf("%2d secs, %3d done, %3d left, %4d s done, %5.1f s avg, %4d s left\n",
         time - $lasttimeround,
         $done,
         $left,
         $spenttime,
         $averagetime,
         $left*$averagetime,
        ) if $OPT{verbose};
  $lasttimeround = time;

  if ($traceALL) { # costs a lot when reaching the 100th file or so
    # The dump file is rewritten completely after every document.
    $ALL->{$did} = $record;
    my $dumpfile = "$OPT{dir}/$OPT{database}-$jobid/debug.dump";
    open my $dumpfh, ">:utf8", $dumpfile
      or die "Couldn't open $dumpfile for writing: $!";
    print $dumpfh Data::Dumper::Dumper($ALL);
    close $dumpfh or die "Couldn't close $dumpfile: $!";
  }
}
undef $ALL;
$tb->set(top=>1);
# Build the fallback table: one document per dictionary key of the main
# table's inverted indexes, indexed through the trigram pipeline.
my $tritb = $db->create_table(
                              name     => "$OPT{table}_fallback",
                              # the name "headline" is only for sman
                              attr     => [qw(docid headline)],
                              invindex => [ headline => $trigr ],
                             )
  or die "Couldn't create table $OPT{table}_fallback: $@\n";

# Collect the keys of every inverted index of the main table.
my %dict;
for my $f ($tb->fields) {
  my(@idx) = @{$tb->table->{inverted}{$f} || []};
  for my $idx (@idx) {
    my $name = $idx->name;
    next if $name =~ /(_|\b)(mixedonly|Stem|Soundex)(\b|_)/;
    # irrelevant for alternatives
    my @keys = $idx->keys;
    @dict{@keys} = ();
  }
}

# Keep only the keys that start with "p" and strip that prefix: s///
# returns true exactly when it removed one.
# NOTE(review): presumably "p" is a key-type prefix of WAIT::InvertedIndex -- confirm.
my @dictkeys = grep s/^p//, keys %dict;

# Peek at the internal representation of the first few keys that contain
# characters outside printable ASCII.
my $maxdebug = 5;
for my $headline (@dictkeys) {
  if ($maxdebug && $headline =~ /[^\040-\177]/) {
    Dump $headline;
    $maxdebug--;
  }
  # printf "%s\n", substr($headline,0,60);
  $tritb->insert(docid => $headline, headline => $headline);
}
$tritb->set(top=>1);
$tritb->close or die "Couldn't close table: $!";
$tb->close() or die "Couldn't close table: $!";
$db->close() or die "Couldn't close database: $!";
202 ulpfr 55
# Atomically relinking symlink: now we have a new database with a very
# long name "$OPT{database}-$jobid" (e.g.
# oreilly_de_catalog-2002-01-28_16:12_16467) and we want that database
# to be accessible with the oreilly_de_catalog name.

use File::Spec;
use File::Glob ();

# Resolve the directory BEFORE changing into it: with a relative --dir the
# paths built below would otherwise be interpreted relative to the new
# working directory, i.e. the directory would be applied twice.
my $absdir = File::Spec->rel2abs($OPT{dir});
chdir $absdir or die "Couldn't chdir to $absdir: $!";

my $dir = "$OPT{database}-$jobid";
my $slwant = File::Spec->catdir($absdir, $OPT{database});
my $sltmp = File::Spec->catdir($absdir, "$OPT{database}-$$");

# Create the link under a temporary name and rename it over the wanted
# name, so that the wanted name never points to nothing.
unlink $sltmp; # may fail
symlink $dir, $sltmp or die "Couldn't symlink $dir, $sltmp: $!";
rename $sltmp, $slwant or die "Couldn't rename $sltmp, $slwant: $!";
print "$slwant now points to $dir\n" if $OPT{verbose};

# Open up the "read" entries of every table.  Done with glob + chmod
# instead of a shell command so the path is never parsed by a shell.
# As before, finding nothing to chmod is a fatal error.
my @readfiles = File::Glob::bsd_glob("$slwant/*/read");
@readfiles or die "No files match $slwant/*/read";
chmod(0777, @readfiles) == @readfiles
  or die "Couldn't chmod $slwant/*/read: $!";
218 laperla 68
# Remove outdated indexes from the current directory.  Only entries named
# "<database>-<something>" are candidates -- the naming scheme this script
# uses for its databases and temporary symlinks.  The database name is
# quoted so that regex metacharacters in it are taken literally, and the
# "-" is required so that unrelated entries which merely share the prefix
# (e.g. "<database>2") are never touched.
my $max_age_days = 4;
opendir my $dh, "." or die "Could not opendir .: $!";
for my $dirent (readdir $dh) {
  next if $dirent =~ /^\./;
  next unless $dirent =~ /^\Q$OPT{database}\E-.+/s;
  # -M yields undef for entries that cannot be stat'ed (e.g. a dangling
  # symlink); skip those instead of comparing undef.
  my $age = -M $dirent;
  next unless defined $age && $age > $max_age_days;
  warn "removing old index $dirent";
  File::Path::rmtree($dirent);
}
closedir $dh;

$WAIT::Config = $WAIT::Config; # make perl -w happy
231    
232    
233     __END__
234     ## ###################################################################
235     ## pod
236     ## ###################################################################
237    
238     =head1 NAME
239    
240     index_ora - generate a WAIT index for the O'Reilly catalog
241    
242     =head1 SYNOPSIS
243    
244     B<index_ora>
245     [B<-database> I<dbname>]
246     [B<-dir> I<directory>]
247     [B<-table> I<table name>]
248     I<directory>
249    
250     =head1 DESCRIPTION
251    
252     =head1 OPTIONS
253    
254     =over 5
255    
256     =item B<-database> I<dbname>
257    
258     Specify database name. Default is C<oreilly_de_catalog>.
259    
260     =item B<-dir> I<directory>
261    
262     Alternate directory where databases are located. Default is the
263     directory given by C<oreilly_de_catalog::config::WAITDIR>.
264    
265     =item B<-table> I<table name>
266    
267     Specify an alternate table name. Default is C<ora>.
268    
269 dpavlin 109 =back
270    
271 ulpfr 55 =head1 AUTHOR
272    
273     Ulrich Pfeifer E<lt>F<pfeifer@wait.de>E<gt>
274    

Properties

Name Value
cvs2svn:cvs-rev 1.14

  ViewVC Help
Powered by ViewVC 1.1.26