/[wait]/branches/unido/script/index_ora
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Annotation of /branches/unido/script/index_ora

Parent Directory Parent Directory | Revision Log Revision Log


Revision 106 - (hide annotations)
Tue Jul 13 12:22:09 2004 UTC (19 years, 10 months ago) by dpavlin
File size: 8655 byte(s)
Changes made by Andreas J. Koenig <andreas.koenig(at)anima.de> for Unido project

1 dpavlin 106 #!/usr/bin/perl -w
2     # -*- Mode: Perl -*-
3     # $Basename$
4     # $Revision: 1.14 $
5     # Author : Ulrich Pfeifer
6     # Created On : Mon Dec 31 13:57:11 2001
7     # Last Modified By: Ulrich Pfeifer
8     # Last Modified On: Fri Jan 4 15:59:20 2002
9     # Language : CPerl
10     #
11     # (C) Copyright 2001, Ulrich Pfeifer
12     #
13    
14     use 5.007;
15    
16     use strict;
17     use Devel::Peek qw(Dump);
18    
19     use File::Path;
20     use DB_File;
21     use Getopt::Long;
22     use Cwd;
23    
24     BEGIN {require WAIT::Config;}
25     use WAIT::Database;
26     use WAIT::Parse::Ora;
27     use WAIT::Document::Ora;
28     use WAIT::InvertedIndex;
29     use Data::Dumper;
30    
31    
       # Set the cachesize field of DB_File's exported $DB_BTREE info object
       # to 200_000 before any WAIT database or table is created below.
       # NOTE(review): presumably WAIT opens its btree files through
       # $DB_BTREE, so this enlarges their cache -- confirm in WAIT.
32     $DB_BTREE->{'cachesize'} = 200_000 ;
33    
34     use lib "/usr/local/apache/lib";
35     use lib "/online/www/sites/ora/catalogsearch/run/lib";
36     use oreilly_de_catalog::config;
37     use oreilly_de_catalog::wait_filter;
38    
       # Defaults for the command-line options; GetOptions below overwrites
       # any of these that the caller supplies.
39     my %OPT = (
40     database => 'oreilly_de_catalog',
41     dir => oreilly_de_catalog::config::WAITDIR,
42     table => 'ora',
43     );
44    
       # Catalog location taken from the project config; it is handed to the
       # WAIT::Document::Ora tie further down.
45     my $droot = oreilly_de_catalog::config::CATALOG;
46    
       # --database, --dir and --table take a string value; --verbose and
       # --debug are negatable boolean flags (the "!" suffix).
47     GetOptions(\%OPT,
48     'database=s',
49     'dir=s',
50     'table=s',
51     'verbose!',
52     'debug!',
53     ) || die "Usage: ...\n";
54    
       # localtime returns the year as an offset from 1900 and a zero-based
       # month, so both are normalised before formatting.
55     my @localtime = localtime;
56     $localtime[5] += 1900;
57     $localtime[4]++;
       # Job id of the form YYYY-MM-DD_HH:MM_<pid>; it makes the database
       # name of this run unique.
58     my $jobid = sprintf "%04s-%02s-%02s_%02s:%02s_%d", @localtime[5,4,3,2,1], $$;
       # The index is built under "<database>-<jobid>" rather than under the
       # plain database name; the plain name is switched over by the symlink
       # step at the end of the script.
       # NOTE(review): the die message prints only $OPT{database}, not the
       # full "<database>-<jobid>" name that was passed to create().
59     my $db = WAIT::Database->create(name => "$OPT{database}-$jobid",
60     directory => $OPT{dir})
61     or die "Could not create database $OPT{database}: $@\n";
62    
       # Parser object for catalog documents. It is passed to create_table as
       # the table layout and is also used to split every raw document into
       # a record in the indexing loop.
       # Constructed with an explicit method call instead of the indirect
       # object form ("new Class"), which Perl can mis-parse.
63     my $layout = WAIT::Parse::Ora->new;
64    
       # Named filter pipelines, each an array of filter names that is later
       # attached to one or more fields via the invindex list.
       # NOTE(review): presumably WAIT applies the filters of a pipeline in
       # the listed order -- confirm in WAIT::InvertedIndex.
65     # my $stem = ['OR_tr_20020124', 'OR_lc_20020125', 'split2', 'stop', 'Stem'];
66     # my $text = ['OR_tr_20020124', 'split2', 'OR_minus_20020311', 'OR_lc_20020125', 'split2', 'stop'];
67     my $text = ['OR_tr_20020124', 'split', 'OR_minus_20020311', 'OR_lc_20020125'];
68     my $wplus = ['OR_split_20020401', 'OR_lc_20020125', 'OR_mixedonly_20020221'];
69     # my $sound = ['OR_tr_20020124', 'OR_lc_20020125', 'split2', 'Soundex'];
70     my $trigr = ['OR_lc_20020125', 'OR_trigrams_20020125'];
71    
72     # split6 is better than split13 or split10: it allows them to enter
73     # shorter sequences when searching.
74     my $isbn = ['split6', 'OR_isbn_20020127'];
75    
       # NOTE(review): $cwd is not referenced anywhere else in this script.
76     my $cwd = cwd;
77    
       # %D is tied to WAIT::Document::Ora over the catalog root. The loop
       # further down iterates it as (document id => raw document) pairs.
       # The tie object is kept in $access and handed to create_table.
78     my %D;
79     my $access = tie %D, 'WAIT::Document::Ora', $droot,
80     or die "Couldn't tie to dir $droot: $!\n";
81    
       # Main table. "attr" names the stored attributes; "layout" and
       # "access" are the parser and the tie object created above.
82     my $tb = $db->create_table(name => $OPT{table},
83     attr => ['author', 'isbn', 'title',
84     'headline', 'docid'],
85     layout => $layout,
86     access => $access,
       # Flat list of field => filter-pipeline pairs. Most fields are listed
       # twice, once with $text and once with $wplus; 'author' is listed
       # only with $text and 'isbn' only with $isbn.
87     invindex =>
88     [
89     'aboutauthor' => $text,
90     'aboutauthor' => $wplus,
91     'abouttranslator' => $text,
92     'abouttranslator' => $wplus,
93     'abstract' => $text,
94     'abstract' => $wplus,
95     'author' => $text,
96     'chapter' => $text,
97     'chapter' => $wplus,
98     'colophon' => $text,
99     'colophon' => $wplus,
100     'desc' => $text,
101     'desc' => $wplus,
102     'inx' => $text,
103     'inx' => $wplus,
104     'isbn' => $isbn,
105     'subtitle' => $text,
106     'subtitle' => $wplus,
107     'title' => $text,
108     'title' => $wplus,
109     'title_orig' => $text,
110     'title_orig' => $wplus,
111     'toc' => $text,
112     'toc' => $wplus,
113     'translator' => $text,
114     'translator' => $wplus,
115     ]
116     );
        # create_table is treated as returning a false value on failure.
117     die "Couldn't create table $OPT{table}: $@\n" unless $tb;
118    
        # Main indexing pass: parse every catalog document and insert it into
        # the main table, printing progress when --verbose is set.
119     my ($did, $value);
120     binmode STDOUT, ":utf8";
        # With --debug every parsed record is collected in $ALL and dumped to
        # disk after each document (see the end of the loop body).
121     my $ALL;
122     my $traceALL = 0;
123     $traceALL = 1 if $OPT{debug}; # expensive
124     my $done = 0;
        # keys in scalar context: the number of documents to process.
125     my $todo = keys %D;
126     my $lasttimeround = my $starttime = time;
127    
128     DOC: while (($did, $value) = each %D) {
        # $superdebug is hard-wired to 0, so the two branches guarded by it
        # never run unless this line is edited by hand.
129     my $superdebug = 0;
130     if ($superdebug && $OPT{debug}) {
131     next unless $did =~ /perltb/;
132     }
133     printf "%15s...(%d/%d)\n", $did, $done, $todo if $OPT{verbose};
134     my $record = $layout->split($value);
        # Headline is the title with every run of whitespace collapsed to a
        # single space.
135     my $headline = $record->{title};
136     $headline =~ s/\s+/ /sg;
137     # printf "%15s %s\n", $record->{isbn}, substr($headline,0,60);
138     printf "%15s %s...\n", $did, substr($headline,0,60) if $OPT{verbose};
139     if ($superdebug && $OPT{debug}) {
140     # $record = { chapter => $record->{chapter}};
141     }
        # Counted before the parse-only check, so parse-only runs still show
        # progress in the verbose output above.
142     $done++;
143     next if oreilly_de_catalog::config::DONT_INDEX_JUST_PARSE() ;
        # The record's own fields are appended after docid and headline.
144     $tb->insert('docid' => $did,
145     headline => $headline,
146     %{$record});
        # Timing statistics: seconds for this document, totals, running
        # average and a linear estimate of the remaining time.
147     my $spenttime = time - $starttime;
148     my $averagetime = $spenttime/$done;
149     my $left = $todo-$done;
150     printf("%2d secs, %3d done, %3d left, %4d s done, %5.1f s avg, %4d s left\n",
151     time - $lasttimeround,
152     $done,
153     $left,
154     $spenttime,
155     $averagetime,
156     $left*$averagetime,
157     ) if $OPT{verbose};
158     $lasttimeround = time;
159     if ($traceALL) { # costs a lot when reaching the 100th file or so
        # The dump file is truncated and rewritten with all records seen so
        # far on every iteration. A lexical filehandle is used, and failures
        # report the path and the system error instead of a bare "Died".
160     $ALL->{$did} = $record;
        my $dumpfile = "$OPT{dir}/$OPT{database}-$jobid/debug.dump";
161     open my $dump_fh, ">:utf8", $dumpfile or die "Couldn't open $dumpfile: $!";
162     print {$dump_fh} Data::Dumper::Dumper($ALL);
163     close $dump_fh or die "Couldn't close $dumpfile: $!";
164     }
165     }
        # Release the collected debug records.
166     undef $ALL;
        # NOTE(review): the meaning of top=>1 is defined by the WAIT table
        # class and is not visible in this script.
167     $tb->set(top=>1);
168    
        # Second table "<table>_fallback": one row per index term of the main
        # table, indexed by trigrams of the term itself.
169     my $tritb = $db->create_table(
170     name => "$OPT{table}_fallback",
171     attr => [qw(docid headline)], # name
172     # "headline"
173     # only for
174     # sman
175     invindex => [ headline => $trigr ],
176     );
        # Same failure check as for the main table, so a failed creation is
        # reported here rather than as a method call on an undefined value.
        die "Couldn't create table $OPT{table}_fallback: $@\n" unless $tritb;
        # Collect the distinct keys of every inverted index of the main
        # table, skipping the mixedonly/Stem/Soundex indexes.
177     my %dict;
178     for my $f ($tb->fields) {
179     my(@idx) = @{$tb->table->{inverted}{$f} || []};
180     for my $idx (@idx) {
181     my $name = $idx->name;
182     next if $name =~ /(_|\b)(mixedonly|Stem|Soundex)(\b|_)/;
183     # irrelevant for alternatives
184     my @keys = $idx->keys;
185     @dict{@keys} = ();
186     }
187     }
        # Keep only the keys that start with "p" and strip that prefix.
        # NOTE(review): presumably "p" marks term entries in WAIT's index
        # files -- confirm in WAIT::InvertedIndex.
188     my @dictkeys = grep s/^p//, keys %dict;
        # Devel::Peek dump of the first five terms containing a character
        # outside \040-\177; this runs regardless of --debug.
189     my $maxdebug = 5;
190     for my $headline (@dictkeys) {
191     if ($maxdebug && $headline =~ /[^\040-\177]/) {
192     Dump $headline;
193     $maxdebug--;
194     }
195     # printf "%s\n", substr($headline,0,60);
        # The term serves as both document id and headline.
196     $tritb->insert(docid => $headline, headline => $headline);
197     }
198     $tritb->set(top=>1);
199     $tritb->close or die "Couldn't close table: $!";
200     $tb->close() or die "Couldn't close table: $!";
201     $db->close() or die "Couldn't close database: $!";
202    
203     # Atomically relinking symlink: now we have a new database with a very
204     # long name "$OPT{database}-$jobid" (e.g.
205     # oreilly_de_catalog-2002-01-28_16:12_16467) and we want that database
206     # to be accessible with the oreilly_de_catalog name.
207    
208     use File::Spec;
        # Everything from here on works relative to the database directory;
        # a failed chdir now reports the directory and the system error.
209     chdir $OPT{dir} or die "Couldn't chdir to $OPT{dir}: $!";
210     my $dir = "$OPT{database}-$jobid";
211     my $slwant = File::Spec->catdir($OPT{dir}, $OPT{database});
212     my $sltmp = File::Spec->catdir($OPT{dir}, "$OPT{database}-$$");
        # Create the link under a temporary name, then rename it over the
        # wanted name so the switch happens in a single step. The link target
        # is the relative name $dir, which lives in the same directory.
213     unlink $sltmp; # may fail
214     symlink $dir, $sltmp or die "Couldn't symlink $dir, $sltmp: $!";
215     rename $sltmp, $slwant or die "Couldn't rename $sltmp, $slwant: $!";
216     print "$slwant now points to $dir\n" if $OPT{verbose};
        # The shell expands the glob; a failure now names the command and the
        # wait status instead of dying without a message.
217     system("chmod 777 $slwant/*/read")==0 or die "chmod 777 $slwant/*/read failed (\$? = $?)";
218    
        # Remove stale indexes: entries of the current directory (the
        # database directory, after the chdir above) whose name is the
        # database name plus a non-empty suffix and which are more than four
        # days old (-M is the age in days relative to script start).
219     opendir my $dh, "." or die "Could not opendir .: $!";
220     for my $dirent (readdir $dh) {
221     next if $dirent =~ /^\./;
        # \Q..\E: the database name is matched literally, not as a pattern.
222     next unless $dirent =~ /^\Q$OPT{database}\E(.*)/;
        # An empty suffix is the live symlink itself; leave it alone.
223     my $ext = $1 or next;
        # Never remove the index that this run has just built.
        next if $dirent eq $dir;
224     next unless -M $dirent > 4;
225     warn "removing old index $dirent";
226     File::Path::rmtree($dirent);
227     }
228     closedir $dh;
229    
230     $WAIT::Config = $WAIT::Config; # make perl -w happy
231    
232    
233     __END__
234     ## ###################################################################
235     ## pod
236     ## ###################################################################
237    
238     =head1 NAME
239    
240     index_ora - generate a WAIT index for the O'Reilly catalog
241    
242     =head1 SYNOPSIS
243    
244     B<index_ora>
245     [B<-database> I<dbname>]
246     [B<-dir> I<directory>]
247     [B<-table> I<table name>]
248     I<directory>
249    
250     =head1 DESCRIPTION
251    
252     =head1 OPTIONS
253    
254     =over 5
255    
256     =item B<-database> I<dbname>
257    
258     Specify database name. Default is C<oreilly_de_catalog>.
259    
260     =item B<-dir> I<directory>
261    
262     Alternate directory where databases are located. Default is the
263     C<WAITDIR> constant defined in C<oreilly_de_catalog::config>.
264    
265     =item B<-table> I<table name>
266    
267     Specify an alternate table name. Default is C<ora>.
268    
        =back

269     =head1 AUTHOR
270    
271     Ulrich Pfeifer E<lt>F<pfeifer@wait.de>E<gt>
272    

  ViewVC Help
Powered by ViewVC 1.1.26