/[wait]/cvs-head/script/index_ora
This is a repository of my old source code, which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Annotation of /cvs-head/script/index_ora

Parent Directory Parent Directory | Revision Log Revision Log


Revision 74 - (hide annotations)
Fri Mar 8 21:18:51 2002 UTC (22 years, 2 months ago) by laperla
File size: 7468 byte(s)
- much better markup in the docs makes parsing so much easier and more
  reliable.

- New documents added: inx and toc.

- Output of index_ora is more helpful, and there is an additional option of
  setting $traceALL that allows us to debug what the parser passes on to WAIT.

#!/usr/bin/perl -w
# -*- Mode: Perl -*-
# $Basename$
# $Revision: 1.11 $
# Author : Ulrich Pfeifer
# Created On : Mon Dec 31 13:57:11 2001
# Last Modified By: Ulrich Pfeifer
# Last Modified On: Fri Jan 4 15:59:20 2002
# Language : CPerl
#
# (C) Copyright 2001, Ulrich Pfeifer
#
# index_ora: build a fresh WAIT database from a document root, add a
# trigram "fallback" table built from the index keys, and then atomically
# repoint the symlink <dir>/<database> at the new database.  See the POD
# after __END__ for the command-line interface.

use 5.007;

use strict;
use Devel::Peek qw(Dump);

use File::Path;
use File::Spec;
use File::Glob ();
use DB_File;
use Getopt::Long;
use Cwd;

BEGIN {require WAIT::Config;}
use WAIT::Database;
use WAIT::Parse::Ora;
use WAIT::Document::Ora;
use WAIT::InvertedIndex;
use Data::Dumper;

use lib "/usr/local/apache/lib";
use oreilly_de_catalog::wait_filter;

# Cache size for the DB_File BTREE files.
$DB_BTREE->{'cachesize'} = 200_000 ;

# ---------------------------------------------------------------------
# Command line
# ---------------------------------------------------------------------

my %OPT = (
    database => 'oreilly_de_catalog',
    dir      => '/usr/local/apache/data',
    table    => 'ora',
);

my $USAGE = "Usage: $0 [-database dbname] [-dir directory] "
          . "[-table name] document-root-directory\n";

GetOptions(\%OPT,
           'database=s',
           'dir=s',
           'table=s',
          ) or die $USAGE;

# Check the mandatory argument *before* anything is created on disk, so
# that a forgotten argument does not leave an empty database behind.
my $droot = shift or die $USAGE;

# We chdir into $OPT{dir} further down and keep using paths built from it
# afterwards; a relative -dir would then resolve to dir/dir/...  Pin it to
# an absolute path once, here.
$OPT{dir} = File::Spec->rel2abs($OPT{dir});

# ---------------------------------------------------------------------
# Create the database under a unique, job-specific name
# ---------------------------------------------------------------------

# Job id looks like 2002-01-28_16:12_16467 (date, time, pid).
my @now = localtime;
$now[5] += 1900;                # year
$now[4]++;                      # month is 0-based
my $jobid  = sprintf "%04d-%02d-%02d_%02d:%02d_%d", @now[5,4,3,2,1], $$;
my $dbname = "$OPT{database}-$jobid";

my $db = WAIT::Database->create(name      => $dbname,
                                directory => $OPT{dir})
    or die "Could not create database $dbname in $OPT{dir}: $@\n";

my $layout = WAIT::Parse::Ora->new;

# Filter pipelines for the inverted indexes.
# NOTE(review): the OR_* filters presumably come from
# oreilly_de_catalog::wait_filter -- confirm.
# $stem and $sound are currently not used by any index below.
my $stem  = ['OR_tr_20020124', 'OR_lc_20020125', 'split2', 'stop', 'Stem'];
my $text  = ['OR_tr_20020124', 'OR_lc_20020125', 'split2', 'stop'];
my $wplus = ['split2', 'OR_lc_20020125', 'OR_mixedonly_20020221'];
my $sound = ['OR_tr_20020124', 'OR_lc_20020125', 'split2', 'Soundex'];
my $trigr = ['OR_lc_20020125', 'OR_trigrams_20020125'];
# split6 is better than split13 or split10: it allows them to enter
# shorter sequences when searching.
my $isbn  = ['split6', 'OR_isbn_20020127'];

# ---------------------------------------------------------------------
# Main table
# ---------------------------------------------------------------------

my %D;
my $access = tie %D, 'WAIT::Document::Ora', $droot
    or die "Couldn't tie to dir $droot: $!\n";

# invindex is a list of (field => pipeline) pairs, not a hash: a field
# may appear more than once, once per pipeline.
my $tb = $db->create_table(name     => $OPT{table},
                           attr     => ['author', 'isbn', 'title',
                                        'headline', 'docid'],
                           layout   => $layout,
                           access   => $access,
                           invindex =>
                           [
                            'aboutauthor'     => $text,
                            'aboutauthor'     => $wplus,
                            'abouttranslator' => $text,
                            'abouttranslator' => $wplus,
                            'abstract'        => $text,
                            'abstract'        => $wplus,
                            'author'          => $text,
                            'colophon'        => $text,
                            'colophon'        => $wplus,
                            'desc'            => $text,
                            'desc'            => $wplus,
                            'inx'             => $text,
                            'inx'             => $wplus,
                            'isbn'            => $isbn,
                            'subtitle'        => $text,
                            'subtitle'        => $wplus,
                            'title'           => $text,
                            'title'           => $wplus,
                            'title_orig'      => $text,
                            'title_orig'      => $wplus,
                            'toc'             => $text,
                            'toc'             => $wplus,
                            'translator'      => $text,
                            'translator'      => $wplus,
                           ]
                          );
die "Couldn't create table $OPT{table}: $@\n" unless $tb;

binmode STDOUT, ":utf8";
my $ALL;                        # every parsed record, only if $traceALL
my $traceALL = 0;               # expensive
my $done = 0;
my $todo = keys %D;
my $lasttimeround = my $starttime = time;

DOC: while (my ($did, $value) = each %D) {
    my $record   = $layout->split($value);
    my $headline = $record->{title};
    $headline =~ s/\s+/ /sg;
    # printf "%15s %s\n", $record->{isbn}, substr($headline,0,60);
    printf "%15s %s...\n", $did, substr($headline,0,60);
    $tb->insert('docid'  => $did,
                headline => $headline,
                %{$record});

    # Progress report with a simple linear estimate of the time left.
    $done++;
    my $spenttime   = time - $starttime;
    my $averagetime = $spenttime/$done;
    my $left        = $todo-$done;
    printf("%2d secs, %3d done, %3d left, %4d s done, %5.1f s avg, %4d s left\n",
           time - $lasttimeround,
           $done,
           $left,
           $spenttime,
           $averagetime,
           $left*$averagetime,
          );
    $lasttimeround = time;

    if ($traceALL) { # costs a lot when reaching the 100th file or so
        # The whole accumulated structure is re-dumped after every
        # document so the file is complete even if a later one crashes.
        $ALL->{$did} = $record;
        my $dumpfile = File::Spec->catfile($OPT{dir}, $dbname, 'debug.dump');
        open my $dump, ">:utf8", $dumpfile
            or die "Couldn't open $dumpfile for writing: $!";
        print $dump Data::Dumper::Dumper($ALL);
        close $dump
            or die "Couldn't close $dumpfile: $!";
    }
}
undef $ALL;
$tb->set(top=>1);

# ---------------------------------------------------------------------
# Fallback table: trigram index over the keys of the other indexes
# ---------------------------------------------------------------------

my $tritb = $db->create_table(
                              name     => "$OPT{table}_fallback",
                              attr     => [qw(docid headline)], # name
                                                                # "headline"
                                                                # only for
                                                                # sman
                              invindex => [ headline => $trigr ],
                             );

# Collect the distinct keys of every relevant inverted index.
my %dict;
for my $f ($tb->fields) {
    my @idx = @{$tb->table->{inverted}{$f} || []};
    for my $idx (@idx) {
        my $name = $idx->name;
        next if $name =~ /(_|\b)(mixedonly|Stem|Soundex)(\b|_)/;
                                # irrelevant for alternatives
        my @keys = $idx->keys;
        @dict{@keys} = ();
    }
}

# Keep only the keys that start with "p", with that prefix removed.
# NOTE(review): "p" is presumably the prefix WAIT::InvertedIndex puts on
# posting-list keys -- confirm.
my @dictkeys = map { /^p(.*)/s ? $1 : () } keys %dict;

my $maxdebug = 5;               # Dump at most this many non-ASCII keys
for my $headline (@dictkeys) {
    if ($maxdebug && $headline =~ /[^\040-\177]/) {
        Dump $headline;
        $maxdebug--;
    }
    # printf "%s\n", substr($headline,0,60);
    $tritb->insert(docid => $headline, headline => $headline);
}
$tritb->set(top=>1);
$tritb->close;
$tb->close();
$db->close();

# ---------------------------------------------------------------------
# Publish the new database
# ---------------------------------------------------------------------

# Atomically relinking symlink: now we have a new database with a very
# long name "$OPT{database}-$jobid" (e.g.
# oreilly_de_catalog-2002-01-28_16:12_16467) and we want that database
# to be accessible with the oreilly_de_catalog name.  We create the
# symlink under a temporary name and rename() it over the wanted name.

chdir $OPT{dir} or die "Couldn't chdir to $OPT{dir}: $!";
my $slwant = File::Spec->catdir($OPT{dir}, $OPT{database});
my $sltmp  = File::Spec->catdir($OPT{dir}, "$OPT{database}-$$");
unlink $sltmp; # may fail
symlink $dbname, $sltmp or die "Couldn't symlink $dbname, $sltmp: $!";
rename $sltmp, $slwant or die "Couldn't rename $sltmp, $slwant: $!";
warn "$slwant now points to $dbname";

# Open up the "read" file of every table.  Done with the chmod builtin
# instead of a shell command so the path is never parsed by a shell.
# NOTE(review): mode 0777 is world-writable; presumably needed so that
# readers running under other uids can use these files -- confirm.
my $readpattern = "$slwant/*/read";
my @readfiles   = File::Glob::bsd_glob($readpattern);
@readfiles
    or die "No files match $readpattern";
chmod(0777, @readfiles) == @readfiles
    or die "Couldn't chmod 0777 $readpattern: $!";

$WAIT::Config = $WAIT::Config; # make perl -w happy


__END__
## ###################################################################
## pod
## ###################################################################

=head1 NAME

index_ora - generate a WAIT index for the O'Reilly catalog

=head1 SYNOPSIS

B<index_ora>
[B<-database> I<dbname>]
[B<-dir> I<directory>]
[B<-table> I<table name>]
I<directory>

=head1 DESCRIPTION

B<index_ora> indexes the documents that C<WAIT::Document::Ora> provides
for the document root I<directory>. Each document is split with
C<WAIT::Parse::Ora> and inserted into a table of a newly created
database named I<dbname>-I<jobid>, where I<jobid> consists of the
current date, time and process id. A second table, I<table
name>C<_fallback>, receives a trigram index over the keys of the other
indexes.

When indexing has finished, the symbolic link I<dbname> in the database
directory is atomically switched to the new database.

=head1 OPTIONS

=over 5

=item B<-database> I<dbname>

Specify database name. Default is C<oreilly_de_catalog>.

=item B<-dir> I<directory>

Alternate directory where databases are located. Default is
F</usr/local/apache/data>.

=item B<-table> I<table name>

Specify an alternate table name. Default is C<ora>.

=back

=head1 AUTHOR

Ulrich Pfeifer E<lt>F<pfeifer@wait.de>E<gt>


Properties

Name Value
cvs2svn:cvs-rev 1.11

  ViewVC Help
Powered by ViewVC 1.1.26