/[wait]/cvs-head/script/index_ora
This is a repository of my old source code, which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Annotation of /cvs-head/script/index_ora

Parent Directory | Revision Log


Revision 71 - (hide annotations)
Sun Jan 27 15:27:38 2002 UTC (22 years, 4 months ago) by laperla
File size: 5081 byte(s)
- trigrams working

#!/usr/bin/perl -w
# -*- Mode: Perl -*-
# $Basename$
# $Revision: 1.8 $
# Author : Ulrich Pfeifer
# Created On : Mon Dec 31 13:57:11 2001
# Last Modified By: Ulrich Pfeifer
# Last Modified On: Fri Jan 4 15:59:20 2002
# Language : CPerl
#
# (C) Copyright 2001, UUNET Deutschland GmbH, Germany
#
# index_ora - build a fresh WAIT database for the O'Reilly catalog.
#
# Overview of what the script does, in order:
#   1. create a new database whose name carries a timestamp and the PID,
#   2. create the main table and insert every document from the tied
#      WAIT::Document::Ora hash,
#   3. create a "<table>_fallback" table that indexes the collected index
#      keys with a trigram filter,
#   4. point "<dir>/<database>" at the new database via symlink + rename.

use 5.007;

use strict;
use Devel::Peek qw(Dump);

use File::Path;
use DB_File;
use Getopt::Long;
use Cwd;
use File::Spec;

BEGIN {require WAIT::Config;}
use WAIT::Database;
use WAIT::Parse::Ora;
use WAIT::Document::Ora;
use WAIT::InvertedIndex;

# Kept after the WAIT modules on purpose: adding this directory to @INC
# earlier could change which copy of the WAIT modules gets loaded.
use lib "/usr/local/apache/lib";
# NOTE(review): presumably this module supplies the OR_* filters named in
# the filter pipelines below -- confirm before removing it.
use oreilly_de_catalog::wait_handler;

$DB_BTREE->{'cachesize'} = 200_000;

# Command line options and their defaults.
my %OPT = (
           database => 'DB',
           dir      => $WAIT::Config->{WAIT_home} || '/tmp',
           table    => 'ora',
          );

GetOptions(\%OPT,
           'database=s',
           'dir=s',
           'table=s',
          )
  or die "Usage: $0 [-database dbname] [-dir directory] [-table name] directory\n";

# Job id: YYYY-MM-DD_HH:MM_<pid>.  localtime is called once so that all
# fields come from the same instant.
my ($min, $hour, $mday, $mon, $year) = (localtime)[1 .. 5];
my $jobid = sprintf "%04d-%02d-%02d_%02d:%02d_%d",
  $year + 1900, $mon + 1, $mday, $hour, $min, $$;

# The database is built under this unique name; the short name is
# switched over to it only at the very end of the script.
my $long_dir = "$OPT{database}-$jobid";

my $db = WAIT::Database->create(name      => $long_dir,
                                directory => $OPT{dir})
  or die "Could not create database $OPT{database}: $@\n";

my $layout = WAIT::Parse::Ora->new;

# Filter pipelines handed to create_table() as inverted index definitions.
my $stem  = ['OR_tr_20020124', 'OR_lc_20020124', 'split2', 'stop', 'Stem'];
my $text  = [{
              'prefix'    => ['OR_tr_20020124', 'OR_lc_20020124'],
              'intervall' => ['OR_tr_20020124', 'OR_lc_20020124'],
             },
             'OR_tr_20020124', 'OR_lc_20020124', 'split2', 'stop'];
my $sound = ['OR_tr_20020124', 'OR_lc_20020124', 'split2', 'Soundex'];
my $trigr = ['OR_lc_20020124', 'OR_trigrams_20020125'];

# The remaining command line arguments are passed straight to the
# document class.
my %D;
my $access = tie %D, 'WAIT::Document::Ora', @ARGV
  or die "Couldn't tie to file: $!\n";

# invindex is a list of pairs rather than a hash, so 'author' can be
# listed twice (once with $text, once with $sound).
my $tb = $db->create_table(name     => $OPT{table},
                           attr     => ['author', 'isbn', 'title',
                                        'headline', 'docid'],
                           layout   => $layout,
                           access   => $access,
                           invindex =>
                           [
                            'title'    => $stem,
                            'about'    => $stem,
                            'text'     => $text,
                            'author'   => $text,
                            'colophon' => $text,
                            'author'   => $sound,
                            'isbn'     => $text,
                           ]
                          );
die "Couldn't create table $OPT{table}: $@\n" unless $tb;

# Insert every document; print one progress line (ISBN plus the first 60
# characters of the headline) per record.
binmode STDOUT, ":utf8";
while (my ($did, $value) = each %D) {
  my $record   = $layout->split($value);
  my $headline = $record->{title};
  $headline =~ s/\s+/ /sg;      # collapse whitespace runs, newlines included
  printf "%15s %s\n", $record->{isbn}, substr($headline, 0, 60);
  $tb->insert('docid'  => $did,
              headline => $headline,
              %{$record});
}
$tb->set(top => 1);

# Fallback table: one record per collected index key, indexed by trigrams.
my $tritb = $db->create_table(
                              name     => "$OPT{table}_fallback",
                              attr     => [qw(docid headline)],
                              invindex => [ headline => $trigr ],
                             )
  or die "Couldn't create table $OPT{table}_fallback: $@\n";

# Collect the distinct keys of every inverted index of the main table.
my %dict;
for my $f ($tb->fields) {
  for my $idx (@{ $tb->table->{inverted}{$f} || [] }) {
    my @keys = $idx->keys;
    @dict{@keys} = ();
  }
}

# Only keys starting with "p" are used, with that leading "p" removed.
# NOTE(review): the meaning of the "p" prefix is defined inside
# WAIT::InvertedIndex, not here -- confirm there.
my @dictkeys = map { substr $_, 1 } grep { /^p/ } keys %dict;

# Debugging aid: Devel::Peek-dump the first few keys that contain
# characters outside the \040-\177 range.
my $maxdebug = 5;
for my $headline (@dictkeys) {
  if ($maxdebug && $headline =~ /[^\040-\177]/) {
    Dump $headline;
    $maxdebug--;
  }
  $tritb->insert(docid => $headline, headline => $headline);
}
$tritb->set(top => 1);
$tritb->close;
$tb->close();
$db->close();

# Now we have a new database with a very long name and we want that
# database to be accessible with the $OPT{database} name.  A symlink is
# first created under a temporary, PID-specific name and then renamed
# over the wanted name.  The link target is relative, so it resolves
# inside $OPT{dir}.

my $want_dir   = File::Spec->catdir($OPT{dir}, $OPT{database});
my $prel_slink = File::Spec->catdir($OPT{dir}, "$OPT{database}-$$");
unlink $prel_slink;             # may fail
symlink($long_dir, $prel_slink)
  or die "Could not symlink $long_dir, $prel_slink: $!";
rename($prel_slink, $want_dir)
  or die "Could not rename $prel_slink, $want_dir: $!";

$WAIT::Config = $WAIT::Config;  # make perl -w happy


__END__
## ###################################################################
## pod
## ###################################################################

=head1 NAME

index_ora - generate a WAIT index for the O'Reilly catalog

=head1 SYNOPSIS

B<index_ora>
[B<-database> I<dbname>]
[B<-dir> I<directory>]
[B<-table> I<table name>]
I<directory>

=head1 DESCRIPTION

B<index_ora> creates a new WAIT database named
I<dbname>-I<date>_I<time>_I<pid> below the database directory, creates
the table I<table name> in it and inserts every document delivered by
C<WAIT::Document::Ora>. A second table, I<table name>C<_fallback>, is
then filled with the keys collected from the inverted indexes of the
first table and indexed by trigrams.

When indexing is complete, a symbolic link named I<dbname> is created in
the database directory (under a temporary name first, then renamed), so
that the new database becomes reachable under the short name.

=head1 OPTIONS

=over 5

=item B<-database> I<dbname>

Specify database name. Default is F<DB>.

=item B<-dir> I<directory>

Alternate directory where databases are located. Default is the
directory specified during configuration of WAIT.

=item B<-table> I<table name>

Specify an alternate table name. Default is C<ora>.

=back

=head1 AUTHOR

Ulrich Pfeifer E<lt>F<pfeifer@wait.de>E<gt>

=cut

Properties

Name Value
cvs2svn:cvs-rev 1.8

  ViewVC Help
Powered by ViewVC 1.1.26