4 |
# Author : Ulrich Pfeifer |
# Author : Ulrich Pfeifer |
5 |
# Created On : Thu Aug 8 13:05:10 1996 |
# Created On : Thu Aug 8 13:05:10 1996 |
6 |
# Last Modified By: Ulrich Pfeifer |
# Last Modified By: Ulrich Pfeifer |
7 |
# Last Modified On: Sat Nov 11 16:32:38 2000 |
# Last Modified On: Mon Dec 31 14:30:05 2001 |
8 |
# Language : CPerl |
# Language : CPerl |
9 |
# |
# |
10 |
# (C) Copyright 1996-2000, Ulrich Pfeifer |
# (C) Copyright 1996-2000, Ulrich Pfeifer |
16 |
use Fcntl; |
use Fcntl; |
17 |
use WAIT::Filter; |
use WAIT::Filter; |
18 |
use Carp; |
use Carp; |
19 |
use vars qw(%FUNC); |
use vars qw(%FUNC $VERSION); |
20 |
|
|
21 |
|
$VERSION = "1.801"; # others test if we are loaded by checking $VERSION |
22 |
|
|
23 |
# The dictionary has three different key types: |
# The dictionary has three different key types: |
24 |
# 'o'.$word |
# 'o'.$word |
159 |
my $dbh = $self->{dbh}; # for convenience |
my $dbh = $self->{dbh}; # for convenience |
160 |
|
|
161 |
my $O = pack('C', 0xff)."o"; |
my $O = pack('C', 0xff)."o"; |
162 |
my ($word, $value) = ($O.$;); |
my ($word, $value) = ($O.$;); # $word and $value are modified by seq! |
163 |
$dbh->seq($word, $value, R_CURSOR); |
if ( my $ret = $dbh->seq($word, $value, R_CURSOR) ) { |
164 |
|
# warn "DEBUG: ret[$ret], not an old index, either empty or no \$^O"; |
165 |
|
return $self->{old_index} = 0; |
166 |
|
} |
167 |
for (my $i=0; $i<10;$i++) { |
for (my $i=0; $i<10;$i++) { |
168 |
if ($value !~ /^\d+$/) { |
if ($value !~ /^\d+$/) { |
169 |
|
# warn "DEBUG: word[$word]value[$value], not an old index"; |
170 |
return $self->{old_index} = 0; |
return $self->{old_index} = 0; |
171 |
} |
} |
172 |
if ($dbh->seq($word, $value, R_NEXT) or # no values left |
if (my $ret = $dbh->seq($word, $value, R_NEXT) or # no values left |
173 |
$word !~ /^$O/o # no $O values left |
$word !~ /^$O$;/o # no $O values left |
174 |
) { |
) { |
175 |
# we are not sure enough that this is an old index |
# we are not sure enough that this is an old index |
176 |
|
# warn "DEBUG: ret[$ret]word[$word]value[$value], not an old index"; |
177 |
return $self->{old_index} = 0; |
return $self->{old_index} = 0; |
178 |
} |
} |
179 |
} |
} |
180 |
|
# warn "DEBUG: old index"; |
181 |
return $self->{old_index} = 1; |
return $self->{old_index} = 1; |
182 |
} |
} |
183 |
|
|
255 |
# inverse document frequence gives the score for a term. This sort |
# inverse document frequence gives the score for a term. This sort |
256 |
# order can be exploited for tuning of single term queries. |
# order can be exploited for tuning of single term queries. |
257 |
|
|
258 |
|
for my $did (keys %$post) { # sanity check |
259 |
|
unless ($self->{db}->{"m". $did}) { |
260 |
|
warn "Warning from WAIT: DIVZERO threat from did[$did] post[$post->{$did}]"; |
261 |
|
$self->{db}->{"m". $did} = 1; # fails if we have not opened for writing |
262 |
|
} |
263 |
|
} |
264 |
for my $did (sort { $post->{$b} / $self->{db}->{'m'. $b} |
for my $did (sort { $post->{$b} / $self->{db}->{'m'. $b} |
265 |
<=> |
<=> |
266 |
$post->{$a} / $self->{db}->{'m'. $a} |
$post->{$a} / $self->{db}->{'m'. $a} |
288 |
|
|
289 |
grep $occ{$_}++, &{$self->{func}}(@_); |
grep $occ{$_}++, &{$self->{func}}(@_); |
290 |
|
|
291 |
|
# Be prepared for "Odd number of elements in hash assignment" |
292 |
|
local $SIG{__WARN__} = sub { |
293 |
|
my $warning = shift; |
294 |
|
chomp $warning; |
295 |
|
warn "Catching warning[$warning] during delete of key[$key]"; |
296 |
|
}; |
297 |
for (keys %occ) {# may reorder posting list |
for (keys %occ) {# may reorder posting list |
298 |
my %post = unpack 'w*', $db->{'p'.$_}; |
my %post = unpack 'w*', $db->{'p'.$_}; |
299 |
delete $post{$key}; |
delete $post{$key}; |
421 |
&{$self->{func}}(@_); |
&{$self->{func}}(@_); |
422 |
} |
} |
423 |
|
|
|
sub keys { |
|
|
my $self = shift; |
|
|
|
|
|
defined $self->{db} or $self->open; |
|
|
keys %{$self->{db}}; |
|
|
} |
|
|
|
|
424 |
sub search_prefix { |
sub search_prefix { |
425 |
my $self = shift; |
my $self = shift; |
426 |
|
|
464 |
# check which words occur in the index. |
# check which words occur in the index. |
465 |
grep { $self->{db}->{'o'.$_} } @_; |
grep { $self->{db}->{'o'.$_} } @_; |
466 |
|
|
467 |
return () unless @terms; # nothing to search for |
return unless @terms; |
468 |
|
|
469 |
# We special-case one term queries here. If the index was sorted, |
# We special-case one term queries here. If the index was sorted, |
470 |
# choping off the rest of the list will return the same ranking. |
# choping off the rest of the list will return the same ranking. |
480 |
} |
} |
481 |
|
|
482 |
for (my $i=1; $i<@res; $i+=2) { |
for (my $i=1; $i<@res; $i+=2) { |
483 |
$res[$i] /= $self->{db}->{'m'. $res[$i-1]} / $idf; |
# $res[$i] /= $self->{db}->{'m'. $res[$i-1]} / $idf; |
484 |
|
# above was written badly, allows two DIV_ZERO problems. |
485 |
|
my $maxtf = $self->{db}->{"m". $res[$i-1]}; |
486 |
|
unless ($maxtf) { |
487 |
|
warn "WAIT-Warning: Averting DIVZERO for i[$i] \$res[\$i-1][$res[$i-1]] term[$term]"; |
488 |
|
$maxtf = 1; |
489 |
|
} |
490 |
|
$res[$i] = ($res[$i] / $maxtf) * $idf; |
491 |
} |
} |
492 |
|
|
493 |
return @res |
return @res |
691 |
if ($self->{mode} & O_RDWR) { |
if ($self->{mode} & O_RDWR) { |
692 |
print STDERR "Flushing $self->{cached} postings\n" if $self->{cached}; |
print STDERR "Flushing $self->{cached} postings\n" if $self->{cached}; |
693 |
while (my($key, $value) = each %{$self->{cache}}) { |
while (my($key, $value) = each %{$self->{cache}}) { |
694 |
|
$self->{db}->{"p". $key} ||= ""; |
695 |
if ($self->{reorg}) { |
if ($self->{reorg}) { |
696 |
$self->{db}->{'p'.$key} = $self->sort_postings($self->{db}->{'p'.$key} |
$self->{db}->{'p'.$key} = $self->sort_postings($self->{db}->{'p'.$key} |
697 |
. $value); |
. $value); |
727 |
} |
} |
728 |
} |
} |
729 |
|
|
730 |
|
sub keys { |
731 |
|
my $self = shift; |
732 |
|
|
733 |
|
defined $self->{db} or $self->open; |
734 |
|
keys %{$self->{db}}; |
735 |
|
} |
736 |
|
|
737 |
1; |
1; |
738 |
|
|