4 |
# Author : Ulrich Pfeifer |
# Author : Ulrich Pfeifer |
5 |
# Created On : Thu Aug 8 13:05:10 1996 |
# Created On : Thu Aug 8 13:05:10 1996 |
6 |
# Last Modified By: Ulrich Pfeifer |
# Last Modified By: Ulrich Pfeifer |
7 |
# Last Modified On: Sun Nov 12 14:20:56 2000 |
# Last Modified On: Sat Apr 27 16:13:55 2002 |
8 |
# Language : CPerl |
# Language : CPerl |
9 |
# |
# |
10 |
# (C) Copyright 1996-2000, Ulrich Pfeifer |
# (C) Copyright 1996-2002, Ulrich Pfeifer |
11 |
# |
# |
12 |
|
|
13 |
package WAIT::InvertedIndex; |
package WAIT::InvertedIndex; |
14 |
use strict; |
use strict; |
15 |
use DB_File; |
use BerkeleyDB; |
16 |
use Fcntl; |
use Fcntl; |
17 |
use WAIT::Filter; |
use WAIT::Filter; |
18 |
use Carp; |
use Carp; |
19 |
use vars qw(%FUNC $VERSION); |
use vars qw(%FUNC $VERSION); |
20 |
|
|
21 |
$VERSION = "1.801"; # others test if we are loaded by checking $VERSION |
$VERSION = "1.900"; # others test if we are loaded by checking $VERSION |
22 |
|
|
23 |
# The dictionary has three different key types: |
# The dictionary has three different key types: |
24 |
# 'o'.$word |
# 'o'.$word |
25 |
# |
# |
26 |
# The document frequency is the number of documents a term occurs |
# The document frequency is the number of documents a term occurs |
27 |
# in. The idea is that a term occuring in a significant part of the |
# in. The idea is that a term occuring in a significant portion of the |
28 |
# documents is not too significant. |
# documents is not too significant. |
29 |
# |
# |
30 |
# 'm'.$word |
# 'm'.$word |
31 |
# |
# |
32 |
# The maximum term frequency of a document is the frequency of the |
# The maximum term frequency of a document is the frequency of the |
33 |
# most frequent term in the document. It is related to the document |
# most frequent term in the document. It is related to the document |
34 |
# length obviously. A document in which the most frequnet term occurs |
# length obviously. A document in which the most frequent term occurs |
35 |
# 100 times is probably much longer than a document whichs most |
# 100 times is probably much longer than a document whichs most |
36 |
# frequent term occurs five time. |
# frequent term occurs five time. |
37 |
# |
# |
156 |
|
|
157 |
defined $self->{db} or $self->open; |
defined $self->{db} or $self->open; |
158 |
$self->sync; |
$self->sync; |
159 |
my $dbh = $self->{dbh}; # for convenience |
my $dbh = $self->{dbh} or return $self->{old_index} = 0; # for convenience |
160 |
|
|
161 |
my $O = pack('C', 0xff)."o"; |
my $O = pack('C', 0xff)."o"; |
162 |
my ($word, $value) = ($O.$;); # $word and $value are modified! |
my ($word, $value) = ($O.$;); # $word and $value are modified by seq! |
163 |
$dbh->seq($word, $value, R_CURSOR) or return $self->{old_index} = 0; |
if ( my $ret = $dbh->seq($word, $value, DB_CURSOR) ) { |
164 |
|
# warn "DEBUG: ret[$ret], not an old index, either empty or no \$^O"; |
165 |
|
return $self->{old_index} = 0; |
166 |
|
} |
167 |
for (my $i=0; $i<10;$i++) { |
for (my $i=0; $i<10;$i++) { |
168 |
if ($value !~ /^\d+$/) { |
if ($value !~ /^\d+$/) { |
169 |
|
# warn "DEBUG: word[$word]value[$value], not an old index"; |
170 |
return $self->{old_index} = 0; |
return $self->{old_index} = 0; |
171 |
} |
} |
172 |
if ($dbh->seq($word, $value, R_NEXT) or # no values left |
if (my $ret = $dbh->seq($word, $value, DB_NEXT) or # no values left |
173 |
$word !~ /^$O/o # no $O values left |
$word !~ /^$O$;/o # no $O values left |
174 |
) { |
) { |
175 |
# we are not sure enough that this is an old index |
# we are not sure enough that this is an old index |
176 |
|
# warn "DEBUG: ret[$ret]word[$word]value[$value], not an old index"; |
177 |
return $self->{old_index} = 0; |
return $self->{old_index} = 0; |
178 |
} |
} |
179 |
} |
} |
180 |
|
# warn "DEBUG: old index"; |
181 |
return $self->{old_index} = 1; |
return $self->{old_index} = 1; |
182 |
} |
} |
183 |
|
|
190 |
} else { |
} else { |
191 |
$self->{func} = |
$self->{func} = |
192 |
eval sprintf("sub {grep /./, %s}", _filtergen(@{$self->{filter}})); |
eval sprintf("sub {grep /./, %s}", _filtergen(@{$self->{filter}})); |
193 |
$self->{dbh} = tie(%{$self->{db}}, 'DB_File', $file, |
$self->{dbh} = tie(%{$self->{db}}, 'BerkeleyDB::Btree', |
194 |
$self->{mode}, 0664, $DB_BTREE); |
-Filename => $self->{file}, |
195 |
|
-Subname => $self->{name}, |
196 |
|
-Mode => $self->{mode}; |
197 |
$self->{cache} = {} |
$self->{cache} = {} |
198 |
if $self->{mode} & O_RDWR; |
if $self->{mode} & O_RDWR; |
199 |
$self->{cdict} = {} |
$self->{cdict} = {} |
253 |
my $r = ''; |
my $r = ''; |
254 |
|
|
255 |
# Sort posting list by increasing ratio of maximum term frequency (~ |
# Sort posting list by increasing ratio of maximum term frequency (~ |
256 |
# "document length") and term frequency. This rati multipied by the |
# "document length") and term frequency. This ratio multipied by the |
257 |
# inverse document frequence gives the score for a term. This sort |
# inverse document frequence gives the score for a term. This sort |
258 |
# order can be exploited for tuning of single term queries. |
# order can be exploited for tuning of single term queries. |
259 |
|
|
330 |
$last = (defined $last)?'p'.$last:'q'; |
$last = (defined $last)?'p'.$last:'q'; |
331 |
|
|
332 |
# set the cursor to $first |
# set the cursor to $first |
333 |
$dbh->seq($first, $value, R_CURSOR); |
$dbh->seq($first, $value, DB_CURSOR); |
334 |
|
|
335 |
# $first would be after the last word |
# $first would be after the last word |
336 |
return () if $first gt $last; |
return () if $first gt $last; |
337 |
|
|
338 |
push @result, substr($first,1); |
push @result, substr($first,1); |
339 |
while (!$dbh->seq($word, $value, R_NEXT)) { |
while (!$dbh->seq($word, $value, DB_NEXT)) { |
340 |
# We should limit this to a "resonable" number of words |
# We should limit this to a "resonable" number of words |
341 |
last if $word gt $last; |
last if $word gt $last; |
342 |
push @result, substr($word,1); |
push @result, substr($word,1); |
364 |
($prefix) = &{$self->{'pfunc'}}($prefix); |
($prefix) = &{$self->{'pfunc'}}($prefix); |
365 |
} |
} |
366 |
|
|
367 |
if ($dbh->seq($word = 'p'.$prefix, $value, R_CURSOR)) { |
if ($dbh->seq($word = 'p'.$prefix, $value, DB_CURRENT)) { |
368 |
return (); |
return (); |
369 |
} |
} |
370 |
return () if $word !~ /^p$prefix/; |
return () if $word !~ /^p$prefix/; |
371 |
push @result, substr($word,1); |
push @result, substr($word,1); |
372 |
|
|
373 |
while (!$dbh->seq($word, $value, R_NEXT)) { |
while (!$dbh->seq($word, $value, DB_NEXT)) { |
374 |
# We should limit this to a "resonable" number of words |
# We should limit this to a "resonable" number of words |
375 |
last if $word !~ /^p$prefix/; |
last if $word !~ /^p$prefix/; |
376 |
push @result, substr($word,1); |
push @result, substr($word,1); |
413 |
|
|
414 |
defined $self->{db} or $self->open; |
defined $self->{db} or $self->open; |
415 |
$self->sync; |
$self->sync; |
416 |
$self->search_raw($query, &{$self->{func}}(@_)); # No call to parse() here |
$self->search_raw($query, &{$self->{func}}(@_)); # No call to parse() there |
417 |
} |
} |
418 |
|
|
419 |
sub parse { |
sub parse { |
423 |
&{$self->{func}}(@_); |
&{$self->{func}}(@_); |
424 |
} |
} |
425 |
|
|
|
sub keys { |
|
|
my $self = shift; |
|
|
|
|
|
defined $self->{db} or $self->open; |
|
|
keys %{$self->{db}}; |
|
|
} |
|
|
|
|
426 |
sub search_prefix { |
sub search_prefix { |
427 |
my $self = shift; |
my $self = shift; |
428 |
|
|
466 |
# check which words occur in the index. |
# check which words occur in the index. |
467 |
grep { $self->{db}->{'o'.$_} } @_; |
grep { $self->{db}->{'o'.$_} } @_; |
468 |
|
|
469 |
return () unless @terms; # nothing to search for |
return unless @terms; |
470 |
|
|
471 |
# We special-case one term queries here. If the index was sorted, |
# We special-case one term queries here. If the index was sorted, |
472 |
# choping off the rest of the list will return the same ranking. |
# choping off the rest of the list will return the same ranking. |
615 |
my $full; # Need to process all postings |
my $full; # Need to process all postings |
616 |
my $chop; # Score necessary to enter the ranking list |
my $chop; # Score necessary to enter the ranking list |
617 |
|
|
618 |
if (# We know that wanted is true since we especial cased the |
if (# We know that wanted is true since we special cased the |
619 |
# exhaustive search. |
# exhaustive search. |
620 |
|
|
621 |
$wanted and |
$wanted and |
671 |
sub set { |
sub set { |
672 |
my ($self, $attr, $value) = @_; |
my ($self, $attr, $value) = @_; |
673 |
|
|
674 |
die "No such indexy attribute: '$attr'" unless $attr eq 'top'; |
die "No such index attribute: '$attr'" unless $attr eq 'top'; |
675 |
|
|
676 |
return delete $self->{reorg} if $value == 0; |
return delete $self->{reorg} if $value == 0; |
677 |
|
|
729 |
} |
} |
730 |
} |
} |
731 |
|
|
732 |
|
sub keys { |
733 |
|
my $self = shift; |
734 |
|
|
735 |
|
defined $self->{db} or $self->open; |
736 |
|
keys %{$self->{db}}; |
737 |
|
} |
738 |
|
|
739 |
1; |
1; |
740 |
|
|