4 |
# Author : Ulrich Pfeifer |
# Author : Ulrich Pfeifer |
5 |
# Created On : Thu Aug 8 13:05:10 1996 |
# Created On : Thu Aug 8 13:05:10 1996 |
6 |
# Last Modified By: Ulrich Pfeifer |
# Last Modified By: Ulrich Pfeifer |
7 |
# Last Modified On: Sun Nov 12 14:40:21 2000 |
# Last Modified On: Sat Apr 20 16:56:29 2002 |
8 |
# Language : CPerl |
# Language : CPerl |
9 |
# |
# |
10 |
# (C) Copyright 1996-2000, Ulrich Pfeifer |
# (C) Copyright 1996-2002, Ulrich Pfeifer |
11 |
# |
# |
12 |
|
|
13 |
package WAIT::InvertedIndex; |
package WAIT::InvertedIndex; |
18 |
use Carp; |
use Carp; |
19 |
use vars qw(%FUNC $VERSION); |
use vars qw(%FUNC $VERSION); |
20 |
|
|
21 |
$VERSION = "1.801"; # others test if we are loaded by checking $VERSION |
$VERSION = "1.900"; # others test if we are loaded by checking $VERSION |
22 |
|
|
23 |
# The dictionary has three different key types: |
# The dictionary has three different key types: |
24 |
# 'o'.$word |
# 'o'.$word |
25 |
# |
# |
26 |
# The document frequency is the number of documents a term occurs |
# The document frequency is the number of documents a term occurs |
27 |
# in. The idea is that a term occuring in a significant part of the |
# in. The idea is that a term occuring in a significant portion of the |
28 |
# documents is not too significant. |
# documents is not too significant. |
29 |
# |
# |
30 |
# 'm'.$word |
# 'm'.$word |
31 |
# |
# |
32 |
# The maximum term frequency of a document is the frequency of the |
# The maximum term frequency of a document is the frequency of the |
33 |
# most frequent term in the document. It is related to the document |
# most frequent term in the document. It is related to the document |
34 |
# length obviously. A document in which the most frequnet term occurs |
# length obviously. A document in which the most frequent term occurs |
35 |
# 100 times is probably much longer than a document whichs most |
# 100 times is probably much longer than a document whichs most |
36 |
# frequent term occurs five time. |
# frequent term occurs five time. |
37 |
# |
# |
156 |
|
|
157 |
defined $self->{db} or $self->open; |
defined $self->{db} or $self->open; |
158 |
$self->sync; |
$self->sync; |
159 |
my $dbh = $self->{dbh}; # for convenience |
my $dbh = $self->{dbh} or return $self->{old_index} = 0; # for convenience |
160 |
|
|
161 |
my $O = pack('C', 0xff)."o"; |
my $O = pack('C', 0xff)."o"; |
162 |
my ($word, $value) = ($O.$;); # $word and $value are modified! |
my ($word, $value) = ($O.$;); # $word and $value are modified by seq! |
163 |
$dbh->seq($word, $value, R_CURSOR) or return $self->{old_index} = 0; |
if ( my $ret = $dbh->seq($word, $value, R_CURSOR) ) { |
164 |
|
# warn "DEBUG: ret[$ret], not an old index, either empty or no \$^O"; |
165 |
|
return $self->{old_index} = 0; |
166 |
|
} |
167 |
for (my $i=0; $i<10;$i++) { |
for (my $i=0; $i<10;$i++) { |
168 |
if ($value !~ /^\d+$/) { |
if ($value !~ /^\d+$/) { |
169 |
|
# warn "DEBUG: word[$word]value[$value], not an old index"; |
170 |
return $self->{old_index} = 0; |
return $self->{old_index} = 0; |
171 |
} |
} |
172 |
if ($dbh->seq($word, $value, R_NEXT) or # no values left |
if (my $ret = $dbh->seq($word, $value, R_NEXT) or # no values left |
173 |
$word !~ /^$O$;/o # no $O values left |
$word !~ /^$O$;/o # no $O values left |
174 |
) { |
) { |
175 |
# we are not sure enough that this is an old index |
# we are not sure enough that this is an old index |
176 |
|
# warn "DEBUG: ret[$ret]word[$word]value[$value], not an old index"; |
177 |
return $self->{old_index} = 0; |
return $self->{old_index} = 0; |
178 |
} |
} |
179 |
} |
} |
180 |
|
# warn "DEBUG: old index"; |
181 |
return $self->{old_index} = 1; |
return $self->{old_index} = 1; |
182 |
} |
} |
183 |
|
|
251 |
my $r = ''; |
my $r = ''; |
252 |
|
|
253 |
# Sort posting list by increasing ratio of maximum term frequency (~ |
# Sort posting list by increasing ratio of maximum term frequency (~ |
254 |
# "document length") and term frequency. This rati multipied by the |
# "document length") and term frequency. This ratio multipied by the |
255 |
# inverse document frequence gives the score for a term. This sort |
# inverse document frequence gives the score for a term. This sort |
256 |
# order can be exploited for tuning of single term queries. |
# order can be exploited for tuning of single term queries. |
257 |
|
|
411 |
|
|
412 |
defined $self->{db} or $self->open; |
defined $self->{db} or $self->open; |
413 |
$self->sync; |
$self->sync; |
414 |
$self->search_raw($query, &{$self->{func}}(@_)); # No call to parse() here |
$self->search_raw($query, &{$self->{func}}(@_)); # No call to parse() there |
415 |
} |
} |
416 |
|
|
417 |
sub parse { |
sub parse { |
421 |
&{$self->{func}}(@_); |
&{$self->{func}}(@_); |
422 |
} |
} |
423 |
|
|
|
sub keys { |
|
|
my $self = shift; |
|
|
|
|
|
defined $self->{db} or $self->open; |
|
|
keys %{$self->{db}}; |
|
|
} |
|
|
|
|
424 |
sub search_prefix { |
sub search_prefix { |
425 |
my $self = shift; |
my $self = shift; |
426 |
|
|
464 |
# check which words occur in the index. |
# check which words occur in the index. |
465 |
grep { $self->{db}->{'o'.$_} } @_; |
grep { $self->{db}->{'o'.$_} } @_; |
466 |
|
|
467 |
return () unless @terms; # nothing to search for |
return unless @terms; |
468 |
|
|
469 |
# We special-case one term queries here. If the index was sorted, |
# We special-case one term queries here. If the index was sorted, |
470 |
# choping off the rest of the list will return the same ranking. |
# choping off the rest of the list will return the same ranking. |
613 |
my $full; # Need to process all postings |
my $full; # Need to process all postings |
614 |
my $chop; # Score necessary to enter the ranking list |
my $chop; # Score necessary to enter the ranking list |
615 |
|
|
616 |
if (# We know that wanted is true since we especial cased the |
if (# We know that wanted is true since we special cased the |
617 |
# exhaustive search. |
# exhaustive search. |
618 |
|
|
619 |
$wanted and |
$wanted and |
727 |
} |
} |
728 |
} |
} |
729 |
|
|
730 |
|
sub keys { |
731 |
|
my $self = shift; |
732 |
|
|
733 |
|
defined $self->{db} or $self->open; |
734 |
|
keys %{$self->{db}}; |
735 |
|
} |
736 |
|
|
737 |
1; |
1; |
738 |
|
|