4 |
# Author : Ulrich Pfeifer |
# Author : Ulrich Pfeifer |
5 |
# Created On : Thu Aug 8 13:05:10 1996 |
# Created On : Thu Aug 8 13:05:10 1996 |
6 |
# Last Modified By: Ulrich Pfeifer |
# Last Modified By: Ulrich Pfeifer |
7 |
# Last Modified On: Tue May 9 08:33:28 2000 |
# Last Modified On: Sun Nov 12 14:40:21 2000 |
8 |
# Language : CPerl |
# Language : CPerl |
9 |
# |
# |
10 |
# (C) Copyright 1996-2000, Ulrich Pfeifer |
# (C) Copyright 1996-2000, Ulrich Pfeifer |
16 |
use Fcntl; |
use Fcntl; |
17 |
use WAIT::Filter; |
use WAIT::Filter; |
18 |
use Carp; |
use Carp; |
19 |
use vars qw(%FUNC); |
use vars qw(%FUNC $VERSION); |
20 |
|
|
21 |
my $O = pack('C', 0xff)."o"; # occurances (document ferquency) |
$VERSION = "1.801"; # others test if we are loaded by checking $VERSION |
22 |
|
|
23 |
# The document frequency is the number of documents a term occurs |
# The dictionary has three different key types: |
24 |
# in. The idea is that a term occuring in a significant part of the |
# 'o'.$word |
25 |
# documents is not too significant. |
# |
26 |
|
# The document frequency is the number of documents a term occurs |
27 |
|
# in. The idea is that a term occuring in a significant part of the |
28 |
|
# documents is not too significant. |
29 |
|
# |
30 |
|
# 'm'.$word |
31 |
|
# |
32 |
|
# The maximum term frequency of a document is the frequency of the |
33 |
|
# most frequent term in the document. It is related to the document |
34 |
|
# length obviously. A document in which the most frequnet term occurs |
35 |
|
# 100 times is probably much longer than a document whichs most |
36 |
|
# frequent term occurs five time. |
37 |
|
# |
38 |
|
# 'p'.$word |
39 |
|
# |
40 |
|
# Under this key we store the actual posting list as pairs of |
41 |
|
# packed integers. |
42 |
|
|
43 |
my $M = pack('C', 0xff)."m"; # maxtf (term frequency) |
my $no_old_index_support = 0; # do not check for old indices if set |
|
|
|
|
# The maximum term frequency of a document is the frequency of the |
|
|
# most frequent term in the document. It is related to the document |
|
|
# length obviously. A document in which the most frequnet term occurs |
|
|
# 100 times is probably much longer than a document whichs most |
|
|
# frequent term occurs five time. |
|
44 |
|
|
45 |
sub new { |
sub new { |
46 |
my $type = shift; |
my $type = shift; |
144 |
} |
} |
145 |
} |
} |
146 |
|
|
147 |
|
sub is_an_old_index { |
148 |
|
my $self = shift; |
149 |
|
|
150 |
|
return 0 if $no_old_index_support; |
151 |
|
return $self->{old_index} if exists $self->{old_index}; |
152 |
|
|
153 |
|
# We can only guess if this is an old index. We lookup the first 10 |
154 |
|
# $O entries. If all values are integers, we assume that the index |
155 |
|
# is an old one. |
156 |
|
|
157 |
|
defined $self->{db} or $self->open; |
158 |
|
$self->sync; |
159 |
|
my $dbh = $self->{dbh}; # for convenience |
160 |
|
|
161 |
|
my $O = pack('C', 0xff)."o"; |
162 |
|
my ($word, $value) = ($O.$;); # $word and $value are modified! |
163 |
|
$dbh->seq($word, $value, R_CURSOR) or return $self->{old_index} = 0; |
164 |
|
for (my $i=0; $i<10;$i++) { |
165 |
|
if ($value !~ /^\d+$/) { |
166 |
|
return $self->{old_index} = 0; |
167 |
|
} |
168 |
|
if ($dbh->seq($word, $value, R_NEXT) or # no values left |
169 |
|
$word !~ /^$O$;/o # no $O values left |
170 |
|
) { |
171 |
|
# we are not sure enough that this is an old index |
172 |
|
return $self->{old_index} = 0; |
173 |
|
} |
174 |
|
} |
175 |
|
return $self->{old_index} = 1; |
176 |
|
} |
177 |
|
|
178 |
sub open { |
sub open { |
179 |
my $self = shift; |
my $self = shift; |
180 |
my $file = $self->{file}; |
my $file = $self->{file}; |
191 |
$self->{cdict} = {} |
$self->{cdict} = {} |
192 |
if $self->{mode} & O_RDWR; |
if $self->{mode} & O_RDWR; |
193 |
$self->{cached} = 0; |
$self->{cached} = 0; |
194 |
|
if (!$no_old_index_support and $self->is_an_old_index()) { |
195 |
|
warn "This is an old index, upgrade you database"; |
196 |
|
require WAIT::InvertedIndexOld; |
197 |
|
bless $self, 'WAIT::InvertedIndexOld'; |
198 |
|
} |
199 |
} |
} |
200 |
} |
} |
201 |
|
|
210 |
$self->{records}++; |
$self->{records}++; |
211 |
while (($word, $noc) = each %occ) { |
while (($word, $noc) = each %occ) { |
212 |
if (defined $self->{cache}->{$word}) { |
if (defined $self->{cache}->{$word}) { |
213 |
$self->{cdict}->{$O,$word}++; |
$self->{cdict}->{$word}++; |
214 |
$self->{cache}->{$word} .= pack 'w2', $key, $noc; |
$self->{cache}->{$word} .= pack 'w2', $key, $noc; |
215 |
} else { |
} else { |
216 |
$self->{cdict}->{$O,$word} = 1; |
$self->{cdict}->{$word} = 1; |
217 |
$self->{cache}->{$word} = pack 'w2', $key, $noc; |
$self->{cache}->{$word} = pack 'w2', $key, $noc; |
218 |
} |
} |
219 |
$self->{cached}++; |
$self->{cached}++; |
224 |
for (values %occ) { |
for (values %occ) { |
225 |
$maxtf = $_ if $_ > $maxtf; |
$maxtf = $_ if $_ > $maxtf; |
226 |
} |
} |
227 |
$self->{db}->{$M, $key} = $maxtf; |
$self->{db}->{'m'. $key} = $maxtf; |
228 |
} |
} |
229 |
|
|
230 |
# We sort postings by increasing max term frequency (~ by increasing |
# We sort postings by increasing max term frequency (~ by increasing |
249 |
# inverse document frequence gives the score for a term. This sort |
# inverse document frequence gives the score for a term. This sort |
250 |
# order can be exploited for tuning of single term queries. |
# order can be exploited for tuning of single term queries. |
251 |
|
|
252 |
for my $did (sort { $post->{$b} / $self->{db}->{$M, $b} |
for my $did (keys %$post) { # sanity check |
253 |
|
unless ($self->{db}->{"m". $did}) { |
254 |
|
warn "Warning from WAIT: DIVZERO threat from did[$did] post[$post->{$did}]"; |
255 |
|
$self->{db}->{"m". $did} = 1; # fails if we have not opened for writing |
256 |
|
} |
257 |
|
} |
258 |
|
for my $did (sort { $post->{$b} / $self->{db}->{'m'. $b} |
259 |
<=> |
<=> |
260 |
$post->{$a} / $self->{db}->{$M, $a} |
$post->{$a} / $self->{db}->{'m'. $a} |
261 |
} keys %$post) { |
} keys %$post) { |
262 |
$r .= pack 'w2', $did, $post->{$did}; |
$r .= pack 'w2', $did, $post->{$did}; |
263 |
} |
} |
282 |
|
|
283 |
grep $occ{$_}++, &{$self->{func}}(@_); |
grep $occ{$_}++, &{$self->{func}}(@_); |
284 |
|
|
285 |
|
# Be prepared for "Odd number of elements in hash assignment" |
286 |
|
local $SIG{__WARN__} = sub { |
287 |
|
my $warning = shift; |
288 |
|
chomp $warning; |
289 |
|
warn "Catching warning[$warning] during delete of key[$key]"; |
290 |
|
}; |
291 |
for (keys %occ) {# may reorder posting list |
for (keys %occ) {# may reorder posting list |
292 |
my %post = unpack 'w*', $db->{$_}; |
my %post = unpack 'w*', $db->{'p'.$_}; |
293 |
delete $post{$key}; |
delete $post{$key}; |
294 |
$db->{$_} = $self->sort_postings(\%post); |
$db->{'p'.$_} = $self->sort_postings(\%post); |
295 |
_complain('delete of term', $_) if $db->{$O,$_}-1 != keys %post; |
_complain('delete of term', $_) if $db->{'o'.$_}-1 != keys %post; |
296 |
$db->{$O,$_} = scalar keys %post; |
$db->{'o'.$_} = scalar keys %post; |
297 |
} |
} |
298 |
delete $db->{$M, $key}; |
delete $db->{'m'. $key}; |
299 |
} |
} |
300 |
|
|
301 |
sub intervall { |
sub intervall { |
318 |
($first) = &{$self->{'ifunc'}}($first) if $first; |
($first) = &{$self->{'ifunc'}}($first) if $first; |
319 |
($last) = &{$self->{'ifunc'}}($last) if $last; |
($last) = &{$self->{'ifunc'}}($last) if $last; |
320 |
} |
} |
321 |
if (defined $first and $first ne '') { # set the cursor to $first |
$first = 'p'.($first||''); |
322 |
$dbh->seq($first, $value, R_CURSOR); |
$last = (defined $last)?'p'.$last:'q'; |
323 |
} else { |
|
324 |
$dbh->seq($first, $value, R_FIRST); |
# set the cursor to $first |
325 |
} |
$dbh->seq($first, $value, R_CURSOR); |
326 |
# We assume that word do not start with the character \377 |
|
327 |
# $last = pack 'C', 0xff unless defined $last and $last ne ''; |
# $first would be after the last word |
328 |
return () if defined $last and $first gt $last; # $first would be after the last word |
return () if $first gt $last; |
329 |
|
|
330 |
push @result, $first; |
push @result, substr($first,1); |
331 |
while (!$dbh->seq($word, $value, R_NEXT)) { |
while (!$dbh->seq($word, $value, R_NEXT)) { |
332 |
# We should limit this to a "resonable" number of words |
# We should limit this to a "resonable" number of words |
333 |
last if (defined $last and $word gt $last) or $word =~ /^($M|$O)/o; |
last if $word gt $last; |
334 |
push @result, $word; |
push @result, substr($word,1); |
335 |
} |
} |
336 |
\@result; # speed |
\@result; # speed |
337 |
} |
} |
356 |
($prefix) = &{$self->{'pfunc'}}($prefix); |
($prefix) = &{$self->{'pfunc'}}($prefix); |
357 |
} |
} |
358 |
|
|
359 |
if ($dbh->seq($word = $prefix, $value, R_CURSOR)) { |
if ($dbh->seq($word = 'p'.$prefix, $value, R_CURSOR)) { |
360 |
return (); |
return (); |
361 |
} |
} |
362 |
return () if $word !~ /^$prefix/; |
return () if $word !~ /^p$prefix/; |
363 |
push @result, $word; |
push @result, substr($word,1); |
364 |
|
|
365 |
while (!$dbh->seq($word, $value, R_NEXT)) { |
while (!$dbh->seq($word, $value, R_NEXT)) { |
366 |
# We should limit this to a "resonable" number of words |
# We should limit this to a "resonable" number of words |
367 |
last if $word !~ /^$prefix/; |
last if $word !~ /^p$prefix/; |
368 |
push @result, $word; |
push @result, substr($word,1); |
369 |
} |
} |
370 |
\@result; # speed |
\@result; # speed |
371 |
} |
} |
461 |
# We keep duplicates |
# We keep duplicates |
462 |
my @terms = |
my @terms = |
463 |
# Sort words by decreasing document frequency |
# Sort words by decreasing document frequency |
464 |
sort { $self->{db}->{$O,$a} <=> $self->{db}->{$O,$b} } |
sort { $self->{db}->{'o'.$a} <=> $self->{db}->{'o'.$b} } |
465 |
# check which words occur in the index. |
# check which words occur in the index. |
466 |
grep { $self->{db}->{$O,$_} } @_; |
grep { $self->{db}->{'o'.$_} } @_; |
467 |
|
|
468 |
return () unless @terms; # nothing to search for |
return () unless @terms; # nothing to search for |
469 |
|
|
471 |
# choping off the rest of the list will return the same ranking. |
# choping off the rest of the list will return the same ranking. |
472 |
if ($wanted and @terms == 1) { |
if ($wanted and @terms == 1) { |
473 |
my $term = shift @terms; |
my $term = shift @terms; |
474 |
my $idf = log($self->{records}/$self->{db}->{$O,$term}); |
my $idf = log($self->{records}/$self->{db}->{'o'.$term}); |
475 |
my @res; |
my @res; |
476 |
|
|
477 |
if ($self->{reorg}) { # or not $query->{picky} |
if ($self->{reorg}) { # or not $query->{picky} |
478 |
@res = unpack "w". int(2*$wanted), $self->{db}->{$term}; |
@res = unpack "w". int(2*$wanted), $self->{db}->{'p'.$term}; |
479 |
} else { |
} else { |
480 |
@res = unpack 'w*', $self->{db}->{$term}; |
@res = unpack 'w*', $self->{db}->{'p'.$term}; |
481 |
} |
} |
482 |
|
|
483 |
for (my $i=1; $i<@res; $i+=2) { |
for (my $i=1; $i<@res; $i+=2) { |
484 |
$res[$i] /= $self->{db}->{$M, $res[$i-1]} / $idf; |
# $res[$i] /= $self->{db}->{'m'. $res[$i-1]} / $idf; |
485 |
|
# above was written badly, allows two DIV_ZERO problems. |
486 |
|
my $maxtf = $self->{db}->{"m". $res[$i-1]}; |
487 |
|
unless ($maxtf) { |
488 |
|
warn "WAIT-Warning: Averting DIVZERO for i[$i] \$res[\$i-1][$res[$i-1]] term[$term]"; |
489 |
|
$maxtf = 1; |
490 |
|
} |
491 |
|
$res[$i] = ($res[$i] / $maxtf) * $idf; |
492 |
} |
} |
493 |
|
|
494 |
return @res |
return @res |
499 |
# result. |
# result. |
500 |
unless ($wanted) { |
unless ($wanted) { |
501 |
for (@terms) { |
for (@terms) { |
502 |
my $df = $self->{db}->{$O,$_}; |
my $df = $self->{db}->{'o'.$_}; |
503 |
|
|
504 |
# The frequency *must* be 1 at least since the posting list is nonempty |
# The frequency *must* be 1 at least since the posting list is nonempty |
505 |
_complain('search for term', $_) and $df = 1 if $df < 1; |
_complain('search for term', $_) and $df = 1 if $df < 1; |
506 |
|
|
507 |
# Unpack posting list for current query term $_ |
# Unpack posting list for current query term $_ |
508 |
my %post = unpack 'w*', $self->{db}->{$_}; |
my %post = unpack 'w*', $self->{db}->{'p'.$_}; |
509 |
|
|
510 |
_complain('search for term', $_) if $self->{db}->{$O,$_} != keys %post; |
_complain('search for term', $_) if $self->{db}->{'o'.$_} != keys %post; |
511 |
# This is the inverse document frequency. The log of the inverse |
# This is the inverse document frequency. The log of the inverse |
512 |
# fraction of documents the term occurs in. |
# fraction of documents the term occurs in. |
513 |
my $idf = log($self->{records}/$df); |
my $idf = log($self->{records}/$df); |
514 |
for my $did (keys %post) { |
for my $did (keys %post) { |
515 |
if (my $freq = $self->{db}->{$M, $did}) { |
if (my $freq = $self->{db}->{'m'. $did}) { |
516 |
$score{$did} += $post{$did} / $freq * $idf; |
$score{$did} += $post{$did} / $freq * $idf; |
517 |
} |
} |
518 |
} |
} |
525 |
unless ($strict) { |
unless ($strict) { |
526 |
for (@terms) { |
for (@terms) { |
527 |
# Unpack posting list for current query term $_ |
# Unpack posting list for current query term $_ |
528 |
my %post = unpack 'w*', $self->{db}->{$_}; |
my %post = unpack 'w*', $self->{db}->{'p'.$_}; |
529 |
|
|
530 |
# Lookup the number of documents the term occurs in (document frequency) |
# Lookup the number of documents the term occurs in (document frequency) |
531 |
my $occ = $self->{db}->{$O,$_}; |
my $occ = $self->{db}->{'o'.$_}; |
532 |
|
|
533 |
_complain('search for term', $_) if $self->{db}->{$O,$_} != keys %post; |
_complain('search for term', $_) if $self->{db}->{'o'.$_} != keys %post; |
534 |
# The frequency *must* be 1 at least since the posting list is nonempty |
# The frequency *must* be 1 at least since the posting list is nonempty |
535 |
_complain('search for term', $_) and $occ = 1 if $occ < 1; |
_complain('search for term', $_) and $occ = 1 if $occ < 1; |
536 |
|
|
553 |
|
|
554 |
if (keys %score < $wanted) { |
if (keys %score < $wanted) { |
555 |
for my $did (keys %post) { |
for my $did (keys %post) { |
556 |
if (my $freq = $self->{db}->{$M, $did}) { |
if (my $freq = $self->{db}->{'m'. $did}) { |
557 |
$score{$did} += $post{$did} / $freq * $idf; |
$score{$did} += $post{$did} / $freq * $idf; |
558 |
} |
} |
559 |
} |
} |
560 |
} else { |
} else { |
561 |
for my $did (keys %score) { |
for my $did (keys %score) { |
562 |
next unless exists $post{$did}; |
next unless exists $post{$did}; |
563 |
if (my $freq = $self->{db}->{$M, $did}) { |
if (my $freq = $self->{db}->{'m'. $did}) { |
564 |
$score{$did} += $post{$did} / $freq * $idf; |
$score{$did} += $post{$did} / $freq * $idf; |
565 |
} |
} |
566 |
} |
} |
580 |
for (my $i = $#terms; $i >=0; $i--) { |
for (my $i = $#terms; $i >=0; $i--) { |
581 |
local $_ = $terms[$i]; |
local $_ = $terms[$i]; |
582 |
# Lookup the number of documents the term occurs in (document frequency) |
# Lookup the number of documents the term occurs in (document frequency) |
583 |
my $df = $self->{db}->{$O,$_}; |
my $df = $self->{db}->{'o'.$_}; |
584 |
|
|
585 |
# The frequency *must* be 1 at least since the posting list is nonempty |
# The frequency *must* be 1 at least since the posting list is nonempty |
586 |
_complain('search for term', $_) and $df = 1 if $df < 1; |
_complain('search for term', $_) and $df = 1 if $df < 1; |
591 |
|
|
592 |
my ($did,$occ); |
my ($did,$occ); |
593 |
if ($self->{reorg}) { |
if ($self->{reorg}) { |
594 |
($did,$occ) = unpack 'w2', $self->{db}->{$_}; |
($did,$occ) = unpack 'w2', $self->{db}->{'p'.$_}; |
595 |
} else { # Maybe this costs more than it helps |
} else { # Maybe this costs more than it helps |
596 |
($did,$occ) = unpack 'w2', $self->sort_postings($self->{db}->{$_}); |
($did,$occ) = unpack 'w2', $self->sort_postings($self->{db}->{'p'.$_}); |
597 |
} |
} |
598 |
my $freq = $self->{db}->{$M, $did}; |
my $freq = $self->{db}->{'m'. $did}; |
599 |
my $max = $occ/$freq*$idf[$i]; |
my $max = $occ/$freq*$idf[$i]; |
600 |
$max[$i] = $max + $max[$i+1]; |
$max[$i] = $max + $max[$i+1]; |
601 |
} |
} |
605 |
my $term = $terms[$i]; |
my $term = $terms[$i]; |
606 |
# Unpack posting list for current query term $term. We loose the |
# Unpack posting list for current query term $term. We loose the |
607 |
# sorting order because the assignment to a hash. |
# sorting order because the assignment to a hash. |
608 |
my %post = unpack 'w*', $self->{db}->{$term}; |
my %post = unpack 'w*', $self->{db}->{'p'.$term}; |
609 |
|
|
610 |
_complain('search for term', $term) |
_complain('search for term', $term) |
611 |
if $self->{db}->{$O,$term} != keys %post; |
if $self->{db}->{'o'.$term} != keys %post; |
612 |
|
|
613 |
my $idf = $idf[$i]; |
my $idf = $idf[$i]; |
614 |
my $full; # Need to process all postings |
my $full; # Need to process all postings |
637 |
if (defined $chop) { |
if (defined $chop) { |
638 |
# We might be able to avoid allocating accumulators |
# We might be able to avoid allocating accumulators |
639 |
for my $did (keys %post) { |
for my $did (keys %post) { |
640 |
if (my $freq = $self->{db}->{$M, $did}) { |
if (my $freq = $self->{db}->{'m'. $did}) { |
641 |
my $wgt = $post{$did} / $freq * $idf; |
my $wgt = $post{$did} / $freq * $idf; |
642 |
# We add an accumulator if $wgt exeeds $chop |
# We add an accumulator if $wgt exeeds $chop |
643 |
if (exists $score{$did} or $wgt > $chop) { |
if (exists $score{$did} or $wgt > $chop) { |
648 |
} else { |
} else { |
649 |
# Allocate acumulators for each seen document. |
# Allocate acumulators for each seen document. |
650 |
for my $did (keys %post) { |
for my $did (keys %post) { |
651 |
if (my $freq = $self->{db}->{$M, $did}) { |
if (my $freq = $self->{db}->{'m'. $did}) { |
652 |
$score{$did} += $post{$did} / $freq * $idf; |
$score{$did} += $post{$did} / $freq * $idf; |
653 |
} |
} |
654 |
} |
} |
657 |
# Update existing accumulators |
# Update existing accumulators |
658 |
for my $did (keys %score) { |
for my $did (keys %score) { |
659 |
next unless exists $post{$did}; |
next unless exists $post{$did}; |
660 |
if (my $freq = $self->{db}->{$M, $did}) { |
if (my $freq = $self->{db}->{'m'. $did}) { |
661 |
$score{$did} += $post{$did} / $freq * $idf; |
$score{$did} += $post{$did} / $freq * $idf; |
662 |
} |
} |
663 |
} |
} |
680 |
|
|
681 |
$self->sync; |
$self->sync; |
682 |
while (my($key, $value) = each %{$self->{db}}) { |
while (my($key, $value) = each %{$self->{db}}) { |
683 |
next if $key =~ /^\377[om]/; |
next if $key !~ /^p/; |
684 |
$self->{db}->{$key} = $self->sort_postings($value); |
$self->{db}->{$key} = $self->sort_postings($value); |
685 |
} |
} |
686 |
$self->{reorg} = 1; |
$self->{reorg} = 1; |
692 |
if ($self->{mode} & O_RDWR) { |
if ($self->{mode} & O_RDWR) { |
693 |
print STDERR "Flushing $self->{cached} postings\n" if $self->{cached}; |
print STDERR "Flushing $self->{cached} postings\n" if $self->{cached}; |
694 |
while (my($key, $value) = each %{$self->{cache}}) { |
while (my($key, $value) = each %{$self->{cache}}) { |
695 |
|
$self->{db}->{"p". $key} ||= ""; |
696 |
if ($self->{reorg}) { |
if ($self->{reorg}) { |
697 |
$self->{db}->{$key} = $self->sort_postings($self->{db}->{$key} |
$self->{db}->{'p'.$key} = $self->sort_postings($self->{db}->{'p'.$key} |
698 |
. $value); |
. $value); |
699 |
} else { |
} else { |
700 |
$self->{db}->{$key} .= $value; |
$self->{db}->{'p'.$key} .= $value; |
701 |
} |
} |
702 |
} |
} |
703 |
while (my($key, $value) = each %{$self->{cdict}}) { |
while (my($key, $value) = each %{$self->{cdict}}) { |
704 |
$self->{db}->{$key} = 0 unless $self->{db}->{$key}; |
$self->{db}->{'o'.$key} = 0 unless $self->{db}->{'o'.$key}; |
705 |
$self->{db}->{$key} += $value; |
$self->{db}->{'o'.$key} += $value; |
706 |
} |
} |
707 |
$self->{cache} = {}; |
$self->{cache} = {}; |
708 |
$self->{cdict} = {}; |
$self->{cdict} = {}; |