1 |
# -*- Mode: Cperl -*- |
# -*- Mode: Perl -*- |
2 |
# InvertedIndex.pm -- |
# $Basename: InvertedIndex.pm $ |
3 |
# ITIID : $ITI$ $Header $__Header$ |
# $Revision: 1.30 $ |
4 |
# Author : Ulrich Pfeifer |
# Author : Ulrich Pfeifer |
5 |
# Created On : Thu Aug 8 13:05:10 1996 |
# Created On : Thu Aug 8 13:05:10 1996 |
6 |
# Last Modified By: Ulrich Pfeifer |
# Last Modified By: Ulrich Pfeifer |
7 |
# Last Modified On: Sun Nov 22 18:44:42 1998 |
# Last Modified On: Tue May 9 08:33:28 2000 |
8 |
# Language : CPerl |
# Language : CPerl |
9 |
# Status : Unknown, Use with caution! |
# |
10 |
# |
# (C) Copyright 1996-2000, Ulrich Pfeifer |
11 |
# Copyright (c) 1996-1997, Ulrich Pfeifer |
# |
|
# |
|
12 |
|
|
13 |
package WAIT::InvertedIndex; |
package WAIT::InvertedIndex; |
14 |
use strict; |
use strict; |
18 |
use Carp; |
use Carp; |
19 |
use vars qw(%FUNC); |
use vars qw(%FUNC); |
20 |
|
|
21 |
my $O = pack('C', 0xff)."o"; # occurances |
my $O = pack('C', 0xff)."o"; # occurances (document ferquency) |
22 |
my $M = pack('C', 0xff)."m"; # maxtf |
|
23 |
|
# The document frequency is the number of documents a term occurs |
24 |
|
# in. The idea is that a term occuring in a significant part of the |
25 |
|
# documents is not too significant. |
26 |
|
|
27 |
|
my $M = pack('C', 0xff)."m"; # maxtf (term frequency) |
28 |
|
|
29 |
|
# The maximum term frequency of a document is the frequency of the |
30 |
|
# most frequent term in the document. It is related to the document |
31 |
|
# length obviously. A document in which the most frequnet term occurs |
32 |
|
# 100 times is probably much longer than a document whichs most |
33 |
|
# frequent term occurs five time. |
34 |
|
|
35 |
sub new { |
sub new { |
36 |
my $type = shift; |
my $type = shift; |
145 |
eval sprintf("sub {grep /./, %s}", _filtergen(@{$self->{filter}})); |
eval sprintf("sub {grep /./, %s}", _filtergen(@{$self->{filter}})); |
146 |
$self->{dbh} = tie(%{$self->{db}}, 'DB_File', $file, |
$self->{dbh} = tie(%{$self->{db}}, 'DB_File', $file, |
147 |
$self->{mode}, 0664, $DB_BTREE); |
$self->{mode}, 0664, $DB_BTREE); |
|
# tie(%{$self->{cache}}, 'DB_File', undef, |
|
|
# $self->{mode}, 0664, $DB_BTREE) |
|
148 |
$self->{cache} = {} |
$self->{cache} = {} |
149 |
if $self->{mode} & O_RDWR; |
if $self->{mode} & O_RDWR; |
|
# tie(%{$self->{cdict}}, 'DB_File', undef, |
|
|
# $self->{mode}, 0664, $DB_BTREE) |
|
150 |
$self->{cdict} = {} |
$self->{cdict} = {} |
151 |
if $self->{mode} & O_RDWR; |
if $self->{mode} & O_RDWR; |
152 |
$self->{cached} = 0; |
$self->{cached} = 0; |
172 |
} |
} |
173 |
$self->{cached}++; |
$self->{cached}++; |
174 |
} |
} |
175 |
|
# This cache limit should be configurable |
176 |
$self->sync if $self->{cached} > 100_000; |
$self->sync if $self->{cached} > 100_000; |
177 |
my $maxtf = 0; |
my $maxtf = 0; |
178 |
for (values %occ) { |
for (values %occ) { |
181 |
$self->{db}->{$M, $key} = $maxtf; |
$self->{db}->{$M, $key} = $maxtf; |
182 |
} |
} |
183 |
|
|
184 |
|
# We sort postings by increasing max term frequency (~ by increasing |
185 |
|
# document length. This reduces the quality degradation if we process |
186 |
|
# only the first part of a posting list. |
187 |
|
|
188 |
|
sub sort_postings { |
189 |
|
my $self = shift; |
190 |
|
my $post = shift; # reference to a hash or packed string |
191 |
|
|
192 |
|
if (ref $post) { |
193 |
|
# we skip the sort part, if the index is not sorted |
194 |
|
return pack('w*', %$post) unless $self->{reorg}; |
195 |
|
} else { |
196 |
|
$post = { unpack 'w*', $post }; |
197 |
|
} |
198 |
|
|
199 |
|
my $r = ''; |
200 |
|
|
201 |
|
# Sort posting list by increasing ratio of maximum term frequency (~ |
202 |
|
# "document length") and term frequency. This rati multipied by the |
203 |
|
# inverse document frequence gives the score for a term. This sort |
204 |
|
# order can be exploited for tuning of single term queries. |
205 |
|
|
206 |
|
for my $did (sort { $post->{$b} / $self->{db}->{$M, $b} |
207 |
|
<=> |
208 |
|
$post->{$a} / $self->{db}->{$M, $a} |
209 |
|
} keys %$post) { |
210 |
|
$r .= pack 'w2', $did, $post->{$did}; |
211 |
|
} |
212 |
|
#warn sprintf "reorg %d %s\n", scalar keys %$post, join ' ', unpack 'w*', $r; |
213 |
|
$r; |
214 |
|
} |
215 |
|
|
216 |
sub delete { |
sub delete { |
217 |
my $self = shift; |
my $self = shift; |
218 |
my $key = shift; |
my $key = shift; |
219 |
my %occ; |
my %occ; |
220 |
|
|
221 |
|
my $db; |
222 |
defined $self->{db} or $self->open; |
defined $self->{db} or $self->open; |
223 |
|
$db = $self->{db}; |
224 |
$self->sync; |
$self->sync; |
225 |
$self->{records}--; |
$self->{records}--; |
226 |
|
|
227 |
|
# less than zero documents in database? |
228 |
|
_complain('delete of document', $key) and $self->{records} = 0 |
229 |
|
if $self->{records} < 0; |
230 |
|
|
231 |
grep $occ{$_}++, &{$self->{func}}(@_); |
grep $occ{$_}++, &{$self->{func}}(@_); |
232 |
for (keys %occ) { |
|
233 |
# may reorder posting list |
for (keys %occ) {# may reorder posting list |
234 |
my %post = unpack 'w*', $self->{db}->{$_}; |
my %post = unpack 'w*', $db->{$_}; |
|
$self->{db}->{$O,$_}--; |
|
235 |
delete $post{$key}; |
delete $post{$key}; |
236 |
$self->{db}->{$_} = pack 'w*', %post; |
$db->{$_} = $self->sort_postings(\%post); |
237 |
|
_complain('delete of term', $_) if $db->{$O,$_}-1 != keys %post; |
238 |
|
$db->{$O,$_} = scalar keys %post; |
239 |
} |
} |
240 |
delete $self->{db}->{$M, $key}; |
delete $db->{$M, $key}; |
241 |
} |
} |
242 |
|
|
243 |
sub intervall { |
sub intervall { |
312 |
\@result; # speed |
\@result; # speed |
313 |
} |
} |
314 |
|
|
315 |
|
=head2 search($query) |
316 |
|
|
317 |
|
The search method supports a range of search algorithms. It is |
318 |
|
recommended to tune the index by calling |
319 |
|
C<$table-E<gt>set(top=E<gt>1)> B<after> bulk inserting the documents |
320 |
|
into the table. This is a computing intense operation and all inserts |
321 |
|
and deletes after this optimization are slightly more expensive. Once |
322 |
|
reorganized, the index is kept sorted automatically until you switch |
323 |
|
the optimization off by calling C<$table-E<gt>set(top=E<gt>0)>. |
324 |
|
|
325 |
|
When searching a tuned index, a query can be processed faster if the |
326 |
|
caller requests only the topmost documents. This can be done by |
327 |
|
passing a C<top =E<gt>> I<n> parameter to the search method. |
328 |
|
|
329 |
|
For single term queries, the method returns only the I<n> top ranking |
330 |
|
documents. For multi term queries two optimized algorithms are |
331 |
|
available. The first algorithm computes the top n documents |
332 |
|
approximately but very fast, sacrificing a little bit of precision for |
333 |
|
speed. The second algorithm computes the topmost I<n> documents |
334 |
|
precisely. This algorithm is slower and should be used only for small |
335 |
|
values of I<n>. It can be requested by passing the query attribute |
336 |
|
C<picky =E<gt> 1>. Both algorithms may return more than I<n> hits. |
337 |
|
While the picky version might not be faster than the brute force |
338 |
|
version on average for modest size databases it uses less memory and |
339 |
|
the processing time is almost linear in the number of query terms, not |
340 |
|
in the size of the lists. |
341 |
|
|
342 |
|
=cut |
343 |
|
|
344 |
sub search { |
sub search { |
345 |
my $self = shift; |
my $self = shift; |
346 |
|
my $query = shift; |
347 |
|
|
348 |
defined $self->{db} or $self->open; |
defined $self->{db} or $self->open; |
349 |
$self->sync; |
$self->sync; |
350 |
$self->search_raw(&{$self->{func}}(@_)); # No call to parse() here |
$self->search_raw($query, &{$self->{func}}(@_)); # No call to parse() here |
351 |
} |
} |
352 |
|
|
353 |
sub parse { |
sub parse { |
372 |
$self->search_raw(map($self->prefix($_), @_)); |
$self->search_raw(map($self->prefix($_), @_)); |
373 |
} |
} |
374 |
|
|
375 |
|
sub _complain ($$) { |
376 |
|
my ($action, $term) = @_; |
377 |
|
|
378 |
|
require Carp; |
379 |
|
Carp::cluck |
380 |
|
(sprintf("WAIT database inconsistency during $action [%s]: ". |
381 |
|
"Please rebuild index\n", |
382 |
|
$term,)); |
383 |
|
} |
384 |
|
|
385 |
sub search_raw { |
sub search_raw { |
386 |
my $self = shift; |
my $self = shift; |
387 |
my %occ; |
my $query = shift; |
388 |
my %score; |
my %score; |
389 |
|
|
390 |
return () unless @_; |
# Top $wanted documents must be correct. Zero means all matching |
391 |
|
# documents. |
392 |
|
my $wanted = $query->{top}; |
393 |
|
my $strict = $query->{picky}; |
394 |
|
|
395 |
|
# Return at least $minacc documents. Zero means all matching |
396 |
|
# documents. |
397 |
|
# my $minacc = $query->{accus} || $wanted; |
398 |
|
|
399 |
|
# Open index and flush cache if necessary |
400 |
defined $self->{db} or $self->open; |
defined $self->{db} or $self->open; |
401 |
$self->sync; |
$self->sync; |
402 |
grep $occ{$_}++, @_; |
|
403 |
for (keys %occ) { |
# We keep duplicates |
404 |
if (defined $self->{db}->{$_}) { |
my @terms = |
405 |
|
# Sort words by decreasing document frequency |
406 |
|
sort { $self->{db}->{$O,$a} <=> $self->{db}->{$O,$b} } |
407 |
|
# check which words occur in the index. |
408 |
|
grep { $self->{db}->{$O,$_} } @_; |
409 |
|
|
410 |
|
return () unless @terms; # nothing to search for |
411 |
|
|
412 |
|
# We special-case one term queries here. If the index was sorted, |
413 |
|
# choping off the rest of the list will return the same ranking. |
414 |
|
if ($wanted and @terms == 1) { |
415 |
|
my $term = shift @terms; |
416 |
|
my $idf = log($self->{records}/$self->{db}->{$O,$term}); |
417 |
|
my @res; |
418 |
|
|
419 |
|
if ($self->{reorg}) { # or not $query->{picky} |
420 |
|
@res = unpack "w". int(2*$wanted), $self->{db}->{$term}; |
421 |
|
} else { |
422 |
|
@res = unpack 'w*', $self->{db}->{$term}; |
423 |
|
} |
424 |
|
|
425 |
|
for (my $i=1; $i<@res; $i+=2) { |
426 |
|
$res[$i] /= $self->{db}->{$M, $res[$i-1]} / $idf; |
427 |
|
} |
428 |
|
|
429 |
|
return @res |
430 |
|
} |
431 |
|
|
432 |
|
# We separate exhaustive search here to avoid overhead and make the |
433 |
|
# code more readable. The block can be removed without changing the |
434 |
|
# result. |
435 |
|
unless ($wanted) { |
436 |
|
for (@terms) { |
437 |
|
my $df = $self->{db}->{$O,$_}; |
438 |
|
|
439 |
|
# The frequency *must* be 1 at least since the posting list is nonempty |
440 |
|
_complain('search for term', $_) and $df = 1 if $df < 1; |
441 |
|
|
442 |
|
# Unpack posting list for current query term $_ |
443 |
|
my %post = unpack 'w*', $self->{db}->{$_}; |
444 |
|
|
445 |
|
_complain('search for term', $_) if $self->{db}->{$O,$_} != keys %post; |
446 |
|
# This is the inverse document frequency. The log of the inverse |
447 |
|
# fraction of documents the term occurs in. |
448 |
|
my $idf = log($self->{records}/$df); |
449 |
|
for my $did (keys %post) { |
450 |
|
if (my $freq = $self->{db}->{$M, $did}) { |
451 |
|
$score{$did} += $post{$did} / $freq * $idf; |
452 |
|
} |
453 |
|
} |
454 |
|
} |
455 |
|
# warn sprintf "Used %d accumulators\n", scalar keys %score; |
456 |
|
return %score; |
457 |
|
} |
458 |
|
|
459 |
|
# A sloppy but fast algorithm for multiple term queries. |
460 |
|
unless ($strict) { |
461 |
|
for (@terms) { |
462 |
|
# Unpack posting list for current query term $_ |
463 |
my %post = unpack 'w*', $self->{db}->{$_}; |
my %post = unpack 'w*', $self->{db}->{$_}; |
464 |
my $idf = log($self->{records}/($self->{db}->{$O,$_} || 1)); |
|
465 |
my $did; |
# Lookup the number of documents the term occurs in (document frequency) |
466 |
for $did (keys %post) { |
my $occ = $self->{db}->{$O,$_}; |
467 |
$score{$did} = 0 unless defined $score{$did}; # perl -w |
|
468 |
$score{$did} += $post{$did} / $self->{db}->{$M, $did} * $idf |
_complain('search for term', $_) if $self->{db}->{$O,$_} != keys %post; |
469 |
if $self->{db}->{$M, $did}; # db may be broken |
# The frequency *must* be 1 at least since the posting list is nonempty |
470 |
|
_complain('search for term', $_) and $occ = 1 if $occ < 1; |
471 |
|
|
472 |
|
# This is the inverse document frequency. The log of the inverse |
473 |
|
# fraction of documents the term occurs in. |
474 |
|
my $idf = log($self->{records}/$occ); |
475 |
|
|
476 |
|
# If we have a reasonable number of accumulators, change the |
477 |
|
# loop to iterate over the accumulators. This will compromise |
478 |
|
# quality for better speed. The algorithm still computes the |
479 |
|
# exact weights, but the result is not guaranteed to contain the |
480 |
|
# *best* results. The database might contain documents better |
481 |
|
# than the worst returned document. |
482 |
|
|
483 |
|
# We process the lists in order of increasing length. When the |
484 |
|
# number of accumulators exceeds $wanted, no new documents are |
485 |
|
# added, only the ranking/weighting of the seen documents is |
486 |
|
# improved. The resulting ranking list must be pruned, since only |
487 |
|
# the top most documents end up near their "optimal" rank. |
488 |
|
|
489 |
|
if (keys %score < $wanted) { |
490 |
|
for my $did (keys %post) { |
491 |
|
if (my $freq = $self->{db}->{$M, $did}) { |
492 |
|
$score{$did} += $post{$did} / $freq * $idf; |
493 |
|
} |
494 |
|
} |
495 |
|
} else { |
496 |
|
for my $did (keys %score) { |
497 |
|
next unless exists $post{$did}; |
498 |
|
if (my $freq = $self->{db}->{$M, $did}) { |
499 |
|
$score{$did} += $post{$did} / $freq * $idf; |
500 |
|
} |
501 |
|
} |
502 |
|
} |
503 |
|
} |
504 |
|
return %score; |
505 |
|
} |
506 |
|
my @max; $max[$#terms+1]=0; |
507 |
|
my @idf; |
508 |
|
|
509 |
|
# Preparation loop. This extra loop makes sense only when "reorg" |
510 |
|
# and "wanted" are true. But at the time beeing, keeping the code |
511 |
|
# for the different search algorithms in one place seems more |
512 |
|
# desirable than some minor speedup of the brute force version. We |
513 |
|
# do cache $idf though. |
514 |
|
|
515 |
|
for (my $i = $#terms; $i >=0; $i--) { |
516 |
|
local $_ = $terms[$i]; |
517 |
|
# Lookup the number of documents the term occurs in (document frequency) |
518 |
|
my $df = $self->{db}->{$O,$_}; |
519 |
|
|
520 |
|
# The frequency *must* be 1 at least since the posting list is nonempty |
521 |
|
_complain('search for term', $_) and $df = 1 if $df < 1; |
522 |
|
|
523 |
|
# This is the inverse document frequency. The log of the inverse |
524 |
|
# fraction of documents the term occurs in. |
525 |
|
$idf[$i] = log($self->{records}/$df); |
526 |
|
|
527 |
|
my ($did,$occ); |
528 |
|
if ($self->{reorg}) { |
529 |
|
($did,$occ) = unpack 'w2', $self->{db}->{$_}; |
530 |
|
} else { # Maybe this costs more than it helps |
531 |
|
($did,$occ) = unpack 'w2', $self->sort_postings($self->{db}->{$_}); |
532 |
|
} |
533 |
|
my $freq = $self->{db}->{$M, $did}; |
534 |
|
my $max = $occ/$freq*$idf[$i]; |
535 |
|
$max[$i] = $max + $max[$i+1]; |
536 |
|
} |
537 |
|
|
538 |
|
# Main loop |
539 |
|
for my $i (0 .. $#terms) { |
540 |
|
my $term = $terms[$i]; |
541 |
|
# Unpack posting list for current query term $term. We loose the |
542 |
|
# sorting order because the assignment to a hash. |
543 |
|
my %post = unpack 'w*', $self->{db}->{$term}; |
544 |
|
|
545 |
|
_complain('search for term', $term) |
546 |
|
if $self->{db}->{$O,$term} != keys %post; |
547 |
|
|
548 |
|
my $idf = $idf[$i]; |
549 |
|
my $full; # Need to process all postings |
550 |
|
my $chop; # Score necessary to enter the ranking list |
551 |
|
|
552 |
|
if (# We know that wanted is true since we especial cased the |
553 |
|
# exhaustive search. |
554 |
|
|
555 |
|
$wanted and |
556 |
|
|
557 |
|
# We did sort here if necessary in |
558 |
|
# the preparation loop |
559 |
|
# $self->{reorg} and |
560 |
|
|
561 |
|
scalar keys %score > $wanted) { |
562 |
|
$chop = (sort { $b <=> $a } values %score)[$wanted]; |
563 |
|
$full = $max[$i] > $chop; |
564 |
|
} else { |
565 |
|
$full = 1; |
566 |
|
} |
567 |
|
|
568 |
|
if ($full) { |
569 |
|
# We need to inspect the full list. Either $wanted is not given, |
570 |
|
# the index is not sorted, or we don't have enough accumulators |
571 |
|
# yet. |
572 |
|
if (defined $chop) { |
573 |
|
# We might be able to avoid allocating accumulators |
574 |
|
for my $did (keys %post) { |
575 |
|
if (my $freq = $self->{db}->{$M, $did}) { |
576 |
|
my $wgt = $post{$did} / $freq * $idf; |
577 |
|
# We add an accumulator if $wgt exeeds $chop |
578 |
|
if (exists $score{$did} or $wgt > $chop) { |
579 |
|
$score{$did} += $wgt; |
580 |
|
} |
581 |
|
} |
582 |
|
} |
583 |
|
} else { |
584 |
|
# Allocate acumulators for each seen document. |
585 |
|
for my $did (keys %post) { |
586 |
|
if (my $freq = $self->{db}->{$M, $did}) { |
587 |
|
$score{$did} += $post{$did} / $freq * $idf; |
588 |
|
} |
589 |
|
} |
590 |
|
} |
591 |
|
} else { |
592 |
|
# Update existing accumulators |
593 |
|
for my $did (keys %score) { |
594 |
|
next unless exists $post{$did}; |
595 |
|
if (my $freq = $self->{db}->{$M, $did}) { |
596 |
|
$score{$did} += $post{$did} / $freq * $idf; |
597 |
|
} |
598 |
} |
} |
599 |
} |
} |
600 |
} |
} |
601 |
|
#warn sprintf "Used %d accumulators\n", scalar keys %score; |
602 |
%score; |
%score; |
603 |
} |
} |
604 |
|
|
605 |
|
sub set { |
606 |
|
my ($self, $attr, $value) = @_; |
607 |
|
|
608 |
|
die "No such indexy attribute: '$attr'" unless $attr eq 'top'; |
609 |
|
|
610 |
|
return delete $self->{reorg} if $value == 0; |
611 |
|
|
612 |
|
return if $self->{reorg}; # we are sorted already |
613 |
|
return unless $self->{mode} & O_RDWR; |
614 |
|
defined $self->{db} or $self->open; |
615 |
|
|
616 |
|
$self->sync; |
617 |
|
while (my($key, $value) = each %{$self->{db}}) { |
618 |
|
next if $key =~ /^\377[om]/; |
619 |
|
$self->{db}->{$key} = $self->sort_postings($value); |
620 |
|
} |
621 |
|
$self->{reorg} = 1; |
622 |
|
} |
623 |
|
|
624 |
sub sync { |
sub sync { |
625 |
my $self = shift; |
my $self = shift; |
626 |
|
|
627 |
if ($self->{mode} & O_RDWR) { |
if ($self->{mode} & O_RDWR) { |
628 |
print STDERR "Flushing $self->{cached} postings\n"; |
print STDERR "Flushing $self->{cached} postings\n" if $self->{cached}; |
629 |
while (my($key, $value) = each %{$self->{cache}}) { |
while (my($key, $value) = each %{$self->{cache}}) { |
630 |
$self->{db}->{$key} .= $value; |
if ($self->{reorg}) { |
631 |
#delete $self->{cache}->{$key}; |
$self->{db}->{$key} = $self->sort_postings($self->{db}->{$key} |
632 |
|
. $value); |
633 |
|
} else { |
634 |
|
$self->{db}->{$key} .= $value; |
635 |
|
} |
636 |
} |
} |
637 |
while (my($key, $value) = each %{$self->{cdict}}) { |
while (my($key, $value) = each %{$self->{cdict}}) { |
638 |
$self->{db}->{$key} = 0 unless $self->{db}->{$key}; |
$self->{db}->{$key} = 0 unless $self->{db}->{$key}; |
639 |
$self->{db}->{$key} += $value; |
$self->{db}->{$key} += $value; |
|
#delete $self->{cdict}->{$key}; |
|
640 |
} |
} |
641 |
$self->{cache} = {}; |
$self->{cache} = {}; |
642 |
$self->{cdict} = {}; |
$self->{cdict} = {}; |
|
# print STDERR "*** $self->{cache} ", tied(%{$self->{cache}}), "==\n"; |
|
643 |
$self->{cached} = 0; |
$self->{cached} = 0; |
|
# $self->{dbh}->sync if $self->{dbh}; |
|
644 |
} |
} |
645 |
} |
} |
646 |
|
|