1 |
# -*- Mode: Perl -*- |
2 |
# $Basename: InvertedIndex.pm $ |
3 |
# $Revision: 1.30 $ |
4 |
# Author : Ulrich Pfeifer |
5 |
# Created On : Thu Aug 8 13:05:10 1996 |
6 |
# Last Modified By: Ulrich Pfeifer |
7 |
# Last Modified On: Sat Nov 11 16:32:38 2000 |
8 |
# Language : CPerl |
9 |
# |
10 |
# (C) Copyright 1996-2000, Ulrich Pfeifer |
11 |
# |
12 |
|
13 |
package WAIT::InvertedIndex; |
14 |
use strict; |
15 |
use DB_File; |
16 |
use Fcntl; |
17 |
use WAIT::Filter; |
18 |
use Carp; |
19 |
use vars qw(%FUNC); |
20 |
|
21 |
# The dictionary has three different key types: |
22 |
# 'o'.$word |
23 |
# |
24 |
# The document frequency is the number of documents a term occurs |
25 |
# in. The idea is that a term occurring in a significant part of the |
26 |
# documents is not too significant. |
27 |
# |
28 |
# 'm'.$word |
29 |
# |
30 |
# The maximum term frequency of a document is the frequency of the |
31 |
# most frequent term in the document. It is related to the document |
32 |
# length obviously. A document in which the most frequent term occurs |
33 |
# 100 times is probably much longer than a document whose most |
34 |
# frequent term occurs five times. |
35 |
# |
36 |
# 'p'.$word |
37 |
# |
38 |
# Under this key we store the actual posting list as pairs of |
39 |
# packed integers. |
40 |
|
41 |
my $no_old_index_support = 0; # do not check for old indices if set |
42 |
|
43 |
# Constructor.  Required parameters: file (path of the btree file) and
# attr (attribute list).  Optional: filter, name, intervall, prefix.
# Array-valued intervall/prefix specs are cloned so later changes to
# the caller's array do not leak into the object.
sub new {
  my ($type, %parm) = @_;

  my $self = {
    file    => $parm{file} || croak("No file specified"),
    attr    => $parm{attr} || croak("No attributes specified"),
    filter  => $parm{filter},
    name    => $parm{name},
    records => 0,
  };
  for my $key (qw(intervall prefix)) {
    next unless exists $parm{$key};
    # Clone array references; copy plain scalars as-is.
    $self->{$key} = ref $parm{$key} ? [ @{ $parm{$key} } ] : $parm{$key};
  }
  return bless $self, ref($type) || $type;
}
64 |
|
65 |
# Accessor: the human-readable name given at construction time.
sub name {
  my ($self) = @_;
  return $self->{name};
}
66 |
|
67 |
# Tokenize a [$text, $offset] pair into whitespace-separated words.
# Returns a list of [$word, $position] pairs, where each position is
# the word's offset within the original string ($offset included).
sub _split_pos {
  my ($text, $base) = @{ $_[0] };
  my @pairs;

  while ($text =~ /(\S+)/g) {
    # $-[1] is the start offset of this match within $text.
    push @pairs, [ $1, $base + $-[1] ];
  }
  return @pairs;
}
80 |
|
81 |
# Build, as a string of Perl code, a position-preserving filter
# pipeline for the filter names passed in.  The last name becomes the
# outermost stage; each stage maps [$token, $pos] pairs.  A "split<N>"
# stage tokenizes and, when N is given, drops tokens shorter than N.
# The innermost input is the seed pair [$_[0], 0].
sub _xfiltergen {
  my $stage = pop @_;

  # Oops, we cannot overrule the user's choice.  Other filters may kill
  # stopwords, such as isotr clobbers "isn't" to "isnt".

  if ($stage =~ /^split(\d*)/) {
    my $minlen = $1;
    my $inner  = @_ ? _xfiltergen(@_) : '[$_[0], 0]';
    my $split  = "map(&WAIT::Filter::split_pos(\$_), " . $inner . ')';
    return $minlen
      ? "grep(length(\$_->[0])>=$minlen, " . $split . ')'
      : $split;
  }
  my $tail = @_ ? _xfiltergen(@_) : ' [$_[0], 0]';
  return "map ([&WAIT::Filter::$stage(\$_->[0]), \$_->[1]]," . $tail . ')';
}
112 |
|
113 |
# Run the position-preserving filter pipeline over one attribute value
# and return the resulting [$token, $pos] pairs.  The generated closure
# is compiled on first use and cached in $self->{xfunc}.
sub parse_pos {
  my ($self, $value) = @_;

  $self->{xfunc} = eval 'sub {' . _xfiltergen(@{ $self->{filter} }) . '}'
    unless exists $self->{xfunc};
  return $self->{xfunc}->($value);
}
123 |
|
124 |
# Build, as a string of Perl code, a plain (position-free) filter
# pipeline over @_; the last filter name becomes the outermost stage.
sub _filtergen {
  my $stage = pop @_;
  my $inner = @_ ? _filtergen(@_) : '@_';

  return "map(&WAIT::Filter::$stage(\$_), " . $inner . ')';
}
133 |
|
134 |
# Remove the index file from disk.  Only WAIT::Table may call this;
# direct calls croak.  Returns true only on failure to remove (file
# exists but unlink failed); false means absent-or-removed.
sub drop {
  my $self = shift;

  croak ref($self) . "::drop called directly"
    unless (caller)[0] eq 'WAIT::Table';

  my $path = $self->{file};
  return !(!-e $path or unlink $path);
}
144 |
|
145 |
# Heuristically decide whether the underlying file was written by an
# old WAIT release; the verdict is cached in $self->{old_index}.
# NOTE(review): old indices apparently keep dictionary entries under a
# "\xff"."o" key prefix with plain-integer values -- inferred from the
# probe below; confirm against WAIT::InvertedIndexOld.
sub is_an_old_index {
  my $self = shift;

  return 0 if $no_old_index_support;
  return $self->{old_index} if exists $self->{old_index};

  # We can only guess if this is an old index. We lookup the first 10
  # $O entries. If all values are integers, we assume that the index
  # is an old one.

  defined $self->{db} or $self->open;
  $self->sync;
  my $dbh = $self->{dbh};               # for convenience

  # Old-style dictionary key prefix: 0xff byte followed by 'o'.
  my $O = pack('C', 0xff)."o";
  my ($word, $value) = ($O.$;);
  # Position the btree cursor on the first key >= the probe key;
  # seq() overwrites $word/$value with what it finds.
  $dbh->seq($word, $value, R_CURSOR);
  for (my $i=0; $i<10;$i++) {
    if ($value !~ /^\d+$/) {
      # Non-integer value: cannot be an old-style dictionary entry.
      return $self->{old_index} = 0;
    }
    if ($dbh->seq($word, $value, R_NEXT) or # no values left
        $word !~ /^$O/o                     # no $O values left
       ) {
      # we are not sure enough that this is an old index
      return $self->{old_index} = 0;
    }
  }
  return $self->{old_index} = 1;
}
175 |
|
176 |
# Open (tie) the underlying DB_File btree and compile the filter
# closure.  Idempotent: a second call just returns the cached handle.
# If the file looks like an index written by an old WAIT release, the
# object is re-blessed into WAIT::InvertedIndexOld for compatibility.
# Returns the DB_File handle.
sub open {
  my $self = shift;
  my $file = $self->{file};

  unless (defined $self->{dbh}) {
    # Compile the filter pipeline; empty tokens are dropped.
    $self->{func} =
      eval sprintf("sub {grep /./, %s}", _filtergen(@{$self->{filter}}));
    $self->{dbh} = tie(%{$self->{db}}, 'DB_File', $file,
                       $self->{mode}, 0664, $DB_BTREE);
    if ($self->{mode} & O_RDWR) {
      # Write-behind caches for postings ('p') and document-frequency
      # increments ('o'); flushed into the btree by sync().
      $self->{cache} = {};
      $self->{cdict} = {};
    }
    $self->{cached} = 0;
    if (!$no_old_index_support and $self->is_an_old_index()) {
      warn "This is an old index, upgrade your database";
      require WAIT::InvertedIndexOld;
      bless $self, 'WAIT::InvertedIndexOld';
    }
  }
  $self->{dbh};
}
199 |
|
200 |
# Add document $key to the index.  The remaining arguments are the
# attribute values; they are run through the filter pipeline and each
# resulting term's occurrence count is accumulated into the
# write-behind cache.  The document's maximum term frequency is stored
# under 'm'.$key for later length normalization.
sub insert {
  my $self = shift;
  my $key  = shift;
  my %occ;

  defined $self->{db} or $self->open;

  # Count term occurrences for this document ('for' instead of the
  # original grep-in-void-context anti-pattern).
  $occ{$_}++ for &{$self->{func}}(@_);

  $self->{records}++;
  while (my ($word, $noc) = each %occ) {
    if (defined $self->{cache}->{$word}) {
      $self->{cdict}->{$word}++;
      $self->{cache}->{$word} .= pack 'w2', $key, $noc;
    } else {
      $self->{cdict}->{$word} = 1;
      $self->{cache}->{$word} = pack 'w2', $key, $noc;
    }
    $self->{cached}++;
  }

  # This cache limit should be configurable.
  $self->sync if $self->{cached} > 100_000;

  # Maximum term frequency (~ document length) for this document.
  my $maxtf = 0;
  for (values %occ) {
    $maxtf = $_ if $_ > $maxtf;
  }
  $self->{db}->{'m'. $key} = $maxtf;
}
227 |
|
228 |
# We sort postings by increasing max term frequency (~ by increasing |
229 |
# document length. This reduces the quality degradation if we process |
230 |
# only the first part of a posting list. |
231 |
|
232 |
# Pack a posting list (did => tf pairs, given as a hash ref or an
# already-packed 'w*' string) into a string of BER-compressed integer
# pairs.  When the index is flagged as sorted ($self->{reorg}), pairs
# are emitted by decreasing tf/maxtf ratio so that truncating the
# packed list keeps the best-scoring documents first.
sub sort_postings {
  my $self = shift;
  my $post = shift;             # reference to a hash or packed string

  if (ref $post) {
    # we skip the sort part, if the index is not sorted
    return pack('w*', %$post) unless $self->{reorg};
  } else {
    # Came in packed; unpack so the pairs can be sorted.
    $post = { unpack 'w*', $post };
  }

  my $r = '';

  # Sort posting list by increasing ratio of maximum term frequency (~
  # "document length") and term frequency. This ratio multiplied by the
  # inverse document frequency gives the score for a term. This sort
  # order can be exploited for tuning of single term queries.

  for my $did (sort { $post->{$b} / $self->{db}->{'m'. $b}
                      <=>
                      $post->{$a} / $self->{db}->{'m'. $a}
                    } keys %$post) {
    $r .= pack 'w2', $did, $post->{$did};
  }
  #warn sprintf "reorg %d %s\n", scalar keys %$post, join ' ', unpack 'w*', $r;
  $r;
}
259 |
|
260 |
# Remove document $key from the index.  The remaining arguments must
# be the same attribute values that were passed to insert(); they are
# re-filtered to find the terms whose posting lists need editing.  A
# mismatch between stored and recomputed counts triggers _complain().
sub delete {
  my $self = shift;
  my $key  = shift;
  my %occ;

  my $db;
  defined $self->{db} or $self->open;
  $db = $self->{db};
  $self->sync;                  # flush pending inserts before editing
  $self->{records}--;

  # less than zero documents in database?
  _complain('delete of document', $key) and $self->{records} = 0
    if $self->{records} < 0;

  # Recompute this document's term set ('for' instead of the original
  # grep-in-void-context anti-pattern).
  $occ{$_}++ for &{$self->{func}}(@_);

  for (keys %occ) {             # may reorder posting list
    my %post = unpack 'w*', $db->{'p'.$_};
    delete $post{$key};
    $db->{'p'.$_} = $self->sort_postings(\%post);
    _complain('delete of term', $_) if $db->{'o'.$_}-1 != keys %post;
    $db->{'o'.$_} = scalar keys %post;
  }
  delete $db->{'m'. $key};
}
286 |
|
287 |
# Return a reference to the list of dictionary words in the inclusive
# range [$first, $last]; either bound may be undef/empty for "open".
# Returns nothing when the index was built without the 'intervall'
# option; when that option is a filter list, both bounds are passed
# through those filters first.  Dictionary words are stored under
# 'p'-prefixed keys, so 'q' serves as the upper sentinel.
sub intervall {
  my ($self, $first, $last) = @_;
  my $value = '';
  my $word = '';
  my @result;

  return unless exists $self->{'intervall'};

  defined $self->{db} or $self->open;
  $self->sync;
  my $dbh = $self->{dbh};               # for convenience

  if (ref $self->{'intervall'}) {
    unless (exists $self->{'ifunc'}) {
      # Compile and cache the boundary filter pipeline.
      $self->{'ifunc'} =
        eval sprintf("sub {grep /./, %s}", _filtergen(@{$self->{intervall}}));
    }
    ($first) = &{$self->{'ifunc'}}($first) if $first;
    ($last) = &{$self->{'ifunc'}}($last) if $last;
  }
  $first = 'p'.($first||'');
  $last = (defined $last)?'p'.$last:'q';

  # set the cursor to $first; seq() overwrites $first with the key found.
  # NOTE(review): the seq() return value is unchecked -- if no key
  # >= $first exists, the unmodified $first is pushed below; presumably
  # the gt-check catches most such cases, but verify against callers.
  $dbh->seq($first, $value, R_CURSOR);

  # $first would be after the last word
  return () if $first gt $last;

  push @result, substr($first,1);
  while (!$dbh->seq($word, $value, R_NEXT)) {
    # We should limit this to a "reasonable" number of words
    last if $word gt $last;
    push @result, substr($word,1);
  }
  \@result;                             # speed
}
324 |
|
325 |
# Return a reference to the list of dictionary words starting with
# $prefix.  Returns nothing when the index was built without the
# 'prefix' option; when that option is a filter list, the prefix is
# filtered first.  Dictionary words are stored under 'p'-prefixed keys.
sub prefix {
  my ($self, $prefix) = @_;
  my $value = '';
  my $word = '';
  my @result;

  return () unless defined $prefix;     # Full dictionary requested !!
  return unless exists $self->{'prefix'};
  defined $self->{db} or $self->open;
  $self->sync;
  my $dbh = $self->{dbh};

  if (ref $self->{'prefix'}) {
    unless (exists $self->{'pfunc'}) {
      # Compile and cache the prefix filter pipeline.
      $self->{'pfunc'} =
        eval sprintf("sub {grep /./, %s}", _filtergen(@{$self->{prefix}}));
    }
    ($prefix) = &{$self->{'pfunc'}}($prefix);
  }

  if ($dbh->seq($word = 'p'.$prefix, $value, R_CURSOR)) {
    return ();
  }
  # \Q...\E: treat the user-supplied prefix as a literal string, not a
  # regex (unescaped metacharacters previously mis-matched or died).
  return () if $word !~ /^p\Q$prefix\E/;
  push @result, substr($word,1);

  while (!$dbh->seq($word, $value, R_NEXT)) {
    # We should limit this to a "reasonable" number of words
    last if $word !~ /^p\Q$prefix\E/;
    push @result, substr($word,1);
  }
  \@result;                             # speed
}
358 |
|
359 |
=head2 search($query) |
360 |
|
361 |
The search method supports a range of search algorithms. It is |
362 |
recommended to tune the index by calling |
363 |
C<$table-E<gt>set(top=E<gt>1)> B<after> bulk inserting the documents |
364 |
into the table. This is a computationally intensive operation and all inserts |
365 |
and deletes after this optimization are slightly more expensive. Once |
366 |
reorganized, the index is kept sorted automatically until you switch |
367 |
the optimization off by calling C<$table-E<gt>set(top=E<gt>0)>. |
368 |
|
369 |
When searching a tuned index, a query can be processed faster if the |
370 |
caller requests only the topmost documents. This can be done by |
371 |
passing a C<top =E<gt>> I<n> parameter to the search method. |
372 |
|
373 |
For single term queries, the method returns only the I<n> top ranking |
374 |
documents. For multi term queries two optimized algorithms are |
375 |
available. The first algorithm computes the top n documents |
376 |
approximately but very fast, sacrificing a little bit of precision for |
377 |
speed. The second algorithm computes the topmost I<n> documents |
378 |
precisely. This algorithm is slower and should be used only for small |
379 |
values of I<n>. It can be requested by passing the query attribute |
380 |
C<picky =E<gt> 1>. Both algorithms may return more than I<n> hits. |
381 |
While the picky version might not be faster than the brute force |
382 |
version on average for modest size databases it uses less memory and |
383 |
the processing time is almost linear in the number of query terms, not |
384 |
in the size of the lists. |
385 |
|
386 |
=cut |
387 |
|
388 |
# Filter the raw query terms and delegate the ranking to search_raw().
# $query is the options hash ref (top, picky).
sub search {
  my ($self, $query, @raw) = @_;

  $self->open unless defined $self->{db};
  $self->sync;
  # Filter inline -- no call to parse() here.
  return $self->search_raw($query, $self->{func}->(@raw));
}
396 |
|
397 |
# Run the plain filter pipeline over the given values and return the
# resulting terms.
sub parse {
  my ($self, @values) = @_;

  $self->open unless defined $self->{db};
  return $self->{func}->(@values);
}
403 |
|
404 |
# Return every raw key of the tied btree ('o', 'm' and 'p' entries
# alike).  Shadows the builtin at this name; the hash is still
# enumerated with the builtin keys() below.
sub keys {
  my ($self) = @_;

  $self->open unless defined $self->{db};
  return keys %{ $self->{db} };
}
410 |
|
411 |
# Expand each argument to the dictionary words it prefixes and hand
# the expansion to search_raw().
sub search_prefix {
  my ($self, @prefixes) = @_;

  # print "search_prefix(@prefixes)\n";
  $self->open unless defined $self->{db};
  return $self->search_raw(map { $self->prefix($_) } @prefixes);
}
418 |
|
419 |
# Warn (with a stack trace via Carp::cluck) about an index
# inconsistency noticed while performing $action on $term.  Call sites
# chain with 'and', relying on the true value warn() returns.
sub _complain ($$) {
  my ($action, $term) = @_;

  require Carp;
  my $message = sprintf("WAIT database inconsistency during $action [%s]: ".
                        "Please rebuild index\n", $term);
  Carp::cluck($message);
}
428 |
|
429 |
# Rank documents against the (already filtered) query terms in @_.
# $query is a hash ref of options:
#   top   - number of top documents that must be ranked correctly
#           (0/undef = score all matching documents)
#   picky - together with top, use the exact top-n algorithm instead
#           of the fast approximate one
# Returns a flat (did => score) list; each score is the sum over query
# terms of tf/maxtf * idf.  Three algorithms share this body: a
# single-term shortcut, an exhaustive scorer, an approximate top-n
# scorer, and a precise top-n scorer with upper-bound pruning.
sub search_raw {
  my $self = shift;
  my $query = shift;
  my %score;

  # Top $wanted documents must be correct. Zero means all matching
  # documents.
  my $wanted = $query->{top};
  my $strict = $query->{picky};

  # Return at least $minacc documents. Zero means all matching
  # documents.
  # my $minacc = $query->{accus} || $wanted;

  # Open index and flush cache if necessary
  defined $self->{db} or $self->open;
  $self->sync;

  # We keep duplicates
  my @terms =
    # Sort words by increasing document frequency (rarest term first);
    # the precise algorithm below relies on processing short lists first.
    sort { $self->{db}->{'o'.$a} <=> $self->{db}->{'o'.$b} }
      # check which words occur in the index.
      grep { $self->{db}->{'o'.$_} } @_;

  return () unless @terms;              # nothing to search for

  # We special-case one term queries here. If the index was sorted,
  # chopping off the rest of the list will return the same ranking.
  if ($wanted and @terms == 1) {
    my $term = shift @terms;
    my $idf = log($self->{records}/$self->{db}->{'o'.$term});
    my @res;

    if ($self->{reorg}) {               # or not $query->{picky}
      # Sorted index: only the first $wanted (did, tf) pairs are needed.
      @res = unpack "w". int(2*$wanted), $self->{db}->{'p'.$term};
    } else {
      @res = unpack 'w*', $self->{db}->{'p'.$term};
    }

    # Turn each raw tf into the final score tf/maxtf * idf in place.
    for (my $i=1; $i<@res; $i+=2) {
      $res[$i] /= $self->{db}->{'m'. $res[$i-1]} / $idf;
    }

    return @res
  }

  # We separate exhaustive search here to avoid overhead and make the
  # code more readable. The block can be removed without changing the
  # result.
  unless ($wanted) {
    for (@terms) {
      my $df = $self->{db}->{'o'.$_};

      # The frequency *must* be 1 at least since the posting list is nonempty
      _complain('search for term', $_) and $df = 1 if $df < 1;

      # Unpack posting list for current query term $_
      my %post = unpack 'w*', $self->{db}->{'p'.$_};

      _complain('search for term', $_) if $self->{db}->{'o'.$_} != keys %post;
      # This is the inverse document frequency. The log of the inverse
      # fraction of documents the term occurs in.
      my $idf = log($self->{records}/$df);
      for my $did (keys %post) {
        if (my $freq = $self->{db}->{'m'. $did}) {
          $score{$did} += $post{$did} / $freq * $idf;
        }
      }
    }
    # warn sprintf "Used %d accumulators\n", scalar keys %score;
    return %score;
  }

  # A sloppy but fast algorithm for multiple term queries.
  unless ($strict) {
    for (@terms) {
      # Unpack posting list for current query term $_
      my %post = unpack 'w*', $self->{db}->{'p'.$_};

      # Lookup the number of documents the term occurs in (document frequency)
      my $occ = $self->{db}->{'o'.$_};

      _complain('search for term', $_) if $self->{db}->{'o'.$_} != keys %post;
      # The frequency *must* be 1 at least since the posting list is nonempty
      _complain('search for term', $_) and $occ = 1 if $occ < 1;

      # This is the inverse document frequency. The log of the inverse
      # fraction of documents the term occurs in.
      my $idf = log($self->{records}/$occ);

      # If we have a reasonable number of accumulators, change the
      # loop to iterate over the accumulators. This will compromise
      # quality for better speed. The algorithm still computes the
      # exact weights, but the result is not guaranteed to contain the
      # *best* results. The database might contain documents better
      # than the worst returned document.

      # We process the lists in order of increasing length. When the
      # number of accumulators exceeds $wanted, no new documents are
      # added, only the ranking/weighting of the seen documents is
      # improved. The resulting ranking list must be pruned, since only
      # the top most documents end up near their "optimal" rank.

      if (keys %score < $wanted) {
        for my $did (keys %post) {
          if (my $freq = $self->{db}->{'m'. $did}) {
            $score{$did} += $post{$did} / $freq * $idf;
          }
        }
      } else {
        for my $did (keys %score) {
          next unless exists $post{$did};
          if (my $freq = $self->{db}->{'m'. $did}) {
            $score{$did} += $post{$did} / $freq * $idf;
          }
        }
      }
    }
    return %score;
  }

  # Precise top-n search.  @max[$i] is an upper bound on the score any
  # document can still gain from terms $i..$#terms.
  my @max; $max[$#terms+1]=0;
  my @idf;

  # Preparation loop. This extra loop makes sense only when "reorg"
  # and "wanted" are true. But at the time being, keeping the code
  # for the different search algorithms in one place seems more
  # desirable than some minor speedup of the brute force version. We
  # do cache $idf though.

  for (my $i = $#terms; $i >=0; $i--) {
    local $_ = $terms[$i];
    # Lookup the number of documents the term occurs in (document frequency)
    my $df = $self->{db}->{'o'.$_};

    # The frequency *must* be 1 at least since the posting list is nonempty
    _complain('search for term', $_) and $df = 1 if $df < 1;

    # This is the inverse document frequency. The log of the inverse
    # fraction of documents the term occurs in.
    $idf[$i] = log($self->{records}/$df);

    # The first pair of a sorted posting list has the highest
    # tf/maxtf ratio, i.e. the term's maximum possible contribution.
    my ($did,$occ);
    if ($self->{reorg}) {
      ($did,$occ) = unpack 'w2', $self->{db}->{'p'.$_};
    } else {                    # Maybe this costs more than it helps
      ($did,$occ) = unpack 'w2', $self->sort_postings($self->{db}->{'p'.$_});
    }
    my $freq = $self->{db}->{'m'. $did};
    my $max = $occ/$freq*$idf[$i];
    $max[$i] = $max + $max[$i+1];
  }

  # Main loop
  for my $i (0 .. $#terms) {
    my $term = $terms[$i];
    # Unpack posting list for current query term $term. We lose the
    # sorting order because of the assignment to a hash.
    my %post = unpack 'w*', $self->{db}->{'p'.$term};

    _complain('search for term', $term)
      if $self->{db}->{'o'.$term} != keys %post;

    my $idf = $idf[$i];
    my $full;                   # Need to process all postings
    my $chop;                   # Score necessary to enter the ranking list

    if (# We know that wanted is true since we special-cased the
        # exhaustive search.

        $wanted and

        # We did sort here if necessary in
        # the preparation loop
        # $self->{reorg} and

        scalar keys %score > $wanted) {
      $chop = (sort { $b <=> $a } values %score)[$wanted];
      $full = $max[$i] > $chop;
    } else {
      $full = 1;
    }

    if ($full) {
      # We need to inspect the full list. Either $wanted is not given,
      # the index is not sorted, or we don't have enough accumulators
      # yet.
      if (defined $chop) {
        # We might be able to avoid allocating accumulators
        for my $did (keys %post) {
          if (my $freq = $self->{db}->{'m'. $did}) {
            my $wgt = $post{$did} / $freq * $idf;
            # We add an accumulator if $wgt exceeds $chop
            if (exists $score{$did} or $wgt > $chop) {
              $score{$did} += $wgt;
            }
          }
        }
      } else {
        # Allocate accumulators for each seen document.
        for my $did (keys %post) {
          if (my $freq = $self->{db}->{'m'. $did}) {
            $score{$did} += $post{$did} / $freq * $idf;
          }
        }
      }
    } else {
      # Update existing accumulators
      for my $did (keys %score) {
        next unless exists $post{$did};
        if (my $freq = $self->{db}->{'m'. $did}) {
          $score{$did} += $post{$did} / $freq * $idf;
        }
      }
    }
  }
  #warn sprintf "Used %d accumulators\n", scalar keys %score;
  %score;
}
648 |
|
649 |
# Tune the index.  Only the 'top' attribute is supported: a true value
# sorts every posting list (see sort_postings) and flags the index as
# sorted; 0 drops the flag.  Sorting requires a writable, open index;
# the call is a no-op when already sorted or read-only.
sub set {
  my ($self, $attr, $value) = @_;

  die "No such index attribute: '$attr'" unless $attr eq 'top';

  return delete $self->{reorg} if $value == 0;

  return if $self->{reorg};     # we are sorted already
  return unless $self->{mode} & O_RDWR;
  defined $self->{db} or $self->open;

  $self->sync;
  # Collect the posting keys up front: the original iterated the tied
  # btree with each() while writing back to it, which can invalidate
  # the DB_File iterator.
  my @posting_keys = grep { /^p/ } keys %{ $self->{db} };
  for my $key (@posting_keys) {
    $self->{db}->{$key} = $self->sort_postings($self->{db}->{$key});
  }
  $self->{reorg} = 1;
}
667 |
|
668 |
# Flush the write-behind caches into the tied btree; a no-op for
# read-only handles.  Posting fragments are appended (or merged and
# re-sorted when the index is flagged as sorted), and the buffered
# document-frequency increments are added to the 'o' entries.
sub sync {
  my ($self) = @_;

  return unless $self->{mode} & O_RDWR;

  print STDERR "Flushing $self->{cached} postings\n" if $self->{cached};

  for my $term (keys %{ $self->{cache} }) {
    my $fragment = $self->{cache}{$term};
    if ($self->{reorg}) {
      # Keep the posting list sorted: merge old and new, then re-sort.
      $self->{db}{'p'.$term} =
        $self->sort_postings($self->{db}{'p'.$term} . $fragment);
    } else {
      $self->{db}{'p'.$term} .= $fragment;
    }
  }
  for my $term (keys %{ $self->{cdict} }) {
    $self->{db}{'o'.$term} = 0 unless $self->{db}{'o'.$term};
    $self->{db}{'o'.$term} += $self->{cdict}{$term};
  }
  $self->{cache}  = {};
  $self->{cdict}  = {};
  $self->{cached} = 0;
}
690 |
|
691 |
# Flush and untie the index, dropping every handle- and cache-related
# slot so a later open() starts from a clean state.  Does nothing when
# the index is not open.
sub close {
  my ($self) = @_;

  return unless $self->{dbh};

  $self->sync;
  delete $self->{dbh};
  untie %{ $self->{db} };
  for my $slot (qw(db func cache cached cdict)) {
    delete $self->{$slot};
  }
  for my $slot (qw(pfunc ifunc xfunc)) {
    delete $self->{$slot} if defined $self->{$slot};
  }
}
708 |
|
709 |
1; |
710 |
|