/[wait]/cvs-head/lib/WAIT/InvertedIndex.pm
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Annotation of /cvs-head/lib/WAIT/InvertedIndex.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 40 - (hide annotations)
Mon Nov 13 10:44:03 2000 UTC (23 years, 6 months ago) by laperla
File size: 22237 byte(s)
seq returns 0 on success. So if there is an error on positioning the
cursor on $O$;, then we can say it is not an old index. The condition
is
    if $dbh->seq, then return
but it was
    $dbh->seq or return.

Besides there are now some outcommented debugging statements that I
needed from time to time. They should be removed some day later but
left for a while to ease the debugging process.

1 ulpfr 19 # -*- Mode: Perl -*-
2     # $Basename: InvertedIndex.pm $
3     # $Revision: 1.30 $
4 ulpfr 10 # Author : Ulrich Pfeifer
5     # Created On : Thu Aug 8 13:05:10 1996
6     # Last Modified By: Ulrich Pfeifer
7 ulpfr 36 # Last Modified On: Sun Nov 12 14:40:21 2000
8 ulpfr 10 # Language : CPerl
9 ulpfr 19 #
10     # (C) Copyright 1996-2000, Ulrich Pfeifer
11     #
12 ulpfr 10
13     package WAIT::InvertedIndex;
14     use strict;
15     use DB_File;
16     use Fcntl;
17     use WAIT::Filter;
18     use Carp;
19 laperla 30 use vars qw(%FUNC $VERSION);
20 ulpfr 10
21 laperla 30 $VERSION = "1.801"; # others test if we are loaded by checking $VERSION
22    
23 ulpfr 22 # The dictionary has three different key types:
24     # 'o'.$word
25     #
26     # The document frequency is the number of documents a term occurs
27     # in. The idea is that a term occuring in a significant part of the
28     # documents is not too significant.
29     #
30     # 'm'.$word
31     #
32     # The maximum term frequency of a document is the frequency of the
33     # most frequent term in the document. It is related to the document
34     # length obviously. A document in which the most frequnet term occurs
35     # 100 times is probably much longer than a document whichs most
36     # frequent term occurs five time.
37     #
38     # 'p'.$word
39     #
40     # Under this key we store the actual posting list as pairs of
41     # packed integers.
42 ulpfr 10
43 ulpfr 22 my $no_old_index_support = 0; # do not check for old indices if set
44 ulpfr 19
45 ulpfr 10 sub new {
46     my $type = shift;
47     my %parm = @_;
48     my $self = {};
49    
50     $self->{file} = $parm{file} or croak "No file specified";
51     $self->{attr} = $parm{attr} or croak "No attributes specified";
52     $self->{filter} = $parm{filter};
53     $self->{'name'} = $parm{'name'};
54     $self->{records} = 0;
55     for (qw(intervall prefix)) {
56     if (exists $parm{$_}) {
57     if (ref $parm{$_}) {
58     $self->{$_} = [@{$parm{$_}}] # clone
59     } else {
60     $self->{$_} = $parm{$_}
61     }
62     }
63     }
64     bless $self, ref($type) || $type;
65     }
66    
67     sub name {$_[0]->{'name'}}
68    
69     sub _split_pos {
70     my ($text, $pos) = @{$_[0]};
71     my @result;
72    
73     $text =~ s/(^\s+)// and $pos += length($1);
74     while ($text =~ s/(^\S+)//) {
75     my $word = $1;
76     push @result, [$word, $pos];
77     $pos += length($word);
78     $text =~ s/(^\s+)// and $pos += length($1);
79     }
80     @result;
81     }
82    
83     sub _xfiltergen {
84     my $filter = pop @_;
85    
86 ulpfr 13 # Oops, we cannot overrule the user's choice. Other filters may kill
87     # stopwords, such as isotr clobbers "isn't" to "isnt".
88    
89     # if ($filter eq 'stop') { # avoid the slow stopword elimination
90     # return _xfiltergen(@_); # it's cheaper to look them up afterwards
91     # }
92 ulpfr 10 if (@_) {
93     if ($filter =~ /^split(\d*)/) {
94     if ($1) {
95     "grep(length(\$_->[0])>=$1, map(&WAIT::Filter::split_pos(\$_), " . _xfiltergen(@_) .'))' ;
96     } else {
97     "map(&WAIT::Filter::split_pos(\$_), " . _xfiltergen(@_) .')' ;
98     }
99     } else {
100     "map ([&WAIT::Filter::$filter(\$_->[0]), \$_->[1]]," ._xfiltergen(@_) .')';
101     }
102     } else {
103     if ($filter =~ /^split(\d*)/) {
104     if ($1) {
105     "grep(length(\$_->[0])>=$1, map(&WAIT::Filter::split_pos(\$_), [\$_[0], 0]))" ;
106     } else {
107     "map(&WAIT::Filter::split_pos(\$_), [\$_[0], 0])" ;
108     }
109     } else {
110     "map ([&WAIT::Filter::$filter(\$_->[0]), \$_->[1]], [\$_[0], 0])";
111     }
112     }
113     }
114    
115     sub parse_pos {
116     my $self = shift;
117    
118     unless (exists $self->{xfunc}) {
119     $self->{xfunc} =
120     eval sprintf("sub {%s}", _xfiltergen(@{$self->{filter}}));
121     #printf "\nsub{%s}$@\n", _xfiltergen(@{$self->{filter}});
122     }
123     &{$self->{xfunc}}($_[0]);
124     }
125    
126     sub _filtergen {
127     my $filter = pop @_;
128    
129     if (@_) {
130     "map(&WAIT::Filter::$filter(\$_), " . _filtergen(@_) . ')';
131     } else {
132     "map(&WAIT::Filter::$filter(\$_), \@_)";
133     }
134     }
135    
136     sub drop {
137     my $self = shift;
138     if ((caller)[0] eq 'WAIT::Table') { # Table knows about this
139     my $file = $self->{file};
140    
141     ! (!-e $file or unlink $file);
142     } else { # notify our database
143     croak ref($self)."::drop called directly";
144     }
145     }
146    
147 ulpfr 22 sub is_an_old_index {
148     my $self = shift;
149    
150     return 0 if $no_old_index_support;
151     return $self->{old_index} if exists $self->{old_index};
152    
153     # We can only guess if this is an old index. We lookup the first 10
154     # $O entries. If all values are integers, we assume that the index
155     # is an old one.
156    
157     defined $self->{db} or $self->open;
158     $self->sync;
159     my $dbh = $self->{dbh}; # for convenience
160    
161     my $O = pack('C', 0xff)."o";
162 laperla 40 my ($word, $value) = ($O.$;); # $word and $value are modified by seq!
163     if ( my $ret = $dbh->seq($word, $value, R_CURSOR) ) {
164     # warn "DEBUG: ret[$ret], not an old index, either empty or no \$^O";
165     return $self->{old_index} = 0;
166     }
167 ulpfr 22 for (my $i=0; $i<10;$i++) {
168     if ($value !~ /^\d+$/) {
169 laperla 40 # warn "DEBUG: word[$word]value[$value], not an old index";
170 ulpfr 22 return $self->{old_index} = 0;
171     }
172 laperla 40 if (my $ret = $dbh->seq($word, $value, R_NEXT) or # no values left
173 ulpfr 36 $word !~ /^$O$;/o # no $O values left
174 ulpfr 22 ) {
175     # we are not sure enough that this is an old index
176 laperla 40 # warn "DEBUG: ret[$ret]word[$word]value[$value], not an old index";
177 ulpfr 22 return $self->{old_index} = 0;
178     }
179     }
180 laperla 40 # warn "DEBUG: old index";
181 ulpfr 22 return $self->{old_index} = 1;
182     }
183    
184 ulpfr 10 sub open {
185     my $self = shift;
186     my $file = $self->{file};
187    
188     if (defined $self->{dbh}) {
189     $self->{dbh};
190     } else {
191     $self->{func} =
192     eval sprintf("sub {grep /./, %s}", _filtergen(@{$self->{filter}}));
193     $self->{dbh} = tie(%{$self->{db}}, 'DB_File', $file,
194     $self->{mode}, 0664, $DB_BTREE);
195     $self->{cache} = {}
196     if $self->{mode} & O_RDWR;
197     $self->{cdict} = {}
198     if $self->{mode} & O_RDWR;
199     $self->{cached} = 0;
200 ulpfr 22 if (!$no_old_index_support and $self->is_an_old_index()) {
201     warn "This is an old index, upgrade you database";
202     require WAIT::InvertedIndexOld;
203     bless $self, 'WAIT::InvertedIndexOld';
204     }
205 ulpfr 10 }
206     }
207    
208     sub insert {
209     my $self = shift;
210     my $key = shift;
211     my %occ;
212 ulpfr 13
213 ulpfr 10 defined $self->{db} or $self->open;
214     grep $occ{$_}++, &{$self->{func}}(@_);
215     my ($word, $noc);
216     $self->{records}++;
217     while (($word, $noc) = each %occ) {
218     if (defined $self->{cache}->{$word}) {
219 ulpfr 22 $self->{cdict}->{$word}++;
220 ulpfr 10 $self->{cache}->{$word} .= pack 'w2', $key, $noc;
221 ulpfr 22 } else {
222     $self->{cdict}->{$word} = 1;
223 ulpfr 10 $self->{cache}->{$word} = pack 'w2', $key, $noc;
224 ulpfr 13 }
225 ulpfr 10 $self->{cached}++;
226     }
227 ulpfr 19 # This cache limit should be configurable
228 ulpfr 10 $self->sync if $self->{cached} > 100_000;
229     my $maxtf = 0;
230     for (values %occ) {
231     $maxtf = $_ if $_ > $maxtf;
232     }
233 ulpfr 22 $self->{db}->{'m'. $key} = $maxtf;
234 ulpfr 10 }
235    
236 ulpfr 19 # We sort postings by increasing max term frequency (~ by increasing
237     # document length. This reduces the quality degradation if we process
238     # only the first part of a posting list.
239    
240     sub sort_postings {
241     my $self = shift;
242     my $post = shift; # reference to a hash or packed string
243    
244     if (ref $post) {
245     # we skip the sort part, if the index is not sorted
246     return pack('w*', %$post) unless $self->{reorg};
247     } else {
248     $post = { unpack 'w*', $post };
249     }
250    
251     my $r = '';
252    
253     # Sort posting list by increasing ratio of maximum term frequency (~
254     # "document length") and term frequency. This rati multipied by the
255     # inverse document frequence gives the score for a term. This sort
256     # order can be exploited for tuning of single term queries.
257    
258 laperla 30 for my $did (keys %$post) { # sanity check
259     unless ($self->{db}->{"m". $did}) {
260     warn "Warning from WAIT: DIVZERO threat from did[$did] post[$post->{$did}]";
261     $self->{db}->{"m". $did} = 1; # fails if we have not opened for writing
262     }
263     }
264 ulpfr 22 for my $did (sort { $post->{$b} / $self->{db}->{'m'. $b}
265 ulpfr 19 <=>
266 ulpfr 22 $post->{$a} / $self->{db}->{'m'. $a}
267 ulpfr 19 } keys %$post) {
268     $r .= pack 'w2', $did, $post->{$did};
269     }
270     #warn sprintf "reorg %d %s\n", scalar keys %$post, join ' ', unpack 'w*', $r;
271     $r;
272     }
273    
274 ulpfr 10 sub delete {
275     my $self = shift;
276     my $key = shift;
277     my %occ;
278    
279 ulpfr 19 my $db;
280 ulpfr 10 defined $self->{db} or $self->open;
281 ulpfr 19 $db = $self->{db};
282 ulpfr 10 $self->sync;
283     $self->{records}--;
284 ulpfr 19
285     # less than zero documents in database?
286     _complain('delete of document', $key) and $self->{records} = 0
287     if $self->{records} < 0;
288    
289 ulpfr 10 grep $occ{$_}++, &{$self->{func}}(@_);
290 ulpfr 19
291 laperla 30 # Be prepared for "Odd number of elements in hash assignment"
292     local $SIG{__WARN__} = sub {
293     my $warning = shift;
294     chomp $warning;
295     warn "Catching warning[$warning] during delete of key[$key]";
296     };
297 ulpfr 19 for (keys %occ) {# may reorder posting list
298 ulpfr 22 my %post = unpack 'w*', $db->{'p'.$_};
299 ulpfr 10 delete $post{$key};
300 ulpfr 22 $db->{'p'.$_} = $self->sort_postings(\%post);
301     _complain('delete of term', $_) if $db->{'o'.$_}-1 != keys %post;
302     $db->{'o'.$_} = scalar keys %post;
303 ulpfr 10 }
304 ulpfr 22 delete $db->{'m'. $key};
305 ulpfr 10 }
306    
307     sub intervall {
308     my ($self, $first, $last) = @_;
309     my $value = '';
310     my $word = '';
311     my @result;
312    
313     return unless exists $self->{'intervall'};
314    
315     defined $self->{db} or $self->open;
316     $self->sync;
317     my $dbh = $self->{dbh}; # for convenience
318    
319     if (ref $self->{'intervall'}) {
320     unless (exists $self->{'ifunc'}) {
321     $self->{'ifunc'} =
322     eval sprintf("sub {grep /./, %s}", _filtergen(@{$self->{intervall}}));
323     }
324     ($first) = &{$self->{'ifunc'}}($first) if $first;
325     ($last) = &{$self->{'ifunc'}}($last) if $last;
326     }
327 ulpfr 22 $first = 'p'.($first||'');
328     $last = (defined $last)?'p'.$last:'q';
329    
330     # set the cursor to $first
331     $dbh->seq($first, $value, R_CURSOR);
332    
333     # $first would be after the last word
334     return () if $first gt $last;
335 ulpfr 10
336 ulpfr 22 push @result, substr($first,1);
337 ulpfr 10 while (!$dbh->seq($word, $value, R_NEXT)) {
338     # We should limit this to a "resonable" number of words
339 ulpfr 22 last if $word gt $last;
340     push @result, substr($word,1);
341 ulpfr 10 }
342     \@result; # speed
343     }
344    
345     sub prefix {
346     my ($self, $prefix) = @_;
347     my $value = '';
348     my $word = '';
349     my @result;
350    
351     return () unless defined $prefix; # Full dictionary requested !!
352     return unless exists $self->{'prefix'};
353     defined $self->{db} or $self->open;
354     $self->sync;
355     my $dbh = $self->{dbh};
356    
357     if (ref $self->{'prefix'}) {
358     unless (exists $self->{'pfunc'}) {
359     $self->{'pfunc'} =
360     eval sprintf("sub {grep /./, %s}", _filtergen(@{$self->{prefix}}));
361     }
362     ($prefix) = &{$self->{'pfunc'}}($prefix);
363     }
364    
365 ulpfr 22 if ($dbh->seq($word = 'p'.$prefix, $value, R_CURSOR)) {
366 ulpfr 10 return ();
367     }
368 ulpfr 22 return () if $word !~ /^p$prefix/;
369     push @result, substr($word,1);
370 ulpfr 10
371     while (!$dbh->seq($word, $value, R_NEXT)) {
372     # We should limit this to a "resonable" number of words
373 ulpfr 22 last if $word !~ /^p$prefix/;
374     push @result, substr($word,1);
375 ulpfr 10 }
376     \@result; # speed
377     }
378    
379 ulpfr 19 =head2 search($query)
380    
381     The search method supports a range of search algorithms. It is
382     recommended to tune the index by calling
383     C<$table-E<gt>set(top=E<gt>1)> B<after> bulk inserting the documents
384     into the table. This is a computing intense operation and all inserts
385     and deletes after this optimization are slightly more expensive. Once
386     reorganized, the index is kept sorted automatically until you switch
387     the optimization off by calling C<$table-E<gt>set(top=E<gt>0)>.
388    
389     When searching a tuned index, a query can be processed faster if the
390     caller requests only the topmost documents. This can be done by
391     passing a C<top =E<gt>> I<n> parameter to the search method.
392    
393     For single term queries, the method returns only the I<n> top ranking
394     documents. For multi term queries two optimized algorithms are
395     available. The first algorithm computes the top n documents
396     approximately but very fast, sacrificing a little bit of precision for
397     speed. The second algorithm computes the topmost I<n> documents
398     precisely. This algorithm is slower and should be used only for small
399     values of I<n>. It can be requested by passing the query attribute
400     C<picky =E<gt> 1>. Both algorithms may return more than I<n> hits.
401     While the picky version might not be faster than the brute force
402     version on average for modest size databases it uses less memory and
403     the processing time is almost linear in the number of query terms, not
404     in the size of the lists.
405    
406     =cut
407    
408 ulpfr 10 sub search {
409     my $self = shift;
410 ulpfr 19 my $query = shift;
411 ulpfr 10
412     defined $self->{db} or $self->open;
413     $self->sync;
414 ulpfr 19 $self->search_raw($query, &{$self->{func}}(@_)); # No call to parse() here
415 ulpfr 10 }
416    
417     sub parse {
418     my $self = shift;
419    
420     defined $self->{db} or $self->open;
421     &{$self->{func}}(@_);
422     }
423    
424 ulpfr 13 sub keys {
425     my $self = shift;
426    
427     defined $self->{db} or $self->open;
428     keys %{$self->{db}};
429     }
430    
431 ulpfr 10 sub search_prefix {
432     my $self = shift;
433    
434     # print "search_prefix(@_)\n";
435     defined $self->{db} or $self->open;
436     $self->search_raw(map($self->prefix($_), @_));
437     }
438    
439 ulpfr 19 sub _complain ($$) {
440     my ($action, $term) = @_;
441    
442     require Carp;
443     Carp::cluck
444     (sprintf("WAIT database inconsistency during $action [%s]: ".
445     "Please rebuild index\n",
446     $term,));
447     }
448    
449 ulpfr 10 sub search_raw {
450     my $self = shift;
451 ulpfr 19 my $query = shift;
452 ulpfr 10 my %score;
453    
454 ulpfr 19 # Top $wanted documents must be correct. Zero means all matching
455     # documents.
456     my $wanted = $query->{top};
457     my $strict = $query->{picky};
458 ulpfr 10
459 ulpfr 19 # Return at least $minacc documents. Zero means all matching
460     # documents.
461     # my $minacc = $query->{accus} || $wanted;
462    
463     # Open index and flush cache if necessary
464 ulpfr 10 defined $self->{db} or $self->open;
465     $self->sync;
466 ulpfr 19
467     # We keep duplicates
468     my @terms =
469     # Sort words by decreasing document frequency
470 ulpfr 22 sort { $self->{db}->{'o'.$a} <=> $self->{db}->{'o'.$b} }
471 ulpfr 19 # check which words occur in the index.
472 ulpfr 22 grep { $self->{db}->{'o'.$_} } @_;
473 ulpfr 19
474 laperla 40 return unless @terms;
475 ulpfr 19
476     # We special-case one term queries here. If the index was sorted,
477     # choping off the rest of the list will return the same ranking.
478     if ($wanted and @terms == 1) {
479     my $term = shift @terms;
480 ulpfr 22 my $idf = log($self->{records}/$self->{db}->{'o'.$term});
481 ulpfr 19 my @res;
482    
483     if ($self->{reorg}) { # or not $query->{picky}
484 ulpfr 22 @res = unpack "w". int(2*$wanted), $self->{db}->{'p'.$term};
485 ulpfr 19 } else {
486 ulpfr 22 @res = unpack 'w*', $self->{db}->{'p'.$term};
487 ulpfr 19 }
488    
489     for (my $i=1; $i<@res; $i+=2) {
490 laperla 30 # $res[$i] /= $self->{db}->{'m'. $res[$i-1]} / $idf;
491     # above was written badly, allows two DIV_ZERO problems.
492     my $maxtf = $self->{db}->{"m". $res[$i-1]};
493     unless ($maxtf) {
494     warn "WAIT-Warning: Averting DIVZERO for i[$i] \$res[\$i-1][$res[$i-1]] term[$term]";
495     $maxtf = 1;
496     }
497     $res[$i] = ($res[$i] / $maxtf) * $idf;
498 ulpfr 19 }
499    
500     return @res
501     }
502    
503     # We separate exhaustive search here to avoid overhead and make the
504     # code more readable. The block can be removed without changing the
505     # result.
506     unless ($wanted) {
507     for (@terms) {
508 ulpfr 22 my $df = $self->{db}->{'o'.$_};
509 ulpfr 19
510     # The frequency *must* be 1 at least since the posting list is nonempty
511     _complain('search for term', $_) and $df = 1 if $df < 1;
512    
513     # Unpack posting list for current query term $_
514 ulpfr 22 my %post = unpack 'w*', $self->{db}->{'p'.$_};
515 ulpfr 19
516 ulpfr 22 _complain('search for term', $_) if $self->{db}->{'o'.$_} != keys %post;
517 ulpfr 19 # This is the inverse document frequency. The log of the inverse
518     # fraction of documents the term occurs in.
519     my $idf = log($self->{records}/$df);
520     for my $did (keys %post) {
521 ulpfr 22 if (my $freq = $self->{db}->{'m'. $did}) {
522 ulpfr 19 $score{$did} += $post{$did} / $freq * $idf;
523     }
524 ulpfr 10 }
525     }
526 ulpfr 19 # warn sprintf "Used %d accumulators\n", scalar keys %score;
527     return %score;
528 ulpfr 10 }
529 ulpfr 19
530     # A sloppy but fast algorithm for multiple term queries.
531     unless ($strict) {
532     for (@terms) {
533     # Unpack posting list for current query term $_
534 ulpfr 22 my %post = unpack 'w*', $self->{db}->{'p'.$_};
535 ulpfr 19
536     # Lookup the number of documents the term occurs in (document frequency)
537 ulpfr 22 my $occ = $self->{db}->{'o'.$_};
538 ulpfr 19
539 ulpfr 22 _complain('search for term', $_) if $self->{db}->{'o'.$_} != keys %post;
540 ulpfr 19 # The frequency *must* be 1 at least since the posting list is nonempty
541     _complain('search for term', $_) and $occ = 1 if $occ < 1;
542    
543     # This is the inverse document frequency. The log of the inverse
544     # fraction of documents the term occurs in.
545     my $idf = log($self->{records}/$occ);
546    
547     # If we have a reasonable number of accumulators, change the
548     # loop to iterate over the accumulators. This will compromise
549     # quality for better speed. The algorithm still computes the
550     # exact weights, but the result is not guaranteed to contain the
551     # *best* results. The database might contain documents better
552     # than the worst returned document.
553    
554     # We process the lists in order of increasing length. When the
555     # number of accumulators exceeds $wanted, no new documents are
556     # added, only the ranking/weighting of the seen documents is
557     # improved. The resulting ranking list must be pruned, since only
558     # the top most documents end up near their "optimal" rank.
559    
560     if (keys %score < $wanted) {
561     for my $did (keys %post) {
562 ulpfr 22 if (my $freq = $self->{db}->{'m'. $did}) {
563 ulpfr 19 $score{$did} += $post{$did} / $freq * $idf;
564     }
565     }
566     } else {
567     for my $did (keys %score) {
568     next unless exists $post{$did};
569 ulpfr 22 if (my $freq = $self->{db}->{'m'. $did}) {
570 ulpfr 19 $score{$did} += $post{$did} / $freq * $idf;
571     }
572     }
573     }
574     }
575     return %score;
576     }
577     my @max; $max[$#terms+1]=0;
578     my @idf;
579    
580     # Preparation loop. This extra loop makes sense only when "reorg"
581     # and "wanted" are true. But at the time beeing, keeping the code
582     # for the different search algorithms in one place seems more
583     # desirable than some minor speedup of the brute force version. We
584     # do cache $idf though.
585    
586     for (my $i = $#terms; $i >=0; $i--) {
587     local $_ = $terms[$i];
588     # Lookup the number of documents the term occurs in (document frequency)
589 ulpfr 22 my $df = $self->{db}->{'o'.$_};
590 ulpfr 19
591     # The frequency *must* be 1 at least since the posting list is nonempty
592     _complain('search for term', $_) and $df = 1 if $df < 1;
593    
594     # This is the inverse document frequency. The log of the inverse
595     # fraction of documents the term occurs in.
596     $idf[$i] = log($self->{records}/$df);
597    
598     my ($did,$occ);
599     if ($self->{reorg}) {
600 ulpfr 22 ($did,$occ) = unpack 'w2', $self->{db}->{'p'.$_};
601 ulpfr 19 } else { # Maybe this costs more than it helps
602 ulpfr 22 ($did,$occ) = unpack 'w2', $self->sort_postings($self->{db}->{'p'.$_});
603 ulpfr 19 }
604 ulpfr 22 my $freq = $self->{db}->{'m'. $did};
605 ulpfr 19 my $max = $occ/$freq*$idf[$i];
606     $max[$i] = $max + $max[$i+1];
607     }
608    
609     # Main loop
610     for my $i (0 .. $#terms) {
611     my $term = $terms[$i];
612     # Unpack posting list for current query term $term. We loose the
613     # sorting order because the assignment to a hash.
614 ulpfr 22 my %post = unpack 'w*', $self->{db}->{'p'.$term};
615 ulpfr 19
616     _complain('search for term', $term)
617 ulpfr 22 if $self->{db}->{'o'.$term} != keys %post;
618 ulpfr 19
619     my $idf = $idf[$i];
620     my $full; # Need to process all postings
621     my $chop; # Score necessary to enter the ranking list
622    
623     if (# We know that wanted is true since we especial cased the
624     # exhaustive search.
625    
626     $wanted and
627    
628     # We did sort here if necessary in
629     # the preparation loop
630     # $self->{reorg} and
631    
632     scalar keys %score > $wanted) {
633     $chop = (sort { $b <=> $a } values %score)[$wanted];
634     $full = $max[$i] > $chop;
635     } else {
636     $full = 1;
637     }
638    
639     if ($full) {
640     # We need to inspect the full list. Either $wanted is not given,
641     # the index is not sorted, or we don't have enough accumulators
642     # yet.
643     if (defined $chop) {
644     # We might be able to avoid allocating accumulators
645     for my $did (keys %post) {
646 ulpfr 22 if (my $freq = $self->{db}->{'m'. $did}) {
647 ulpfr 19 my $wgt = $post{$did} / $freq * $idf;
648     # We add an accumulator if $wgt exeeds $chop
649     if (exists $score{$did} or $wgt > $chop) {
650     $score{$did} += $wgt;
651     }
652     }
653     }
654     } else {
655     # Allocate acumulators for each seen document.
656     for my $did (keys %post) {
657 ulpfr 22 if (my $freq = $self->{db}->{'m'. $did}) {
658 ulpfr 19 $score{$did} += $post{$did} / $freq * $idf;
659     }
660     }
661     }
662     } else {
663     # Update existing accumulators
664     for my $did (keys %score) {
665     next unless exists $post{$did};
666 ulpfr 22 if (my $freq = $self->{db}->{'m'. $did}) {
667 ulpfr 19 $score{$did} += $post{$did} / $freq * $idf;
668     }
669     }
670     }
671     }
672     #warn sprintf "Used %d accumulators\n", scalar keys %score;
673 ulpfr 10 %score;
674     }
675    
676 ulpfr 19 sub set {
677     my ($self, $attr, $value) = @_;
678    
679     die "No such indexy attribute: '$attr'" unless $attr eq 'top';
680    
681     return delete $self->{reorg} if $value == 0;
682    
683     return if $self->{reorg}; # we are sorted already
684     return unless $self->{mode} & O_RDWR;
685     defined $self->{db} or $self->open;
686    
687     $self->sync;
688     while (my($key, $value) = each %{$self->{db}}) {
689 ulpfr 22 next if $key !~ /^p/;
690 ulpfr 19 $self->{db}->{$key} = $self->sort_postings($value);
691     }
692     $self->{reorg} = 1;
693     }
694    
695 ulpfr 10 sub sync {
696     my $self = shift;
697    
698     if ($self->{mode} & O_RDWR) {
699 ulpfr 19 print STDERR "Flushing $self->{cached} postings\n" if $self->{cached};
700 ulpfr 10 while (my($key, $value) = each %{$self->{cache}}) {
701 laperla 30 $self->{db}->{"p". $key} ||= "";
702 ulpfr 19 if ($self->{reorg}) {
703 ulpfr 22 $self->{db}->{'p'.$key} = $self->sort_postings($self->{db}->{'p'.$key}
704 ulpfr 19 . $value);
705     } else {
706 ulpfr 22 $self->{db}->{'p'.$key} .= $value;
707 ulpfr 19 }
708 ulpfr 10 }
709     while (my($key, $value) = each %{$self->{cdict}}) {
710 ulpfr 22 $self->{db}->{'o'.$key} = 0 unless $self->{db}->{'o'.$key};
711     $self->{db}->{'o'.$key} += $value;
712 ulpfr 10 }
713 ulpfr 19 $self->{cache} = {};
714     $self->{cdict} = {};
715 ulpfr 10 $self->{cached} = 0;
716     }
717     }
718    
719     sub close {
720     my $self = shift;
721    
722     if ($self->{dbh}) {
723     $self->sync;
724     delete $self->{dbh};
725     untie %{$self->{db}};
726     delete $self->{db};
727     delete $self->{func};
728     delete $self->{cache};
729     delete $self->{cached};
730     delete $self->{cdict};
731     delete $self->{pfunc} if defined $self->{pfunc};
732     delete $self->{ifunc} if defined $self->{ifunc};
733     delete $self->{xfunc} if defined $self->{xfunc};
734     }
735     }
736    
737     1;
738    

Properties

Name Value
cvs2svn:cvs-rev 1.6

  ViewVC Help
Powered by ViewVC 1.1.26