/[wait]/cvs-head/lib/WAIT/InvertedIndex.pm
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Annotation of /cvs-head/lib/WAIT/InvertedIndex.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 71 - (hide annotations)
Sun Jan 27 15:27:38 2002 UTC (22 years, 3 months ago) by laperla
File size: 22237 byte(s)
- trigrams working

1 ulpfr 19 # -*- Mode: Perl -*-
2     # $Basename: InvertedIndex.pm $
3     # $Revision: 1.30 $
4 ulpfr 10 # Author : Ulrich Pfeifer
5     # Created On : Thu Aug 8 13:05:10 1996
6     # Last Modified By: Ulrich Pfeifer
7 ulpfr 51 # Last Modified On: Mon Dec 31 14:30:05 2001
8 ulpfr 10 # Language : CPerl
9 ulpfr 19 #
10     # (C) Copyright 1996-2000, Ulrich Pfeifer
11     #
12 ulpfr 10
13     package WAIT::InvertedIndex;
14     use strict;
15     use DB_File;
16     use Fcntl;
17     use WAIT::Filter;
18     use Carp;
19 laperla 30 use vars qw(%FUNC $VERSION);
20 ulpfr 10
21 laperla 30 $VERSION = "1.801"; # others test if we are loaded by checking $VERSION
22    
23 ulpfr 22 # The dictionary has three different key types:
24     # 'o'.$word
25     #
26     # The document frequency is the number of documents a term occurs
27     # in. The idea is that a term occuring in a significant part of the
28     # documents is not too significant.
29     #
30     # 'm'.$word
31     #
32     # The maximum term frequency of a document is the frequency of the
33     # most frequent term in the document. It is related to the document
34 laperla 71 # length obviously. A document in which the most frequent term occurs
35 ulpfr 22 # 100 times is probably much longer than a document whichs most
36     # frequent term occurs five time.
37     #
38     # 'p'.$word
39     #
40     # Under this key we store the actual posting list as pairs of
41     # packed integers.
42 ulpfr 10
43 ulpfr 22 my $no_old_index_support = 0; # do not check for old indices if set
44 ulpfr 19
45 ulpfr 10 sub new {
46     my $type = shift;
47     my %parm = @_;
48     my $self = {};
49    
50     $self->{file} = $parm{file} or croak "No file specified";
51     $self->{attr} = $parm{attr} or croak "No attributes specified";
52     $self->{filter} = $parm{filter};
53     $self->{'name'} = $parm{'name'};
54     $self->{records} = 0;
55     for (qw(intervall prefix)) {
56     if (exists $parm{$_}) {
57     if (ref $parm{$_}) {
58     $self->{$_} = [@{$parm{$_}}] # clone
59     } else {
60     $self->{$_} = $parm{$_}
61     }
62     }
63     }
64     bless $self, ref($type) || $type;
65     }
66    
67     sub name {$_[0]->{'name'}}
68    
69     sub _split_pos {
70     my ($text, $pos) = @{$_[0]};
71     my @result;
72    
73     $text =~ s/(^\s+)// and $pos += length($1);
74     while ($text =~ s/(^\S+)//) {
75     my $word = $1;
76     push @result, [$word, $pos];
77     $pos += length($word);
78     $text =~ s/(^\s+)// and $pos += length($1);
79     }
80     @result;
81     }
82    
83     sub _xfiltergen {
84     my $filter = pop @_;
85    
86 ulpfr 13 # Oops, we cannot overrule the user's choice. Other filters may kill
87     # stopwords, such as isotr clobbers "isn't" to "isnt".
88    
89     # if ($filter eq 'stop') { # avoid the slow stopword elimination
90     # return _xfiltergen(@_); # it's cheaper to look them up afterwards
91     # }
92 ulpfr 10 if (@_) {
93     if ($filter =~ /^split(\d*)/) {
94     if ($1) {
95     "grep(length(\$_->[0])>=$1, map(&WAIT::Filter::split_pos(\$_), " . _xfiltergen(@_) .'))' ;
96     } else {
97     "map(&WAIT::Filter::split_pos(\$_), " . _xfiltergen(@_) .')' ;
98     }
99     } else {
100     "map ([&WAIT::Filter::$filter(\$_->[0]), \$_->[1]]," ._xfiltergen(@_) .')';
101     }
102     } else {
103     if ($filter =~ /^split(\d*)/) {
104     if ($1) {
105     "grep(length(\$_->[0])>=$1, map(&WAIT::Filter::split_pos(\$_), [\$_[0], 0]))" ;
106     } else {
107     "map(&WAIT::Filter::split_pos(\$_), [\$_[0], 0])" ;
108     }
109     } else {
110     "map ([&WAIT::Filter::$filter(\$_->[0]), \$_->[1]], [\$_[0], 0])";
111     }
112     }
113     }
114    
115     sub parse_pos {
116     my $self = shift;
117    
118     unless (exists $self->{xfunc}) {
119     $self->{xfunc} =
120     eval sprintf("sub {%s}", _xfiltergen(@{$self->{filter}}));
121     #printf "\nsub{%s}$@\n", _xfiltergen(@{$self->{filter}});
122     }
123     &{$self->{xfunc}}($_[0]);
124     }
125    
126     sub _filtergen {
127     my $filter = pop @_;
128    
129     if (@_) {
130     "map(&WAIT::Filter::$filter(\$_), " . _filtergen(@_) . ')';
131     } else {
132     "map(&WAIT::Filter::$filter(\$_), \@_)";
133     }
134     }
135    
136     sub drop {
137     my $self = shift;
138     if ((caller)[0] eq 'WAIT::Table') { # Table knows about this
139     my $file = $self->{file};
140    
141     ! (!-e $file or unlink $file);
142     } else { # notify our database
143     croak ref($self)."::drop called directly";
144     }
145     }
146    
147 ulpfr 22 sub is_an_old_index {
148     my $self = shift;
149    
150     return 0 if $no_old_index_support;
151     return $self->{old_index} if exists $self->{old_index};
152    
153     # We can only guess if this is an old index. We lookup the first 10
154     # $O entries. If all values are integers, we assume that the index
155     # is an old one.
156    
157     defined $self->{db} or $self->open;
158     $self->sync;
159     my $dbh = $self->{dbh}; # for convenience
160    
161     my $O = pack('C', 0xff)."o";
162 laperla 40 my ($word, $value) = ($O.$;); # $word and $value are modified by seq!
163     if ( my $ret = $dbh->seq($word, $value, R_CURSOR) ) {
164     # warn "DEBUG: ret[$ret], not an old index, either empty or no \$^O";
165     return $self->{old_index} = 0;
166     }
167 ulpfr 22 for (my $i=0; $i<10;$i++) {
168     if ($value !~ /^\d+$/) {
169 laperla 40 # warn "DEBUG: word[$word]value[$value], not an old index";
170 ulpfr 22 return $self->{old_index} = 0;
171     }
172 laperla 40 if (my $ret = $dbh->seq($word, $value, R_NEXT) or # no values left
173 ulpfr 36 $word !~ /^$O$;/o # no $O values left
174 ulpfr 22 ) {
175     # we are not sure enough that this is an old index
176 laperla 40 # warn "DEBUG: ret[$ret]word[$word]value[$value], not an old index";
177 ulpfr 22 return $self->{old_index} = 0;
178     }
179     }
180 laperla 40 # warn "DEBUG: old index";
181 ulpfr 22 return $self->{old_index} = 1;
182     }
183    
184 ulpfr 10 sub open {
185     my $self = shift;
186     my $file = $self->{file};
187    
188     if (defined $self->{dbh}) {
189     $self->{dbh};
190     } else {
191     $self->{func} =
192     eval sprintf("sub {grep /./, %s}", _filtergen(@{$self->{filter}}));
193     $self->{dbh} = tie(%{$self->{db}}, 'DB_File', $file,
194     $self->{mode}, 0664, $DB_BTREE);
195     $self->{cache} = {}
196     if $self->{mode} & O_RDWR;
197     $self->{cdict} = {}
198     if $self->{mode} & O_RDWR;
199     $self->{cached} = 0;
200 ulpfr 22 if (!$no_old_index_support and $self->is_an_old_index()) {
201     warn "This is an old index, upgrade you database";
202     require WAIT::InvertedIndexOld;
203     bless $self, 'WAIT::InvertedIndexOld';
204     }
205 ulpfr 10 }
206     }
207    
208     sub insert {
209     my $self = shift;
210     my $key = shift;
211     my %occ;
212 ulpfr 13
213 ulpfr 10 defined $self->{db} or $self->open;
214     grep $occ{$_}++, &{$self->{func}}(@_);
215     my ($word, $noc);
216     $self->{records}++;
217     while (($word, $noc) = each %occ) {
218     if (defined $self->{cache}->{$word}) {
219 ulpfr 22 $self->{cdict}->{$word}++;
220 ulpfr 10 $self->{cache}->{$word} .= pack 'w2', $key, $noc;
221 ulpfr 22 } else {
222     $self->{cdict}->{$word} = 1;
223 ulpfr 10 $self->{cache}->{$word} = pack 'w2', $key, $noc;
224 ulpfr 13 }
225 ulpfr 10 $self->{cached}++;
226     }
227 ulpfr 19 # This cache limit should be configurable
228 ulpfr 10 $self->sync if $self->{cached} > 100_000;
229     my $maxtf = 0;
230     for (values %occ) {
231     $maxtf = $_ if $_ > $maxtf;
232     }
233 ulpfr 22 $self->{db}->{'m'. $key} = $maxtf;
234 ulpfr 10 }
235    
236 ulpfr 19 # We sort postings by increasing max term frequency (~ by increasing
237     # document length. This reduces the quality degradation if we process
238     # only the first part of a posting list.
239    
240     sub sort_postings {
241     my $self = shift;
242     my $post = shift; # reference to a hash or packed string
243    
244     if (ref $post) {
245     # we skip the sort part, if the index is not sorted
246     return pack('w*', %$post) unless $self->{reorg};
247     } else {
248     $post = { unpack 'w*', $post };
249     }
250    
251     my $r = '';
252    
253     # Sort posting list by increasing ratio of maximum term frequency (~
254     # "document length") and term frequency. This rati multipied by the
255     # inverse document frequence gives the score for a term. This sort
256     # order can be exploited for tuning of single term queries.
257    
258 laperla 30 for my $did (keys %$post) { # sanity check
259     unless ($self->{db}->{"m". $did}) {
260     warn "Warning from WAIT: DIVZERO threat from did[$did] post[$post->{$did}]";
261     $self->{db}->{"m". $did} = 1; # fails if we have not opened for writing
262     }
263     }
264 ulpfr 22 for my $did (sort { $post->{$b} / $self->{db}->{'m'. $b}
265 ulpfr 19 <=>
266 ulpfr 22 $post->{$a} / $self->{db}->{'m'. $a}
267 ulpfr 19 } keys %$post) {
268     $r .= pack 'w2', $did, $post->{$did};
269     }
270     #warn sprintf "reorg %d %s\n", scalar keys %$post, join ' ', unpack 'w*', $r;
271     $r;
272     }
273    
274 ulpfr 10 sub delete {
275     my $self = shift;
276     my $key = shift;
277     my %occ;
278    
279 ulpfr 19 my $db;
280 ulpfr 10 defined $self->{db} or $self->open;
281 ulpfr 19 $db = $self->{db};
282 ulpfr 10 $self->sync;
283     $self->{records}--;
284 ulpfr 19
285     # less than zero documents in database?
286     _complain('delete of document', $key) and $self->{records} = 0
287     if $self->{records} < 0;
288    
289 ulpfr 10 grep $occ{$_}++, &{$self->{func}}(@_);
290 ulpfr 19
291 laperla 30 # Be prepared for "Odd number of elements in hash assignment"
292     local $SIG{__WARN__} = sub {
293     my $warning = shift;
294     chomp $warning;
295     warn "Catching warning[$warning] during delete of key[$key]";
296     };
297 ulpfr 19 for (keys %occ) {# may reorder posting list
298 ulpfr 22 my %post = unpack 'w*', $db->{'p'.$_};
299 ulpfr 10 delete $post{$key};
300 ulpfr 22 $db->{'p'.$_} = $self->sort_postings(\%post);
301     _complain('delete of term', $_) if $db->{'o'.$_}-1 != keys %post;
302     $db->{'o'.$_} = scalar keys %post;
303 ulpfr 10 }
304 ulpfr 22 delete $db->{'m'. $key};
305 ulpfr 10 }
306    
307     sub intervall {
308     my ($self, $first, $last) = @_;
309     my $value = '';
310     my $word = '';
311     my @result;
312    
313     return unless exists $self->{'intervall'};
314    
315     defined $self->{db} or $self->open;
316     $self->sync;
317     my $dbh = $self->{dbh}; # for convenience
318    
319     if (ref $self->{'intervall'}) {
320     unless (exists $self->{'ifunc'}) {
321     $self->{'ifunc'} =
322     eval sprintf("sub {grep /./, %s}", _filtergen(@{$self->{intervall}}));
323     }
324     ($first) = &{$self->{'ifunc'}}($first) if $first;
325     ($last) = &{$self->{'ifunc'}}($last) if $last;
326     }
327 ulpfr 22 $first = 'p'.($first||'');
328     $last = (defined $last)?'p'.$last:'q';
329    
330     # set the cursor to $first
331     $dbh->seq($first, $value, R_CURSOR);
332    
333     # $first would be after the last word
334     return () if $first gt $last;
335 ulpfr 10
336 ulpfr 22 push @result, substr($first,1);
337 ulpfr 10 while (!$dbh->seq($word, $value, R_NEXT)) {
338     # We should limit this to a "resonable" number of words
339 ulpfr 22 last if $word gt $last;
340     push @result, substr($word,1);
341 ulpfr 10 }
342     \@result; # speed
343     }
344    
345     sub prefix {
346     my ($self, $prefix) = @_;
347     my $value = '';
348     my $word = '';
349     my @result;
350    
351     return () unless defined $prefix; # Full dictionary requested !!
352     return unless exists $self->{'prefix'};
353     defined $self->{db} or $self->open;
354     $self->sync;
355     my $dbh = $self->{dbh};
356    
357     if (ref $self->{'prefix'}) {
358     unless (exists $self->{'pfunc'}) {
359     $self->{'pfunc'} =
360     eval sprintf("sub {grep /./, %s}", _filtergen(@{$self->{prefix}}));
361     }
362     ($prefix) = &{$self->{'pfunc'}}($prefix);
363     }
364    
365 ulpfr 22 if ($dbh->seq($word = 'p'.$prefix, $value, R_CURSOR)) {
366 ulpfr 10 return ();
367     }
368 ulpfr 22 return () if $word !~ /^p$prefix/;
369     push @result, substr($word,1);
370 ulpfr 10
371     while (!$dbh->seq($word, $value, R_NEXT)) {
372     # We should limit this to a "resonable" number of words
373 ulpfr 22 last if $word !~ /^p$prefix/;
374     push @result, substr($word,1);
375 ulpfr 10 }
376     \@result; # speed
377     }
378    
379 ulpfr 19 =head2 search($query)
380    
381     The search method supports a range of search algorithms. It is
382     recommended to tune the index by calling
383     C<$table-E<gt>set(top=E<gt>1)> B<after> bulk inserting the documents
384     into the table. This is a computing intense operation and all inserts
385     and deletes after this optimization are slightly more expensive. Once
386     reorganized, the index is kept sorted automatically until you switch
387     the optimization off by calling C<$table-E<gt>set(top=E<gt>0)>.
388    
389     When searching a tuned index, a query can be processed faster if the
390     caller requests only the topmost documents. This can be done by
391     passing a C<top =E<gt>> I<n> parameter to the search method.
392    
393     For single term queries, the method returns only the I<n> top ranking
394     documents. For multi term queries two optimized algorithms are
395     available. The first algorithm computes the top n documents
396     approximately but very fast, sacrificing a little bit of precision for
397     speed. The second algorithm computes the topmost I<n> documents
398     precisely. This algorithm is slower and should be used only for small
399     values of I<n>. It can be requested by passing the query attribute
400     C<picky =E<gt> 1>. Both algorithms may return more than I<n> hits.
401     While the picky version might not be faster than the brute force
402     version on average for modest size databases it uses less memory and
403     the processing time is almost linear in the number of query terms, not
404     in the size of the lists.
405    
406     =cut
407    
408 ulpfr 10 sub search {
409     my $self = shift;
410 ulpfr 19 my $query = shift;
411 ulpfr 10
412     defined $self->{db} or $self->open;
413     $self->sync;
414 ulpfr 19 $self->search_raw($query, &{$self->{func}}(@_)); # No call to parse() here
415 ulpfr 10 }
416    
417     sub parse {
418     my $self = shift;
419    
420     defined $self->{db} or $self->open;
421     &{$self->{func}}(@_);
422     }
423    
424     sub search_prefix {
425     my $self = shift;
426    
427     # print "search_prefix(@_)\n";
428     defined $self->{db} or $self->open;
429     $self->search_raw(map($self->prefix($_), @_));
430     }
431    
432 ulpfr 19 sub _complain ($$) {
433     my ($action, $term) = @_;
434    
435     require Carp;
436     Carp::cluck
437     (sprintf("WAIT database inconsistency during $action [%s]: ".
438     "Please rebuild index\n",
439     $term,));
440     }
441    
442 ulpfr 10 sub search_raw {
443     my $self = shift;
444 ulpfr 19 my $query = shift;
445 ulpfr 10 my %score;
446    
447 ulpfr 19 # Top $wanted documents must be correct. Zero means all matching
448     # documents.
449     my $wanted = $query->{top};
450     my $strict = $query->{picky};
451 ulpfr 10
452 ulpfr 19 # Return at least $minacc documents. Zero means all matching
453     # documents.
454     # my $minacc = $query->{accus} || $wanted;
455    
456     # Open index and flush cache if necessary
457 ulpfr 10 defined $self->{db} or $self->open;
458     $self->sync;
459 ulpfr 19
460     # We keep duplicates
461     my @terms =
462     # Sort words by decreasing document frequency
463 ulpfr 22 sort { $self->{db}->{'o'.$a} <=> $self->{db}->{'o'.$b} }
464 ulpfr 19 # check which words occur in the index.
465 ulpfr 22 grep { $self->{db}->{'o'.$_} } @_;
466 ulpfr 19
467 laperla 40 return unless @terms;
468 ulpfr 19
469     # We special-case one term queries here. If the index was sorted,
470     # choping off the rest of the list will return the same ranking.
471     if ($wanted and @terms == 1) {
472     my $term = shift @terms;
473 ulpfr 22 my $idf = log($self->{records}/$self->{db}->{'o'.$term});
474 ulpfr 19 my @res;
475    
476     if ($self->{reorg}) { # or not $query->{picky}
477 ulpfr 22 @res = unpack "w". int(2*$wanted), $self->{db}->{'p'.$term};
478 ulpfr 19 } else {
479 ulpfr 22 @res = unpack 'w*', $self->{db}->{'p'.$term};
480 ulpfr 19 }
481    
482     for (my $i=1; $i<@res; $i+=2) {
483 laperla 30 # $res[$i] /= $self->{db}->{'m'. $res[$i-1]} / $idf;
484     # above was written badly, allows two DIV_ZERO problems.
485     my $maxtf = $self->{db}->{"m". $res[$i-1]};
486     unless ($maxtf) {
487     warn "WAIT-Warning: Averting DIVZERO for i[$i] \$res[\$i-1][$res[$i-1]] term[$term]";
488     $maxtf = 1;
489     }
490     $res[$i] = ($res[$i] / $maxtf) * $idf;
491 ulpfr 19 }
492    
493     return @res
494     }
495    
496     # We separate exhaustive search here to avoid overhead and make the
497     # code more readable. The block can be removed without changing the
498     # result.
499     unless ($wanted) {
500     for (@terms) {
501 ulpfr 22 my $df = $self->{db}->{'o'.$_};
502 ulpfr 19
503     # The frequency *must* be 1 at least since the posting list is nonempty
504     _complain('search for term', $_) and $df = 1 if $df < 1;
505    
506     # Unpack posting list for current query term $_
507 ulpfr 22 my %post = unpack 'w*', $self->{db}->{'p'.$_};
508 ulpfr 19
509 ulpfr 22 _complain('search for term', $_) if $self->{db}->{'o'.$_} != keys %post;
510 ulpfr 19 # This is the inverse document frequency. The log of the inverse
511     # fraction of documents the term occurs in.
512     my $idf = log($self->{records}/$df);
513     for my $did (keys %post) {
514 ulpfr 22 if (my $freq = $self->{db}->{'m'. $did}) {
515 ulpfr 19 $score{$did} += $post{$did} / $freq * $idf;
516     }
517 ulpfr 10 }
518     }
519 ulpfr 19 # warn sprintf "Used %d accumulators\n", scalar keys %score;
520     return %score;
521 ulpfr 10 }
522 ulpfr 19
523     # A sloppy but fast algorithm for multiple term queries.
524     unless ($strict) {
525     for (@terms) {
526     # Unpack posting list for current query term $_
527 ulpfr 22 my %post = unpack 'w*', $self->{db}->{'p'.$_};
528 ulpfr 19
529     # Lookup the number of documents the term occurs in (document frequency)
530 ulpfr 22 my $occ = $self->{db}->{'o'.$_};
531 ulpfr 19
532 ulpfr 22 _complain('search for term', $_) if $self->{db}->{'o'.$_} != keys %post;
533 ulpfr 19 # The frequency *must* be 1 at least since the posting list is nonempty
534     _complain('search for term', $_) and $occ = 1 if $occ < 1;
535    
536     # This is the inverse document frequency. The log of the inverse
537     # fraction of documents the term occurs in.
538     my $idf = log($self->{records}/$occ);
539    
540     # If we have a reasonable number of accumulators, change the
541     # loop to iterate over the accumulators. This will compromise
542     # quality for better speed. The algorithm still computes the
543     # exact weights, but the result is not guaranteed to contain the
544     # *best* results. The database might contain documents better
545     # than the worst returned document.
546    
547     # We process the lists in order of increasing length. When the
548     # number of accumulators exceeds $wanted, no new documents are
549     # added, only the ranking/weighting of the seen documents is
550     # improved. The resulting ranking list must be pruned, since only
551     # the top most documents end up near their "optimal" rank.
552    
553     if (keys %score < $wanted) {
554     for my $did (keys %post) {
555 ulpfr 22 if (my $freq = $self->{db}->{'m'. $did}) {
556 ulpfr 19 $score{$did} += $post{$did} / $freq * $idf;
557     }
558     }
559     } else {
560     for my $did (keys %score) {
561     next unless exists $post{$did};
562 ulpfr 22 if (my $freq = $self->{db}->{'m'. $did}) {
563 ulpfr 19 $score{$did} += $post{$did} / $freq * $idf;
564     }
565     }
566     }
567     }
568     return %score;
569     }
570     my @max; $max[$#terms+1]=0;
571     my @idf;
572    
573     # Preparation loop. This extra loop makes sense only when "reorg"
574     # and "wanted" are true. But at the time beeing, keeping the code
575     # for the different search algorithms in one place seems more
576     # desirable than some minor speedup of the brute force version. We
577     # do cache $idf though.
578    
579     for (my $i = $#terms; $i >=0; $i--) {
580     local $_ = $terms[$i];
581     # Lookup the number of documents the term occurs in (document frequency)
582 ulpfr 22 my $df = $self->{db}->{'o'.$_};
583 ulpfr 19
584     # The frequency *must* be 1 at least since the posting list is nonempty
585     _complain('search for term', $_) and $df = 1 if $df < 1;
586    
587     # This is the inverse document frequency. The log of the inverse
588     # fraction of documents the term occurs in.
589     $idf[$i] = log($self->{records}/$df);
590    
591     my ($did,$occ);
592     if ($self->{reorg}) {
593 ulpfr 22 ($did,$occ) = unpack 'w2', $self->{db}->{'p'.$_};
594 ulpfr 19 } else { # Maybe this costs more than it helps
595 ulpfr 22 ($did,$occ) = unpack 'w2', $self->sort_postings($self->{db}->{'p'.$_});
596 ulpfr 19 }
597 ulpfr 22 my $freq = $self->{db}->{'m'. $did};
598 ulpfr 19 my $max = $occ/$freq*$idf[$i];
599     $max[$i] = $max + $max[$i+1];
600     }
601    
602     # Main loop
603     for my $i (0 .. $#terms) {
604     my $term = $terms[$i];
605     # Unpack posting list for current query term $term. We loose the
606     # sorting order because the assignment to a hash.
607 ulpfr 22 my %post = unpack 'w*', $self->{db}->{'p'.$term};
608 ulpfr 19
609     _complain('search for term', $term)
610 ulpfr 22 if $self->{db}->{'o'.$term} != keys %post;
611 ulpfr 19
612     my $idf = $idf[$i];
613     my $full; # Need to process all postings
614     my $chop; # Score necessary to enter the ranking list
615    
616     if (# We know that wanted is true since we especial cased the
617     # exhaustive search.
618    
619     $wanted and
620    
621     # We did sort here if necessary in
622     # the preparation loop
623     # $self->{reorg} and
624    
625     scalar keys %score > $wanted) {
626     $chop = (sort { $b <=> $a } values %score)[$wanted];
627     $full = $max[$i] > $chop;
628     } else {
629     $full = 1;
630     }
631    
632     if ($full) {
633     # We need to inspect the full list. Either $wanted is not given,
634     # the index is not sorted, or we don't have enough accumulators
635     # yet.
636     if (defined $chop) {
637     # We might be able to avoid allocating accumulators
638     for my $did (keys %post) {
639 ulpfr 22 if (my $freq = $self->{db}->{'m'. $did}) {
640 ulpfr 19 my $wgt = $post{$did} / $freq * $idf;
641     # We add an accumulator if $wgt exeeds $chop
642     if (exists $score{$did} or $wgt > $chop) {
643     $score{$did} += $wgt;
644     }
645     }
646     }
647     } else {
648     # Allocate acumulators for each seen document.
649     for my $did (keys %post) {
650 ulpfr 22 if (my $freq = $self->{db}->{'m'. $did}) {
651 ulpfr 19 $score{$did} += $post{$did} / $freq * $idf;
652     }
653     }
654     }
655     } else {
656     # Update existing accumulators
657     for my $did (keys %score) {
658     next unless exists $post{$did};
659 ulpfr 22 if (my $freq = $self->{db}->{'m'. $did}) {
660 ulpfr 19 $score{$did} += $post{$did} / $freq * $idf;
661     }
662     }
663     }
664     }
665     #warn sprintf "Used %d accumulators\n", scalar keys %score;
666 ulpfr 10 %score;
667     }
668    
669 ulpfr 19 sub set {
670     my ($self, $attr, $value) = @_;
671    
672     die "No such indexy attribute: '$attr'" unless $attr eq 'top';
673    
674     return delete $self->{reorg} if $value == 0;
675    
676     return if $self->{reorg}; # we are sorted already
677     return unless $self->{mode} & O_RDWR;
678     defined $self->{db} or $self->open;
679    
680     $self->sync;
681     while (my($key, $value) = each %{$self->{db}}) {
682 ulpfr 22 next if $key !~ /^p/;
683 ulpfr 19 $self->{db}->{$key} = $self->sort_postings($value);
684     }
685     $self->{reorg} = 1;
686     }
687    
688 ulpfr 10 sub sync {
689     my $self = shift;
690    
691     if ($self->{mode} & O_RDWR) {
692 ulpfr 19 print STDERR "Flushing $self->{cached} postings\n" if $self->{cached};
693 ulpfr 10 while (my($key, $value) = each %{$self->{cache}}) {
694 laperla 30 $self->{db}->{"p". $key} ||= "";
695 ulpfr 19 if ($self->{reorg}) {
696 ulpfr 22 $self->{db}->{'p'.$key} = $self->sort_postings($self->{db}->{'p'.$key}
697 ulpfr 19 . $value);
698     } else {
699 ulpfr 22 $self->{db}->{'p'.$key} .= $value;
700 ulpfr 19 }
701 ulpfr 10 }
702     while (my($key, $value) = each %{$self->{cdict}}) {
703 ulpfr 22 $self->{db}->{'o'.$key} = 0 unless $self->{db}->{'o'.$key};
704     $self->{db}->{'o'.$key} += $value;
705 ulpfr 10 }
706 ulpfr 19 $self->{cache} = {};
707     $self->{cdict} = {};
708 ulpfr 10 $self->{cached} = 0;
709     }
710     }
711    
712     sub close {
713     my $self = shift;
714    
715     if ($self->{dbh}) {
716     $self->sync;
717     delete $self->{dbh};
718     untie %{$self->{db}};
719     delete $self->{db};
720     delete $self->{func};
721     delete $self->{cache};
722     delete $self->{cached};
723     delete $self->{cdict};
724     delete $self->{pfunc} if defined $self->{pfunc};
725     delete $self->{ifunc} if defined $self->{ifunc};
726     delete $self->{xfunc} if defined $self->{xfunc};
727     }
728     }
729    
730 ulpfr 51 sub keys {
731     my $self = shift;
732    
733     defined $self->{db} or $self->open;
734     keys %{$self->{db}};
735     }
736    
737 ulpfr 10 1;
738    

Properties

Name Value
cvs2svn:cvs-rev 1.8

  ViewVC Help
Powered by ViewVC 1.1.26