/[wait]/cvs-head/lib/WAIT/InvertedIndex.pm
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Annotation of /cvs-head/lib/WAIT/InvertedIndex.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 36 - (hide annotations)
Sun Nov 12 17:01:59 2000 UTC (23 years, 6 months ago) by ulpfr
File size: 21995 byte(s)
$0 -> $O$;

1 ulpfr 19 # -*- Mode: Perl -*-
2     # $Basename: InvertedIndex.pm $
3     # $Revision: 1.30 $
4 ulpfr 10 # Author : Ulrich Pfeifer
5     # Created On : Thu Aug 8 13:05:10 1996
6     # Last Modified By: Ulrich Pfeifer
7 ulpfr 36 # Last Modified On: Sun Nov 12 14:40:21 2000
8 ulpfr 10 # Language : CPerl
9 ulpfr 19 #
10     # (C) Copyright 1996-2000, Ulrich Pfeifer
11     #
12 ulpfr 10
13     package WAIT::InvertedIndex;
14     use strict;
15     use DB_File;
16     use Fcntl;
17     use WAIT::Filter;
18     use Carp;
19 laperla 30 use vars qw(%FUNC $VERSION);
20 ulpfr 10
21 laperla 30 $VERSION = "1.801"; # others test if we are loaded by checking $VERSION
22    
23 ulpfr 22 # The dictionary has three different key types:
24     # 'o'.$word
25     #
26     # The document frequency is the number of documents a term occurs
27     # in. The idea is that a term occuring in a significant part of the
28     # documents is not too significant.
29     #
30     # 'm'.$word
31     #
32     # The maximum term frequency of a document is the frequency of the
33     # most frequent term in the document. It is related to the document
34     # length obviously. A document in which the most frequnet term occurs
35     # 100 times is probably much longer than a document whichs most
36     # frequent term occurs five time.
37     #
38     # 'p'.$word
39     #
40     # Under this key we store the actual posting list as pairs of
41     # packed integers.
42 ulpfr 10
43 ulpfr 22 my $no_old_index_support = 0; # do not check for old indices if set
44 ulpfr 19
45 ulpfr 10 sub new {
46     my $type = shift;
47     my %parm = @_;
48     my $self = {};
49    
50     $self->{file} = $parm{file} or croak "No file specified";
51     $self->{attr} = $parm{attr} or croak "No attributes specified";
52     $self->{filter} = $parm{filter};
53     $self->{'name'} = $parm{'name'};
54     $self->{records} = 0;
55     for (qw(intervall prefix)) {
56     if (exists $parm{$_}) {
57     if (ref $parm{$_}) {
58     $self->{$_} = [@{$parm{$_}}] # clone
59     } else {
60     $self->{$_} = $parm{$_}
61     }
62     }
63     }
64     bless $self, ref($type) || $type;
65     }
66    
67     sub name {$_[0]->{'name'}}
68    
69     sub _split_pos {
70     my ($text, $pos) = @{$_[0]};
71     my @result;
72    
73     $text =~ s/(^\s+)// and $pos += length($1);
74     while ($text =~ s/(^\S+)//) {
75     my $word = $1;
76     push @result, [$word, $pos];
77     $pos += length($word);
78     $text =~ s/(^\s+)// and $pos += length($1);
79     }
80     @result;
81     }
82    
83     sub _xfiltergen {
84     my $filter = pop @_;
85    
86 ulpfr 13 # Oops, we cannot overrule the user's choice. Other filters may kill
87     # stopwords, such as isotr clobbers "isn't" to "isnt".
88    
89     # if ($filter eq 'stop') { # avoid the slow stopword elimination
90     # return _xfiltergen(@_); # it's cheaper to look them up afterwards
91     # }
92 ulpfr 10 if (@_) {
93     if ($filter =~ /^split(\d*)/) {
94     if ($1) {
95     "grep(length(\$_->[0])>=$1, map(&WAIT::Filter::split_pos(\$_), " . _xfiltergen(@_) .'))' ;
96     } else {
97     "map(&WAIT::Filter::split_pos(\$_), " . _xfiltergen(@_) .')' ;
98     }
99     } else {
100     "map ([&WAIT::Filter::$filter(\$_->[0]), \$_->[1]]," ._xfiltergen(@_) .')';
101     }
102     } else {
103     if ($filter =~ /^split(\d*)/) {
104     if ($1) {
105     "grep(length(\$_->[0])>=$1, map(&WAIT::Filter::split_pos(\$_), [\$_[0], 0]))" ;
106     } else {
107     "map(&WAIT::Filter::split_pos(\$_), [\$_[0], 0])" ;
108     }
109     } else {
110     "map ([&WAIT::Filter::$filter(\$_->[0]), \$_->[1]], [\$_[0], 0])";
111     }
112     }
113     }
114    
115     sub parse_pos {
116     my $self = shift;
117    
118     unless (exists $self->{xfunc}) {
119     $self->{xfunc} =
120     eval sprintf("sub {%s}", _xfiltergen(@{$self->{filter}}));
121     #printf "\nsub{%s}$@\n", _xfiltergen(@{$self->{filter}});
122     }
123     &{$self->{xfunc}}($_[0]);
124     }
125    
126     sub _filtergen {
127     my $filter = pop @_;
128    
129     if (@_) {
130     "map(&WAIT::Filter::$filter(\$_), " . _filtergen(@_) . ')';
131     } else {
132     "map(&WAIT::Filter::$filter(\$_), \@_)";
133     }
134     }
135    
136     sub drop {
137     my $self = shift;
138     if ((caller)[0] eq 'WAIT::Table') { # Table knows about this
139     my $file = $self->{file};
140    
141     ! (!-e $file or unlink $file);
142     } else { # notify our database
143     croak ref($self)."::drop called directly";
144     }
145     }
146    
147 ulpfr 22 sub is_an_old_index {
148     my $self = shift;
149    
150     return 0 if $no_old_index_support;
151     return $self->{old_index} if exists $self->{old_index};
152    
153     # We can only guess if this is an old index. We lookup the first 10
154     # $O entries. If all values are integers, we assume that the index
155     # is an old one.
156    
157     defined $self->{db} or $self->open;
158     $self->sync;
159     my $dbh = $self->{dbh}; # for convenience
160    
161     my $O = pack('C', 0xff)."o";
162 ulpfr 33 my ($word, $value) = ($O.$;); # $word and $value are modified!
163     $dbh->seq($word, $value, R_CURSOR) or return $self->{old_index} = 0;
164 ulpfr 22 for (my $i=0; $i<10;$i++) {
165     if ($value !~ /^\d+$/) {
166     return $self->{old_index} = 0;
167     }
168     if ($dbh->seq($word, $value, R_NEXT) or # no values left
169 ulpfr 36 $word !~ /^$O$;/o # no $O values left
170 ulpfr 22 ) {
171     # we are not sure enough that this is an old index
172     return $self->{old_index} = 0;
173     }
174     }
175     return $self->{old_index} = 1;
176     }
177    
178 ulpfr 10 sub open {
179     my $self = shift;
180     my $file = $self->{file};
181    
182     if (defined $self->{dbh}) {
183     $self->{dbh};
184     } else {
185     $self->{func} =
186     eval sprintf("sub {grep /./, %s}", _filtergen(@{$self->{filter}}));
187     $self->{dbh} = tie(%{$self->{db}}, 'DB_File', $file,
188     $self->{mode}, 0664, $DB_BTREE);
189     $self->{cache} = {}
190     if $self->{mode} & O_RDWR;
191     $self->{cdict} = {}
192     if $self->{mode} & O_RDWR;
193     $self->{cached} = 0;
194 ulpfr 22 if (!$no_old_index_support and $self->is_an_old_index()) {
195     warn "This is an old index, upgrade you database";
196     require WAIT::InvertedIndexOld;
197     bless $self, 'WAIT::InvertedIndexOld';
198     }
199 ulpfr 10 }
200     }
201    
202     sub insert {
203     my $self = shift;
204     my $key = shift;
205     my %occ;
206 ulpfr 13
207 ulpfr 10 defined $self->{db} or $self->open;
208     grep $occ{$_}++, &{$self->{func}}(@_);
209     my ($word, $noc);
210     $self->{records}++;
211     while (($word, $noc) = each %occ) {
212     if (defined $self->{cache}->{$word}) {
213 ulpfr 22 $self->{cdict}->{$word}++;
214 ulpfr 10 $self->{cache}->{$word} .= pack 'w2', $key, $noc;
215 ulpfr 22 } else {
216     $self->{cdict}->{$word} = 1;
217 ulpfr 10 $self->{cache}->{$word} = pack 'w2', $key, $noc;
218 ulpfr 13 }
219 ulpfr 10 $self->{cached}++;
220     }
221 ulpfr 19 # This cache limit should be configurable
222 ulpfr 10 $self->sync if $self->{cached} > 100_000;
223     my $maxtf = 0;
224     for (values %occ) {
225     $maxtf = $_ if $_ > $maxtf;
226     }
227 ulpfr 22 $self->{db}->{'m'. $key} = $maxtf;
228 ulpfr 10 }
229    
230 ulpfr 19 # We sort postings by increasing max term frequency (~ by increasing
231     # document length. This reduces the quality degradation if we process
232     # only the first part of a posting list.
233    
234     sub sort_postings {
235     my $self = shift;
236     my $post = shift; # reference to a hash or packed string
237    
238     if (ref $post) {
239     # we skip the sort part, if the index is not sorted
240     return pack('w*', %$post) unless $self->{reorg};
241     } else {
242     $post = { unpack 'w*', $post };
243     }
244    
245     my $r = '';
246    
247     # Sort posting list by increasing ratio of maximum term frequency (~
248     # "document length") and term frequency. This rati multipied by the
249     # inverse document frequence gives the score for a term. This sort
250     # order can be exploited for tuning of single term queries.
251    
252 laperla 30 for my $did (keys %$post) { # sanity check
253     unless ($self->{db}->{"m". $did}) {
254     warn "Warning from WAIT: DIVZERO threat from did[$did] post[$post->{$did}]";
255     $self->{db}->{"m". $did} = 1; # fails if we have not opened for writing
256     }
257     }
258 ulpfr 22 for my $did (sort { $post->{$b} / $self->{db}->{'m'. $b}
259 ulpfr 19 <=>
260 ulpfr 22 $post->{$a} / $self->{db}->{'m'. $a}
261 ulpfr 19 } keys %$post) {
262     $r .= pack 'w2', $did, $post->{$did};
263     }
264     #warn sprintf "reorg %d %s\n", scalar keys %$post, join ' ', unpack 'w*', $r;
265     $r;
266     }
267    
268 ulpfr 10 sub delete {
269     my $self = shift;
270     my $key = shift;
271     my %occ;
272    
273 ulpfr 19 my $db;
274 ulpfr 10 defined $self->{db} or $self->open;
275 ulpfr 19 $db = $self->{db};
276 ulpfr 10 $self->sync;
277     $self->{records}--;
278 ulpfr 19
279     # less than zero documents in database?
280     _complain('delete of document', $key) and $self->{records} = 0
281     if $self->{records} < 0;
282    
283 ulpfr 10 grep $occ{$_}++, &{$self->{func}}(@_);
284 ulpfr 19
285 laperla 30 # Be prepared for "Odd number of elements in hash assignment"
286     local $SIG{__WARN__} = sub {
287     my $warning = shift;
288     chomp $warning;
289     warn "Catching warning[$warning] during delete of key[$key]";
290     };
291 ulpfr 19 for (keys %occ) {# may reorder posting list
292 ulpfr 22 my %post = unpack 'w*', $db->{'p'.$_};
293 ulpfr 10 delete $post{$key};
294 ulpfr 22 $db->{'p'.$_} = $self->sort_postings(\%post);
295     _complain('delete of term', $_) if $db->{'o'.$_}-1 != keys %post;
296     $db->{'o'.$_} = scalar keys %post;
297 ulpfr 10 }
298 ulpfr 22 delete $db->{'m'. $key};
299 ulpfr 10 }
300    
301     sub intervall {
302     my ($self, $first, $last) = @_;
303     my $value = '';
304     my $word = '';
305     my @result;
306    
307     return unless exists $self->{'intervall'};
308    
309     defined $self->{db} or $self->open;
310     $self->sync;
311     my $dbh = $self->{dbh}; # for convenience
312    
313     if (ref $self->{'intervall'}) {
314     unless (exists $self->{'ifunc'}) {
315     $self->{'ifunc'} =
316     eval sprintf("sub {grep /./, %s}", _filtergen(@{$self->{intervall}}));
317     }
318     ($first) = &{$self->{'ifunc'}}($first) if $first;
319     ($last) = &{$self->{'ifunc'}}($last) if $last;
320     }
321 ulpfr 22 $first = 'p'.($first||'');
322     $last = (defined $last)?'p'.$last:'q';
323    
324     # set the cursor to $first
325     $dbh->seq($first, $value, R_CURSOR);
326    
327     # $first would be after the last word
328     return () if $first gt $last;
329 ulpfr 10
330 ulpfr 22 push @result, substr($first,1);
331 ulpfr 10 while (!$dbh->seq($word, $value, R_NEXT)) {
332     # We should limit this to a "resonable" number of words
333 ulpfr 22 last if $word gt $last;
334     push @result, substr($word,1);
335 ulpfr 10 }
336     \@result; # speed
337     }
338    
339     sub prefix {
340     my ($self, $prefix) = @_;
341     my $value = '';
342     my $word = '';
343     my @result;
344    
345     return () unless defined $prefix; # Full dictionary requested !!
346     return unless exists $self->{'prefix'};
347     defined $self->{db} or $self->open;
348     $self->sync;
349     my $dbh = $self->{dbh};
350    
351     if (ref $self->{'prefix'}) {
352     unless (exists $self->{'pfunc'}) {
353     $self->{'pfunc'} =
354     eval sprintf("sub {grep /./, %s}", _filtergen(@{$self->{prefix}}));
355     }
356     ($prefix) = &{$self->{'pfunc'}}($prefix);
357     }
358    
359 ulpfr 22 if ($dbh->seq($word = 'p'.$prefix, $value, R_CURSOR)) {
360 ulpfr 10 return ();
361     }
362 ulpfr 22 return () if $word !~ /^p$prefix/;
363     push @result, substr($word,1);
364 ulpfr 10
365     while (!$dbh->seq($word, $value, R_NEXT)) {
366     # We should limit this to a "resonable" number of words
367 ulpfr 22 last if $word !~ /^p$prefix/;
368     push @result, substr($word,1);
369 ulpfr 10 }
370     \@result; # speed
371     }
372    
373 ulpfr 19 =head2 search($query)
374    
375     The search method supports a range of search algorithms. It is
376     recommended to tune the index by calling
377     C<$table-E<gt>set(top=E<gt>1)> B<after> bulk inserting the documents
378     into the table. This is a computing intense operation and all inserts
379     and deletes after this optimization are slightly more expensive. Once
380     reorganized, the index is kept sorted automatically until you switch
381     the optimization off by calling C<$table-E<gt>set(top=E<gt>0)>.
382    
383     When searching a tuned index, a query can be processed faster if the
384     caller requests only the topmost documents. This can be done by
385     passing a C<top =E<gt>> I<n> parameter to the search method.
386    
387     For single term queries, the method returns only the I<n> top ranking
388     documents. For multi term queries two optimized algorithms are
389     available. The first algorithm computes the top n documents
390     approximately but very fast, sacrificing a little bit of precision for
391     speed. The second algorithm computes the topmost I<n> documents
392     precisely. This algorithm is slower and should be used only for small
393     values of I<n>. It can be requested by passing the query attribute
394     C<picky =E<gt> 1>. Both algorithms may return more than I<n> hits.
395     While the picky version might not be faster than the brute force
396     version on average for modest size databases it uses less memory and
397     the processing time is almost linear in the number of query terms, not
398     in the size of the lists.
399    
400     =cut
401    
402 ulpfr 10 sub search {
403     my $self = shift;
404 ulpfr 19 my $query = shift;
405 ulpfr 10
406     defined $self->{db} or $self->open;
407     $self->sync;
408 ulpfr 19 $self->search_raw($query, &{$self->{func}}(@_)); # No call to parse() here
409 ulpfr 10 }
410    
411     sub parse {
412     my $self = shift;
413    
414     defined $self->{db} or $self->open;
415     &{$self->{func}}(@_);
416     }
417    
418 ulpfr 13 sub keys {
419     my $self = shift;
420    
421     defined $self->{db} or $self->open;
422     keys %{$self->{db}};
423     }
424    
425 ulpfr 10 sub search_prefix {
426     my $self = shift;
427    
428     # print "search_prefix(@_)\n";
429     defined $self->{db} or $self->open;
430     $self->search_raw(map($self->prefix($_), @_));
431     }
432    
433 ulpfr 19 sub _complain ($$) {
434     my ($action, $term) = @_;
435    
436     require Carp;
437     Carp::cluck
438     (sprintf("WAIT database inconsistency during $action [%s]: ".
439     "Please rebuild index\n",
440     $term,));
441     }
442    
443 ulpfr 10 sub search_raw {
444     my $self = shift;
445 ulpfr 19 my $query = shift;
446 ulpfr 10 my %score;
447    
448 ulpfr 19 # Top $wanted documents must be correct. Zero means all matching
449     # documents.
450     my $wanted = $query->{top};
451     my $strict = $query->{picky};
452 ulpfr 10
453 ulpfr 19 # Return at least $minacc documents. Zero means all matching
454     # documents.
455     # my $minacc = $query->{accus} || $wanted;
456    
457     # Open index and flush cache if necessary
458 ulpfr 10 defined $self->{db} or $self->open;
459     $self->sync;
460 ulpfr 19
461     # We keep duplicates
462     my @terms =
463     # Sort words by decreasing document frequency
464 ulpfr 22 sort { $self->{db}->{'o'.$a} <=> $self->{db}->{'o'.$b} }
465 ulpfr 19 # check which words occur in the index.
466 ulpfr 22 grep { $self->{db}->{'o'.$_} } @_;
467 ulpfr 19
468     return () unless @terms; # nothing to search for
469    
470     # We special-case one term queries here. If the index was sorted,
471     # choping off the rest of the list will return the same ranking.
472     if ($wanted and @terms == 1) {
473     my $term = shift @terms;
474 ulpfr 22 my $idf = log($self->{records}/$self->{db}->{'o'.$term});
475 ulpfr 19 my @res;
476    
477     if ($self->{reorg}) { # or not $query->{picky}
478 ulpfr 22 @res = unpack "w". int(2*$wanted), $self->{db}->{'p'.$term};
479 ulpfr 19 } else {
480 ulpfr 22 @res = unpack 'w*', $self->{db}->{'p'.$term};
481 ulpfr 19 }
482    
483     for (my $i=1; $i<@res; $i+=2) {
484 laperla 30 # $res[$i] /= $self->{db}->{'m'. $res[$i-1]} / $idf;
485     # above was written badly, allows two DIV_ZERO problems.
486     my $maxtf = $self->{db}->{"m". $res[$i-1]};
487     unless ($maxtf) {
488     warn "WAIT-Warning: Averting DIVZERO for i[$i] \$res[\$i-1][$res[$i-1]] term[$term]";
489     $maxtf = 1;
490     }
491     $res[$i] = ($res[$i] / $maxtf) * $idf;
492 ulpfr 19 }
493    
494     return @res
495     }
496    
497     # We separate exhaustive search here to avoid overhead and make the
498     # code more readable. The block can be removed without changing the
499     # result.
500     unless ($wanted) {
501     for (@terms) {
502 ulpfr 22 my $df = $self->{db}->{'o'.$_};
503 ulpfr 19
504     # The frequency *must* be 1 at least since the posting list is nonempty
505     _complain('search for term', $_) and $df = 1 if $df < 1;
506    
507     # Unpack posting list for current query term $_
508 ulpfr 22 my %post = unpack 'w*', $self->{db}->{'p'.$_};
509 ulpfr 19
510 ulpfr 22 _complain('search for term', $_) if $self->{db}->{'o'.$_} != keys %post;
511 ulpfr 19 # This is the inverse document frequency. The log of the inverse
512     # fraction of documents the term occurs in.
513     my $idf = log($self->{records}/$df);
514     for my $did (keys %post) {
515 ulpfr 22 if (my $freq = $self->{db}->{'m'. $did}) {
516 ulpfr 19 $score{$did} += $post{$did} / $freq * $idf;
517     }
518 ulpfr 10 }
519     }
520 ulpfr 19 # warn sprintf "Used %d accumulators\n", scalar keys %score;
521     return %score;
522 ulpfr 10 }
523 ulpfr 19
524     # A sloppy but fast algorithm for multiple term queries.
525     unless ($strict) {
526     for (@terms) {
527     # Unpack posting list for current query term $_
528 ulpfr 22 my %post = unpack 'w*', $self->{db}->{'p'.$_};
529 ulpfr 19
530     # Lookup the number of documents the term occurs in (document frequency)
531 ulpfr 22 my $occ = $self->{db}->{'o'.$_};
532 ulpfr 19
533 ulpfr 22 _complain('search for term', $_) if $self->{db}->{'o'.$_} != keys %post;
534 ulpfr 19 # The frequency *must* be 1 at least since the posting list is nonempty
535     _complain('search for term', $_) and $occ = 1 if $occ < 1;
536    
537     # This is the inverse document frequency. The log of the inverse
538     # fraction of documents the term occurs in.
539     my $idf = log($self->{records}/$occ);
540    
541     # If we have a reasonable number of accumulators, change the
542     # loop to iterate over the accumulators. This will compromise
543     # quality for better speed. The algorithm still computes the
544     # exact weights, but the result is not guaranteed to contain the
545     # *best* results. The database might contain documents better
546     # than the worst returned document.
547    
548     # We process the lists in order of increasing length. When the
549     # number of accumulators exceeds $wanted, no new documents are
550     # added, only the ranking/weighting of the seen documents is
551     # improved. The resulting ranking list must be pruned, since only
552     # the top most documents end up near their "optimal" rank.
553    
554     if (keys %score < $wanted) {
555     for my $did (keys %post) {
556 ulpfr 22 if (my $freq = $self->{db}->{'m'. $did}) {
557 ulpfr 19 $score{$did} += $post{$did} / $freq * $idf;
558     }
559     }
560     } else {
561     for my $did (keys %score) {
562     next unless exists $post{$did};
563 ulpfr 22 if (my $freq = $self->{db}->{'m'. $did}) {
564 ulpfr 19 $score{$did} += $post{$did} / $freq * $idf;
565     }
566     }
567     }
568     }
569     return %score;
570     }
571     my @max; $max[$#terms+1]=0;
572     my @idf;
573    
574     # Preparation loop. This extra loop makes sense only when "reorg"
575     # and "wanted" are true. But at the time beeing, keeping the code
576     # for the different search algorithms in one place seems more
577     # desirable than some minor speedup of the brute force version. We
578     # do cache $idf though.
579    
580     for (my $i = $#terms; $i >=0; $i--) {
581     local $_ = $terms[$i];
582     # Lookup the number of documents the term occurs in (document frequency)
583 ulpfr 22 my $df = $self->{db}->{'o'.$_};
584 ulpfr 19
585     # The frequency *must* be 1 at least since the posting list is nonempty
586     _complain('search for term', $_) and $df = 1 if $df < 1;
587    
588     # This is the inverse document frequency. The log of the inverse
589     # fraction of documents the term occurs in.
590     $idf[$i] = log($self->{records}/$df);
591    
592     my ($did,$occ);
593     if ($self->{reorg}) {
594 ulpfr 22 ($did,$occ) = unpack 'w2', $self->{db}->{'p'.$_};
595 ulpfr 19 } else { # Maybe this costs more than it helps
596 ulpfr 22 ($did,$occ) = unpack 'w2', $self->sort_postings($self->{db}->{'p'.$_});
597 ulpfr 19 }
598 ulpfr 22 my $freq = $self->{db}->{'m'. $did};
599 ulpfr 19 my $max = $occ/$freq*$idf[$i];
600     $max[$i] = $max + $max[$i+1];
601     }
602    
603     # Main loop
604     for my $i (0 .. $#terms) {
605     my $term = $terms[$i];
606     # Unpack posting list for current query term $term. We loose the
607     # sorting order because the assignment to a hash.
608 ulpfr 22 my %post = unpack 'w*', $self->{db}->{'p'.$term};
609 ulpfr 19
610     _complain('search for term', $term)
611 ulpfr 22 if $self->{db}->{'o'.$term} != keys %post;
612 ulpfr 19
613     my $idf = $idf[$i];
614     my $full; # Need to process all postings
615     my $chop; # Score necessary to enter the ranking list
616    
617     if (# We know that wanted is true since we especial cased the
618     # exhaustive search.
619    
620     $wanted and
621    
622     # We did sort here if necessary in
623     # the preparation loop
624     # $self->{reorg} and
625    
626     scalar keys %score > $wanted) {
627     $chop = (sort { $b <=> $a } values %score)[$wanted];
628     $full = $max[$i] > $chop;
629     } else {
630     $full = 1;
631     }
632    
633     if ($full) {
634     # We need to inspect the full list. Either $wanted is not given,
635     # the index is not sorted, or we don't have enough accumulators
636     # yet.
637     if (defined $chop) {
638     # We might be able to avoid allocating accumulators
639     for my $did (keys %post) {
640 ulpfr 22 if (my $freq = $self->{db}->{'m'. $did}) {
641 ulpfr 19 my $wgt = $post{$did} / $freq * $idf;
642     # We add an accumulator if $wgt exeeds $chop
643     if (exists $score{$did} or $wgt > $chop) {
644     $score{$did} += $wgt;
645     }
646     }
647     }
648     } else {
649     # Allocate acumulators for each seen document.
650     for my $did (keys %post) {
651 ulpfr 22 if (my $freq = $self->{db}->{'m'. $did}) {
652 ulpfr 19 $score{$did} += $post{$did} / $freq * $idf;
653     }
654     }
655     }
656     } else {
657     # Update existing accumulators
658     for my $did (keys %score) {
659     next unless exists $post{$did};
660 ulpfr 22 if (my $freq = $self->{db}->{'m'. $did}) {
661 ulpfr 19 $score{$did} += $post{$did} / $freq * $idf;
662     }
663     }
664     }
665     }
666     #warn sprintf "Used %d accumulators\n", scalar keys %score;
667 ulpfr 10 %score;
668     }
669    
670 ulpfr 19 sub set {
671     my ($self, $attr, $value) = @_;
672    
673     die "No such indexy attribute: '$attr'" unless $attr eq 'top';
674    
675     return delete $self->{reorg} if $value == 0;
676    
677     return if $self->{reorg}; # we are sorted already
678     return unless $self->{mode} & O_RDWR;
679     defined $self->{db} or $self->open;
680    
681     $self->sync;
682     while (my($key, $value) = each %{$self->{db}}) {
683 ulpfr 22 next if $key !~ /^p/;
684 ulpfr 19 $self->{db}->{$key} = $self->sort_postings($value);
685     }
686     $self->{reorg} = 1;
687     }
688    
689 ulpfr 10 sub sync {
690     my $self = shift;
691    
692     if ($self->{mode} & O_RDWR) {
693 ulpfr 19 print STDERR "Flushing $self->{cached} postings\n" if $self->{cached};
694 ulpfr 10 while (my($key, $value) = each %{$self->{cache}}) {
695 laperla 30 $self->{db}->{"p". $key} ||= "";
696 ulpfr 19 if ($self->{reorg}) {
697 ulpfr 22 $self->{db}->{'p'.$key} = $self->sort_postings($self->{db}->{'p'.$key}
698 ulpfr 19 . $value);
699     } else {
700 ulpfr 22 $self->{db}->{'p'.$key} .= $value;
701 ulpfr 19 }
702 ulpfr 10 }
703     while (my($key, $value) = each %{$self->{cdict}}) {
704 ulpfr 22 $self->{db}->{'o'.$key} = 0 unless $self->{db}->{'o'.$key};
705     $self->{db}->{'o'.$key} += $value;
706 ulpfr 10 }
707 ulpfr 19 $self->{cache} = {};
708     $self->{cdict} = {};
709 ulpfr 10 $self->{cached} = 0;
710     }
711     }
712    
713     sub close {
714     my $self = shift;
715    
716     if ($self->{dbh}) {
717     $self->sync;
718     delete $self->{dbh};
719     untie %{$self->{db}};
720     delete $self->{db};
721     delete $self->{func};
722     delete $self->{cache};
723     delete $self->{cached};
724     delete $self->{cdict};
725     delete $self->{pfunc} if defined $self->{pfunc};
726     delete $self->{ifunc} if defined $self->{ifunc};
727     delete $self->{xfunc} if defined $self->{xfunc};
728     }
729     }
730    
731     1;
732    

Properties

Name Value
cvs2svn:cvs-rev 1.5

  ViewVC Help
Powered by ViewVC 1.1.26