/[wait]/cvs-head/lib/WAIT/InvertedIndex.pm
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Annotation of /cvs-head/lib/WAIT/InvertedIndex.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 22 - (hide annotations)
Sat Nov 11 16:58:53 2000 UTC (23 years, 6 months ago) by ulpfr
File size: 21034 byte(s)
Modified structure of inverted indices.  The old one was not able to
cope with words starting with "\377(o|m)".  Prefix and interval
searches have not been tested yet.

1 ulpfr 19 # -*- Mode: Perl -*-
2     # $Basename: InvertedIndex.pm $
3     # $Revision: 1.30 $
4 ulpfr 10 # Author : Ulrich Pfeifer
5     # Created On : Thu Aug 8 13:05:10 1996
6     # Last Modified By: Ulrich Pfeifer
7 ulpfr 22 # Last Modified On: Sat Nov 11 16:32:38 2000
8 ulpfr 10 # Language : CPerl
9 ulpfr 19 #
10     # (C) Copyright 1996-2000, Ulrich Pfeifer
11     #
12 ulpfr 10
13     package WAIT::InvertedIndex;
14     use strict;
15     use DB_File;
16     use Fcntl;
17     use WAIT::Filter;
18     use Carp;
19     use vars qw(%FUNC);
20    
21 ulpfr 22 # The dictionary has three different key types:
22     # 'o'.$word
23     #
24     # The document frequency is the number of documents a term occurs
25     # in. The idea is that a term occuring in a significant part of the
26     # documents is not too significant.
27     #
28     # 'm'.$word
29     #
30     # The maximum term frequency of a document is the frequency of the
31     # most frequent term in the document. It is related to the document
32     # length obviously. A document in which the most frequnet term occurs
33     # 100 times is probably much longer than a document whichs most
34     # frequent term occurs five time.
35     #
36     # 'p'.$word
37     #
38     # Under this key we store the actual posting list as pairs of
39     # packed integers.
40 ulpfr 10
41 ulpfr 22 my $no_old_index_support = 0; # do not check for old indices if set
42 ulpfr 19
43 ulpfr 10 sub new {
44     my $type = shift;
45     my %parm = @_;
46     my $self = {};
47    
48     $self->{file} = $parm{file} or croak "No file specified";
49     $self->{attr} = $parm{attr} or croak "No attributes specified";
50     $self->{filter} = $parm{filter};
51     $self->{'name'} = $parm{'name'};
52     $self->{records} = 0;
53     for (qw(intervall prefix)) {
54     if (exists $parm{$_}) {
55     if (ref $parm{$_}) {
56     $self->{$_} = [@{$parm{$_}}] # clone
57     } else {
58     $self->{$_} = $parm{$_}
59     }
60     }
61     }
62     bless $self, ref($type) || $type;
63     }
64    
65     sub name {$_[0]->{'name'}}
66    
67     sub _split_pos {
68     my ($text, $pos) = @{$_[0]};
69     my @result;
70    
71     $text =~ s/(^\s+)// and $pos += length($1);
72     while ($text =~ s/(^\S+)//) {
73     my $word = $1;
74     push @result, [$word, $pos];
75     $pos += length($word);
76     $text =~ s/(^\s+)// and $pos += length($1);
77     }
78     @result;
79     }
80    
81     sub _xfiltergen {
82     my $filter = pop @_;
83    
84 ulpfr 13 # Oops, we cannot overrule the user's choice. Other filters may kill
85     # stopwords, such as isotr clobbers "isn't" to "isnt".
86    
87     # if ($filter eq 'stop') { # avoid the slow stopword elimination
88     # return _xfiltergen(@_); # it's cheaper to look them up afterwards
89     # }
90 ulpfr 10 if (@_) {
91     if ($filter =~ /^split(\d*)/) {
92     if ($1) {
93     "grep(length(\$_->[0])>=$1, map(&WAIT::Filter::split_pos(\$_), " . _xfiltergen(@_) .'))' ;
94     } else {
95     "map(&WAIT::Filter::split_pos(\$_), " . _xfiltergen(@_) .')' ;
96     }
97     } else {
98     "map ([&WAIT::Filter::$filter(\$_->[0]), \$_->[1]]," ._xfiltergen(@_) .')';
99     }
100     } else {
101     if ($filter =~ /^split(\d*)/) {
102     if ($1) {
103     "grep(length(\$_->[0])>=$1, map(&WAIT::Filter::split_pos(\$_), [\$_[0], 0]))" ;
104     } else {
105     "map(&WAIT::Filter::split_pos(\$_), [\$_[0], 0])" ;
106     }
107     } else {
108     "map ([&WAIT::Filter::$filter(\$_->[0]), \$_->[1]], [\$_[0], 0])";
109     }
110     }
111     }
112    
113     sub parse_pos {
114     my $self = shift;
115    
116     unless (exists $self->{xfunc}) {
117     $self->{xfunc} =
118     eval sprintf("sub {%s}", _xfiltergen(@{$self->{filter}}));
119     #printf "\nsub{%s}$@\n", _xfiltergen(@{$self->{filter}});
120     }
121     &{$self->{xfunc}}($_[0]);
122     }
123    
124     sub _filtergen {
125     my $filter = pop @_;
126    
127     if (@_) {
128     "map(&WAIT::Filter::$filter(\$_), " . _filtergen(@_) . ')';
129     } else {
130     "map(&WAIT::Filter::$filter(\$_), \@_)";
131     }
132     }
133    
134     sub drop {
135     my $self = shift;
136     if ((caller)[0] eq 'WAIT::Table') { # Table knows about this
137     my $file = $self->{file};
138    
139     ! (!-e $file or unlink $file);
140     } else { # notify our database
141     croak ref($self)."::drop called directly";
142     }
143     }
144    
145 ulpfr 22 sub is_an_old_index {
146     my $self = shift;
147    
148     return 0 if $no_old_index_support;
149     return $self->{old_index} if exists $self->{old_index};
150    
151     # We can only guess if this is an old index. We lookup the first 10
152     # $O entries. If all values are integers, we assume that the index
153     # is an old one.
154    
155     defined $self->{db} or $self->open;
156     $self->sync;
157     my $dbh = $self->{dbh}; # for convenience
158    
159     my $O = pack('C', 0xff)."o";
160     my ($word, $value) = ($O.$;);
161     $dbh->seq($word, $value, R_CURSOR);
162     for (my $i=0; $i<10;$i++) {
163     if ($value !~ /^\d+$/) {
164     return $self->{old_index} = 0;
165     }
166     if ($dbh->seq($word, $value, R_NEXT) or # no values left
167     $word !~ /^$O/o # no $O values left
168     ) {
169     # we are not sure enough that this is an old index
170     return $self->{old_index} = 0;
171     }
172     }
173     return $self->{old_index} = 1;
174     }
175    
176 ulpfr 10 sub open {
177     my $self = shift;
178     my $file = $self->{file};
179    
180     if (defined $self->{dbh}) {
181     $self->{dbh};
182     } else {
183     $self->{func} =
184     eval sprintf("sub {grep /./, %s}", _filtergen(@{$self->{filter}}));
185     $self->{dbh} = tie(%{$self->{db}}, 'DB_File', $file,
186     $self->{mode}, 0664, $DB_BTREE);
187     $self->{cache} = {}
188     if $self->{mode} & O_RDWR;
189     $self->{cdict} = {}
190     if $self->{mode} & O_RDWR;
191     $self->{cached} = 0;
192 ulpfr 22 if (!$no_old_index_support and $self->is_an_old_index()) {
193     warn "This is an old index, upgrade you database";
194     require WAIT::InvertedIndexOld;
195     bless $self, 'WAIT::InvertedIndexOld';
196     }
197 ulpfr 10 }
198     }
199    
200     sub insert {
201     my $self = shift;
202     my $key = shift;
203     my %occ;
204 ulpfr 13
205 ulpfr 10 defined $self->{db} or $self->open;
206     grep $occ{$_}++, &{$self->{func}}(@_);
207     my ($word, $noc);
208     $self->{records}++;
209     while (($word, $noc) = each %occ) {
210     if (defined $self->{cache}->{$word}) {
211 ulpfr 22 $self->{cdict}->{$word}++;
212 ulpfr 10 $self->{cache}->{$word} .= pack 'w2', $key, $noc;
213 ulpfr 22 } else {
214     $self->{cdict}->{$word} = 1;
215 ulpfr 10 $self->{cache}->{$word} = pack 'w2', $key, $noc;
216 ulpfr 13 }
217 ulpfr 10 $self->{cached}++;
218     }
219 ulpfr 19 # This cache limit should be configurable
220 ulpfr 10 $self->sync if $self->{cached} > 100_000;
221     my $maxtf = 0;
222     for (values %occ) {
223     $maxtf = $_ if $_ > $maxtf;
224     }
225 ulpfr 22 $self->{db}->{'m'. $key} = $maxtf;
226 ulpfr 10 }
227    
228 ulpfr 19 # We sort postings by increasing max term frequency (~ by increasing
229     # document length. This reduces the quality degradation if we process
230     # only the first part of a posting list.
231    
232     sub sort_postings {
233     my $self = shift;
234     my $post = shift; # reference to a hash or packed string
235    
236     if (ref $post) {
237     # we skip the sort part, if the index is not sorted
238     return pack('w*', %$post) unless $self->{reorg};
239     } else {
240     $post = { unpack 'w*', $post };
241     }
242    
243     my $r = '';
244    
245     # Sort posting list by increasing ratio of maximum term frequency (~
246     # "document length") and term frequency. This rati multipied by the
247     # inverse document frequence gives the score for a term. This sort
248     # order can be exploited for tuning of single term queries.
249    
250 ulpfr 22 for my $did (sort { $post->{$b} / $self->{db}->{'m'. $b}
251 ulpfr 19 <=>
252 ulpfr 22 $post->{$a} / $self->{db}->{'m'. $a}
253 ulpfr 19 } keys %$post) {
254     $r .= pack 'w2', $did, $post->{$did};
255     }
256     #warn sprintf "reorg %d %s\n", scalar keys %$post, join ' ', unpack 'w*', $r;
257     $r;
258     }
259    
260 ulpfr 10 sub delete {
261     my $self = shift;
262     my $key = shift;
263     my %occ;
264    
265 ulpfr 19 my $db;
266 ulpfr 10 defined $self->{db} or $self->open;
267 ulpfr 19 $db = $self->{db};
268 ulpfr 10 $self->sync;
269     $self->{records}--;
270 ulpfr 19
271     # less than zero documents in database?
272     _complain('delete of document', $key) and $self->{records} = 0
273     if $self->{records} < 0;
274    
275 ulpfr 10 grep $occ{$_}++, &{$self->{func}}(@_);
276 ulpfr 19
277     for (keys %occ) {# may reorder posting list
278 ulpfr 22 my %post = unpack 'w*', $db->{'p'.$_};
279 ulpfr 10 delete $post{$key};
280 ulpfr 22 $db->{'p'.$_} = $self->sort_postings(\%post);
281     _complain('delete of term', $_) if $db->{'o'.$_}-1 != keys %post;
282     $db->{'o'.$_} = scalar keys %post;
283 ulpfr 10 }
284 ulpfr 22 delete $db->{'m'. $key};
285 ulpfr 10 }
286    
287     sub intervall {
288     my ($self, $first, $last) = @_;
289     my $value = '';
290     my $word = '';
291     my @result;
292    
293     return unless exists $self->{'intervall'};
294    
295     defined $self->{db} or $self->open;
296     $self->sync;
297     my $dbh = $self->{dbh}; # for convenience
298    
299     if (ref $self->{'intervall'}) {
300     unless (exists $self->{'ifunc'}) {
301     $self->{'ifunc'} =
302     eval sprintf("sub {grep /./, %s}", _filtergen(@{$self->{intervall}}));
303     }
304     ($first) = &{$self->{'ifunc'}}($first) if $first;
305     ($last) = &{$self->{'ifunc'}}($last) if $last;
306     }
307 ulpfr 22 $first = 'p'.($first||'');
308     $last = (defined $last)?'p'.$last:'q';
309    
310     # set the cursor to $first
311     $dbh->seq($first, $value, R_CURSOR);
312    
313     # $first would be after the last word
314     return () if $first gt $last;
315 ulpfr 10
316 ulpfr 22 push @result, substr($first,1);
317 ulpfr 10 while (!$dbh->seq($word, $value, R_NEXT)) {
318     # We should limit this to a "resonable" number of words
319 ulpfr 22 last if $word gt $last;
320     push @result, substr($word,1);
321 ulpfr 10 }
322     \@result; # speed
323     }
324    
325     sub prefix {
326     my ($self, $prefix) = @_;
327     my $value = '';
328     my $word = '';
329     my @result;
330    
331     return () unless defined $prefix; # Full dictionary requested !!
332     return unless exists $self->{'prefix'};
333     defined $self->{db} or $self->open;
334     $self->sync;
335     my $dbh = $self->{dbh};
336    
337     if (ref $self->{'prefix'}) {
338     unless (exists $self->{'pfunc'}) {
339     $self->{'pfunc'} =
340     eval sprintf("sub {grep /./, %s}", _filtergen(@{$self->{prefix}}));
341     }
342     ($prefix) = &{$self->{'pfunc'}}($prefix);
343     }
344    
345 ulpfr 22 if ($dbh->seq($word = 'p'.$prefix, $value, R_CURSOR)) {
346 ulpfr 10 return ();
347     }
348 ulpfr 22 return () if $word !~ /^p$prefix/;
349     push @result, substr($word,1);
350 ulpfr 10
351     while (!$dbh->seq($word, $value, R_NEXT)) {
352     # We should limit this to a "resonable" number of words
353 ulpfr 22 last if $word !~ /^p$prefix/;
354     push @result, substr($word,1);
355 ulpfr 10 }
356     \@result; # speed
357     }
358    
359 ulpfr 19 =head2 search($query)
360    
361     The search method supports a range of search algorithms. It is
362     recommended to tune the index by calling
363     C<$table-E<gt>set(top=E<gt>1)> B<after> bulk inserting the documents
364     into the table. This is a computing intense operation and all inserts
365     and deletes after this optimization are slightly more expensive. Once
366     reorganized, the index is kept sorted automatically until you switch
367     the optimization off by calling C<$table-E<gt>set(top=E<gt>0)>.
368    
369     When searching a tuned index, a query can be processed faster if the
370     caller requests only the topmost documents. This can be done by
371     passing a C<top =E<gt>> I<n> parameter to the search method.
372    
373     For single term queries, the method returns only the I<n> top ranking
374     documents. For multi term queries two optimized algorithms are
375     available. The first algorithm computes the top n documents
376     approximately but very fast, sacrificing a little bit of precision for
377     speed. The second algorithm computes the topmost I<n> documents
378     precisely. This algorithm is slower and should be used only for small
379     values of I<n>. It can be requested by passing the query attribute
380     C<picky =E<gt> 1>. Both algorithms may return more than I<n> hits.
381     While the picky version might not be faster than the brute force
382     version on average for modest size databases it uses less memory and
383     the processing time is almost linear in the number of query terms, not
384     in the size of the lists.
385    
386     =cut
387    
388 ulpfr 10 sub search {
389     my $self = shift;
390 ulpfr 19 my $query = shift;
391 ulpfr 10
392     defined $self->{db} or $self->open;
393     $self->sync;
394 ulpfr 19 $self->search_raw($query, &{$self->{func}}(@_)); # No call to parse() here
395 ulpfr 10 }
396    
397     sub parse {
398     my $self = shift;
399    
400     defined $self->{db} or $self->open;
401     &{$self->{func}}(@_);
402     }
403    
404 ulpfr 13 sub keys {
405     my $self = shift;
406    
407     defined $self->{db} or $self->open;
408     keys %{$self->{db}};
409     }
410    
411 ulpfr 10 sub search_prefix {
412     my $self = shift;
413    
414     # print "search_prefix(@_)\n";
415     defined $self->{db} or $self->open;
416     $self->search_raw(map($self->prefix($_), @_));
417     }
418    
419 ulpfr 19 sub _complain ($$) {
420     my ($action, $term) = @_;
421    
422     require Carp;
423     Carp::cluck
424     (sprintf("WAIT database inconsistency during $action [%s]: ".
425     "Please rebuild index\n",
426     $term,));
427     }
428    
429 ulpfr 10 sub search_raw {
430     my $self = shift;
431 ulpfr 19 my $query = shift;
432 ulpfr 10 my %score;
433    
434 ulpfr 19 # Top $wanted documents must be correct. Zero means all matching
435     # documents.
436     my $wanted = $query->{top};
437     my $strict = $query->{picky};
438 ulpfr 10
439 ulpfr 19 # Return at least $minacc documents. Zero means all matching
440     # documents.
441     # my $minacc = $query->{accus} || $wanted;
442    
443     # Open index and flush cache if necessary
444 ulpfr 10 defined $self->{db} or $self->open;
445     $self->sync;
446 ulpfr 19
447     # We keep duplicates
448     my @terms =
449     # Sort words by decreasing document frequency
450 ulpfr 22 sort { $self->{db}->{'o'.$a} <=> $self->{db}->{'o'.$b} }
451 ulpfr 19 # check which words occur in the index.
452 ulpfr 22 grep { $self->{db}->{'o'.$_} } @_;
453 ulpfr 19
454     return () unless @terms; # nothing to search for
455    
456     # We special-case one term queries here. If the index was sorted,
457     # choping off the rest of the list will return the same ranking.
458     if ($wanted and @terms == 1) {
459     my $term = shift @terms;
460 ulpfr 22 my $idf = log($self->{records}/$self->{db}->{'o'.$term});
461 ulpfr 19 my @res;
462    
463     if ($self->{reorg}) { # or not $query->{picky}
464 ulpfr 22 @res = unpack "w". int(2*$wanted), $self->{db}->{'p'.$term};
465 ulpfr 19 } else {
466 ulpfr 22 @res = unpack 'w*', $self->{db}->{'p'.$term};
467 ulpfr 19 }
468    
469     for (my $i=1; $i<@res; $i+=2) {
470 ulpfr 22 $res[$i] /= $self->{db}->{'m'. $res[$i-1]} / $idf;
471 ulpfr 19 }
472    
473     return @res
474     }
475    
476     # We separate exhaustive search here to avoid overhead and make the
477     # code more readable. The block can be removed without changing the
478     # result.
479     unless ($wanted) {
480     for (@terms) {
481 ulpfr 22 my $df = $self->{db}->{'o'.$_};
482 ulpfr 19
483     # The frequency *must* be 1 at least since the posting list is nonempty
484     _complain('search for term', $_) and $df = 1 if $df < 1;
485    
486     # Unpack posting list for current query term $_
487 ulpfr 22 my %post = unpack 'w*', $self->{db}->{'p'.$_};
488 ulpfr 19
489 ulpfr 22 _complain('search for term', $_) if $self->{db}->{'o'.$_} != keys %post;
490 ulpfr 19 # This is the inverse document frequency. The log of the inverse
491     # fraction of documents the term occurs in.
492     my $idf = log($self->{records}/$df);
493     for my $did (keys %post) {
494 ulpfr 22 if (my $freq = $self->{db}->{'m'. $did}) {
495 ulpfr 19 $score{$did} += $post{$did} / $freq * $idf;
496     }
497 ulpfr 10 }
498     }
499 ulpfr 19 # warn sprintf "Used %d accumulators\n", scalar keys %score;
500     return %score;
501 ulpfr 10 }
502 ulpfr 19
503     # A sloppy but fast algorithm for multiple term queries.
504     unless ($strict) {
505     for (@terms) {
506     # Unpack posting list for current query term $_
507 ulpfr 22 my %post = unpack 'w*', $self->{db}->{'p'.$_};
508 ulpfr 19
509     # Lookup the number of documents the term occurs in (document frequency)
510 ulpfr 22 my $occ = $self->{db}->{'o'.$_};
511 ulpfr 19
512 ulpfr 22 _complain('search for term', $_) if $self->{db}->{'o'.$_} != keys %post;
513 ulpfr 19 # The frequency *must* be 1 at least since the posting list is nonempty
514     _complain('search for term', $_) and $occ = 1 if $occ < 1;
515    
516     # This is the inverse document frequency. The log of the inverse
517     # fraction of documents the term occurs in.
518     my $idf = log($self->{records}/$occ);
519    
520     # If we have a reasonable number of accumulators, change the
521     # loop to iterate over the accumulators. This will compromise
522     # quality for better speed. The algorithm still computes the
523     # exact weights, but the result is not guaranteed to contain the
524     # *best* results. The database might contain documents better
525     # than the worst returned document.
526    
527     # We process the lists in order of increasing length. When the
528     # number of accumulators exceeds $wanted, no new documents are
529     # added, only the ranking/weighting of the seen documents is
530     # improved. The resulting ranking list must be pruned, since only
531     # the top most documents end up near their "optimal" rank.
532    
533     if (keys %score < $wanted) {
534     for my $did (keys %post) {
535 ulpfr 22 if (my $freq = $self->{db}->{'m'. $did}) {
536 ulpfr 19 $score{$did} += $post{$did} / $freq * $idf;
537     }
538     }
539     } else {
540     for my $did (keys %score) {
541     next unless exists $post{$did};
542 ulpfr 22 if (my $freq = $self->{db}->{'m'. $did}) {
543 ulpfr 19 $score{$did} += $post{$did} / $freq * $idf;
544     }
545     }
546     }
547     }
548     return %score;
549     }
550     my @max; $max[$#terms+1]=0;
551     my @idf;
552    
553     # Preparation loop. This extra loop makes sense only when "reorg"
554     # and "wanted" are true. But at the time beeing, keeping the code
555     # for the different search algorithms in one place seems more
556     # desirable than some minor speedup of the brute force version. We
557     # do cache $idf though.
558    
559     for (my $i = $#terms; $i >=0; $i--) {
560     local $_ = $terms[$i];
561     # Lookup the number of documents the term occurs in (document frequency)
562 ulpfr 22 my $df = $self->{db}->{'o'.$_};
563 ulpfr 19
564     # The frequency *must* be 1 at least since the posting list is nonempty
565     _complain('search for term', $_) and $df = 1 if $df < 1;
566    
567     # This is the inverse document frequency. The log of the inverse
568     # fraction of documents the term occurs in.
569     $idf[$i] = log($self->{records}/$df);
570    
571     my ($did,$occ);
572     if ($self->{reorg}) {
573 ulpfr 22 ($did,$occ) = unpack 'w2', $self->{db}->{'p'.$_};
574 ulpfr 19 } else { # Maybe this costs more than it helps
575 ulpfr 22 ($did,$occ) = unpack 'w2', $self->sort_postings($self->{db}->{'p'.$_});
576 ulpfr 19 }
577 ulpfr 22 my $freq = $self->{db}->{'m'. $did};
578 ulpfr 19 my $max = $occ/$freq*$idf[$i];
579     $max[$i] = $max + $max[$i+1];
580     }
581    
582     # Main loop
583     for my $i (0 .. $#terms) {
584     my $term = $terms[$i];
585     # Unpack posting list for current query term $term. We loose the
586     # sorting order because the assignment to a hash.
587 ulpfr 22 my %post = unpack 'w*', $self->{db}->{'p'.$term};
588 ulpfr 19
589     _complain('search for term', $term)
590 ulpfr 22 if $self->{db}->{'o'.$term} != keys %post;
591 ulpfr 19
592     my $idf = $idf[$i];
593     my $full; # Need to process all postings
594     my $chop; # Score necessary to enter the ranking list
595    
596     if (# We know that wanted is true since we especial cased the
597     # exhaustive search.
598    
599     $wanted and
600    
601     # We did sort here if necessary in
602     # the preparation loop
603     # $self->{reorg} and
604    
605     scalar keys %score > $wanted) {
606     $chop = (sort { $b <=> $a } values %score)[$wanted];
607     $full = $max[$i] > $chop;
608     } else {
609     $full = 1;
610     }
611    
612     if ($full) {
613     # We need to inspect the full list. Either $wanted is not given,
614     # the index is not sorted, or we don't have enough accumulators
615     # yet.
616     if (defined $chop) {
617     # We might be able to avoid allocating accumulators
618     for my $did (keys %post) {
619 ulpfr 22 if (my $freq = $self->{db}->{'m'. $did}) {
620 ulpfr 19 my $wgt = $post{$did} / $freq * $idf;
621     # We add an accumulator if $wgt exeeds $chop
622     if (exists $score{$did} or $wgt > $chop) {
623     $score{$did} += $wgt;
624     }
625     }
626     }
627     } else {
628     # Allocate acumulators for each seen document.
629     for my $did (keys %post) {
630 ulpfr 22 if (my $freq = $self->{db}->{'m'. $did}) {
631 ulpfr 19 $score{$did} += $post{$did} / $freq * $idf;
632     }
633     }
634     }
635     } else {
636     # Update existing accumulators
637     for my $did (keys %score) {
638     next unless exists $post{$did};
639 ulpfr 22 if (my $freq = $self->{db}->{'m'. $did}) {
640 ulpfr 19 $score{$did} += $post{$did} / $freq * $idf;
641     }
642     }
643     }
644     }
645     #warn sprintf "Used %d accumulators\n", scalar keys %score;
646 ulpfr 10 %score;
647     }
648    
649 ulpfr 19 sub set {
650     my ($self, $attr, $value) = @_;
651    
652     die "No such indexy attribute: '$attr'" unless $attr eq 'top';
653    
654     return delete $self->{reorg} if $value == 0;
655    
656     return if $self->{reorg}; # we are sorted already
657     return unless $self->{mode} & O_RDWR;
658     defined $self->{db} or $self->open;
659    
660     $self->sync;
661     while (my($key, $value) = each %{$self->{db}}) {
662 ulpfr 22 next if $key !~ /^p/;
663 ulpfr 19 $self->{db}->{$key} = $self->sort_postings($value);
664     }
665     $self->{reorg} = 1;
666     }
667    
668 ulpfr 10 sub sync {
669     my $self = shift;
670    
671     if ($self->{mode} & O_RDWR) {
672 ulpfr 19 print STDERR "Flushing $self->{cached} postings\n" if $self->{cached};
673 ulpfr 10 while (my($key, $value) = each %{$self->{cache}}) {
674 ulpfr 19 if ($self->{reorg}) {
675 ulpfr 22 $self->{db}->{'p'.$key} = $self->sort_postings($self->{db}->{'p'.$key}
676 ulpfr 19 . $value);
677     } else {
678 ulpfr 22 $self->{db}->{'p'.$key} .= $value;
679 ulpfr 19 }
680 ulpfr 10 }
681     while (my($key, $value) = each %{$self->{cdict}}) {
682 ulpfr 22 $self->{db}->{'o'.$key} = 0 unless $self->{db}->{'o'.$key};
683     $self->{db}->{'o'.$key} += $value;
684 ulpfr 10 }
685 ulpfr 19 $self->{cache} = {};
686     $self->{cdict} = {};
687 ulpfr 10 $self->{cached} = 0;
688     }
689     }
690    
691     sub close {
692     my $self = shift;
693    
694     if ($self->{dbh}) {
695     $self->sync;
696     delete $self->{dbh};
697     untie %{$self->{db}};
698     delete $self->{db};
699     delete $self->{func};
700     delete $self->{cache};
701     delete $self->{cached};
702     delete $self->{cdict};
703     delete $self->{pfunc} if defined $self->{pfunc};
704     delete $self->{ifunc} if defined $self->{ifunc};
705     delete $self->{xfunc} if defined $self->{xfunc};
706     }
707     }
708    
709     1;
710    

Properties

Name Value
cvs2svn:cvs-rev 1.2

  ViewVC Help
Powered by ViewVC 1.1.26