/[wait]/cvs-head/lib/WAIT/InvertedIndex.pm
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Diff of /cvs-head/lib/WAIT/InvertedIndex.pm

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

branches/CPAN/lib/WAIT/InvertedIndex.pm revision 11 by unknown, Fri Apr 28 15:41:10 2000 UTC cvs-head/lib/WAIT/InvertedIndex.pm revision 40 by laperla, Mon Nov 13 10:44:03 2000 UTC
# Line 1  Line 1 
1  #                              -*- Mode: Perl -*-  #                              -*- Mode: Perl -*-
2  # InvertedIndex.pm --  # $Basename: InvertedIndex.pm $
3  # ITIID           : $ITI$ $Header $__Header$  # $Revision: 1.30 $
4  # Author          : Ulrich Pfeifer  # Author          : Ulrich Pfeifer
5  # Created On      : Thu Aug  8 13:05:10 1996  # Created On      : Thu Aug  8 13:05:10 1996
6  # Last Modified By: Ulrich Pfeifer  # Last Modified By: Ulrich Pfeifer
7  # Last Modified On: Sun Nov 22 18:44:42 1998  # Last Modified On: Sun Nov 12 14:40:21 2000
8  # Language        : CPerl  # Language        : CPerl
 # Status          : Unknown, Use with caution!  
9  #  #
10  # Copyright (c) 1996-1997, Ulrich Pfeifer  # (C) Copyright 1996-2000, Ulrich Pfeifer
11  #  #
12    
13  package WAIT::InvertedIndex;  package WAIT::InvertedIndex;
# Line 17  use DB_File; Line 16  use DB_File;
16  use Fcntl;  use Fcntl;
17  use WAIT::Filter;  use WAIT::Filter;
18  use Carp;  use Carp;
19  use vars qw(%FUNC);  use vars qw(%FUNC $VERSION);
20    
21  my $O = pack('C', 0xff)."o";                  # occurances  $VERSION = "1.801"; # others test if we are loaded by checking $VERSION
22  my $M = pack('C', 0xff)."m";                  # maxtf  
23    # The dictionary has three different key types:
24    #  'o'.$word
25    #
26    #     The document frequency is the number of documents a term occurs
27    #     in. The idea is that a term occuring in a significant part of the
28    #     documents is not too significant.
29    #
30    # 'm'.$word
31    #
32    #     The maximum term frequency of a document is the frequency of the
33    #     most frequent term in the document.  It is related to the document
34    #     length obviously.  A document in which the most frequnet term occurs
35    #     100 times is probably much longer than a document whichs most
36    #     frequent term occurs five time.
37    #
38    # 'p'.$word
39    #
40    #     Under this key we store the actual posting list as pairs of
41    #     packed integers.
42    
43    my $no_old_index_support = 0; # do not check for old indices if set
44    
45  sub new {  sub new {
46    my $type = shift;    my $type = shift;
# Line 63  sub _split_pos { Line 83  sub _split_pos {
83  sub _xfiltergen {  sub _xfiltergen {
84    my $filter = pop @_;    my $filter = pop @_;
85    
86    if ($filter eq 'stop') {      # avoid the slow stopword elimination  # Oops, we cannot overrule the user's choice. Other filters may kill
87      return _xfiltergen(@_);            # it's cheaper to look them up afterwards  # stopwords, such as isotr clobbers "isn't" to "isnt".
88    }  
89    #  if ($filter eq 'stop') {      # avoid the slow stopword elimination
90    #    return _xfiltergen(@_);            # it's cheaper to look them up afterwards
91    #  }
92    if (@_) {    if (@_) {
93      if ($filter =~ /^split(\d*)/) {      if ($filter =~ /^split(\d*)/) {
94        if ($1) {        if ($1) {
# Line 121  sub drop { Line 144  sub drop {
144    }    }
145  }  }
146    
147    sub is_an_old_index {
148      my $self = shift;
149    
150      return 0 if $no_old_index_support;
151      return $self->{old_index} if exists $self->{old_index};
152    
153      # We can only guess if this is an old index. We lookup the first 10
154      # $O entries. If all values are integers, we assume that the index
155      # is an old one.
156    
157      defined $self->{db} or $self->open;
158      $self->sync;
159      my $dbh = $self->{dbh};       # for convenience
160    
161      my $O = pack('C', 0xff)."o";
162      my ($word, $value) = ($O.$;);  # $word and $value are modified by seq!
163      if ( my $ret = $dbh->seq($word, $value, R_CURSOR) ) {
164        # warn "DEBUG: ret[$ret], not an old index, either empty or no \$^O";
165        return $self->{old_index} = 0;
166      }
167      for (my $i=0; $i<10;$i++) {
168        if ($value !~ /^\d+$/) {
169          # warn "DEBUG: word[$word]value[$value], not an old index";
170          return $self->{old_index} = 0;
171        }
172        if (my $ret = $dbh->seq($word, $value, R_NEXT) or # no values left
173            $word !~ /^$O$;/o                   # no $O values left
174           ) {
175          # we are not sure enough that this is an old index
176          # warn "DEBUG: ret[$ret]word[$word]value[$value], not an old index";
177          return $self->{old_index} = 0;
178        }
179      }
180      # warn "DEBUG: old index";
181      return $self->{old_index} = 1;
182    }
183    
184  sub open {  sub open {
185    my $self = shift;    my $self = shift;
186    my $file = $self->{file};    my $file = $self->{file};
# Line 132  sub open { Line 192  sub open {
192        eval sprintf("sub {grep /./, %s}", _filtergen(@{$self->{filter}}));        eval sprintf("sub {grep /./, %s}", _filtergen(@{$self->{filter}}));
193      $self->{dbh} = tie(%{$self->{db}}, 'DB_File', $file,      $self->{dbh} = tie(%{$self->{db}}, 'DB_File', $file,
194                         $self->{mode}, 0664, $DB_BTREE);                         $self->{mode}, 0664, $DB_BTREE);
 #    tie(%{$self->{cache}}, 'DB_File', undef,  
 #        $self->{mode}, 0664, $DB_BTREE)  
195      $self->{cache} = {}      $self->{cache} = {}
196        if $self->{mode} & O_RDWR;        if $self->{mode} & O_RDWR;
 #    tie(%{$self->{cdict}}, 'DB_File', undef,  
 #        $self->{mode}, 0664, $DB_BTREE)  
197      $self->{cdict} = {}      $self->{cdict} = {}
198        if $self->{mode} & O_RDWR;        if $self->{mode} & O_RDWR;
199      $self->{cached} = 0;      $self->{cached} = 0;
200        if (!$no_old_index_support and $self->is_an_old_index()) {
201          warn "This is an old index, upgrade you database";
202          require WAIT::InvertedIndexOld;
203          bless $self, 'WAIT::InvertedIndexOld';
204        }
205    }    }
206  }  }
207    
# Line 148  sub insert { Line 209  sub insert {
209    my $self  = shift;    my $self  = shift;
210    my $key   = shift;    my $key   = shift;
211    my %occ;    my %occ;
212      
213    defined $self->{db} or $self->open;    defined $self->{db} or $self->open;
214    grep $occ{$_}++, &{$self->{func}}(@_);    grep $occ{$_}++, &{$self->{func}}(@_);
215    my ($word, $noc);    my ($word, $noc);
216    $self->{records}++;    $self->{records}++;
217    while (($word, $noc) = each %occ) {    while (($word, $noc) = each %occ) {
218      if (defined $self->{cache}->{$word}) {      if (defined $self->{cache}->{$word}) {
219        $self->{cdict}->{$O,$word}++;        $self->{cdict}->{$word}++;
220        $self->{cache}->{$word} .= pack 'w2', $key, $noc;        $self->{cache}->{$word} .= pack 'w2', $key, $noc;
221      } else {      } else {          
222        $self->{cdict}->{$O,$word} = 1;        $self->{cdict}->{$word} = 1;
223        $self->{cache}->{$word}  = pack 'w2', $key, $noc;        $self->{cache}->{$word}  = pack 'w2', $key, $noc;
224      }      }
225      $self->{cached}++;      $self->{cached}++;
226    }    }
227      # This cache limit should be configurable
228    $self->sync if $self->{cached} > 100_000;    $self->sync if $self->{cached} > 100_000;
229    my $maxtf = 0;    my $maxtf = 0;
230    for (values %occ) {    for (values %occ) {
231      $maxtf = $_ if $_ > $maxtf;      $maxtf = $_ if $_ > $maxtf;
232    }    }
233    $self->{db}->{$M, $key} = $maxtf;    $self->{db}->{'m'. $key} = $maxtf;
234    }
235    
236    # We sort postings by increasing max term frequency (~ by increasing
237    # document length.  This reduces the quality degradation if we process
238    # only the first part of a posting list.
239    
240    sub sort_postings {
241      my $self = shift;
242      my $post = shift;             # reference to a hash or packed string
243    
244      if (ref $post) {
245        # we skip the sort part, if the index is not sorted
246        return pack('w*', %$post) unless $self->{reorg};
247      } else {
248        $post = { unpack 'w*', $post };
249      }
250    
251      my $r = '';
252    
253      # Sort posting list by increasing ratio of maximum term frequency (~
254      # "document length") and term frequency. This rati multipied by the
255      # inverse document frequence gives the score for a term.  This sort
256      # order can be exploited for tuning of single term queries.
257    
258      for my $did (keys %$post) { # sanity check
259        unless ($self->{db}->{"m". $did}) {
260          warn "Warning from WAIT: DIVZERO threat from did[$did] post[$post->{$did}]";
261          $self->{db}->{"m". $did} = 1; # fails if we have not opened for writing
262        }
263      }
264      for my $did (sort {    $post->{$b} / $self->{db}->{'m'. $b}
265                                          <=>
266                             $post->{$a} / $self->{db}->{'m'. $a}
267                        } keys %$post) {
268        $r .= pack 'w2', $did, $post->{$did};
269      }
270      #warn sprintf "reorg %d %s\n", scalar keys %$post, join ' ', unpack 'w*', $r;
271      $r;
272  }  }
273    
274  sub delete {  sub delete {
# Line 176  sub delete { Line 276  sub delete {
276    my $key   = shift;    my $key   = shift;
277    my %occ;    my %occ;
278    
279      my $db;
280    defined $self->{db} or $self->open;    defined $self->{db} or $self->open;
281      $db = $self->{db};
282    $self->sync;    $self->sync;
283    $self->{records}--;    $self->{records}--;
284    
285      # less than zero documents in database?
286      _complain('delete of document', $key) and $self->{records} = 0
287        if $self->{records} < 0;
288    
289    grep $occ{$_}++, &{$self->{func}}(@_);    grep $occ{$_}++, &{$self->{func}}(@_);
290    for (keys %occ) {  
291      # may reorder posting list    # Be prepared for "Odd number of elements in hash assignment"
292      my %post = unpack 'w*', $self->{db}->{$_};    local $SIG{__WARN__} = sub {
293      $self->{db}->{$O,$_}--;      my $warning = shift;
294        chomp $warning;
295        warn "Catching warning[$warning] during delete of key[$key]";
296      };
297      for (keys %occ) {# may reorder posting list
298        my %post = unpack 'w*', $db->{'p'.$_};
299      delete $post{$key};      delete $post{$key};
300      $self->{db}->{$_} = pack 'w*', %post;      $db->{'p'.$_}    = $self->sort_postings(\%post);
301        _complain('delete of term', $_) if $db->{'o'.$_}-1 != keys %post;
302        $db->{'o'.$_} = scalar keys %post;
303    }    }
304    delete $self->{db}->{$M, $key};    delete $db->{'m'. $key};
305  }  }
306    
307  sub intervall {  sub intervall {
# Line 210  sub intervall { Line 324  sub intervall {
324      ($first) = &{$self->{'ifunc'}}($first) if $first;      ($first) = &{$self->{'ifunc'}}($first) if $first;
325      ($last)  = &{$self->{'ifunc'}}($last) if $last;      ($last)  = &{$self->{'ifunc'}}($last) if $last;
326    }    }
327    if (defined $first and $first ne '') {         # set the cursor to $first    $first = 'p'.($first||'');
328      $dbh->seq($first, $value, R_CURSOR);    $last  = (defined $last)?'p'.$last:'q';
329    } else {  
330      $dbh->seq($first, $value, R_FIRST);    # set the cursor to $first
331    }    $dbh->seq($first, $value, R_CURSOR);
332    # We assume that word do not start with the character \377  
333    # $last = pack 'C', 0xff unless defined $last and $last ne '';    # $first would be after the last word
334    return () if defined $last and $first gt $last; # $first would be after the last word    return () if $first gt $last;
335        
336    push @result, $first;    push @result, substr($first,1);
337    while (!$dbh->seq($word, $value, R_NEXT)) {    while (!$dbh->seq($word, $value, R_NEXT)) {
338      # We should limit this to a "resonable" number of words      # We should limit this to a "resonable" number of words
339      last if (defined $last and $word gt $last) or $word =~ /^($M|$O)/o;      last if $word gt $last;
340      push @result, $word;      push @result, substr($word,1);
341    }    }
342    \@result;                     # speed    \@result;                     # speed
343  }  }
# Line 248  sub prefix { Line 362  sub prefix {
362      ($prefix) = &{$self->{'pfunc'}}($prefix);      ($prefix) = &{$self->{'pfunc'}}($prefix);
363    }    }
364    
365    if ($dbh->seq($word = $prefix, $value, R_CURSOR)) {    if ($dbh->seq($word = 'p'.$prefix, $value, R_CURSOR)) {
366      return ();      return ();
367    }    }
368    return () if $word !~ /^$prefix/;    return () if $word !~ /^p$prefix/;
369    push @result, $word;    push @result, substr($word,1);
370    
371    while (!$dbh->seq($word, $value, R_NEXT)) {    while (!$dbh->seq($word, $value, R_NEXT)) {
372      # We should limit this to a "resonable" number of words      # We should limit this to a "resonable" number of words
373      last if $word !~ /^$prefix/;      last if $word !~ /^p$prefix/;
374      push @result, $word;      push @result, substr($word,1);
375    }    }
376    \@result;                     # speed    \@result;                     # speed
377  }  }
378    
379    =head2 search($query)
380    
381    The search method supports a range of search algorithms.  It is
382    recommended to tune the index by calling
383    C<$table-E<gt>set(top=E<gt>1)> B<after> bulk inserting the documents
384    into the table.  This is a computing intense operation and all inserts
385    and deletes after this optimization are slightly more expensive.  Once
386    reorganized, the index is kept sorted automatically until you switch
387    the optimization off by calling C<$table-E<gt>set(top=E<gt>0)>.
388    
389    When searching a tuned index, a query can be processed faster if the
390    caller requests only the topmost documents.  This can be done by
391    passing a C<top =E<gt>> I<n> parameter to the search method.
392    
393    For single term queries, the method returns only the I<n> top ranking
394    documents.  For multi term queries two optimized algorithms are
395    available. The first algorithm computes the top n documents
396    approximately but very fast, sacrificing a little bit of precision for
397    speed.  The second algorithm computes the topmost I<n> documents
398    precisely.  This algorithm is slower and should be used only for small
399    values of I<n>.  It can be requested by passing the query attribute
400    C<picky =E<gt> 1>. Both algorithms may return more than I<n> hits.
401    While the picky version might not be faster than the brute force
402    version on average for modest size databases it uses less memory and
403    the processing time is almost linear in the number of query terms, not
404    in the size of the lists.
405    
406    =cut
407    
408  sub search {  sub search {
409    my $self  = shift;    my $self  = shift;
410      my $query = shift;
411    
412    defined $self->{db} or $self->open;    defined $self->{db} or $self->open;
413    $self->sync;    $self->sync;
414    $self->search_raw(&{$self->{func}}(@_)); # No call to parse() here    $self->search_raw($query, &{$self->{func}}(@_)); # No call to parse() here
415  }  }
416    
417  sub parse {  sub parse {
# Line 277  sub parse { Line 421  sub parse {
421    &{$self->{func}}(@_);    &{$self->{func}}(@_);
422  }  }
423    
424    sub keys {
425      my $self  = shift;
426    
427      defined $self->{db} or $self->open;
428      keys %{$self->{db}};
429    }
430    
431  sub search_prefix {  sub search_prefix {
432    my $self  = shift;    my $self  = shift;
433    
# Line 285  sub search_prefix { Line 436  sub search_prefix {
436    $self->search_raw(map($self->prefix($_), @_));    $self->search_raw(map($self->prefix($_), @_));
437  }  }
438    
439    sub _complain ($$) {
440      my ($action, $term) = @_;
441    
442      require Carp;
443      Carp::cluck
444        (sprintf("WAIT database inconsistency during $action [%s]: ".
445                 "Please rebuild index\n",
446                 $term,));
447    }
448    
449  sub search_raw {  sub search_raw {
450    my $self  = shift;    my $self  = shift;
451    my %occ;    my $query = shift;
452    my %score;    my %score;
453    
454    return () unless @_;    # Top $wanted documents must be correct. Zero means all matching
455      # documents.
456      my $wanted = $query->{top};
457      my $strict = $query->{picky};
458    
459      # Return at least $minacc documents. Zero means all matching
460      # documents.
461      # my $minacc = $query->{accus} || $wanted;
462    
463      # Open index and flush cache if necessary
464    defined $self->{db} or $self->open;    defined $self->{db} or $self->open;
465    $self->sync;    $self->sync;
466    grep $occ{$_}++, @_;  
467    for (keys %occ) {    # We keep duplicates
468      if (defined $self->{db}->{$_}) {    my @terms =
469        my %post = unpack 'w*', $self->{db}->{$_};      # Sort words by decreasing document frequency
470        my $idf = log($self->{records}/$self->{db}->{$O,$_});      sort { $self->{db}->{'o'.$a} <=> $self->{db}->{'o'.$b} }
471        my $did;        # check which words occur in the index.
472        for $did (keys %post) {        grep { $self->{db}->{'o'.$_} } @_;
473          $score{$did} = 0 unless defined $score{$did}; # perl -w  
474          $score{$did} += $post{$did} / $self->{db}->{$M, $did} * $idf    return unless @terms;
475            if $self->{db}->{$M, $did}; # db may be broken  
476      # We special-case one term queries here.  If the index was sorted,
477      # choping off the rest of the list will return the same ranking.
478      if ($wanted and @terms == 1) {
479        my $term  = shift @terms;
480        my $idf   = log($self->{records}/$self->{db}->{'o'.$term});
481        my @res;
482    
483        if ($self->{reorg}) { # or not $query->{picky}
484          @res = unpack "w". int(2*$wanted), $self->{db}->{'p'.$term};
485        } else {
486          @res = unpack 'w*',                $self->{db}->{'p'.$term};
487        }
488    
489        for (my $i=1; $i<@res; $i+=2) {
490          # $res[$i] /= $self->{db}->{'m'. $res[$i-1]} / $idf;
491          # above was written badly, allows two DIV_ZERO problems.
492          my $maxtf = $self->{db}->{"m". $res[$i-1]};
493          unless ($maxtf) {
494            warn "WAIT-Warning: Averting DIVZERO for i[$i] \$res[\$i-1][$res[$i-1]] term[$term]";
495            $maxtf = 1;
496          }
497          $res[$i] = ($res[$i] / $maxtf) * $idf;
498        }
499    
500        return @res
501      }
502    
503      # We separate exhaustive search here to avoid overhead and make the
504      # code more readable. The block can be removed without changing the
505      # result.
506      unless ($wanted) {
507        for (@terms) {
508          my $df      = $self->{db}->{'o'.$_};
509    
510          # The frequency *must* be 1 at least since the posting list is nonempty
511          _complain('search for term', $_) and $df = 1 if $df < 1;
512    
513          # Unpack posting list for current query term $_
514          my %post = unpack 'w*', $self->{db}->{'p'.$_};
515    
516          _complain('search for term', $_) if $self->{db}->{'o'.$_} != keys %post;
517          # This is the inverse document frequency. The log of the inverse
518          # fraction of documents the term occurs in.
519          my $idf = log($self->{records}/$df);
520          for my $did (keys %post) {
521            if (my $freq = $self->{db}->{'m'. $did}) {
522              $score{$did} += $post{$did} / $freq * $idf;
523            }
524          }
525        }
526        # warn sprintf "Used %d accumulators\n", scalar keys %score;
527        return %score;
528      }
529    
530      # A sloppy but fast algorithm for multiple term queries.
531      unless ($strict) {
532        for (@terms) {
533          # Unpack posting list for current query term $_
534          my %post = unpack 'w*', $self->{db}->{'p'.$_};
535    
536          # Lookup the number of documents the term occurs in (document frequency)
537          my $occ  = $self->{db}->{'o'.$_};
538    
539          _complain('search for term', $_) if $self->{db}->{'o'.$_} != keys %post;
540          # The frequency *must* be 1 at least since the posting list is nonempty
541          _complain('search for term', $_) and $occ = 1 if $occ < 1;
542    
543          # This is the inverse document frequency. The log of the inverse
544          # fraction of documents the term occurs in.
545          my $idf = log($self->{records}/$occ);
546    
547          # If we have a reasonable number of accumulators, change the
548          # loop to iterate over the accumulators.  This will compromise
549          # quality for better speed.  The algorithm still computes the
550          # exact weights, but the result is not guaranteed to contain the
551          # *best* results.  The database might contain documents better
552          # than the worst returned document.
553          
554          # We process the lists in order of increasing length.  When the
555          # number of accumulators exceeds $wanted, no new documents are
556          # added, only the ranking/weighting of the seen documents is
557          # improved.  The resulting ranking list must be pruned, since only
558          # the top most documents end up near their "optimal" rank.
559          
560          if (keys %score < $wanted) {
561            for my $did (keys %post) {
562              if (my $freq = $self->{db}->{'m'. $did}) {
563                $score{$did} += $post{$did} / $freq * $idf;
564              }
565            }
566          } else {
567            for my $did (keys %score) {
568              next unless exists $post{$did};
569              if (my $freq = $self->{db}->{'m'. $did}) {
570                $score{$did} += $post{$did} / $freq * $idf;
571              }
572            }
573          }
574        }
575        return %score;
576      }
577      my @max; $max[$#terms+1]=0;
578      my @idf;
579    
580      # Preparation loop.  This extra loop makes sense only when "reorg"
581      # and "wanted" are true.  But at the time beeing, keeping the code
582      # for the different search algorithms in one place seems more
583      # desirable than some minor speedup of the brute force version.  We
584      # do cache $idf though.
585    
586      for (my $i = $#terms; $i >=0; $i--) {
587        local $_ = $terms[$i];
588        # Lookup the number of documents the term occurs in (document frequency)
589        my $df      = $self->{db}->{'o'.$_};
590    
591        # The frequency *must* be 1 at least since the posting list is nonempty
592        _complain('search for term', $_) and $df = 1 if $df < 1;
593    
594        # This is the inverse document frequency. The log of the inverse
595        # fraction of documents the term occurs in.
596        $idf[$i] = log($self->{records}/$df);
597    
598        my ($did,$occ);
599        if ($self->{reorg}) {
600          ($did,$occ) = unpack 'w2', $self->{db}->{'p'.$_};
601        } else {                    # Maybe this costs more than it helps
602          ($did,$occ) = unpack 'w2', $self->sort_postings($self->{db}->{'p'.$_});
603        }
604        my $freq      = $self->{db}->{'m'. $did};
605        my $max       = $occ/$freq*$idf[$i];
606        $max[$i]      = $max + $max[$i+1];
607      }
608    
609      # Main loop
610      for my $i (0 .. $#terms) {
611        my $term = $terms[$i];
612        # Unpack posting list for current query term $term. We loose the
613        # sorting order because the assignment to a hash.
614        my %post = unpack 'w*', $self->{db}->{'p'.$term};
615    
616        _complain('search for term', $term)
617          if $self->{db}->{'o'.$term} != keys %post;
618    
619        my $idf  = $idf[$i];
620        my $full;                   # Need to process all postings
621        my $chop;                   # Score necessary to enter the ranking list
622    
623        if (# We know that wanted is true since we especial cased the
624            # exhaustive search.
625    
626            $wanted and
627    
628            # We did sort here if necessary in
629            # the preparation loop
630            # $self->{reorg} and
631    
632            scalar keys %score > $wanted) {
633          $chop = (sort { $b <=> $a } values %score)[$wanted];
634          $full = $max[$i] > $chop;
635        } else {
636          $full = 1;
637        }
638    
639        if ($full) {
640          # We need to inspect the full list. Either $wanted is not given,
641          # the index is not sorted, or we don't have enough accumulators
642          # yet.
643          if (defined $chop) {
644            # We might be able to avoid allocating accumulators
645            for my $did (keys %post) {
646              if (my $freq = $self->{db}->{'m'. $did}) {
647                my $wgt = $post{$did} / $freq * $idf;
648                # We add an accumulator if $wgt exeeds $chop
649                if (exists $score{$did} or $wgt > $chop) {
650                  $score{$did} += $wgt;
651                }
652              }
653            }
654          } else {
655            # Allocate acumulators for each seen document.
656            for my $did (keys %post) {
657              if (my $freq = $self->{db}->{'m'. $did}) {
658                $score{$did} += $post{$did} / $freq * $idf;
659              }
660            }
661          }
662        } else {
663          # Update existing accumulators
664          for my $did (keys %score) {
665            next unless exists $post{$did};
666            if (my $freq = $self->{db}->{'m'. $did}) {
667              $score{$did} += $post{$did} / $freq * $idf;
668            }
669        }        }
670      }      }
671    }    }
672      #warn sprintf "Used %d accumulators\n", scalar keys %score;
673    %score;    %score;
674  }  }
675    
676    sub set {
677      my ($self, $attr, $value) = @_;
678    
679      die "No such indexy attribute: '$attr'" unless $attr eq 'top';
680    
681      return delete $self->{reorg} if $value == 0;
682    
683      return if     $self->{reorg};     # we are sorted already
684      return unless $self->{mode} & O_RDWR;
685      defined $self->{db} or $self->open;
686    
687      $self->sync;
688      while (my($key, $value) = each %{$self->{db}}) {
689        next if $key !~ /^p/;
690        $self->{db}->{$key} = $self->sort_postings($value);
691      }
692      $self->{reorg} = 1;
693    }
694    
695  sub sync {  sub sync {
696    my $self = shift;    my $self = shift;
697    
698    if ($self->{mode} & O_RDWR) {    if ($self->{mode} & O_RDWR) {
699      print STDERR "\aFlushing $self->{cached} postings\n";      print STDERR "Flushing $self->{cached} postings\n" if $self->{cached};
700      while (my($key, $value) = each %{$self->{cache}}) {      while (my($key, $value) = each %{$self->{cache}}) {
701        $self->{db}->{$key} .= $value;        $self->{db}->{"p". $key} ||= "";
702        #delete $self->{cache}->{$key};        if ($self->{reorg}) {
703            $self->{db}->{'p'.$key} = $self->sort_postings($self->{db}->{'p'.$key}
704                                                       . $value);
705          } else {
706            $self->{db}->{'p'.$key} .= $value;
707          }
708      }      }
709      while (my($key, $value) = each %{$self->{cdict}}) {      while (my($key, $value) = each %{$self->{cdict}}) {
710        $self->{db}->{$key} = 0 unless  $self->{db}->{$key};        $self->{db}->{'o'.$key} = 0 unless  $self->{db}->{'o'.$key};
711        $self->{db}->{$key} += $value;        $self->{db}->{'o'.$key} += $value;
712        #delete $self->{cdict}->{$key};      }
713      }      $self->{cache}  = {};
714      $self->{cache} = {};      $self->{cdict}  = {};
     $self->{cdict} = {};  
     # print STDERR "*** $self->{cache} ", tied(%{$self->{cache}}), "==\n";  
715      $self->{cached} = 0;      $self->{cached} = 0;
     # $self->{dbh}->sync if $self->{dbh};  
716    }    }
717  }  }
718    

Legend:
Removed from v.11  
changed lines
  Added in v.40

  ViewVC Help
Powered by ViewVC 1.1.26