/[wait]/cvs-head/lib/WAIT/InvertedIndex.pm
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Diff of /cvs-head/lib/WAIT/InvertedIndex.pm

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

branches/CPAN/lib/WAIT/InvertedIndex.pm revision 11 by unknown, Fri Apr 28 15:41:10 2000 UTC cvs-head/lib/WAIT/InvertedIndex.pm revision 36 by ulpfr, Sun Nov 12 17:01:59 2000 UTC
# Line 1  Line 1 
1  #                              -*- Mode: Perl -*-  #                              -*- Mode: Perl -*-
2  # InvertedIndex.pm --  # $Basename: InvertedIndex.pm $
3  # ITIID           : $ITI$ $Header $__Header$  # $Revision: 1.30 $
4  # Author          : Ulrich Pfeifer  # Author          : Ulrich Pfeifer
5  # Created On      : Thu Aug  8 13:05:10 1996  # Created On      : Thu Aug  8 13:05:10 1996
6  # Last Modified By: Ulrich Pfeifer  # Last Modified By: Ulrich Pfeifer
7  # Last Modified On: Sun Nov 22 18:44:42 1998  # Last Modified On: Sun Nov 12 14:40:21 2000
8  # Language        : CPerl  # Language        : CPerl
 # Status          : Unknown, Use with caution!  
9  #  #
10  # Copyright (c) 1996-1997, Ulrich Pfeifer  # (C) Copyright 1996-2000, Ulrich Pfeifer
11  #  #
12    
13  package WAIT::InvertedIndex;  package WAIT::InvertedIndex;
# Line 17  use DB_File; Line 16  use DB_File;
16  use Fcntl;  use Fcntl;
17  use WAIT::Filter;  use WAIT::Filter;
18  use Carp;  use Carp;
19  use vars qw(%FUNC);  use vars qw(%FUNC $VERSION);
20    
21  my $O = pack('C', 0xff)."o";                  # occurances  $VERSION = "1.801"; # others test if we are loaded by checking $VERSION
22  my $M = pack('C', 0xff)."m";                  # maxtf  
23    # The dictionary has three different key types:
24    #  'o'.$word
25    #
26    #     The document frequency is the number of documents a term occurs
27    #     in. The idea is that a term occuring in a significant part of the
28    #     documents is not too significant.
29    #
30    # 'm'.$word
31    #
32    #     The maximum term frequency of a document is the frequency of the
33    #     most frequent term in the document.  It is related to the document
34    #     length obviously.  A document in which the most frequnet term occurs
35    #     100 times is probably much longer than a document whichs most
36    #     frequent term occurs five time.
37    #
38    # 'p'.$word
39    #
40    #     Under this key we store the actual posting list as pairs of
41    #     packed integers.
42    
43    my $no_old_index_support = 0; # do not check for old indices if set
44    
45  sub new {  sub new {
46    my $type = shift;    my $type = shift;
# Line 63  sub _split_pos { Line 83  sub _split_pos {
83  sub _xfiltergen {  sub _xfiltergen {
84    my $filter = pop @_;    my $filter = pop @_;
85    
86    if ($filter eq 'stop') {      # avoid the slow stopword elimination  # Oops, we cannot overrule the user's choice. Other filters may kill
87      return _xfiltergen(@_);            # it's cheaper to look them up afterwards  # stopwords, such as isotr clobbers "isn't" to "isnt".
88    }  
89    #  if ($filter eq 'stop') {      # avoid the slow stopword elimination
90    #    return _xfiltergen(@_);            # it's cheaper to look them up afterwards
91    #  }
92    if (@_) {    if (@_) {
93      if ($filter =~ /^split(\d*)/) {      if ($filter =~ /^split(\d*)/) {
94        if ($1) {        if ($1) {
# Line 121  sub drop { Line 144  sub drop {
144    }    }
145  }  }
146    
147    sub is_an_old_index {
148      my $self = shift;
149    
150      return 0 if $no_old_index_support;
151      return $self->{old_index} if exists $self->{old_index};
152    
153      # We can only guess if this is an old index. We lookup the first 10
154      # $O entries. If all values are integers, we assume that the index
155      # is an old one.
156    
157      defined $self->{db} or $self->open;
158      $self->sync;
159      my $dbh = $self->{dbh};       # for convenience
160    
161      my $O = pack('C', 0xff)."o";
162      my ($word, $value) = ($O.$;);  # $word and $value are modified!
163      $dbh->seq($word, $value, R_CURSOR) or return $self->{old_index} = 0;
164      for (my $i=0; $i<10;$i++) {
165        if ($value !~ /^\d+$/) {
166          return $self->{old_index} = 0;
167        }
168        if ($dbh->seq($word, $value, R_NEXT) or # no values left
169            $word !~ /^$O$;/o                   # no $O values left
170           ) {
171          # we are not sure enough that this is an old index
172          return $self->{old_index} = 0;
173        }
174      }
175      return $self->{old_index} = 1;
176    }
177    
178  sub open {  sub open {
179    my $self = shift;    my $self = shift;
180    my $file = $self->{file};    my $file = $self->{file};
# Line 132  sub open { Line 186  sub open {
186        eval sprintf("sub {grep /./, %s}", _filtergen(@{$self->{filter}}));        eval sprintf("sub {grep /./, %s}", _filtergen(@{$self->{filter}}));
187      $self->{dbh} = tie(%{$self->{db}}, 'DB_File', $file,      $self->{dbh} = tie(%{$self->{db}}, 'DB_File', $file,
188                         $self->{mode}, 0664, $DB_BTREE);                         $self->{mode}, 0664, $DB_BTREE);
 #    tie(%{$self->{cache}}, 'DB_File', undef,  
 #        $self->{mode}, 0664, $DB_BTREE)  
189      $self->{cache} = {}      $self->{cache} = {}
190        if $self->{mode} & O_RDWR;        if $self->{mode} & O_RDWR;
 #    tie(%{$self->{cdict}}, 'DB_File', undef,  
 #        $self->{mode}, 0664, $DB_BTREE)  
191      $self->{cdict} = {}      $self->{cdict} = {}
192        if $self->{mode} & O_RDWR;        if $self->{mode} & O_RDWR;
193      $self->{cached} = 0;      $self->{cached} = 0;
194        if (!$no_old_index_support and $self->is_an_old_index()) {
195          warn "This is an old index, upgrade you database";
196          require WAIT::InvertedIndexOld;
197          bless $self, 'WAIT::InvertedIndexOld';
198        }
199    }    }
200  }  }
201    
# Line 148  sub insert { Line 203  sub insert {
203    my $self  = shift;    my $self  = shift;
204    my $key   = shift;    my $key   = shift;
205    my %occ;    my %occ;
206      
207    defined $self->{db} or $self->open;    defined $self->{db} or $self->open;
208    grep $occ{$_}++, &{$self->{func}}(@_);    grep $occ{$_}++, &{$self->{func}}(@_);
209    my ($word, $noc);    my ($word, $noc);
210    $self->{records}++;    $self->{records}++;
211    while (($word, $noc) = each %occ) {    while (($word, $noc) = each %occ) {
212      if (defined $self->{cache}->{$word}) {      if (defined $self->{cache}->{$word}) {
213        $self->{cdict}->{$O,$word}++;        $self->{cdict}->{$word}++;
214        $self->{cache}->{$word} .= pack 'w2', $key, $noc;        $self->{cache}->{$word} .= pack 'w2', $key, $noc;
215      } else {      } else {          
216        $self->{cdict}->{$O,$word} = 1;        $self->{cdict}->{$word} = 1;
217        $self->{cache}->{$word}  = pack 'w2', $key, $noc;        $self->{cache}->{$word}  = pack 'w2', $key, $noc;
218      }      }
219      $self->{cached}++;      $self->{cached}++;
220    }    }
221      # This cache limit should be configurable
222    $self->sync if $self->{cached} > 100_000;    $self->sync if $self->{cached} > 100_000;
223    my $maxtf = 0;    my $maxtf = 0;
224    for (values %occ) {    for (values %occ) {
225      $maxtf = $_ if $_ > $maxtf;      $maxtf = $_ if $_ > $maxtf;
226    }    }
227    $self->{db}->{$M, $key} = $maxtf;    $self->{db}->{'m'. $key} = $maxtf;
228    }
229    
230    # We sort postings by increasing max term frequency (~ by increasing
231    # document length.  This reduces the quality degradation if we process
232    # only the first part of a posting list.
233    
234    sub sort_postings {
235      my $self = shift;
236      my $post = shift;             # reference to a hash or packed string
237    
238      if (ref $post) {
239        # we skip the sort part, if the index is not sorted
240        return pack('w*', %$post) unless $self->{reorg};
241      } else {
242        $post = { unpack 'w*', $post };
243      }
244    
245      my $r = '';
246    
247      # Sort posting list by increasing ratio of maximum term frequency (~
248      # "document length") and term frequency. This rati multipied by the
249      # inverse document frequence gives the score for a term.  This sort
250      # order can be exploited for tuning of single term queries.
251    
252      for my $did (keys %$post) { # sanity check
253        unless ($self->{db}->{"m". $did}) {
254          warn "Warning from WAIT: DIVZERO threat from did[$did] post[$post->{$did}]";
255          $self->{db}->{"m". $did} = 1; # fails if we have not opened for writing
256        }
257      }
258      for my $did (sort {    $post->{$b} / $self->{db}->{'m'. $b}
259                                          <=>
260                             $post->{$a} / $self->{db}->{'m'. $a}
261                        } keys %$post) {
262        $r .= pack 'w2', $did, $post->{$did};
263      }
264      #warn sprintf "reorg %d %s\n", scalar keys %$post, join ' ', unpack 'w*', $r;
265      $r;
266  }  }
267    
268  sub delete {  sub delete {
# Line 176  sub delete { Line 270  sub delete {
270    my $key   = shift;    my $key   = shift;
271    my %occ;    my %occ;
272    
273      my $db;
274    defined $self->{db} or $self->open;    defined $self->{db} or $self->open;
275      $db = $self->{db};
276    $self->sync;    $self->sync;
277    $self->{records}--;    $self->{records}--;
278    
279      # less than zero documents in database?
280      _complain('delete of document', $key) and $self->{records} = 0
281        if $self->{records} < 0;
282    
283    grep $occ{$_}++, &{$self->{func}}(@_);    grep $occ{$_}++, &{$self->{func}}(@_);
284    for (keys %occ) {  
285      # may reorder posting list    # Be prepared for "Odd number of elements in hash assignment"
286      my %post = unpack 'w*', $self->{db}->{$_};    local $SIG{__WARN__} = sub {
287      $self->{db}->{$O,$_}--;      my $warning = shift;
288        chomp $warning;
289        warn "Catching warning[$warning] during delete of key[$key]";
290      };
291      for (keys %occ) {# may reorder posting list
292        my %post = unpack 'w*', $db->{'p'.$_};
293      delete $post{$key};      delete $post{$key};
294      $self->{db}->{$_} = pack 'w*', %post;      $db->{'p'.$_}    = $self->sort_postings(\%post);
295        _complain('delete of term', $_) if $db->{'o'.$_}-1 != keys %post;
296        $db->{'o'.$_} = scalar keys %post;
297    }    }
298    delete $self->{db}->{$M, $key};    delete $db->{'m'. $key};
299  }  }
300    
301  sub intervall {  sub intervall {
# Line 210  sub intervall { Line 318  sub intervall {
318      ($first) = &{$self->{'ifunc'}}($first) if $first;      ($first) = &{$self->{'ifunc'}}($first) if $first;
319      ($last)  = &{$self->{'ifunc'}}($last) if $last;      ($last)  = &{$self->{'ifunc'}}($last) if $last;
320    }    }
321    if (defined $first and $first ne '') {         # set the cursor to $first    $first = 'p'.($first||'');
322      $dbh->seq($first, $value, R_CURSOR);    $last  = (defined $last)?'p'.$last:'q';
323    } else {  
324      $dbh->seq($first, $value, R_FIRST);    # set the cursor to $first
325    }    $dbh->seq($first, $value, R_CURSOR);
326    # We assume that word do not start with the character \377  
327    # $last = pack 'C', 0xff unless defined $last and $last ne '';    # $first would be after the last word
328    return () if defined $last and $first gt $last; # $first would be after the last word    return () if $first gt $last;
329        
330    push @result, $first;    push @result, substr($first,1);
331    while (!$dbh->seq($word, $value, R_NEXT)) {    while (!$dbh->seq($word, $value, R_NEXT)) {
332      # We should limit this to a "resonable" number of words      # We should limit this to a "resonable" number of words
333      last if (defined $last and $word gt $last) or $word =~ /^($M|$O)/o;      last if $word gt $last;
334      push @result, $word;      push @result, substr($word,1);
335    }    }
336    \@result;                     # speed    \@result;                     # speed
337  }  }
# Line 248  sub prefix { Line 356  sub prefix {
356      ($prefix) = &{$self->{'pfunc'}}($prefix);      ($prefix) = &{$self->{'pfunc'}}($prefix);
357    }    }
358    
359    if ($dbh->seq($word = $prefix, $value, R_CURSOR)) {    if ($dbh->seq($word = 'p'.$prefix, $value, R_CURSOR)) {
360      return ();      return ();
361    }    }
362    return () if $word !~ /^$prefix/;    return () if $word !~ /^p$prefix/;
363    push @result, $word;    push @result, substr($word,1);
364    
365    while (!$dbh->seq($word, $value, R_NEXT)) {    while (!$dbh->seq($word, $value, R_NEXT)) {
366      # We should limit this to a "resonable" number of words      # We should limit this to a "resonable" number of words
367      last if $word !~ /^$prefix/;      last if $word !~ /^p$prefix/;
368      push @result, $word;      push @result, substr($word,1);
369    }    }
370    \@result;                     # speed    \@result;                     # speed
371  }  }
372    
373    =head2 search($query)
374    
375    The search method supports a range of search algorithms.  It is
376    recommended to tune the index by calling
377    C<$table-E<gt>set(top=E<gt>1)> B<after> bulk inserting the documents
378    into the table.  This is a computing intense operation and all inserts
379    and deletes after this optimization are slightly more expensive.  Once
380    reorganized, the index is kept sorted automatically until you switch
381    the optimization off by calling C<$table-E<gt>set(top=E<gt>0)>.
382    
383    When searching a tuned index, a query can be processed faster if the
384    caller requests only the topmost documents.  This can be done by
385    passing a C<top =E<gt>> I<n> parameter to the search method.
386    
387    For single term queries, the method returns only the I<n> top ranking
388    documents.  For multi term queries two optimized algorithms are
389    available. The first algorithm computes the top n documents
390    approximately but very fast, sacrificing a little bit of precision for
391    speed.  The second algorithm computes the topmost I<n> documents
392    precisely.  This algorithm is slower and should be used only for small
393    values of I<n>.  It can be requested by passing the query attribute
394    C<picky =E<gt> 1>. Both algorithms may return more than I<n> hits.
395    While the picky version might not be faster than the brute force
396    version on average for modest size databases it uses less memory and
397    the processing time is almost linear in the number of query terms, not
398    in the size of the lists.
399    
400    =cut
401    
402  sub search {  sub search {
403    my $self  = shift;    my $self  = shift;
404      my $query = shift;
405    
406    defined $self->{db} or $self->open;    defined $self->{db} or $self->open;
407    $self->sync;    $self->sync;
408    $self->search_raw(&{$self->{func}}(@_)); # No call to parse() here    $self->search_raw($query, &{$self->{func}}(@_)); # No call to parse() here
409  }  }
410    
411  sub parse {  sub parse {
# Line 277  sub parse { Line 415  sub parse {
415    &{$self->{func}}(@_);    &{$self->{func}}(@_);
416  }  }
417    
418    sub keys {
419      my $self  = shift;
420    
421      defined $self->{db} or $self->open;
422      keys %{$self->{db}};
423    }
424    
425  sub search_prefix {  sub search_prefix {
426    my $self  = shift;    my $self  = shift;
427    
# Line 285  sub search_prefix { Line 430  sub search_prefix {
430    $self->search_raw(map($self->prefix($_), @_));    $self->search_raw(map($self->prefix($_), @_));
431  }  }
432    
433    sub _complain ($$) {
434      my ($action, $term) = @_;
435    
436      require Carp;
437      Carp::cluck
438        (sprintf("WAIT database inconsistency during $action [%s]: ".
439                 "Please rebuild index\n",
440                 $term,));
441    }
442    
443  sub search_raw {  sub search_raw {
444    my $self  = shift;    my $self  = shift;
445    my %occ;    my $query = shift;
446    my %score;    my %score;
447    
448    return () unless @_;    # Top $wanted documents must be correct. Zero means all matching
449      # documents.
450      my $wanted = $query->{top};
451      my $strict = $query->{picky};
452    
453      # Return at least $minacc documents. Zero means all matching
454      # documents.
455      # my $minacc = $query->{accus} || $wanted;
456    
457      # Open index and flush cache if necessary
458    defined $self->{db} or $self->open;    defined $self->{db} or $self->open;
459    $self->sync;    $self->sync;
460    grep $occ{$_}++, @_;  
461    for (keys %occ) {    # We keep duplicates
462      if (defined $self->{db}->{$_}) {    my @terms =
463        my %post = unpack 'w*', $self->{db}->{$_};      # Sort words by decreasing document frequency
464        my $idf = log($self->{records}/$self->{db}->{$O,$_});      sort { $self->{db}->{'o'.$a} <=> $self->{db}->{'o'.$b} }
465        my $did;        # check which words occur in the index.
466        for $did (keys %post) {        grep { $self->{db}->{'o'.$_} } @_;
467          $score{$did} = 0 unless defined $score{$did}; # perl -w  
468          $score{$did} += $post{$did} / $self->{db}->{$M, $did} * $idf    return () unless @terms;                 # nothing to search for
469            if $self->{db}->{$M, $did}; # db may be broken  
470      # We special-case one term queries here.  If the index was sorted,
471      # choping off the rest of the list will return the same ranking.
472      if ($wanted and @terms == 1) {
473        my $term  = shift @terms;
474        my $idf   = log($self->{records}/$self->{db}->{'o'.$term});
475        my @res;
476    
477        if ($self->{reorg}) { # or not $query->{picky}
478          @res = unpack "w". int(2*$wanted), $self->{db}->{'p'.$term};
479        } else {
480          @res = unpack 'w*',                $self->{db}->{'p'.$term};
481        }
482    
483        for (my $i=1; $i<@res; $i+=2) {
484          # $res[$i] /= $self->{db}->{'m'. $res[$i-1]} / $idf;
485          # above was written badly, allows two DIV_ZERO problems.
486          my $maxtf = $self->{db}->{"m". $res[$i-1]};
487          unless ($maxtf) {
488            warn "WAIT-Warning: Averting DIVZERO for i[$i] \$res[\$i-1][$res[$i-1]] term[$term]";
489            $maxtf = 1;
490          }
491          $res[$i] = ($res[$i] / $maxtf) * $idf;
492        }
493    
494        return @res
495      }
496    
497      # We separate exhaustive search here to avoid overhead and make the
498      # code more readable. The block can be removed without changing the
499      # result.
500      unless ($wanted) {
501        for (@terms) {
502          my $df      = $self->{db}->{'o'.$_};
503    
504          # The frequency *must* be 1 at least since the posting list is nonempty
505          _complain('search for term', $_) and $df = 1 if $df < 1;
506    
507          # Unpack posting list for current query term $_
508          my %post = unpack 'w*', $self->{db}->{'p'.$_};
509    
510          _complain('search for term', $_) if $self->{db}->{'o'.$_} != keys %post;
511          # This is the inverse document frequency. The log of the inverse
512          # fraction of documents the term occurs in.
513          my $idf = log($self->{records}/$df);
514          for my $did (keys %post) {
515            if (my $freq = $self->{db}->{'m'. $did}) {
516              $score{$did} += $post{$did} / $freq * $idf;
517            }
518          }
519        }
520        # warn sprintf "Used %d accumulators\n", scalar keys %score;
521        return %score;
522      }
523    
524      # A sloppy but fast algorithm for multiple term queries.
525      unless ($strict) {
526        for (@terms) {
527          # Unpack posting list for current query term $_
528          my %post = unpack 'w*', $self->{db}->{'p'.$_};
529    
530          # Lookup the number of documents the term occurs in (document frequency)
531          my $occ  = $self->{db}->{'o'.$_};
532    
533          _complain('search for term', $_) if $self->{db}->{'o'.$_} != keys %post;
534          # The frequency *must* be 1 at least since the posting list is nonempty
535          _complain('search for term', $_) and $occ = 1 if $occ < 1;
536    
537          # This is the inverse document frequency. The log of the inverse
538          # fraction of documents the term occurs in.
539          my $idf = log($self->{records}/$occ);
540    
541          # If we have a reasonable number of accumulators, change the
542          # loop to iterate over the accumulators.  This will compromise
543          # quality for better speed.  The algorithm still computes the
544          # exact weights, but the result is not guaranteed to contain the
545          # *best* results.  The database might contain documents better
546          # than the worst returned document.
547          
548          # We process the lists in order of increasing length.  When the
549          # number of accumulators exceeds $wanted, no new documents are
550          # added, only the ranking/weighting of the seen documents is
551          # improved.  The resulting ranking list must be pruned, since only
552          # the top most documents end up near their "optimal" rank.
553          
554          if (keys %score < $wanted) {
555            for my $did (keys %post) {
556              if (my $freq = $self->{db}->{'m'. $did}) {
557                $score{$did} += $post{$did} / $freq * $idf;
558              }
559            }
560          } else {
561            for my $did (keys %score) {
562              next unless exists $post{$did};
563              if (my $freq = $self->{db}->{'m'. $did}) {
564                $score{$did} += $post{$did} / $freq * $idf;
565              }
566            }
567          }
568        }
569        return %score;
570      }
571      my @max; $max[$#terms+1]=0;
572      my @idf;
573    
574      # Preparation loop.  This extra loop makes sense only when "reorg"
575      # and "wanted" are true.  But at the time beeing, keeping the code
576      # for the different search algorithms in one place seems more
577      # desirable than some minor speedup of the brute force version.  We
578      # do cache $idf though.
579    
580      for (my $i = $#terms; $i >=0; $i--) {
581        local $_ = $terms[$i];
582        # Lookup the number of documents the term occurs in (document frequency)
583        my $df      = $self->{db}->{'o'.$_};
584    
585        # The frequency *must* be 1 at least since the posting list is nonempty
586        _complain('search for term', $_) and $df = 1 if $df < 1;
587    
588        # This is the inverse document frequency. The log of the inverse
589        # fraction of documents the term occurs in.
590        $idf[$i] = log($self->{records}/$df);
591    
592        my ($did,$occ);
593        if ($self->{reorg}) {
594          ($did,$occ) = unpack 'w2', $self->{db}->{'p'.$_};
595        } else {                    # Maybe this costs more than it helps
596          ($did,$occ) = unpack 'w2', $self->sort_postings($self->{db}->{'p'.$_});
597        }
598        my $freq      = $self->{db}->{'m'. $did};
599        my $max       = $occ/$freq*$idf[$i];
600        $max[$i]      = $max + $max[$i+1];
601      }
602    
603      # Main loop
604      for my $i (0 .. $#terms) {
605        my $term = $terms[$i];
606        # Unpack posting list for current query term $term. We loose the
607        # sorting order because the assignment to a hash.
608        my %post = unpack 'w*', $self->{db}->{'p'.$term};
609    
610        _complain('search for term', $term)
611          if $self->{db}->{'o'.$term} != keys %post;
612    
613        my $idf  = $idf[$i];
614        my $full;                   # Need to process all postings
615        my $chop;                   # Score necessary to enter the ranking list
616    
617        if (# We know that wanted is true since we especial cased the
618            # exhaustive search.
619    
620            $wanted and
621    
622            # We did sort here if necessary in
623            # the preparation loop
624            # $self->{reorg} and
625    
626            scalar keys %score > $wanted) {
627          $chop = (sort { $b <=> $a } values %score)[$wanted];
628          $full = $max[$i] > $chop;
629        } else {
630          $full = 1;
631        }
632    
633        if ($full) {
634          # We need to inspect the full list. Either $wanted is not given,
635          # the index is not sorted, or we don't have enough accumulators
636          # yet.
637          if (defined $chop) {
638            # We might be able to avoid allocating accumulators
639            for my $did (keys %post) {
640              if (my $freq = $self->{db}->{'m'. $did}) {
641                my $wgt = $post{$did} / $freq * $idf;
642                # We add an accumulator if $wgt exeeds $chop
643                if (exists $score{$did} or $wgt > $chop) {
644                  $score{$did} += $wgt;
645                }
646              }
647            }
648          } else {
649            # Allocate acumulators for each seen document.
650            for my $did (keys %post) {
651              if (my $freq = $self->{db}->{'m'. $did}) {
652                $score{$did} += $post{$did} / $freq * $idf;
653              }
654            }
655          }
656        } else {
657          # Update existing accumulators
658          for my $did (keys %score) {
659            next unless exists $post{$did};
660            if (my $freq = $self->{db}->{'m'. $did}) {
661              $score{$did} += $post{$did} / $freq * $idf;
662            }
663        }        }
664      }      }
665    }    }
666      #warn sprintf "Used %d accumulators\n", scalar keys %score;
667    %score;    %score;
668  }  }
669    
670    sub set {
671      my ($self, $attr, $value) = @_;
672    
673      die "No such indexy attribute: '$attr'" unless $attr eq 'top';
674    
675      return delete $self->{reorg} if $value == 0;
676    
677      return if     $self->{reorg};     # we are sorted already
678      return unless $self->{mode} & O_RDWR;
679      defined $self->{db} or $self->open;
680    
681      $self->sync;
682      while (my($key, $value) = each %{$self->{db}}) {
683        next if $key !~ /^p/;
684        $self->{db}->{$key} = $self->sort_postings($value);
685      }
686      $self->{reorg} = 1;
687    }
688    
689  sub sync {  sub sync {
690    my $self = shift;    my $self = shift;
691    
692    if ($self->{mode} & O_RDWR) {    if ($self->{mode} & O_RDWR) {
693      print STDERR "\aFlushing $self->{cached} postings\n";      print STDERR "Flushing $self->{cached} postings\n" if $self->{cached};
694      while (my($key, $value) = each %{$self->{cache}}) {      while (my($key, $value) = each %{$self->{cache}}) {
695        $self->{db}->{$key} .= $value;        $self->{db}->{"p". $key} ||= "";
696        #delete $self->{cache}->{$key};        if ($self->{reorg}) {
697            $self->{db}->{'p'.$key} = $self->sort_postings($self->{db}->{'p'.$key}
698                                                       . $value);
699          } else {
700            $self->{db}->{'p'.$key} .= $value;
701          }
702      }      }
703      while (my($key, $value) = each %{$self->{cdict}}) {      while (my($key, $value) = each %{$self->{cdict}}) {
704        $self->{db}->{$key} = 0 unless  $self->{db}->{$key};        $self->{db}->{'o'.$key} = 0 unless  $self->{db}->{'o'.$key};
705        $self->{db}->{$key} += $value;        $self->{db}->{'o'.$key} += $value;
706        #delete $self->{cdict}->{$key};      }
707      }      $self->{cache}  = {};
708      $self->{cache} = {};      $self->{cdict}  = {};
     $self->{cdict} = {};  
     # print STDERR "*** $self->{cache} ", tied(%{$self->{cache}}), "==\n";  
709      $self->{cached} = 0;      $self->{cached} = 0;
     # $self->{dbh}->sync if $self->{dbh};  
710    }    }
711  }  }
712    

Legend:
Removed from v.11  
changed lines
  Added in v.36

  ViewVC Help
Powered by ViewVC 1.1.26