/[wait]/cvs-head/lib/WAIT/InvertedIndex.pm
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Diff of /cvs-head/lib/WAIT/InvertedIndex.pm

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

branches/CPAN/lib/WAIT/InvertedIndex.pm revision 11 by unknown, Fri Apr 28 15:41:10 2000 UTC cvs-head/lib/WAIT/InvertedIndex.pm revision 22 by ulpfr, Sat Nov 11 16:58:53 2000 UTC
# Line 1  Line 1 
1  #                              -*- Mode: Perl -*-  #                              -*- Mode: Perl -*-
2  # InvertedIndex.pm --  # $Basename: InvertedIndex.pm $
3  # ITIID           : $ITI$ $Header $__Header$  # $Revision: 1.30 $
4  # Author          : Ulrich Pfeifer  # Author          : Ulrich Pfeifer
5  # Created On      : Thu Aug  8 13:05:10 1996  # Created On      : Thu Aug  8 13:05:10 1996
6  # Last Modified By: Ulrich Pfeifer  # Last Modified By: Ulrich Pfeifer
7  # Last Modified On: Sun Nov 22 18:44:42 1998  # Last Modified On: Sat Nov 11 16:32:38 2000
8  # Language        : CPerl  # Language        : CPerl
 # Status          : Unknown, Use with caution!  
9  #  #
10  # Copyright (c) 1996-1997, Ulrich Pfeifer  # (C) Copyright 1996-2000, Ulrich Pfeifer
11  #  #
12    
13  package WAIT::InvertedIndex;  package WAIT::InvertedIndex;
# Line 19  use WAIT::Filter; Line 18  use WAIT::Filter;
18  use Carp;  use Carp;
19  use vars qw(%FUNC);  use vars qw(%FUNC);
20    
21  my $O = pack('C', 0xff)."o";                  # occurances  # The dictionary has three different key types:
22  my $M = pack('C', 0xff)."m";                  # maxtf  #  'o'.$word
23    #
24    #     The document frequency is the number of documents a term occurs
25    #     in. The idea is that a term occuring in a significant part of the
26    #     documents is not too significant.
27    #
28    # 'm'.$word
29    #
30    #     The maximum term frequency of a document is the frequency of the
31    #     most frequent term in the document.  It is related to the document
32    #     length obviously.  A document in which the most frequnet term occurs
33    #     100 times is probably much longer than a document whichs most
34    #     frequent term occurs five time.
35    #
36    # 'p'.$word
37    #
38    #     Under this key we store the actual posting list as pairs of
39    #     packed integers.
40    
41    my $no_old_index_support = 0; # do not check for old indices if set
42    
43  sub new {  sub new {
44    my $type = shift;    my $type = shift;
# Line 63  sub _split_pos { Line 81  sub _split_pos {
81  sub _xfiltergen {  sub _xfiltergen {
82    my $filter = pop @_;    my $filter = pop @_;
83    
84    if ($filter eq 'stop') {      # avoid the slow stopword elimination  # Oops, we cannot overrule the user's choice. Other filters may kill
85      return _xfiltergen(@_);            # it's cheaper to look them up afterwards  # stopwords, such as isotr clobbers "isn't" to "isnt".
86    }  
87    #  if ($filter eq 'stop') {      # avoid the slow stopword elimination
88    #    return _xfiltergen(@_);            # it's cheaper to look them up afterwards
89    #  }
90    if (@_) {    if (@_) {
91      if ($filter =~ /^split(\d*)/) {      if ($filter =~ /^split(\d*)/) {
92        if ($1) {        if ($1) {
# Line 121  sub drop { Line 142  sub drop {
142    }    }
143  }  }
144    
145    sub is_an_old_index {
146      my $self = shift;
147    
148      return 0 if $no_old_index_support;
149      return $self->{old_index} if exists $self->{old_index};
150    
151      # We can only guess if this is an old index. We lookup the first 10
152      # $O entries. If all values are integers, we assume that the index
153      # is an old one.
154    
155      defined $self->{db} or $self->open;
156      $self->sync;
157      my $dbh = $self->{dbh};       # for convenience
158    
159      my $O = pack('C', 0xff)."o";
160      my ($word, $value) = ($O.$;);
161      $dbh->seq($word, $value, R_CURSOR);
162      for (my $i=0; $i<10;$i++) {
163        if ($value !~ /^\d+$/) {
164          return $self->{old_index} = 0;
165        }
166        if ($dbh->seq($word, $value, R_NEXT) or # no values left
167            $word !~ /^$O/o                     # no $O values left
168           ) {
169          # we are not sure enough that this is an old index
170          return $self->{old_index} = 0;
171        }
172      }
173      return $self->{old_index} = 1;
174    }
175    
176  sub open {  sub open {
177    my $self = shift;    my $self = shift;
178    my $file = $self->{file};    my $file = $self->{file};
# Line 132  sub open { Line 184  sub open {
184        eval sprintf("sub {grep /./, %s}", _filtergen(@{$self->{filter}}));        eval sprintf("sub {grep /./, %s}", _filtergen(@{$self->{filter}}));
185      $self->{dbh} = tie(%{$self->{db}}, 'DB_File', $file,      $self->{dbh} = tie(%{$self->{db}}, 'DB_File', $file,
186                         $self->{mode}, 0664, $DB_BTREE);                         $self->{mode}, 0664, $DB_BTREE);
 #    tie(%{$self->{cache}}, 'DB_File', undef,  
 #        $self->{mode}, 0664, $DB_BTREE)  
187      $self->{cache} = {}      $self->{cache} = {}
188        if $self->{mode} & O_RDWR;        if $self->{mode} & O_RDWR;
 #    tie(%{$self->{cdict}}, 'DB_File', undef,  
 #        $self->{mode}, 0664, $DB_BTREE)  
189      $self->{cdict} = {}      $self->{cdict} = {}
190        if $self->{mode} & O_RDWR;        if $self->{mode} & O_RDWR;
191      $self->{cached} = 0;      $self->{cached} = 0;
192        if (!$no_old_index_support and $self->is_an_old_index()) {
193          warn "This is an old index, upgrade you database";
194          require WAIT::InvertedIndexOld;
195          bless $self, 'WAIT::InvertedIndexOld';
196        }
197    }    }
198  }  }
199    
# Line 148  sub insert { Line 201  sub insert {
201    my $self  = shift;    my $self  = shift;
202    my $key   = shift;    my $key   = shift;
203    my %occ;    my %occ;
204      
205    defined $self->{db} or $self->open;    defined $self->{db} or $self->open;
206    grep $occ{$_}++, &{$self->{func}}(@_);    grep $occ{$_}++, &{$self->{func}}(@_);
207    my ($word, $noc);    my ($word, $noc);
208    $self->{records}++;    $self->{records}++;
209    while (($word, $noc) = each %occ) {    while (($word, $noc) = each %occ) {
210      if (defined $self->{cache}->{$word}) {      if (defined $self->{cache}->{$word}) {
211        $self->{cdict}->{$O,$word}++;        $self->{cdict}->{$word}++;
212        $self->{cache}->{$word} .= pack 'w2', $key, $noc;        $self->{cache}->{$word} .= pack 'w2', $key, $noc;
213      } else {      } else {          
214        $self->{cdict}->{$O,$word} = 1;        $self->{cdict}->{$word} = 1;
215        $self->{cache}->{$word}  = pack 'w2', $key, $noc;        $self->{cache}->{$word}  = pack 'w2', $key, $noc;
216      }      }
217      $self->{cached}++;      $self->{cached}++;
218    }    }
219      # This cache limit should be configurable
220    $self->sync if $self->{cached} > 100_000;    $self->sync if $self->{cached} > 100_000;
221    my $maxtf = 0;    my $maxtf = 0;
222    for (values %occ) {    for (values %occ) {
223      $maxtf = $_ if $_ > $maxtf;      $maxtf = $_ if $_ > $maxtf;
224    }    }
225    $self->{db}->{$M, $key} = $maxtf;    $self->{db}->{'m'. $key} = $maxtf;
226    }
227    
228    # We sort postings by increasing max term frequency (~ by increasing
229    # document length.  This reduces the quality degradation if we process
230    # only the first part of a posting list.
231    
232    sub sort_postings {
233      my $self = shift;
234      my $post = shift;             # reference to a hash or packed string
235    
236      if (ref $post) {
237        # we skip the sort part, if the index is not sorted
238        return pack('w*', %$post) unless $self->{reorg};
239      } else {
240        $post = { unpack 'w*', $post };
241      }
242    
243      my $r = '';
244    
245      # Sort posting list by increasing ratio of maximum term frequency (~
246      # "document length") and term frequency. This rati multipied by the
247      # inverse document frequence gives the score for a term.  This sort
248      # order can be exploited for tuning of single term queries.
249    
250      for my $did (sort {    $post->{$b} / $self->{db}->{'m'. $b}
251                                          <=>
252                             $post->{$a} / $self->{db}->{'m'. $a}
253                        } keys %$post) {
254        $r .= pack 'w2', $did, $post->{$did};
255      }
256      #warn sprintf "reorg %d %s\n", scalar keys %$post, join ' ', unpack 'w*', $r;
257      $r;
258  }  }
259    
260  sub delete {  sub delete {
# Line 176  sub delete { Line 262  sub delete {
262    my $key   = shift;    my $key   = shift;
263    my %occ;    my %occ;
264    
265      my $db;
266    defined $self->{db} or $self->open;    defined $self->{db} or $self->open;
267      $db = $self->{db};
268    $self->sync;    $self->sync;
269    $self->{records}--;    $self->{records}--;
270    
271      # less than zero documents in database?
272      _complain('delete of document', $key) and $self->{records} = 0
273        if $self->{records} < 0;
274    
275    grep $occ{$_}++, &{$self->{func}}(@_);    grep $occ{$_}++, &{$self->{func}}(@_);
276    for (keys %occ) {  
277      # may reorder posting list    for (keys %occ) {# may reorder posting list
278      my %post = unpack 'w*', $self->{db}->{$_};      my %post = unpack 'w*', $db->{'p'.$_};
     $self->{db}->{$O,$_}--;  
279      delete $post{$key};      delete $post{$key};
280      $self->{db}->{$_} = pack 'w*', %post;      $db->{'p'.$_}    = $self->sort_postings(\%post);
281        _complain('delete of term', $_) if $db->{'o'.$_}-1 != keys %post;
282        $db->{'o'.$_} = scalar keys %post;
283    }    }
284    delete $self->{db}->{$M, $key};    delete $db->{'m'. $key};
285  }  }
286    
287  sub intervall {  sub intervall {
# Line 210  sub intervall { Line 304  sub intervall {
304      ($first) = &{$self->{'ifunc'}}($first) if $first;      ($first) = &{$self->{'ifunc'}}($first) if $first;
305      ($last)  = &{$self->{'ifunc'}}($last) if $last;      ($last)  = &{$self->{'ifunc'}}($last) if $last;
306    }    }
307    if (defined $first and $first ne '') {         # set the cursor to $first    $first = 'p'.($first||'');
308      $dbh->seq($first, $value, R_CURSOR);    $last  = (defined $last)?'p'.$last:'q';
309    } else {  
310      $dbh->seq($first, $value, R_FIRST);    # set the cursor to $first
311    }    $dbh->seq($first, $value, R_CURSOR);
312    # We assume that word do not start with the character \377  
313    # $last = pack 'C', 0xff unless defined $last and $last ne '';    # $first would be after the last word
314    return () if defined $last and $first gt $last; # $first would be after the last word    return () if $first gt $last;
315        
316    push @result, $first;    push @result, substr($first,1);
317    while (!$dbh->seq($word, $value, R_NEXT)) {    while (!$dbh->seq($word, $value, R_NEXT)) {
318      # We should limit this to a "resonable" number of words      # We should limit this to a "resonable" number of words
319      last if (defined $last and $word gt $last) or $word =~ /^($M|$O)/o;      last if $word gt $last;
320      push @result, $word;      push @result, substr($word,1);
321    }    }
322    \@result;                     # speed    \@result;                     # speed
323  }  }
# Line 248  sub prefix { Line 342  sub prefix {
342      ($prefix) = &{$self->{'pfunc'}}($prefix);      ($prefix) = &{$self->{'pfunc'}}($prefix);
343    }    }
344    
345    if ($dbh->seq($word = $prefix, $value, R_CURSOR)) {    if ($dbh->seq($word = 'p'.$prefix, $value, R_CURSOR)) {
346      return ();      return ();
347    }    }
348    return () if $word !~ /^$prefix/;    return () if $word !~ /^p$prefix/;
349    push @result, $word;    push @result, substr($word,1);
350    
351    while (!$dbh->seq($word, $value, R_NEXT)) {    while (!$dbh->seq($word, $value, R_NEXT)) {
352      # We should limit this to a "resonable" number of words      # We should limit this to a "resonable" number of words
353      last if $word !~ /^$prefix/;      last if $word !~ /^p$prefix/;
354      push @result, $word;      push @result, substr($word,1);
355    }    }
356    \@result;                     # speed    \@result;                     # speed
357  }  }
358    
359    =head2 search($query)
360    
361    The search method supports a range of search algorithms.  It is
362    recommended to tune the index by calling
363    C<$table-E<gt>set(top=E<gt>1)> B<after> bulk inserting the documents
364    into the table.  This is a computing intense operation and all inserts
365    and deletes after this optimization are slightly more expensive.  Once
366    reorganized, the index is kept sorted automatically until you switch
367    the optimization off by calling C<$table-E<gt>set(top=E<gt>0)>.
368    
369    When searching a tuned index, a query can be processed faster if the
370    caller requests only the topmost documents.  This can be done by
371    passing a C<top =E<gt>> I<n> parameter to the search method.
372    
373    For single term queries, the method returns only the I<n> top ranking
374    documents.  For multi term queries two optimized algorithms are
375    available. The first algorithm computes the top n documents
376    approximately but very fast, sacrificing a little bit of precision for
377    speed.  The second algorithm computes the topmost I<n> documents
378    precisely.  This algorithm is slower and should be used only for small
379    values of I<n>.  It can be requested by passing the query attribute
380    C<picky =E<gt> 1>. Both algorithms may return more than I<n> hits.
381    While the picky version might not be faster than the brute force
382    version on average for modest size databases it uses less memory and
383    the processing time is almost linear in the number of query terms, not
384    in the size of the lists.
385    
386    =cut
387    
388  sub search {  sub search {
389    my $self  = shift;    my $self  = shift;
390      my $query = shift;
391    
392    defined $self->{db} or $self->open;    defined $self->{db} or $self->open;
393    $self->sync;    $self->sync;
394    $self->search_raw(&{$self->{func}}(@_)); # No call to parse() here    $self->search_raw($query, &{$self->{func}}(@_)); # No call to parse() here
395  }  }
396    
397  sub parse {  sub parse {
# Line 277  sub parse { Line 401  sub parse {
401    &{$self->{func}}(@_);    &{$self->{func}}(@_);
402  }  }
403    
404    sub keys {
405      my $self  = shift;
406    
407      defined $self->{db} or $self->open;
408      keys %{$self->{db}};
409    }
410    
411  sub search_prefix {  sub search_prefix {
412    my $self  = shift;    my $self  = shift;
413    
# Line 285  sub search_prefix { Line 416  sub search_prefix {
416    $self->search_raw(map($self->prefix($_), @_));    $self->search_raw(map($self->prefix($_), @_));
417  }  }
418    
419    sub _complain ($$) {
420      my ($action, $term) = @_;
421    
422      require Carp;
423      Carp::cluck
424        (sprintf("WAIT database inconsistency during $action [%s]: ".
425                 "Please rebuild index\n",
426                 $term,));
427    }
428    
429  sub search_raw {  sub search_raw {
430    my $self  = shift;    my $self  = shift;
431    my %occ;    my $query = shift;
432    my %score;    my %score;
433    
434    return () unless @_;    # Top $wanted documents must be correct. Zero means all matching
435      # documents.
436      my $wanted = $query->{top};
437      my $strict = $query->{picky};
438    
439      # Return at least $minacc documents. Zero means all matching
440      # documents.
441      # my $minacc = $query->{accus} || $wanted;
442    
443      # Open index and flush cache if necessary
444    defined $self->{db} or $self->open;    defined $self->{db} or $self->open;
445    $self->sync;    $self->sync;
446    grep $occ{$_}++, @_;  
447    for (keys %occ) {    # We keep duplicates
448      if (defined $self->{db}->{$_}) {    my @terms =
449        my %post = unpack 'w*', $self->{db}->{$_};      # Sort words by decreasing document frequency
450        my $idf = log($self->{records}/$self->{db}->{$O,$_});      sort { $self->{db}->{'o'.$a} <=> $self->{db}->{'o'.$b} }
451        my $did;        # check which words occur in the index.
452        for $did (keys %post) {        grep { $self->{db}->{'o'.$_} } @_;
453          $score{$did} = 0 unless defined $score{$did}; # perl -w  
454          $score{$did} += $post{$did} / $self->{db}->{$M, $did} * $idf    return () unless @terms;                 # nothing to search for
455            if $self->{db}->{$M, $did}; # db may be broken  
456      # We special-case one term queries here.  If the index was sorted,
457      # choping off the rest of the list will return the same ranking.
458      if ($wanted and @terms == 1) {
459        my $term  = shift @terms;
460        my $idf   = log($self->{records}/$self->{db}->{'o'.$term});
461        my @res;
462    
463        if ($self->{reorg}) { # or not $query->{picky}
464          @res = unpack "w". int(2*$wanted), $self->{db}->{'p'.$term};
465        } else {
466          @res = unpack 'w*',                $self->{db}->{'p'.$term};
467        }
468    
469        for (my $i=1; $i<@res; $i+=2) {
470          $res[$i] /= $self->{db}->{'m'. $res[$i-1]} / $idf;
471        }
472    
473        return @res
474      }
475    
476      # We separate exhaustive search here to avoid overhead and make the
477      # code more readable. The block can be removed without changing the
478      # result.
479      unless ($wanted) {
480        for (@terms) {
481          my $df      = $self->{db}->{'o'.$_};
482    
483          # The frequency *must* be 1 at least since the posting list is nonempty
484          _complain('search for term', $_) and $df = 1 if $df < 1;
485    
486          # Unpack posting list for current query term $_
487          my %post = unpack 'w*', $self->{db}->{'p'.$_};
488    
489          _complain('search for term', $_) if $self->{db}->{'o'.$_} != keys %post;
490          # This is the inverse document frequency. The log of the inverse
491          # fraction of documents the term occurs in.
492          my $idf = log($self->{records}/$df);
493          for my $did (keys %post) {
494            if (my $freq = $self->{db}->{'m'. $did}) {
495              $score{$did} += $post{$did} / $freq * $idf;
496            }
497          }
498        }
499        # warn sprintf "Used %d accumulators\n", scalar keys %score;
500        return %score;
501      }
502    
503      # A sloppy but fast algorithm for multiple term queries.
504      unless ($strict) {
505        for (@terms) {
506          # Unpack posting list for current query term $_
507          my %post = unpack 'w*', $self->{db}->{'p'.$_};
508    
509          # Lookup the number of documents the term occurs in (document frequency)
510          my $occ  = $self->{db}->{'o'.$_};
511    
512          _complain('search for term', $_) if $self->{db}->{'o'.$_} != keys %post;
513          # The frequency *must* be 1 at least since the posting list is nonempty
514          _complain('search for term', $_) and $occ = 1 if $occ < 1;
515    
516          # This is the inverse document frequency. The log of the inverse
517          # fraction of documents the term occurs in.
518          my $idf = log($self->{records}/$occ);
519    
520          # If we have a reasonable number of accumulators, change the
521          # loop to iterate over the accumulators.  This will compromise
522          # quality for better speed.  The algorithm still computes the
523          # exact weights, but the result is not guaranteed to contain the
524          # *best* results.  The database might contain documents better
525          # than the worst returned document.
526          
527          # We process the lists in order of increasing length.  When the
528          # number of accumulators exceeds $wanted, no new documents are
529          # added, only the ranking/weighting of the seen documents is
530          # improved.  The resulting ranking list must be pruned, since only
531          # the top most documents end up near their "optimal" rank.
532          
533          if (keys %score < $wanted) {
534            for my $did (keys %post) {
535              if (my $freq = $self->{db}->{'m'. $did}) {
536                $score{$did} += $post{$did} / $freq * $idf;
537              }
538            }
539          } else {
540            for my $did (keys %score) {
541              next unless exists $post{$did};
542              if (my $freq = $self->{db}->{'m'. $did}) {
543                $score{$did} += $post{$did} / $freq * $idf;
544              }
545            }
546          }
547        }
548        return %score;
549      }
550      my @max; $max[$#terms+1]=0;
551      my @idf;
552    
553      # Preparation loop.  This extra loop makes sense only when "reorg"
554      # and "wanted" are true.  But at the time beeing, keeping the code
555      # for the different search algorithms in one place seems more
556      # desirable than some minor speedup of the brute force version.  We
557      # do cache $idf though.
558    
559      for (my $i = $#terms; $i >=0; $i--) {
560        local $_ = $terms[$i];
561        # Lookup the number of documents the term occurs in (document frequency)
562        my $df      = $self->{db}->{'o'.$_};
563    
564        # The frequency *must* be 1 at least since the posting list is nonempty
565        _complain('search for term', $_) and $df = 1 if $df < 1;
566    
567        # This is the inverse document frequency. The log of the inverse
568        # fraction of documents the term occurs in.
569        $idf[$i] = log($self->{records}/$df);
570    
571        my ($did,$occ);
572        if ($self->{reorg}) {
573          ($did,$occ) = unpack 'w2', $self->{db}->{'p'.$_};
574        } else {                    # Maybe this costs more than it helps
575          ($did,$occ) = unpack 'w2', $self->sort_postings($self->{db}->{'p'.$_});
576        }
577        my $freq      = $self->{db}->{'m'. $did};
578        my $max       = $occ/$freq*$idf[$i];
579        $max[$i]      = $max + $max[$i+1];
580      }
581    
582      # Main loop
583      for my $i (0 .. $#terms) {
584        my $term = $terms[$i];
585        # Unpack posting list for current query term $term. We loose the
586        # sorting order because the assignment to a hash.
587        my %post = unpack 'w*', $self->{db}->{'p'.$term};
588    
589        _complain('search for term', $term)
590          if $self->{db}->{'o'.$term} != keys %post;
591    
592        my $idf  = $idf[$i];
593        my $full;                   # Need to process all postings
594        my $chop;                   # Score necessary to enter the ranking list
595    
596        if (# We know that wanted is true since we especial cased the
597            # exhaustive search.
598    
599            $wanted and
600    
601            # We did sort here if necessary in
602            # the preparation loop
603            # $self->{reorg} and
604    
605            scalar keys %score > $wanted) {
606          $chop = (sort { $b <=> $a } values %score)[$wanted];
607          $full = $max[$i] > $chop;
608        } else {
609          $full = 1;
610        }
611    
612        if ($full) {
613          # We need to inspect the full list. Either $wanted is not given,
614          # the index is not sorted, or we don't have enough accumulators
615          # yet.
616          if (defined $chop) {
617            # We might be able to avoid allocating accumulators
618            for my $did (keys %post) {
619              if (my $freq = $self->{db}->{'m'. $did}) {
620                my $wgt = $post{$did} / $freq * $idf;
621                # We add an accumulator if $wgt exeeds $chop
622                if (exists $score{$did} or $wgt > $chop) {
623                  $score{$did} += $wgt;
624                }
625              }
626            }
627          } else {
628            # Allocate acumulators for each seen document.
629            for my $did (keys %post) {
630              if (my $freq = $self->{db}->{'m'. $did}) {
631                $score{$did} += $post{$did} / $freq * $idf;
632              }
633            }
634          }
635        } else {
636          # Update existing accumulators
637          for my $did (keys %score) {
638            next unless exists $post{$did};
639            if (my $freq = $self->{db}->{'m'. $did}) {
640              $score{$did} += $post{$did} / $freq * $idf;
641            }
642        }        }
643      }      }
644    }    }
645      #warn sprintf "Used %d accumulators\n", scalar keys %score;
646    %score;    %score;
647  }  }
648    
649    sub set {
650      my ($self, $attr, $value) = @_;
651    
652      die "No such indexy attribute: '$attr'" unless $attr eq 'top';
653    
654      return delete $self->{reorg} if $value == 0;
655    
656      return if     $self->{reorg};     # we are sorted already
657      return unless $self->{mode} & O_RDWR;
658      defined $self->{db} or $self->open;
659    
660      $self->sync;
661      while (my($key, $value) = each %{$self->{db}}) {
662        next if $key !~ /^p/;
663        $self->{db}->{$key} = $self->sort_postings($value);
664      }
665      $self->{reorg} = 1;
666    }
667    
668  sub sync {  sub sync {
669    my $self = shift;    my $self = shift;
670    
671    if ($self->{mode} & O_RDWR) {    if ($self->{mode} & O_RDWR) {
672      print STDERR "\aFlushing $self->{cached} postings\n";      print STDERR "Flushing $self->{cached} postings\n" if $self->{cached};
673      while (my($key, $value) = each %{$self->{cache}}) {      while (my($key, $value) = each %{$self->{cache}}) {
674        $self->{db}->{$key} .= $value;        if ($self->{reorg}) {
675        #delete $self->{cache}->{$key};          $self->{db}->{'p'.$key} = $self->sort_postings($self->{db}->{'p'.$key}
676                                                       . $value);
677          } else {
678            $self->{db}->{'p'.$key} .= $value;
679          }
680      }      }
681      while (my($key, $value) = each %{$self->{cdict}}) {      while (my($key, $value) = each %{$self->{cdict}}) {
682        $self->{db}->{$key} = 0 unless  $self->{db}->{$key};        $self->{db}->{'o'.$key} = 0 unless  $self->{db}->{'o'.$key};
683        $self->{db}->{$key} += $value;        $self->{db}->{'o'.$key} += $value;
684        #delete $self->{cdict}->{$key};      }
685      }      $self->{cache}  = {};
686      $self->{cache} = {};      $self->{cdict}  = {};
     $self->{cdict} = {};  
     # print STDERR "*** $self->{cache} ", tied(%{$self->{cache}}), "==\n";  
687      $self->{cached} = 0;      $self->{cached} = 0;
     # $self->{dbh}->sync if $self->{dbh};  
688    }    }
689  }  }
690    

Legend:
Removed from v.11  
changed lines
  Added in v.22

  ViewVC Help
Powered by ViewVC 1.1.26