/[wait]/cvs-head/lib/WAIT/InvertedIndex.pm
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Diff of /cvs-head/lib/WAIT/InvertedIndex.pm

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 13 by ulpfr, Fri Apr 28 15:42:44 2000 UTC revision 19 by ulpfr, Tue May 9 11:29:45 2000 UTC
# Line 1  Line 1 
1  #                              -*- Mode: Cperl -*-  #                              -*- Mode: Perl -*-
2  # InvertedIndex.pm --  # $Basename: InvertedIndex.pm $
3  # ITIID           : $ITI$ $Header $__Header$  # $Revision: 1.30 $
4  # Author          : Ulrich Pfeifer  # Author          : Ulrich Pfeifer
5  # Created On      : Thu Aug  8 13:05:10 1996  # Created On      : Thu Aug  8 13:05:10 1996
6  # Last Modified By: Ulrich Pfeifer  # Last Modified By: Ulrich Pfeifer
7  # Last Modified On: Sun Nov 22 18:44:42 1998  # Last Modified On: Tue May  9 08:33:28 2000
8  # Language        : CPerl  # Language        : CPerl
9  # Status          : Unknown, Use with caution!  #
10  #  # (C) Copyright 1996-2000, Ulrich Pfeifer
11  # Copyright (c) 1996-1997, Ulrich Pfeifer  #
 #  
12    
13  package WAIT::InvertedIndex;  package WAIT::InvertedIndex;
14  use strict;  use strict;
# Line 19  use WAIT::Filter; Line 18  use WAIT::Filter;
18  use Carp;  use Carp;
19  use vars qw(%FUNC);  use vars qw(%FUNC);
20    
21  my $O = pack('C', 0xff)."o";                  # occurances  my $O = pack('C', 0xff)."o";                  # occurances (document ferquency)
22  my $M = pack('C', 0xff)."m";                  # maxtf  
23    # The document frequency is the number of documents a term occurs
24    # in. The idea is that a term occuring in a significant part of the
25    # documents is not too significant.
26    
27    my $M = pack('C', 0xff)."m";                  # maxtf (term frequency)
28    
29    # The maximum term frequency of a document is the frequency of the
30    # most frequent term in the document.  It is related to the document
31    # length obviously.  A document in which the most frequnet term occurs
32    # 100 times is probably much longer than a document whichs most
33    # frequent term occurs five time.
34    
35  sub new {  sub new {
36    my $type = shift;    my $type = shift;
# Line 135  sub open { Line 145  sub open {
145        eval sprintf("sub {grep /./, %s}", _filtergen(@{$self->{filter}}));        eval sprintf("sub {grep /./, %s}", _filtergen(@{$self->{filter}}));
146      $self->{dbh} = tie(%{$self->{db}}, 'DB_File', $file,      $self->{dbh} = tie(%{$self->{db}}, 'DB_File', $file,
147                         $self->{mode}, 0664, $DB_BTREE);                         $self->{mode}, 0664, $DB_BTREE);
 #    tie(%{$self->{cache}}, 'DB_File', undef,  
 #        $self->{mode}, 0664, $DB_BTREE)  
148      $self->{cache} = {}      $self->{cache} = {}
149        if $self->{mode} & O_RDWR;        if $self->{mode} & O_RDWR;
 #    tie(%{$self->{cdict}}, 'DB_File', undef,  
 #        $self->{mode}, 0664, $DB_BTREE)  
150      $self->{cdict} = {}      $self->{cdict} = {}
151        if $self->{mode} & O_RDWR;        if $self->{mode} & O_RDWR;
152      $self->{cached} = 0;      $self->{cached} = 0;
# Line 166  sub insert { Line 172  sub insert {
172      }      }
173      $self->{cached}++;      $self->{cached}++;
174    }    }
175      # This cache limit should be configurable
176    $self->sync if $self->{cached} > 100_000;    $self->sync if $self->{cached} > 100_000;
177    my $maxtf = 0;    my $maxtf = 0;
178    for (values %occ) {    for (values %occ) {
# Line 174  sub insert { Line 181  sub insert {
181    $self->{db}->{$M, $key} = $maxtf;    $self->{db}->{$M, $key} = $maxtf;
182  }  }
183    
184    # We sort postings by increasing max term frequency (~ by increasing
185    # document length.  This reduces the quality degradation if we process
186    # only the first part of a posting list.
187    
188    sub sort_postings {
189      my $self = shift;
190      my $post = shift;             # reference to a hash or packed string
191    
192      if (ref $post) {
193        # we skip the sort part, if the index is not sorted
194        return pack('w*', %$post) unless $self->{reorg};
195      } else {
196        $post = { unpack 'w*', $post };
197      }
198    
199      my $r = '';
200    
201      # Sort posting list by increasing ratio of maximum term frequency (~
202      # "document length") and term frequency. This rati multipied by the
203      # inverse document frequence gives the score for a term.  This sort
204      # order can be exploited for tuning of single term queries.
205    
206      for my $did (sort {    $post->{$b} / $self->{db}->{$M, $b}
207                                          <=>
208                             $post->{$a} / $self->{db}->{$M, $a}
209                        } keys %$post) {
210        $r .= pack 'w2', $did, $post->{$did};
211      }
212      #warn sprintf "reorg %d %s\n", scalar keys %$post, join ' ', unpack 'w*', $r;
213      $r;
214    }
215    
216  sub delete {  sub delete {
217    my $self  = shift;    my $self  = shift;
218    my $key   = shift;    my $key   = shift;
219    my %occ;    my %occ;
220    
221      my $db;
222    defined $self->{db} or $self->open;    defined $self->{db} or $self->open;
223      $db = $self->{db};
224    $self->sync;    $self->sync;
225    $self->{records}--;    $self->{records}--;
226    
227      # less than zero documents in database?
228      _complain('delete of document', $key) and $self->{records} = 0
229        if $self->{records} < 0;
230    
231    grep $occ{$_}++, &{$self->{func}}(@_);    grep $occ{$_}++, &{$self->{func}}(@_);
232    for (keys %occ) {  
233      # may reorder posting list    for (keys %occ) {# may reorder posting list
234      my %post = unpack 'w*', $self->{db}->{$_};      my %post = unpack 'w*', $db->{$_};
     $self->{db}->{$O,$_}--;  
235      delete $post{$key};      delete $post{$key};
236      $self->{db}->{$_} = pack 'w*', %post;      $db->{$_}    = $self->sort_postings(\%post);
237        _complain('delete of term', $_) if $db->{$O,$_}-1 != keys %post;
238        $db->{$O,$_} = scalar keys %post;
239    }    }
240    delete $self->{db}->{$M, $key};    delete $db->{$M, $key};
241  }  }
242    
243  sub intervall {  sub intervall {
# Line 265  sub prefix { Line 312  sub prefix {
312    \@result;                     # speed    \@result;                     # speed
313  }  }
314    
315    =head2 search($query)
316    
317    The search method supports a range of search algorithms.  It is
318    recommended to tune the index by calling
319    C<$table-E<gt>set(top=E<gt>1)> B<after> bulk inserting the documents
320    into the table.  This is a computing intense operation and all inserts
321    and deletes after this optimization are slightly more expensive.  Once
322    reorganized, the index is kept sorted automatically until you switch
323    the optimization off by calling C<$table-E<gt>set(top=E<gt>0)>.
324    
325    When searching a tuned index, a query can be processed faster if the
326    caller requests only the topmost documents.  This can be done by
327    passing a C<top =E<gt>> I<n> parameter to the search method.
328    
329    For single term queries, the method returns only the I<n> top ranking
330    documents.  For multi term queries two optimized algorithms are
331    available. The first algorithm computes the top n documents
332    approximately but very fast, sacrificing a little bit of precision for
333    speed.  The second algorithm computes the topmost I<n> documents
334    precisely.  This algorithm is slower and should be used only for small
335    values of I<n>.  It can be requested by passing the query attribute
336    C<picky =E<gt> 1>. Both algorithms may return more than I<n> hits.
337    While the picky version might not be faster than the brute force
338    version on average for modest size databases it uses less memory and
339    the processing time is almost linear in the number of query terms, not
340    in the size of the lists.
341    
342    =cut
343    
344  sub search {  sub search {
345    my $self  = shift;    my $self  = shift;
346      my $query = shift;
347    
348    defined $self->{db} or $self->open;    defined $self->{db} or $self->open;
349    $self->sync;    $self->sync;
350    $self->search_raw(&{$self->{func}}(@_)); # No call to parse() here    $self->search_raw($query, &{$self->{func}}(@_)); # No call to parse() here
351  }  }
352    
353  sub parse {  sub parse {
# Line 295  sub search_prefix { Line 372  sub search_prefix {
372    $self->search_raw(map($self->prefix($_), @_));    $self->search_raw(map($self->prefix($_), @_));
373  }  }
374    
375    sub _complain ($$) {
376      my ($action, $term) = @_;
377    
378      require Carp;
379      Carp::cluck
380        (sprintf("WAIT database inconsistency during $action [%s]: ".
381                 "Please rebuild index\n",
382                 $term,));
383    }
384    
385  sub search_raw {  sub search_raw {
386    my $self  = shift;    my $self  = shift;
387    my %occ;    my $query = shift;
388    my %score;    my %score;
389    
390    return () unless @_;    # Top $wanted documents must be correct. Zero means all matching
391      # documents.
392      my $wanted = $query->{top};
393      my $strict = $query->{picky};
394    
395      # Return at least $minacc documents. Zero means all matching
396      # documents.
397      # my $minacc = $query->{accus} || $wanted;
398    
399      # Open index and flush cache if necessary
400    defined $self->{db} or $self->open;    defined $self->{db} or $self->open;
401    $self->sync;    $self->sync;
402    grep $occ{$_}++, @_;  
403    for (keys %occ) {    # We keep duplicates
404      if (defined $self->{db}->{$_}) {    my @terms =
405        # Sort words by decreasing document frequency
406        sort { $self->{db}->{$O,$a} <=> $self->{db}->{$O,$b} }
407          # check which words occur in the index.
408          grep { $self->{db}->{$O,$_} } @_;
409    
410      return () unless @terms;                 # nothing to search for
411    
412      # We special-case one term queries here.  If the index was sorted,
413      # choping off the rest of the list will return the same ranking.
414      if ($wanted and @terms == 1) {
415        my $term  = shift @terms;
416        my $idf   = log($self->{records}/$self->{db}->{$O,$term});
417        my @res;
418    
419        if ($self->{reorg}) { # or not $query->{picky}
420          @res = unpack "w". int(2*$wanted), $self->{db}->{$term};
421        } else {
422          @res = unpack 'w*',                $self->{db}->{$term};
423        }
424    
425        for (my $i=1; $i<@res; $i+=2) {
426          $res[$i] /= $self->{db}->{$M, $res[$i-1]} / $idf;
427        }
428    
429        return @res
430      }
431    
432      # We separate exhaustive search here to avoid overhead and make the
433      # code more readable. The block can be removed without changing the
434      # result.
435      unless ($wanted) {
436        for (@terms) {
437          my $df      = $self->{db}->{$O,$_};
438    
439          # The frequency *must* be 1 at least since the posting list is nonempty
440          _complain('search for term', $_) and $df = 1 if $df < 1;
441    
442          # Unpack posting list for current query term $_
443          my %post = unpack 'w*', $self->{db}->{$_};
444    
445          _complain('search for term', $_) if $self->{db}->{$O,$_} != keys %post;
446          # This is the inverse document frequency. The log of the inverse
447          # fraction of documents the term occurs in.
448          my $idf = log($self->{records}/$df);
449          for my $did (keys %post) {
450            if (my $freq = $self->{db}->{$M, $did}) {
451              $score{$did} += $post{$did} / $freq * $idf;
452            }
453          }
454        }
455        # warn sprintf "Used %d accumulators\n", scalar keys %score;
456        return %score;
457      }
458    
459      # A sloppy but fast algorithm for multiple term queries.
460      unless ($strict) {
461        for (@terms) {
462          # Unpack posting list for current query term $_
463        my %post = unpack 'w*', $self->{db}->{$_};        my %post = unpack 'w*', $self->{db}->{$_};
464        my $idf = log($self->{records}/($self->{db}->{$O,$_} || 1));  
465        my $did;        # Lookup the number of documents the term occurs in (document frequency)
466        for $did (keys %post) {        my $occ  = $self->{db}->{$O,$_};
467          $score{$did} = 0 unless defined $score{$did}; # perl -w  
468          $score{$did} += $post{$did} / $self->{db}->{$M, $did} * $idf        _complain('search for term', $_) if $self->{db}->{$O,$_} != keys %post;
469            if $self->{db}->{$M, $did}; # db may be broken        # The frequency *must* be 1 at least since the posting list is nonempty
470          _complain('search for term', $_) and $occ = 1 if $occ < 1;
471    
472          # This is the inverse document frequency. The log of the inverse
473          # fraction of documents the term occurs in.
474          my $idf = log($self->{records}/$occ);
475    
476          # If we have a reasonable number of accumulators, change the
477          # loop to iterate over the accumulators.  This will compromise
478          # quality for better speed.  The algorithm still computes the
479          # exact weights, but the result is not guaranteed to contain the
480          # *best* results.  The database might contain documents better
481          # than the worst returned document.
482          
483          # We process the lists in order of increasing length.  When the
484          # number of accumulators exceeds $wanted, no new documents are
485          # added, only the ranking/weighting of the seen documents is
486          # improved.  The resulting ranking list must be pruned, since only
487          # the top most documents end up near their "optimal" rank.
488          
489          if (keys %score < $wanted) {
490            for my $did (keys %post) {
491              if (my $freq = $self->{db}->{$M, $did}) {
492                $score{$did} += $post{$did} / $freq * $idf;
493              }
494            }
495          } else {
496            for my $did (keys %score) {
497              next unless exists $post{$did};
498              if (my $freq = $self->{db}->{$M, $did}) {
499                $score{$did} += $post{$did} / $freq * $idf;
500              }
501            }
502          }
503        }
504        return %score;
505      }
506      my @max; $max[$#terms+1]=0;
507      my @idf;
508    
509      # Preparation loop.  This extra loop makes sense only when "reorg"
510      # and "wanted" are true.  But at the time beeing, keeping the code
511      # for the different search algorithms in one place seems more
512      # desirable than some minor speedup of the brute force version.  We
513      # do cache $idf though.
514    
515      for (my $i = $#terms; $i >=0; $i--) {
516        local $_ = $terms[$i];
517        # Lookup the number of documents the term occurs in (document frequency)
518        my $df      = $self->{db}->{$O,$_};
519    
520        # The frequency *must* be 1 at least since the posting list is nonempty
521        _complain('search for term', $_) and $df = 1 if $df < 1;
522    
523        # This is the inverse document frequency. The log of the inverse
524        # fraction of documents the term occurs in.
525        $idf[$i] = log($self->{records}/$df);
526    
527        my ($did,$occ);
528        if ($self->{reorg}) {
529          ($did,$occ) = unpack 'w2', $self->{db}->{$_};
530        } else {                    # Maybe this costs more than it helps
531          ($did,$occ) = unpack 'w2', $self->sort_postings($self->{db}->{$_});
532        }
533        my $freq      = $self->{db}->{$M, $did};
534        my $max       = $occ/$freq*$idf[$i];
535        $max[$i]      = $max + $max[$i+1];
536      }
537    
538      # Main loop
539      for my $i (0 .. $#terms) {
540        my $term = $terms[$i];
541        # Unpack posting list for current query term $term. We loose the
542        # sorting order because the assignment to a hash.
543        my %post = unpack 'w*', $self->{db}->{$term};
544    
545        _complain('search for term', $term)
546          if $self->{db}->{$O,$term} != keys %post;
547    
548        my $idf  = $idf[$i];
549        my $full;                   # Need to process all postings
550        my $chop;                   # Score necessary to enter the ranking list
551    
552        if (# We know that wanted is true since we especial cased the
553            # exhaustive search.
554    
555            $wanted and
556    
557            # We did sort here if necessary in
558            # the preparation loop
559            # $self->{reorg} and
560    
561            scalar keys %score > $wanted) {
562          $chop = (sort { $b <=> $a } values %score)[$wanted];
563          $full = $max[$i] > $chop;
564        } else {
565          $full = 1;
566        }
567    
568        if ($full) {
569          # We need to inspect the full list. Either $wanted is not given,
570          # the index is not sorted, or we don't have enough accumulators
571          # yet.
572          if (defined $chop) {
573            # We might be able to avoid allocating accumulators
574            for my $did (keys %post) {
575              if (my $freq = $self->{db}->{$M, $did}) {
576                my $wgt = $post{$did} / $freq * $idf;
577                # We add an accumulator if $wgt exeeds $chop
578                if (exists $score{$did} or $wgt > $chop) {
579                  $score{$did} += $wgt;
580                }
581              }
582            }
583          } else {
584            # Allocate acumulators for each seen document.
585            for my $did (keys %post) {
586              if (my $freq = $self->{db}->{$M, $did}) {
587                $score{$did} += $post{$did} / $freq * $idf;
588              }
589            }
590          }
591        } else {
592          # Update existing accumulators
593          for my $did (keys %score) {
594            next unless exists $post{$did};
595            if (my $freq = $self->{db}->{$M, $did}) {
596              $score{$did} += $post{$did} / $freq * $idf;
597            }
598        }        }
599      }      }
600    }    }
601      #warn sprintf "Used %d accumulators\n", scalar keys %score;
602    %score;    %score;
603  }  }
604    
605    sub set {
606      my ($self, $attr, $value) = @_;
607    
608      die "No such indexy attribute: '$attr'" unless $attr eq 'top';
609    
610      return delete $self->{reorg} if $value == 0;
611    
612      return if     $self->{reorg};     # we are sorted already
613      return unless $self->{mode} & O_RDWR;
614      defined $self->{db} or $self->open;
615    
616      $self->sync;
617      while (my($key, $value) = each %{$self->{db}}) {
618        next if $key =~ /^\377[om]/;
619        $self->{db}->{$key} = $self->sort_postings($value);
620      }
621      $self->{reorg} = 1;
622    }
623    
624  sub sync {  sub sync {
625    my $self = shift;    my $self = shift;
626    
627    if ($self->{mode} & O_RDWR) {    if ($self->{mode} & O_RDWR) {
628      print STDERR "Flushing $self->{cached} postings\n";      print STDERR "Flushing $self->{cached} postings\n" if $self->{cached};
629      while (my($key, $value) = each %{$self->{cache}}) {      while (my($key, $value) = each %{$self->{cache}}) {
630        $self->{db}->{$key} .= $value;        if ($self->{reorg}) {
631        #delete $self->{cache}->{$key};          $self->{db}->{$key} = $self->sort_postings($self->{db}->{$key}
632                                                       . $value);
633          } else {
634            $self->{db}->{$key} .= $value;
635          }
636      }      }
637      while (my($key, $value) = each %{$self->{cdict}}) {      while (my($key, $value) = each %{$self->{cdict}}) {
638        $self->{db}->{$key} = 0 unless  $self->{db}->{$key};        $self->{db}->{$key} = 0 unless  $self->{db}->{$key};
639        $self->{db}->{$key} += $value;        $self->{db}->{$key} += $value;
       #delete $self->{cdict}->{$key};  
640      }      }
641      $self->{cache} = {};      $self->{cache}  = {};
642      $self->{cdict} = {};      $self->{cdict}  = {};
     # print STDERR "*** $self->{cache} ", tied(%{$self->{cache}}), "==\n";  
643      $self->{cached} = 0;      $self->{cached} = 0;
     # $self->{dbh}->sync if $self->{dbh};  
644    }    }
645  }  }
646    

Legend:
Removed from v.13  
changed lines
  Added in v.19

  ViewVC Help
Powered by ViewVC 1.1.26