--- branches/CPAN/lib/WAIT/InvertedIndex.pm	2000/04/28 15:42:44	13
+++ branches/CPAN/lib/WAIT/InvertedIndex.pm	2000/05/09 11:29:45	19
@@ -1,15 +1,14 @@
-#                              -*- Mode: Cperl -*- 
-# InvertedIndex.pm --
-# ITIID           : $ITI$ $Header $__Header$
+#                              -*- Mode: Perl -*- 
+# $Basename: InvertedIndex.pm $
+# $Revision: 1.30 $
 # Author          : Ulrich Pfeifer
 # Created On      : Thu Aug  8 13:05:10 1996
 # Last Modified By: Ulrich Pfeifer
-# Last Modified On: Sun Nov 22 18:44:42 1998
+# Last Modified On: Tue May  9 08:33:28 2000
 # Language        : CPerl
-# Status          : Unknown, Use with caution!
-#
-# Copyright (c) 1996-1997, Ulrich Pfeifer
-#
+# 
+# (C) Copyright 1996-2000, Ulrich Pfeifer
+# 
 
 package WAIT::InvertedIndex;
 use strict;
@@ -19,8 +18,19 @@
 use Carp;
 use vars qw(%FUNC);
 
-my $O = pack('C', 0xff)."o";                  # occurances
-my $M = pack('C', 0xff)."m";                  # maxtf
+my $O = pack('C', 0xff)."o";                  # occurances (document ferquency)
+
+# The document frequency is the number of documents a term occurs
+# in. The idea is that a term occuring in a significant part of the
+# documents is not too significant.
+
+my $M = pack('C', 0xff)."m";                  # maxtf (term frequency)
+
+# The maximum term frequency of a document is the frequency of the
+# most frequent term in the document.  It is related to the document
+# length obviously.  A document in which the most frequnet term occurs
+# 100 times is probably much longer than a document whichs most
+# frequent term occurs five time.
 
 sub new {
   my $type = shift;
@@ -135,12 +145,8 @@
       eval sprintf("sub {grep /./, %s}", _filtergen(@{$self->{filter}}));
     $self->{dbh} = tie(%{$self->{db}}, 'DB_File', $file,
                        $self->{mode}, 0664, $DB_BTREE);
-#    tie(%{$self->{cache}}, 'DB_File', undef,
-#        $self->{mode}, 0664, $DB_BTREE)
     $self->{cache} = {}
       if $self->{mode} & O_RDWR;
-#    tie(%{$self->{cdict}}, 'DB_File', undef,
-#        $self->{mode}, 0664, $DB_BTREE)
     $self->{cdict} = {}
       if $self->{mode} & O_RDWR;
     $self->{cached} = 0;
@@ -166,6 +172,7 @@
     }
     $self->{cached}++;
   }
+  # This cache limit should be configurable
   $self->sync if $self->{cached} > 100_000;
   my $maxtf = 0;
   for (values %occ) {
@@ -174,23 +181,63 @@
   $self->{db}->{$M, $key} = $maxtf;
 }
 
+# We sort postings by increasing max term frequency (~ by increasing
+# document length.  This reduces the quality degradation if we process
+# only the first part of a posting list.
+
+sub sort_postings {
+  my $self = shift;
+  my $post = shift;             # reference to a hash or packed string
+
+  if (ref $post) {
+    # we skip the sort part, if the index is not sorted
+    return pack('w*', %$post) unless $self->{reorg};
+  } else {
+    $post = { unpack 'w*', $post };
+  }
+
+  my $r = '';
+
+  # Sort posting list by increasing ratio of maximum term frequency (~
+  # "document length") and term frequency. This rati multipied by the
+  # inverse document frequence gives the score for a term.  This sort
+  # order can be exploited for tuning of single term queries.
+
+  for my $did (sort {    $post->{$b} / $self->{db}->{$M, $b}
+                                      <=>
+                         $post->{$a} / $self->{db}->{$M, $a}
+                    } keys %$post) {
+    $r .= pack 'w2', $did, $post->{$did};
+  }
+  #warn sprintf "reorg %d %s\n", scalar keys %$post, join ' ', unpack 'w*', $r;
+  $r;
+}
+
 sub delete {
   my $self  = shift;
   my $key   = shift;
   my %occ;
 
+  my $db;
   defined $self->{db} or $self->open;
+  $db = $self->{db};
   $self->sync;
   $self->{records}--;
+
+  # less than zero documents in database?
+  _complain('delete of document', $key) and $self->{records} = 0
+    if $self->{records} < 0;
+
   grep $occ{$_}++, &{$self->{func}}(@_);
-  for (keys %occ) {
-    # may reorder posting list
-    my %post = unpack 'w*', $self->{db}->{$_};
-    $self->{db}->{$O,$_}--;
+
+  for (keys %occ) {# may reorder posting list
+    my %post = unpack 'w*', $db->{$_};
     delete $post{$key};
-    $self->{db}->{$_} = pack 'w*', %post;
+    $db->{$_}    = $self->sort_postings(\%post);
+    _complain('delete of term', $_) if $db->{$O,$_}-1 != keys %post;
+    $db->{$O,$_} = scalar keys %post;
   }
-  delete $self->{db}->{$M, $key};
+  delete $db->{$M, $key};
 }
 
 sub intervall {
@@ -265,12 +312,42 @@
   \@result;                     # speed
 }
 
+=head2 search($query)
+
+The search method supports a range of search algorithms.  It is
+recommended to tune the index by calling
+C<$table-E<gt>set(top=E<gt>1)> B<after> bulk inserting the documents
+into the table.  This is a computing intense operation and all inserts
+and deletes after this optimization are slightly more expensive.  Once
+reorganized, the index is kept sorted automatically until you switch
+the optimization off by calling C<$table-E<gt>set(top=E<gt>0)>.
+
+When searching a tuned index, a query can be processed faster if the
+caller requests only the topmost documents.  This can be done by
+passing a C<top =E<gt>> I<n> parameter to the search method.
+
+For single term queries, the method returns only the I<n> top ranking
+documents.  For multi term queries two optimized algorithms are
+available. The first algorithm computes the top n documents
+approximately but very fast, sacrificing a little bit of precision for
+speed.  The second algorithm computes the topmost I<n> documents
+precisely.  This algorithm is slower and should be used only for small
+values of I<n>.  It can be requested by passing the query attribute
+C<picky =E<gt> 1>. Both algorithms may return more than I<n> hits.
+While the picky version might not be faster than the brute force
+version on average for modest size databases it uses less memory and
+the processing time is almost linear in the number of query terms, not
+in the size of the lists.
+
+=cut
+
 sub search {
   my $self  = shift;
+  my $query = shift;
 
   defined $self->{db} or $self->open;
   $self->sync;
-  $self->search_raw(&{$self->{func}}(@_)); # No call to parse() here
+  $self->search_raw($query, &{$self->{func}}(@_)); # No call to parse() here
 }
 
 sub parse {
@@ -295,50 +372,275 @@
   $self->search_raw(map($self->prefix($_), @_));
 }
 
+sub _complain ($$) {
+  my ($action, $term) = @_;
+
+  require Carp;
+  Carp::cluck
+    (sprintf("WAIT database inconsistency during $action [%s]: ".
+             "Please rebuild index\n",
+             $term,));
+}
+
 sub search_raw {
   my $self  = shift;
-  my %occ;
+  my $query = shift;
   my %score;
 
-  return () unless @_;
+  # Top $wanted documents must be correct. Zero means all matching
+  # documents.
+  my $wanted = $query->{top};
+  my $strict = $query->{picky};
+
+  # Return at least $minacc documents. Zero means all matching
+  # documents.
+  # my $minacc = $query->{accus} || $wanted;
 
+  # Open index and flush cache if necessary
   defined $self->{db} or $self->open;
   $self->sync;
-  grep $occ{$_}++, @_;
-  for (keys %occ) {
-    if (defined $self->{db}->{$_}) {
+
+  # We keep duplicates
+  my @terms = 
+    # Sort words by decreasing document frequency
+    sort { $self->{db}->{$O,$a} <=> $self->{db}->{$O,$b} }
+      # check which words occur in the index. 
+      grep { $self->{db}->{$O,$_} } @_;
+
+  return () unless @terms;                 # nothing to search for
+
+  # We special-case one term queries here.  If the index was sorted,
+  # choping off the rest of the list will return the same ranking.
+  if ($wanted and @terms == 1) {
+    my $term  = shift @terms;
+    my $idf   = log($self->{records}/$self->{db}->{$O,$term});
+    my @res;
+
+    if ($self->{reorg}) { # or not $query->{picky}
+      @res = unpack "w". int(2*$wanted), $self->{db}->{$term};
+    } else {
+      @res = unpack 'w*',                $self->{db}->{$term};
+    }
+
+    for (my $i=1; $i<@res; $i+=2) {
+      $res[$i] /= $self->{db}->{$M, $res[$i-1]} / $idf;
+    }
+
+    return @res
+  }
+
+  # We separate exhaustive search here to avoid overhead and make the
+  # code more readable. The block can be removed without changing the
+  # result.
+  unless ($wanted) {
+    for (@terms) {
+      my $df      = $self->{db}->{$O,$_};
+
+      # The frequency *must* be 1 at least since the posting list is nonempty
+      _complain('search for term', $_) and $df = 1 if $df < 1;
+
+      # Unpack posting list for current query term $_
+      my %post = unpack 'w*', $self->{db}->{$_};
+
+      _complain('search for term', $_) if $self->{db}->{$O,$_} != keys %post;
+      # This is the inverse document frequency. The log of the inverse
+      # fraction of documents the term occurs in.
+      my $idf = log($self->{records}/$df);
+      for my $did (keys %post) {
+        if (my $freq = $self->{db}->{$M, $did}) {
+          $score{$did} += $post{$did} / $freq * $idf;
+        }
+      }
+    }
+    # warn sprintf "Used %d accumulators\n", scalar keys %score;
+    return %score;
+  }
+
+  # A sloppy but fast algorithm for multiple term queries.
+  unless ($strict) {
+    for (@terms) {
+      # Unpack posting list for current query term $_
       my %post = unpack 'w*', $self->{db}->{$_};
-      my $idf = log($self->{records}/($self->{db}->{$O,$_} || 1));
-      my $did;
-      for $did (keys %post) {
-        $score{$did} = 0 unless defined $score{$did}; # perl -w 
-        $score{$did} += $post{$did} / $self->{db}->{$M, $did} * $idf
-          if $self->{db}->{$M, $did}; # db may be broken
+
+      # Lookup the number of documents the term occurs in (document frequency)
+      my $occ  = $self->{db}->{$O,$_};
+
+      _complain('search for term', $_) if $self->{db}->{$O,$_} != keys %post;
+      # The frequency *must* be 1 at least since the posting list is nonempty
+      _complain('search for term', $_) and $occ = 1 if $occ < 1;
+
+      # This is the inverse document frequency. The log of the inverse
+      # fraction of documents the term occurs in.
+      my $idf = log($self->{records}/$occ);
+
+      # If we have a reasonable number of accumulators, change the
+      # loop to iterate over the accumulators.  This will compromise
+      # quality for better speed.  The algorithm still computes the
+      # exact weights, but the result is not guaranteed to contain the
+      # *best* results.  The database might contain documents better
+      # than the worst returned document.
+      
+      # We process the lists in order of increasing length.  When the
+      # number of accumulators exceeds $wanted, no new documents are
+      # added, only the ranking/weighting of the seen documents is
+      # improved.  The resulting ranking list must be pruned, since only
+      # the top most documents end up near their "optimal" rank.
+      
+      if (keys %score < $wanted) {
+        for my $did (keys %post) {
+          if (my $freq = $self->{db}->{$M, $did}) {
+            $score{$did} += $post{$did} / $freq * $idf;
+          }
+        }
+      } else {
+        for my $did (keys %score) {
+          next unless exists $post{$did};
+          if (my $freq = $self->{db}->{$M, $did}) {
+            $score{$did} += $post{$did} / $freq * $idf;
+          }
+        }
+      }
+    }
+    return %score;
+  }
+  my @max; $max[$#terms+1]=0;
+  my @idf;
+
+  # Preparation loop.  This extra loop makes sense only when "reorg"
+  # and "wanted" are true.  But at the time beeing, keeping the code
+  # for the different search algorithms in one place seems more
+  # desirable than some minor speedup of the brute force version.  We
+  # do cache $idf though.
+
+  for (my $i = $#terms; $i >=0; $i--) {
+    local $_ = $terms[$i];
+    # Lookup the number of documents the term occurs in (document frequency)
+    my $df      = $self->{db}->{$O,$_};
+
+    # The frequency *must* be 1 at least since the posting list is nonempty
+    _complain('search for term', $_) and $df = 1 if $df < 1;
+
+    # This is the inverse document frequency. The log of the inverse
+    # fraction of documents the term occurs in.
+    $idf[$i] = log($self->{records}/$df);
+
+    my ($did,$occ);
+    if ($self->{reorg}) {
+      ($did,$occ) = unpack 'w2', $self->{db}->{$_};
+    } else {                    # Maybe this costs more than it helps
+      ($did,$occ) = unpack 'w2', $self->sort_postings($self->{db}->{$_});
+    }
+    my $freq      = $self->{db}->{$M, $did};
+    my $max       = $occ/$freq*$idf[$i];
+    $max[$i]      = $max + $max[$i+1];
+  }
+
+  # Main loop 
+  for my $i (0 .. $#terms) {
+    my $term = $terms[$i];
+    # Unpack posting list for current query term $term. We loose the
+    # sorting order because the assignment to a hash.
+    my %post = unpack 'w*', $self->{db}->{$term};
+
+    _complain('search for term', $term)
+      if $self->{db}->{$O,$term} != keys %post;
+
+    my $idf  = $idf[$i];
+    my $full;                   # Need to process all postings
+    my $chop;                   # Score necessary to enter the ranking list
+
+    if (# We know that wanted is true since we especial cased the
+        # exhaustive search.
+
+        $wanted and
+
+        # We did sort here if necessary in
+        # the preparation loop
+        # $self->{reorg} and
+
+        scalar keys %score > $wanted) {
+      $chop = (sort { $b <=> $a } values %score)[$wanted];
+      $full = $max[$i] > $chop;
+    } else {
+      $full = 1;
+    }
+
+    if ($full) {
+      # We need to inspect the full list. Either $wanted is not given,
+      # the index is not sorted, or we don't have enough accumulators
+      # yet.
+      if (defined $chop) {
+        # We might be able to avoid allocating accumulators
+        for my $did (keys %post) {
+          if (my $freq = $self->{db}->{$M, $did}) {
+            my $wgt = $post{$did} / $freq * $idf;
+            # We add an accumulator if $wgt exeeds $chop
+            if (exists $score{$did} or $wgt > $chop) {
+              $score{$did} += $wgt;
+            }
+          }
+        }
+      } else {
+        # Allocate acumulators for each seen document.
+        for my $did (keys %post) {
+          if (my $freq = $self->{db}->{$M, $did}) {
+            $score{$did} += $post{$did} / $freq * $idf;
+          }
+        }
+      }
+    } else {
+      # Update existing accumulators
+      for my $did (keys %score) {
+        next unless exists $post{$did};
+        if (my $freq = $self->{db}->{$M, $did}) {
+          $score{$did} += $post{$did} / $freq * $idf;
+        }
       }
     }
   }
+  #warn sprintf "Used %d accumulators\n", scalar keys %score;
   %score;
 }
 
+sub set {
+  my ($self, $attr, $value) = @_;
+
+  die "No such indexy attribute: '$attr'" unless $attr eq 'top';
+
+  return delete $self->{reorg} if $value == 0;
+
+  return if     $self->{reorg};     # we are sorted already
+  return unless $self->{mode} & O_RDWR;
+  defined $self->{db} or $self->open;
+
+  $self->sync;
+  while (my($key, $value) = each %{$self->{db}}) {
+    next if $key =~ /^\377[om]/;
+    $self->{db}->{$key} = $self->sort_postings($value);
+  }
+  $self->{reorg} = 1;
+}
+
 sub sync {
   my $self = shift;
 
   if ($self->{mode} & O_RDWR) {
-    print STDERR "Flushing $self->{cached} postings\n";
+    print STDERR "Flushing $self->{cached} postings\n" if $self->{cached};
     while (my($key, $value) = each %{$self->{cache}}) {
-      $self->{db}->{$key} .= $value;
-      #delete $self->{cache}->{$key};
+      if ($self->{reorg}) {
+        $self->{db}->{$key} = $self->sort_postings($self->{db}->{$key}
+                                                   . $value);
+      } else {
+        $self->{db}->{$key} .= $value;
+      }
     }
     while (my($key, $value) = each %{$self->{cdict}}) {
       $self->{db}->{$key} = 0 unless  $self->{db}->{$key};
       $self->{db}->{$key} += $value;
-      #delete $self->{cdict}->{$key};
     }
-    $self->{cache} = {};
-    $self->{cdict} = {};
-    # print STDERR "*** $self->{cache} ", tied(%{$self->{cache}}), "==\n";
+    $self->{cache}  = {};
+    $self->{cdict}  = {};
     $self->{cached} = 0;
-    # $self->{dbh}->sync if $self->{dbh};
   }
 }