/[wait]/branches/CPAN/lib/WAIT/InvertedIndex.pm

This is a repository of my old source code, which isn't updated any more. Go to git.rot13.org for current projects!

Diff of /branches/CPAN/lib/WAIT/InvertedIndex.pm

Parent Directory | Revision Log | View Patch Patch

-revision 18 by ulpfr,
Fri Apr 28 15:42:44 2000 UTC
+revision 19 by ulpfr,
Tue May  9 11:29:45 2000 UTC
 Line 1
- #                              -*- Mode: Cperl -*-
+ #                              -*- Mode: Perl -*-
- # InvertedIndex.pm --
+ # $Basename: InvertedIndex.pm $
- # ITIID           : $ITI$ $Header $__Header$
+ # $Revision: 1.30 $
  # Author          : Ulrich Pfeifer
  # Created On      : Thu Aug  8 13:05:10 1996
  # Last Modified By: Ulrich Pfeifer
- # Last Modified On: Sun Nov 22 18:44:42 1998
+ # Last Modified On: Tue May  9 08:33:28 2000
  # Language        : CPerl
- # Status          : Unknown, Use with caution!
+ #
- #
+ # (C) Copyright 1996-2000, Ulrich Pfeifer
- # Copyright (c) 1996-1997, Ulrich Pfeifer
+ #
- #
  package WAIT::InvertedIndex;
  use strict;
-Line 19 
 use WAIT::Filter;
+Line 18 
 use WAIT::Filter;
  use Carp;
  use vars qw(%FUNC);
- my $O = pack('C', 0xff)."o";                  # occurances
+ my $O = pack('C', 0xff)."o";                  # occurances (document ferquency)
- my $M = pack('C', 0xff)."m";                  # maxtf
+ # The document frequency is the number of documents a term occurs
+ # in. The idea is that a term occurring in a significant part of the
+ # documents is not too significant.
+ my $M = pack('C', 0xff)."m";                  # maxtf (term frequency)
+ # The maximum term frequency of a document is the frequency of the
+ # most frequent term in the document.  It is related to the document
+ # length obviously.  A document in which the most frequent term occurs
+ # 100 times is probably much longer than a document whose most
+ # frequent term occurs five times.
  sub new {
    my $type = shift;
-Line 135 
 sub open {
+Line 145 
 sub open {
        eval sprintf("sub {grep /./, %s}", _filtergen(@{$self->{filter}}));
      $self->{dbh} = tie(%{$self->{db}}, 'DB_File', $file,
                         $self->{mode}, 0664, $DB_BTREE);
- #    tie(%{$self->{cache}}, 'DB_File', undef,
- #        $self->{mode}, 0664, $DB_BTREE)
      $self->{cache} = {}
        if $self->{mode} & O_RDWR;
- #    tie(%{$self->{cdict}}, 'DB_File', undef,
- #        $self->{mode}, 0664, $DB_BTREE)
      $self->{cdict} = {}
        if $self->{mode} & O_RDWR;
      $self->{cached} = 0;
-Line 166 
 sub insert {
+Line 172 
 sub insert {
      }
      $self->{cached}++;
    }
+   # This cache limit should be configurable
    $self->sync if $self->{cached} > 100_000;
    my $maxtf = 0;
    for (values %occ) {
-Line 174 
 sub insert {
+Line 181 
 sub insert {
    $self->{db}->{$M, $key} = $maxtf;
  }
+ # We sort postings by decreasing ratio of term frequency to maximum
+ # term frequency (~ by increasing document length).  This reduces the
+ # quality degradation if we process only the first part of a posting list.
+ sub sort_postings {
+   my $self = shift;
+   my $post = shift;             # reference to a hash or packed string
+   if (ref $post) {
+     # we skip the sort part, if the index is not sorted
+     return pack('w*', %$post) unless $self->{reorg};
+   } else {
+     $post = { unpack 'w*', $post };
+   }
+   my $r = '';
+   # Sort posting list by increasing ratio of maximum term frequency (~
+   # "document length") to term frequency, i.e. by decreasing
+   # tf/maxtf.  The quotient tf/maxtf multiplied by the inverse document
+   # frequency gives the score for a term.  This sort order can be
+   # exploited for tuning of single term queries.
+   for my $did (sort {    $post->{$b} / $self->{db}->{$M, $b}
+                                       <=>
+                          $post->{$a} / $self->{db}->{$M, $a}
+                     } keys %$post) {
+     $r .= pack 'w2', $did, $post->{$did};
+   }
+   #warn sprintf "reorg %d %s\n", scalar keys %$post, join ' ', unpack 'w*', $r;
+   $r;
+ }
  sub delete {
    my $self  = shift;
    my $key   = shift;
    my %occ;
+   my $db;
    defined $self->{db} or $self->open;
+   $db = $self->{db};
    $self->sync;
    $self->{records}--;
+   # less than zero documents in database?
+   _complain('delete of document', $key) and $self->{records} = 0
+     if $self->{records} < 0;
    grep $occ{$_}++, &{$self->{func}}(@_);
-   for (keys %occ) {
-     # may reorder posting list
+   for (keys %occ) {# may reorder posting list
-     my %post = unpack 'w*', $self->{db}->{$_};
+     my %post = unpack 'w*', $db->{$_};
-     $self->{db}->{$O,$_}--;
      delete $post{$key};
-     $self->{db}->{$_} = pack 'w*', %post;
+     $db->{$_}    = $self->sort_postings(\%post);
+     _complain('delete of term', $_) if $db->{$O,$_}-1 != keys %post;
+     $db->{$O,$_} = scalar keys %post;
    }
-   delete $self->{db}->{$M, $key};
+   delete $db->{$M, $key};
  }
  sub intervall {
-Line 265 
 sub prefix {
+Line 312 
 sub prefix {
    \@result;                     # speed
  }
+ =head2 search($query)
+ The search method supports a range of search algorithms.  It is
+ recommended to tune the index by calling
+ C<$table-E<gt>set(top=E<gt>1)> B<after> bulk inserting the documents
+ into the table.  This is a computationally intensive operation and all
+ inserts and deletes after this optimization are slightly more expensive.  Once
+ reorganized, the index is kept sorted automatically until you switch
+ the optimization off by calling C<$table-E<gt>set(top=E<gt>0)>.
+ When searching a tuned index, a query can be processed faster if the
+ caller requests only the topmost documents.  This can be done by
+ passing a C<top =E<gt>> I<n> parameter to the search method.
+ For single term queries, the method returns only the I<n> top ranking
+ documents.  For multi term queries two optimized algorithms are
+ available. The first algorithm computes the top n documents
+ approximately but very fast, sacrificing a little bit of precision for
+ speed.  The second algorithm computes the topmost I<n> documents
+ precisely.  This algorithm is slower and should be used only for small
+ values of I<n>.  It can be requested by passing the query attribute
+ C<picky =E<gt> 1>. Both algorithms may return more than I<n> hits.
+ While the picky version might not be faster than the brute force
+ version on average for modest-size databases, it uses less memory and
+ its processing time is almost linear in the number of query terms, not
+ in the size of the lists.
+ =cut
  sub search {
    my $self  = shift;
+   my $query = shift;
    defined $self->{db} or $self->open;
    $self->sync;
-   $self->search_raw(&{$self->{func}}(@_)); # No call to parse() here
+   $self->search_raw($query, &{$self->{func}}(@_)); # No call to parse() here
  }
  sub parse {
-Line 295 
 sub search_prefix {
+Line 372 
 sub search_prefix {
    $self->search_raw(map($self->prefix($_), @_));
  }
+ sub _complain ($$) {
+   my ($action, $term) = @_;
+   require Carp;
+   Carp::cluck
+     (sprintf("WAIT database inconsistency during $action [%s]: ".
+              "Please rebuild index\n",
+              $term,));
+ }
  sub search_raw {
    my $self  = shift;
-   my %occ;
+   my $query = shift;
    my %score;
-   return () unless @_;
+   # Top $wanted documents must be correct. Zero means all matching
+   # documents.
+   my $wanted = $query->{top};
+   my $strict = $query->{picky};
+   # Return at least $minacc documents. Zero means all matching
+   # documents.
+   # my $minacc = $query->{accus} || $wanted;
+   # Open index and flush cache if necessary
    defined $self->{db} or $self->open;
    $self->sync;
-   grep $occ{$_}++, @_;
-   for (keys %occ) {
+   # We keep duplicates
-     if (defined $self->{db}->{$_}) {
+   my @terms =
+     # Sort words by increasing document frequency (shortest lists first)
+     sort { $self->{db}->{$O,$a} <=> $self->{db}->{$O,$b} }
+       # check which words occur in the index.
+       grep { $self->{db}->{$O,$_} } @_;
+   return () unless @terms;                 # nothing to search for
+   # We special-case one term queries here.  If the index was sorted,
+   # chopping off the rest of the list will return the same ranking.
+   if ($wanted and @terms == 1) {
+     my $term  = shift @terms;
+     my $idf   = log($self->{records}/$self->{db}->{$O,$term});
+     my @res;
+     if ($self->{reorg}) { # or not $query->{picky}
+       @res = unpack "w". int(2*$wanted), $self->{db}->{$term};
+     } else {
+       @res = unpack 'w*',                $self->{db}->{$term};
+     }
+     for (my $i=1; $i<@res; $i+=2) {
+       $res[$i] /= $self->{db}->{$M, $res[$i-1]} / $idf;
+     }
+     return @res
+   }
+   # We separate exhaustive search here to avoid overhead and make the
+   # code more readable. The block can be removed without changing the
+   # result.
+   unless ($wanted) {
+     for (@terms) {
+       my $df      = $self->{db}->{$O,$_};
+       # The frequency *must* be 1 at least since the posting list is nonempty
+       _complain('search for term', $_) and $df = 1 if $df < 1;
+       # Unpack posting list for current query term $_
+       my %post = unpack 'w*', $self->{db}->{$_};
+       _complain('search for term', $_) if $self->{db}->{$O,$_} != keys %post;
+       # This is the inverse document frequency. The log of the inverse
+       # fraction of documents the term occurs in.
+       my $idf = log($self->{records}/$df);
+       for my $did (keys %post) {
+         if (my $freq = $self->{db}->{$M, $did}) {
+           $score{$did} += $post{$did} / $freq * $idf;
+         }
+       }
+     }
+     # warn sprintf "Used %d accumulators\n", scalar keys %score;
+     return %score;
+   }
+   # A sloppy but fast algorithm for multiple term queries.
+   unless ($strict) {
+     for (@terms) {
+       # Unpack posting list for current query term $_
        my %post = unpack 'w*', $self->{db}->{$_};
-       my $idf = log($self->{records}/($self->{db}->{$O,$_} || 1));
-       my $did;
+       # Lookup the number of documents the term occurs in (document frequency)
-       for $did (keys %post) {
+       my $occ  = $self->{db}->{$O,$_};
-         $score{$did} = 0 unless defined $score{$did}; # perl -w
-         $score{$did} += $post{$did} / $self->{db}->{$M, $did} * $idf
+       _complain('search for term', $_) if $self->{db}->{$O,$_} != keys %post;
-           if $self->{db}->{$M, $did}; # db may be broken
+       # The frequency *must* be 1 at least since the posting list is nonempty
+       _complain('search for term', $_) and $occ = 1 if $occ < 1;
+       # This is the inverse document frequency. The log of the inverse
+       # fraction of documents the term occurs in.
+       my $idf = log($self->{records}/$occ);
+       # If we have a reasonable number of accumulators, change the
+       # loop to iterate over the accumulators.  This will compromise
+       # quality for better speed.  The algorithm still computes the
+       # exact weights, but the result is not guaranteed to contain the
+       # *best* results.  The database might contain documents better
+       # than the worst returned document.
+       # We process the lists in order of increasing length.  When the
+       # number of accumulators exceeds $wanted, no new documents are
+       # added, only the ranking/weighting of the seen documents is
+       # improved.  The resulting ranking list must be pruned, since only
+       # the top most documents end up near their "optimal" rank.
+       if (keys %score < $wanted) {
+         for my $did (keys %post) {
+           if (my $freq = $self->{db}->{$M, $did}) {
+             $score{$did} += $post{$did} / $freq * $idf;
+           }
+         }
+       } else {
+         for my $did (keys %score) {
+           next unless exists $post{$did};
+           if (my $freq = $self->{db}->{$M, $did}) {
+             $score{$did} += $post{$did} / $freq * $idf;
+           }
+         }
+       }
+     }
+     return %score;
+   }
+   my @max; $max[$#terms+1]=0;
+   my @idf;
+   # Preparation loop.  This extra loop makes sense only when "reorg"
+   # and "wanted" are true.  But for the time being, keeping the code
+   # for the different search algorithms in one place seems more
+   # desirable than some minor speedup of the brute force version.  We
+   # do cache $idf though.
+   for (my $i = $#terms; $i >=0; $i--) {
+     local $_ = $terms[$i];
+     # Lookup the number of documents the term occurs in (document frequency)
+     my $df      = $self->{db}->{$O,$_};
+     # The frequency *must* be 1 at least since the posting list is nonempty
+     _complain('search for term', $_) and $df = 1 if $df < 1;
+     # This is the inverse document frequency. The log of the inverse
+     # fraction of documents the term occurs in.
+     $idf[$i] = log($self->{records}/$df);
+     my ($did,$occ);
+     if ($self->{reorg}) {
+       ($did,$occ) = unpack 'w2', $self->{db}->{$_};
+     } else {                    # Maybe this costs more than it helps
+       ($did,$occ) = unpack 'w2', $self->sort_postings($self->{db}->{$_});
+     }
+     my $freq      = $self->{db}->{$M, $did};
+     my $max       = $occ/$freq*$idf[$i];
+     $max[$i]      = $max + $max[$i+1];
+   }
+   # Main loop
+   for my $i (0 .. $#terms) {
+     my $term = $terms[$i];
+     # Unpack posting list for current query term $term. We lose the
+     # sorting order because of the assignment to a hash.
+     my %post = unpack 'w*', $self->{db}->{$term};
+     _complain('search for term', $term)
+       if $self->{db}->{$O,$term} != keys %post;
+     my $idf  = $idf[$i];
+     my $full;                   # Need to process all postings
+     my $chop;                   # Score necessary to enter the ranking list
+     if (# We know that wanted is true since we especial cased the
+         # exhaustive search.
+         $wanted and
+         # We did sort here if necessary in
+         # the preparation loop
+         # $self->{reorg} and
+         scalar keys %score > $wanted) {
+       $chop = (sort { $b <=> $a } values %score)[$wanted];
+       $full = $max[$i] > $chop;
+     } else {
+       $full = 1;
+     }
+     if ($full) {
+       # We need to inspect the full list. Either $wanted is not given,
+       # the index is not sorted, or we don't have enough accumulators
+       # yet.
+       if (defined $chop) {
+         # We might be able to avoid allocating accumulators
+         for my $did (keys %post) {
+           if (my $freq = $self->{db}->{$M, $did}) {
+             my $wgt = $post{$did} / $freq * $idf;
+             # We add an accumulator if $wgt exceeds $chop
+             if (exists $score{$did} or $wgt > $chop) {
+               $score{$did} += $wgt;
+             }
+           }
+         }
+       } else {
+         # Allocate accumulators for each seen document.
+         for my $did (keys %post) {
+           if (my $freq = $self->{db}->{$M, $did}) {
+             $score{$did} += $post{$did} / $freq * $idf;
+           }
+         }
+       }
+     } else {
+       # Update existing accumulators
+       for my $did (keys %score) {
+         next unless exists $post{$did};
+         if (my $freq = $self->{db}->{$M, $did}) {
+           $score{$did} += $post{$did} / $freq * $idf;
+         }
        }
      }
    }
+   #warn sprintf "Used %d accumulators\n", scalar keys %score;
    %score;
  }
+ sub set {
+   my ($self, $attr, $value) = @_;
+   die "No such indexy attribute: '$attr'" unless $attr eq 'top';
+   return delete $self->{reorg} if $value == 0;
+   return if     $self->{reorg};     # we are sorted already
+   return unless $self->{mode} & O_RDWR;
+   defined $self->{db} or $self->open;
+   $self->sync;
+   while (my($key, $value) = each %{$self->{db}}) {
+     next if $key =~ /^\377[om]/;
+     $self->{db}->{$key} = $self->sort_postings($value);
+   }
+   $self->{reorg} = 1;
+ }
  sub sync {
    my $self = shift;
    if ($self->{mode} & O_RDWR) {
-     print STDERR "Flushing $self->{cached} postings\n";
+     print STDERR "Flushing $self->{cached} postings\n" if $self->{cached};
      while (my($key, $value) = each %{$self->{cache}}) {
-       $self->{db}->{$key} .= $value;
+       if ($self->{reorg}) {
-       #delete $self->{cache}->{$key};
+         $self->{db}->{$key} = $self->sort_postings($self->{db}->{$key}
+                                                    . $value);
+       } else {
+         $self->{db}->{$key} .= $value;
+       }
      }
      while (my($key, $value) = each %{$self->{cdict}}) {
        $self->{db}->{$key} = 0 unless  $self->{db}->{$key};
        $self->{db}->{$key} += $value;
-       #delete $self->{cdict}->{$key};
      }
-     $self->{cache} = {};
+     $self->{cache}  = {};
-     $self->{cdict} = {};
+     $self->{cdict}  = {};
-     # print STDERR "*** $self->{cache} ", tied(%{$self->{cache}}), "==\n";
      $self->{cached} = 0;
-     # $self->{dbh}->sync if $self->{dbh};
    }
  }

 Legend:



Removed from v.18
 


changed lines


 
Added in v.19
 Legend:



Removed from v.18
 


changed lines


 
Added in v.19
-Removed from v.18
+Added in v.19

	ViewVC Help
Powered by ViewVC 1.1.26