/[wait]/cvs-head/lib/WAIT/InvertedIndex.pm

This is a repository of my old source code, which isn't updated any more. Go to git.rot13.org for current projects!

Diff of /cvs-head/lib/WAIT/InvertedIndex.pm

Parent Directory | Revision Log | View Patch Patch

-branches/CPAN/lib/WAIT/InvertedIndex.pm
revision 11 by unknown,
Fri Apr 28 15:41:10 2000 UTC
+cvs-head/lib/WAIT/InvertedIndex.pm
revision 22 by ulpfr,
Sat Nov 11 16:58:53 2000 UTC
 Line 1
  #                              -*- Mode: Perl -*-
- # InvertedIndex.pm --
+ # $Basename: InvertedIndex.pm $
- # ITIID           : $ITI$ $Header $__Header$
+ # $Revision: 1.30 $
  # Author          : Ulrich Pfeifer
  # Created On      : Thu Aug  8 13:05:10 1996
  # Last Modified By: Ulrich Pfeifer
- # Last Modified On: Sun Nov 22 18:44:42 1998
+ # Last Modified On: Sat Nov 11 16:32:38 2000
  # Language        : CPerl
- # Status          : Unknown, Use with caution!
  #
- # Copyright (c) 1996-1997, Ulrich Pfeifer
+ # (C) Copyright 1996-2000, Ulrich Pfeifer
  #
  package WAIT::InvertedIndex;
-Line 19 
 use WAIT::Filter;
+Line 18 
 use WAIT::Filter;
  use Carp;
  use vars qw(%FUNC);
- my $O = pack('C', 0xff)."o";                  # occurances
+ # The dictionary has three different key types:
- my $M = pack('C', 0xff)."m";                  # maxtf
+ #  'o'.$word
+ #
+ #     The document frequency is the number of documents a term occurs
+ #     in. The idea is that a term occurring in a significant part of the
+ #     documents is not too significant.
+ #
+ # 'm'.$key      (keyed by the document key, not by a word)
+ #
+ #     The maximum term frequency of a document is the frequency of the
+ #     most frequent term in the document.  It is obviously related to the
+ #     document length.  A document in which the most frequent term occurs
+ #     100 times is probably much longer than a document whose most
+ #     frequent term occurs five times.
+ #
+ # 'p'.$word
+ #
+ #     Under this key we store the actual posting list as pairs of
+ #     packed integers.
+ my $no_old_index_support = 0; # do not check for old indices if set
  sub new {
    my $type = shift;
-Line 63 
 sub _split_pos {
+Line 81 
 sub _split_pos {
  sub _xfiltergen {
    my $filter = pop @_;
-   if ($filter eq 'stop') {      # avoid the slow stopword elimination
+ # Oops, we cannot overrule the user's choice. Other filters may kill
-     return _xfiltergen(@_);            # it's cheaper to look them up afterwards
+ # stopwords, such as isotr clobbers "isn't" to "isnt".
-   }
+ #  if ($filter eq 'stop') {      # avoid the slow stopword elimination
+ #    return _xfiltergen(@_);            # it's cheaper to look them up afterwards
+ #  }
    if (@_) {
      if ($filter =~ /^split(\d*)/) {
        if ($1) {
-Line 121 
 sub drop {
+Line 142 
 sub drop {
    }
  }
+ sub is_an_old_index {         # Guess whether this index still uses the old key layout; returns 1 or 0.
+   my $self = shift;
+   return 0 if $no_old_index_support;    # detection is switched off for the whole module
+   return $self->{old_index} if exists $self->{old_index};    # reuse the cached verdict
+   # We can only guess whether this is an old index. We look at up to 10
+   # entries whose key starts with $O. If all of those values are integers,
+   # we assume the index is an old one. The verdict is cached in old_index.
+   defined $self->{db} or $self->open;
+   $self->sync;
+   my $dbh = $self->{dbh};       # for convenience
+   my $O = pack('C', 0xff)."o";  # key prefix: the byte 0xff followed by "o"
+   my ($word, $value) = ($O.$;); # start key is $O plus $; -- $value starts out undefined
+   $dbh->seq($word, $value, R_CURSOR);   # status is not checked; on failure $value stays undef and the loop returns 0
+   for (my $i=0; $i<10;$i++) {
+     if ($value !~ /^\d+$/) {            # a non-integer value rules out an old index
+       return $self->{old_index} = 0;
+     }
+     if ($dbh->seq($word, $value, R_NEXT) or # no values left
+         $word !~ /^$O/o                     # no $O values left
+        ) {
+       # ran out of $O entries early: we are not sure enough that this is an old index
+       return $self->{old_index} = 0;
+     }
+   }
+   return $self->{old_index} = 1;        # ten $O entries in a row held integer values
+ }
  sub open {
    my $self = shift;
    my $file = $self->{file};
-Line 132 
 sub open {
+Line 184 
 sub open {
        eval sprintf("sub {grep /./, %s}", _filtergen(@{$self->{filter}}));
      $self->{dbh} = tie(%{$self->{db}}, 'DB_File', $file,
                         $self->{mode}, 0664, $DB_BTREE);
- #    tie(%{$self->{cache}}, 'DB_File', undef,
- #        $self->{mode}, 0664, $DB_BTREE)
      $self->{cache} = {}
        if $self->{mode} & O_RDWR;
- #    tie(%{$self->{cdict}}, 'DB_File', undef,
- #        $self->{mode}, 0664, $DB_BTREE)
      $self->{cdict} = {}
        if $self->{mode} & O_RDWR;
      $self->{cached} = 0;
+     if (!$no_old_index_support and $self->is_an_old_index()) {
+       warn "This is an old index, upgrade you database";
+       require WAIT::InvertedIndexOld;
+       bless $self, 'WAIT::InvertedIndexOld';
+     }
    }
  }
-Line 148 
 sub insert {
+Line 201 
 sub insert {
    my $self  = shift;
    my $key   = shift;
    my %occ;
    defined $self->{db} or $self->open;
    grep $occ{$_}++, &{$self->{func}}(@_);
    my ($word, $noc);
    $self->{records}++;
    while (($word, $noc) = each %occ) {
      if (defined $self->{cache}->{$word}) {
-       $self->{cdict}->{$O,$word}++;
+       $self->{cdict}->{$word}++;
        $self->{cache}->{$word} .= pack 'w2', $key, $noc;
      } else {
-       $self->{cdict}->{$O,$word} = 1;
+       $self->{cdict}->{$word} = 1;
        $self->{cache}->{$word}  = pack 'w2', $key, $noc;
      }
      $self->{cached}++;
    }
+   # This cache limit should be configurable
    $self->sync if $self->{cached} > 100_000;
    my $maxtf = 0;
    for (values %occ) {
      $maxtf = $_ if $_ > $maxtf;
    }
-   $self->{db}->{$M, $key} = $maxtf;
+   $self->{db}->{'m'. $key} = $maxtf;
+ }
+ # Pack a posting list, ordered by decreasing term frequency relative to the
+ # document's maximum term frequency (~ document length).  This reduces the
+ # quality degradation if we process only the first part of a posting list.
+ sub sort_postings {
+   my $self = shift;
+   my $post = shift;             # reference to a hash or packed string
+   if (ref $post) {
+     # hash input: we skip the sort part if the index is not sorted
+     return pack('w*', %$post) unless $self->{reorg};
+   } else {
+     $post = { unpack 'w*', $post };     # packed input is unpacked and always sorted below
+   }
+   my $r = '';                   # result: packed (document id, term frequency) pairs
+   # Sort posting list by decreasing ratio of term frequency to maximum
+   # term frequency (~ "document length"). This ratio multiplied by the
+   # inverse document frequency gives the score for a term.  This sort
+   # order can be exploited for tuning of single term queries.
+   for my $did (sort {    $post->{$b} / $self->{db}->{'m'. $b}    # NOTE(review): dies if an 'm' entry is missing or 0
+                                       <=>
+                          $post->{$a} / $self->{db}->{'m'. $a}
+                     } keys %$post) {
+     $r .= pack 'w2', $did, $post->{$did};
+   }
+   #warn sprintf "reorg %d %s\n", scalar keys %$post, join ' ', unpack 'w*', $r;
+   $r;
  }
  sub delete {
-Line 176 
 sub delete {
+Line 262 
 sub delete {
    my $key   = shift;
    my %occ;
+   my $db;
    defined $self->{db} or $self->open;
+   $db = $self->{db};
    $self->sync;
    $self->{records}--;
+   # less than zero documents in database?
+   _complain('delete of document', $key) and $self->{records} = 0
+     if $self->{records} < 0;
    grep $occ{$_}++, &{$self->{func}}(@_);
-   for (keys %occ) {
-     # may reorder posting list
+   for (keys %occ) {# may reorder posting list
-     my %post = unpack 'w*', $self->{db}->{$_};
+     my %post = unpack 'w*', $db->{'p'.$_};
-     $self->{db}->{$O,$_}--;
      delete $post{$key};
-     $self->{db}->{$_} = pack 'w*', %post;
+     $db->{'p'.$_}    = $self->sort_postings(\%post);
+     _complain('delete of term', $_) if $db->{'o'.$_}-1 != keys %post;
+     $db->{'o'.$_} = scalar keys %post;
    }
-   delete $self->{db}->{$M, $key};
+   delete $db->{'m'. $key};
  }
  sub intervall {
-Line 210 
 sub intervall {
+Line 304 
 sub intervall {
      ($first) = &{$self->{'ifunc'}}($first) if $first;
      ($last)  = &{$self->{'ifunc'}}($last) if $last;
    }
-   if (defined $first and $first ne '') {         # set the cursor to $first
+   $first = 'p'.($first||'');
-     $dbh->seq($first, $value, R_CURSOR);
+   $last  = (defined $last)?'p'.$last:'q';
-   } else {
-     $dbh->seq($first, $value, R_FIRST);
+   # set the cursor to $first
-   }
+   $dbh->seq($first, $value, R_CURSOR);
-   # We assume that word do not start with the character \377
-   # $last = pack 'C', 0xff unless defined $last and $last ne '';
+   # $first would be after the last word
-   return () if defined $last and $first gt $last; # $first would be after the last word
+   return () if $first gt $last;
-   push @result, $first;
+   push @result, substr($first,1);
    while (!$dbh->seq($word, $value, R_NEXT)) {
      # We should limit this to a "resonable" number of words
-     last if (defined $last and $word gt $last) or $word =~ /^($M|$O)/o;
+     last if $word gt $last;
-     push @result, $word;
+     push @result, substr($word,1);
    }
    \@result;                     # speed
  }
-Line 248 
 sub prefix {
+Line 342 
 sub prefix {
      ($prefix) = &{$self->{'pfunc'}}($prefix);
    }
-   if ($dbh->seq($word = $prefix, $value, R_CURSOR)) {
+   if ($dbh->seq($word = 'p'.$prefix, $value, R_CURSOR)) {
      return ();
    }
-   return () if $word !~ /^$prefix/;
+   return () if $word !~ /^p$prefix/;
-   push @result, $word;
+   push @result, substr($word,1);
    while (!$dbh->seq($word, $value, R_NEXT)) {
      # We should limit this to a "resonable" number of words
-     last if $word !~ /^$prefix/;
+     last if $word !~ /^p$prefix/;
-     push @result, $word;
+     push @result, substr($word,1);
    }
    \@result;                     # speed
  }
+ =head2 search($query)
+ The search method supports a range of search algorithms.  It is
+ recommended to tune the index by calling
+ C<$table-E<gt>set(top=E<gt>1)> B<after> bulk inserting the documents
+ into the table.  This is a computationally intensive operation and all inserts
+ and deletes after this optimization are slightly more expensive.  Once
+ reorganized, the index is kept sorted automatically until you switch
+ the optimization off by calling C<$table-E<gt>set(top=E<gt>0)>.
+ When searching a tuned index, a query can be processed faster if the
+ caller requests only the topmost documents.  This can be done by
+ passing a C<top =E<gt>> I<n> parameter to the search method.
+ For single term queries, the method returns only the I<n> top ranking
+ documents.  For multi term queries two optimized algorithms are
+ available. The first algorithm computes the top n documents
+ approximately but very fast, sacrificing a little bit of precision for
+ speed.  The second algorithm computes the topmost I<n> documents
+ precisely.  This algorithm is slower and should be used only for small
+ values of I<n>.  It can be requested by passing the query attribute
+ C<picky =E<gt> 1>. Both algorithms may return more than I<n> hits.
+ While the picky version might not be faster than the brute force
+ version on average for modest size databases it uses less memory and
+ the processing time is almost linear in the number of query terms, not
+ in the size of the lists.
+ =cut
  sub search {
    my $self  = shift;
+   my $query = shift;
    defined $self->{db} or $self->open;
    $self->sync;
-   $self->search_raw(&{$self->{func}}(@_)); # No call to parse() here
+   $self->search_raw($query, &{$self->{func}}(@_)); # No call to parse() here
  }
  sub parse {
-Line 277 
 sub parse {
+Line 401 
 sub parse {
    &{$self->{func}}(@_);
  }
+ sub keys {                    # Return the raw keys of the index database hash.
+   my $self  = shift;
+   defined $self->{db} or $self->open;   # open the index on first use
+   keys %{$self->{db}};        # stored keys are returned as-is; the insert cache is not synced first
+ }
  sub search_prefix {
    my $self  = shift;
-Line 285 
 sub search_prefix {
+Line 416 
 sub search_prefix {
    $self->search_raw(map($self->prefix($_), @_));
  }
+ sub _complain ($$) {          # Warn, with a stack backtrace, that the index looks inconsistent.
+   my ($action, $term) = @_;   # $action: what was being done; $term: the term or document key involved
+   require Carp;
+   Carp::cluck                 # cluck's result is this sub's return value
+     (sprintf("WAIT database inconsistency during $action [%s]: ".
+              "Please rebuild index\n",
+              $term,));         # $action is interpolated into the format string, so it must not contain '%'
+ }
  sub search_raw {
    my $self  = shift;
-   my %occ;
+   my $query = shift;
    my %score;
-   return () unless @_;
+   # Top $wanted documents must be correct. Zero means all matching
+   # documents.
+   my $wanted = $query->{top};
+   my $strict = $query->{picky};
+   # Return at least $minacc documents. Zero means all matching
+   # documents.
+   # my $minacc = $query->{accus} || $wanted;
+   # Open index and flush cache if necessary
    defined $self->{db} or $self->open;
    $self->sync;
-   grep $occ{$_}++, @_;
-   for (keys %occ) {
+   # We keep duplicates
-     if (defined $self->{db}->{$_}) {
+   my @terms =
-       my %post = unpack 'w*', $self->{db}->{$_};
+     # Sort words by decreasing document frequency
-       my $idf = log($self->{records}/$self->{db}->{$O,$_});
+     sort { $self->{db}->{'o'.$a} <=> $self->{db}->{'o'.$b} }
-       my $did;
+       # check which words occur in the index.
-       for $did (keys %post) {
+       grep { $self->{db}->{'o'.$_} } @_;
-         $score{$did} = 0 unless defined $score{$did}; # perl -w
-         $score{$did} += $post{$did} / $self->{db}->{$M, $did} * $idf
+   return () unless @terms;                 # nothing to search for
-           if $self->{db}->{$M, $did}; # db may be broken
+   # We special-case one term queries here.  If the index was sorted,
+   # choping off the rest of the list will return the same ranking.
+   if ($wanted and @terms == 1) {
+     my $term  = shift @terms;
+     my $idf   = log($self->{records}/$self->{db}->{'o'.$term});
+     my @res;
+     if ($self->{reorg}) { # or not $query->{picky}
+       @res = unpack "w". int(2*$wanted), $self->{db}->{'p'.$term};
+     } else {
+       @res = unpack 'w*',                $self->{db}->{'p'.$term};
+     }
+     for (my $i=1; $i<@res; $i+=2) {
+       $res[$i] /= $self->{db}->{'m'. $res[$i-1]} / $idf;
+     }
+     return @res
+   }
+   # We separate exhaustive search here to avoid overhead and make the
+   # code more readable. The block can be removed without changing the
+   # result.
+   unless ($wanted) {
+     for (@terms) {
+       my $df      = $self->{db}->{'o'.$_};
+       # The frequency *must* be 1 at least since the posting list is nonempty
+       _complain('search for term', $_) and $df = 1 if $df < 1;
+       # Unpack posting list for current query term $_
+       my %post = unpack 'w*', $self->{db}->{'p'.$_};
+       _complain('search for term', $_) if $self->{db}->{'o'.$_} != keys %post;
+       # This is the inverse document frequency. The log of the inverse
+       # fraction of documents the term occurs in.
+       my $idf = log($self->{records}/$df);
+       for my $did (keys %post) {
+         if (my $freq = $self->{db}->{'m'. $did}) {
+           $score{$did} += $post{$did} / $freq * $idf;
+         }
+       }
+     }
+     # warn sprintf "Used %d accumulators\n", scalar keys %score;
+     return %score;
+   }
+   # A sloppy but fast algorithm for multiple term queries.
+   unless ($strict) {
+     for (@terms) {
+       # Unpack posting list for current query term $_
+       my %post = unpack 'w*', $self->{db}->{'p'.$_};
+       # Lookup the number of documents the term occurs in (document frequency)
+       my $occ  = $self->{db}->{'o'.$_};
+       _complain('search for term', $_) if $self->{db}->{'o'.$_} != keys %post;
+       # The frequency *must* be 1 at least since the posting list is nonempty
+       _complain('search for term', $_) and $occ = 1 if $occ < 1;
+       # This is the inverse document frequency. The log of the inverse
+       # fraction of documents the term occurs in.
+       my $idf = log($self->{records}/$occ);
+       # If we have a reasonable number of accumulators, change the
+       # loop to iterate over the accumulators.  This will compromise
+       # quality for better speed.  The algorithm still computes the
+       # exact weights, but the result is not guaranteed to contain the
+       # *best* results.  The database might contain documents better
+       # than the worst returned document.
+       # We process the lists in order of increasing length.  When the
+       # number of accumulators exceeds $wanted, no new documents are
+       # added, only the ranking/weighting of the seen documents is
+       # improved.  The resulting ranking list must be pruned, since only
+       # the top most documents end up near their "optimal" rank.
+       if (keys %score < $wanted) {
+         for my $did (keys %post) {
+           if (my $freq = $self->{db}->{'m'. $did}) {
+             $score{$did} += $post{$did} / $freq * $idf;
+           }
+         }
+       } else {
+         for my $did (keys %score) {
+           next unless exists $post{$did};
+           if (my $freq = $self->{db}->{'m'. $did}) {
+             $score{$did} += $post{$did} / $freq * $idf;
+           }
+         }
+       }
+     }
+     return %score;
+   }
+   my @max; $max[$#terms+1]=0;
+   my @idf;
+   # Preparation loop.  This extra loop makes sense only when "reorg"
+   # and "wanted" are true.  But at the time beeing, keeping the code
+   # for the different search algorithms in one place seems more
+   # desirable than some minor speedup of the brute force version.  We
+   # do cache $idf though.
+   for (my $i = $#terms; $i >=0; $i--) {
+     local $_ = $terms[$i];
+     # Lookup the number of documents the term occurs in (document frequency)
+     my $df      = $self->{db}->{'o'.$_};
+     # The frequency *must* be 1 at least since the posting list is nonempty
+     _complain('search for term', $_) and $df = 1 if $df < 1;
+     # This is the inverse document frequency. The log of the inverse
+     # fraction of documents the term occurs in.
+     $idf[$i] = log($self->{records}/$df);
+     my ($did,$occ);
+     if ($self->{reorg}) {
+       ($did,$occ) = unpack 'w2', $self->{db}->{'p'.$_};
+     } else {                    # Maybe this costs more than it helps
+       ($did,$occ) = unpack 'w2', $self->sort_postings($self->{db}->{'p'.$_});
+     }
+     my $freq      = $self->{db}->{'m'. $did};
+     my $max       = $occ/$freq*$idf[$i];
+     $max[$i]      = $max + $max[$i+1];
+   }
+   # Main loop
+   for my $i (0 .. $#terms) {
+     my $term = $terms[$i];
+     # Unpack posting list for current query term $term. We loose the
+     # sorting order because the assignment to a hash.
+     my %post = unpack 'w*', $self->{db}->{'p'.$term};
+     _complain('search for term', $term)
+       if $self->{db}->{'o'.$term} != keys %post;
+     my $idf  = $idf[$i];
+     my $full;                   # Need to process all postings
+     my $chop;                   # Score necessary to enter the ranking list
+     if (# We know that wanted is true since we especial cased the
+         # exhaustive search.
+         $wanted and
+         # We did sort here if necessary in
+         # the preparation loop
+         # $self->{reorg} and
+         scalar keys %score > $wanted) {
+       $chop = (sort { $b <=> $a } values %score)[$wanted];
+       $full = $max[$i] > $chop;
+     } else {
+       $full = 1;
+     }
+     if ($full) {
+       # We need to inspect the full list. Either $wanted is not given,
+       # the index is not sorted, or we don't have enough accumulators
+       # yet.
+       if (defined $chop) {
+         # We might be able to avoid allocating accumulators
+         for my $did (keys %post) {
+           if (my $freq = $self->{db}->{'m'. $did}) {
+             my $wgt = $post{$did} / $freq * $idf;
+             # We add an accumulator if $wgt exeeds $chop
+             if (exists $score{$did} or $wgt > $chop) {
+               $score{$did} += $wgt;
+             }
+           }
+         }
+       } else {
+         # Allocate acumulators for each seen document.
+         for my $did (keys %post) {
+           if (my $freq = $self->{db}->{'m'. $did}) {
+             $score{$did} += $post{$did} / $freq * $idf;
+           }
+         }
+       }
+     } else {
+       # Update existing accumulators
+       for my $did (keys %score) {
+         next unless exists $post{$did};
+         if (my $freq = $self->{db}->{'m'. $did}) {
+           $score{$did} += $post{$did} / $freq * $idf;
+         }
        }
      }
    }
+   #warn sprintf "Used %d accumulators\n", scalar keys %score;
    %score;
  }
+ sub set {                     # Switch the sorted posting list ("top") optimization on or off.
+   my ($self, $attr, $value) = @_;
+   die "No such indexy attribute: '$attr'" unless $attr eq 'top';   # 'top' is the only attribute handled
+   return delete $self->{reorg} if $value == 0;   # switch off: only the flag is dropped, stored lists are untouched
+   return if     $self->{reorg};     # we are sorted already
+   return unless $self->{mode} & O_RDWR;   # silently do nothing unless the index is open read-write
+   defined $self->{db} or $self->open;
+   $self->sync;                  # flush cached postings before rewriting the lists
+   while (my($key, $value) = each %{$self->{db}}) {   # this $value shadows the argument of the same name
+     next if $key !~ /^p/;       # only keys starting with 'p' (posting lists) are rewritten
+     $self->{db}->{$key} = $self->sort_postings($value);   # packed string in, sorted packed string out
+   }
+   $self->{reorg} = 1;           # mark the index as sorted
+ }
  sub sync {
    my $self = shift;
    if ($self->{mode} & O_RDWR) {
-     print STDERR "\aFlushing $self->{cached} postings\n";
+     print STDERR "Flushing $self->{cached} postings\n" if $self->{cached};
      while (my($key, $value) = each %{$self->{cache}}) {
-       $self->{db}->{$key} .= $value;
+       if ($self->{reorg}) {
-       #delete $self->{cache}->{$key};
+         $self->{db}->{'p'.$key} = $self->sort_postings($self->{db}->{'p'.$key}
+                                                    . $value);
+       } else {
+         $self->{db}->{'p'.$key} .= $value;
+       }
      }
      while (my($key, $value) = each %{$self->{cdict}}) {
-       $self->{db}->{$key} = 0 unless  $self->{db}->{$key};
+       $self->{db}->{'o'.$key} = 0 unless  $self->{db}->{'o'.$key};
-       $self->{db}->{$key} += $value;
+       $self->{db}->{'o'.$key} += $value;
-       #delete $self->{cdict}->{$key};
+     }
-     }
+     $self->{cache}  = {};
-     $self->{cache} = {};
+     $self->{cdict}  = {};
-     $self->{cdict} = {};
-     # print STDERR "*** $self->{cache} ", tied(%{$self->{cache}}), "==\n";
      $self->{cached} = 0;
-     # $self->{dbh}->sync if $self->{dbh};
    }
  }

 Legend:



Removed from v.11
 


changed lines


 
Added in v.22
 Legend:



Removed from v.11
 


changed lines


 
Added in v.22
-Removed from v.11
+Added in v.22

	ViewVC Help
Powered by ViewVC 1.1.26