/[wait]/trunk/lib/WAIT/InvertedIndex.pm

This is a repository of my old source code, which isn't updated any more. Go to git.rot13.org for current projects!

Diff of /trunk/lib/WAIT/InvertedIndex.pm

Parent Directory | Revision Log | View Patch Patch

-branches/CPAN/lib/WAIT/InvertedIndex.pm
revision 11 by unknown,
Fri Apr 28 15:41:10 2000 UTC
+trunk/lib/WAIT/InvertedIndex.pm
revision 108 by dpavlin,
Tue Jul 13 17:41:12 2004 UTC
 Line 1
- #                              -*- Mode: Perl -*-
+ # -*- Mode: cperl; fill-column: 79 -*-
- # InvertedIndex.pm --
+ # $Basename: InvertedIndex.pm $
- # ITIID           : $ITI$ $Header $__Header$
+ # $Revision: 1.30 $
  # Author          : Ulrich Pfeifer
  # Created On      : Thu Aug  8 13:05:10 1996
  # Last Modified By: Ulrich Pfeifer
- # Last Modified On: Sun Nov 22 18:44:42 1998
+ # Last Modified On: Mon Apr 22 16:52:01 2002
  # Language        : CPerl
- # Status          : Unknown, Use with caution!
  #
- # Copyright (c) 1996-1997, Ulrich Pfeifer
+ # (C) Copyright 1996-2002, Ulrich Pfeifer
  #
  package WAIT::InvertedIndex;
  use strict;
- use DB_File;
+ use BerkeleyDB;
  use Fcntl;
  use WAIT::Filter;
  use Carp;
- use vars qw(%FUNC);
+ use vars qw(%FUNC $VERSION);
+ use Time::HiRes qw(time);
- my $O = pack('C', 0xff)."o";                  # occurances
+ $VERSION = "2.000"; # others test if we are loaded by checking $VERSION
- my $M = pack('C', 0xff)."m";                  # maxtf
+ use constant DOCFREQ_O     => "o";
+ use constant MAXTF_M       => "m";
+ use constant POSTINGLIST_P => "p";
+ use constant PMATCH        => qr/^(??{POSTINGLIST_P()})/;
+ # The dictionary has three different key types:
+ #  'o'.$word
+ #
+ #     The document frequency is the number of documents a term occurs
+ #     in. The idea is that a term occurring in a significant portion of the
+ #     documents is not too significant.
+ #
+ # 'm'.$did   (keyed by document id, not by word)
+ #
+ #     The maximum term frequency of a document is the frequency of the
+ #     most frequent term in the document.  It is related to the document
+ #     length obviously.  A document in which the most frequent term occurs
+ #     100 times is probably much longer than a document whose most
+ #     frequent term occurs five times.
+ #
+ # 'p'.$word
+ #
+ #     Under this key we store the actual posting list as pairs of
+ #     packed integers.
  sub new {
    my $type = shift;
    my %parm = @_;
    my $self = {};
-   $self->{file}     = $parm{file}     or croak "No file specified";
+   for my $x (qw(file attr subname env maindbfile tablename)) {
-   $self->{attr}     = $parm{attr}     or croak "No attributes specified";
+     $self->{$x}     = $parm{$x}     or croak "No $x specified";
+   }
    $self->{filter}   = $parm{filter};
    $self->{'name'}   = $parm{'name'};
    $self->{records}  = 0;
-Line 44 
 sub new {
+Line 71 
 sub new {
    bless $self, ref($type) || $type;
  }
- sub name {$_[0]->{'name'}}
+ for my $accessor (qw(name maindbfile tablename subname)) {
+   no strict 'refs';
+   *{$accessor} = sub {
+     my($self) = @_;
+     return $self->{$accessor} if $self->{$accessor};
+     require Carp;
+     Carp::confess("accessor $accessor not there");
+   }
+ }
  sub _split_pos {
    my ($text, $pos) = @{$_[0]};
-Line 63 
 sub _split_pos {
+Line 98 
 sub _split_pos {
  sub _xfiltergen {
    my $filter = pop @_;
-   if ($filter eq 'stop') {      # avoid the slow stopword elimination
+ # Oops, we cannot overrule the user's choice. Other filters may kill
-     return _xfiltergen(@_);            # it's cheaper to look them up afterwards
+ # stopwords, such as isotr clobbers "isn't" to "isnt".
-   }
+ #  if ($filter eq 'stop') {      # avoid the slow stopword elimination
+ #    return _xfiltergen(@_);            # it's cheaper to look them up afterwards
+ #  }
    if (@_) {
      if ($filter =~ /^split(\d*)/) {
        if ($1) {
-Line 130 
 sub open {
+Line 168 
 sub open {
    } else {
      $self->{func}     =
        eval sprintf("sub {grep /./, %s}", _filtergen(@{$self->{filter}}));
-     $self->{dbh} = tie(%{$self->{db}}, 'DB_File', $file,
+     my $flags;
-                        $self->{mode}, 0664, $DB_BTREE);
+     if ($self->{mode} & O_RDWR) {
- #    tie(%{$self->{cache}}, 'DB_File', undef,
+       $flags = DB_CREATE; # | DB_INIT_MPOOL | DB_PRIVATE | DB_INIT_CDB;
- #        $self->{mode}, 0664, $DB_BTREE)
+       warn "Flags on inverted $file set to 'writing'";
+     } else {
+       $flags = DB_RDONLY;
+       # warn "Flags on inverted $file set to 'readonly'";
+     }
+     my $filename = $self->maindbfile or die;
+     my $subname  = join("/",$self->tablename || die,$self->subname || die);
+     my $env = $self->{env} || "[undef]";
+     $self->{dbh} = tie(%{$self->{db}}, 'BerkeleyDB::Btree',
+                        # Filename => $file,
+                        Filename => $filename,
+                        $self->{env} ? (Env  => $self->{env}) : (),
+                        Subname => $subname,
+                        Mode => 0664,
+                        Flags => $flags,
+                        $WAIT::Database::Cachesize?(Cachesize => $WAIT::Database::Cachesize):(),
+                        $WAIT::Database::Pagesize?(Pagesize => $WAIT::Database::Pagesize):(),
+                        ) or die "Couldn't tie: $BerkeleyDB::Error; filename=>'$filename', env=>'$env',subname=>'$subname',flags=>'$flags'";
      $self->{cache} = {}
        if $self->{mode} & O_RDWR;
- #    tie(%{$self->{cdict}}, 'DB_File', undef,
- #        $self->{mode}, 0664, $DB_BTREE)
      $self->{cdict} = {}
        if $self->{mode} & O_RDWR;
      $self->{cached} = 0;
-Line 148 
 sub insert {
+Line 201 
 sub insert {
    my $self  = shift;
    my $key   = shift;
    my %occ;
    defined $self->{db} or $self->open;
+   defined $self->{db} or die "open didn't help!!!";
    grep $occ{$_}++, &{$self->{func}}(@_);
    my ($word, $noc);
    $self->{records}++;
    while (($word, $noc) = each %occ) {
      if (defined $self->{cache}->{$word}) {
-       $self->{cdict}->{$O,$word}++;
+       $self->{cdict}->{$word}++;
        $self->{cache}->{$word} .= pack 'w2', $key, $noc;
      } else {
-       $self->{cdict}->{$O,$word} = 1;
+       $self->{cdict}->{$word} = 1;
        $self->{cache}->{$word}  = pack 'w2', $key, $noc;
      }
      $self->{cached}++;
    }
+   # This cache limit should be configurable
    $self->sync if $self->{cached} > 100_000;
    my $maxtf = 0;
    for (values %occ) {
      $maxtf = $_ if $_ > $maxtf;
    }
-   $self->{db}->{$M, $key} = $maxtf;
+   $self->{db}->{MAXTF_M . $key} = $maxtf;
+ }
+ # We sort postings by increasing max term frequency (~ by increasing
+ # document length).  This reduces the quality degradation if we process
+ # only the first part of a posting list.
+ sub sort_postings {
+   my $self = shift;
+   my $post = shift;             # reference to a hash or packed string
+   if (ref $post) {
+     # we skip the sort part, if the index is not sorted
+     return pack('w*', %$post) unless $self->{reorg};
+   } else {
+     $post = { unpack 'w*', $post };
+   }
+   my $r = '';
+   # Sort posting list by increasing ratio of maximum term frequency (~
+   # "document length") and term frequency. This ratio multiplied by the
+   # inverse document frequency gives the score for a term.  This sort
+   # order can be exploited for tuning of single term queries.
+   for my $did (keys %$post) { # sanity check
+     unless ($self->{db}->{MAXTF_M . $did}) {
+       warn "WAIT Warning: DIVZERO threat from did[$did]post[$post]post{did}[$post->{$did}]";
+       $self->{db}->{MAXTF_M . $did} = 1; # fails if we have not opened for writing
+     }
+   }
+   for my $did (sort {    $post->{$b} / $self->{db}->{MAXTF_M . $b}
+                                       <=>
+                          $post->{$a} / $self->{db}->{MAXTF_M . $a}
+                     } keys %$post) {
+     $r .= pack 'w2', $did, $post->{$did};
+   }
+   #warn sprintf "reorg %d %s\n", scalar keys %$post, join ' ', unpack 'w*', $r;
+   $r;
  }
  sub delete {
-Line 176 
 sub delete {
+Line 269 
 sub delete {
    my $key   = shift;
    my %occ;
+   my $db;
    defined $self->{db} or $self->open;
+   $db = $self->{db};
    $self->sync;
    $self->{records}--;
+   # less than zero documents in database?
+   _complain('delete of document', $key) and $self->{records} = 0
+     if $self->{records} < 0;
    grep $occ{$_}++, &{$self->{func}}(@_);
-   for (keys %occ) {
-     # may reorder posting list
+   # Be prepared for "Odd number of elements in hash assignment"
-     my %post = unpack 'w*', $self->{db}->{$_};
+   local $SIG{__WARN__} = sub {
-     $self->{db}->{$O,$_}--;
+     my $warning = shift;
+     chomp $warning;
+     warn "Catching warning[$warning] during delete of key[$key]";
+   };
+   for (keys %occ) {# may reorder posting list
+     my %post = unpack 'w*', $db->{POSTINGLIST_P . $_};
      delete $post{$key};
-     $self->{db}->{$_} = pack 'w*', %post;
+     $db->{POSTINGLIST_P . $_}    = $self->sort_postings(\%post);
+     _complain('delete of term', $_) if $db->{DOCFREQ_O . $_}-1 != keys %post;
+     $db->{DOCFREQ_O . $_} = scalar keys %post;
    }
-   delete $self->{db}->{$M, $key};
+   delete $db->{MAXTF_M . $key};
  }
  sub intervall {
    my ($self, $first, $last) = @_;
-   my $value = '';
-   my $word  = '';
-   my @result;
-   return unless exists $self->{'intervall'};
+   die "intervall broken in this version of WAIT: need to fix the
+   R_CURSOR and R_NEXT lines";
-   defined $self->{db} or $self->open;
+ ####      my $value = '';
-   $self->sync;
+ ####      my $word  = '';
-   my $dbh = $self->{dbh};       # for convenience
+ ####      my @result;
+ ####
-   if (ref $self->{'intervall'}) {
+ ####      return unless exists $self->{'intervall'};
-     unless (exists $self->{'ifunc'}) {
+ ####
-       $self->{'ifunc'} =
+ ####      defined $self->{db} or $self->open;
-         eval sprintf("sub {grep /./, %s}", _filtergen(@{$self->{intervall}}));
+ ####      $self->sync;
-     }
+ ####      my $dbh = $self->{dbh};       # for convenience
-     ($first) = &{$self->{'ifunc'}}($first) if $first;
+ ####
-     ($last)  = &{$self->{'ifunc'}}($last) if $last;
+ ####      if (ref $self->{'intervall'}) {
-   }
+ ####        unless (exists $self->{'ifunc'}) {
-   if (defined $first and $first ne '') {         # set the cursor to $first
+ ####          $self->{'ifunc'} =
-     $dbh->seq($first, $value, R_CURSOR);
+ ####            eval sprintf("sub {grep /./, %s}", _filtergen(@{$self->{intervall}}));
-   } else {
+ ####        }
-     $dbh->seq($first, $value, R_FIRST);
+ ####        ($first) = &{$self->{'ifunc'}}($first) if $first;
-   }
+ ####        ($last)  = &{$self->{'ifunc'}}($last) if $last;
-   # We assume that word do not start with the character \377
+ ####      }
-   # $last = pack 'C', 0xff unless defined $last and $last ne '';
+ ####      $first = POSTINGLIST_P . ($first||'');
-   return () if defined $last and $first gt $last; # $first would be after the last word
+ ####      $last  = (defined $last)?POSTINGLIST_P . $last:'q';
+ ####
-   push @result, $first;
+ ####      # set the cursor to $first
-   while (!$dbh->seq($word, $value, R_NEXT)) {
+ ####      $dbh->seq($first, $value, R_CURSOR);
-     # We should limit this to a "resonable" number of words
+ ####
-     last if (defined $last and $word gt $last) or $word =~ /^($M|$O)/o;
+ ####      # $first would be after the last word
-     push @result, $word;
+ ####      return () if $first gt $last;
-   }
+ ####
-   \@result;                     # speed
+ ####      push @result, substr($first,1);
+ ####      while (!$dbh->seq($word, $value, R_NEXT)) {
+ ####        # We should limit this to a "resonable" number of words
+ ####        last if $word gt $last;
+ ####        push @result, substr($word,1);
+ ####      }
+ ####      \@result;                     # speed
  }
  sub prefix {
    my ($self, $prefix) = @_;
-   my $value = '';
-   my $word  = '';
-   my @result;
-   return () unless defined $prefix; # Full dictionary requested !!
+   die "prefix not supported in this version of WAIT: need to fix the R_CURSOR";
-   return unless exists $self->{'prefix'};
-   defined $self->{db} or $self->open;
-   $self->sync;
-   my $dbh = $self->{dbh};
-   if (ref $self->{'prefix'}) {
-     unless (exists $self->{'pfunc'}) {
-       $self->{'pfunc'} =
-         eval sprintf("sub {grep /./, %s}", _filtergen(@{$self->{prefix}}));
-     }
-     ($prefix) = &{$self->{'pfunc'}}($prefix);
-   }
-   if ($dbh->seq($word = $prefix, $value, R_CURSOR)) {
-     return ();
-   }
-   return () if $word !~ /^$prefix/;
-   push @result, $word;
-   while (!$dbh->seq($word, $value, R_NEXT)) {
+ ####      my $value = '';
-     # We should limit this to a "resonable" number of words
+ ####      my $word  = '';
-     last if $word !~ /^$prefix/;
+ ####      my @result;
-     push @result, $word;
+ ####
-   }
+ ####      return () unless defined $prefix; # Full dictionary requested !!
-   \@result;                     # speed
+ ####      return unless exists $self->{'prefix'};
- }
+ ####      defined $self->{db} or $self->open;
+ ####      $self->sync;
+ ####      my $dbh = $self->{dbh};
+ ####
+ ####      if (ref $self->{'prefix'}) {
+ ####        unless (exists $self->{'pfunc'}) {
+ ####          $self->{'pfunc'} =
+ ####            eval sprintf("sub {grep /./, %s}", _filtergen(@{$self->{prefix}}));
+ ####        }
+ ####        ($prefix) = &{$self->{'pfunc'}}($prefix);
+ ####      }
+ ####
+ ####      if ($dbh->seq($word = POSTINGLIST_P . $prefix, $value, R_CURSOR)) {
+ ####        return ();
+ ####      }
+ ####      return () if $word !~ /^p$prefix/;
+ ####      push @result, substr($word,1);
+ ####
+ ####      while (!$dbh->seq($word, $value, R_NEXT)) {
+ ####        # We should limit this to a "resonable" number of words
+ ####        last if $word !~ /^p$prefix/;
+ ####        push @result, substr($word,1);
+ ####      }
+ ####      \@result;                     # speed
+ }
+ =head2 search($query)
+ The search method supports a range of search algorithms.  It is
+ recommended to tune the index by calling
+ C<$table-E<gt>set(top=E<gt>1)> B<after> bulk inserting the documents
+ into the table.  This is a computationally intensive operation and all inserts
+ and deletes after this optimization are slightly more expensive.  Once
+ reorganized, the index is kept sorted automatically until you switch
+ the optimization off by calling C<$table-E<gt>set(top=E<gt>0)>.
+ When searching a tuned index, a query can be processed faster if the
+ caller requests only the topmost documents.  This can be done by
+ passing a C<top =E<gt>> I<n> parameter to the search method.
+ For single term queries, the method returns only the I<n> top ranking
+ documents.  For multi term queries two optimized algorithms are
+ available. The first algorithm computes the top n documents
+ approximately but very fast, sacrificing a little bit of precision for
+ speed.  The second algorithm computes the topmost I<n> documents
+ precisely.  This algorithm is slower and should be used only for small
+ values of I<n>.  It can be requested by passing the query attribute
+ C<picky =E<gt> 1>. Both algorithms may return more than I<n> hits.
+ While the picky version might not be faster than the brute force
+ version on average for modest size databases it uses less memory and
+ the processing time is almost linear in the number of query terms, not
+ in the size of the lists.
- sub search {
+ =cut
+ sub search_ref {
    my $self  = shift;
+   my $query = shift;
+   my $debugtime = 0;
+   my($time,$entertime);
+   our $STARTTIME;
+   if ($debugtime) {
+     $time = time;
+     $STARTTIME ||= $time;
+     if ($time-$STARTTIME > 5) {
+       $STARTTIME = $time;
+       warn "STARTTIME: $STARTTIME\n";
+     }
+     $entertime = time-$STARTTIME;
+     warn sprintf "ENTER TIME: %.4f\n", $entertime;
+   }
    defined $self->{db} or $self->open;
    $self->sync;
-   $self->search_raw(&{$self->{func}}(@_)); # No call to parse() here
+   my $ref = $self->search_raw_ref($query, &{$self->{func}}(@_)); # No call to parse() there
+   if ($debugtime) {
+     my $leavetime = time-$STARTTIME;
+     warn sprintf "LEAVE TIME: %.4f\n", $leavetime;
+     if ($leavetime-$entertime > .4) {
+       require Data::Dumper;
+       print STDERR "Line " . __LINE__ . ", File: " . __FILE__ . "\n" .
+           Data::Dumper->new([$query,\@_],[qw(query at_)])->Indent(1)->Useqq(1)->Dump; # XXX
+     }
+   }
+   $ref;
  }
  sub parse {
-Line 282 
 sub search_prefix {
+Line 450 
 sub search_prefix {
    # print "search_prefix(@_)\n";
    defined $self->{db} or $self->open;
-   $self->search_raw(map($self->prefix($_), @_));
+   $self->search_raw_ref(map($self->prefix($_), @_));
  }
- sub search_raw {
+ sub _complain ($$) {
+   my ($action, $term) = @_;
+   require Carp;
+   Carp::cluck
+     (sprintf("WAIT database inconsistency during $action [%s]: ".
+              "Please rebuild index\n",
+              $term,));
+ }
+ sub search_raw_ref {
    my $self  = shift;
-   my %occ;
+   my $query = shift;
+   # warn "DEBUG WAIT: search_raw_ref args 2..[@_]";
    my %score;
-   return () unless @_;
+   # Top $top_wanted documents must be correct. Zero means all matching documents.
+   my $top_wanted = $query->{top};
+   my $picky_strict = $query->{picky};
+   # the option is really ignore_excess
+   my $ignore_excess = $query->{ignore_excess};
+   # Return at least $minacc documents. Zero means all matching documents.
+   # my $minacc = $query->{accus} || $top_wanted;
+   # Open index and flush cache if necessary
    defined $self->{db} or $self->open;
    $self->sync;
-   grep $occ{$_}++, @_;
-   for (keys %occ) {
+   # We keep duplicates
-     if (defined $self->{db}->{$_}) {
+   my @terms =
-       my %post = unpack 'w*', $self->{db}->{$_};
+     # Sort words by increasing document frequency (rarest terms first)
-       my $idf = log($self->{records}/$self->{db}->{$O,$_});
+     sort { $self->{db}->{DOCFREQ_O . $a} <=> $self->{db}->{DOCFREQ_O . $b} }
-       my $did;
+       # check which words occur in the index.
-       for $did (keys %post) {
+       grep { $self->{db}->{DOCFREQ_O . $_} } @_;
-         $score{$did} = 0 unless defined $score{$did}; # perl -w
-         $score{$did} += $post{$did} / $self->{db}->{$M, $did} * $idf
+   # warn "DEBUG WAIT: wanted[$top_wanted]terms[@terms]";
-           if $self->{db}->{$M, $did}; # db may be broken
+   return unless @terms;
+   # We special-case one term queries here.  If the index was sorted,
+   # chopping off the rest of the list will return the same ranking.
+   if ($top_wanted and @terms == 1) {
+     my $term  = shift @terms;
+     my $idf   = log($self->{records}/$self->{db}->{DOCFREQ_O . $term});
+     my @res;
+     if ($self->{reorg}) { # or not $query->{picky}
+       @res = unpack "w". int(2*$top_wanted), $self->{db}->{POSTINGLIST_P . $term};
+       # warn sprintf "DEBUG WAIT: scalar(\@res)[%d]", scalar(@res);
+     } else {
+       @res = unpack 'w*',                $self->{db}->{POSTINGLIST_P . $term};
+     }
+     for (my $i=1; $i<@res; $i+=2) {
+       # $res[$i] /= $self->{db}->{MAXTF_M . $res[$i-1]} / $idf;
+       # above was written badly, allows two DIV_ZERO problems.
+       my $maxtf = $self->{db}->{MAXTF_M . $res[$i-1]};
+       unless ($maxtf) {
+         warn "WAIT-Warning: Averting DIVZERO for i[$i] \$res[\$i-1][$res[$i-1]] term[$term]";
+         $maxtf = 1;
+       }
+       $res[$i] = ($res[$i] / $maxtf) * $idf;
+     }
+     my %res = @res; # silly: @res would already have been sorted
+     return \%res;
+   }
+   # We separate exhaustive search here to avoid overhead and make the
+   # code more readable. The block can be removed without changing the
+   # result.
+   unless ($top_wanted) {
+     for (@terms) {
+       my $df      = $self->{db}->{DOCFREQ_O . $_};
+       # The frequency *must* be 1 at least since the posting list is nonempty
+       _complain('search for term', $_) and $df = 1 if $df < 1;
+       # Unpack posting list for current query term $_
+       my %post = unpack 'w*', $self->{db}->{POSTINGLIST_P . $_};
+       _complain('search for term', $_) if $self->{db}->{DOCFREQ_O . $_} != keys %post;
+       # This is the inverse document frequency. The log of the inverse
+       # fraction of documents the term occurs in.
+       my $idf = log($self->{records}/$df);
+       for my $did (keys %post) {
+         if (my $freq = $self->{db}->{MAXTF_M . $did}) {
+           $score{$did} += $post{$did} / $freq * $idf;
+         }
+       }
+     }
+     # warn sprintf "Used %d accumulators\n", scalar keys %score;
+     return \%score;
+   }
+   # A sloppy but fast algorithm for multiple term queries.
+   unless ($picky_strict) {
+     for (@terms) {
+       # Unpack posting list for current query term $_
+       my %post;
+       if ($self->{reorg} && $top_wanted && $ignore_excess) {
+         %post = unpack 'w'. int(2*$ignore_excess) , $self->{db}->{POSTINGLIST_P . $_};
+       } else {
+         %post = unpack 'w*',                        $self->{db}->{POSTINGLIST_P . $_};
+       }
+       # warn sprintf "DEBUG WAIT: term[%s] keys %%post[%s]", $_, scalar keys %post;
+       # Lookup the number of documents the term occurs in (document frequency)
+       my $occ  = $self->{db}->{DOCFREQ_O . $_};
+       _complain('search for term', $_) if !$ignore_excess && $occ != keys %post;
+       # The frequency *must* be 1 at least since the posting list is nonempty
+       _complain('search for term', $_) and $occ = 1 if $occ < 1;
+       # This is the inverse document frequency. The log of the inverse fraction
+       # of documents the term occurs in.
+       my $idf = log($self->{records}/$occ);
+       # If we have a reasonable number of accumulators, change the
+       # loop to iterate over the accumulators.  This will compromise
+       # quality for better speed.  The algorithm still computes the
+       # exact weights, but the result is not guaranteed to contain the
+       # *best* results.  The database might contain documents better
+       # than the worst returned document.
+       # We process the lists in order of increasing length.  When the
+       # number of accumulators exceeds $wanted, no new documents are
+       # added, only the ranking/weighting of the seen documents is
+       # improved.  The resulting ranking list must be pruned, since only
+       # the top most documents end up near their "optimal" rank.
+       if (keys %score < $top_wanted) {
+         # The following loop is (WAS!) the killer for the search "mysql
+         # für dummies bellomo". It eats 3.1+1.7 seconds.
+         # The first reason is that 3 terms have not yet yielded enough,
+         # but the fourth yields far too much. The second reason is that we
+         # do so many lookups in $self->{db}. The arithmetic, by contrast,
+         # is presumably cheap.
+         for my $did (keys %post) {
+           if (my $freq = $self->{db}->{MAXTF_M . $did}) {
+             $score{$did} += $post{$did} / $freq * $idf;
+           }
+         }
+       } else {
+         for my $did (keys %score) {
+           next unless exists $post{$did};
+           if (my $freq = $self->{db}->{MAXTF_M . $did}) {
+             $score{$did} += $post{$did} / $freq * $idf;
+           }
+         }
        }
      }
+     warn sprintf("DEBUG WAIT: returning from search_raw_ref at [%.3f] after terms[%s] with keys[%d]",
+                  time,
+                  join(":",@terms),
+                  scalar keys %score,
+                 );
+     return \%score;
+   }
+   my @max; $max[$#terms+1]=0;
+   my @idf;
+   # Preparation loop.  This extra loop makes sense only when "reorg"
+   # and "wanted" are true.  But for the time being, keeping the code
+   # for the different search algorithms in one place seems more
+   # desirable than some minor speedup of the brute force version.  We
+   # do cache $idf though.
+   for (my $i = $#terms; $i >=0; $i--) {
+     local $_ = $terms[$i];
+     # Lookup the number of documents the term occurs in (document frequency)
+     my $df      = $self->{db}->{DOCFREQ_O . $_};
+     # The frequency *must* be 1 at least since the posting list is nonempty
+     _complain('search for term', $_) and $df = 1 if $df < 1;
+     # This is the inverse document frequency. The log of the inverse
+     # fraction of documents the term occurs in.
+     $idf[$i] = log($self->{records}/$df);
+     my ($did,$occ);
+     if ($self->{reorg}) {
+       ($did,$occ) = unpack 'w2', $self->{db}->{POSTINGLIST_P . $_};
+     } else {                    # Maybe this costs more than it helps
+       ($did,$occ) = unpack 'w2', $self->sort_postings($self->{db}->{POSTINGLIST_P . $_});
+     }
+     my $freq      = $self->{db}->{MAXTF_M . $did};
+     my $max       = $occ/$freq*$idf[$i];
+     $max[$i]      = $max + $max[$i+1];
    }
-   %score;
+   # Main loop
+   for my $i (0 .. $#terms) {
+     my $term = $terms[$i];
+     # Unpack posting list for current query term $term. We lose the
+     # sorting order because of the assignment to a hash.
+     my %post = unpack 'w*', $self->{db}->{POSTINGLIST_P . $term};
+     _complain('search for term', $term)
+       if $self->{db}->{DOCFREQ_O . $term} != keys %post;
+     my $idf  = $idf[$i];
+     my $full;                   # Need to process all postings
+     my $chop;                   # Score necessary to enter the ranking list
+     if (# We know that wanted is true since we special cased the
+         # exhaustive search.
+         $top_wanted and
+         # We did sort here if necessary in the preparation loop:
+         # $self->{reorg} and
+         scalar keys %score > $top_wanted) {
+       $chop = (sort { $b <=> $a } values %score)[$top_wanted];
+       $full = $max[$i] > $chop;
+     } else {
+       $full = 1;
+     }
+     if ($full) {
+       # We need to inspect the full list. Either $top_wanted is not given,
+       # the index is not sorted, or we don't have enough accumulators
+       # yet.
+       if (defined $chop) {
+         # We might be able to avoid allocating accumulators
+         for my $did (keys %post) {
+           if (my $freq = $self->{db}->{MAXTF_M . $did}) {
+             my $wgt = $post{$did} / $freq * $idf;
+             # We add an accumulator if $wgt exceeds $chop
+             if (exists $score{$did} or $wgt > $chop) {
+               $score{$did} += $wgt;
+             }
+           }
+         }
+       } else {
+         # Allocate accumulators for each seen document.
+         for my $did (keys %post) {
+           if (my $freq = $self->{db}->{MAXTF_M . $did}) {
+             $score{$did} += $post{$did} / $freq * $idf;
+           }
+         }
+       }
+     } else {
+       # Update existing accumulators
+       for my $did (keys %score) {
+         next unless exists $post{$did};
+         if (my $freq = $self->{db}->{MAXTF_M . $did}) {
+           $score{$did} += $post{$did} / $freq * $idf;
+         }
+       }
+     }
+   }
+   #warn sprintf "Used %d accumulators\n", scalar keys %score;
+   \%score;
+ }
+ sub set {
+   my ($self, $attr, $value) = @_;
+   die "No such index attribute: '$attr'" unless $attr eq 'top';
+   return delete $self->{reorg} if $value == 0;
+   return if     $self->{reorg};     # we are sorted already
+   return unless $self->{mode} & O_RDWR;
+   defined $self->{db} or $self->open;
+   $self->sync;
+   while (my($key, $value) = each %{$self->{db}}) {
+     next if $key !~ /^p/; # some day use PMATCH
+     $self->{db}{$key} = $self->sort_postings($value);
+   }
+   $self->{reorg} = 1;
  }
  sub sync {
    my $self = shift;
+   return unless $self->{mode} & O_RDWR;
-   if ($self->{mode} & O_RDWR) {
+   Carp::carp(sprintf "[%s] Flushing %d postings", scalar(localtime), $self->{cached})
-     print STDERR "\aFlushing $self->{cached} postings\n";
+         if $self->{cached};
-     while (my($key, $value) = each %{$self->{cache}}) {
+   while (my($key, $value) = each %{$self->{cache}}) {
-       $self->{db}->{$key} .= $value;
+     $self->{db}{POSTINGLIST_P . $key} ||= "";
-       #delete $self->{cache}->{$key};
+     if ($self->{reorg}) {
-     }
+       $self->{db}->{POSTINGLIST_P . $key} =
-     while (my($key, $value) = each %{$self->{cdict}}) {
+           $self->sort_postings($self->{db}->{POSTINGLIST_P . $key}
-       $self->{db}->{$key} = 0 unless  $self->{db}->{$key};
+                                . $value);
-       $self->{db}->{$key} += $value;
+     } else {
-       #delete $self->{cdict}->{$key};
+       $self->{db}->{POSTINGLIST_P . $key} .= $value;
      }
-     $self->{cache} = {};
-     $self->{cdict} = {};
-     # print STDERR "*** $self->{cache} ", tied(%{$self->{cache}}), "==\n";
-     $self->{cached} = 0;
-     # $self->{dbh}->sync if $self->{dbh};
    }
+   while (my($key, $value) = each %{$self->{cdict}}) {
+     $self->{db}->{DOCFREQ_O . $key} = 0 unless  $self->{db}->{DOCFREQ_O . $key};
+     $self->{db}->{DOCFREQ_O . $key} += $value;
+   }
+   $self->{cache}  = {};
+   $self->{cdict}  = {};
+   $self->{cached} = 0;
  }
  sub close {
    my $self = shift;
+   delete $self->{env};
    if ($self->{dbh}) {
      $self->sync;
      delete $self->{dbh};
      untie %{$self->{db}};
-     delete $self->{db};
+     for my $att (qw(db func cache cached cdict file maindbfile)) {
-     delete $self->{func};
+       delete $self->{$att};
-     delete $self->{cache};
+     }
-     delete $self->{cached};
+     for my $att (qw(pfunc ifunc xfunc)) {
-     delete $self->{cdict};
+       delete $self->{$att} if defined $self->{$att};
-     delete $self->{pfunc} if defined $self->{pfunc};
+     }
-     delete $self->{ifunc} if defined $self->{ifunc};
-     delete $self->{xfunc} if defined $self->{xfunc};
    }
  }
+ sub keys {
+   my $self  = shift;
+   defined $self->{db} or $self->open;
+   keys %{$self->{db}};
+ }
 ;

 Legend:



Removed from v.11
 


changed lines


 
Added in v.108
 Legend:



Removed from v.11
 


changed lines


 
Added in v.108
-Removed from v.11
+Added in v.108

	ViewVC Help
Powered by ViewVC 1.1.26