lib/WAIT/InvertedIndex.pm

#                              -*- Mode: Perl -*- 
# $Basename: InvertedIndex.pm $
# $Revision: 1.30 $
# Author          : Ulrich Pfeifer
# Created On      : Thu Aug  8 13:05:10 1996
# Last Modified By: Ulrich Pfeifer
# Last Modified On: Sat Nov 11 16:32:38 2000
# Language        : CPerl
# 
# (C) Copyright 1996-2000, Ulrich Pfeifer
# 

package WAIT::InvertedIndex;
use strict;
use DB_File;
use Fcntl;
use WAIT::Filter;
use Carp;
use vars qw(%FUNC);

# The dictionary has three different key types:
#  'o'.$word
#
#     The document frequency is the number of documents a term occurs
#     in. The idea is that a term occuring in a significant part of the
#     documents is not too significant.
#
# 'm'.$word
#
#     The maximum term frequency of a document is the frequency of the
#     most frequent term in the document.  It is related to the document
#     length obviously.  A document in which the most frequnet term occurs
#     100 times is probably much longer than a document whichs most
#     frequent term occurs five time.
#
# 'p'.$word
#
#     Under this key we store the actual posting list as pairs of
#     packed integers.

my $no_old_index_support = 0; # do not check for old indices if set

sub new {
  my $type = shift;
  my %parm = @_;
  my $self = {};

  $self->{file}     = $parm{file}     or croak "No file specified";
  $self->{attr}     = $parm{attr}     or croak "No attributes specified";
  $self->{filter}   = $parm{filter};
  $self->{'name'}   = $parm{'name'};
  $self->{records}  = 0;
  for (qw(intervall prefix)) {
    if (exists $parm{$_}) {
      if (ref $parm{$_}) {
        $self->{$_} = [@{$parm{$_}}] # clone
      } else {
        $self->{$_} = $parm{$_}
      }
    }
  }
  bless $self, ref($type) || $type;
}

sub name {$_[0]->{'name'}}

sub _split_pos {
  my ($text, $pos) = @{$_[0]};
  my @result;

  $text =~ s/(^\s+)// and $pos += length($1);
  while ($text =~ s/(^\S+)//) {
    my $word = $1;
    push @result, [$word, $pos];
    $pos += length($word);
    $text =~ s/(^\s+)// and $pos += length($1);
  }
  @result;
}

sub _xfiltergen {
  my $filter = pop @_;

# Oops, we cannot overrule the user's choice. Other filters may kill
# stopwords, such as isotr clobbers "isn't" to "isnt".

#  if ($filter eq 'stop') {      # avoid the slow stopword elimination
#    return _xfiltergen(@_);            # it's cheaper to look them up afterwards
#  }
  if (@_) {
    if ($filter =~ /^split(\d*)/) {
      if ($1) {
        "grep(length(\$_->[0])>=$1, map(&WAIT::Filter::split_pos(\$_), " . _xfiltergen(@_) .'))' ;
      } else {
        "map(&WAIT::Filter::split_pos(\$_), " . _xfiltergen(@_) .')' ;
      }
    } else {
      "map ([&WAIT::Filter::$filter(\$_->[0]), \$_->[1]]," ._xfiltergen(@_) .')';
    }
  } else {
    if ($filter =~ /^split(\d*)/) {
      if ($1) {
        "grep(length(\$_->[0])>=$1, map(&WAIT::Filter::split_pos(\$_), [\$_[0], 0]))" ;
      } else {
        "map(&WAIT::Filter::split_pos(\$_), [\$_[0], 0])" ;
      }
    } else {
      "map ([&WAIT::Filter::$filter(\$_->[0]), \$_->[1]], [\$_[0], 0])";
    }
  }
}

sub parse_pos {
  my $self = shift;

  unless (exists $self->{xfunc}) {
    $self->{xfunc}     =
      eval sprintf("sub {%s}", _xfiltergen(@{$self->{filter}}));
    #printf "\nsub{%s}$@\n", _xfiltergen(@{$self->{filter}});
  }
  &{$self->{xfunc}}($_[0]);
}

sub _filtergen {
  my $filter = pop @_;

  if (@_) {
    "map(&WAIT::Filter::$filter(\$_), " . _filtergen(@_) . ')';
  } else {
    "map(&WAIT::Filter::$filter(\$_), \@_)";
  }
}

sub drop {
  my $self = shift;
  if ((caller)[0] eq 'WAIT::Table') { # Table knows about this
    my $file = $self->{file};

    ! (!-e $file or unlink $file);
  } else {                              # notify our database
    croak ref($self)."::drop called directly";
  }
}

sub is_an_old_index {
  my $self = shift;

  return 0 if $no_old_index_support;
  return $self->{old_index} if exists $self->{old_index};

  # We can only guess if this is an old index. We lookup the first 10
  # $O entries. If all values are integers, we assume that the index
  # is an old one.

  defined $self->{db} or $self->open;
  $self->sync;
  my $dbh = $self->{dbh};       # for convenience

  my $O = pack('C', 0xff)."o";
  my ($word, $value) = ($O.$;);
  $dbh->seq($word, $value, R_CURSOR);
  for (my $i=0; $i<10;$i++) {
    if ($value !~ /^\d+$/) {
      return $self->{old_index} = 0;
    }
    if ($dbh->seq($word, $value, R_NEXT) or # no values left
        $word !~ /^$O/o                     # no $O values left
       ) {
      # we are not sure enough that this is an old index
      return $self->{old_index} = 0;
    }
  }
  return $self->{old_index} = 1;
}

sub open {
  my $self = shift;
  my $file = $self->{file};

  if (defined $self->{dbh}) {
    $self->{dbh};
  } else {
    $self->{func}     =
      eval sprintf("sub {grep /./, %s}", _filtergen(@{$self->{filter}}));
    $self->{dbh} = tie(%{$self->{db}}, 'DB_File', $file,
                       $self->{mode}, 0664, $DB_BTREE);
    $self->{cache} = {}
      if $self->{mode} & O_RDWR;
    $self->{cdict} = {}
      if $self->{mode} & O_RDWR;
    $self->{cached} = 0;
    if (!$no_old_index_support and $self->is_an_old_index()) {
      warn "This is an old index, upgrade you database";
      require WAIT::InvertedIndexOld;
      bless $self, 'WAIT::InvertedIndexOld';
    }
  }
}

sub insert {
  my $self  = shift;
  my $key   = shift;
  my %occ;

  defined $self->{db} or $self->open;
  grep $occ{$_}++, &{$self->{func}}(@_);
  my ($word, $noc);
  $self->{records}++;
  while (($word, $noc) = each %occ) {
    if (defined $self->{cache}->{$word}) {
      $self->{cdict}->{$word}++;
      $self->{cache}->{$word} .= pack 'w2', $key, $noc;
    } else {           
      $self->{cdict}->{$word} = 1;
      $self->{cache}->{$word}  = pack 'w2', $key, $noc;
    }
    $self->{cached}++;
  }
  # This cache limit should be configurable
  $self->sync if $self->{cached} > 100_000;
  my $maxtf = 0;
  for (values %occ) {
    $maxtf = $_ if $_ > $maxtf;
  }
  $self->{db}->{'m'. $key} = $maxtf;
}

# We sort postings by increasing max term frequency (~ by increasing
# document length.  This reduces the quality degradation if we process
# only the first part of a posting list.

sub sort_postings {
  my $self = shift;
  my $post = shift;             # reference to a hash or packed string

  if (ref $post) {
    # we skip the sort part, if the index is not sorted
    return pack('w*', %$post) unless $self->{reorg};
  } else {
    $post = { unpack 'w*', $post };
  }

  my $r = '';

  # Sort posting list by increasing ratio of maximum term frequency (~
  # "document length") and term frequency. This rati multipied by the
  # inverse document frequence gives the score for a term.  This sort
  # order can be exploited for tuning of single term queries.

  for my $did (sort {    $post->{$b} / $self->{db}->{'m'. $b}
                                      <=>
                         $post->{$a} / $self->{db}->{'m'. $a}
                    } keys %$post) {
    $r .= pack 'w2', $did, $post->{$did};
  }
  #warn sprintf "reorg %d %s\n", scalar keys %$post, join ' ', unpack 'w*', $r;
  $r;
}

sub delete {
  my $self  = shift;
  my $key   = shift;
  my %occ;

  my $db;
  defined $self->{db} or $self->open;
  $db = $self->{db};
  $self->sync;
  $self->{records}--;

  # less than zero documents in database?
  _complain('delete of document', $key) and $self->{records} = 0
    if $self->{records} < 0;

  grep $occ{$_}++, &{$self->{func}}(@_);

  for (keys %occ) {# may reorder posting list
    my %post = unpack 'w*', $db->{'p'.$_};
    delete $post{$key};
    $db->{'p'.$_}    = $self->sort_postings(\%post);
    _complain('delete of term', $_) if $db->{'o'.$_}-1 != keys %post;
    $db->{'o'.$_} = scalar keys %post;
  }
  delete $db->{'m'. $key};
}

sub intervall {
  my ($self, $first, $last) = @_;
  my $value = '';
  my $word  = '';
  my @result;

  return unless exists $self->{'intervall'};

  defined $self->{db} or $self->open;
  $self->sync;
  my $dbh = $self->{dbh};       # for convenience

  if (ref $self->{'intervall'}) {
    unless (exists $self->{'ifunc'}) {
      $self->{'ifunc'} =
        eval sprintf("sub {grep /./, %s}", _filtergen(@{$self->{intervall}}));
    }
    ($first) = &{$self->{'ifunc'}}($first) if $first;
    ($last)  = &{$self->{'ifunc'}}($last) if $last;
  }
  $first = 'p'.($first||'');
  $last  = (defined $last)?'p'.$last:'q';

  # set the cursor to $first
  $dbh->seq($first, $value, R_CURSOR);

  # $first would be after the last word
  return () if $first gt $last;
  
  push @result, substr($first,1);
  while (!$dbh->seq($word, $value, R_NEXT)) {
    # We should limit this to a "resonable" number of words
    last if $word gt $last;
    push @result, substr($word,1);
  }
  \@result;                     # speed
}

sub prefix {
  my ($self, $prefix) = @_;
  my $value = '';
  my $word  = '';
  my @result;

  return () unless defined $prefix; # Full dictionary requested !!
  return unless exists $self->{'prefix'};
  defined $self->{db} or $self->open;
  $self->sync;
  my $dbh = $self->{dbh};
  
  if (ref $self->{'prefix'}) {
    unless (exists $self->{'pfunc'}) {
      $self->{'pfunc'} =
        eval sprintf("sub {grep /./, %s}", _filtergen(@{$self->{prefix}}));
    }
    ($prefix) = &{$self->{'pfunc'}}($prefix);
  }

  if ($dbh->seq($word = 'p'.$prefix, $value, R_CURSOR)) {
    return ();
  }
  return () if $word !~ /^p$prefix/;
  push @result, substr($word,1);

  while (!$dbh->seq($word, $value, R_NEXT)) {
    # We should limit this to a "resonable" number of words
    last if $word !~ /^p$prefix/;
    push @result, substr($word,1);
  }
  \@result;                     # speed
}

=head2 search($query)

The search method supports a range of search algorithms.  It is
recommended to tune the index by calling
C<$table-E<gt>set(top=E<gt>1)> B<after> bulk inserting the documents
into the table.  This is a computing intense operation and all inserts
and deletes after this optimization are slightly more expensive.  Once
reorganized, the index is kept sorted automatically until you switch
the optimization off by calling C<$table-E<gt>set(top=E<gt>0)>.

When searching a tuned index, a query can be processed faster if the
caller requests only the topmost documents.  This can be done by
passing a C<top =E<gt>> I<n> parameter to the search method.

For single term queries, the method returns only the I<n> top ranking
documents.  For multi term queries two optimized algorithms are
available. The first algorithm computes the top n documents
approximately but very fast, sacrificing a little bit of precision for
speed.  The second algorithm computes the topmost I<n> documents
precisely.  This algorithm is slower and should be used only for small
values of I<n>.  It can be requested by passing the query attribute
C<picky =E<gt> 1>. Both algorithms may return more than I<n> hits.
While the picky version might not be faster than the brute force
version on average for modest size databases it uses less memory and
the processing time is almost linear in the number of query terms, not
in the size of the lists.

=cut

sub search {
  my $self  = shift;
  my $query = shift;

  defined $self->{db} or $self->open;
  $self->sync;
  $self->search_raw($query, &{$self->{func}}(@_)); # No call to parse() here
}

sub parse {
  my $self  = shift;

  defined $self->{db} or $self->open;
  &{$self->{func}}(@_);
}

sub keys {
  my $self  = shift;

  defined $self->{db} or $self->open;
  keys %{$self->{db}};
}

sub search_prefix {
  my $self  = shift;

  # print "search_prefix(@_)\n";
  defined $self->{db} or $self->open;
  $self->search_raw(map($self->prefix($_), @_));
}

sub _complain ($$) {
  my ($action, $term) = @_;

  require Carp;
  Carp::cluck
    (sprintf("WAIT database inconsistency during $action [%s]: ".
             "Please rebuild index\n",
             $term,));
}

sub search_raw {
  my $self  = shift;
  my $query = shift;
  my %score;

  # Top $wanted documents must be correct. Zero means all matching
  # documents.
  my $wanted = $query->{top};
  my $strict = $query->{picky};

  # Return at least $minacc documents. Zero means all matching
  # documents.
  # my $minacc = $query->{accus} || $wanted;

  # Open index and flush cache if necessary
  defined $self->{db} or $self->open;
  $self->sync;

  # We keep duplicates
  my @terms = 
    # Sort words by decreasing document frequency
    sort { $self->{db}->{'o'.$a} <=> $self->{db}->{'o'.$b} }
      # check which words occur in the index. 
      grep { $self->{db}->{'o'.$_} } @_;

  return () unless @terms;                 # nothing to search for

  # We special-case one term queries here.  If the index was sorted,
  # choping off the rest of the list will return the same ranking.
  if ($wanted and @terms == 1) {
    my $term  = shift @terms;
    my $idf   = log($self->{records}/$self->{db}->{'o'.$term});
    my @res;

    if ($self->{reorg}) { # or not $query->{picky}
      @res = unpack "w". int(2*$wanted), $self->{db}->{'p'.$term};
    } else {
      @res = unpack 'w*',                $self->{db}->{'p'.$term};
    }

    for (my $i=1; $i<@res; $i+=2) {
      $res[$i] /= $self->{db}->{'m'. $res[$i-1]} / $idf;
    }

    return @res
  }

  # We separate exhaustive search here to avoid overhead and make the
  # code more readable. The block can be removed without changing the
  # result.
  unless ($wanted) {
    for (@terms) {
      my $df      = $self->{db}->{'o'.$_};

      # The frequency *must* be 1 at least since the posting list is nonempty
      _complain('search for term', $_) and $df = 1 if $df < 1;

      # Unpack posting list for current query term $_
      my %post = unpack 'w*', $self->{db}->{'p'.$_};

      _complain('search for term', $_) if $self->{db}->{'o'.$_} != keys %post;
      # This is the inverse document frequency. The log of the inverse
      # fraction of documents the term occurs in.
      my $idf = log($self->{records}/$df);
      for my $did (keys %post) {
        if (my $freq = $self->{db}->{'m'. $did}) {
          $score{$did} += $post{$did} / $freq * $idf;
        }
      }
    }
    # warn sprintf "Used %d accumulators\n", scalar keys %score;
    return %score;
  }

  # A sloppy but fast algorithm for multiple term queries.
  unless ($strict) {
    for (@terms) {
      # Unpack posting list for current query term $_
      my %post = unpack 'w*', $self->{db}->{'p'.$_};

      # Lookup the number of documents the term occurs in (document frequency)
      my $occ  = $self->{db}->{'o'.$_};

      _complain('search for term', $_) if $self->{db}->{'o'.$_} != keys %post;
      # The frequency *must* be 1 at least since the posting list is nonempty
      _complain('search for term', $_) and $occ = 1 if $occ < 1;

      # This is the inverse document frequency. The log of the inverse
      # fraction of documents the term occurs in.
      my $idf = log($self->{records}/$occ);

      # If we have a reasonable number of accumulators, change the
      # loop to iterate over the accumulators.  This will compromise
      # quality for better speed.  The algorithm still computes the
      # exact weights, but the result is not guaranteed to contain the
      # *best* results.  The database might contain documents better
      # than the worst returned document.
      
      # We process the lists in order of increasing length.  When the
      # number of accumulators exceeds $wanted, no new documents are
      # added, only the ranking/weighting of the seen documents is
      # improved.  The resulting ranking list must be pruned, since only
      # the top most documents end up near their "optimal" rank.
      
      if (keys %score < $wanted) {
        for my $did (keys %post) {
          if (my $freq = $self->{db}->{'m'. $did}) {
            $score{$did} += $post{$did} / $freq * $idf;
          }
        }
      } else {
        for my $did (keys %score) {
          next unless exists $post{$did};
          if (my $freq = $self->{db}->{'m'. $did}) {
            $score{$did} += $post{$did} / $freq * $idf;
          }
        }
      }
    }
    return %score;
  }
  my @max; $max[$#terms+1]=0;
  my @idf;

  # Preparation loop.  This extra loop makes sense only when "reorg"
  # and "wanted" are true.  But at the time beeing, keeping the code
  # for the different search algorithms in one place seems more
  # desirable than some minor speedup of the brute force version.  We
  # do cache $idf though.

  for (my $i = $#terms; $i >=0; $i--) {
    local $_ = $terms[$i];
    # Lookup the number of documents the term occurs in (document frequency)
    my $df      = $self->{db}->{'o'.$_};

    # The frequency *must* be 1 at least since the posting list is nonempty
    _complain('search for term', $_) and $df = 1 if $df < 1;

    # This is the inverse document frequency. The log of the inverse
    # fraction of documents the term occurs in.
    $idf[$i] = log($self->{records}/$df);

    my ($did,$occ);
    if ($self->{reorg}) {
      ($did,$occ) = unpack 'w2', $self->{db}->{'p'.$_};
    } else {                    # Maybe this costs more than it helps
      ($did,$occ) = unpack 'w2', $self->sort_postings($self->{db}->{'p'.$_});
    }
    my $freq      = $self->{db}->{'m'. $did};
    my $max       = $occ/$freq*$idf[$i];
    $max[$i]      = $max + $max[$i+1];
  }

  # Main loop 
  for my $i (0 .. $#terms) {
    my $term = $terms[$i];
    # Unpack posting list for current query term $term. We loose the
    # sorting order because the assignment to a hash.
    my %post = unpack 'w*', $self->{db}->{'p'.$term};

    _complain('search for term', $term)
      if $self->{db}->{'o'.$term} != keys %post;

    my $idf  = $idf[$i];
    my $full;                   # Need to process all postings
    my $chop;                   # Score necessary to enter the ranking list

    if (# We know that wanted is true since we especial cased the
        # exhaustive search.

        $wanted and

        # We did sort here if necessary in
        # the preparation loop
        # $self->{reorg} and

        scalar keys %score > $wanted) {
      $chop = (sort { $b <=> $a } values %score)[$wanted];
      $full = $max[$i] > $chop;
    } else {
      $full = 1;
    }

    if ($full) {
      # We need to inspect the full list. Either $wanted is not given,
      # the index is not sorted, or we don't have enough accumulators
      # yet.
      if (defined $chop) {
        # We might be able to avoid allocating accumulators
        for my $did (keys %post) {
          if (my $freq = $self->{db}->{'m'. $did}) {
            my $wgt = $post{$did} / $freq * $idf;
            # We add an accumulator if $wgt exeeds $chop
            if (exists $score{$did} or $wgt > $chop) {
              $score{$did} += $wgt;
            }
          }
        }
      } else {
        # Allocate acumulators for each seen document.
        for my $did (keys %post) {
          if (my $freq = $self->{db}->{'m'. $did}) {
            $score{$did} += $post{$did} / $freq * $idf;
          }
        }
      }
    } else {
      # Update existing accumulators
      for my $did (keys %score) {
        next unless exists $post{$did};
        if (my $freq = $self->{db}->{'m'. $did}) {
          $score{$did} += $post{$did} / $freq * $idf;
        }
      }
    }
  }
  #warn sprintf "Used %d accumulators\n", scalar keys %score;
  %score;
}

sub set {
  my ($self, $attr, $value) = @_;

  die "No such indexy attribute: '$attr'" unless $attr eq 'top';

  return delete $self->{reorg} if $value == 0;

  return if     $self->{reorg};     # we are sorted already
  return unless $self->{mode} & O_RDWR;
  defined $self->{db} or $self->open;

  $self->sync;
  while (my($key, $value) = each %{$self->{db}}) {
    next if $key !~ /^p/;
    $self->{db}->{$key} = $self->sort_postings($value);
  }
  $self->{reorg} = 1;
}

sub sync {
  my $self = shift;

  if ($self->{mode} & O_RDWR) {
    print STDERR "Flushing $self->{cached} postings\n" if $self->{cached};
    while (my($key, $value) = each %{$self->{cache}}) {
      if ($self->{reorg}) {
        $self->{db}->{'p'.$key} = $self->sort_postings($self->{db}->{'p'.$key}
                                                   . $value);
      } else {
        $self->{db}->{'p'.$key} .= $value;
      }
    }
    while (my($key, $value) = each %{$self->{cdict}}) {
      $self->{db}->{'o'.$key} = 0 unless  $self->{db}->{'o'.$key};
      $self->{db}->{'o'.$key} += $value;
    }
    $self->{cache}  = {};
    $self->{cdict}  = {};
    $self->{cached} = 0;
  }
}

sub close {
  my $self = shift;

  if ($self->{dbh}) {
    $self->sync;
    delete $self->{dbh};
    untie %{$self->{db}};
    delete $self->{db};
    delete $self->{func};
    delete $self->{cache};
    delete $self->{cached};
    delete $self->{cdict};
    delete $self->{pfunc} if defined $self->{pfunc};
    delete $self->{ifunc} if defined $self->{ifunc};
    delete $self->{xfunc} if defined $self->{xfunc};
  }
}

1;

1	# -- Mode: Perl --
2	# $Basename: InvertedIndex.pm $
3	# $Revision: 1.30 $
4	# Author : Ulrich Pfeifer
5	# Created On : Thu Aug 8 13:05:10 1996
6	# Last Modified By: Ulrich Pfeifer
7	# Last Modified On: Sat Nov 11 16:32:38 2000
8	# Language : CPerl
9	#
10	# (C) Copyright 1996-2000, Ulrich Pfeifer
11	#
12
13	package WAIT::InvertedIndex;
14	use strict;
15	use DB_File;
16	use Fcntl;
17	use WAIT::Filter;
18	use Carp;
19	use vars qw(%FUNC);
20
21	# The dictionary has three different key types:
22	# 'o'.$word
23	#
24	# The document frequency is the number of documents a term occurs
25	# in. The idea is that a term occuring in a significant part of the
26	# documents is not too significant.
27	#
28	# 'm'.$word
29	#
30	# The maximum term frequency of a document is the frequency of the
31	# most frequent term in the document. It is related to the document
32	# length obviously. A document in which the most frequnet term occurs
33	# 100 times is probably much longer than a document whichs most
34	# frequent term occurs five time.
35	#
36	# 'p'.$word
37	#
38	# Under this key we store the actual posting list as pairs of
39	# packed integers.
40
41	my $no_old_index_support = 0; # do not check for old indices if set
42
43	sub new {
44	my $type = shift;
45	my %parm = @_;
46	my $self = {};
47
48	$self->{file} = $parm{file} or croak "No file specified";
49	$self->{attr} = $parm{attr} or croak "No attributes specified";
50	$self->{filter} = $parm{filter};
51	$self->{'name'} = $parm{'name'};
52	$self->{records} = 0;
53	for (qw(intervall prefix)) {
54	if (exists $parm{$_}) {
55	if (ref $parm{$_}) {
56	$self->{$_} = [@{$parm{$_}}] # clone
57	} else {
58	$self->{$_} = $parm{$_}
59	}
60	}
61	}
62	bless $self, ref($type) \|\| $type;
63	}
64
65	sub name {$_[0]->{'name'}}
66
67	sub _split_pos {
68	my ($text, $pos) = @{$_[0]};
69	my @result;
70
71	$text =~ s/(^\s+)// and $pos += length($1);
72	while ($text =~ s/(^\S+)//) {
73	my $word = $1;
74	push @result, [$word, $pos];
75	$pos += length($word);
76	$text =~ s/(^\s+)// and $pos += length($1);
77	}
78	@result;
79	}
80
81	sub _xfiltergen {
82	my $filter = pop @_;
83
84	# Oops, we cannot overrule the user's choice. Other filters may kill
85	# stopwords, such as isotr clobbers "isn't" to "isnt".
86
87	# if ($filter eq 'stop') { # avoid the slow stopword elimination
88	# return _xfiltergen(@_); # it's cheaper to look them up afterwards
89	# }
90	if (@_) {
91	if ($filter =~ /^split(\d*)/) {
92	if ($1) {
93	"grep(length(\$_->[0])>=$1, map(&WAIT::Filter::split_pos(\$_), " . _xfiltergen(@_) .'))' ;
94	} else {
95	"map(&WAIT::Filter::split_pos(\$_), " . _xfiltergen(@_) .')' ;
96	}
97	} else {
98	"map ([&WAIT::Filter::$filter(\$_->[0]), \$_->[1]]," ._xfiltergen(@_) .')';
99	}
100	} else {
101	if ($filter =~ /^split(\d*)/) {
102	if ($1) {
103	"grep(length(\$_->[0])>=$1, map(&WAIT::Filter::split_pos(\$_), [\$_[0], 0]))" ;
104	} else {
105	"map(&WAIT::Filter::split_pos(\$_), [\$_[0], 0])" ;
106	}
107	} else {
108	"map ([&WAIT::Filter::$filter(\$_->[0]), \$_->[1]], [\$_[0], 0])";
109	}
110	}
111	}
112
113	sub parse_pos {
114	my $self = shift;
115
116	unless (exists $self->{xfunc}) {
117	$self->{xfunc} =
118	eval sprintf("sub {%s}", _xfiltergen(@{$self->{filter}}));
119	#printf "\nsub{%s}$@\n", _xfiltergen(@{$self->{filter}});
120	}
121	&{$self->{xfunc}}($_[0]);
122	}
123
124	sub _filtergen {
125	my $filter = pop @_;
126
127	if (@_) {
128	"map(&WAIT::Filter::$filter(\$_), " . _filtergen(@_) . ')';
129	} else {
130	"map(&WAIT::Filter::$filter(\$_), \@_)";
131	}
132	}
133
134	sub drop {
135	my $self = shift;
136	if ((caller)[0] eq 'WAIT::Table') { # Table knows about this
137	my $file = $self->{file};
138
139	! (!-e $file or unlink $file);
140	} else { # notify our database
141	croak ref($self)."::drop called directly";
142	}
143	}
144
145	sub is_an_old_index {
146	my $self = shift;
147
148	return 0 if $no_old_index_support;
149	return $self->{old_index} if exists $self->{old_index};
150
151	# We can only guess if this is an old index. We lookup the first 10
152	# $O entries. If all values are integers, we assume that the index
153	# is an old one.
154
155	defined $self->{db} or $self->open;
156	$self->sync;
157	my $dbh = $self->{dbh}; # for convenience
158
159	my $O = pack('C', 0xff)."o";
160	my ($word, $value) = ($O.$;);
161	$dbh->seq($word, $value, R_CURSOR);
162	for (my $i=0; $i<10;$i++) {
163	if ($value !~ /^\d+$/) {
164	return $self->{old_index} = 0;
165	}
166	if ($dbh->seq($word, $value, R_NEXT) or # no values left
167	$word !~ /^$O/o # no $O values left
168	) {
169	# we are not sure enough that this is an old index
170	return $self->{old_index} = 0;
171	}
172	}
173	return $self->{old_index} = 1;
174	}
175
176	sub open {
177	my $self = shift;
178	my $file = $self->{file};
179
180	if (defined $self->{dbh}) {
181	$self->{dbh};
182	} else {
183	$self->{func} =
184	eval sprintf("sub {grep /./, %s}", _filtergen(@{$self->{filter}}));
185	$self->{dbh} = tie(%{$self->{db}}, 'DB_File', $file,
186	$self->{mode}, 0664, $DB_BTREE);
187	$self->{cache} = {}
188	if $self->{mode} & O_RDWR;
189	$self->{cdict} = {}
190	if $self->{mode} & O_RDWR;
191	$self->{cached} = 0;
192	if (!$no_old_index_support and $self->is_an_old_index()) {
193	warn "This is an old index, upgrade you database";
194	require WAIT::InvertedIndexOld;
195	bless $self, 'WAIT::InvertedIndexOld';
196	}
197	}
198	}
199
200	sub insert {
201	my $self = shift;
202	my $key = shift;
203	my %occ;
204
205	defined $self->{db} or $self->open;
206	grep $occ{$_}++, &{$self->{func}}(@_);
207	my ($word, $noc);
208	$self->{records}++;
209	while (($word, $noc) = each %occ) {
210	if (defined $self->{cache}->{$word}) {
211	$self->{cdict}->{$word}++;
212	$self->{cache}->{$word} .= pack 'w2', $key, $noc;
213	} else {
214	$self->{cdict}->{$word} = 1;
215	$self->{cache}->{$word} = pack 'w2', $key, $noc;
216	}
217	$self->{cached}++;
218	}
219	# This cache limit should be configurable
220	$self->sync if $self->{cached} > 100_000;
221	my $maxtf = 0;
222	for (values %occ) {
223	$maxtf = $_ if $_ > $maxtf;
224	}
225	$self->{db}->{'m'. $key} = $maxtf;
226	}
227
228	# We sort postings by increasing max term frequency (~ by increasing
229	# document length. This reduces the quality degradation if we process
230	# only the first part of a posting list.
231
232	sub sort_postings {
233	my $self = shift;
234	my $post = shift; # reference to a hash or packed string
235
236	if (ref $post) {
237	# we skip the sort part, if the index is not sorted
238	return pack('w*', %$post) unless $self->{reorg};
239	} else {
240	$post = { unpack 'w*', $post };
241	}
242
243	my $r = '';
244
245	# Sort posting list by increasing ratio of maximum term frequency (~
246	# "document length") and term frequency. This rati multipied by the
247	# inverse document frequence gives the score for a term. This sort
248	# order can be exploited for tuning of single term queries.
249
250	for my $did (sort { $post->{$b} / $self->{db}->{'m'. $b}
251	<=>
252	$post->{$a} / $self->{db}->{'m'. $a}
253	} keys %$post) {
254	$r .= pack 'w2', $did, $post->{$did};
255	}
256	#warn sprintf "reorg %d %s\n", scalar keys %$post, join ' ', unpack 'w*', $r;
257	$r;
258	}
259
260	sub delete {
261	my $self = shift;
262	my $key = shift;
263	my %occ;
264
265	my $db;
266	defined $self->{db} or $self->open;
267	$db = $self->{db};
268	$self->sync;
269	$self->{records}--;
270
271	# less than zero documents in database?
272	_complain('delete of document', $key) and $self->{records} = 0
273	if $self->{records} < 0;
274
275	grep $occ{$_}++, &{$self->{func}}(@_);
276
277	for (keys %occ) {# may reorder posting list
278	my %post = unpack 'w*', $db->{'p'.$_};
279	delete $post{$key};
280	$db->{'p'.$_} = $self->sort_postings(\%post);
281	_complain('delete of term', $_) if $db->{'o'.$_}-1 != keys %post;
282	$db->{'o'.$_} = scalar keys %post;
283	}
284	delete $db->{'m'. $key};
285	}
286
287	sub intervall {
288	my ($self, $first, $last) = @_;
289	my $value = '';
290	my $word = '';
291	my @result;
292
293	return unless exists $self->{'intervall'};
294
295	defined $self->{db} or $self->open;
296	$self->sync;
297	my $dbh = $self->{dbh}; # for convenience
298
299	if (ref $self->{'intervall'}) {
300	unless (exists $self->{'ifunc'}) {
301	$self->{'ifunc'} =
302	eval sprintf("sub {grep /./, %s}", _filtergen(@{$self->{intervall}}));
303	}
304	($first) = &{$self->{'ifunc'}}($first) if $first;
305	($last) = &{$self->{'ifunc'}}($last) if $last;
306	}
307	$first = 'p'.($first\|\|'');
308	$last = (defined $last)?'p'.$last:'q';
309
310	# set the cursor to $first
311	$dbh->seq($first, $value, R_CURSOR);
312
313	# $first would be after the last word
314	return () if $first gt $last;
315
316	push @result, substr($first,1);
317	while (!$dbh->seq($word, $value, R_NEXT)) {
318	# We should limit this to a "resonable" number of words
319	last if $word gt $last;
320	push @result, substr($word,1);
321	}
322	\@result; # speed
323	}
324
325	sub prefix {
326	my ($self, $prefix) = @_;
327	my $value = '';
328	my $word = '';
329	my @result;
330
331	return () unless defined $prefix; # Full dictionary requested !!
332	return unless exists $self->{'prefix'};
333	defined $self->{db} or $self->open;
334	$self->sync;
335	my $dbh = $self->{dbh};
336
337	if (ref $self->{'prefix'}) {
338	unless (exists $self->{'pfunc'}) {
339	$self->{'pfunc'} =
340	eval sprintf("sub {grep /./, %s}", _filtergen(@{$self->{prefix}}));
341	}
342	($prefix) = &{$self->{'pfunc'}}($prefix);
343	}
344
345	if ($dbh->seq($word = 'p'.$prefix, $value, R_CURSOR)) {
346	return ();
347	}
348	return () if $word !~ /^p$prefix/;
349	push @result, substr($word,1);
350
351	while (!$dbh->seq($word, $value, R_NEXT)) {
352	# We should limit this to a "resonable" number of words
353	last if $word !~ /^p$prefix/;
354	push @result, substr($word,1);
355	}
356	\@result; # speed
357	}
358
359	=head2 search($query)
360
361	The search method supports a range of search algorithms. It is
362	recommended to tune the index by calling
363	C<$table-E<gt>set(top=E<gt>1)> B<after> bulk inserting the documents
364	into the table. This is a computing intense operation and all inserts
365	and deletes after this optimization are slightly more expensive. Once
366	reorganized, the index is kept sorted automatically until you switch
367	the optimization off by calling C<$table-E<gt>set(top=E<gt>0)>.
368
369	When searching a tuned index, a query can be processed faster if the
370	caller requests only the topmost documents. This can be done by
371	passing a C<top =E<gt>> I<n> parameter to the search method.
372
373	For single term queries, the method returns only the I<n> top ranking
374	documents. For multi term queries two optimized algorithms are
375	available. The first algorithm computes the top n documents
376	approximately but very fast, sacrificing a little bit of precision for
377	speed. The second algorithm computes the topmost I<n> documents
378	precisely. This algorithm is slower and should be used only for small
379	values of I<n>. It can be requested by passing the query attribute
380	C<picky =E<gt> 1>. Both algorithms may return more than I<n> hits.
381	While the picky version might not be faster than the brute force
382	version on average for modest size databases it uses less memory and
383	the processing time is almost linear in the number of query terms, not
384	in the size of the lists.
385
386	=cut
387
388	sub search {
389	my $self = shift;
390	my $query = shift;
391
392	defined $self->{db} or $self->open;
393	$self->sync;
394	$self->search_raw($query, &{$self->{func}}(@_)); # No call to parse() here
395	}
396
397	sub parse {
398	my $self = shift;
399
400	defined $self->{db} or $self->open;
401	&{$self->{func}}(@_);
402	}
403
404	sub keys {
405	my $self = shift;
406
407	defined $self->{db} or $self->open;
408	keys %{$self->{db}};
409	}
410
411	sub search_prefix {
412	my $self = shift;
413
414	# print "search_prefix(@_)\n";
415	defined $self->{db} or $self->open;
416	$self->search_raw(map($self->prefix($_), @_));
417	}
418
419	sub _complain ($$) {
420	my ($action, $term) = @_;
421
422	require Carp;
423	Carp::cluck
424	(sprintf("WAIT database inconsistency during $action [%s]: ".
425	"Please rebuild index\n",
426	$term,));
427	}
428
429	sub search_raw {
430	my $self = shift;
431	my $query = shift;
432	my %score;
433
434	# Top $wanted documents must be correct. Zero means all matching
435	# documents.
436	my $wanted = $query->{top};
437	my $strict = $query->{picky};
438
439	# Return at least $minacc documents. Zero means all matching
440	# documents.
441	# my $minacc = $query->{accus} \|\| $wanted;
442
443	# Open index and flush cache if necessary
444	defined $self->{db} or $self->open;
445	$self->sync;
446
447	# We keep duplicates
448	my @terms =
449	# Sort words by decreasing document frequency
450	sort { $self->{db}->{'o'.$a} <=> $self->{db}->{'o'.$b} }
451	# check which words occur in the index.
452	grep { $self->{db}->{'o'.$_} } @_;
453
454	return () unless @terms; # nothing to search for
455
456	# We special-case one term queries here. If the index was sorted,
457	# choping off the rest of the list will return the same ranking.
458	if ($wanted and @terms == 1) {
459	my $term = shift @terms;
460	my $idf = log($self->{records}/$self->{db}->{'o'.$term});
461	my @res;
462
463	if ($self->{reorg}) { # or not $query->{picky}
464	@res = unpack "w". int(2*$wanted), $self->{db}->{'p'.$term};
465	} else {
466	@res = unpack 'w*', $self->{db}->{'p'.$term};
467	}
468
469	for (my $i=1; $i<@res; $i+=2) {
470	$res[$i] /= $self->{db}->{'m'. $res[$i-1]} / $idf;
471	}
472
473	return @res
474	}
475
476	# We separate exhaustive search here to avoid overhead and make the
477	# code more readable. The block can be removed without changing the
478	# result.
479	unless ($wanted) {
480	for (@terms) {
481	my $df = $self->{db}->{'o'.$_};
482
483	# The frequency must be 1 at least since the posting list is nonempty
484	_complain('search for term', $_) and $df = 1 if $df < 1;
485
486	# Unpack posting list for current query term $_
487	my %post = unpack 'w*', $self->{db}->{'p'.$_};
488
489	_complain('search for term', $_) if $self->{db}->{'o'.$_} != keys %post;
490	# This is the inverse document frequency. The log of the inverse
491	# fraction of documents the term occurs in.
492	my $idf = log($self->{records}/$df);
493	for my $did (keys %post) {
494	if (my $freq = $self->{db}->{'m'. $did}) {
495	$score{$did} += $post{$did} / $freq * $idf;
496	}
497	}
498	}
499	# warn sprintf "Used %d accumulators\n", scalar keys %score;
500	return %score;
501	}
502
503	# A sloppy but fast algorithm for multiple term queries.
504	unless ($strict) {
505	for (@terms) {
506	# Unpack posting list for current query term $_
507	my %post = unpack 'w*', $self->{db}->{'p'.$_};
508
509	# Lookup the number of documents the term occurs in (document frequency)
510	my $occ = $self->{db}->{'o'.$_};
511
512	_complain('search for term', $_) if $self->{db}->{'o'.$_} != keys %post;
513	# The frequency must be 1 at least since the posting list is nonempty
514	_complain('search for term', $_) and $occ = 1 if $occ < 1;
515
516	# This is the inverse document frequency. The log of the inverse
517	# fraction of documents the term occurs in.
518	my $idf = log($self->{records}/$occ);
519
520	# If we have a reasonable number of accumulators, change the
521	# loop to iterate over the accumulators. This will compromise
522	# quality for better speed. The algorithm still computes the
523	# exact weights, but the result is not guaranteed to contain the
524	# best results. The database might contain documents better
525	# than the worst returned document.
526
527	# We process the lists in order of increasing length. When the
528	# number of accumulators exceeds $wanted, no new documents are
529	# added, only the ranking/weighting of the seen documents is
530	# improved. The resulting ranking list must be pruned, since only
531	# the top most documents end up near their "optimal" rank.
532
533	if (keys %score < $wanted) {
534	for my $did (keys %post) {
535	if (my $freq = $self->{db}->{'m'. $did}) {
536	$score{$did} += $post{$did} / $freq * $idf;
537	}
538	}
539	} else {
540	for my $did (keys %score) {
541	next unless exists $post{$did};
542	if (my $freq = $self->{db}->{'m'. $did}) {
543	$score{$did} += $post{$did} / $freq * $idf;
544	}
545	}
546	}
547	}
548	return %score;
549	}
550	my @max; $max[$#terms+1]=0;
551	my @idf;
552
553	# Preparation loop. This extra loop makes sense only when "reorg"
554	# and "wanted" are true. But at the time beeing, keeping the code
555	# for the different search algorithms in one place seems more
556	# desirable than some minor speedup of the brute force version. We
557	# do cache $idf though.
558
559	for (my $i = $#terms; $i >=0; $i--) {
560	local $_ = $terms[$i];
561	# Lookup the number of documents the term occurs in (document frequency)
562	my $df = $self->{db}->{'o'.$_};
563
564	# The frequency must be 1 at least since the posting list is nonempty
565	_complain('search for term', $_) and $df = 1 if $df < 1;
566
567	# This is the inverse document frequency. The log of the inverse
568	# fraction of documents the term occurs in.
569	$idf[$i] = log($self->{records}/$df);
570
571	my ($did,$occ);
572	if ($self->{reorg}) {
573	($did,$occ) = unpack 'w2', $self->{db}->{'p'.$_};
574	} else { # Maybe this costs more than it helps
575	($did,$occ) = unpack 'w2', $self->sort_postings($self->{db}->{'p'.$_});
576	}
577	my $freq = $self->{db}->{'m'. $did};
578	my $max = $occ/$freq*$idf[$i];
579	$max[$i] = $max + $max[$i+1];
580	}
581
582	# Main loop
583	for my $i (0 .. $#terms) {
584	my $term = $terms[$i];
585	# Unpack posting list for current query term $term. We loose the
586	# sorting order because the assignment to a hash.
587	my %post = unpack 'w*', $self->{db}->{'p'.$term};
588
589	_complain('search for term', $term)
590	if $self->{db}->{'o'.$term} != keys %post;
591
592	my $idf = $idf[$i];
593	my $full; # Need to process all postings
594	my $chop; # Score necessary to enter the ranking list
595
596	if (# We know that wanted is true since we especial cased the
597	# exhaustive search.
598
599	$wanted and
600
601	# We did sort here if necessary in
602	# the preparation loop
603	# $self->{reorg} and
604
605	scalar keys %score > $wanted) {
606	$chop = (sort { $b <=> $a } values %score)[$wanted];
607	$full = $max[$i] > $chop;
608	} else {
609	$full = 1;
610	}
611
612	if ($full) {
613	# We need to inspect the full list. Either $wanted is not given,
614	# the index is not sorted, or we don't have enough accumulators
615	# yet.
616	if (defined $chop) {
617	# We might be able to avoid allocating accumulators
618	for my $did (keys %post) {
619	if (my $freq = $self->{db}->{'m'. $did}) {
620	my $wgt = $post{$did} / $freq * $idf;
621	# We add an accumulator if $wgt exeeds $chop
622	if (exists $score{$did} or $wgt > $chop) {
623	$score{$did} += $wgt;
624	}
625	}
626	}
627	} else {
628	# Allocate acumulators for each seen document.
629	for my $did (keys %post) {
630	if (my $freq = $self->{db}->{'m'. $did}) {
631	$score{$did} += $post{$did} / $freq * $idf;
632	}
633	}
634	}
635	} else {
636	# Update existing accumulators
637	for my $did (keys %score) {
638	next unless exists $post{$did};
639	if (my $freq = $self->{db}->{'m'. $did}) {
640	$score{$did} += $post{$did} / $freq * $idf;
641	}
642	}
643	}
644	}
645	#warn sprintf "Used %d accumulators\n", scalar keys %score;
646	%score;
647	}
648
649	sub set {
650	my ($self, $attr, $value) = @_;
651
652	die "No such indexy attribute: '$attr'" unless $attr eq 'top';
653
654	return delete $self->{reorg} if $value == 0;
655
656	return if $self->{reorg}; # we are sorted already
657	return unless $self->{mode} & O_RDWR;
658	defined $self->{db} or $self->open;
659
660	$self->sync;
661	while (my($key, $value) = each %{$self->{db}}) {
662	next if $key !~ /^p/;
663	$self->{db}->{$key} = $self->sort_postings($value);
664	}
665	$self->{reorg} = 1;
666	}
667
668	sub sync {
669	my $self = shift;
670
671	if ($self->{mode} & O_RDWR) {
672	print STDERR "Flushing $self->{cached} postings\n" if $self->{cached};
673	while (my($key, $value) = each %{$self->{cache}}) {
674	if ($self->{reorg}) {
675	$self->{db}->{'p'.$key} = $self->sort_postings($self->{db}->{'p'.$key}
676	. $value);
677	} else {
678	$self->{db}->{'p'.$key} .= $value;
679	}
680	}
681	while (my($key, $value) = each %{$self->{cdict}}) {
682	$self->{db}->{'o'.$key} = 0 unless $self->{db}->{'o'.$key};
683	$self->{db}->{'o'.$key} += $value;
684	}
685	$self->{cache} = {};
686	$self->{cdict} = {};
687	$self->{cached} = 0;
688	}
689	}
690
691	sub close {
692	my $self = shift;
693
694	if ($self->{dbh}) {
695	$self->sync;
696	delete $self->{dbh};
697	untie %{$self->{db}};
698	delete $self->{db};
699	delete $self->{func};
700	delete $self->{cache};
701	delete $self->{cached};
702	delete $self->{cdict};
703	delete $self->{pfunc} if defined $self->{pfunc};
704	delete $self->{ifunc} if defined $self->{ifunc};
705	delete $self->{xfunc} if defined $self->{xfunc};
706	}
707	}
708
709	1;
710