lib/WAIT/InvertedIndex.pm

#                              -*- Mode: Perl -*- 
# $Basename: InvertedIndex.pm $
# $Revision: 1.30 $
# Author          : Ulrich Pfeifer
# Created On      : Thu Aug  8 13:05:10 1996
# Last Modified By: Ulrich Pfeifer
# Last Modified On: Tue May  9 08:33:28 2000
# Language        : CPerl
# 
# (C) Copyright 1996-2000, Ulrich Pfeifer
# 

package WAIT::InvertedIndex;
use strict;
use DB_File;
use Fcntl;
use WAIT::Filter;
use Carp;
use vars qw(%FUNC);

my $O = pack('C', 0xff)."o";                  # occurrences (document frequency)

# The document frequency is the number of documents a term occurs
# in. The idea is that a term occurring in a significant part of the
# documents is not too significant.  The count for a term is kept
# under the composite hash key {$O, $term}.

my $M = pack('C', 0xff)."m";                  # maxtf (maximum term frequency)

# The maximum term frequency of a document is the frequency of the
# most frequent term in the document.  It is obviously related to the
# document length.  A document in which the most frequent term occurs
# 100 times is probably much longer than a document whose most
# frequent term occurs five times.  The value for a document is kept
# under the composite hash key {$M, $key}.

# Constructor.  Expects a flat list of name => value parameters:
#
#   file      - name of the index file (required, must be true)
#   attr      - attribute specification (required, must be true);
#               stored as given
#   filter    - reference to the list of WAIT::Filter function names
#               applied to indexed text; stored as given
#   name      - name of the index; stored as given
#   intervall - optional; a reference is copied (shallow), any other
#   prefix      value is stored as is
#
# May be called on a class name or on an existing object, in which
# case the new object is blessed into the class of that object.
# Croaks if 'file' or 'attr' is missing.
sub new {
  my ($type, %parm) = @_;

  croak "No file specified"       unless $parm{file};
  croak "No attributes specified" unless $parm{attr};

  my $self = {
              file     => $parm{file},
              attr     => $parm{attr},
              filter   => $parm{filter},
              'name'   => $parm{'name'},
              records  => 0,
             };

  for my $option (qw(intervall prefix)) {
    next unless exists $parm{$option};
    my $setting = $parm{$option};
    # Clone list references so later changes by the caller do not
    # leak into the index object.
    $self->{$option} = ref $setting ? [@{$setting}] : $setting;
  }

  return bless $self, ref($type) || $type;
}

# Accessor: returns the name this index was created with.
sub name {
  my $self = shift;
  return $self->{'name'};
}

# Splits a text into whitespace separated words and records where each
# word starts.
#
# Takes a reference to a [$text, $offset] pair and returns a list of
# [$word, $position] pairs, where $position is $offset plus the number
# of characters preceding the word in $text.
sub _split_pos {
  my ($rest, $offset) = @{$_[0]};
  my @pairs;

  # Consume the text from the left: skip a run of whitespace, then
  # take a run of non-whitespace, advancing the offset by whatever was
  # removed.
  for (;;) {
    if ($rest =~ s/(^\s+)//) {
      $offset += length($1);
    }
    last unless $rest =~ s/(^\S+)//;
    my $token = $1;
    push @pairs, [$token, $offset];
    $offset += length($token);
  }
  @pairs;
}

# Generates the Perl source of the position preserving filter chain
# used by parse_pos().
#
# The arguments are filter names in order of application.  The last
# name becomes the outermost expression; the remaining names are
# handled recursively and supply its input.  The innermost input is
# the pair [$_[0], 0], i.e. the text to parse at position 0.
#
#   split, splitN - map through WAIT::Filter::split_pos; with a true N
#                   only words of at least N characters are kept
#   anything else - apply WAIT::Filter::<name> to the word and carry
#                   the position along
#
# Note that the 'stop' filter is deliberately not skipped here: we
# cannot overrule the user's choice, since other filters may turn a
# word into a stopword (isotr clobbers "isn't" to "isnt").
sub _xfiltergen {
  my @inner  = @_;
  my $filter = pop @inner;

  my $is_split = $filter =~ /^split(\d*)/;
  my $minlen   = $is_split ? $1 : '';

  # Source of the [word, position] pairs fed into this filter.
  my $source = @inner ? _xfiltergen(@inner) : '[$_[0], 0]';

  unless ($is_split) {
    # The separator differs between the two cases only by a blank; it
    # is reproduced so the generated text stays the same.
    my $glue = @inner ? ',' : ', ';
    return "map ([&WAIT::Filter::$filter(\$_->[0]), \$_->[1]]"
      . $glue . $source . ')';
  }

  my $split = 'map(&WAIT::Filter::split_pos($_), ' . $source . ')';
  return $minlen
    ? "grep(length(\$_->[0])>=$minlen, " . $split . ')'
    : $split;
}

# Runs the filter chain of this index over a text while keeping track
# of word positions.
#
# Takes the text as its only argument and returns whatever the
# generated chain yields (see _xfiltergen: [word, position] pairs).
# The chain is compiled on first use and cached in $self->{xfunc}.
# Croaks if the generated code does not compile.
#
# NOTE: the filter names are interpolated into source code that is
# eval'ed; they must come from trusted configuration only.
sub parse_pos {
  my $self = shift;

  unless (exists $self->{xfunc}) {
    my $code = sprintf("sub {%s}", _xfiltergen(@{$self->{filter}}));
    my $func = eval $code;
    # Report a broken chain here, with the reason, instead of failing
    # below with "undefined value as a subroutine reference".
    croak "Cannot compile position filter chain ($code): $@"
      unless $func;
    $self->{xfunc} = $func;
  }
  &{$self->{xfunc}}($_[0]);
}

# Generates the Perl source of a plain filter chain: nested map
# expressions that apply the named WAIT::Filter functions to @_.  The
# first name is applied first (innermost map), the last name last
# (outermost map).
sub _filtergen {
  # Without any name the chain consists of a single map with an empty
  # function name.
  my @chain = @_ ? @_ : (undef);

  my $code = '@_';
  for my $filter (@chain) {
    $code = "map(&WAIT::Filter::$filter(\$_), " . $code . ')';
  }
  return $code;
}

# Removes the index file.  Only code in package WAIT::Table may call
# this (the table has to know that the index is gone); any other
# caller gets an exception.
#
# Returns true only if the file exists and could not be unlinked;
# returns the empty string if the file was removed or did not exist.
sub drop {
  my $self = shift;
  my ($calling_package) = caller;

  croak ref($self)."::drop called directly"
    unless $calling_package eq 'WAIT::Table';

  my $file = $self->{file};
  return !1 unless -e $file;    # nothing to remove
  return !unlink $file;         # true signals a failed unlink
}

# Opens the index: compiles the filter chain into $self->{func} and
# ties $self->{db} to the DB_File B-tree in $self->{file}, using the
# flags in $self->{mode}.  For a writable index (O_RDWR) the posting
# cache and the document frequency cache are initialised as well.
#
# Returns the DB_File handle if the index is open already; after a
# fresh open the return value is that of the final assignment (0).
#
# Failure handling:
#   - a filter chain that does not compile is fatal, since nothing
#     can be indexed or searched without it;
#   - a failed tie of a writable index is fatal: continuing would
#     collect all inserts in a plain in-memory hash and lose them;
#   - a failed tie of a read-only index only warns, and the index
#     keeps behaving like an empty one (methods that need the handle
#     itself, such as prefix and intervall, still cannot work).
#
# NOTE: the filter names are interpolated into source code that is
# eval'ed; they must come from trusted configuration only.
sub open {
  my $self = shift;
  my $file = $self->{file};

  return $self->{dbh} if defined $self->{dbh};

  my $code = sprintf("sub {grep /./, %s}", _filtergen(@{$self->{filter}}));
  my $func = eval $code;
  croak "Cannot compile filter chain for index '$file' ($code): $@"
    unless $func;
  $self->{func} = $func;

  my $writable = $self->{mode} & O_RDWR;

  $self->{dbh} = tie(%{$self->{db}}, 'DB_File', $file,
                     $self->{mode}, 0664, $DB_BTREE);
  unless ($self->{dbh}) {
    my $problem = "Cannot tie index file '$file': $!";
    if ($writable) {
      # Do not leave a half-open object behind: the autovivified hash
      # would make later calls believe the index is open.
      delete $self->{db};
      delete $self->{dbh};
      croak $problem;
    }
    carp $problem;
  }

  if ($writable) {
    $self->{cache} = {};
    $self->{cdict} = {};
  }
  $self->{cached} = 0;
}

# Adds a document to the index.
#
# Parameters:
#   $key - identifier of the document; packed as a BER integer
#   @_   - the text(s) to index; they are run through the filter
#          chain and the resulting words are counted
#
# The postings are collected in the in-memory cache and the document
# frequencies in the cdict cache; both are written out by sync, which
# is triggered here once more than 100_000 postings are cached (this
# limit should be configurable).  The maximum term frequency of the
# document is written to the database directly.
#
# Returns the maximum term frequency of the document.
sub insert {
  my $self  = shift;
  my $key   = shift;
  my %occ;

  defined $self->{db} or $self->open;
  $occ{$_}++ for &{$self->{func}}(@_);
  $self->{records}++;

  my $maxtf = 0;
  while (my ($word, $noc) = each %occ) {
    $maxtf = $noc if $noc > $maxtf;
    if (defined $self->{cache}->{$word}) {
      $self->{cdict}->{$O,$word}++;
      $self->{cache}->{$word} .= pack 'w2', $key, $noc;
    } else {
      $self->{cdict}->{$O,$word} = 1;
      $self->{cache}->{$word}  = pack 'w2', $key, $noc;
    }
    $self->{cached}++;
  }

  # The maximum term frequency must be on record before the cache is
  # flushed: on a reorganised index sync sorts the posting lists by
  # tf/maxtf and needs the value for this document, too.
  $self->{db}->{$M, $key} = $maxtf;

  # This cache limit should be configurable
  $self->sync if $self->{cached} > 100_000;

  return $maxtf;
}

# We sort postings by decreasing ratio of term frequency to maximum
# term frequency (the latter approximates the document length).  This
# reduces the quality degradation if we process only the first part of
# a posting list.

# Packs a posting list, sorted for top-n retrieval if required.
#
# Parameters:
#   $post - either a reference to a hash mapping document ids to term
#           frequencies, or a packed posting list (BER integers,
#           alternating document id and term frequency)
#
# A hash reference is packed without sorting unless the index is
# reorganised ($self->{reorg}); a packed string is always sorted.
#
# Returns the packed posting list.
sub sort_postings {
  my $self = shift;
  my $post = shift;             # reference to a hash or packed string

  if (ref $post) {
    # we skip the sort part, if the index is not sorted
    return pack('w*', %$post) unless $self->{reorg};
  } else {
    $post = { unpack 'w*', $post };
  }

  # Sort the posting list by decreasing ratio of term frequency and
  # maximum term frequency (~ "document length").  This ratio
  # multiplied by the inverse document frequency gives the score for a
  # term, so the sort order can be exploited for tuning of single term
  # queries.
  #
  # The ratio is computed once per document: this keeps the database
  # lookups out of the sort comparison, and a document without a
  # recorded maximum term frequency (which the search routines ignore
  # as well) sorts last instead of causing a division by zero.
  my %weight;
  for my $did (keys %$post) {
    my $maxtf = $self->{db}->{$M, $did};
    $weight{$did} = $maxtf ? $post->{$did} / $maxtf : 0;
  }

  my $r = '';
  for my $did (sort { $weight{$b} <=> $weight{$a} } keys %$post) {
    $r .= pack 'w2', $did, $post->{$did};
  }
  #warn sprintf "reorg %d %s\n", scalar keys %$post, join ' ', unpack 'w*', $r;
  $r;
}

# Removes a document from the index.
#
# Parameters:
#   $key - identifier of the document
#   @_   - the text(s) the document was indexed with; they are run
#          through the filter chain to find the affected terms
#
# Pending inserts are flushed first.  For every affected term the
# document is taken out of the posting list and the document frequency
# is set to the length of the remaining list; inconsistencies between
# the stored counts and the lists are reported via _complain.
#
# Returns the result of deleting the maximum term frequency entry of
# the document.
sub delete {
  my $self = shift;
  my $key  = shift;

  $self->open unless defined $self->{db};
  my $db = $self->{db};
  $self->sync;

  # less than zero documents in database?
  if (--$self->{records} < 0) {
    _complain('delete of document', $key) and $self->{records} = 0;
  }

  my %seen;
  $seen{$_}++ for $self->{func}->(@_);

  for my $term (keys %seen) {   # may reorder posting list
    my %post = unpack 'w*', $db->{$term};
    delete $post{$key};
    $db->{$term} = $self->sort_postings(\%post);

    my $remaining = keys %post;
    _complain('delete of term', $term) if $db->{$O,$term}-1 != $remaining;
    $db->{$O,$term} = $remaining;
  }
  delete $db->{$M, $key};
}

# Lists the index terms in the range $first .. $last (inclusive).
#
# Parameters:
#   $first - lower bound; undefined or empty starts at the first term
#   $last  - upper bound; undefined means no upper bound
#
# If the 'intervall' attribute of the index is a list of filter names,
# true bounds are run through that filter chain first.
#
# Returns a reference to the list of terms.  Returns nothing if the
# index was not created with the 'intervall' attribute, and the empty
# list if no term lies in the range.  Croaks if the filter chain does
# not compile.
#
# NOTE: the filter names are interpolated into source code that is
# eval'ed; they must come from trusted configuration only.
sub intervall {
  my ($self, $first, $last) = @_;
  my $value = '';
  my $word  = '';
  my @result;

  return unless exists $self->{'intervall'};

  defined $self->{db} or $self->open;
  $self->sync;
  my $dbh = $self->{dbh};       # for convenience

  if (ref $self->{'intervall'}) {
    unless (exists $self->{'ifunc'}) {
      my $code = sprintf("sub {grep /./, %s}",
                         _filtergen(@{$self->{intervall}}));
      my $func = eval $code;
      croak "Cannot compile intervall filter chain ($code): $@"
        unless $func;
      $self->{'ifunc'} = $func;
    }
    ($first) = &{$self->{'ifunc'}}($first) if $first;
    ($last)  = &{$self->{'ifunc'}}($last) if $last;
  }

  # Position the cursor; seq replaces $first by the key actually found
  # and returns a non-zero status if there is none.
  my $status = (defined $first and $first ne '')
    ? $dbh->seq($first, $value, R_CURSOR) # smallest key >= $first
    : $dbh->seq($first, $value, R_FIRST);
  return () if $status;

  # We assume that words do not start with the character \377.  If the
  # cursor ended up on a bookkeeping entry, there is no word at or
  # after the requested start.
  return () if $first =~ /^(?:\Q$M\E|\Q$O\E)/o;

  # $last = pack 'C', 0xff unless defined $last and $last ne '';
  return () if defined $last and $first gt $last; # $first would be after the last word

  push @result, $first;
  while (!$dbh->seq($word, $value, R_NEXT)) {
    # We should limit this to a "reasonable" number of words
    last if (defined $last and $word gt $last) or $word =~ /^($M|$O)/o;
    push @result, $word;
  }
  \@result;                     # speed
}

# Lists the index terms starting with $prefix.
#
# If the 'prefix' attribute of the index is a list of filter names,
# the prefix is run through that filter chain first.
#
# Returns a reference to the list of matching terms.  Returns the
# empty list if the prefix is undefined (before or after filtering;
# we do not hand out the full dictionary) or if no term matches, and
# nothing if the index was not created with the 'prefix' attribute.
# Croaks if the filter chain does not compile.
#
# NOTE: the filter names are interpolated into source code that is
# eval'ed; they must come from trusted configuration only.
sub prefix {
  my ($self, $prefix) = @_;
  my $value = '';
  my $word  = '';
  my @result;

  return () unless defined $prefix; # Full dictionary requested !!
  return unless exists $self->{'prefix'};
  defined $self->{db} or $self->open;
  $self->sync;
  my $dbh = $self->{dbh};

  if (ref $self->{'prefix'}) {
    unless (exists $self->{'pfunc'}) {
      my $code = sprintf("sub {grep /./, %s}",
                         _filtergen(@{$self->{prefix}}));
      my $func = eval $code;
      croak "Cannot compile prefix filter chain ($code): $@"
        unless $func;
      $self->{'pfunc'} = $func;
    }
    ($prefix) = &{$self->{'pfunc'}}($prefix);
    # The filters may have removed the prefix altogether; searching on
    # would walk the whole dictionary.
    return () unless defined $prefix;
  }

  # Position the cursor on the smallest key >= $prefix; seq puts the
  # key found into $word.
  if ($dbh->seq($word = $prefix, $value, R_CURSOR)) {
    return ();
  }

  # The prefix is compared literally.  It must not be used as a
  # pattern: characters like '.', '+' or '(' would match the wrong
  # words or not even compile.
  my $length = length($prefix);
  return () if substr($word, 0, $length) ne $prefix;
  push @result, $word;

  while (!$dbh->seq($word, $value, R_NEXT)) {
    # We should limit this to a "reasonable" number of words
    last if substr($word, 0, $length) ne $prefix;
    push @result, $word;
  }
  \@result;                     # speed
}

=head2 search($query)

The search method supports a range of search algorithms.  It is
recommended to tune the index by calling
C<$table-E<gt>set(top=E<gt>1)> B<after> bulk inserting the documents
into the table.  This is a computationally intensive operation and all inserts
and deletes after this optimization are slightly more expensive.  Once
reorganized, the index is kept sorted automatically until you switch
the optimization off by calling C<$table-E<gt>set(top=E<gt>0)>.

When searching a tuned index, a query can be processed faster if the
caller requests only the topmost documents.  This can be done by
passing a C<top =E<gt>> I<n> parameter to the search method.

For single term queries, the method returns only the I<n> top ranking
documents.  For multi term queries two optimized algorithms are
available. The first algorithm computes the top n documents
approximately but very fast, sacrificing a little bit of precision for
speed.  The second algorithm computes the topmost I<n> documents
precisely.  This algorithm is slower and should be used only for small
values of I<n>.  It can be requested by passing the query attribute
C<picky =E<gt> 1>. Both algorithms may return more than I<n> hits.
While the picky version might not be faster than the brute force
version on average for modest size databases it uses less memory and
the processing time is almost linear in the number of query terms, not
in the size of the lists.

=cut

# Searches the index for the words of a query text.
#
# Parameters:
#   $query - query parameters, passed on to search_raw unchanged
#   @_     - the query text(s); they are run through the filter chain
#            of the index to obtain the search terms
#
# Opens the index if necessary and flushes pending inserts.  Returns
# what search_raw returns.
sub search {
  my $self  = shift;
  my $query = shift;

  $self->open unless defined $self->{db};
  $self->sync;

  # No call to parse() here
  my @terms = $self->{func}->(@_);
  return $self->search_raw($query, @terms);
}

# Runs the given text(s) through the filter chain of the index and
# returns the resulting words.  Opens the index if necessary, since
# the chain is compiled when the index is opened.
sub parse {
  my $self = shift;

  $self->open unless defined $self->{db};
  return $self->{func}->(@_);
}

# Returns all keys of the underlying database hash (in scalar context
# their number).  Nothing is filtered out here.  Opens the index if
# necessary.
sub keys {
  my $self = shift;

  $self->open unless defined $self->{db};

  # CORE:: makes explicit that the builtin is meant, not this method.
  return CORE::keys %{$self->{db}};
}

# Searches the index for all terms starting with one of the given
# prefixes.
#
# Parameters:
#   optionally a hash reference with query parameters (see search_raw)
#   as first argument, followed by the prefixes.  Without the hash
#   reference an empty query is used.
#
# Returns what search_raw returns.
sub search_prefix {
  my $self  = shift;

  # search_raw expects the query parameters first.
  my $query = (@_ and ref($_[0]) eq 'HASH') ? shift : {};

  # print "search_prefix(@_)\n";
  defined $self->{db} or $self->open;

  # prefix() hands back a reference to the list of matching terms, or
  # nothing; expand the references to get the plain list of terms.
  my @terms;
  for my $prefix (@_) {
    my $matches = $self->prefix($prefix);
    push @terms, @$matches if ref $matches;
  }
  $self->search_raw($query, @terms);
}

# Warns, with a stack backtrace, about an inconsistency found in the
# database.
#
# Parameters:
#   $action - what was going on, e.g. 'delete of term'
#   $term   - the term or document key concerned
#
# Returns the (true) result of the warning, so callers can chain a
# repair action with "and".
sub _complain ($$) {
  my ($action, $term) = @_;

  require Carp;
  # Both values are passed as arguments: a '%' in either of them must
  # not be taken for a format directive.
  Carp::cluck
    (sprintf("WAIT database inconsistency during %s [%s]: ".
             "Please rebuild index\n",
             $action, $term));
}

# Core ranking routine: scores documents for a list of index terms.
#
# Parameters:
#   $query - hash reference; the keys read here are
#              top   - number of top ranking documents wanted; false
#                      means all matching documents
#              picky - true selects the exact top-n algorithm for
#                      multi term queries instead of the fast
#                      approximate one
#   @_     - the terms to look up (not filtered here); terms without
#            a document frequency in the index are ignored,
#            duplicates are kept
#
# Returns a flat list of (document id, score) pairs, or the empty list
# if none of the terms occurs in the index.  A term contributes
# tf / maxtf * idf to the score of a document, where idf is
# log(records / document frequency).
sub search_raw {
  my $self  = shift;
  my $query = shift;
  my %score;

  # Top $wanted documents must be correct. Zero means all matching
  # documents.
  my $wanted = $query->{top};
  my $strict = $query->{picky};

  # Return at least $minacc documents. Zero means all matching
  # documents.
  # my $minacc = $query->{accus} || $wanted;

  # Open index and flush cache if necessary
  defined $self->{db} or $self->open;
  $self->sync;

  # We keep duplicates
  my @terms = 
    # Sort words by increasing document frequency, i.e. the shortest
    # posting lists come first
    sort { $self->{db}->{$O,$a} <=> $self->{db}->{$O,$b} }
      # check which words occur in the index. 
      grep { $self->{db}->{$O,$_} } @_;

  return () unless @terms;                 # nothing to search for

  # We special-case one term queries here.  If the index was sorted,
  # chopping off the rest of the list will return the same ranking.
  if ($wanted and @terms == 1) {
    my $term  = shift @terms;
    my $idf   = log($self->{records}/$self->{db}->{$O,$term});
    my @res;

    if ($self->{reorg}) { # or not $query->{picky}
      @res = unpack "w". int(2*$wanted), $self->{db}->{$term};
    } else {
      @res = unpack 'w*',                $self->{db}->{$term};
    }

    # Turn the term frequencies into scores: tf / maxtf * idf.  We
    # multiply by $idf instead of dividing by maxtf / $idf, because
    # $idf is zero if the term occurs in every document.  A document
    # without a recorded maximum term frequency gets the score 0.
    for (my $i=1; $i<@res; $i+=2) {
      my $maxtf = $self->{db}->{$M, $res[$i-1]};
      $res[$i]  = $maxtf ? $res[$i] / $maxtf * $idf : 0;
    }

    return @res
  }

  # We separate exhaustive search here to avoid overhead and make the
  # code more readable. The block can be removed without changing the
  # result.
  unless ($wanted) {
    for (@terms) {
      my $df      = $self->{db}->{$O,$_};

      # The frequency *must* be 1 at least since the posting list is nonempty
      _complain('search for term', $_) and $df = 1 if $df < 1;

      # Unpack posting list for current query term $_
      my %post = unpack 'w*', $self->{db}->{$_};

      _complain('search for term', $_) if $self->{db}->{$O,$_} != keys %post;
      # This is the inverse document frequency. The log of the inverse
      # fraction of documents the term occurs in.
      my $idf = log($self->{records}/$df);
      for my $did (keys %post) {
        if (my $freq = $self->{db}->{$M, $did}) {
          $score{$did} += $post{$did} / $freq * $idf;
        }
      }
    }
    # warn sprintf "Used %d accumulators\n", scalar keys %score;
    return %score;
  }

  # A sloppy but fast algorithm for multiple term queries.
  unless ($strict) {
    for (@terms) {
      # Unpack posting list for current query term $_
      my %post = unpack 'w*', $self->{db}->{$_};

      # Lookup the number of documents the term occurs in (document frequency)
      my $occ  = $self->{db}->{$O,$_};

      _complain('search for term', $_) if $self->{db}->{$O,$_} != keys %post;
      # The frequency *must* be 1 at least since the posting list is nonempty
      _complain('search for term', $_) and $occ = 1 if $occ < 1;

      # This is the inverse document frequency. The log of the inverse
      # fraction of documents the term occurs in.
      my $idf = log($self->{records}/$occ);

      # If we have a reasonable number of accumulators, change the
      # loop to iterate over the accumulators.  This will compromise
      # quality for better speed.  The algorithm still computes the
      # exact weights, but the result is not guaranteed to contain the
      # *best* results.  The database might contain documents better
      # than the worst returned document.
      
      # We process the lists in order of increasing length.  When the
      # number of accumulators exceeds $wanted, no new documents are
      # added, only the ranking/weighting of the seen documents is
      # improved.  The resulting ranking list must be pruned, since only
      # the top most documents end up near their "optimal" rank.
      
      if (keys %score < $wanted) {
        for my $did (keys %post) {
          if (my $freq = $self->{db}->{$M, $did}) {
            $score{$did} += $post{$did} / $freq * $idf;
          }
        }
      } else {
        for my $did (keys %score) {
          next unless exists $post{$did};
          if (my $freq = $self->{db}->{$M, $did}) {
            $score{$did} += $post{$did} / $freq * $idf;
          }
        }
      }
    }
    return %score;
  }

  # $max[$i] is an upper bound for what the terms $i .. $#terms can
  # still add to the score of a single document.
  my @max; $max[$#terms+1]=0;
  my @idf;

  # Preparation loop.  This extra loop makes sense only when "reorg"
  # and "wanted" are true.  But at the time being, keeping the code
  # for the different search algorithms in one place seems more
  # desirable than some minor speedup of the brute force version.  We
  # do cache $idf though.

  for (my $i = $#terms; $i >=0; $i--) {
    local $_ = $terms[$i];
    # Lookup the number of documents the term occurs in (document frequency)
    my $df      = $self->{db}->{$O,$_};

    # The frequency *must* be 1 at least since the posting list is nonempty
    _complain('search for term', $_) and $df = 1 if $df < 1;

    # This is the inverse document frequency. The log of the inverse
    # fraction of documents the term occurs in.
    $idf[$i] = log($self->{records}/$df);

    # The first posting of a sorted list is the best one for the term.
    my ($did,$occ);
    if ($self->{reorg}) {
      ($did,$occ) = unpack 'w2', $self->{db}->{$_};
    } else {                    # Maybe this costs more than it helps
      ($did,$occ) = unpack 'w2', $self->sort_postings($self->{db}->{$_});
    }
    my $freq      = $self->{db}->{$M, $did};
    # A document without a recorded maximum term frequency never gets
    # a score in the main loop, so it cannot raise the bound either.
    my $max       = $freq ? $occ/$freq*$idf[$i] : 0;
    $max[$i]      = $max + $max[$i+1];
  }

  # Main loop 
  for my $i (0 .. $#terms) {
    my $term = $terms[$i];
    # Unpack posting list for current query term $term. We lose the
    # sorting order because of the assignment to a hash.
    my %post = unpack 'w*', $self->{db}->{$term};

    _complain('search for term', $term)
      if $self->{db}->{$O,$term} != keys %post;

    my $idf  = $idf[$i];
    my $full;                   # Need to process all postings
    my $chop;                   # Score necessary to enter the ranking list

    if (# We know that wanted is true since we special-cased the
        # exhaustive search.

        $wanted and

        # We did sort here if necessary in
        # the preparation loop
        # $self->{reorg} and

        scalar keys %score > $wanted) {
      $chop = (sort { $b <=> $a } values %score)[$wanted];
      $full = $max[$i] > $chop;
    } else {
      $full = 1;
    }

    if ($full) {
      # We need to inspect the full list. Either $wanted is not given,
      # the index is not sorted, or we don't have enough accumulators
      # yet.
      if (defined $chop) {
        # We might be able to avoid allocating accumulators
        for my $did (keys %post) {
          if (my $freq = $self->{db}->{$M, $did}) {
            my $wgt = $post{$did} / $freq * $idf;
            # We add an accumulator if $wgt exceeds $chop
            if (exists $score{$did} or $wgt > $chop) {
              $score{$did} += $wgt;
            }
          }
        }
      } else {
        # Allocate accumulators for each seen document.
        for my $did (keys %post) {
          if (my $freq = $self->{db}->{$M, $did}) {
            $score{$did} += $post{$did} / $freq * $idf;
          }
        }
      }
    } else {
      # Update existing accumulators
      for my $did (keys %score) {
        next unless exists $post{$did};
        if (my $freq = $self->{db}->{$M, $did}) {
          $score{$did} += $post{$did} / $freq * $idf;
        }
      }
    }
  }
  #warn sprintf "Used %d accumulators\n", scalar keys %score;
  %score;
}

# Sets an attribute of the index.  The only attribute known is 'top':
#
#   a value equal to 0 switches the top-n optimisation off and
#   returns the previous setting of the flag;
#
#   any other value sorts every posting list (see sort_postings) and
#   marks the index as reorganised.  Nothing happens if the index is
#   reorganised already or was not opened for writing.
#
# Dies on any other attribute name.
sub set {
  my ($self, $attr, $value) = @_;

  $attr eq 'top'
    or die "No such indexy attribute: '$attr'";

  if ($value == 0) {
    return delete $self->{reorg};
  }
  if ($self->{reorg}) {         # we are sorted already
    return;
  }
  unless ($self->{mode} & O_RDWR) {
    return;
  }

  $self->open unless defined $self->{db};
  $self->sync;

  while (my($entry, $postings) = each %{$self->{db}}) {
    # Entries starting with \377o or \377m hold frequencies, not
    # posting lists.
    $self->{db}->{$entry} = $self->sort_postings($postings)
      unless $entry =~ /^\377[om]/;
  }
  $self->{reorg} = 1;
}

# Writes the cached postings and document frequencies to the database
# hash and empties the caches.  Does nothing unless the index was
# opened for writing (O_RDWR).  A note with the number of cached
# postings is printed to STDERR if there are any.
sub sync {
  my $self = shift;

  if ($self->{mode} & O_RDWR) {
    print STDERR "Flushing $self->{cached} postings\n" if $self->{cached};

    # Append the cached postings to the stored lists; a reorganised
    # index needs the combined list sorted again.  (CORE:: makes
    # explicit that the builtin is meant, not the keys method.)
    for my $term (CORE::keys %{$self->{cache}}) {
      my $pending = $self->{cache}->{$term};
      if ($self->{reorg}) {
        $self->{db}->{$term} =
          $self->sort_postings($self->{db}->{$term} . $pending);
      } else {
        $self->{db}->{$term} .= $pending;
      }
    }

    # Add the cached document frequencies to the stored ones.
    for my $counter (CORE::keys %{$self->{cdict}}) {
      $self->{db}->{$counter} =
        ($self->{db}->{$counter} || 0) + $self->{cdict}->{$counter};
    }

    $self->{cache}  = {};
    $self->{cdict}  = {};
    $self->{cached} = 0;
  }
}

# Closes the index if it is open: flushes the caches, unties the
# database hash and drops the handle, the caches and the compiled
# filter chains from the object.  Does nothing if there is no handle.
sub close {
  my $self = shift;

  if ($self->{dbh}) {
    $self->sync;

    # Drop our reference to the handle before untying the hash.
    delete $self->{dbh};
    untie %{$self->{db}};

    delete @{$self}{qw(db func cache cached cdict)};

    # The compiled chains of prefix and intervall are removed only if
    # they are defined, as is the one of parse_pos below.
    for my $compiled (qw(pfunc ifunc)) {
      delete $self->{$compiled} if defined $self->{$compiled};
    }
    delete $self->{xfunc} if defined $self->{xfunc};
  }
}

1;

1	ulpfr	19	# -- Mode: Perl --
2			# $Basename: InvertedIndex.pm $
3			# $Revision: 1.30 $
4	ulpfr	10	# Author : Ulrich Pfeifer
5			# Created On : Thu Aug 8 13:05:10 1996
6			# Last Modified By: Ulrich Pfeifer
7	ulpfr	19	# Last Modified On: Tue May 9 08:33:28 2000
8	ulpfr	10	# Language : CPerl
9	ulpfr	19	#
10			# (C) Copyright 1996-2000, Ulrich Pfeifer
11			#
12	ulpfr	10
13			package WAIT::InvertedIndex;
14			use strict;
15			use DB_File;
16			use Fcntl;
17			use WAIT::Filter;
18			use Carp;
19			use vars qw(%FUNC);
20
21	ulpfr	19	my $O = pack('C', 0xff)."o"; # occurances (document ferquency)
22	ulpfr	10
23	ulpfr	19	# The document frequency is the number of documents a term occurs
24			# in. The idea is that a term occuring in a significant part of the
25			# documents is not too significant.
26
27			my $M = pack('C', 0xff)."m"; # maxtf (term frequency)
28
29			# The maximum term frequency of a document is the frequency of the
30			# most frequent term in the document. It is related to the document
31			# length obviously. A document in which the most frequnet term occurs
32			# 100 times is probably much longer than a document whichs most
33			# frequent term occurs five time.
34
35	ulpfr	10	sub new {
36			my $type = shift;
37			my %parm = @_;
38			my $self = {};
39
40			$self->{file} = $parm{file} or croak "No file specified";
41			$self->{attr} = $parm{attr} or croak "No attributes specified";
42			$self->{filter} = $parm{filter};
43			$self->{'name'} = $parm{'name'};
44			$self->{records} = 0;
45			for (qw(intervall prefix)) {
46			if (exists $parm{$_}) {
47			if (ref $parm{$_}) {
48			$self->{$_} = [@{$parm{$_}}] # clone
49			} else {
50			$self->{$_} = $parm{$_}
51			}
52			}
53			}
54			bless $self, ref($type) \|\| $type;
55			}
56
57			sub name {$_[0]->{'name'}}
58
59			sub _split_pos {
60			my ($text, $pos) = @{$_[0]};
61			my @result;
62
63			$text =~ s/(^\s+)// and $pos += length($1);
64			while ($text =~ s/(^\S+)//) {
65			my $word = $1;
66			push @result, [$word, $pos];
67			$pos += length($word);
68			$text =~ s/(^\s+)// and $pos += length($1);
69			}
70			@result;
71			}
72
73			sub _xfiltergen {
74			my $filter = pop @_;
75
76	ulpfr	13	# Oops, we cannot overrule the user's choice. Other filters may kill
77			# stopwords, such as isotr clobbers "isn't" to "isnt".
78
79			# if ($filter eq 'stop') { # avoid the slow stopword elimination
80			# return _xfiltergen(@_); # it's cheaper to look them up afterwards
81			# }
82	ulpfr	10	if (@_) {
83			if ($filter =~ /^split(\d*)/) {
84			if ($1) {
85			"grep(length(\$_->[0])>=$1, map(&WAIT::Filter::split_pos(\$_), " . _xfiltergen(@_) .'))' ;
86			} else {
87			"map(&WAIT::Filter::split_pos(\$_), " . _xfiltergen(@_) .')' ;
88			}
89			} else {
90			"map ([&WAIT::Filter::$filter(\$_->[0]), \$_->[1]]," ._xfiltergen(@_) .')';
91			}
92			} else {
93			if ($filter =~ /^split(\d*)/) {
94			if ($1) {
95			"grep(length(\$_->[0])>=$1, map(&WAIT::Filter::split_pos(\$_), [\$_[0], 0]))" ;
96			} else {
97			"map(&WAIT::Filter::split_pos(\$_), [\$_[0], 0])" ;
98			}
99			} else {
100			"map ([&WAIT::Filter::$filter(\$_->[0]), \$_->[1]], [\$_[0], 0])";
101			}
102			}
103			}
104
105			sub parse_pos {
106			my $self = shift;
107
108			unless (exists $self->{xfunc}) {
109			$self->{xfunc} =
110			eval sprintf("sub {%s}", _xfiltergen(@{$self->{filter}}));
111			#printf "\nsub{%s}$@\n", _xfiltergen(@{$self->{filter}});
112			}
113			&{$self->{xfunc}}($_[0]);
114			}
115
116			sub _filtergen {
117			my $filter = pop @_;
118
119			if (@_) {
120			"map(&WAIT::Filter::$filter(\$_), " . _filtergen(@_) . ')';
121			} else {
122			"map(&WAIT::Filter::$filter(\$_), \@_)";
123			}
124			}
125
126			sub drop {
127			my $self = shift;
128			if ((caller)[0] eq 'WAIT::Table') { # Table knows about this
129			my $file = $self->{file};
130
131			! (!-e $file or unlink $file);
132			} else { # notify our database
133			croak ref($self)."::drop called directly";
134			}
135			}
136
137			sub open {
138			my $self = shift;
139			my $file = $self->{file};
140
141			if (defined $self->{dbh}) {
142			$self->{dbh};
143			} else {
144			$self->{func} =
145			eval sprintf("sub {grep /./, %s}", _filtergen(@{$self->{filter}}));
146			$self->{dbh} = tie(%{$self->{db}}, 'DB_File', $file,
147			$self->{mode}, 0664, $DB_BTREE);
148			$self->{cache} = {}
149			if $self->{mode} & O_RDWR;
150			$self->{cdict} = {}
151			if $self->{mode} & O_RDWR;
152			$self->{cached} = 0;
153			}
154			}
155
156			sub insert {
157			my $self = shift;
158			my $key = shift;
159			my %occ;
160	ulpfr	13
161	ulpfr	10	defined $self->{db} or $self->open;
162			grep $occ{$_}++, &{$self->{func}}(@_);
163			my ($word, $noc);
164			$self->{records}++;
165			while (($word, $noc) = each %occ) {
166			if (defined $self->{cache}->{$word}) {
167			$self->{cdict}->{$O,$word}++;
168			$self->{cache}->{$word} .= pack 'w2', $key, $noc;
169			} else {
170			$self->{cdict}->{$O,$word} = 1;
171			$self->{cache}->{$word} = pack 'w2', $key, $noc;
172	ulpfr	13	}
173	ulpfr	10	$self->{cached}++;
174			}
175	ulpfr	19	# This cache limit should be configurable
176	ulpfr	10	$self->sync if $self->{cached} > 100_000;
177			my $maxtf = 0;
178			for (values %occ) {
179			$maxtf = $_ if $_ > $maxtf;
180			}
181			$self->{db}->{$M, $key} = $maxtf;
182			}
183
184	ulpfr	19	# We sort postings by increasing max term frequency (~ by increasing
185			# document length. This reduces the quality degradation if we process
186			# only the first part of a posting list.
187
188			sub sort_postings {
189			my $self = shift;
190			my $post = shift; # reference to a hash or packed string
191
192			if (ref $post) {
193			# we skip the sort part, if the index is not sorted
194			return pack('w*', %$post) unless $self->{reorg};
195			} else {
196			$post = { unpack 'w*', $post };
197			}
198
199			my $r = '';
200
201			# Sort posting list by increasing ratio of maximum term frequency (~
202			# "document length") and term frequency. This rati multipied by the
203			# inverse document frequence gives the score for a term. This sort
204			# order can be exploited for tuning of single term queries.
205
206			for my $did (sort { $post->{$b} / $self->{db}->{$M, $b}
207			<=>
208			$post->{$a} / $self->{db}->{$M, $a}
209			} keys %$post) {
210			$r .= pack 'w2', $did, $post->{$did};
211			}
212			#warn sprintf "reorg %d %s\n", scalar keys %$post, join ' ', unpack 'w*', $r;
213			$r;
214			}
215
216	ulpfr	10	sub delete {
217			my $self = shift;
218			my $key = shift;
219			my %occ;
220
221	ulpfr	19	my $db;
222	ulpfr	10	defined $self->{db} or $self->open;
223	ulpfr	19	$db = $self->{db};
224	ulpfr	10	$self->sync;
225			$self->{records}--;
226	ulpfr	19
227			# less than zero documents in database?
228			_complain('delete of document', $key) and $self->{records} = 0
229			if $self->{records} < 0;
230
231	ulpfr	10	grep $occ{$_}++, &{$self->{func}}(@_);
232	ulpfr	19
233			for (keys %occ) {# may reorder posting list
234			my %post = unpack 'w*', $db->{$_};
235	ulpfr	10	delete $post{$key};
236	ulpfr	19	$db->{$_} = $self->sort_postings(\%post);
237			_complain('delete of term', $_) if $db->{$O,$_}-1 != keys %post;
238			$db->{$O,$_} = scalar keys %post;
239	ulpfr	10	}
240	ulpfr	19	delete $db->{$M, $key};
241	ulpfr	10	}
242
243			sub intervall {
244			my ($self, $first, $last) = @_;
245			my $value = '';
246			my $word = '';
247			my @result;
248
249			return unless exists $self->{'intervall'};
250
251			defined $self->{db} or $self->open;
252			$self->sync;
253			my $dbh = $self->{dbh}; # for convenience
254
255			if (ref $self->{'intervall'}) {
256			unless (exists $self->{'ifunc'}) {
257			$self->{'ifunc'} =
258			eval sprintf("sub {grep /./, %s}", _filtergen(@{$self->{intervall}}));
259			}
260			($first) = &{$self->{'ifunc'}}($first) if $first;
261			($last) = &{$self->{'ifunc'}}($last) if $last;
262			}
263			if (defined $first and $first ne '') { # set the cursor to $first
264			$dbh->seq($first, $value, R_CURSOR);
265			} else {
266			$dbh->seq($first, $value, R_FIRST);
267			}
268			# We assume that word do not start with the character \377
269			# $last = pack 'C', 0xff unless defined $last and $last ne '';
270			return () if defined $last and $first gt $last; # $first would be after the last word
271
272			push @result, $first;
273			while (!$dbh->seq($word, $value, R_NEXT)) {
274			# We should limit this to a "resonable" number of words
275			last if (defined $last and $word gt $last) or $word =~ /^($M\|$O)/o;
276			push @result, $word;
277			}
278			\@result; # speed
279			}
280
281			sub prefix {
282			my ($self, $prefix) = @_;
283			my $value = '';
284			my $word = '';
285			my @result;
286
287			return () unless defined $prefix; # Full dictionary requested !!
288			return unless exists $self->{'prefix'};
289			defined $self->{db} or $self->open;
290			$self->sync;
291			my $dbh = $self->{dbh};
292
293			if (ref $self->{'prefix'}) {
294			unless (exists $self->{'pfunc'}) {
295			$self->{'pfunc'} =
296			eval sprintf("sub {grep /./, %s}", _filtergen(@{$self->{prefix}}));
297			}
298			($prefix) = &{$self->{'pfunc'}}($prefix);
299			}
300
301			if ($dbh->seq($word = $prefix, $value, R_CURSOR)) {
302			return ();
303			}
304			return () if $word !~ /^$prefix/;
305			push @result, $word;
306
307			while (!$dbh->seq($word, $value, R_NEXT)) {
308			# We should limit this to a "resonable" number of words
309			last if $word !~ /^$prefix/;
310			push @result, $word;
311			}
312			\@result; # speed
313			}
314
315	ulpfr	19	=head2 search($query)
316
317			The search method supports a range of search algorithms. It is
318			recommended to tune the index by calling
319			C<$table-E<gt>set(top=E<gt>1)> B<after> bulk inserting the documents
320			into the table. This is a computing intense operation and all inserts
321			and deletes after this optimization are slightly more expensive. Once
322			reorganized, the index is kept sorted automatically until you switch
323			the optimization off by calling C<$table-E<gt>set(top=E<gt>0)>.
324
325			When searching a tuned index, a query can be processed faster if the
326			caller requests only the topmost documents. This can be done by
327			passing a C<top =E<gt>> I<n> parameter to the search method.
328
329			For single term queries, the method returns only the I<n> top ranking
330			documents. For multi term queries two optimized algorithms are
331			available. The first algorithm computes the top n documents
332			approximately but very fast, sacrificing a little bit of precision for
333			speed. The second algorithm computes the topmost I<n> documents
334			precisely. This algorithm is slower and should be used only for small
335			values of I<n>. It can be requested by passing the query attribute
336			C<picky =E<gt> 1>. Both algorithms may return more than I<n> hits.
337			While the picky version might not be faster than the brute force
338			version on average for modest size databases it uses less memory and
339			the processing time is almost linear in the number of query terms, not
340			in the size of the lists.
341
342			=cut
343
344	ulpfr	10	sub search {
345			my $self = shift;
346	ulpfr	19	my $query = shift;
347	ulpfr	10
348			defined $self->{db} or $self->open;
349			$self->sync;
350	ulpfr	19	$self->search_raw($query, &{$self->{func}}(@_)); # No call to parse() here
351	ulpfr	10	}
352
353			sub parse {
354			my $self = shift;
355
356			defined $self->{db} or $self->open;
357			&{$self->{func}}(@_);
358			}
359
360	ulpfr	13	sub keys {
361			my $self = shift;
362
363			defined $self->{db} or $self->open;
364			keys %{$self->{db}};
365			}
366
367	ulpfr	10	sub search_prefix {
368			my $self = shift;
369
370			# print "search_prefix(@_)\n";
371			defined $self->{db} or $self->open;
372			$self->search_raw(map($self->prefix($_), @_));
373			}
374
# Emit a warning with a stack backtrace (Carp::cluck) about an
# inconsistent database.
#
# Arguments: a short description of the action being performed and the
# term it was performed on.  Both are inserted as sprintf arguments, so a
# '%' in either of them cannot corrupt the message.
#
# Always returns a true value; callers rely on this in constructs like
#   _complain(...) and $df = 1 if $df < 1;
sub _complain ($$) {
  my ($action, $term) = @_;

  $action = '' unless defined $action;
  $term   = '' unless defined $term;

  require Carp;
  Carp::cluck
    (sprintf("WAIT database inconsistency during %s [%s]: ".
             "Please rebuild index\n",
             $action, $term));
  return 1;
}
384
# Rank documents for a list of index terms.
#
# Arguments:
#   $query - hash reference; the keys read here are
#              top   - number of top ranking documents that must be
#                      correct; a false value means all matching
#                      documents
#              picky - true selects the precise top-n algorithm for
#                      multi term queries
#   @_     - the terms to look up.  Terms that do not occur in the index
#            are ignored, duplicates are kept.
#
# Returns a flat list of (document id, weight) pairs; the list is empty
# if none of the terms occurs in the index.  The weight of a document is
# the sum over the query terms of
#
#   tf / maxtf * idf      with  idf = log(records / document frequency)
#
# Documents without a maxtf entry are not scored.  For single term
# queries with 'top' set, the pairs are returned in posting list order.
sub search_raw {
  my $self  = shift;
  my $query = shift;
  my %score;

  # Top $wanted documents must be correct. Zero means all matching
  # documents.
  my $wanted = $query->{top};
  my $strict = $query->{picky};

  # Open index and flush cache if necessary
  defined $self->{db} or $self->open;
  $self->sync;
  my $db = $self->{db};

  # We keep duplicates
  my @terms =
    # Sort words by increasing document frequency, i.e. the shortest
    # posting list comes first.
    sort { $db->{$O,$a} <=> $db->{$O,$b} }
    # check which words occur in the index.
    grep { $db->{$O,$_} } @_;

  return () unless @terms;      # nothing to search for

  # We special-case one term queries here. If the index was sorted,
  # chopping off the rest of the list will return the same ranking.
  if ($wanted and @terms == 1) {
    my $term = shift @terms;
    my $idf  = log($self->{records} / $db->{$O,$term});

    # Only a sorted posting list may be truncated.
    my $template = $self->{reorg} ? 'w' . int(2*$wanted) : 'w*';
    my @post     = unpack $template, $db->{$term};
    my @res;

    while (@post >= 2) {
      my ($did, $tf) = splice @post, 0, 2;
      # Same guarded formula as in the multi term cases.  Dividing by
      # maxtf/$idf instead would die if the term occurs in every
      # document ($idf == 0) or if the maxtf entry is missing.
      my $freq = $db->{$M, $did} or next;
      push @res, $did, $tf / $freq * $idf;
    }

    return @res;
  }

  # We separate exhaustive search here to avoid overhead and make the
  # code more readable. The block can be removed without changing the
  # result.
  unless ($wanted) {
    for my $term (@terms) {
      my $df = $db->{$O,$term};

      # The frequency must be 1 at least since the posting list is nonempty
      _complain('search for term', $term) and $df = 1 if $df < 1;

      # Unpack posting list for current query term
      my %post = unpack 'w*', $db->{$term};

      _complain('search for term', $term)
        if $db->{$O,$term} != scalar(keys %post);

      # This is the inverse document frequency. The log of the inverse
      # fraction of documents the term occurs in.
      my $idf = log($self->{records} / $df);
      for my $did (keys %post) {
        if (my $freq = $db->{$M, $did}) {
          $score{$did} += $post{$did} / $freq * $idf;
        }
      }
    }
    # warn sprintf "Used %d accumulators\n", scalar keys %score;
    return %score;
  }

  # A sloppy but fast algorithm for multiple term queries.
  unless ($strict) {
    for my $term (@terms) {
      # Unpack posting list for current query term
      my %post = unpack 'w*', $db->{$term};

      # Lookup the number of documents the term occurs in (document frequency)
      my $occ = $db->{$O,$term};

      _complain('search for term', $term) if $occ != scalar(keys %post);
      # The frequency must be 1 at least since the posting list is nonempty
      _complain('search for term', $term) and $occ = 1 if $occ < 1;

      # This is the inverse document frequency. The log of the inverse
      # fraction of documents the term occurs in.
      my $idf = log($self->{records} / $occ);

      # If we have a reasonable number of accumulators, change the
      # loop to iterate over the accumulators. This will compromise
      # quality for better speed. The algorithm still computes the
      # exact weights, but the result is not guaranteed to contain the
      # best results. The database might contain documents better
      # than the worst returned document.

      # We process the lists in order of increasing length. When the
      # number of accumulators exceeds $wanted, no new documents are
      # added, only the ranking/weighting of the seen documents is
      # improved. The resulting ranking list must be pruned, since only
      # the top most documents end up near their "optimal" rank.

      if (scalar(keys %score) < $wanted) {
        for my $did (keys %post) {
          if (my $freq = $db->{$M, $did}) {
            $score{$did} += $post{$did} / $freq * $idf;
          }
        }
      } else {
        for my $did (keys %score) {
          next unless exists $post{$did};
          if (my $freq = $db->{$M, $did}) {
            $score{$did} += $post{$did} / $freq * $idf;
          }
        }
      }
    }
    return %score;
  }

  # The precise ("picky") algorithm.  $max[$i] is an upper bound for the
  # weight the terms $i .. $#terms can still contribute to a document.
  my @max; $max[$#terms+1] = 0;
  my @idf;

  # Preparation loop. This extra loop makes sense only when "reorg"
  # and "wanted" are true. But at the time being, keeping the code
  # for the different search algorithms in one place seems more
  # desirable than some minor speedup of the brute force version. We
  # do cache $idf though.

  for (my $i = $#terms; $i >= 0; $i--) {
    my $term = $terms[$i];
    # Lookup the number of documents the term occurs in (document frequency)
    my $df = $db->{$O,$term};

    # The frequency must be 1 at least since the posting list is nonempty
    _complain('search for term', $term) and $df = 1 if $df < 1;

    # This is the inverse document frequency. The log of the inverse
    # fraction of documents the term occurs in.
    $idf[$i] = log($self->{records} / $df);

    # Only the first posting is needed here.  If the index is not
    # sorted, sort the list first (maybe this costs more than it helps).
    my $postings = $self->{reorg}
      ? $db->{$term}
      : $self->sort_postings($db->{$term});
    my ($did, $occ) = unpack 'w2', $postings;

    my $max;
    if (defined $did and my $freq = $db->{$M, $did}) {
      $max = $occ / $freq * $idf[$i];
    } else {
      # No maxtf entry for the first posting: no bound can be derived
      # (and dividing would die).  An unbounded value forces the main
      # loop to inspect the full posting lists, which is always safe.
      _complain('search for term', $term);
      $max = 9**9**9;
    }
    $max[$i] = $max + $max[$i+1];
  }

  # Main loop
  for my $i (0 .. $#terms) {
    my $term = $terms[$i];
    # Unpack posting list for current query term $term. We lose the
    # sorting order because of the assignment to a hash.
    my %post = unpack 'w*', $db->{$term};

    _complain('search for term', $term)
      if $db->{$O,$term} != scalar(keys %post);

    my $idf = $idf[$i];
    my $full;                   # Need to process all postings
    my $chop;                   # Score necessary to enter the ranking list

    # $wanted is known to be true here since the exhaustive search was
    # special-cased above.  Sorting, if necessary, was done in the
    # preparation loop.
    if ($wanted and scalar(keys %score) > $wanted) {
      $chop = (sort { $b <=> $a } values %score)[$wanted];
      $full = $max[$i] > $chop;
    } else {
      $full = 1;
    }

    if ($full) {
      # We need to inspect the full list. Either the remaining terms
      # can still lift a new document into the ranking list, or we
      # don't have enough accumulators yet.
      if (defined $chop) {
        # We might be able to avoid allocating accumulators
        for my $did (keys %post) {
          if (my $freq = $db->{$M, $did}) {
            my $wgt = $post{$did} / $freq * $idf;
            # We add an accumulator if $wgt exceeds $chop
            if (exists $score{$did} or $wgt > $chop) {
              $score{$did} += $wgt;
            }
          }
        }
      } else {
        # Allocate accumulators for each seen document.
        for my $did (keys %post) {
          if (my $freq = $db->{$M, $did}) {
            $score{$did} += $post{$did} / $freq * $idf;
          }
        }
      }
    } else {
      # Update existing accumulators
      for my $did (keys %score) {
        next unless exists $post{$did};
        if (my $freq = $db->{$M, $did}) {
          $score{$did} += $post{$did} / $freq * $idf;
        }
      }
    }
  }
  #warn sprintf "Used %d accumulators\n", scalar keys %score;
  return %score;
}
604
# Set an index attribute.  'top' is the only attribute supported; any
# other name dies.
#
# A value of 0 removes the {reorg} flag and returns its former value.
# Any other value makes sure every posting list in the database has been
# passed through sort_postings, and then sets {reorg}.  Nothing is done
# if {reorg} is set already or if the index mode lacks O_RDWR.
sub set {
  my ($self, $attr, $value) = @_;

  die "No such indexy attribute: '$attr'" unless $attr eq 'top';

  if ($value == 0) {
    return delete $self->{reorg};
  }

  return if $self->{reorg};                  # we are sorted already
  return unless $self->{mode} & O_RDWR;
  $self->open unless defined $self->{db};

  $self->sync;
  while (my ($term, $postings) = each %{$self->{db}}) {
    # Keys starting with \377o or \377m are not posting lists.
    next if $term =~ /^\377[om]/;
    $self->{db}->{$term} = $self->sort_postings($postings);
  }
  return $self->{reorg} = 1;
}
623
# Flush the in-memory caches into the database.  Does nothing unless the
# index mode includes O_RDWR.
#
# Every entry of {cache} is appended to the database value stored under
# the same key; if {reorg} is set the combined value is passed through
# sort_postings before it is stored.  Every entry of {cdict} is added
# numerically to the database value stored under the same key.
# Afterwards both caches are emptied and {cached} is reset to 0.
sub sync {
  my $self = shift;

  if ($self->{mode} & O_RDWR) {
    print STDERR "Flushing $self->{cached} postings\n" if $self->{cached};

    while (my ($term, $postings) = each %{$self->{cache}}) {
      unless ($self->{reorg}) {
        $self->{db}->{$term} .= $postings;
        next;
      }
      # The index is sorted: keep the merged list in sorted order.
      my $merged = $self->{db}->{$term} . $postings;
      $self->{db}->{$term} = $self->sort_postings($merged);
    }

    while (my ($slot, $delta) = each %{$self->{cdict}}) {
      $self->{db}->{$slot} ||= 0;
      $self->{db}->{$slot} += $delta;
    }

    @{$self}{qw(cache cdict)} = ({}, {});
    $self->{cached} = 0;
  }
}
646
# Close the index.  Does nothing unless {dbh} is set.
#
# Pending cache entries are flushed with sync, then {dbh} is dropped
# before the {db} hash is untied.  Finally the per-connection slots are
# removed from the object; {pfunc}, {ifunc} and {xfunc} are only removed
# if they hold a defined value.
sub close {
  my $self = shift;

  if ($self->{dbh}) {
    $self->sync;

    delete $self->{dbh};
    untie %{$self->{db}};

    delete @{$self}{qw(db func cache cached cdict)};
    for my $slot (qw(pfunc ifunc)) {
      delete $self->{$slot} if defined $self->{$slot};
    }
    delete $self->{xfunc} if defined $self->{xfunc};
  }
}
664
665			1;
666