lib/WAIT/InvertedIndexOld.pm

#                              -*- Mode: Perl -*- 
# $Basename: InvertedIndex.pm $
# $Revision: 1.2 $
# Author          : Ulrich Pfeifer
# Created On      : Thu Aug  8 13:05:10 1996
# Last Modified By: Ulrich Pfeifer
# Last Modified On: Sat Apr 27 18:39:21 2002
# Language        : CPerl
# 
# (C) Copyright 1996-2000, Ulrich Pfeifer
# 

package WAIT::InvertedIndexOld;
use strict;
use DB_File;
use Fcntl;
use WAIT::Filter;
use Carp;
use vars qw(%FUNC);

my $O = pack('C', 0xff)."o";                  # occurances (document ferquency)

# The document frequency is the number of documents a term occurs
# in. The idea is that a term occuring in a significant part of the
# documents is not too significant.

my $M = pack('C', 0xff)."m";                  # maxtf (term frequency)

# The maximum term frequency of a document is the frequency of the
# most frequent term in the document.  It is related to the document
# length obviously.  A document in which the most frequnet term occurs
# 100 times is probably much longer than a document whichs most
# frequent term occurs five time.

sub new {
  my $type = shift;
  my %parm = @_;
  my $self = {};

  $self->{file}     = $parm{file}     or croak "No file specified";
  $self->{attr}     = $parm{attr}     or croak "No attributes specified";
  $self->{filter}   = $parm{filter};
  $self->{'name'}   = $parm{'name'};
  $self->{records}  = 0;
  for (qw(intervall prefix)) {
    if (exists $parm{$_}) {
      if (ref $parm{$_}) {
        $self->{$_} = [@{$parm{$_}}] # clone
      } else {
        $self->{$_} = $parm{$_}
      }
    }
  }
  bless $self, ref($type) || $type;
}

sub name {$_[0]->{'name'}}

sub _split_pos {
  my ($text, $pos) = @{$_[0]};
  my @result;

  $text =~ s/(^\s+)// and $pos += length($1);
  while ($text =~ s/(^\S+)//) {
    my $word = $1;
    push @result, [$word, $pos];
    $pos += length($word);
    $text =~ s/(^\s+)// and $pos += length($1);
  }
  @result;
}

sub _xfiltergen {
  my $filter = pop @_;

# Oops, we cannot overrule the user's choice. Other filters may kill
# stopwords, such as isotr clobbers "isn't" to "isnt".

#  if ($filter eq 'stop') {      # avoid the slow stopword elimination
#    return _xfiltergen(@_);            # it's cheaper to look them up afterwards
#  }
  if (@_) {
    if ($filter =~ /^split(\d*)/) {
      if ($1) {
        "grep(length(\$_->[0])>=$1, map(&WAIT::Filter::split_pos(\$_), " . _xfiltergen(@_) .'))' ;
      } else {
        "map(&WAIT::Filter::split_pos(\$_), " . _xfiltergen(@_) .')' ;
      }
    } else {
      "map ([&WAIT::Filter::$filter(\$_->[0]), \$_->[1]]," ._xfiltergen(@_) .')';
    }
  } else {
    if ($filter =~ /^split(\d*)/) {
      if ($1) {
        "grep(length(\$_->[0])>=$1, map(&WAIT::Filter::split_pos(\$_), [\$_[0], 0]))" ;
      } else {
        "map(&WAIT::Filter::split_pos(\$_), [\$_[0], 0])" ;
      }
    } else {
      "map ([&WAIT::Filter::$filter(\$_->[0]), \$_->[1]], [\$_[0], 0])";
    }
  }
}

sub parse_pos {
  my $self = shift;

  unless (exists $self->{xfunc}) {
    $self->{xfunc}     =
      eval sprintf("sub {%s}", _xfiltergen(@{$self->{filter}}));
    #printf "\nsub{%s}$@\n", _xfiltergen(@{$self->{filter}});
  }
  &{$self->{xfunc}}($_[0]);
}

sub _filtergen {
  my $filter = pop @_;

  if (@_) {
    "map(&WAIT::Filter::$filter(\$_), " . _filtergen(@_) . ')';
  } else {
    "map(&WAIT::Filter::$filter(\$_), \@_)";
  }
}

sub drop {
  my $self = shift;
  if ((caller)[0] eq 'WAIT::Table') { # Table knows about this
    my $file = $self->{file};

    ! (!-e $file or unlink $file);
  } else {                              # notify our database
    croak ref($self)."::drop called directly";
  }
}

sub open {
  my $self = shift;
  my $file = $self->{file};

  if (defined $self->{dbh}) {
    $self->{dbh};
  } else {
    $self->{func}     =
      eval sprintf("sub {grep /./, %s}", _filtergen(@{$self->{filter}}));
    $self->{dbh} = tie(%{$self->{db}}, 'DB_File', $file,
                       $self->{mode}, 0664, $DB_BTREE);
    $self->{cache} = {}
      if $self->{mode} & O_RDWR;
    $self->{cdict} = {}
      if $self->{mode} & O_RDWR;
    $self->{cached} = 0;
  }
}

sub insert {
  my $self  = shift;
  my $key   = shift;
  my %occ;

  defined $self->{db} or $self->open;
  grep $occ{$_}++, &{$self->{func}}(@_);
  my ($word, $noc);
  $self->{records}++;
  while (($word, $noc) = each %occ) {
    if (defined $self->{cache}->{$word}) {
      $self->{cdict}->{$O,$word}++;
      $self->{cache}->{$word} .= pack 'w2', $key, $noc;
    } else {
      $self->{cdict}->{$O,$word} = 1;
      $self->{cache}->{$word}  = pack 'w2', $key, $noc;
    }
    $self->{cached}++;
  }
  # This cache limit should be configurable
  $self->sync if $self->{cached} > 100_000;
  my $maxtf = 0;
  for (values %occ) {
    $maxtf = $_ if $_ > $maxtf;
  }
  $self->{db}->{$M, $key} = $maxtf;
}

# We sort postings by increasing max term frequency (~ by increasing
# document length.  This reduces the quality degradation if we process
# only the first part of a posting list.

sub sort_postings {
  my $self = shift;
  my $post = shift;             # reference to a hash or packed string

  if (ref $post) {
    # we skip the sort part, if the index is not sorted
    return pack('w*', %$post) unless $self->{reorg};
  } else {
    $post = { unpack 'w*', $post };
  }

  my $r = '';

  # Sort posting list by increasing ratio of maximum term frequency (~
  # "document length") and term frequency. This rati multipied by the
  # inverse document frequence gives the score for a term.  This sort
  # order can be exploited for tuning of single term queries.

  for my $did (sort {    $post->{$b} / $self->{db}->{$M, $b}
                                      <=>
                         $post->{$a} / $self->{db}->{$M, $a}
                    } keys %$post) {
    $r .= pack 'w2', $did, $post->{$did};
  }
  #warn sprintf "reorg %d %s\n", scalar keys %$post, join ' ', unpack 'w*', $r;
  $r;
}

sub delete {
  my $self  = shift;
  my $key   = shift;
  my %occ;

  my $db;
  defined $self->{db} or $self->open;
  $db = $self->{db};
  $self->sync;
  $self->{records}--;

  # less than zero documents in database?
  _complain('delete of document', $key) and $self->{records} = 0
    if $self->{records} < 0;

  grep $occ{$_}++, &{$self->{func}}(@_);

  for (keys %occ) {# may reorder posting list
    my %post = unpack 'w*', $db->{$_};
    delete $post{$key};
    $db->{$_}    = $self->sort_postings(\%post);
    _complain('delete of term', $_) if $db->{$O,$_}-1 != keys %post;
    $db->{$O,$_} = scalar keys %post;
  }
  delete $db->{$M, $key};
}

sub intervall {
  my ($self, $first, $last) = @_;
  my $value = '';
  my $word  = '';
  my @result;

  return unless exists $self->{'intervall'};

  defined $self->{db} or $self->open;
  $self->sync;
  my $dbh = $self->{dbh};       # for convenience

  if (ref $self->{'intervall'}) {
    unless (exists $self->{'ifunc'}) {
      $self->{'ifunc'} =
        eval sprintf("sub {grep /./, %s}", _filtergen(@{$self->{intervall}}));
    }
    ($first) = &{$self->{'ifunc'}}($first) if $first;
    ($last)  = &{$self->{'ifunc'}}($last) if $last;
  }
  if (defined $first and $first ne '') {         # set the cursor to $first
    $dbh->seq($first, $value, R_CURSOR);
  } else {
    $dbh->seq($first, $value, R_FIRST);
  }
  # We assume that word do not start with the character \377
  # $last = pack 'C', 0xff unless defined $last and $last ne '';
  return () if defined $last and $first gt $last; # $first would be after the last word
  
  push @result, $first;
  while (!$dbh->seq($word, $value, R_NEXT)) {
    # We should limit this to a "resonable" number of words
    last if (defined $last and $word gt $last) or $word =~ /^($M|$O)/o;
    push @result, $word;
  }
  \@result;                     # speed
}

sub prefix {
  my ($self, $prefix) = @_;
  my $value = '';
  my $word  = '';
  my @result;

  return () unless defined $prefix; # Full dictionary requested !!
  return unless exists $self->{'prefix'};
  defined $self->{db} or $self->open;
  $self->sync;
  my $dbh = $self->{dbh};
  
  if (ref $self->{'prefix'}) {
    unless (exists $self->{'pfunc'}) {
      $self->{'pfunc'} =
        eval sprintf("sub {grep /./, %s}", _filtergen(@{$self->{prefix}}));
    }
    ($prefix) = &{$self->{'pfunc'}}($prefix);
  }

  if ($dbh->seq($word = $prefix, $value, R_CURSOR)) {
    return ();
  }
  return () if $word !~ /^$prefix/;
  push @result, $word;

  while (!$dbh->seq($word, $value, R_NEXT)) {
    # We should limit this to a "resonable" number of words
    last if $word !~ /^$prefix/;
    push @result, $word;
  }
  \@result;                     # speed
}

=head2 search($query)

The search method supports a range of search algorithms.  It is
recommended to tune the index by calling
C<$table-E<gt>set(top=E<gt>1)> B<after> bulk inserting the documents
into the table.  This is a computing intense operation and all inserts
and deletes after this optimization are slightly more expensive.  Once
reorganized, the index is kept sorted automatically until you switch
the optimization off by calling C<$table-E<gt>set(top=E<gt>0)>.

When searching a tuned index, a query can be processed faster if the
caller requests only the topmost documents.  This can be done by
passing a C<top =E<gt>> I<n> parameter to the search method.

For single term queries, the method returns only the I<n> top ranking
documents.  For multi term queries two optimized algorithms are
available. The first algorithm computes the top n documents
approximately but very fast, sacrificing a little bit of precision for
speed.  The second algorithm computes the topmost I<n> documents
precisely.  This algorithm is slower and should be used only for small
values of I<n>.  It can be requested by passing the query attribute
C<picky =E<gt> 1>. Both algorithms may return more than I<n> hits.
While the picky version might not be faster than the brute force
version on average for modest size databases it uses less memory and
the processing time is almost linear in the number of query terms, not
in the size of the lists.

=cut

sub search {
  my $self  = shift;
  my $query = shift;

  defined $self->{db} or $self->open;
  $self->sync;
  $self->search_raw($query, &{$self->{func}}(@_)); # No call to parse() here
}

sub parse {
  my $self  = shift;

  defined $self->{db} or $self->open;
  &{$self->{func}}(@_);
}

sub keys {
  my $self  = shift;

  defined $self->{db} or $self->open;
  keys %{$self->{db}};
}

sub search_prefix {
  my $self  = shift;

  # print "search_prefix(@_)\n";
  defined $self->{db} or $self->open;
  $self->search_raw(map($self->prefix($_), @_));
}

sub _complain ($$) {
  my ($action, $term) = @_;

  require Carp;
  Carp::cluck
    (sprintf("WAIT database inconsistency during $action [%s]: ".
             "Please rebuild index\n",
             $term,));
}

sub search_raw {
  my $self  = shift;
  my $query = shift;
  my %score;

  # Top $wanted documents must be correct. Zero means all matching
  # documents.
  my $wanted = $query->{top};
  my $strict = $query->{picky};

  # Return at least $minacc documents. Zero means all matching
  # documents.
  # my $minacc = $query->{accus} || $wanted;

  # Open index and flush cache if necessary
  defined $self->{db} or $self->open;
  $self->sync;

  # We keep duplicates
  my @terms = 
    # Sort words by decreasing document frequency
    sort { $self->{db}->{$O,$a} <=> $self->{db}->{$O,$b} }
      # check which words occur in the index. 
      grep { $self->{db}->{$O,$_} } @_;

  return () unless @terms;                 # nothing to search for

  # We special-case one term queries here.  If the index was sorted,
  # choping off the rest of the list will return the same ranking.
  if ($wanted and @terms == 1) {
    my $term  = shift @terms;
    my $idf   = log($self->{records}/$self->{db}->{$O,$term});
    my @res;

    if ($self->{reorg}) { # or not $query->{picky}
      @res = unpack "w". int(2*$wanted), $self->{db}->{$term};
    } else {
      @res = unpack 'w*',                $self->{db}->{$term};
    }

    for (my $i=1; $i<@res; $i+=2) {
      $res[$i] /= $self->{db}->{$M, $res[$i-1]} / $idf;
    }

    return @res
  }

  # We separate exhaustive search here to avoid overhead and make the
  # code more readable. The block can be removed without changing the
  # result.
  unless ($wanted) {
    for (@terms) {
      my $df      = $self->{db}->{$O,$_};

      # The frequency *must* be 1 at least since the posting list is nonempty
      _complain('search for term', $_) and $df = 1 if $df < 1;

      # Unpack posting list for current query term $_
      my %post = unpack 'w*', $self->{db}->{$_};

      _complain('search for term', $_) if $self->{db}->{$O,$_} != keys %post;
      # This is the inverse document frequency. The log of the inverse
      # fraction of documents the term occurs in.
      my $idf = log($self->{records}/$df);
      for my $did (keys %post) {
        if (my $freq = $self->{db}->{$M, $did}) {
          $score{$did} += $post{$did} / $freq * $idf;
        }
      }
    }
    # warn sprintf "Used %d accumulators\n", scalar keys %score;
    return %score;
  }

  # A sloppy but fast algorithm for multiple term queries.
  unless ($strict) {
    for (@terms) {
      # Unpack posting list for current query term $_
      my %post = unpack 'w*', $self->{db}->{$_};

      # Lookup the number of documents the term occurs in (document frequency)
      my $occ  = $self->{db}->{$O,$_};

      _complain('search for term', $_) if $self->{db}->{$O,$_} != keys %post;
      # The frequency *must* be 1 at least since the posting list is nonempty
      _complain('search for term', $_) and $occ = 1 if $occ < 1;

      # This is the inverse document frequency. The log of the inverse
      # fraction of documents the term occurs in.
      my $idf = log($self->{records}/$occ);

      # If we have a reasonable number of accumulators, change the
      # loop to iterate over the accumulators.  This will compromise
      # quality for better speed.  The algorithm still computes the
      # exact weights, but the result is not guaranteed to contain the
      # *best* results.  The database might contain documents better
      # than the worst returned document.
      
      # We process the lists in order of increasing length.  When the
      # number of accumulators exceeds $wanted, no new documents are
      # added, only the ranking/weighting of the seen documents is
      # improved.  The resulting ranking list must be pruned, since only
      # the top most documents end up near their "optimal" rank.
      
      if (keys %score < $wanted) {
        for my $did (keys %post) {
          if (my $freq = $self->{db}->{$M, $did}) {
            $score{$did} += $post{$did} / $freq * $idf;
          }
        }
      } else {
        for my $did (keys %score) {
          next unless exists $post{$did};
          if (my $freq = $self->{db}->{$M, $did}) {
            $score{$did} += $post{$did} / $freq * $idf;
          }
        }
      }
    }
    return %score;
  }
  my @max; $max[$#terms+1]=0;
  my @idf;

  # Preparation loop.  This extra loop makes sense only when "reorg"
  # and "wanted" are true.  But at the time beeing, keeping the code
  # for the different search algorithms in one place seems more
  # desirable than some minor speedup of the brute force version.  We
  # do cache $idf though.

  for (my $i = $#terms; $i >=0; $i--) {
    local $_ = $terms[$i];
    # Lookup the number of documents the term occurs in (document frequency)
    my $df      = $self->{db}->{$O,$_};

    # The frequency *must* be 1 at least since the posting list is nonempty
    _complain('search for term', $_) and $df = 1 if $df < 1;

    # This is the inverse document frequency. The log of the inverse
    # fraction of documents the term occurs in.
    $idf[$i] = log($self->{records}/$df);

    my ($did,$occ);
    if ($self->{reorg}) {
      ($did,$occ) = unpack 'w2', $self->{db}->{$_};
    } else {                    # Maybe this costs more than it helps
      ($did,$occ) = unpack 'w2', $self->sort_postings($self->{db}->{$_});
    }
    my $freq      = $self->{db}->{$M, $did};
    my $max       = $occ/$freq*$idf[$i];
    $max[$i]      = $max + $max[$i+1];
  }

  # Main loop 
  for my $i (0 .. $#terms) {
    my $term = $terms[$i];
    # Unpack posting list for current query term $term. We loose the
    # sorting order because the assignment to a hash.
    my %post = unpack 'w*', $self->{db}->{$term};

    _complain('search for term', $term)
      if $self->{db}->{$O,$term} != keys %post;

    my $idf  = $idf[$i];
    my $full;                   # Need to process all postings
    my $chop;                   # Score necessary to enter the ranking list

    if (# We know that wanted is true since we especial cased the
        # exhaustive search.

        $wanted and

        # We did sort here if necessary in
        # the preparation loop
        # $self->{reorg} and

        scalar keys %score > $wanted) {
      $chop = (sort { $b <=> $a } values %score)[$wanted];
      $full = $max[$i] > $chop;
    } else {
      $full = 1;
    }

    if ($full) {
      # We need to inspect the full list. Either $wanted is not given,
      # the index is not sorted, or we don't have enough accumulators
      # yet.
      if (defined $chop) {
        # We might be able to avoid allocating accumulators
        for my $did (keys %post) {
          if (my $freq = $self->{db}->{$M, $did}) {
            my $wgt = $post{$did} / $freq * $idf;
            # We add an accumulator if $wgt exeeds $chop
            if (exists $score{$did} or $wgt > $chop) {
              $score{$did} += $wgt;
            }
          }
        }
      } else {
        # Allocate acumulators for each seen document.
        for my $did (keys %post) {
          if (my $freq = $self->{db}->{$M, $did}) {
            $score{$did} += $post{$did} / $freq * $idf;
          }
        }
      }
    } else {
      # Update existing accumulators
      for my $did (keys %score) {
        next unless exists $post{$did};
        if (my $freq = $self->{db}->{$M, $did}) {
          $score{$did} += $post{$did} / $freq * $idf;
        }
      }
    }
  }
  #warn sprintf "Used %d accumulators\n", scalar keys %score;
  %score;
}

sub set {
  my ($self, $attr, $value) = @_;

  die "No such index attribute: '$attr'" unless $attr eq 'top';

  return delete $self->{reorg} if $value == 0;

  return if     $self->{reorg};     # we are sorted already
  return unless $self->{mode} & O_RDWR;
  defined $self->{db} or $self->open;

  $self->sync;
  while (my($key, $value) = each %{$self->{db}}) {
    next if $key =~ /^\377[om]/;
    $self->{db}->{$key} = $self->sort_postings($value);
  }
  $self->{reorg} = 1;
}

sub sync {
  my $self = shift;

  if ($self->{mode} & O_RDWR) {
    print STDERR "Flushing $self->{cached} postings\n" if $self->{cached};
    while (my($key, $value) = each %{$self->{cache}}) {
      if ($self->{reorg}) {
        $self->{db}->{$key} = $self->sort_postings($self->{db}->{$key}
                                                   . $value);
      } else {
        $self->{db}->{$key} .= $value;
      }
    }
    while (my($key, $value) = each %{$self->{cdict}}) {
      $self->{db}->{$key} = 0 unless  $self->{db}->{$key};
      $self->{db}->{$key} += $value;
    }
    $self->{cache}  = {};
    $self->{cdict}  = {};
    $self->{cached} = 0;
  }
}

sub close {
  my $self = shift;

  if ($self->{dbh}) {
    $self->sync;
    delete $self->{dbh};
    untie %{$self->{db}};
    delete $self->{db};
    delete $self->{func};
    delete $self->{cache};
    delete $self->{cached};
    delete $self->{cdict};
    delete $self->{pfunc} if defined $self->{pfunc};
    delete $self->{ifunc} if defined $self->{ifunc};
    delete $self->{xfunc} if defined $self->{xfunc};
  }
}

1;

1	# -- Mode: Perl --
2	# $Basename: InvertedIndex.pm $
3	# $Revision: 1.2 $
4	# Author : Ulrich Pfeifer
5	# Created On : Thu Aug 8 13:05:10 1996
6	# Last Modified By: Ulrich Pfeifer
7	# Last Modified On: Sat Apr 27 18:39:21 2002
8	# Language : CPerl
9	#
10	# (C) Copyright 1996-2000, Ulrich Pfeifer
11	#
12
13	package WAIT::InvertedIndexOld;
14	use strict;
15	use DB_File;
16	use Fcntl;
17	use WAIT::Filter;
18	use Carp;
19	use vars qw(%FUNC);
20
21	my $O = pack('C', 0xff)."o"; # occurances (document ferquency)
22
23	# The document frequency is the number of documents a term occurs
24	# in. The idea is that a term occuring in a significant part of the
25	# documents is not too significant.
26
27	my $M = pack('C', 0xff)."m"; # maxtf (term frequency)
28
29	# The maximum term frequency of a document is the frequency of the
30	# most frequent term in the document. It is related to the document
31	# length obviously. A document in which the most frequnet term occurs
32	# 100 times is probably much longer than a document whichs most
33	# frequent term occurs five time.
34
35	sub new {
36	my $type = shift;
37	my %parm = @_;
38	my $self = {};
39
40	$self->{file} = $parm{file} or croak "No file specified";
41	$self->{attr} = $parm{attr} or croak "No attributes specified";
42	$self->{filter} = $parm{filter};
43	$self->{'name'} = $parm{'name'};
44	$self->{records} = 0;
45	for (qw(intervall prefix)) {
46	if (exists $parm{$_}) {
47	if (ref $parm{$_}) {
48	$self->{$_} = [@{$parm{$_}}] # clone
49	} else {
50	$self->{$_} = $parm{$_}
51	}
52	}
53	}
54	bless $self, ref($type) \|\| $type;
55	}
56
57	sub name {$_[0]->{'name'}}
58
59	sub _split_pos {
60	my ($text, $pos) = @{$_[0]};
61	my @result;
62
63	$text =~ s/(^\s+)// and $pos += length($1);
64	while ($text =~ s/(^\S+)//) {
65	my $word = $1;
66	push @result, [$word, $pos];
67	$pos += length($word);
68	$text =~ s/(^\s+)// and $pos += length($1);
69	}
70	@result;
71	}
72
73	sub _xfiltergen {
74	my $filter = pop @_;
75
76	# Oops, we cannot overrule the user's choice. Other filters may kill
77	# stopwords, such as isotr clobbers "isn't" to "isnt".
78
79	# if ($filter eq 'stop') { # avoid the slow stopword elimination
80	# return _xfiltergen(@_); # it's cheaper to look them up afterwards
81	# }
82	if (@_) {
83	if ($filter =~ /^split(\d*)/) {
84	if ($1) {
85	"grep(length(\$_->[0])>=$1, map(&WAIT::Filter::split_pos(\$_), " . _xfiltergen(@_) .'))' ;
86	} else {
87	"map(&WAIT::Filter::split_pos(\$_), " . _xfiltergen(@_) .')' ;
88	}
89	} else {
90	"map ([&WAIT::Filter::$filter(\$_->[0]), \$_->[1]]," ._xfiltergen(@_) .')';
91	}
92	} else {
93	if ($filter =~ /^split(\d*)/) {
94	if ($1) {
95	"grep(length(\$_->[0])>=$1, map(&WAIT::Filter::split_pos(\$_), [\$_[0], 0]))" ;
96	} else {
97	"map(&WAIT::Filter::split_pos(\$_), [\$_[0], 0])" ;
98	}
99	} else {
100	"map ([&WAIT::Filter::$filter(\$_->[0]), \$_->[1]], [\$_[0], 0])";
101	}
102	}
103	}
104
105	sub parse_pos {
106	my $self = shift;
107
108	unless (exists $self->{xfunc}) {
109	$self->{xfunc} =
110	eval sprintf("sub {%s}", _xfiltergen(@{$self->{filter}}));
111	#printf "\nsub{%s}$@\n", _xfiltergen(@{$self->{filter}});
112	}
113	&{$self->{xfunc}}($_[0]);
114	}
115
116	sub _filtergen {
117	my $filter = pop @_;
118
119	if (@_) {
120	"map(&WAIT::Filter::$filter(\$_), " . _filtergen(@_) . ')';
121	} else {
122	"map(&WAIT::Filter::$filter(\$_), \@_)";
123	}
124	}
125
126	sub drop {
127	my $self = shift;
128	if ((caller)[0] eq 'WAIT::Table') { # Table knows about this
129	my $file = $self->{file};
130
131	! (!-e $file or unlink $file);
132	} else { # notify our database
133	croak ref($self)."::drop called directly";
134	}
135	}
136
137	sub open {
138	my $self = shift;
139	my $file = $self->{file};
140
141	if (defined $self->{dbh}) {
142	$self->{dbh};
143	} else {
144	$self->{func} =
145	eval sprintf("sub {grep /./, %s}", _filtergen(@{$self->{filter}}));
146	$self->{dbh} = tie(%{$self->{db}}, 'DB_File', $file,
147	$self->{mode}, 0664, $DB_BTREE);
148	$self->{cache} = {}
149	if $self->{mode} & O_RDWR;
150	$self->{cdict} = {}
151	if $self->{mode} & O_RDWR;
152	$self->{cached} = 0;
153	}
154	}
155
156	sub insert {
157	my $self = shift;
158	my $key = shift;
159	my %occ;
160
161	defined $self->{db} or $self->open;
162	grep $occ{$_}++, &{$self->{func}}(@_);
163	my ($word, $noc);
164	$self->{records}++;
165	while (($word, $noc) = each %occ) {
166	if (defined $self->{cache}->{$word}) {
167	$self->{cdict}->{$O,$word}++;
168	$self->{cache}->{$word} .= pack 'w2', $key, $noc;
169	} else {
170	$self->{cdict}->{$O,$word} = 1;
171	$self->{cache}->{$word} = pack 'w2', $key, $noc;
172	}
173	$self->{cached}++;
174	}
175	# This cache limit should be configurable
176	$self->sync if $self->{cached} > 100_000;
177	my $maxtf = 0;
178	for (values %occ) {
179	$maxtf = $_ if $_ > $maxtf;
180	}
181	$self->{db}->{$M, $key} = $maxtf;
182	}
183
184	# We sort postings by increasing max term frequency (~ by increasing
185	# document length. This reduces the quality degradation if we process
186	# only the first part of a posting list.
187
188	sub sort_postings {
189	my $self = shift;
190	my $post = shift; # reference to a hash or packed string
191
192	if (ref $post) {
193	# we skip the sort part, if the index is not sorted
194	return pack('w*', %$post) unless $self->{reorg};
195	} else {
196	$post = { unpack 'w*', $post };
197	}
198
199	my $r = '';
200
201	# Sort posting list by increasing ratio of maximum term frequency (~
202	# "document length") and term frequency. This rati multipied by the
203	# inverse document frequence gives the score for a term. This sort
204	# order can be exploited for tuning of single term queries.
205
206	for my $did (sort { $post->{$b} / $self->{db}->{$M, $b}
207	<=>
208	$post->{$a} / $self->{db}->{$M, $a}
209	} keys %$post) {
210	$r .= pack 'w2', $did, $post->{$did};
211	}
212	#warn sprintf "reorg %d %s\n", scalar keys %$post, join ' ', unpack 'w*', $r;
213	$r;
214	}
215
216	sub delete {
217	my $self = shift;
218	my $key = shift;
219	my %occ;
220
221	my $db;
222	defined $self->{db} or $self->open;
223	$db = $self->{db};
224	$self->sync;
225	$self->{records}--;
226
227	# less than zero documents in database?
228	_complain('delete of document', $key) and $self->{records} = 0
229	if $self->{records} < 0;
230
231	grep $occ{$_}++, &{$self->{func}}(@_);
232
233	for (keys %occ) {# may reorder posting list
234	my %post = unpack 'w*', $db->{$_};
235	delete $post{$key};
236	$db->{$_} = $self->sort_postings(\%post);
237	_complain('delete of term', $_) if $db->{$O,$_}-1 != keys %post;
238	$db->{$O,$_} = scalar keys %post;
239	}
240	delete $db->{$M, $key};
241	}
242
243	sub intervall {
244	my ($self, $first, $last) = @_;
245	my $value = '';
246	my $word = '';
247	my @result;
248
249	return unless exists $self->{'intervall'};
250
251	defined $self->{db} or $self->open;
252	$self->sync;
253	my $dbh = $self->{dbh}; # for convenience
254
255	if (ref $self->{'intervall'}) {
256	unless (exists $self->{'ifunc'}) {
257	$self->{'ifunc'} =
258	eval sprintf("sub {grep /./, %s}", _filtergen(@{$self->{intervall}}));
259	}
260	($first) = &{$self->{'ifunc'}}($first) if $first;
261	($last) = &{$self->{'ifunc'}}($last) if $last;
262	}
263	if (defined $first and $first ne '') { # set the cursor to $first
264	$dbh->seq($first, $value, R_CURSOR);
265	} else {
266	$dbh->seq($first, $value, R_FIRST);
267	}
268	# We assume that word do not start with the character \377
269	# $last = pack 'C', 0xff unless defined $last and $last ne '';
270	return () if defined $last and $first gt $last; # $first would be after the last word
271
272	push @result, $first;
273	while (!$dbh->seq($word, $value, R_NEXT)) {
274	# We should limit this to a "resonable" number of words
275	last if (defined $last and $word gt $last) or $word =~ /^($M\|$O)/o;
276	push @result, $word;
277	}
278	\@result; # speed
279	}
280
281	sub prefix {
282	my ($self, $prefix) = @_;
283	my $value = '';
284	my $word = '';
285	my @result;
286
287	return () unless defined $prefix; # Full dictionary requested !!
288	return unless exists $self->{'prefix'};
289	defined $self->{db} or $self->open;
290	$self->sync;
291	my $dbh = $self->{dbh};
292
293	if (ref $self->{'prefix'}) {
294	unless (exists $self->{'pfunc'}) {
295	$self->{'pfunc'} =
296	eval sprintf("sub {grep /./, %s}", _filtergen(@{$self->{prefix}}));
297	}
298	($prefix) = &{$self->{'pfunc'}}($prefix);
299	}
300
301	if ($dbh->seq($word = $prefix, $value, R_CURSOR)) {
302	return ();
303	}
304	return () if $word !~ /^$prefix/;
305	push @result, $word;
306
307	while (!$dbh->seq($word, $value, R_NEXT)) {
308	# We should limit this to a "resonable" number of words
309	last if $word !~ /^$prefix/;
310	push @result, $word;
311	}
312	\@result; # speed
313	}
314
315	=head2 search($query)
316
317	The search method supports a range of search algorithms. It is
318	recommended to tune the index by calling
319	C<$table-E<gt>set(top=E<gt>1)> B<after> bulk inserting the documents
320	into the table. This is a computing intense operation and all inserts
321	and deletes after this optimization are slightly more expensive. Once
322	reorganized, the index is kept sorted automatically until you switch
323	the optimization off by calling C<$table-E<gt>set(top=E<gt>0)>.
324
325	When searching a tuned index, a query can be processed faster if the
326	caller requests only the topmost documents. This can be done by
327	passing a C<top =E<gt>> I<n> parameter to the search method.
328
329	For single term queries, the method returns only the I<n> top ranking
330	documents. For multi term queries two optimized algorithms are
331	available. The first algorithm computes the top n documents
332	approximately but very fast, sacrificing a little bit of precision for
333	speed. The second algorithm computes the topmost I<n> documents
334	precisely. This algorithm is slower and should be used only for small
335	values of I<n>. It can be requested by passing the query attribute
336	C<picky =E<gt> 1>. Both algorithms may return more than I<n> hits.
337	While the picky version might not be faster than the brute force
338	version on average for modest size databases it uses less memory and
339	the processing time is almost linear in the number of query terms, not
340	in the size of the lists.
341
342	=cut
343
344	sub search {
345	my $self = shift;
346	my $query = shift;
347
348	defined $self->{db} or $self->open;
349	$self->sync;
350	$self->search_raw($query, &{$self->{func}}(@_)); # No call to parse() here
351	}
352
353	sub parse {
354	my $self = shift;
355
356	defined $self->{db} or $self->open;
357	&{$self->{func}}(@_);
358	}
359
360	sub keys {
361	my $self = shift;
362
363	defined $self->{db} or $self->open;
364	keys %{$self->{db}};
365	}
366
367	sub search_prefix {
368	my $self = shift;
369
370	# print "search_prefix(@_)\n";
371	defined $self->{db} or $self->open;
372	$self->search_raw(map($self->prefix($_), @_));
373	}
374
375	sub _complain ($$) {
376	my ($action, $term) = @_;
377
378	require Carp;
379	Carp::cluck
380	(sprintf("WAIT database inconsistency during $action [%s]: ".
381	"Please rebuild index\n",
382	$term,));
383	}
384
385	sub search_raw {
386	my $self = shift;
387	my $query = shift;
388	my %score;
389
390	# Top $wanted documents must be correct. Zero means all matching
391	# documents.
392	my $wanted = $query->{top};
393	my $strict = $query->{picky};
394
395	# Return at least $minacc documents. Zero means all matching
396	# documents.
397	# my $minacc = $query->{accus} \|\| $wanted;
398
399	# Open index and flush cache if necessary
400	defined $self->{db} or $self->open;
401	$self->sync;
402
403	# We keep duplicates
404	my @terms =
405	# Sort words by decreasing document frequency
406	sort { $self->{db}->{$O,$a} <=> $self->{db}->{$O,$b} }
407	# check which words occur in the index.
408	grep { $self->{db}->{$O,$_} } @_;
409
410	return () unless @terms; # nothing to search for
411
412	# We special-case one term queries here. If the index was sorted,
413	# choping off the rest of the list will return the same ranking.
414	if ($wanted and @terms == 1) {
415	my $term = shift @terms;
416	my $idf = log($self->{records}/$self->{db}->{$O,$term});
417	my @res;
418
419	if ($self->{reorg}) { # or not $query->{picky}
420	@res = unpack "w". int(2*$wanted), $self->{db}->{$term};
421	} else {
422	@res = unpack 'w*', $self->{db}->{$term};
423	}
424
425	for (my $i=1; $i<@res; $i+=2) {
426	$res[$i] /= $self->{db}->{$M, $res[$i-1]} / $idf;
427	}
428
429	return @res
430	}
431
432	# We separate exhaustive search here to avoid overhead and make the
433	# code more readable. The block can be removed without changing the
434	# result.
435	unless ($wanted) {
436	for (@terms) {
437	my $df = $self->{db}->{$O,$_};
438
439	# The frequency must be 1 at least since the posting list is nonempty
440	_complain('search for term', $_) and $df = 1 if $df < 1;
441
442	# Unpack posting list for current query term $_
443	my %post = unpack 'w*', $self->{db}->{$_};
444
445	_complain('search for term', $_) if $self->{db}->{$O,$_} != keys %post;
446	# This is the inverse document frequency. The log of the inverse
447	# fraction of documents the term occurs in.
448	my $idf = log($self->{records}/$df);
449	for my $did (keys %post) {
450	if (my $freq = $self->{db}->{$M, $did}) {
451	$score{$did} += $post{$did} / $freq * $idf;
452	}
453	}
454	}
455	# warn sprintf "Used %d accumulators\n", scalar keys %score;
456	return %score;
457	}
458
459	# A sloppy but fast algorithm for multiple term queries.
460	unless ($strict) {
461	for (@terms) {
462	# Unpack posting list for current query term $_
463	my %post = unpack 'w*', $self->{db}->{$_};
464
465	# Lookup the number of documents the term occurs in (document frequency)
466	my $occ = $self->{db}->{$O,$_};
467
468	_complain('search for term', $_) if $self->{db}->{$O,$_} != keys %post;
469	# The frequency must be 1 at least since the posting list is nonempty
470	_complain('search for term', $_) and $occ = 1 if $occ < 1;
471
472	# This is the inverse document frequency. The log of the inverse
473	# fraction of documents the term occurs in.
474	my $idf = log($self->{records}/$occ);
475
476	# If we have a reasonable number of accumulators, change the
477	# loop to iterate over the accumulators. This will compromise
478	# quality for better speed. The algorithm still computes the
479	# exact weights, but the result is not guaranteed to contain the
480	# best results. The database might contain documents better
481	# than the worst returned document.
482
483	# We process the lists in order of increasing length. When the
484	# number of accumulators exceeds $wanted, no new documents are
485	# added, only the ranking/weighting of the seen documents is
486	# improved. The resulting ranking list must be pruned, since only
487	# the top most documents end up near their "optimal" rank.
488
489	if (keys %score < $wanted) {
490	for my $did (keys %post) {
491	if (my $freq = $self->{db}->{$M, $did}) {
492	$score{$did} += $post{$did} / $freq * $idf;
493	}
494	}
495	} else {
496	for my $did (keys %score) {
497	next unless exists $post{$did};
498	if (my $freq = $self->{db}->{$M, $did}) {
499	$score{$did} += $post{$did} / $freq * $idf;
500	}
501	}
502	}
503	}
504	return %score;
505	}
506	my @max; $max[$#terms+1]=0;
507	my @idf;
508
509	# Preparation loop. This extra loop makes sense only when "reorg"
510	# and "wanted" are true. But at the time beeing, keeping the code
511	# for the different search algorithms in one place seems more
512	# desirable than some minor speedup of the brute force version. We
513	# do cache $idf though.
514
515	for (my $i = $#terms; $i >=0; $i--) {
516	local $_ = $terms[$i];
517	# Lookup the number of documents the term occurs in (document frequency)
518	my $df = $self->{db}->{$O,$_};
519
520	# The frequency must be 1 at least since the posting list is nonempty
521	_complain('search for term', $_) and $df = 1 if $df < 1;
522
523	# This is the inverse document frequency. The log of the inverse
524	# fraction of documents the term occurs in.
525	$idf[$i] = log($self->{records}/$df);
526
527	my ($did,$occ);
528	if ($self->{reorg}) {
529	($did,$occ) = unpack 'w2', $self->{db}->{$_};
530	} else { # Maybe this costs more than it helps
531	($did,$occ) = unpack 'w2', $self->sort_postings($self->{db}->{$_});
532	}
533	my $freq = $self->{db}->{$M, $did};
534	my $max = $occ/$freq*$idf[$i];
535	$max[$i] = $max + $max[$i+1];
536	}
537
538	# Main loop
539	for my $i (0 .. $#terms) {
540	my $term = $terms[$i];
541	# Unpack posting list for current query term $term. We loose the
542	# sorting order because the assignment to a hash.
543	my %post = unpack 'w*', $self->{db}->{$term};
544
545	_complain('search for term', $term)
546	if $self->{db}->{$O,$term} != keys %post;
547
548	my $idf = $idf[$i];
549	my $full; # Need to process all postings
550	my $chop; # Score necessary to enter the ranking list
551
552	if (# We know that wanted is true since we especial cased the
553	# exhaustive search.
554
555	$wanted and
556
557	# We did sort here if necessary in
558	# the preparation loop
559	# $self->{reorg} and
560
561	scalar keys %score > $wanted) {
562	$chop = (sort { $b <=> $a } values %score)[$wanted];
563	$full = $max[$i] > $chop;
564	} else {
565	$full = 1;
566	}
567
568	if ($full) {
569	# We need to inspect the full list. Either $wanted is not given,
570	# the index is not sorted, or we don't have enough accumulators
571	# yet.
572	if (defined $chop) {
573	# We might be able to avoid allocating accumulators
574	for my $did (keys %post) {
575	if (my $freq = $self->{db}->{$M, $did}) {
576	my $wgt = $post{$did} / $freq * $idf;
577	# We add an accumulator if $wgt exeeds $chop
578	if (exists $score{$did} or $wgt > $chop) {
579	$score{$did} += $wgt;
580	}
581	}
582	}
583	} else {
584	# Allocate acumulators for each seen document.
585	for my $did (keys %post) {
586	if (my $freq = $self->{db}->{$M, $did}) {
587	$score{$did} += $post{$did} / $freq * $idf;
588	}
589	}
590	}
591	} else {
592	# Update existing accumulators
593	for my $did (keys %score) {
594	next unless exists $post{$did};
595	if (my $freq = $self->{db}->{$M, $did}) {
596	$score{$did} += $post{$did} / $freq * $idf;
597	}
598	}
599	}
600	}
601	#warn sprintf "Used %d accumulators\n", scalar keys %score;
602	%score;
603	}
604
605	sub set {
606	my ($self, $attr, $value) = @_;
607
608	die "No such index attribute: '$attr'" unless $attr eq 'top';
609
610	return delete $self->{reorg} if $value == 0;
611
612	return if $self->{reorg}; # we are sorted already
613	return unless $self->{mode} & O_RDWR;
614	defined $self->{db} or $self->open;
615
616	$self->sync;
617	while (my($key, $value) = each %{$self->{db}}) {
618	next if $key =~ /^\377[om]/;
619	$self->{db}->{$key} = $self->sort_postings($value);
620	}
621	$self->{reorg} = 1;
622	}
623
624	sub sync {
625	my $self = shift;
626
627	if ($self->{mode} & O_RDWR) {
628	print STDERR "Flushing $self->{cached} postings\n" if $self->{cached};
629	while (my($key, $value) = each %{$self->{cache}}) {
630	if ($self->{reorg}) {
631	$self->{db}->{$key} = $self->sort_postings($self->{db}->{$key}
632	. $value);
633	} else {
634	$self->{db}->{$key} .= $value;
635	}
636	}
637	while (my($key, $value) = each %{$self->{cdict}}) {
638	$self->{db}->{$key} = 0 unless $self->{db}->{$key};
639	$self->{db}->{$key} += $value;
640	}
641	$self->{cache} = {};
642	$self->{cdict} = {};
643	$self->{cached} = 0;
644	}
645	}
646
647	sub close {
648	my $self = shift;
649
650	if ($self->{dbh}) {
651	$self->sync;
652	delete $self->{dbh};
653	untie %{$self->{db}};
654	delete $self->{db};
655	delete $self->{func};
656	delete $self->{cache};
657	delete $self->{cached};
658	delete $self->{cdict};
659	delete $self->{pfunc} if defined $self->{pfunc};
660	delete $self->{ifunc} if defined $self->{ifunc};
661	delete $self->{xfunc} if defined $self->{xfunc};
662	}
663	}
664
665	1;
666