/[wait]/cvs-head/lib/WAIT/InvertedIndex.pm

This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!

Diff of /cvs-head/lib/WAIT/InvertedIndex.pm

Parent Directory | Revision Log | View Patch Patch

-branches/CPAN/lib/WAIT/InvertedIndex.pm
revision 19 by ulpfr,
Tue May  9 11:29:45 2000 UTC
+cvs-head/lib/WAIT/InvertedIndex.pm
revision 40 by laperla,
Mon Nov 13 10:44:03 2000 UTC
 Line 4
  # Author          : Ulrich Pfeifer
  # Created On      : Thu Aug  8 13:05:10 1996
  # Last Modified By: Ulrich Pfeifer
- # Last Modified On: Tue May  9 08:33:28 2000
+ # Last Modified On: Sun Nov 12 14:40:21 2000
  # Language        : CPerl
  #
  # (C) Copyright 1996-2000, Ulrich Pfeifer
 Line 16 
 use DB_File;
  use Fcntl;
  use WAIT::Filter;
  use Carp;
- use vars qw(%FUNC);
+ use vars qw(%FUNC $VERSION);
- my $O = pack('C', 0xff)."o";                  # occurances (document ferquency)
+ $VERSION = "1.801"; # others test if we are loaded by checking $VERSION
- # The document frequency is the number of documents a term occurs
+ # The dictionary has three different key types:
- # in. The idea is that a term occuring in a significant part of the
+ #  'o'.$word
- # documents is not too significant.
+ #
+ #     The document frequency is the number of documents a term occurs
+ #     in. The idea is that a term occuring in a significant part of the
+ #     documents is not too significant.
+ #
+ # 'm'.$word
+ #
+ #     The maximum term frequency of a document is the frequency of the
+ #     most frequent term in the document.  It is related to the document
+ #     length obviously.  A document in which the most frequnet term occurs
+ #     100 times is probably much longer than a document whichs most
+ #     frequent term occurs five time.
+ #
+ # 'p'.$word
+ #
+ #     Under this key we store the actual posting list as pairs of
+ #     packed integers.
- my $M = pack('C', 0xff)."m";                  # maxtf (term frequency)
+ my $no_old_index_support = 0; # do not check for old indices if set
- # The maximum term frequency of a document is the frequency of the
- # most frequent term in the document.  It is related to the document
- # length obviously.  A document in which the most frequnet term occurs
- # 100 times is probably much longer than a document whichs most
- # frequent term occurs five time.
  sub new {
    my $type = shift;
-Line 134 
 sub drop {
+Line 144 
 sub drop {
    }
  }
+ sub is_an_old_index {
+   my $self = shift;
+   return 0 if $no_old_index_support;
+   return $self->{old_index} if exists $self->{old_index};
+   # We can only guess if this is an old index. We lookup the first 10
+   # $O entries. If all values are integers, we assume that the index
+   # is an old one.
+   defined $self->{db} or $self->open;
+   $self->sync;
+   my $dbh = $self->{dbh};       # for convenience
+   my $O = pack('C', 0xff)."o";
+   my ($word, $value) = ($O.$;);  # $word and $value are modified by seq!
+   if ( my $ret = $dbh->seq($word, $value, R_CURSOR) ) {
+     # warn "DEBUG: ret[$ret], not an old index, either empty or no \$^O";
+     return $self->{old_index} = 0;
+   }
+   for (my $i=0; $i<10;$i++) {
+     if ($value !~ /^\d+$/) {
+       # warn "DEBUG: word[$word]value[$value], not an old index";
+       return $self->{old_index} = 0;
+     }
+     if (my $ret = $dbh->seq($word, $value, R_NEXT) or # no values left
+         $word !~ /^$O$;/o                   # no $O values left
+        ) {
+       # we are not sure enough that this is an old index
+       # warn "DEBUG: ret[$ret]word[$word]value[$value], not an old index";
+       return $self->{old_index} = 0;
+     }
+   }
+   # warn "DEBUG: old index";
+   return $self->{old_index} = 1;
+ }
  sub open {
    my $self = shift;
    my $file = $self->{file};
-Line 150 
 sub open {
+Line 197 
 sub open {
      $self->{cdict} = {}
        if $self->{mode} & O_RDWR;
      $self->{cached} = 0;
+     if (!$no_old_index_support and $self->is_an_old_index()) {
+       warn "This is an old index, upgrade you database";
+       require WAIT::InvertedIndexOld;
+       bless $self, 'WAIT::InvertedIndexOld';
+     }
    }
  }
-Line 164 
 sub insert {
+Line 216 
 sub insert {
    $self->{records}++;
    while (($word, $noc) = each %occ) {
      if (defined $self->{cache}->{$word}) {
-       $self->{cdict}->{$O,$word}++;
+       $self->{cdict}->{$word}++;
        $self->{cache}->{$word} .= pack 'w2', $key, $noc;
      } else {
-       $self->{cdict}->{$O,$word} = 1;
+       $self->{cdict}->{$word} = 1;
        $self->{cache}->{$word}  = pack 'w2', $key, $noc;
      }
      $self->{cached}++;
-Line 178 
 sub insert {
+Line 230 
 sub insert {
    for (values %occ) {
      $maxtf = $_ if $_ > $maxtf;
    }
-   $self->{db}->{$M, $key} = $maxtf;
+   $self->{db}->{'m'. $key} = $maxtf;
  }
  # We sort postings by increasing max term frequency (~ by increasing
-Line 203 
 sub sort_postings {
+Line 255 
 sub sort_postings {
    # inverse document frequence gives the score for a term.  This sort
    # order can be exploited for tuning of single term queries.
-   for my $did (sort {    $post->{$b} / $self->{db}->{$M, $b}
+   for my $did (keys %$post) { # sanity check
+     unless ($self->{db}->{"m". $did}) {
+       warn "Warning from WAIT: DIVZERO threat from did[$did] post[$post->{$did}]";
+       $self->{db}->{"m". $did} = 1; # fails if we have not opened for writing
+     }
+   }
+   for my $did (sort {    $post->{$b} / $self->{db}->{'m'. $b}
                                        <=>
-                          $post->{$a} / $self->{db}->{$M, $a}
+                          $post->{$a} / $self->{db}->{'m'. $a}
                      } keys %$post) {
      $r .= pack 'w2', $did, $post->{$did};
    }
-Line 230 
 sub delete {
+Line 288 
 sub delete {
    grep $occ{$_}++, &{$self->{func}}(@_);
+   # Be prepared for "Odd number of elements in hash assignment"
+   local $SIG{__WARN__} = sub {
+     my $warning = shift;
+     chomp $warning;
+     warn "Catching warning[$warning] during delete of key[$key]";
+   };
    for (keys %occ) {# may reorder posting list
-     my %post = unpack 'w*', $db->{$_};
+     my %post = unpack 'w*', $db->{'p'.$_};
      delete $post{$key};
-     $db->{$_}    = $self->sort_postings(\%post);
+     $db->{'p'.$_}    = $self->sort_postings(\%post);
-     _complain('delete of term', $_) if $db->{$O,$_}-1 != keys %post;
+     _complain('delete of term', $_) if $db->{'o'.$_}-1 != keys %post;
-     $db->{$O,$_} = scalar keys %post;
+     $db->{'o'.$_} = scalar keys %post;
    }
-   delete $db->{$M, $key};
+   delete $db->{'m'. $key};
  }
  sub intervall {
-Line 260 
 sub intervall {
+Line 324 
 sub intervall {
      ($first) = &{$self->{'ifunc'}}($first) if $first;
      ($last)  = &{$self->{'ifunc'}}($last) if $last;
    }
-   if (defined $first and $first ne '') {         # set the cursor to $first
+   $first = 'p'.($first||'');
-     $dbh->seq($first, $value, R_CURSOR);
+   $last  = (defined $last)?'p'.$last:'q';
-   } else {
-     $dbh->seq($first, $value, R_FIRST);
+   # set the cursor to $first
-   }
+   $dbh->seq($first, $value, R_CURSOR);
-   # We assume that word do not start with the character \377
-   # $last = pack 'C', 0xff unless defined $last and $last ne '';
+   # $first would be after the last word
-   return () if defined $last and $first gt $last; # $first would be after the last word
+   return () if $first gt $last;
-   push @result, $first;
+   push @result, substr($first,1);
    while (!$dbh->seq($word, $value, R_NEXT)) {
      # We should limit this to a "resonable" number of words
-     last if (defined $last and $word gt $last) or $word =~ /^($M|$O)/o;
+     last if $word gt $last;
-     push @result, $word;
+     push @result, substr($word,1);
    }
    \@result;                     # speed
  }
-Line 298 
 sub prefix {
+Line 362 
 sub prefix {
      ($prefix) = &{$self->{'pfunc'}}($prefix);
    }
-   if ($dbh->seq($word = $prefix, $value, R_CURSOR)) {
+   if ($dbh->seq($word = 'p'.$prefix, $value, R_CURSOR)) {
      return ();
    }
-   return () if $word !~ /^$prefix/;
+   return () if $word !~ /^p$prefix/;
-   push @result, $word;
+   push @result, substr($word,1);
    while (!$dbh->seq($word, $value, R_NEXT)) {
      # We should limit this to a "resonable" number of words
-     last if $word !~ /^$prefix/;
+     last if $word !~ /^p$prefix/;
-     push @result, $word;
+     push @result, substr($word,1);
    }
    \@result;                     # speed
  }
-Line 403 
 sub search_raw {
+Line 467 
 sub search_raw {
    # We keep duplicates
    my @terms =
      # Sort words by decreasing document frequency
-     sort { $self->{db}->{$O,$a} <=> $self->{db}->{$O,$b} }
+     sort { $self->{db}->{'o'.$a} <=> $self->{db}->{'o'.$b} }
        # check which words occur in the index.
-       grep { $self->{db}->{$O,$_} } @_;
+       grep { $self->{db}->{'o'.$_} } @_;
-   return () unless @terms;                 # nothing to search for
+   return unless @terms;
    # We special-case one term queries here.  If the index was sorted,
    # choping off the rest of the list will return the same ranking.
    if ($wanted and @terms == 1) {
      my $term  = shift @terms;
-     my $idf   = log($self->{records}/$self->{db}->{$O,$term});
+     my $idf   = log($self->{records}/$self->{db}->{'o'.$term});
      my @res;
      if ($self->{reorg}) { # or not $query->{picky}
-       @res = unpack "w". int(2*$wanted), $self->{db}->{$term};
+       @res = unpack "w". int(2*$wanted), $self->{db}->{'p'.$term};
      } else {
-       @res = unpack 'w*',                $self->{db}->{$term};
+       @res = unpack 'w*',                $self->{db}->{'p'.$term};
      }
      for (my $i=1; $i<@res; $i+=2) {
-       $res[$i] /= $self->{db}->{$M, $res[$i-1]} / $idf;
+       # $res[$i] /= $self->{db}->{'m'. $res[$i-1]} / $idf;
+       # above was written badly, allows two DIV_ZERO problems.
+       my $maxtf = $self->{db}->{"m". $res[$i-1]};
+       unless ($maxtf) {
+         warn "WAIT-Warning: Averting DIVZERO for i[$i] \$res[\$i-1][$res[$i-1]] term[$term]";
+         $maxtf = 1;
+       }
+       $res[$i] = ($res[$i] / $maxtf) * $idf;
      }
      return @res
-Line 434 
 sub search_raw {
+Line 505 
 sub search_raw {
    # result.
    unless ($wanted) {
      for (@terms) {
-       my $df      = $self->{db}->{$O,$_};
+       my $df      = $self->{db}->{'o'.$_};
        # The frequency *must* be 1 at least since the posting list is nonempty
        _complain('search for term', $_) and $df = 1 if $df < 1;
        # Unpack posting list for current query term $_
-       my %post = unpack 'w*', $self->{db}->{$_};
+       my %post = unpack 'w*', $self->{db}->{'p'.$_};
-       _complain('search for term', $_) if $self->{db}->{$O,$_} != keys %post;
+       _complain('search for term', $_) if $self->{db}->{'o'.$_} != keys %post;
        # This is the inverse document frequency. The log of the inverse
        # fraction of documents the term occurs in.
        my $idf = log($self->{records}/$df);
        for my $did (keys %post) {
-         if (my $freq = $self->{db}->{$M, $did}) {
+         if (my $freq = $self->{db}->{'m'. $did}) {
            $score{$did} += $post{$did} / $freq * $idf;
          }
        }
-Line 460 
 sub search_raw {
+Line 531 
 sub search_raw {
    unless ($strict) {
      for (@terms) {
        # Unpack posting list for current query term $_
-       my %post = unpack 'w*', $self->{db}->{$_};
+       my %post = unpack 'w*', $self->{db}->{'p'.$_};
        # Lookup the number of documents the term occurs in (document frequency)
-       my $occ  = $self->{db}->{$O,$_};
+       my $occ  = $self->{db}->{'o'.$_};
-       _complain('search for term', $_) if $self->{db}->{$O,$_} != keys %post;
+       _complain('search for term', $_) if $self->{db}->{'o'.$_} != keys %post;
        # The frequency *must* be 1 at least since the posting list is nonempty
        _complain('search for term', $_) and $occ = 1 if $occ < 1;
-Line 488 
 sub search_raw {
+Line 559 
 sub search_raw {
        if (keys %score < $wanted) {
          for my $did (keys %post) {
-           if (my $freq = $self->{db}->{$M, $did}) {
+           if (my $freq = $self->{db}->{'m'. $did}) {
              $score{$did} += $post{$did} / $freq * $idf;
            }
          }
        } else {
          for my $did (keys %score) {
            next unless exists $post{$did};
-           if (my $freq = $self->{db}->{$M, $did}) {
+           if (my $freq = $self->{db}->{'m'. $did}) {
              $score{$did} += $post{$did} / $freq * $idf;
            }
          }
-Line 515 
 sub search_raw {
+Line 586 
 sub search_raw {
    for (my $i = $#terms; $i >=0; $i--) {
      local $_ = $terms[$i];
      # Lookup the number of documents the term occurs in (document frequency)
-     my $df      = $self->{db}->{$O,$_};
+     my $df      = $self->{db}->{'o'.$_};
      # The frequency *must* be 1 at least since the posting list is nonempty
      _complain('search for term', $_) and $df = 1 if $df < 1;
-Line 526 
 sub search_raw {
+Line 597 
 sub search_raw {
      my ($did,$occ);
      if ($self->{reorg}) {
-       ($did,$occ) = unpack 'w2', $self->{db}->{$_};
+       ($did,$occ) = unpack 'w2', $self->{db}->{'p'.$_};
      } else {                    # Maybe this costs more than it helps
-       ($did,$occ) = unpack 'w2', $self->sort_postings($self->{db}->{$_});
+       ($did,$occ) = unpack 'w2', $self->sort_postings($self->{db}->{'p'.$_});
      }
-     my $freq      = $self->{db}->{$M, $did};
+     my $freq      = $self->{db}->{'m'. $did};
      my $max       = $occ/$freq*$idf[$i];
      $max[$i]      = $max + $max[$i+1];
    }
-Line 540 
 sub search_raw {
+Line 611 
 sub search_raw {
      my $term = $terms[$i];
      # Unpack posting list for current query term $term. We loose the
      # sorting order because the assignment to a hash.
-     my %post = unpack 'w*', $self->{db}->{$term};
+     my %post = unpack 'w*', $self->{db}->{'p'.$term};
      _complain('search for term', $term)
-       if $self->{db}->{$O,$term} != keys %post;
+       if $self->{db}->{'o'.$term} != keys %post;
      my $idf  = $idf[$i];
      my $full;                   # Need to process all postings
-Line 572 
 sub search_raw {
+Line 643 
 sub search_raw {
        if (defined $chop) {
          # We might be able to avoid allocating accumulators
          for my $did (keys %post) {
-           if (my $freq = $self->{db}->{$M, $did}) {
+           if (my $freq = $self->{db}->{'m'. $did}) {
              my $wgt = $post{$did} / $freq * $idf;
              # We add an accumulator if $wgt exeeds $chop
              if (exists $score{$did} or $wgt > $chop) {
-Line 583 
 sub search_raw {
+Line 654 
 sub search_raw {
        } else {
          # Allocate acumulators for each seen document.
          for my $did (keys %post) {
-           if (my $freq = $self->{db}->{$M, $did}) {
+           if (my $freq = $self->{db}->{'m'. $did}) {
              $score{$did} += $post{$did} / $freq * $idf;
            }
          }
-Line 592 
 sub search_raw {
+Line 663 
 sub search_raw {
        # Update existing accumulators
        for my $did (keys %score) {
          next unless exists $post{$did};
-         if (my $freq = $self->{db}->{$M, $did}) {
+         if (my $freq = $self->{db}->{'m'. $did}) {
            $score{$did} += $post{$did} / $freq * $idf;
          }
        }
-Line 615 
 sub set {
+Line 686 
 sub set {
    $self->sync;
    while (my($key, $value) = each %{$self->{db}}) {
-     next if $key =~ /^\377[om]/;
+     next if $key !~ /^p/;
      $self->{db}->{$key} = $self->sort_postings($value);
    }
    $self->{reorg} = 1;
-Line 627 
 sub sync {
+Line 698 
 sub sync {
    if ($self->{mode} & O_RDWR) {
      print STDERR "Flushing $self->{cached} postings\n" if $self->{cached};
      while (my($key, $value) = each %{$self->{cache}}) {
+       $self->{db}->{"p". $key} ||= "";
        if ($self->{reorg}) {
-         $self->{db}->{$key} = $self->sort_postings($self->{db}->{$key}
+         $self->{db}->{'p'.$key} = $self->sort_postings($self->{db}->{'p'.$key}
                                                     . $value);
        } else {
-         $self->{db}->{$key} .= $value;
+         $self->{db}->{'p'.$key} .= $value;
        }
      }
      while (my($key, $value) = each %{$self->{cdict}}) {
-       $self->{db}->{$key} = 0 unless  $self->{db}->{$key};
+       $self->{db}->{'o'.$key} = 0 unless  $self->{db}->{'o'.$key};
-       $self->{db}->{$key} += $value;
+       $self->{db}->{'o'.$key} += $value;
      }
      $self->{cache}  = {};
      $self->{cdict}  = {};

 Legend:



Removed from v.19
 


changed lines


 
Added in v.40
 Legend:



Removed from v.19
 


changed lines


 
Added in v.40
-Removed from v.19
+Added in v.40

	ViewVC Help
Powered by ViewVC 1.1.26