/[wait]/trunk/lib/WAIT/InvertedIndex.pm

This is a repository of my old source code, which is no longer updated. Go to git.rot13.org for current projects!

Diff of /trunk/lib/WAIT/InvertedIndex.pm

Parent Directory | Revision Log | View Patch

-revision 33 by ulpfr,
Sun Nov 12 13:25:57 2000 UTC
+revision 85 by ulpfr,
Fri May  3 16:16:10 2002 UTC
 Line 4
  # Author          : Ulrich Pfeifer
  # Created On      : Thu Aug  8 13:05:10 1996
  # Last Modified By: Ulrich Pfeifer
- # Last Modified On: Sun Nov 12 14:20:56 2000
+ # Last Modified On: Sat Apr 27 16:13:55 2002
  # Language        : CPerl
  #
- # (C) Copyright 1996-2000, Ulrich Pfeifer
+ # (C) Copyright 1996-2002, Ulrich Pfeifer
  #
  package WAIT::InvertedIndex;
  use strict;
- use DB_File;
+ use BerkeleyDB;
  use Fcntl;
  use WAIT::Filter;
  use Carp;
  use vars qw(%FUNC $VERSION);
- $VERSION = "1.801"; # others test if we are loaded by checking $VERSION
+ $VERSION = "1.900"; # others test if we are loaded by checking $VERSION
  # The dictionary has three different key types:
  #  'o'.$word
  #
  #     The document frequency is the number of documents a term occurs
- #     in. The idea is that a term occuring in a significant part of the
+ #     in. The idea is that a term occuring in a significant portion of the
  #     documents is not too significant.
  #
  # 'm'.$word
  #
  #     The maximum term frequency of a document is the frequency of the
  #     most frequent term in the document.  It is related to the document
- #     length obviously.  A document in which the most frequnet term occurs
+ #     length obviously.  A document in which the most frequent term occurs
  #     100 times is probably much longer than a document whichs most
  #     frequent term occurs five time.
  #
 Line 156 
 sub is_an_old_index {
    defined $self->{db} or $self->open;
    $self->sync;
-   my $dbh = $self->{dbh};       # for convenience
+   my $dbh = $self->{dbh} or return $self->{old_index} = 0;       # for convenience
    my $O = pack('C', 0xff)."o";
-   my ($word, $value) = ($O.$;);  # $word and $value are modified!
+   my ($word, $value) = ($O.$;);  # $word and $value are modified by seq!
-   $dbh->seq($word, $value, R_CURSOR) or return $self->{old_index} = 0;
+   if ( my $ret = $dbh->seq($word, $value, DB_CURSOR) ) {
+     # warn "DEBUG: ret[$ret], not an old index, either empty or no \$^O";
+     return $self->{old_index} = 0;
+   }
    for (my $i=0; $i<10;$i++) {
      if ($value !~ /^\d+$/) {
+       # warn "DEBUG: word[$word]value[$value], not an old index";
        return $self->{old_index} = 0;
      }
-     if ($dbh->seq($word, $value, R_NEXT) or # no values left
+     if (my $ret = $dbh->seq($word, $value, DB_NEXT) or # no values left
-         $word !~ /^$O/o                     # no $O values left
+         $word !~ /^$O$;/o                   # no $O values left
         ) {
        # we are not sure enough that this is an old index
+       # warn "DEBUG: ret[$ret]word[$word]value[$value], not an old index";
        return $self->{old_index} = 0;
      }
    }
+   # warn "DEBUG: old index";
    return $self->{old_index} = 1;
  }
-Line 184 
 sub open {
+Line 190 
 sub open {
    } else {
      $self->{func}     =
        eval sprintf("sub {grep /./, %s}", _filtergen(@{$self->{filter}}));
-     $self->{dbh} = tie(%{$self->{db}}, 'DB_File', $file,
+     $self->{dbh} = tie(%{$self->{db}}, 'BerkeleyDB::Btree',
-                        $self->{mode}, 0664, $DB_BTREE);
+                        -Filename => $self->{file},
+                        -Subname  => $self->{name},
+                        -Mode     => $self->{mode};
      $self->{cache} = {}
        if $self->{mode} & O_RDWR;
      $self->{cdict} = {}
-Line 245 
 sub sort_postings {
+Line 253 
 sub sort_postings {
    my $r = '';
    # Sort posting list by increasing ratio of maximum term frequency (~
-   # "document length") and term frequency. This rati multipied by the
+   # "document length") and term frequency. This ratio multipied by the
    # inverse document frequence gives the score for a term.  This sort
    # order can be exploited for tuning of single term queries.
-Line 322 
 sub intervall {
+Line 330 
 sub intervall {
    $last  = (defined $last)?'p'.$last:'q';
    # set the cursor to $first
-   $dbh->seq($first, $value, R_CURSOR);
+   $dbh->seq($first, $value, DB_CURSOR);
    # $first would be after the last word
    return () if $first gt $last;
    push @result, substr($first,1);
-   while (!$dbh->seq($word, $value, R_NEXT)) {
+   while (!$dbh->seq($word, $value, DB_NEXT)) {
      # We should limit this to a "resonable" number of words
      last if $word gt $last;
      push @result, substr($word,1);
-Line 356 
 sub prefix {
+Line 364 
 sub prefix {
      ($prefix) = &{$self->{'pfunc'}}($prefix);
    }
-   if ($dbh->seq($word = 'p'.$prefix, $value, R_CURSOR)) {
+   if ($dbh->seq($word = 'p'.$prefix, $value, DB_CURRENT)) {
      return ();
    }
    return () if $word !~ /^p$prefix/;
    push @result, substr($word,1);
-   while (!$dbh->seq($word, $value, R_NEXT)) {
+   while (!$dbh->seq($word, $value, DB_NEXT)) {
      # We should limit this to a "resonable" number of words
      last if $word !~ /^p$prefix/;
      push @result, substr($word,1);
-Line 405 
 sub search {
+Line 413 
 sub search {
    defined $self->{db} or $self->open;
    $self->sync;
-   $self->search_raw($query, &{$self->{func}}(@_)); # No call to parse() here
+   $self->search_raw($query, &{$self->{func}}(@_)); # No call to parse() there
  }
  sub parse {
-Line 415 
 sub parse {
+Line 423 
 sub parse {
    &{$self->{func}}(@_);
  }
- sub keys {
-   my $self  = shift;
-   defined $self->{db} or $self->open;
-   keys %{$self->{db}};
- }
  sub search_prefix {
    my $self  = shift;
-Line 465 
 sub search_raw {
+Line 466 
 sub search_raw {
        # check which words occur in the index.
        grep { $self->{db}->{'o'.$_} } @_;
-   return () unless @terms;                 # nothing to search for
+   return unless @terms;
    # We special-case one term queries here.  If the index was sorted,
    # choping off the rest of the list will return the same ranking.
-Line 614 
 sub search_raw {
+Line 615 
 sub search_raw {
      my $full;                   # Need to process all postings
      my $chop;                   # Score necessary to enter the ranking list
-     if (# We know that wanted is true since we especial cased the
+     if (# We know that wanted is true since we special cased the
          # exhaustive search.
          $wanted and
-Line 670 
 sub search_raw {
+Line 671 
 sub search_raw {
  sub set {
    my ($self, $attr, $value) = @_;
-   die "No such indexy attribute: '$attr'" unless $attr eq 'top';
+   die "No such index attribute: '$attr'" unless $attr eq 'top';
    return delete $self->{reorg} if $value == 0;
-Line 728 
 sub close {
+Line 729 
 sub close {
    }
  }
+ sub keys {
+   my $self  = shift;
+   defined $self->{db} or $self->open;
+   keys %{$self->{db}};
+ }
 ;

 Legend:

 - Removed from v.33
 -/+ changed lines (old line followed by its replacement)
 + Added in v.85

	ViewVC Help
Powered by ViewVC 1.1.26