/[wait]/trunk/lib/WAIT/InvertedIndex.pm
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Diff of /trunk/lib/WAIT/InvertedIndex.pm

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 36 by ulpfr, Sun Nov 12 17:01:59 2000 UTC revision 85 by ulpfr, Fri May 3 16:16:10 2002 UTC
# Line 4  Line 4 
4  # Author          : Ulrich Pfeifer  # Author          : Ulrich Pfeifer
5  # Created On      : Thu Aug  8 13:05:10 1996  # Created On      : Thu Aug  8 13:05:10 1996
6  # Last Modified By: Ulrich Pfeifer  # Last Modified By: Ulrich Pfeifer
7  # Last Modified On: Sun Nov 12 14:40:21 2000  # Last Modified On: Sat Apr 27 16:13:55 2002
8  # Language        : CPerl  # Language        : CPerl
9  #  #
10  # (C) Copyright 1996-2000, Ulrich Pfeifer  # (C) Copyright 1996-2002, Ulrich Pfeifer
11  #  #
12    
13  package WAIT::InvertedIndex;  package WAIT::InvertedIndex;
14  use strict;  use strict;
15  use DB_File;  use BerkeleyDB;
16  use Fcntl;  use Fcntl;
17  use WAIT::Filter;  use WAIT::Filter;
18  use Carp;  use Carp;
19  use vars qw(%FUNC $VERSION);  use vars qw(%FUNC $VERSION);
20    
21  $VERSION = "1.801"; # others test if we are loaded by checking $VERSION  $VERSION = "1.900"; # others test if we are loaded by checking $VERSION
22    
23  # The dictionary has three different key types:  # The dictionary has three different key types:
24  #  'o'.$word  #  'o'.$word
25  #  #
26  #     The document frequency is the number of documents a term occurs  #     The document frequency is the number of documents a term occurs
27  #     in. The idea is that a term occuring in a significant part of the  #     in. The idea is that a term occuring in a significant portion of the
28  #     documents is not too significant.  #     documents is not too significant.
29  #  #
30  # 'm'.$word  # 'm'.$word
31  #  #
32  #     The maximum term frequency of a document is the frequency of the  #     The maximum term frequency of a document is the frequency of the
33  #     most frequent term in the document.  It is related to the document  #     most frequent term in the document.  It is related to the document
34  #     length obviously.  A document in which the most frequnet term occurs  #     length obviously.  A document in which the most frequent term occurs
35  #     100 times is probably much longer than a document whichs most  #     100 times is probably much longer than a document whichs most
36  #     frequent term occurs five time.  #     frequent term occurs five time.
37  #  #
# Line 156  sub is_an_old_index { Line 156  sub is_an_old_index {
156    
157    defined $self->{db} or $self->open;    defined $self->{db} or $self->open;
158    $self->sync;    $self->sync;
159    my $dbh = $self->{dbh};       # for convenience    my $dbh = $self->{dbh} or return $self->{old_index} = 0;       # for convenience
160    
161    my $O = pack('C', 0xff)."o";    my $O = pack('C', 0xff)."o";
162    my ($word, $value) = ($O.$;);  # $word and $value are modified!    my ($word, $value) = ($O.$;);  # $word and $value are modified by seq!
163    $dbh->seq($word, $value, R_CURSOR) or return $self->{old_index} = 0;    if ( my $ret = $dbh->seq($word, $value, DB_CURSOR) ) {
164        # warn "DEBUG: ret[$ret], not an old index, either empty or no \$^O";
165        return $self->{old_index} = 0;
166      }
167    for (my $i=0; $i<10;$i++) {    for (my $i=0; $i<10;$i++) {
168      if ($value !~ /^\d+$/) {      if ($value !~ /^\d+$/) {
169          # warn "DEBUG: word[$word]value[$value], not an old index";
170        return $self->{old_index} = 0;        return $self->{old_index} = 0;
171      }      }
172      if ($dbh->seq($word, $value, R_NEXT) or # no values left      if (my $ret = $dbh->seq($word, $value, DB_NEXT) or # no values left
173          $word !~ /^$O$;/o                   # no $O values left          $word !~ /^$O$;/o                   # no $O values left
174         ) {         ) {
175        # we are not sure enough that this is an old index        # we are not sure enough that this is an old index
176          # warn "DEBUG: ret[$ret]word[$word]value[$value], not an old index";
177        return $self->{old_index} = 0;        return $self->{old_index} = 0;
178      }      }
179    }    }
180      # warn "DEBUG: old index";
181    return $self->{old_index} = 1;    return $self->{old_index} = 1;
182  }  }
183    
# Line 184  sub open { Line 190  sub open {
190    } else {    } else {
191      $self->{func}     =      $self->{func}     =
192        eval sprintf("sub {grep /./, %s}", _filtergen(@{$self->{filter}}));        eval sprintf("sub {grep /./, %s}", _filtergen(@{$self->{filter}}));
193      $self->{dbh} = tie(%{$self->{db}}, 'DB_File', $file,      $self->{dbh} = tie(%{$self->{db}}, 'BerkeleyDB::Btree',
194                         $self->{mode}, 0664, $DB_BTREE);                         -Filename => $self->{file},
195                           -Subname  => $self->{name},
196                           -Mode     => $self->{mode};
197      $self->{cache} = {}      $self->{cache} = {}
198        if $self->{mode} & O_RDWR;        if $self->{mode} & O_RDWR;
199      $self->{cdict} = {}      $self->{cdict} = {}
# Line 245  sub sort_postings { Line 253  sub sort_postings {
253    my $r = '';    my $r = '';
254    
255    # Sort posting list by increasing ratio of maximum term frequency (~    # Sort posting list by increasing ratio of maximum term frequency (~
256    # "document length") and term frequency. This rati multipied by the    # "document length") and term frequency. This ratio multipied by the
257    # inverse document frequence gives the score for a term.  This sort    # inverse document frequence gives the score for a term.  This sort
258    # order can be exploited for tuning of single term queries.    # order can be exploited for tuning of single term queries.
259    
# Line 322  sub intervall { Line 330  sub intervall {
330    $last  = (defined $last)?'p'.$last:'q';    $last  = (defined $last)?'p'.$last:'q';
331    
332    # set the cursor to $first    # set the cursor to $first
333    $dbh->seq($first, $value, R_CURSOR);    $dbh->seq($first, $value, DB_CURSOR);
334    
335    # $first would be after the last word    # $first would be after the last word
336    return () if $first gt $last;    return () if $first gt $last;
337        
338    push @result, substr($first,1);    push @result, substr($first,1);
339    while (!$dbh->seq($word, $value, R_NEXT)) {    while (!$dbh->seq($word, $value, DB_NEXT)) {
340      # We should limit this to a "resonable" number of words      # We should limit this to a "resonable" number of words
341      last if $word gt $last;      last if $word gt $last;
342      push @result, substr($word,1);      push @result, substr($word,1);
# Line 356  sub prefix { Line 364  sub prefix {
364      ($prefix) = &{$self->{'pfunc'}}($prefix);      ($prefix) = &{$self->{'pfunc'}}($prefix);
365    }    }
366    
367    if ($dbh->seq($word = 'p'.$prefix, $value, R_CURSOR)) {    if ($dbh->seq($word = 'p'.$prefix, $value, DB_CURRENT)) {
368      return ();      return ();
369    }    }
370    return () if $word !~ /^p$prefix/;    return () if $word !~ /^p$prefix/;
371    push @result, substr($word,1);    push @result, substr($word,1);
372    
373    while (!$dbh->seq($word, $value, R_NEXT)) {    while (!$dbh->seq($word, $value, DB_NEXT)) {
374      # We should limit this to a "resonable" number of words      # We should limit this to a "resonable" number of words
375      last if $word !~ /^p$prefix/;      last if $word !~ /^p$prefix/;
376      push @result, substr($word,1);      push @result, substr($word,1);
# Line 405  sub search { Line 413  sub search {
413    
414    defined $self->{db} or $self->open;    defined $self->{db} or $self->open;
415    $self->sync;    $self->sync;
416    $self->search_raw($query, &{$self->{func}}(@_)); # No call to parse() here    $self->search_raw($query, &{$self->{func}}(@_)); # No call to parse() there
417  }  }
418    
419  sub parse {  sub parse {
# Line 415  sub parse { Line 423  sub parse {
423    &{$self->{func}}(@_);    &{$self->{func}}(@_);
424  }  }
425    
 sub keys {  
   my $self  = shift;  
   
   defined $self->{db} or $self->open;  
   keys %{$self->{db}};  
 }  
   
426  sub search_prefix {  sub search_prefix {
427    my $self  = shift;    my $self  = shift;
428    
# Line 465  sub search_raw { Line 466  sub search_raw {
466        # check which words occur in the index.        # check which words occur in the index.
467        grep { $self->{db}->{'o'.$_} } @_;        grep { $self->{db}->{'o'.$_} } @_;
468    
469    return () unless @terms;                 # nothing to search for    return unless @terms;
470    
471    # We special-case one term queries here.  If the index was sorted,    # We special-case one term queries here.  If the index was sorted,
472    # choping off the rest of the list will return the same ranking.    # choping off the rest of the list will return the same ranking.
# Line 614  sub search_raw { Line 615  sub search_raw {
615      my $full;                   # Need to process all postings      my $full;                   # Need to process all postings
616      my $chop;                   # Score necessary to enter the ranking list      my $chop;                   # Score necessary to enter the ranking list
617    
618      if (# We know that wanted is true since we especial cased the      if (# We know that wanted is true since we special cased the
619          # exhaustive search.          # exhaustive search.
620    
621          $wanted and          $wanted and
# Line 670  sub search_raw { Line 671  sub search_raw {
671  sub set {  sub set {
672    my ($self, $attr, $value) = @_;    my ($self, $attr, $value) = @_;
673    
674    die "No such indexy attribute: '$attr'" unless $attr eq 'top';    die "No such index attribute: '$attr'" unless $attr eq 'top';
675    
676    return delete $self->{reorg} if $value == 0;    return delete $self->{reorg} if $value == 0;
677    
# Line 728  sub close { Line 729  sub close {
729    }    }
730  }  }
731    
732    sub keys {
733      my $self  = shift;
734    
735      defined $self->{db} or $self->open;
736      keys %{$self->{db}};
737    }
738    
739  1;  1;
740    

Legend:
Removed from v.36  
changed lines
  Added in v.85

  ViewVC Help
Powered by ViewVC 1.1.26