/[wait]/cvs-head/lib/WAIT/InvertedIndex.pm
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Diff of /cvs-head/lib/WAIT/InvertedIndex.pm

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 22 by ulpfr, Sat Nov 11 16:58:53 2000 UTC revision 51 by ulpfr, Mon Dec 31 14:00:22 2001 UTC
# Line 4  Line 4 
4  # Author          : Ulrich Pfeifer  # Author          : Ulrich Pfeifer
5  # Created On      : Thu Aug  8 13:05:10 1996  # Created On      : Thu Aug  8 13:05:10 1996
6  # Last Modified By: Ulrich Pfeifer  # Last Modified By: Ulrich Pfeifer
7  # Last Modified On: Sat Nov 11 16:32:38 2000  # Last Modified On: Mon Dec 31 14:30:05 2001
8  # Language        : CPerl  # Language        : CPerl
9  #  #
10  # (C) Copyright 1996-2000, Ulrich Pfeifer  # (C) Copyright 1996-2000, Ulrich Pfeifer
# Line 16  use DB_File; Line 16  use DB_File;
16  use Fcntl;  use Fcntl;
17  use WAIT::Filter;  use WAIT::Filter;
18  use Carp;  use Carp;
19  use vars qw(%FUNC);  use vars qw(%FUNC $VERSION);
20    
21    $VERSION = "1.801"; # others test if we are loaded by checking $VERSION
22    
23  # The dictionary has three different key types:  # The dictionary has three different key types:
24  #  'o'.$word  #  'o'.$word
# Line 157  sub is_an_old_index { Line 159  sub is_an_old_index {
159    my $dbh = $self->{dbh};       # for convenience    my $dbh = $self->{dbh};       # for convenience
160    
161    my $O = pack('C', 0xff)."o";    my $O = pack('C', 0xff)."o";
162    my ($word, $value) = ($O.$;);    my ($word, $value) = ($O.$;);  # $word and $value are modified by seq!
163    $dbh->seq($word, $value, R_CURSOR);    if ( my $ret = $dbh->seq($word, $value, R_CURSOR) ) {
164        # warn "DEBUG: ret[$ret], not an old index, either empty or no \$^O";
165        return $self->{old_index} = 0;
166      }
167    for (my $i=0; $i<10;$i++) {    for (my $i=0; $i<10;$i++) {
168      if ($value !~ /^\d+$/) {      if ($value !~ /^\d+$/) {
169          # warn "DEBUG: word[$word]value[$value], not an old index";
170        return $self->{old_index} = 0;        return $self->{old_index} = 0;
171      }      }
172      if ($dbh->seq($word, $value, R_NEXT) or # no values left      if (my $ret = $dbh->seq($word, $value, R_NEXT) or # no values left
173          $word !~ /^$O/o                     # no $O values left          $word !~ /^$O$;/o                   # no $O values left
174         ) {         ) {
175        # we are not sure enough that this is an old index        # we are not sure enough that this is an old index
176          # warn "DEBUG: ret[$ret]word[$word]value[$value], not an old index";
177        return $self->{old_index} = 0;        return $self->{old_index} = 0;
178      }      }
179    }    }
180      # warn "DEBUG: old index";
181    return $self->{old_index} = 1;    return $self->{old_index} = 1;
182  }  }
183    
# Line 247  sub sort_postings { Line 255  sub sort_postings {
255    # inverse document frequence gives the score for a term.  This sort    # inverse document frequence gives the score for a term.  This sort
256    # order can be exploited for tuning of single term queries.    # order can be exploited for tuning of single term queries.
257    
258      for my $did (keys %$post) { # sanity check
259        unless ($self->{db}->{"m". $did}) {
260          warn "Warning from WAIT: DIVZERO threat from did[$did] post[$post->{$did}]";
261          $self->{db}->{"m". $did} = 1; # fails if we have not opened for writing
262        }
263      }
264    for my $did (sort {    $post->{$b} / $self->{db}->{'m'. $b}    for my $did (sort {    $post->{$b} / $self->{db}->{'m'. $b}
265                                        <=>                                        <=>
266                           $post->{$a} / $self->{db}->{'m'. $a}                           $post->{$a} / $self->{db}->{'m'. $a}
# Line 274  sub delete { Line 288  sub delete {
288    
289    grep $occ{$_}++, &{$self->{func}}(@_);    grep $occ{$_}++, &{$self->{func}}(@_);
290    
291      # Be prepared for "Odd number of elements in hash assignment"
292      local $SIG{__WARN__} = sub {
293        my $warning = shift;
294        chomp $warning;
295        warn "Catching warning[$warning] during delete of key[$key]";
296      };
297    for (keys %occ) {# may reorder posting list    for (keys %occ) {# may reorder posting list
298      my %post = unpack 'w*', $db->{'p'.$_};      my %post = unpack 'w*', $db->{'p'.$_};
299      delete $post{$key};      delete $post{$key};
# Line 401  sub parse { Line 421  sub parse {
421    &{$self->{func}}(@_);    &{$self->{func}}(@_);
422  }  }
423    
 sub keys {  
   my $self  = shift;  
   
   defined $self->{db} or $self->open;  
   keys %{$self->{db}};  
 }  
   
424  sub search_prefix {  sub search_prefix {
425    my $self  = shift;    my $self  = shift;
426    
# Line 451  sub search_raw { Line 464  sub search_raw {
464        # check which words occur in the index.        # check which words occur in the index.
465        grep { $self->{db}->{'o'.$_} } @_;        grep { $self->{db}->{'o'.$_} } @_;
466    
467    return () unless @terms;                 # nothing to search for    return unless @terms;
468    
469    # We special-case one term queries here.  If the index was sorted,    # We special-case one term queries here.  If the index was sorted,
470    # choping off the rest of the list will return the same ranking.    # choping off the rest of the list will return the same ranking.
# Line 467  sub search_raw { Line 480  sub search_raw {
480      }      }
481    
482      for (my $i=1; $i<@res; $i+=2) {      for (my $i=1; $i<@res; $i+=2) {
483        $res[$i] /= $self->{db}->{'m'. $res[$i-1]} / $idf;        # $res[$i] /= $self->{db}->{'m'. $res[$i-1]} / $idf;
484          # above was written badly, allows two DIV_ZERO problems.
485          my $maxtf = $self->{db}->{"m". $res[$i-1]};
486          unless ($maxtf) {
487            warn "WAIT-Warning: Averting DIVZERO for i[$i] \$res[\$i-1][$res[$i-1]] term[$term]";
488            $maxtf = 1;
489          }
490          $res[$i] = ($res[$i] / $maxtf) * $idf;
491      }      }
492    
493      return @res      return @res
# Line 671  sub sync { Line 691  sub sync {
691    if ($self->{mode} & O_RDWR) {    if ($self->{mode} & O_RDWR) {
692      print STDERR "Flushing $self->{cached} postings\n" if $self->{cached};      print STDERR "Flushing $self->{cached} postings\n" if $self->{cached};
693      while (my($key, $value) = each %{$self->{cache}}) {      while (my($key, $value) = each %{$self->{cache}}) {
694          $self->{db}->{"p". $key} ||= "";
695        if ($self->{reorg}) {        if ($self->{reorg}) {
696          $self->{db}->{'p'.$key} = $self->sort_postings($self->{db}->{'p'.$key}          $self->{db}->{'p'.$key} = $self->sort_postings($self->{db}->{'p'.$key}
697                                                     . $value);                                                     . $value);
# Line 706  sub close { Line 727  sub close {
727    }    }
728  }  }
729    
730    sub keys {
731      my $self  = shift;
732    
733      defined $self->{db} or $self->open;
734      keys %{$self->{db}};
735    }
736    
737  1;  1;
738    

Legend:
Removed from v.22  
changed lines
  Added in v.51

  ViewVC Help
Powered by ViewVC 1.1.26