/[wait]/cvs-head/lib/WAIT/InvertedIndex.pm
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Diff of /cvs-head/lib/WAIT/InvertedIndex.pm

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 20 by cvs2svn, Tue May 9 11:29:45 2000 UTC revision 51 by ulpfr, Mon Dec 31 14:00:22 2001 UTC
# Line 4  Line 4 
4  # Author          : Ulrich Pfeifer  # Author          : Ulrich Pfeifer
5  # Created On      : Thu Aug  8 13:05:10 1996  # Created On      : Thu Aug  8 13:05:10 1996
6  # Last Modified By: Ulrich Pfeifer  # Last Modified By: Ulrich Pfeifer
7  # Last Modified On: Tue May  9 08:33:28 2000  # Last Modified On: Mon Dec 31 14:30:05 2001
8  # Language        : CPerl  # Language        : CPerl
9  #  #
10  # (C) Copyright 1996-2000, Ulrich Pfeifer  # (C) Copyright 1996-2000, Ulrich Pfeifer
# Line 16  use DB_File; Line 16  use DB_File;
16  use Fcntl;  use Fcntl;
17  use WAIT::Filter;  use WAIT::Filter;
18  use Carp;  use Carp;
19  use vars qw(%FUNC);  use vars qw(%FUNC $VERSION);
20    
21  my $O = pack('C', 0xff)."o";                  # occurances (document ferquency)  $VERSION = "1.801"; # others test if we are loaded by checking $VERSION
22    
23  # The document frequency is the number of documents a term occurs  # The dictionary has three different key types:
24  # in. The idea is that a term occuring in a significant part of the  #  'o'.$word
25  # documents is not too significant.  #
26    #     The document frequency is the number of documents a term occurs
27    #     in. The idea is that a term occuring in a significant part of the
28    #     documents is not too significant.
29    #
30    # 'm'.$word
31    #
32    #     The maximum term frequency of a document is the frequency of the
33    #     most frequent term in the document.  It is related to the document
34    #     length obviously.  A document in which the most frequnet term occurs
35    #     100 times is probably much longer than a document whichs most
36    #     frequent term occurs five time.
37    #
38    # 'p'.$word
39    #
40    #     Under this key we store the actual posting list as pairs of
41    #     packed integers.
42    
43  my $M = pack('C', 0xff)."m";                  # maxtf (term frequency)  my $no_old_index_support = 0; # do not check for old indices if set
   
 # The maximum term frequency of a document is the frequency of the  
 # most frequent term in the document.  It is related to the document  
 # length obviously.  A document in which the most frequnet term occurs  
 # 100 times is probably much longer than a document whichs most  
 # frequent term occurs five time.  
44    
45  sub new {  sub new {
46    my $type = shift;    my $type = shift;
# Line 134  sub drop { Line 144  sub drop {
144    }    }
145  }  }
146    
147    sub is_an_old_index {
148      my $self = shift;
149    
150      return 0 if $no_old_index_support;
151      return $self->{old_index} if exists $self->{old_index};
152    
153      # We can only guess if this is an old index. We lookup the first 10
154      # $O entries. If all values are integers, we assume that the index
155      # is an old one.
156    
157      defined $self->{db} or $self->open;
158      $self->sync;
159      my $dbh = $self->{dbh};       # for convenience
160    
161      my $O = pack('C', 0xff)."o";
162      my ($word, $value) = ($O.$;);  # $word and $value are modified by seq!
163      if ( my $ret = $dbh->seq($word, $value, R_CURSOR) ) {
164        # warn "DEBUG: ret[$ret], not an old index, either empty or no \$^O";
165        return $self->{old_index} = 0;
166      }
167      for (my $i=0; $i<10;$i++) {
168        if ($value !~ /^\d+$/) {
169          # warn "DEBUG: word[$word]value[$value], not an old index";
170          return $self->{old_index} = 0;
171        }
172        if (my $ret = $dbh->seq($word, $value, R_NEXT) or # no values left
173            $word !~ /^$O$;/o                   # no $O values left
174           ) {
175          # we are not sure enough that this is an old index
176          # warn "DEBUG: ret[$ret]word[$word]value[$value], not an old index";
177          return $self->{old_index} = 0;
178        }
179      }
180      # warn "DEBUG: old index";
181      return $self->{old_index} = 1;
182    }
183    
184  sub open {  sub open {
185    my $self = shift;    my $self = shift;
186    my $file = $self->{file};    my $file = $self->{file};
# Line 150  sub open { Line 197  sub open {
197      $self->{cdict} = {}      $self->{cdict} = {}
198        if $self->{mode} & O_RDWR;        if $self->{mode} & O_RDWR;
199      $self->{cached} = 0;      $self->{cached} = 0;
200        if (!$no_old_index_support and $self->is_an_old_index()) {
201          warn "This is an old index, upgrade you database";
202          require WAIT::InvertedIndexOld;
203          bless $self, 'WAIT::InvertedIndexOld';
204        }
205    }    }
206  }  }
207    
# Line 164  sub insert { Line 216  sub insert {
216    $self->{records}++;    $self->{records}++;
217    while (($word, $noc) = each %occ) {    while (($word, $noc) = each %occ) {
218      if (defined $self->{cache}->{$word}) {      if (defined $self->{cache}->{$word}) {
219        $self->{cdict}->{$O,$word}++;        $self->{cdict}->{$word}++;
220        $self->{cache}->{$word} .= pack 'w2', $key, $noc;        $self->{cache}->{$word} .= pack 'w2', $key, $noc;
221      } else {      } else {          
222        $self->{cdict}->{$O,$word} = 1;        $self->{cdict}->{$word} = 1;
223        $self->{cache}->{$word}  = pack 'w2', $key, $noc;        $self->{cache}->{$word}  = pack 'w2', $key, $noc;
224      }      }
225      $self->{cached}++;      $self->{cached}++;
# Line 178  sub insert { Line 230  sub insert {
230    for (values %occ) {    for (values %occ) {
231      $maxtf = $_ if $_ > $maxtf;      $maxtf = $_ if $_ > $maxtf;
232    }    }
233    $self->{db}->{$M, $key} = $maxtf;    $self->{db}->{'m'. $key} = $maxtf;
234  }  }
235    
236  # We sort postings by increasing max term frequency (~ by increasing  # We sort postings by increasing max term frequency (~ by increasing
# Line 203  sub sort_postings { Line 255  sub sort_postings {
255    # inverse document frequence gives the score for a term.  This sort    # inverse document frequence gives the score for a term.  This sort
256    # order can be exploited for tuning of single term queries.    # order can be exploited for tuning of single term queries.
257    
258    for my $did (sort {    $post->{$b} / $self->{db}->{$M, $b}    for my $did (keys %$post) { # sanity check
259        unless ($self->{db}->{"m". $did}) {
260          warn "Warning from WAIT: DIVZERO threat from did[$did] post[$post->{$did}]";
261          $self->{db}->{"m". $did} = 1; # fails if we have not opened for writing
262        }
263      }
264      for my $did (sort {    $post->{$b} / $self->{db}->{'m'. $b}
265                                        <=>                                        <=>
266                           $post->{$a} / $self->{db}->{$M, $a}                           $post->{$a} / $self->{db}->{'m'. $a}
267                      } keys %$post) {                      } keys %$post) {
268      $r .= pack 'w2', $did, $post->{$did};      $r .= pack 'w2', $did, $post->{$did};
269    }    }
# Line 230  sub delete { Line 288  sub delete {
288    
289    grep $occ{$_}++, &{$self->{func}}(@_);    grep $occ{$_}++, &{$self->{func}}(@_);
290    
291      # Be prepared for "Odd number of elements in hash assignment"
292      local $SIG{__WARN__} = sub {
293        my $warning = shift;
294        chomp $warning;
295        warn "Catching warning[$warning] during delete of key[$key]";
296      };
297    for (keys %occ) {# may reorder posting list    for (keys %occ) {# may reorder posting list
298      my %post = unpack 'w*', $db->{$_};      my %post = unpack 'w*', $db->{'p'.$_};
299      delete $post{$key};      delete $post{$key};
300      $db->{$_}    = $self->sort_postings(\%post);      $db->{'p'.$_}    = $self->sort_postings(\%post);
301      _complain('delete of term', $_) if $db->{$O,$_}-1 != keys %post;      _complain('delete of term', $_) if $db->{'o'.$_}-1 != keys %post;
302      $db->{$O,$_} = scalar keys %post;      $db->{'o'.$_} = scalar keys %post;
303    }    }
304    delete $db->{$M, $key};    delete $db->{'m'. $key};
305  }  }
306    
307  sub intervall {  sub intervall {
# Line 260  sub intervall { Line 324  sub intervall {
324      ($first) = &{$self->{'ifunc'}}($first) if $first;      ($first) = &{$self->{'ifunc'}}($first) if $first;
325      ($last)  = &{$self->{'ifunc'}}($last) if $last;      ($last)  = &{$self->{'ifunc'}}($last) if $last;
326    }    }
327    if (defined $first and $first ne '') {         # set the cursor to $first    $first = 'p'.($first||'');
328      $dbh->seq($first, $value, R_CURSOR);    $last  = (defined $last)?'p'.$last:'q';
329    } else {  
330      $dbh->seq($first, $value, R_FIRST);    # set the cursor to $first
331    }    $dbh->seq($first, $value, R_CURSOR);
332    # We assume that word do not start with the character \377  
333    # $last = pack 'C', 0xff unless defined $last and $last ne '';    # $first would be after the last word
334    return () if defined $last and $first gt $last; # $first would be after the last word    return () if $first gt $last;
335        
336    push @result, $first;    push @result, substr($first,1);
337    while (!$dbh->seq($word, $value, R_NEXT)) {    while (!$dbh->seq($word, $value, R_NEXT)) {
338      # We should limit this to a "resonable" number of words      # We should limit this to a "resonable" number of words
339      last if (defined $last and $word gt $last) or $word =~ /^($M|$O)/o;      last if $word gt $last;
340      push @result, $word;      push @result, substr($word,1);
341    }    }
342    \@result;                     # speed    \@result;                     # speed
343  }  }
# Line 298  sub prefix { Line 362  sub prefix {
362      ($prefix) = &{$self->{'pfunc'}}($prefix);      ($prefix) = &{$self->{'pfunc'}}($prefix);
363    }    }
364    
365    if ($dbh->seq($word = $prefix, $value, R_CURSOR)) {    if ($dbh->seq($word = 'p'.$prefix, $value, R_CURSOR)) {
366      return ();      return ();
367    }    }
368    return () if $word !~ /^$prefix/;    return () if $word !~ /^p$prefix/;
369    push @result, $word;    push @result, substr($word,1);
370    
371    while (!$dbh->seq($word, $value, R_NEXT)) {    while (!$dbh->seq($word, $value, R_NEXT)) {
372      # We should limit this to a "resonable" number of words      # We should limit this to a "resonable" number of words
373      last if $word !~ /^$prefix/;      last if $word !~ /^p$prefix/;
374      push @result, $word;      push @result, substr($word,1);
375    }    }
376    \@result;                     # speed    \@result;                     # speed
377  }  }
# Line 357  sub parse { Line 421  sub parse {
421    &{$self->{func}}(@_);    &{$self->{func}}(@_);
422  }  }
423    
 sub keys {  
   my $self  = shift;  
   
   defined $self->{db} or $self->open;  
   keys %{$self->{db}};  
 }  
   
424  sub search_prefix {  sub search_prefix {
425    my $self  = shift;    my $self  = shift;
426    
# Line 403  sub search_raw { Line 460  sub search_raw {
460    # We keep duplicates    # We keep duplicates
461    my @terms =    my @terms =
462      # Sort words by decreasing document frequency      # Sort words by decreasing document frequency
463      sort { $self->{db}->{$O,$a} <=> $self->{db}->{$O,$b} }      sort { $self->{db}->{'o'.$a} <=> $self->{db}->{'o'.$b} }
464        # check which words occur in the index.        # check which words occur in the index.
465        grep { $self->{db}->{$O,$_} } @_;        grep { $self->{db}->{'o'.$_} } @_;
466    
467    return () unless @terms;                 # nothing to search for    return unless @terms;
468    
469    # We special-case one term queries here.  If the index was sorted,    # We special-case one term queries here.  If the index was sorted,
470    # choping off the rest of the list will return the same ranking.    # choping off the rest of the list will return the same ranking.
471    if ($wanted and @terms == 1) {    if ($wanted and @terms == 1) {
472      my $term  = shift @terms;      my $term  = shift @terms;
473      my $idf   = log($self->{records}/$self->{db}->{$O,$term});      my $idf   = log($self->{records}/$self->{db}->{'o'.$term});
474      my @res;      my @res;
475    
476      if ($self->{reorg}) { # or not $query->{picky}      if ($self->{reorg}) { # or not $query->{picky}
477        @res = unpack "w". int(2*$wanted), $self->{db}->{$term};        @res = unpack "w". int(2*$wanted), $self->{db}->{'p'.$term};
478      } else {      } else {
479        @res = unpack 'w*',                $self->{db}->{$term};        @res = unpack 'w*',                $self->{db}->{'p'.$term};
480      }      }
481    
482      for (my $i=1; $i<@res; $i+=2) {      for (my $i=1; $i<@res; $i+=2) {
483        $res[$i] /= $self->{db}->{$M, $res[$i-1]} / $idf;        # $res[$i] /= $self->{db}->{'m'. $res[$i-1]} / $idf;
484          # above was written badly, allows two DIV_ZERO problems.
485          my $maxtf = $self->{db}->{"m". $res[$i-1]};
486          unless ($maxtf) {
487            warn "WAIT-Warning: Averting DIVZERO for i[$i] \$res[\$i-1][$res[$i-1]] term[$term]";
488            $maxtf = 1;
489          }
490          $res[$i] = ($res[$i] / $maxtf) * $idf;
491      }      }
492    
493      return @res      return @res
# Line 434  sub search_raw { Line 498  sub search_raw {
498    # result.    # result.
499    unless ($wanted) {    unless ($wanted) {
500      for (@terms) {      for (@terms) {
501        my $df      = $self->{db}->{$O,$_};        my $df      = $self->{db}->{'o'.$_};
502    
503        # The frequency *must* be 1 at least since the posting list is nonempty        # The frequency *must* be 1 at least since the posting list is nonempty
504        _complain('search for term', $_) and $df = 1 if $df < 1;        _complain('search for term', $_) and $df = 1 if $df < 1;
505    
506        # Unpack posting list for current query term $_        # Unpack posting list for current query term $_
507        my %post = unpack 'w*', $self->{db}->{$_};        my %post = unpack 'w*', $self->{db}->{'p'.$_};
508    
509        _complain('search for term', $_) if $self->{db}->{$O,$_} != keys %post;        _complain('search for term', $_) if $self->{db}->{'o'.$_} != keys %post;
510        # This is the inverse document frequency. The log of the inverse        # This is the inverse document frequency. The log of the inverse
511        # fraction of documents the term occurs in.        # fraction of documents the term occurs in.
512        my $idf = log($self->{records}/$df);        my $idf = log($self->{records}/$df);
513        for my $did (keys %post) {        for my $did (keys %post) {
514          if (my $freq = $self->{db}->{$M, $did}) {          if (my $freq = $self->{db}->{'m'. $did}) {
515            $score{$did} += $post{$did} / $freq * $idf;            $score{$did} += $post{$did} / $freq * $idf;
516          }          }
517        }        }
# Line 460  sub search_raw { Line 524  sub search_raw {
524    unless ($strict) {    unless ($strict) {
525      for (@terms) {      for (@terms) {
526        # Unpack posting list for current query term $_        # Unpack posting list for current query term $_
527        my %post = unpack 'w*', $self->{db}->{$_};        my %post = unpack 'w*', $self->{db}->{'p'.$_};
528    
529        # Lookup the number of documents the term occurs in (document frequency)        # Lookup the number of documents the term occurs in (document frequency)
530        my $occ  = $self->{db}->{$O,$_};        my $occ  = $self->{db}->{'o'.$_};
531    
532        _complain('search for term', $_) if $self->{db}->{$O,$_} != keys %post;        _complain('search for term', $_) if $self->{db}->{'o'.$_} != keys %post;
533        # The frequency *must* be 1 at least since the posting list is nonempty        # The frequency *must* be 1 at least since the posting list is nonempty
534        _complain('search for term', $_) and $occ = 1 if $occ < 1;        _complain('search for term', $_) and $occ = 1 if $occ < 1;
535    
# Line 488  sub search_raw { Line 552  sub search_raw {
552                
553        if (keys %score < $wanted) {        if (keys %score < $wanted) {
554          for my $did (keys %post) {          for my $did (keys %post) {
555            if (my $freq = $self->{db}->{$M, $did}) {            if (my $freq = $self->{db}->{'m'. $did}) {
556              $score{$did} += $post{$did} / $freq * $idf;              $score{$did} += $post{$did} / $freq * $idf;
557            }            }
558          }          }
559        } else {        } else {
560          for my $did (keys %score) {          for my $did (keys %score) {
561            next unless exists $post{$did};            next unless exists $post{$did};
562            if (my $freq = $self->{db}->{$M, $did}) {            if (my $freq = $self->{db}->{'m'. $did}) {
563              $score{$did} += $post{$did} / $freq * $idf;              $score{$did} += $post{$did} / $freq * $idf;
564            }            }
565          }          }
# Line 515  sub search_raw { Line 579  sub search_raw {
579    for (my $i = $#terms; $i >=0; $i--) {    for (my $i = $#terms; $i >=0; $i--) {
580      local $_ = $terms[$i];      local $_ = $terms[$i];
581      # Lookup the number of documents the term occurs in (document frequency)      # Lookup the number of documents the term occurs in (document frequency)
582      my $df      = $self->{db}->{$O,$_};      my $df      = $self->{db}->{'o'.$_};
583    
584      # The frequency *must* be 1 at least since the posting list is nonempty      # The frequency *must* be 1 at least since the posting list is nonempty
585      _complain('search for term', $_) and $df = 1 if $df < 1;      _complain('search for term', $_) and $df = 1 if $df < 1;
# Line 526  sub search_raw { Line 590  sub search_raw {
590    
591      my ($did,$occ);      my ($did,$occ);
592      if ($self->{reorg}) {      if ($self->{reorg}) {
593        ($did,$occ) = unpack 'w2', $self->{db}->{$_};        ($did,$occ) = unpack 'w2', $self->{db}->{'p'.$_};
594      } else {                    # Maybe this costs more than it helps      } else {                    # Maybe this costs more than it helps
595        ($did,$occ) = unpack 'w2', $self->sort_postings($self->{db}->{$_});        ($did,$occ) = unpack 'w2', $self->sort_postings($self->{db}->{'p'.$_});
596      }      }
597      my $freq      = $self->{db}->{$M, $did};      my $freq      = $self->{db}->{'m'. $did};
598      my $max       = $occ/$freq*$idf[$i];      my $max       = $occ/$freq*$idf[$i];
599      $max[$i]      = $max + $max[$i+1];      $max[$i]      = $max + $max[$i+1];
600    }    }
# Line 540  sub search_raw { Line 604  sub search_raw {
604      my $term = $terms[$i];      my $term = $terms[$i];
605      # Unpack posting list for current query term $term. We loose the      # Unpack posting list for current query term $term. We loose the
606      # sorting order because the assignment to a hash.      # sorting order because the assignment to a hash.
607      my %post = unpack 'w*', $self->{db}->{$term};      my %post = unpack 'w*', $self->{db}->{'p'.$term};
608    
609      _complain('search for term', $term)      _complain('search for term', $term)
610        if $self->{db}->{$O,$term} != keys %post;        if $self->{db}->{'o'.$term} != keys %post;
611    
612      my $idf  = $idf[$i];      my $idf  = $idf[$i];
613      my $full;                   # Need to process all postings      my $full;                   # Need to process all postings
# Line 572  sub search_raw { Line 636  sub search_raw {
636        if (defined $chop) {        if (defined $chop) {
637          # We might be able to avoid allocating accumulators          # We might be able to avoid allocating accumulators
638          for my $did (keys %post) {          for my $did (keys %post) {
639            if (my $freq = $self->{db}->{$M, $did}) {            if (my $freq = $self->{db}->{'m'. $did}) {
640              my $wgt = $post{$did} / $freq * $idf;              my $wgt = $post{$did} / $freq * $idf;
641              # We add an accumulator if $wgt exeeds $chop              # We add an accumulator if $wgt exeeds $chop
642              if (exists $score{$did} or $wgt > $chop) {              if (exists $score{$did} or $wgt > $chop) {
# Line 583  sub search_raw { Line 647  sub search_raw {
647        } else {        } else {
648          # Allocate acumulators for each seen document.          # Allocate acumulators for each seen document.
649          for my $did (keys %post) {          for my $did (keys %post) {
650            if (my $freq = $self->{db}->{$M, $did}) {            if (my $freq = $self->{db}->{'m'. $did}) {
651              $score{$did} += $post{$did} / $freq * $idf;              $score{$did} += $post{$did} / $freq * $idf;
652            }            }
653          }          }
# Line 592  sub search_raw { Line 656  sub search_raw {
656        # Update existing accumulators        # Update existing accumulators
657        for my $did (keys %score) {        for my $did (keys %score) {
658          next unless exists $post{$did};          next unless exists $post{$did};
659          if (my $freq = $self->{db}->{$M, $did}) {          if (my $freq = $self->{db}->{'m'. $did}) {
660            $score{$did} += $post{$did} / $freq * $idf;            $score{$did} += $post{$did} / $freq * $idf;
661          }          }
662        }        }
# Line 615  sub set { Line 679  sub set {
679    
680    $self->sync;    $self->sync;
681    while (my($key, $value) = each %{$self->{db}}) {    while (my($key, $value) = each %{$self->{db}}) {
682      next if $key =~ /^\377[om]/;      next if $key !~ /^p/;
683      $self->{db}->{$key} = $self->sort_postings($value);      $self->{db}->{$key} = $self->sort_postings($value);
684    }    }
685    $self->{reorg} = 1;    $self->{reorg} = 1;
# Line 627  sub sync { Line 691  sub sync {
691    if ($self->{mode} & O_RDWR) {    if ($self->{mode} & O_RDWR) {
692      print STDERR "Flushing $self->{cached} postings\n" if $self->{cached};      print STDERR "Flushing $self->{cached} postings\n" if $self->{cached};
693      while (my($key, $value) = each %{$self->{cache}}) {      while (my($key, $value) = each %{$self->{cache}}) {
694          $self->{db}->{"p". $key} ||= "";
695        if ($self->{reorg}) {        if ($self->{reorg}) {
696          $self->{db}->{$key} = $self->sort_postings($self->{db}->{$key}          $self->{db}->{'p'.$key} = $self->sort_postings($self->{db}->{'p'.$key}
697                                                     . $value);                                                     . $value);
698        } else {        } else {
699          $self->{db}->{$key} .= $value;          $self->{db}->{'p'.$key} .= $value;
700        }        }
701      }      }
702      while (my($key, $value) = each %{$self->{cdict}}) {      while (my($key, $value) = each %{$self->{cdict}}) {
703        $self->{db}->{$key} = 0 unless  $self->{db}->{$key};        $self->{db}->{'o'.$key} = 0 unless  $self->{db}->{'o'.$key};
704        $self->{db}->{$key} += $value;        $self->{db}->{'o'.$key} += $value;
705      }      }
706      $self->{cache}  = {};      $self->{cache}  = {};
707      $self->{cdict}  = {};      $self->{cdict}  = {};
# Line 662  sub close { Line 727  sub close {
727    }    }
728  }  }
729    
730    sub keys {
731      my $self  = shift;
732    
733      defined $self->{db} or $self->open;
734      keys %{$self->{db}};
735    }
736    
737  1;  1;
738    

Legend:
Removed from v.20  
changed lines
  Added in v.51

  ViewVC Help
Powered by ViewVC 1.1.26