/[wait]/cvs-head/lib/WAIT/InvertedIndex.pm
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Diff of /cvs-head/lib/WAIT/InvertedIndex.pm

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

branches/CPAN/lib/WAIT/InvertedIndex.pm revision 19 by ulpfr, Tue May 9 11:29:45 2000 UTC cvs-head/lib/WAIT/InvertedIndex.pm revision 40 by laperla, Mon Nov 13 10:44:03 2000 UTC
# Line 4  Line 4 
4  # Author          : Ulrich Pfeifer  # Author          : Ulrich Pfeifer
5  # Created On      : Thu Aug  8 13:05:10 1996  # Created On      : Thu Aug  8 13:05:10 1996
6  # Last Modified By: Ulrich Pfeifer  # Last Modified By: Ulrich Pfeifer
7  # Last Modified On: Tue May  9 08:33:28 2000  # Last Modified On: Sun Nov 12 14:40:21 2000
8  # Language        : CPerl  # Language        : CPerl
9  #  #
10  # (C) Copyright 1996-2000, Ulrich Pfeifer  # (C) Copyright 1996-2000, Ulrich Pfeifer
# Line 16  use DB_File; Line 16  use DB_File;
16  use Fcntl;  use Fcntl;
17  use WAIT::Filter;  use WAIT::Filter;
18  use Carp;  use Carp;
19  use vars qw(%FUNC);  use vars qw(%FUNC $VERSION);
20    
21  my $O = pack('C', 0xff)."o";                  # occurances (document ferquency)  $VERSION = "1.801"; # others test if we are loaded by checking $VERSION
22    
23  # The document frequency is the number of documents a term occurs  # The dictionary has three different key types:
24  # in. The idea is that a term occuring in a significant part of the  #  'o'.$word
25  # documents is not too significant.  #
26    #     The document frequency is the number of documents a term occurs
27    #     in. The idea is that a term occuring in a significant part of the
28    #     documents is not too significant.
29    #
30    # 'm'.$word
31    #
32    #     The maximum term frequency of a document is the frequency of the
33    #     most frequent term in the document.  It is related to the document
34    #     length obviously.  A document in which the most frequnet term occurs
35    #     100 times is probably much longer than a document whichs most
36    #     frequent term occurs five time.
37    #
38    # 'p'.$word
39    #
40    #     Under this key we store the actual posting list as pairs of
41    #     packed integers.
42    
43  my $M = pack('C', 0xff)."m";                  # maxtf (term frequency)  my $no_old_index_support = 0; # do not check for old indices if set
   
 # The maximum term frequency of a document is the frequency of the  
 # most frequent term in the document.  It is related to the document  
 # length obviously.  A document in which the most frequnet term occurs  
 # 100 times is probably much longer than a document whichs most  
 # frequent term occurs five time.  
44    
45  sub new {  sub new {
46    my $type = shift;    my $type = shift;
# Line 134  sub drop { Line 144  sub drop {
144    }    }
145  }  }
146    
147    sub is_an_old_index {
148      my $self = shift;
149    
150      return 0 if $no_old_index_support;
151      return $self->{old_index} if exists $self->{old_index};
152    
153      # We can only guess if this is an old index. We lookup the first 10
154      # $O entries. If all values are integers, we assume that the index
155      # is an old one.
156    
157      defined $self->{db} or $self->open;
158      $self->sync;
159      my $dbh = $self->{dbh};       # for convenience
160    
161      my $O = pack('C', 0xff)."o";
162      my ($word, $value) = ($O.$;);  # $word and $value are modified by seq!
163      if ( my $ret = $dbh->seq($word, $value, R_CURSOR) ) {
164        # warn "DEBUG: ret[$ret], not an old index, either empty or no \$^O";
165        return $self->{old_index} = 0;
166      }
167      for (my $i=0; $i<10;$i++) {
168        if ($value !~ /^\d+$/) {
169          # warn "DEBUG: word[$word]value[$value], not an old index";
170          return $self->{old_index} = 0;
171        }
172        if (my $ret = $dbh->seq($word, $value, R_NEXT) or # no values left
173            $word !~ /^$O$;/o                   # no $O values left
174           ) {
175          # we are not sure enough that this is an old index
176          # warn "DEBUG: ret[$ret]word[$word]value[$value], not an old index";
177          return $self->{old_index} = 0;
178        }
179      }
180      # warn "DEBUG: old index";
181      return $self->{old_index} = 1;
182    }
183    
184  sub open {  sub open {
185    my $self = shift;    my $self = shift;
186    my $file = $self->{file};    my $file = $self->{file};
# Line 150  sub open { Line 197  sub open {
197      $self->{cdict} = {}      $self->{cdict} = {}
198        if $self->{mode} & O_RDWR;        if $self->{mode} & O_RDWR;
199      $self->{cached} = 0;      $self->{cached} = 0;
200        if (!$no_old_index_support and $self->is_an_old_index()) {
201          warn "This is an old index, upgrade you database";
202          require WAIT::InvertedIndexOld;
203          bless $self, 'WAIT::InvertedIndexOld';
204        }
205    }    }
206  }  }
207    
# Line 164  sub insert { Line 216  sub insert {
216    $self->{records}++;    $self->{records}++;
217    while (($word, $noc) = each %occ) {    while (($word, $noc) = each %occ) {
218      if (defined $self->{cache}->{$word}) {      if (defined $self->{cache}->{$word}) {
219        $self->{cdict}->{$O,$word}++;        $self->{cdict}->{$word}++;
220        $self->{cache}->{$word} .= pack 'w2', $key, $noc;        $self->{cache}->{$word} .= pack 'w2', $key, $noc;
221      } else {      } else {          
222        $self->{cdict}->{$O,$word} = 1;        $self->{cdict}->{$word} = 1;
223        $self->{cache}->{$word}  = pack 'w2', $key, $noc;        $self->{cache}->{$word}  = pack 'w2', $key, $noc;
224      }      }
225      $self->{cached}++;      $self->{cached}++;
# Line 178  sub insert { Line 230  sub insert {
230    for (values %occ) {    for (values %occ) {
231      $maxtf = $_ if $_ > $maxtf;      $maxtf = $_ if $_ > $maxtf;
232    }    }
233    $self->{db}->{$M, $key} = $maxtf;    $self->{db}->{'m'. $key} = $maxtf;
234  }  }
235    
236  # We sort postings by increasing max term frequency (~ by increasing  # We sort postings by increasing max term frequency (~ by increasing
# Line 203  sub sort_postings { Line 255  sub sort_postings {
255    # inverse document frequence gives the score for a term.  This sort    # inverse document frequence gives the score for a term.  This sort
256    # order can be exploited for tuning of single term queries.    # order can be exploited for tuning of single term queries.
257    
258    for my $did (sort {    $post->{$b} / $self->{db}->{$M, $b}    for my $did (keys %$post) { # sanity check
259        unless ($self->{db}->{"m". $did}) {
260          warn "Warning from WAIT: DIVZERO threat from did[$did] post[$post->{$did}]";
261          $self->{db}->{"m". $did} = 1; # fails if we have not opened for writing
262        }
263      }
264      for my $did (sort {    $post->{$b} / $self->{db}->{'m'. $b}
265                                        <=>                                        <=>
266                           $post->{$a} / $self->{db}->{$M, $a}                           $post->{$a} / $self->{db}->{'m'. $a}
267                      } keys %$post) {                      } keys %$post) {
268      $r .= pack 'w2', $did, $post->{$did};      $r .= pack 'w2', $did, $post->{$did};
269    }    }
# Line 230  sub delete { Line 288  sub delete {
288    
289    grep $occ{$_}++, &{$self->{func}}(@_);    grep $occ{$_}++, &{$self->{func}}(@_);
290    
291      # Be prepared for "Odd number of elements in hash assignment"
292      local $SIG{__WARN__} = sub {
293        my $warning = shift;
294        chomp $warning;
295        warn "Catching warning[$warning] during delete of key[$key]";
296      };
297    for (keys %occ) {# may reorder posting list    for (keys %occ) {# may reorder posting list
298      my %post = unpack 'w*', $db->{$_};      my %post = unpack 'w*', $db->{'p'.$_};
299      delete $post{$key};      delete $post{$key};
300      $db->{$_}    = $self->sort_postings(\%post);      $db->{'p'.$_}    = $self->sort_postings(\%post);
301      _complain('delete of term', $_) if $db->{$O,$_}-1 != keys %post;      _complain('delete of term', $_) if $db->{'o'.$_}-1 != keys %post;
302      $db->{$O,$_} = scalar keys %post;      $db->{'o'.$_} = scalar keys %post;
303    }    }
304    delete $db->{$M, $key};    delete $db->{'m'. $key};
305  }  }
306    
307  sub intervall {  sub intervall {
# Line 260  sub intervall { Line 324  sub intervall {
324      ($first) = &{$self->{'ifunc'}}($first) if $first;      ($first) = &{$self->{'ifunc'}}($first) if $first;
325      ($last)  = &{$self->{'ifunc'}}($last) if $last;      ($last)  = &{$self->{'ifunc'}}($last) if $last;
326    }    }
327    if (defined $first and $first ne '') {         # set the cursor to $first    $first = 'p'.($first||'');
328      $dbh->seq($first, $value, R_CURSOR);    $last  = (defined $last)?'p'.$last:'q';
329    } else {  
330      $dbh->seq($first, $value, R_FIRST);    # set the cursor to $first
331    }    $dbh->seq($first, $value, R_CURSOR);
332    # We assume that word do not start with the character \377  
333    # $last = pack 'C', 0xff unless defined $last and $last ne '';    # $first would be after the last word
334    return () if defined $last and $first gt $last; # $first would be after the last word    return () if $first gt $last;
335        
336    push @result, $first;    push @result, substr($first,1);
337    while (!$dbh->seq($word, $value, R_NEXT)) {    while (!$dbh->seq($word, $value, R_NEXT)) {
338      # We should limit this to a "resonable" number of words      # We should limit this to a "resonable" number of words
339      last if (defined $last and $word gt $last) or $word =~ /^($M|$O)/o;      last if $word gt $last;
340      push @result, $word;      push @result, substr($word,1);
341    }    }
342    \@result;                     # speed    \@result;                     # speed
343  }  }
# Line 298  sub prefix { Line 362  sub prefix {
362      ($prefix) = &{$self->{'pfunc'}}($prefix);      ($prefix) = &{$self->{'pfunc'}}($prefix);
363    }    }
364    
365    if ($dbh->seq($word = $prefix, $value, R_CURSOR)) {    if ($dbh->seq($word = 'p'.$prefix, $value, R_CURSOR)) {
366      return ();      return ();
367    }    }
368    return () if $word !~ /^$prefix/;    return () if $word !~ /^p$prefix/;
369    push @result, $word;    push @result, substr($word,1);
370    
371    while (!$dbh->seq($word, $value, R_NEXT)) {    while (!$dbh->seq($word, $value, R_NEXT)) {
372      # We should limit this to a "resonable" number of words      # We should limit this to a "resonable" number of words
373      last if $word !~ /^$prefix/;      last if $word !~ /^p$prefix/;
374      push @result, $word;      push @result, substr($word,1);
375    }    }
376    \@result;                     # speed    \@result;                     # speed
377  }  }
# Line 403  sub search_raw { Line 467  sub search_raw {
467    # We keep duplicates    # We keep duplicates
468    my @terms =    my @terms =
469      # Sort words by decreasing document frequency      # Sort words by decreasing document frequency
470      sort { $self->{db}->{$O,$a} <=> $self->{db}->{$O,$b} }      sort { $self->{db}->{'o'.$a} <=> $self->{db}->{'o'.$b} }
471        # check which words occur in the index.        # check which words occur in the index.
472        grep { $self->{db}->{$O,$_} } @_;        grep { $self->{db}->{'o'.$_} } @_;
473    
474    return () unless @terms;                 # nothing to search for    return unless @terms;
475    
476    # We special-case one term queries here.  If the index was sorted,    # We special-case one term queries here.  If the index was sorted,
477    # choping off the rest of the list will return the same ranking.    # choping off the rest of the list will return the same ranking.
478    if ($wanted and @terms == 1) {    if ($wanted and @terms == 1) {
479      my $term  = shift @terms;      my $term  = shift @terms;
480      my $idf   = log($self->{records}/$self->{db}->{$O,$term});      my $idf   = log($self->{records}/$self->{db}->{'o'.$term});
481      my @res;      my @res;
482    
483      if ($self->{reorg}) { # or not $query->{picky}      if ($self->{reorg}) { # or not $query->{picky}
484        @res = unpack "w". int(2*$wanted), $self->{db}->{$term};        @res = unpack "w". int(2*$wanted), $self->{db}->{'p'.$term};
485      } else {      } else {
486        @res = unpack 'w*',                $self->{db}->{$term};        @res = unpack 'w*',                $self->{db}->{'p'.$term};
487      }      }
488    
489      for (my $i=1; $i<@res; $i+=2) {      for (my $i=1; $i<@res; $i+=2) {
490        $res[$i] /= $self->{db}->{$M, $res[$i-1]} / $idf;        # $res[$i] /= $self->{db}->{'m'. $res[$i-1]} / $idf;
491          # above was written badly, allows two DIV_ZERO problems.
492          my $maxtf = $self->{db}->{"m". $res[$i-1]};
493          unless ($maxtf) {
494            warn "WAIT-Warning: Averting DIVZERO for i[$i] \$res[\$i-1][$res[$i-1]] term[$term]";
495            $maxtf = 1;
496          }
497          $res[$i] = ($res[$i] / $maxtf) * $idf;
498      }      }
499    
500      return @res      return @res
# Line 434  sub search_raw { Line 505  sub search_raw {
505    # result.    # result.
506    unless ($wanted) {    unless ($wanted) {
507      for (@terms) {      for (@terms) {
508        my $df      = $self->{db}->{$O,$_};        my $df      = $self->{db}->{'o'.$_};
509    
510        # The frequency *must* be 1 at least since the posting list is nonempty        # The frequency *must* be 1 at least since the posting list is nonempty
511        _complain('search for term', $_) and $df = 1 if $df < 1;        _complain('search for term', $_) and $df = 1 if $df < 1;
512    
513        # Unpack posting list for current query term $_        # Unpack posting list for current query term $_
514        my %post = unpack 'w*', $self->{db}->{$_};        my %post = unpack 'w*', $self->{db}->{'p'.$_};
515    
516        _complain('search for term', $_) if $self->{db}->{$O,$_} != keys %post;        _complain('search for term', $_) if $self->{db}->{'o'.$_} != keys %post;
517        # This is the inverse document frequency. The log of the inverse        # This is the inverse document frequency. The log of the inverse
518        # fraction of documents the term occurs in.        # fraction of documents the term occurs in.
519        my $idf = log($self->{records}/$df);        my $idf = log($self->{records}/$df);
520        for my $did (keys %post) {        for my $did (keys %post) {
521          if (my $freq = $self->{db}->{$M, $did}) {          if (my $freq = $self->{db}->{'m'. $did}) {
522            $score{$did} += $post{$did} / $freq * $idf;            $score{$did} += $post{$did} / $freq * $idf;
523          }          }
524        }        }
# Line 460  sub search_raw { Line 531  sub search_raw {
531    unless ($strict) {    unless ($strict) {
532      for (@terms) {      for (@terms) {
533        # Unpack posting list for current query term $_        # Unpack posting list for current query term $_
534        my %post = unpack 'w*', $self->{db}->{$_};        my %post = unpack 'w*', $self->{db}->{'p'.$_};
535    
536        # Lookup the number of documents the term occurs in (document frequency)        # Lookup the number of documents the term occurs in (document frequency)
537        my $occ  = $self->{db}->{$O,$_};        my $occ  = $self->{db}->{'o'.$_};
538    
539        _complain('search for term', $_) if $self->{db}->{$O,$_} != keys %post;        _complain('search for term', $_) if $self->{db}->{'o'.$_} != keys %post;
540        # The frequency *must* be 1 at least since the posting list is nonempty        # The frequency *must* be 1 at least since the posting list is nonempty
541        _complain('search for term', $_) and $occ = 1 if $occ < 1;        _complain('search for term', $_) and $occ = 1 if $occ < 1;
542    
# Line 488  sub search_raw { Line 559  sub search_raw {
559                
560        if (keys %score < $wanted) {        if (keys %score < $wanted) {
561          for my $did (keys %post) {          for my $did (keys %post) {
562            if (my $freq = $self->{db}->{$M, $did}) {            if (my $freq = $self->{db}->{'m'. $did}) {
563              $score{$did} += $post{$did} / $freq * $idf;              $score{$did} += $post{$did} / $freq * $idf;
564            }            }
565          }          }
566        } else {        } else {
567          for my $did (keys %score) {          for my $did (keys %score) {
568            next unless exists $post{$did};            next unless exists $post{$did};
569            if (my $freq = $self->{db}->{$M, $did}) {            if (my $freq = $self->{db}->{'m'. $did}) {
570              $score{$did} += $post{$did} / $freq * $idf;              $score{$did} += $post{$did} / $freq * $idf;
571            }            }
572          }          }
# Line 515  sub search_raw { Line 586  sub search_raw {
586    for (my $i = $#terms; $i >=0; $i--) {    for (my $i = $#terms; $i >=0; $i--) {
587      local $_ = $terms[$i];      local $_ = $terms[$i];
588      # Lookup the number of documents the term occurs in (document frequency)      # Lookup the number of documents the term occurs in (document frequency)
589      my $df      = $self->{db}->{$O,$_};      my $df      = $self->{db}->{'o'.$_};
590    
591      # The frequency *must* be 1 at least since the posting list is nonempty      # The frequency *must* be 1 at least since the posting list is nonempty
592      _complain('search for term', $_) and $df = 1 if $df < 1;      _complain('search for term', $_) and $df = 1 if $df < 1;
# Line 526  sub search_raw { Line 597  sub search_raw {
597    
598      my ($did,$occ);      my ($did,$occ);
599      if ($self->{reorg}) {      if ($self->{reorg}) {
600        ($did,$occ) = unpack 'w2', $self->{db}->{$_};        ($did,$occ) = unpack 'w2', $self->{db}->{'p'.$_};
601      } else {                    # Maybe this costs more than it helps      } else {                    # Maybe this costs more than it helps
602        ($did,$occ) = unpack 'w2', $self->sort_postings($self->{db}->{$_});        ($did,$occ) = unpack 'w2', $self->sort_postings($self->{db}->{'p'.$_});
603      }      }
604      my $freq      = $self->{db}->{$M, $did};      my $freq      = $self->{db}->{'m'. $did};
605      my $max       = $occ/$freq*$idf[$i];      my $max       = $occ/$freq*$idf[$i];
606      $max[$i]      = $max + $max[$i+1];      $max[$i]      = $max + $max[$i+1];
607    }    }
# Line 540  sub search_raw { Line 611  sub search_raw {
611      my $term = $terms[$i];      my $term = $terms[$i];
612      # Unpack posting list for current query term $term. We loose the      # Unpack posting list for current query term $term. We loose the
613      # sorting order because the assignment to a hash.      # sorting order because the assignment to a hash.
614      my %post = unpack 'w*', $self->{db}->{$term};      my %post = unpack 'w*', $self->{db}->{'p'.$term};
615    
616      _complain('search for term', $term)      _complain('search for term', $term)
617        if $self->{db}->{$O,$term} != keys %post;        if $self->{db}->{'o'.$term} != keys %post;
618    
619      my $idf  = $idf[$i];      my $idf  = $idf[$i];
620      my $full;                   # Need to process all postings      my $full;                   # Need to process all postings
# Line 572  sub search_raw { Line 643  sub search_raw {
643        if (defined $chop) {        if (defined $chop) {
644          # We might be able to avoid allocating accumulators          # We might be able to avoid allocating accumulators
645          for my $did (keys %post) {          for my $did (keys %post) {
646            if (my $freq = $self->{db}->{$M, $did}) {            if (my $freq = $self->{db}->{'m'. $did}) {
647              my $wgt = $post{$did} / $freq * $idf;              my $wgt = $post{$did} / $freq * $idf;
648              # We add an accumulator if $wgt exeeds $chop              # We add an accumulator if $wgt exeeds $chop
649              if (exists $score{$did} or $wgt > $chop) {              if (exists $score{$did} or $wgt > $chop) {
# Line 583  sub search_raw { Line 654  sub search_raw {
654        } else {        } else {
655          # Allocate acumulators for each seen document.          # Allocate acumulators for each seen document.
656          for my $did (keys %post) {          for my $did (keys %post) {
657            if (my $freq = $self->{db}->{$M, $did}) {            if (my $freq = $self->{db}->{'m'. $did}) {
658              $score{$did} += $post{$did} / $freq * $idf;              $score{$did} += $post{$did} / $freq * $idf;
659            }            }
660          }          }
# Line 592  sub search_raw { Line 663  sub search_raw {
663        # Update existing accumulators        # Update existing accumulators
664        for my $did (keys %score) {        for my $did (keys %score) {
665          next unless exists $post{$did};          next unless exists $post{$did};
666          if (my $freq = $self->{db}->{$M, $did}) {          if (my $freq = $self->{db}->{'m'. $did}) {
667            $score{$did} += $post{$did} / $freq * $idf;            $score{$did} += $post{$did} / $freq * $idf;
668          }          }
669        }        }
# Line 615  sub set { Line 686  sub set {
686    
687    $self->sync;    $self->sync;
688    while (my($key, $value) = each %{$self->{db}}) {    while (my($key, $value) = each %{$self->{db}}) {
689      next if $key =~ /^\377[om]/;      next if $key !~ /^p/;
690      $self->{db}->{$key} = $self->sort_postings($value);      $self->{db}->{$key} = $self->sort_postings($value);
691    }    }
692    $self->{reorg} = 1;    $self->{reorg} = 1;
# Line 627  sub sync { Line 698  sub sync {
698    if ($self->{mode} & O_RDWR) {    if ($self->{mode} & O_RDWR) {
699      print STDERR "Flushing $self->{cached} postings\n" if $self->{cached};      print STDERR "Flushing $self->{cached} postings\n" if $self->{cached};
700      while (my($key, $value) = each %{$self->{cache}}) {      while (my($key, $value) = each %{$self->{cache}}) {
701          $self->{db}->{"p". $key} ||= "";
702        if ($self->{reorg}) {        if ($self->{reorg}) {
703          $self->{db}->{$key} = $self->sort_postings($self->{db}->{$key}          $self->{db}->{'p'.$key} = $self->sort_postings($self->{db}->{'p'.$key}
704                                                     . $value);                                                     . $value);
705        } else {        } else {
706          $self->{db}->{$key} .= $value;          $self->{db}->{'p'.$key} .= $value;
707        }        }
708      }      }
709      while (my($key, $value) = each %{$self->{cdict}}) {      while (my($key, $value) = each %{$self->{cdict}}) {
710        $self->{db}->{$key} = 0 unless  $self->{db}->{$key};        $self->{db}->{'o'.$key} = 0 unless  $self->{db}->{'o'.$key};
711        $self->{db}->{$key} += $value;        $self->{db}->{'o'.$key} += $value;
712      }      }
713      $self->{cache}  = {};      $self->{cache}  = {};
714      $self->{cdict}  = {};      $self->{cdict}  = {};

Legend:
Removed from v.19  
changed lines
  Added in v.40

  ViewVC Help
Powered by ViewVC 1.1.26