/[wait]/cvs-head/lib/WAIT/InvertedIndex.pm
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Diff of /cvs-head/lib/WAIT/InvertedIndex.pm

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 21 by cvs2svn, Tue May 9 11:29:45 2000 UTC revision 22 by ulpfr, Sat Nov 11 16:58:53 2000 UTC
# Line 4  Line 4 
4  # Author          : Ulrich Pfeifer  # Author          : Ulrich Pfeifer
5  # Created On      : Thu Aug  8 13:05:10 1996  # Created On      : Thu Aug  8 13:05:10 1996
6  # Last Modified By: Ulrich Pfeifer  # Last Modified By: Ulrich Pfeifer
7  # Last Modified On: Tue May  9 08:33:28 2000  # Last Modified On: Sat Nov 11 16:32:38 2000
8  # Language        : CPerl  # Language        : CPerl
9  #  #
10  # (C) Copyright 1996-2000, Ulrich Pfeifer  # (C) Copyright 1996-2000, Ulrich Pfeifer
# Line 18  use WAIT::Filter; Line 18  use WAIT::Filter;
18  use Carp;  use Carp;
19  use vars qw(%FUNC);  use vars qw(%FUNC);
20    
21  my $O = pack('C', 0xff)."o";                  # occurances (document ferquency)  # The dictionary has three different key types:
22    #  'o'.$word
23    #
24    #     The document frequency is the number of documents a term occurs
25    #     in. The idea is that a term occuring in a significant part of the
26    #     documents is not too significant.
27    #
28    # 'm'.$word
29    #
30    #     The maximum term frequency of a document is the frequency of the
31    #     most frequent term in the document.  It is related to the document
32    #     length obviously.  A document in which the most frequnet term occurs
33    #     100 times is probably much longer than a document whichs most
34    #     frequent term occurs five time.
35    #
36    # 'p'.$word
37    #
38    #     Under this key we store the actual posting list as pairs of
39    #     packed integers.
40    
41  # The document frequency is the number of documents a term occurs  my $no_old_index_support = 0; # do not check for old indices if set
 # in. The idea is that a term occuring in a significant part of the  
 # documents is not too significant.  
   
 my $M = pack('C', 0xff)."m";                  # maxtf (term frequency)  
   
 # The maximum term frequency of a document is the frequency of the  
 # most frequent term in the document.  It is related to the document  
 # length obviously.  A document in which the most frequnet term occurs  
 # 100 times is probably much longer than a document whichs most  
 # frequent term occurs five time.  
42    
43  sub new {  sub new {
44    my $type = shift;    my $type = shift;
# Line 134  sub drop { Line 142  sub drop {
142    }    }
143  }  }
144    
145    sub is_an_old_index {
146      my $self = shift;
147    
148      return 0 if $no_old_index_support;
149      return $self->{old_index} if exists $self->{old_index};
150    
151      # We can only guess if this is an old index. We lookup the first 10
152      # $O entries. If all values are integers, we assume that the index
153      # is an old one.
154    
155      defined $self->{db} or $self->open;
156      $self->sync;
157      my $dbh = $self->{dbh};       # for convenience
158    
159      my $O = pack('C', 0xff)."o";
160      my ($word, $value) = ($O.$;);
161      $dbh->seq($word, $value, R_CURSOR);
162      for (my $i=0; $i<10;$i++) {
163        if ($value !~ /^\d+$/) {
164          return $self->{old_index} = 0;
165        }
166        if ($dbh->seq($word, $value, R_NEXT) or # no values left
167            $word !~ /^$O/o                     # no $O values left
168           ) {
169          # we are not sure enough that this is an old index
170          return $self->{old_index} = 0;
171        }
172      }
173      return $self->{old_index} = 1;
174    }
175    
176  sub open {  sub open {
177    my $self = shift;    my $self = shift;
178    my $file = $self->{file};    my $file = $self->{file};
# Line 150  sub open { Line 189  sub open {
189      $self->{cdict} = {}      $self->{cdict} = {}
190        if $self->{mode} & O_RDWR;        if $self->{mode} & O_RDWR;
191      $self->{cached} = 0;      $self->{cached} = 0;
192        if (!$no_old_index_support and $self->is_an_old_index()) {
193          warn "This is an old index, upgrade you database";
194          require WAIT::InvertedIndexOld;
195          bless $self, 'WAIT::InvertedIndexOld';
196        }
197    }    }
198  }  }
199    
# Line 164  sub insert { Line 208  sub insert {
208    $self->{records}++;    $self->{records}++;
209    while (($word, $noc) = each %occ) {    while (($word, $noc) = each %occ) {
210      if (defined $self->{cache}->{$word}) {      if (defined $self->{cache}->{$word}) {
211        $self->{cdict}->{$O,$word}++;        $self->{cdict}->{$word}++;
212        $self->{cache}->{$word} .= pack 'w2', $key, $noc;        $self->{cache}->{$word} .= pack 'w2', $key, $noc;
213      } else {      } else {          
214        $self->{cdict}->{$O,$word} = 1;        $self->{cdict}->{$word} = 1;
215        $self->{cache}->{$word}  = pack 'w2', $key, $noc;        $self->{cache}->{$word}  = pack 'w2', $key, $noc;
216      }      }
217      $self->{cached}++;      $self->{cached}++;
# Line 178  sub insert { Line 222  sub insert {
222    for (values %occ) {    for (values %occ) {
223      $maxtf = $_ if $_ > $maxtf;      $maxtf = $_ if $_ > $maxtf;
224    }    }
225    $self->{db}->{$M, $key} = $maxtf;    $self->{db}->{'m'. $key} = $maxtf;
226  }  }
227    
228  # We sort postings by increasing max term frequency (~ by increasing  # We sort postings by increasing max term frequency (~ by increasing
# Line 203  sub sort_postings { Line 247  sub sort_postings {
247    # inverse document frequence gives the score for a term.  This sort    # inverse document frequence gives the score for a term.  This sort
248    # order can be exploited for tuning of single term queries.    # order can be exploited for tuning of single term queries.
249    
250    for my $did (sort {    $post->{$b} / $self->{db}->{$M, $b}    for my $did (sort {    $post->{$b} / $self->{db}->{'m'. $b}
251                                        <=>                                        <=>
252                           $post->{$a} / $self->{db}->{$M, $a}                           $post->{$a} / $self->{db}->{'m'. $a}
253                      } keys %$post) {                      } keys %$post) {
254      $r .= pack 'w2', $did, $post->{$did};      $r .= pack 'w2', $did, $post->{$did};
255    }    }
# Line 231  sub delete { Line 275  sub delete {
275    grep $occ{$_}++, &{$self->{func}}(@_);    grep $occ{$_}++, &{$self->{func}}(@_);
276    
277    for (keys %occ) {# may reorder posting list    for (keys %occ) {# may reorder posting list
278      my %post = unpack 'w*', $db->{$_};      my %post = unpack 'w*', $db->{'p'.$_};
279      delete $post{$key};      delete $post{$key};
280      $db->{$_}    = $self->sort_postings(\%post);      $db->{'p'.$_}    = $self->sort_postings(\%post);
281      _complain('delete of term', $_) if $db->{$O,$_}-1 != keys %post;      _complain('delete of term', $_) if $db->{'o'.$_}-1 != keys %post;
282      $db->{$O,$_} = scalar keys %post;      $db->{'o'.$_} = scalar keys %post;
283    }    }
284    delete $db->{$M, $key};    delete $db->{'m'. $key};
285  }  }
286    
287  sub intervall {  sub intervall {
# Line 260  sub intervall { Line 304  sub intervall {
304      ($first) = &{$self->{'ifunc'}}($first) if $first;      ($first) = &{$self->{'ifunc'}}($first) if $first;
305      ($last)  = &{$self->{'ifunc'}}($last) if $last;      ($last)  = &{$self->{'ifunc'}}($last) if $last;
306    }    }
307    if (defined $first and $first ne '') {         # set the cursor to $first    $first = 'p'.($first||'');
308      $dbh->seq($first, $value, R_CURSOR);    $last  = (defined $last)?'p'.$last:'q';
309    } else {  
310      $dbh->seq($first, $value, R_FIRST);    # set the cursor to $first
311    }    $dbh->seq($first, $value, R_CURSOR);
312    # We assume that word do not start with the character \377  
313    # $last = pack 'C', 0xff unless defined $last and $last ne '';    # $first would be after the last word
314    return () if defined $last and $first gt $last; # $first would be after the last word    return () if $first gt $last;
315        
316    push @result, $first;    push @result, substr($first,1);
317    while (!$dbh->seq($word, $value, R_NEXT)) {    while (!$dbh->seq($word, $value, R_NEXT)) {
318      # We should limit this to a "resonable" number of words      # We should limit this to a "resonable" number of words
319      last if (defined $last and $word gt $last) or $word =~ /^($M|$O)/o;      last if $word gt $last;
320      push @result, $word;      push @result, substr($word,1);
321    }    }
322    \@result;                     # speed    \@result;                     # speed
323  }  }
# Line 298  sub prefix { Line 342  sub prefix {
342      ($prefix) = &{$self->{'pfunc'}}($prefix);      ($prefix) = &{$self->{'pfunc'}}($prefix);
343    }    }
344    
345    if ($dbh->seq($word = $prefix, $value, R_CURSOR)) {    if ($dbh->seq($word = 'p'.$prefix, $value, R_CURSOR)) {
346      return ();      return ();
347    }    }
348    return () if $word !~ /^$prefix/;    return () if $word !~ /^p$prefix/;
349    push @result, $word;    push @result, substr($word,1);
350    
351    while (!$dbh->seq($word, $value, R_NEXT)) {    while (!$dbh->seq($word, $value, R_NEXT)) {
352      # We should limit this to a "resonable" number of words      # We should limit this to a "resonable" number of words
353      last if $word !~ /^$prefix/;      last if $word !~ /^p$prefix/;
354      push @result, $word;      push @result, substr($word,1);
355    }    }
356    \@result;                     # speed    \@result;                     # speed
357  }  }
# Line 403  sub search_raw { Line 447  sub search_raw {
447    # We keep duplicates    # We keep duplicates
448    my @terms =    my @terms =
449      # Sort words by decreasing document frequency      # Sort words by decreasing document frequency
450      sort { $self->{db}->{$O,$a} <=> $self->{db}->{$O,$b} }      sort { $self->{db}->{'o'.$a} <=> $self->{db}->{'o'.$b} }
451        # check which words occur in the index.        # check which words occur in the index.
452        grep { $self->{db}->{$O,$_} } @_;        grep { $self->{db}->{'o'.$_} } @_;
453    
454    return () unless @terms;                 # nothing to search for    return () unless @terms;                 # nothing to search for
455    
# Line 413  sub search_raw { Line 457  sub search_raw {
457    # choping off the rest of the list will return the same ranking.    # choping off the rest of the list will return the same ranking.
458    if ($wanted and @terms == 1) {    if ($wanted and @terms == 1) {
459      my $term  = shift @terms;      my $term  = shift @terms;
460      my $idf   = log($self->{records}/$self->{db}->{$O,$term});      my $idf   = log($self->{records}/$self->{db}->{'o'.$term});
461      my @res;      my @res;
462    
463      if ($self->{reorg}) { # or not $query->{picky}      if ($self->{reorg}) { # or not $query->{picky}
464        @res = unpack "w". int(2*$wanted), $self->{db}->{$term};        @res = unpack "w". int(2*$wanted), $self->{db}->{'p'.$term};
465      } else {      } else {
466        @res = unpack 'w*',                $self->{db}->{$term};        @res = unpack 'w*',                $self->{db}->{'p'.$term};
467      }      }
468    
469      for (my $i=1; $i<@res; $i+=2) {      for (my $i=1; $i<@res; $i+=2) {
470        $res[$i] /= $self->{db}->{$M, $res[$i-1]} / $idf;        $res[$i] /= $self->{db}->{'m'. $res[$i-1]} / $idf;
471      }      }
472    
473      return @res      return @res
# Line 434  sub search_raw { Line 478  sub search_raw {
478    # result.    # result.
479    unless ($wanted) {    unless ($wanted) {
480      for (@terms) {      for (@terms) {
481        my $df      = $self->{db}->{$O,$_};        my $df      = $self->{db}->{'o'.$_};
482    
483        # The frequency *must* be 1 at least since the posting list is nonempty        # The frequency *must* be 1 at least since the posting list is nonempty
484        _complain('search for term', $_) and $df = 1 if $df < 1;        _complain('search for term', $_) and $df = 1 if $df < 1;
485    
486        # Unpack posting list for current query term $_        # Unpack posting list for current query term $_
487        my %post = unpack 'w*', $self->{db}->{$_};        my %post = unpack 'w*', $self->{db}->{'p'.$_};
488    
489        _complain('search for term', $_) if $self->{db}->{$O,$_} != keys %post;        _complain('search for term', $_) if $self->{db}->{'o'.$_} != keys %post;
490        # This is the inverse document frequency. The log of the inverse        # This is the inverse document frequency. The log of the inverse
491        # fraction of documents the term occurs in.        # fraction of documents the term occurs in.
492        my $idf = log($self->{records}/$df);        my $idf = log($self->{records}/$df);
493        for my $did (keys %post) {        for my $did (keys %post) {
494          if (my $freq = $self->{db}->{$M, $did}) {          if (my $freq = $self->{db}->{'m'. $did}) {
495            $score{$did} += $post{$did} / $freq * $idf;            $score{$did} += $post{$did} / $freq * $idf;
496          }          }
497        }        }
# Line 460  sub search_raw { Line 504  sub search_raw {
504    unless ($strict) {    unless ($strict) {
505      for (@terms) {      for (@terms) {
506        # Unpack posting list for current query term $_        # Unpack posting list for current query term $_
507        my %post = unpack 'w*', $self->{db}->{$_};        my %post = unpack 'w*', $self->{db}->{'p'.$_};
508    
509        # Lookup the number of documents the term occurs in (document frequency)        # Lookup the number of documents the term occurs in (document frequency)
510        my $occ  = $self->{db}->{$O,$_};        my $occ  = $self->{db}->{'o'.$_};
511    
512        _complain('search for term', $_) if $self->{db}->{$O,$_} != keys %post;        _complain('search for term', $_) if $self->{db}->{'o'.$_} != keys %post;
513        # The frequency *must* be 1 at least since the posting list is nonempty        # The frequency *must* be 1 at least since the posting list is nonempty
514        _complain('search for term', $_) and $occ = 1 if $occ < 1;        _complain('search for term', $_) and $occ = 1 if $occ < 1;
515    
# Line 488  sub search_raw { Line 532  sub search_raw {
532                
533        if (keys %score < $wanted) {        if (keys %score < $wanted) {
534          for my $did (keys %post) {          for my $did (keys %post) {
535            if (my $freq = $self->{db}->{$M, $did}) {            if (my $freq = $self->{db}->{'m'. $did}) {
536              $score{$did} += $post{$did} / $freq * $idf;              $score{$did} += $post{$did} / $freq * $idf;
537            }            }
538          }          }
539        } else {        } else {
540          for my $did (keys %score) {          for my $did (keys %score) {
541            next unless exists $post{$did};            next unless exists $post{$did};
542            if (my $freq = $self->{db}->{$M, $did}) {            if (my $freq = $self->{db}->{'m'. $did}) {
543              $score{$did} += $post{$did} / $freq * $idf;              $score{$did} += $post{$did} / $freq * $idf;
544            }            }
545          }          }
# Line 515  sub search_raw { Line 559  sub search_raw {
559    for (my $i = $#terms; $i >=0; $i--) {    for (my $i = $#terms; $i >=0; $i--) {
560      local $_ = $terms[$i];      local $_ = $terms[$i];
561      # Lookup the number of documents the term occurs in (document frequency)      # Lookup the number of documents the term occurs in (document frequency)
562      my $df      = $self->{db}->{$O,$_};      my $df      = $self->{db}->{'o'.$_};
563    
564      # The frequency *must* be 1 at least since the posting list is nonempty      # The frequency *must* be 1 at least since the posting list is nonempty
565      _complain('search for term', $_) and $df = 1 if $df < 1;      _complain('search for term', $_) and $df = 1 if $df < 1;
# Line 526  sub search_raw { Line 570  sub search_raw {
570    
571      my ($did,$occ);      my ($did,$occ);
572      if ($self->{reorg}) {      if ($self->{reorg}) {
573        ($did,$occ) = unpack 'w2', $self->{db}->{$_};        ($did,$occ) = unpack 'w2', $self->{db}->{'p'.$_};
574      } else {                    # Maybe this costs more than it helps      } else {                    # Maybe this costs more than it helps
575        ($did,$occ) = unpack 'w2', $self->sort_postings($self->{db}->{$_});        ($did,$occ) = unpack 'w2', $self->sort_postings($self->{db}->{'p'.$_});
576      }      }
577      my $freq      = $self->{db}->{$M, $did};      my $freq      = $self->{db}->{'m'. $did};
578      my $max       = $occ/$freq*$idf[$i];      my $max       = $occ/$freq*$idf[$i];
579      $max[$i]      = $max + $max[$i+1];      $max[$i]      = $max + $max[$i+1];
580    }    }
# Line 540  sub search_raw { Line 584  sub search_raw {
584      my $term = $terms[$i];      my $term = $terms[$i];
585      # Unpack posting list for current query term $term. We loose the      # Unpack posting list for current query term $term. We loose the
586      # sorting order because the assignment to a hash.      # sorting order because the assignment to a hash.
587      my %post = unpack 'w*', $self->{db}->{$term};      my %post = unpack 'w*', $self->{db}->{'p'.$term};
588    
589      _complain('search for term', $term)      _complain('search for term', $term)
590        if $self->{db}->{$O,$term} != keys %post;        if $self->{db}->{'o'.$term} != keys %post;
591    
592      my $idf  = $idf[$i];      my $idf  = $idf[$i];
593      my $full;                   # Need to process all postings      my $full;                   # Need to process all postings
# Line 572  sub search_raw { Line 616  sub search_raw {
616        if (defined $chop) {        if (defined $chop) {
617          # We might be able to avoid allocating accumulators          # We might be able to avoid allocating accumulators
618          for my $did (keys %post) {          for my $did (keys %post) {
619            if (my $freq = $self->{db}->{$M, $did}) {            if (my $freq = $self->{db}->{'m'. $did}) {
620              my $wgt = $post{$did} / $freq * $idf;              my $wgt = $post{$did} / $freq * $idf;
621              # We add an accumulator if $wgt exeeds $chop              # We add an accumulator if $wgt exeeds $chop
622              if (exists $score{$did} or $wgt > $chop) {              if (exists $score{$did} or $wgt > $chop) {
# Line 583  sub search_raw { Line 627  sub search_raw {
627        } else {        } else {
628          # Allocate acumulators for each seen document.          # Allocate acumulators for each seen document.
629          for my $did (keys %post) {          for my $did (keys %post) {
630            if (my $freq = $self->{db}->{$M, $did}) {            if (my $freq = $self->{db}->{'m'. $did}) {
631              $score{$did} += $post{$did} / $freq * $idf;              $score{$did} += $post{$did} / $freq * $idf;
632            }            }
633          }          }
# Line 592  sub search_raw { Line 636  sub search_raw {
636        # Update existing accumulators        # Update existing accumulators
637        for my $did (keys %score) {        for my $did (keys %score) {
638          next unless exists $post{$did};          next unless exists $post{$did};
639          if (my $freq = $self->{db}->{$M, $did}) {          if (my $freq = $self->{db}->{'m'. $did}) {
640            $score{$did} += $post{$did} / $freq * $idf;            $score{$did} += $post{$did} / $freq * $idf;
641          }          }
642        }        }
# Line 615  sub set { Line 659  sub set {
659    
660    $self->sync;    $self->sync;
661    while (my($key, $value) = each %{$self->{db}}) {    while (my($key, $value) = each %{$self->{db}}) {
662      next if $key =~ /^\377[om]/;      next if $key !~ /^p/;
663      $self->{db}->{$key} = $self->sort_postings($value);      $self->{db}->{$key} = $self->sort_postings($value);
664    }    }
665    $self->{reorg} = 1;    $self->{reorg} = 1;
# Line 628  sub sync { Line 672  sub sync {
672      print STDERR "Flushing $self->{cached} postings\n" if $self->{cached};      print STDERR "Flushing $self->{cached} postings\n" if $self->{cached};
673      while (my($key, $value) = each %{$self->{cache}}) {      while (my($key, $value) = each %{$self->{cache}}) {
674        if ($self->{reorg}) {        if ($self->{reorg}) {
675          $self->{db}->{$key} = $self->sort_postings($self->{db}->{$key}          $self->{db}->{'p'.$key} = $self->sort_postings($self->{db}->{'p'.$key}
676                                                     . $value);                                                     . $value);
677        } else {        } else {
678          $self->{db}->{$key} .= $value;          $self->{db}->{'p'.$key} .= $value;
679        }        }
680      }      }
681      while (my($key, $value) = each %{$self->{cdict}}) {      while (my($key, $value) = each %{$self->{cdict}}) {
682        $self->{db}->{$key} = 0 unless  $self->{db}->{$key};        $self->{db}->{'o'.$key} = 0 unless  $self->{db}->{'o'.$key};
683        $self->{db}->{$key} += $value;        $self->{db}->{'o'.$key} += $value;
684      }      }
685      $self->{cache}  = {};      $self->{cache}  = {};
686      $self->{cdict}  = {};      $self->{cdict}  = {};

Legend:
Removed from v.21  
changed lines
  Added in v.22

  ViewVC Help
Powered by ViewVC 1.1.26