/[wait]/cvs-head/lib/WAIT/InvertedIndex.pm
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Diff of /cvs-head/lib/WAIT/InvertedIndex.pm

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

branches/CPAN/lib/WAIT/InvertedIndex.pm revision 19 by ulpfr, Tue May 9 11:29:45 2000 UTC cvs-head/lib/WAIT/InvertedIndex.pm revision 36 by ulpfr, Sun Nov 12 17:01:59 2000 UTC
# Line 4  Line 4 
4  # Author          : Ulrich Pfeifer  # Author          : Ulrich Pfeifer
5  # Created On      : Thu Aug  8 13:05:10 1996  # Created On      : Thu Aug  8 13:05:10 1996
6  # Last Modified By: Ulrich Pfeifer  # Last Modified By: Ulrich Pfeifer
7  # Last Modified On: Tue May  9 08:33:28 2000  # Last Modified On: Sun Nov 12 14:40:21 2000
8  # Language        : CPerl  # Language        : CPerl
9  #  #
10  # (C) Copyright 1996-2000, Ulrich Pfeifer  # (C) Copyright 1996-2000, Ulrich Pfeifer
# Line 16  use DB_File; Line 16  use DB_File;
16  use Fcntl;  use Fcntl;
17  use WAIT::Filter;  use WAIT::Filter;
18  use Carp;  use Carp;
19  use vars qw(%FUNC);  use vars qw(%FUNC $VERSION);
20    
21  my $O = pack('C', 0xff)."o";                  # occurances (document ferquency)  $VERSION = "1.801"; # others test if we are loaded by checking $VERSION
22    
23  # The document frequency is the number of documents a term occurs  # The dictionary has three different key types:
24  # in. The idea is that a term occuring in a significant part of the  #  'o'.$word
25  # documents is not too significant.  #
26    #     The document frequency is the number of documents a term occurs
27    #     in. The idea is that a term occuring in a significant part of the
28    #     documents is not too significant.
29    #
30    # 'm'.$word
31    #
32    #     The maximum term frequency of a document is the frequency of the
33    #     most frequent term in the document.  It is related to the document
34    #     length obviously.  A document in which the most frequnet term occurs
35    #     100 times is probably much longer than a document whichs most
36    #     frequent term occurs five time.
37    #
38    # 'p'.$word
39    #
40    #     Under this key we store the actual posting list as pairs of
41    #     packed integers.
42    
43  my $M = pack('C', 0xff)."m";                  # maxtf (term frequency)  my $no_old_index_support = 0; # do not check for old indices if set
   
 # The maximum term frequency of a document is the frequency of the  
 # most frequent term in the document.  It is related to the document  
 # length obviously.  A document in which the most frequnet term occurs  
 # 100 times is probably much longer than a document whichs most  
 # frequent term occurs five time.  
44    
45  sub new {  sub new {
46    my $type = shift;    my $type = shift;
# Line 134  sub drop { Line 144  sub drop {
144    }    }
145  }  }
146    
147    sub is_an_old_index {
148      my $self = shift;
149    
150      return 0 if $no_old_index_support;
151      return $self->{old_index} if exists $self->{old_index};
152    
153      # We can only guess if this is an old index. We lookup the first 10
154      # $O entries. If all values are integers, we assume that the index
155      # is an old one.
156    
157      defined $self->{db} or $self->open;
158      $self->sync;
159      my $dbh = $self->{dbh};       # for convenience
160    
161      my $O = pack('C', 0xff)."o";
162      my ($word, $value) = ($O.$;);  # $word and $value are modified!
163      $dbh->seq($word, $value, R_CURSOR) or return $self->{old_index} = 0;
164      for (my $i=0; $i<10;$i++) {
165        if ($value !~ /^\d+$/) {
166          return $self->{old_index} = 0;
167        }
168        if ($dbh->seq($word, $value, R_NEXT) or # no values left
169            $word !~ /^$O$;/o                   # no $O values left
170           ) {
171          # we are not sure enough that this is an old index
172          return $self->{old_index} = 0;
173        }
174      }
175      return $self->{old_index} = 1;
176    }
177    
178  sub open {  sub open {
179    my $self = shift;    my $self = shift;
180    my $file = $self->{file};    my $file = $self->{file};
# Line 150  sub open { Line 191  sub open {
191      $self->{cdict} = {}      $self->{cdict} = {}
192        if $self->{mode} & O_RDWR;        if $self->{mode} & O_RDWR;
193      $self->{cached} = 0;      $self->{cached} = 0;
194        if (!$no_old_index_support and $self->is_an_old_index()) {
195          warn "This is an old index, upgrade you database";
196          require WAIT::InvertedIndexOld;
197          bless $self, 'WAIT::InvertedIndexOld';
198        }
199    }    }
200  }  }
201    
# Line 164  sub insert { Line 210  sub insert {
210    $self->{records}++;    $self->{records}++;
211    while (($word, $noc) = each %occ) {    while (($word, $noc) = each %occ) {
212      if (defined $self->{cache}->{$word}) {      if (defined $self->{cache}->{$word}) {
213        $self->{cdict}->{$O,$word}++;        $self->{cdict}->{$word}++;
214        $self->{cache}->{$word} .= pack 'w2', $key, $noc;        $self->{cache}->{$word} .= pack 'w2', $key, $noc;
215      } else {      } else {          
216        $self->{cdict}->{$O,$word} = 1;        $self->{cdict}->{$word} = 1;
217        $self->{cache}->{$word}  = pack 'w2', $key, $noc;        $self->{cache}->{$word}  = pack 'w2', $key, $noc;
218      }      }
219      $self->{cached}++;      $self->{cached}++;
# Line 178  sub insert { Line 224  sub insert {
224    for (values %occ) {    for (values %occ) {
225      $maxtf = $_ if $_ > $maxtf;      $maxtf = $_ if $_ > $maxtf;
226    }    }
227    $self->{db}->{$M, $key} = $maxtf;    $self->{db}->{'m'. $key} = $maxtf;
228  }  }
229    
230  # We sort postings by increasing max term frequency (~ by increasing  # We sort postings by increasing max term frequency (~ by increasing
# Line 203  sub sort_postings { Line 249  sub sort_postings {
249    # inverse document frequence gives the score for a term.  This sort    # inverse document frequence gives the score for a term.  This sort
250    # order can be exploited for tuning of single term queries.    # order can be exploited for tuning of single term queries.
251    
252    for my $did (sort {    $post->{$b} / $self->{db}->{$M, $b}    for my $did (keys %$post) { # sanity check
253        unless ($self->{db}->{"m". $did}) {
254          warn "Warning from WAIT: DIVZERO threat from did[$did] post[$post->{$did}]";
255          $self->{db}->{"m". $did} = 1; # fails if we have not opened for writing
256        }
257      }
258      for my $did (sort {    $post->{$b} / $self->{db}->{'m'. $b}
259                                        <=>                                        <=>
260                           $post->{$a} / $self->{db}->{$M, $a}                           $post->{$a} / $self->{db}->{'m'. $a}
261                      } keys %$post) {                      } keys %$post) {
262      $r .= pack 'w2', $did, $post->{$did};      $r .= pack 'w2', $did, $post->{$did};
263    }    }
# Line 230  sub delete { Line 282  sub delete {
282    
283    grep $occ{$_}++, &{$self->{func}}(@_);    grep $occ{$_}++, &{$self->{func}}(@_);
284    
285      # Be prepared for "Odd number of elements in hash assignment"
286      local $SIG{__WARN__} = sub {
287        my $warning = shift;
288        chomp $warning;
289        warn "Catching warning[$warning] during delete of key[$key]";
290      };
291    for (keys %occ) {# may reorder posting list    for (keys %occ) {# may reorder posting list
292      my %post = unpack 'w*', $db->{$_};      my %post = unpack 'w*', $db->{'p'.$_};
293      delete $post{$key};      delete $post{$key};
294      $db->{$_}    = $self->sort_postings(\%post);      $db->{'p'.$_}    = $self->sort_postings(\%post);
295      _complain('delete of term', $_) if $db->{$O,$_}-1 != keys %post;      _complain('delete of term', $_) if $db->{'o'.$_}-1 != keys %post;
296      $db->{$O,$_} = scalar keys %post;      $db->{'o'.$_} = scalar keys %post;
297    }    }
298    delete $db->{$M, $key};    delete $db->{'m'. $key};
299  }  }
300    
301  sub intervall {  sub intervall {
# Line 260  sub intervall { Line 318  sub intervall {
318      ($first) = &{$self->{'ifunc'}}($first) if $first;      ($first) = &{$self->{'ifunc'}}($first) if $first;
319      ($last)  = &{$self->{'ifunc'}}($last) if $last;      ($last)  = &{$self->{'ifunc'}}($last) if $last;
320    }    }
321    if (defined $first and $first ne '') {         # set the cursor to $first    $first = 'p'.($first||'');
322      $dbh->seq($first, $value, R_CURSOR);    $last  = (defined $last)?'p'.$last:'q';
323    } else {  
324      $dbh->seq($first, $value, R_FIRST);    # set the cursor to $first
325    }    $dbh->seq($first, $value, R_CURSOR);
326    # We assume that word do not start with the character \377  
327    # $last = pack 'C', 0xff unless defined $last and $last ne '';    # $first would be after the last word
328    return () if defined $last and $first gt $last; # $first would be after the last word    return () if $first gt $last;
329        
330    push @result, $first;    push @result, substr($first,1);
331    while (!$dbh->seq($word, $value, R_NEXT)) {    while (!$dbh->seq($word, $value, R_NEXT)) {
332      # We should limit this to a "resonable" number of words      # We should limit this to a "resonable" number of words
333      last if (defined $last and $word gt $last) or $word =~ /^($M|$O)/o;      last if $word gt $last;
334      push @result, $word;      push @result, substr($word,1);
335    }    }
336    \@result;                     # speed    \@result;                     # speed
337  }  }
# Line 298  sub prefix { Line 356  sub prefix {
356      ($prefix) = &{$self->{'pfunc'}}($prefix);      ($prefix) = &{$self->{'pfunc'}}($prefix);
357    }    }
358    
359    if ($dbh->seq($word = $prefix, $value, R_CURSOR)) {    if ($dbh->seq($word = 'p'.$prefix, $value, R_CURSOR)) {
360      return ();      return ();
361    }    }
362    return () if $word !~ /^$prefix/;    return () if $word !~ /^p$prefix/;
363    push @result, $word;    push @result, substr($word,1);
364    
365    while (!$dbh->seq($word, $value, R_NEXT)) {    while (!$dbh->seq($word, $value, R_NEXT)) {
366      # We should limit this to a "resonable" number of words      # We should limit this to a "resonable" number of words
367      last if $word !~ /^$prefix/;      last if $word !~ /^p$prefix/;
368      push @result, $word;      push @result, substr($word,1);
369    }    }
370    \@result;                     # speed    \@result;                     # speed
371  }  }
# Line 403  sub search_raw { Line 461  sub search_raw {
461    # We keep duplicates    # We keep duplicates
462    my @terms =    my @terms =
463      # Sort words by decreasing document frequency      # Sort words by decreasing document frequency
464      sort { $self->{db}->{$O,$a} <=> $self->{db}->{$O,$b} }      sort { $self->{db}->{'o'.$a} <=> $self->{db}->{'o'.$b} }
465        # check which words occur in the index.        # check which words occur in the index.
466        grep { $self->{db}->{$O,$_} } @_;        grep { $self->{db}->{'o'.$_} } @_;
467    
468    return () unless @terms;                 # nothing to search for    return () unless @terms;                 # nothing to search for
469    
# Line 413  sub search_raw { Line 471  sub search_raw {
471    # choping off the rest of the list will return the same ranking.    # choping off the rest of the list will return the same ranking.
472    if ($wanted and @terms == 1) {    if ($wanted and @terms == 1) {
473      my $term  = shift @terms;      my $term  = shift @terms;
474      my $idf   = log($self->{records}/$self->{db}->{$O,$term});      my $idf   = log($self->{records}/$self->{db}->{'o'.$term});
475      my @res;      my @res;
476    
477      if ($self->{reorg}) { # or not $query->{picky}      if ($self->{reorg}) { # or not $query->{picky}
478        @res = unpack "w". int(2*$wanted), $self->{db}->{$term};        @res = unpack "w". int(2*$wanted), $self->{db}->{'p'.$term};
479      } else {      } else {
480        @res = unpack 'w*',                $self->{db}->{$term};        @res = unpack 'w*',                $self->{db}->{'p'.$term};
481      }      }
482    
483      for (my $i=1; $i<@res; $i+=2) {      for (my $i=1; $i<@res; $i+=2) {
484        $res[$i] /= $self->{db}->{$M, $res[$i-1]} / $idf;        # $res[$i] /= $self->{db}->{'m'. $res[$i-1]} / $idf;
485          # above was written badly, allows two DIV_ZERO problems.
486          my $maxtf = $self->{db}->{"m". $res[$i-1]};
487          unless ($maxtf) {
488            warn "WAIT-Warning: Averting DIVZERO for i[$i] \$res[\$i-1][$res[$i-1]] term[$term]";
489            $maxtf = 1;
490          }
491          $res[$i] = ($res[$i] / $maxtf) * $idf;
492      }      }
493    
494      return @res      return @res
# Line 434  sub search_raw { Line 499  sub search_raw {
499    # result.    # result.
500    unless ($wanted) {    unless ($wanted) {
501      for (@terms) {      for (@terms) {
502        my $df      = $self->{db}->{$O,$_};        my $df      = $self->{db}->{'o'.$_};
503    
504        # The frequency *must* be 1 at least since the posting list is nonempty        # The frequency *must* be 1 at least since the posting list is nonempty
505        _complain('search for term', $_) and $df = 1 if $df < 1;        _complain('search for term', $_) and $df = 1 if $df < 1;
506    
507        # Unpack posting list for current query term $_        # Unpack posting list for current query term $_
508        my %post = unpack 'w*', $self->{db}->{$_};        my %post = unpack 'w*', $self->{db}->{'p'.$_};
509    
510        _complain('search for term', $_) if $self->{db}->{$O,$_} != keys %post;        _complain('search for term', $_) if $self->{db}->{'o'.$_} != keys %post;
511        # This is the inverse document frequency. The log of the inverse        # This is the inverse document frequency. The log of the inverse
512        # fraction of documents the term occurs in.        # fraction of documents the term occurs in.
513        my $idf = log($self->{records}/$df);        my $idf = log($self->{records}/$df);
514        for my $did (keys %post) {        for my $did (keys %post) {
515          if (my $freq = $self->{db}->{$M, $did}) {          if (my $freq = $self->{db}->{'m'. $did}) {
516            $score{$did} += $post{$did} / $freq * $idf;            $score{$did} += $post{$did} / $freq * $idf;
517          }          }
518        }        }
# Line 460  sub search_raw { Line 525  sub search_raw {
525    unless ($strict) {    unless ($strict) {
526      for (@terms) {      for (@terms) {
527        # Unpack posting list for current query term $_        # Unpack posting list for current query term $_
528        my %post = unpack 'w*', $self->{db}->{$_};        my %post = unpack 'w*', $self->{db}->{'p'.$_};
529    
530        # Lookup the number of documents the term occurs in (document frequency)        # Lookup the number of documents the term occurs in (document frequency)
531        my $occ  = $self->{db}->{$O,$_};        my $occ  = $self->{db}->{'o'.$_};
532    
533        _complain('search for term', $_) if $self->{db}->{$O,$_} != keys %post;        _complain('search for term', $_) if $self->{db}->{'o'.$_} != keys %post;
534        # The frequency *must* be 1 at least since the posting list is nonempty        # The frequency *must* be 1 at least since the posting list is nonempty
535        _complain('search for term', $_) and $occ = 1 if $occ < 1;        _complain('search for term', $_) and $occ = 1 if $occ < 1;
536    
# Line 488  sub search_raw { Line 553  sub search_raw {
553                
554        if (keys %score < $wanted) {        if (keys %score < $wanted) {
555          for my $did (keys %post) {          for my $did (keys %post) {
556            if (my $freq = $self->{db}->{$M, $did}) {            if (my $freq = $self->{db}->{'m'. $did}) {
557              $score{$did} += $post{$did} / $freq * $idf;              $score{$did} += $post{$did} / $freq * $idf;
558            }            }
559          }          }
560        } else {        } else {
561          for my $did (keys %score) {          for my $did (keys %score) {
562            next unless exists $post{$did};            next unless exists $post{$did};
563            if (my $freq = $self->{db}->{$M, $did}) {            if (my $freq = $self->{db}->{'m'. $did}) {
564              $score{$did} += $post{$did} / $freq * $idf;              $score{$did} += $post{$did} / $freq * $idf;
565            }            }
566          }          }
# Line 515  sub search_raw { Line 580  sub search_raw {
580    for (my $i = $#terms; $i >=0; $i--) {    for (my $i = $#terms; $i >=0; $i--) {
581      local $_ = $terms[$i];      local $_ = $terms[$i];
582      # Lookup the number of documents the term occurs in (document frequency)      # Lookup the number of documents the term occurs in (document frequency)
583      my $df      = $self->{db}->{$O,$_};      my $df      = $self->{db}->{'o'.$_};
584    
585      # The frequency *must* be 1 at least since the posting list is nonempty      # The frequency *must* be 1 at least since the posting list is nonempty
586      _complain('search for term', $_) and $df = 1 if $df < 1;      _complain('search for term', $_) and $df = 1 if $df < 1;
# Line 526  sub search_raw { Line 591  sub search_raw {
591    
592      my ($did,$occ);      my ($did,$occ);
593      if ($self->{reorg}) {      if ($self->{reorg}) {
594        ($did,$occ) = unpack 'w2', $self->{db}->{$_};        ($did,$occ) = unpack 'w2', $self->{db}->{'p'.$_};
595      } else {                    # Maybe this costs more than it helps      } else {                    # Maybe this costs more than it helps
596        ($did,$occ) = unpack 'w2', $self->sort_postings($self->{db}->{$_});        ($did,$occ) = unpack 'w2', $self->sort_postings($self->{db}->{'p'.$_});
597      }      }
598      my $freq      = $self->{db}->{$M, $did};      my $freq      = $self->{db}->{'m'. $did};
599      my $max       = $occ/$freq*$idf[$i];      my $max       = $occ/$freq*$idf[$i];
600      $max[$i]      = $max + $max[$i+1];      $max[$i]      = $max + $max[$i+1];
601    }    }
# Line 540  sub search_raw { Line 605  sub search_raw {
605      my $term = $terms[$i];      my $term = $terms[$i];
606      # Unpack posting list for current query term $term. We loose the      # Unpack posting list for current query term $term. We loose the
607      # sorting order because the assignment to a hash.      # sorting order because the assignment to a hash.
608      my %post = unpack 'w*', $self->{db}->{$term};      my %post = unpack 'w*', $self->{db}->{'p'.$term};
609    
610      _complain('search for term', $term)      _complain('search for term', $term)
611        if $self->{db}->{$O,$term} != keys %post;        if $self->{db}->{'o'.$term} != keys %post;
612    
613      my $idf  = $idf[$i];      my $idf  = $idf[$i];
614      my $full;                   # Need to process all postings      my $full;                   # Need to process all postings
# Line 572  sub search_raw { Line 637  sub search_raw {
637        if (defined $chop) {        if (defined $chop) {
638          # We might be able to avoid allocating accumulators          # We might be able to avoid allocating accumulators
639          for my $did (keys %post) {          for my $did (keys %post) {
640            if (my $freq = $self->{db}->{$M, $did}) {            if (my $freq = $self->{db}->{'m'. $did}) {
641              my $wgt = $post{$did} / $freq * $idf;              my $wgt = $post{$did} / $freq * $idf;
642              # We add an accumulator if $wgt exeeds $chop              # We add an accumulator if $wgt exeeds $chop
643              if (exists $score{$did} or $wgt > $chop) {              if (exists $score{$did} or $wgt > $chop) {
# Line 583  sub search_raw { Line 648  sub search_raw {
648        } else {        } else {
649          # Allocate acumulators for each seen document.          # Allocate acumulators for each seen document.
650          for my $did (keys %post) {          for my $did (keys %post) {
651            if (my $freq = $self->{db}->{$M, $did}) {            if (my $freq = $self->{db}->{'m'. $did}) {
652              $score{$did} += $post{$did} / $freq * $idf;              $score{$did} += $post{$did} / $freq * $idf;
653            }            }
654          }          }
# Line 592  sub search_raw { Line 657  sub search_raw {
657        # Update existing accumulators        # Update existing accumulators
658        for my $did (keys %score) {        for my $did (keys %score) {
659          next unless exists $post{$did};          next unless exists $post{$did};
660          if (my $freq = $self->{db}->{$M, $did}) {          if (my $freq = $self->{db}->{'m'. $did}) {
661            $score{$did} += $post{$did} / $freq * $idf;            $score{$did} += $post{$did} / $freq * $idf;
662          }          }
663        }        }
# Line 615  sub set { Line 680  sub set {
680    
681    $self->sync;    $self->sync;
682    while (my($key, $value) = each %{$self->{db}}) {    while (my($key, $value) = each %{$self->{db}}) {
683      next if $key =~ /^\377[om]/;      next if $key !~ /^p/;
684      $self->{db}->{$key} = $self->sort_postings($value);      $self->{db}->{$key} = $self->sort_postings($value);
685    }    }
686    $self->{reorg} = 1;    $self->{reorg} = 1;
# Line 627  sub sync { Line 692  sub sync {
692    if ($self->{mode} & O_RDWR) {    if ($self->{mode} & O_RDWR) {
693      print STDERR "Flushing $self->{cached} postings\n" if $self->{cached};      print STDERR "Flushing $self->{cached} postings\n" if $self->{cached};
694      while (my($key, $value) = each %{$self->{cache}}) {      while (my($key, $value) = each %{$self->{cache}}) {
695          $self->{db}->{"p". $key} ||= "";
696        if ($self->{reorg}) {        if ($self->{reorg}) {
697          $self->{db}->{$key} = $self->sort_postings($self->{db}->{$key}          $self->{db}->{'p'.$key} = $self->sort_postings($self->{db}->{'p'.$key}
698                                                     . $value);                                                     . $value);
699        } else {        } else {
700          $self->{db}->{$key} .= $value;          $self->{db}->{'p'.$key} .= $value;
701        }        }
702      }      }
703      while (my($key, $value) = each %{$self->{cdict}}) {      while (my($key, $value) = each %{$self->{cdict}}) {
704        $self->{db}->{$key} = 0 unless  $self->{db}->{$key};        $self->{db}->{'o'.$key} = 0 unless  $self->{db}->{'o'.$key};
705        $self->{db}->{$key} += $value;        $self->{db}->{'o'.$key} += $value;
706      }      }
707      $self->{cache}  = {};      $self->{cache}  = {};
708      $self->{cdict}  = {};      $self->{cdict}  = {};

Legend:
Removed from v.19  
changed lines
  Added in v.36

  ViewVC Help
Powered by ViewVC 1.1.26