/[wait]/cvs-head/lib/WAIT/InvertedIndex.pm
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Diff of /cvs-head/lib/WAIT/InvertedIndex.pm

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 36 by ulpfr, Sun Nov 12 17:01:59 2000 UTC revision 82 by ulpfr, Mon Apr 22 10:09:34 2002 UTC
# Line 4  Line 4 
4  # Author          : Ulrich Pfeifer  # Author          : Ulrich Pfeifer
5  # Created On      : Thu Aug  8 13:05:10 1996  # Created On      : Thu Aug  8 13:05:10 1996
6  # Last Modified By: Ulrich Pfeifer  # Last Modified By: Ulrich Pfeifer
7  # Last Modified On: Sun Nov 12 14:40:21 2000  # Last Modified On: Sat Apr 20 16:56:29 2002
8  # Language        : CPerl  # Language        : CPerl
9  #  #
10  # (C) Copyright 1996-2000, Ulrich Pfeifer  # (C) Copyright 1996-2002, Ulrich Pfeifer
11  #  #
12    
13  package WAIT::InvertedIndex;  package WAIT::InvertedIndex;
# Line 18  use WAIT::Filter; Line 18  use WAIT::Filter;
18  use Carp;  use Carp;
19  use vars qw(%FUNC $VERSION);  use vars qw(%FUNC $VERSION);
20    
21  $VERSION = "1.801"; # others test if we are loaded by checking $VERSION  $VERSION = "1.900"; # others test if we are loaded by checking $VERSION
22    
23  # The dictionary has three different key types:  # The dictionary has three different key types:
24  #  'o'.$word  #  'o'.$word
25  #  #
26  #     The document frequency is the number of documents a term occurs  #     The document frequency is the number of documents a term occurs
27  #     in. The idea is that a term occuring in a significant part of the  #     in. The idea is that a term occuring in a significant portion of the
28  #     documents is not too significant.  #     documents is not too significant.
29  #  #
30  # 'm'.$word  # 'm'.$word
31  #  #
32  #     The maximum term frequency of a document is the frequency of the  #     The maximum term frequency of a document is the frequency of the
33  #     most frequent term in the document.  It is related to the document  #     most frequent term in the document.  It is related to the document
34  #     length obviously.  A document in which the most frequnet term occurs  #     length obviously.  A document in which the most frequent term occurs
35  #     100 times is probably much longer than a document whichs most  #     100 times is probably much longer than a document whichs most
36  #     frequent term occurs five time.  #     frequent term occurs five time.
37  #  #
# Line 156  sub is_an_old_index { Line 156  sub is_an_old_index {
156    
157    defined $self->{db} or $self->open;    defined $self->{db} or $self->open;
158    $self->sync;    $self->sync;
159    my $dbh = $self->{dbh};       # for convenience    my $dbh = $self->{dbh} or return $self->{old_index} = 0;       # for convenience
160    
161    my $O = pack('C', 0xff)."o";    my $O = pack('C', 0xff)."o";
162    my ($word, $value) = ($O.$;);  # $word and $value are modified!    my ($word, $value) = ($O.$;);  # $word and $value are modified by seq!
163    $dbh->seq($word, $value, R_CURSOR) or return $self->{old_index} = 0;    if ( my $ret = $dbh->seq($word, $value, R_CURSOR) ) {
164        # warn "DEBUG: ret[$ret], not an old index, either empty or no \$^O";
165        return $self->{old_index} = 0;
166      }
167    for (my $i=0; $i<10;$i++) {    for (my $i=0; $i<10;$i++) {
168      if ($value !~ /^\d+$/) {      if ($value !~ /^\d+$/) {
169          # warn "DEBUG: word[$word]value[$value], not an old index";
170        return $self->{old_index} = 0;        return $self->{old_index} = 0;
171      }      }
172      if ($dbh->seq($word, $value, R_NEXT) or # no values left      if (my $ret = $dbh->seq($word, $value, R_NEXT) or # no values left
173          $word !~ /^$O$;/o                   # no $O values left          $word !~ /^$O$;/o                   # no $O values left
174         ) {         ) {
175        # we are not sure enough that this is an old index        # we are not sure enough that this is an old index
176          # warn "DEBUG: ret[$ret]word[$word]value[$value], not an old index";
177        return $self->{old_index} = 0;        return $self->{old_index} = 0;
178      }      }
179    }    }
180      # warn "DEBUG: old index";
181    return $self->{old_index} = 1;    return $self->{old_index} = 1;
182  }  }
183    
# Line 245  sub sort_postings { Line 251  sub sort_postings {
251    my $r = '';    my $r = '';
252    
253    # Sort posting list by increasing ratio of maximum term frequency (~    # Sort posting list by increasing ratio of maximum term frequency (~
254    # "document length") and term frequency. This rati multipied by the    # "document length") and term frequency. This ratio multipied by the
255    # inverse document frequence gives the score for a term.  This sort    # inverse document frequence gives the score for a term.  This sort
256    # order can be exploited for tuning of single term queries.    # order can be exploited for tuning of single term queries.
257    
# Line 405  sub search { Line 411  sub search {
411    
412    defined $self->{db} or $self->open;    defined $self->{db} or $self->open;
413    $self->sync;    $self->sync;
414    $self->search_raw($query, &{$self->{func}}(@_)); # No call to parse() here    $self->search_raw($query, &{$self->{func}}(@_)); # No call to parse() there
415  }  }
416    
417  sub parse {  sub parse {
# Line 415  sub parse { Line 421  sub parse {
421    &{$self->{func}}(@_);    &{$self->{func}}(@_);
422  }  }
423    
 sub keys {  
   my $self  = shift;  
   
   defined $self->{db} or $self->open;  
   keys %{$self->{db}};  
 }  
   
424  sub search_prefix {  sub search_prefix {
425    my $self  = shift;    my $self  = shift;
426    
# Line 465  sub search_raw { Line 464  sub search_raw {
464        # check which words occur in the index.        # check which words occur in the index.
465        grep { $self->{db}->{'o'.$_} } @_;        grep { $self->{db}->{'o'.$_} } @_;
466    
467    return () unless @terms;                 # nothing to search for    return unless @terms;
468    
469    # We special-case one term queries here.  If the index was sorted,    # We special-case one term queries here.  If the index was sorted,
470    # choping off the rest of the list will return the same ranking.    # choping off the rest of the list will return the same ranking.
# Line 614  sub search_raw { Line 613  sub search_raw {
613      my $full;                   # Need to process all postings      my $full;                   # Need to process all postings
614      my $chop;                   # Score necessary to enter the ranking list      my $chop;                   # Score necessary to enter the ranking list
615    
616      if (# We know that wanted is true since we especial cased the      if (# We know that wanted is true since we special cased the
617          # exhaustive search.          # exhaustive search.
618    
619          $wanted and          $wanted and
# Line 728  sub close { Line 727  sub close {
727    }    }
728  }  }
729    
730    sub keys {
731      my $self  = shift;
732    
733      defined $self->{db} or $self->open;
734      keys %{$self->{db}};
735    }
736    
737  1;  1;
738    

Legend:
Removed from v.36  
changed lines
  Added in v.82

  ViewVC Help
Powered by ViewVC 1.1.26