/[wait]/cvs-head/lib/WAIT/InvertedIndex.pm
This is a repository of my old source code, which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Diff of /cvs-head/lib/WAIT/InvertedIndex.pm

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 51 by ulpfr, Mon Dec 31 14:00:22 2001 UTC revision 82 by ulpfr, Mon Apr 22 10:09:34 2002 UTC
# Line 4  Line 4 
4  # Author          : Ulrich Pfeifer  # Author          : Ulrich Pfeifer
5  # Created On      : Thu Aug  8 13:05:10 1996  # Created On      : Thu Aug  8 13:05:10 1996
6  # Last Modified By: Ulrich Pfeifer  # Last Modified By: Ulrich Pfeifer
7  # Last Modified On: Mon Dec 31 14:30:05 2001  # Last Modified On: Sat Apr 20 16:56:29 2002
8  # Language        : CPerl  # Language        : CPerl
9  #  #
10  # (C) Copyright 1996-2000, Ulrich Pfeifer  # (C) Copyright 1996-2002, Ulrich Pfeifer
11  #  #
12    
13  package WAIT::InvertedIndex;  package WAIT::InvertedIndex;
# Line 18  use WAIT::Filter; Line 18  use WAIT::Filter;
18  use Carp;  use Carp;
19  use vars qw(%FUNC $VERSION);  use vars qw(%FUNC $VERSION);
20    
21  $VERSION = "1.801"; # others test if we are loaded by checking $VERSION  $VERSION = "1.900"; # others test if we are loaded by checking $VERSION
22    
23  # The dictionary has three different key types:  # The dictionary has three different key types:
24  #  'o'.$word  #  'o'.$word
25  #  #
26  #     The document frequency is the number of documents a term occurs  #     The document frequency is the number of documents a term occurs
27  #     in. The idea is that a term occurring in a significant part of the  #     in. The idea is that a term occurring in a significant portion of the
28  #     documents is not too significant.  #     documents is not too significant.
29  #  #
30  # 'm'.$word  # 'm'.$word
31  #  #
32  #     The maximum term frequency of a document is the frequency of the  #     The maximum term frequency of a document is the frequency of the
33  #     most frequent term in the document.  It is related to the document  #     most frequent term in the document.  It is related to the document
34  #     length obviously.  A document in which the most frequnet term occurs  #     length obviously.  A document in which the most frequent term occurs
35  #     100 times is probably much longer than a document whose most  #     100 times is probably much longer than a document whose most
36  #     frequent term occurs five times.  #     frequent term occurs five times.
37  #  #
# Line 156  sub is_an_old_index { Line 156  sub is_an_old_index {
156    
157    defined $self->{db} or $self->open;    defined $self->{db} or $self->open;
158    $self->sync;    $self->sync;
159    my $dbh = $self->{dbh};       # for convenience    my $dbh = $self->{dbh} or return $self->{old_index} = 0;       # for convenience
160    
161    my $O = pack('C', 0xff)."o";    my $O = pack('C', 0xff)."o";
162    my ($word, $value) = ($O.$;);  # $word and $value are modified by seq!    my ($word, $value) = ($O.$;);  # $word and $value are modified by seq!
# Line 251  sub sort_postings { Line 251  sub sort_postings {
251    my $r = '';    my $r = '';
252    
253    # Sort posting list by increasing ratio of maximum term frequency (~    # Sort posting list by increasing ratio of maximum term frequency (~
254    # "document length") and term frequency. This rati multiplied by the    # "document length") and term frequency. This ratio multiplied by the
255    # inverse document frequency gives the score for a term.  This sort    # inverse document frequency gives the score for a term.  This sort
256    # order can be exploited for tuning of single term queries.    # order can be exploited for tuning of single term queries.
257    
# Line 411  sub search { Line 411  sub search {
411    
412    defined $self->{db} or $self->open;    defined $self->{db} or $self->open;
413    $self->sync;    $self->sync;
414    $self->search_raw($query, &{$self->{func}}(@_)); # No call to parse() here    $self->search_raw($query, &{$self->{func}}(@_)); # No call to parse() there
415  }  }
416    
417  sub parse {  sub parse {
# Line 613  sub search_raw { Line 613  sub search_raw {
613      my $full;                   # Need to process all postings      my $full;                   # Need to process all postings
614      my $chop;                   # Score necessary to enter the ranking list      my $chop;                   # Score necessary to enter the ranking list
615    
616      if (# We know that wanted is true since we especial cased the      if (# We know that wanted is true since we special cased the
617          # exhaustive search.          # exhaustive search.
618    
619          $wanted and          $wanted and

Legend:
Removed from v.51  
changed lines
  Added in v.82

  ViewVC Help
Powered by ViewVC 1.1.26