/[wait]/trunk/lib/WAIT/InvertedIndex.pm
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Diff of /trunk/lib/WAIT/InvertedIndex.pm

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

branches/CPAN/lib/WAIT/InvertedIndex.pm revision 11 by unknown, Fri Apr 28 15:41:10 2000 UTC trunk/lib/WAIT/InvertedIndex.pm revision 108 by dpavlin, Tue Jul 13 17:41:12 2004 UTC
# Line 1  Line 1 
1  #                              -*- Mode: Perl -*-  # -*- Mode: cperl; fill-column: 79 -*-
2  # InvertedIndex.pm --  # $Basename: InvertedIndex.pm $
3  # ITIID           : $ITI$ $Header $__Header$  # $Revision: 1.30 $
4  # Author          : Ulrich Pfeifer  # Author          : Ulrich Pfeifer
5  # Created On      : Thu Aug  8 13:05:10 1996  # Created On      : Thu Aug  8 13:05:10 1996
6  # Last Modified By: Ulrich Pfeifer  # Last Modified By: Ulrich Pfeifer
7  # Last Modified On: Sun Nov 22 18:44:42 1998  # Last Modified On: Mon Apr 22 16:52:01 2002
8  # Language        : CPerl  # Language        : CPerl
 # Status          : Unknown, Use with caution!  
9  #  #
10  # Copyright (c) 1996-1997, Ulrich Pfeifer  # (C) Copyright 1996-2002, Ulrich Pfeifer
11  #  #
12    
13  package WAIT::InvertedIndex;  package WAIT::InvertedIndex;
14  use strict;  use strict;
15  use DB_File;  use BerkeleyDB;
16  use Fcntl;  use Fcntl;
17  use WAIT::Filter;  use WAIT::Filter;
18  use Carp;  use Carp;
19  use vars qw(%FUNC);  use vars qw(%FUNC $VERSION);
20    use Time::HiRes qw(time);
21    
22  my $O = pack('C', 0xff)."o";                  # occurances  $VERSION = "2.000"; # others test if we are loaded by checking $VERSION
23  my $M = pack('C', 0xff)."m";                  # maxtf  
24    use constant DOCFREQ_O     => "o";
25    use constant MAXTF_M       => "m";
26    use constant POSTINGLIST_P => "p";
27    use constant PMATCH        => qr/^(??{POSTINGLIST_P()})/;
28    
29    # The dictionary has three different key types:
30    
31    #  'o'.$word
32    #
33    #     The document frequency is the number of documents a term occurs
34    #     in. The idea is that a term occuring in a significant portion of the
35    #     documents is not too significant.
36    #
37    # 'm'.$word
38    #
39    #     The maximum term frequency of a document is the frequency of the
40    #     most frequent term in the document.  It is related to the document
41    #     length obviously.  A document in which the most frequent term occurs
42    #     100 times is probably much longer than a document whichs most
43    #     frequent term occurs five time.
44    #
45    # 'p'.$word
46    #
47    #     Under this key we store the actual posting list as pairs of
48    #     packed integers.
49    
50  sub new {  sub new {
51    my $type = shift;    my $type = shift;
52    my %parm = @_;    my %parm = @_;
53    my $self = {};    my $self = {};
54    
55    $self->{file}     = $parm{file}     or croak "No file specified";    for my $x (qw(file attr subname env maindbfile tablename)) {
56    $self->{attr}     = $parm{attr}     or croak "No attributes specified";      $self->{$x}     = $parm{$x}     or croak "No $x specified";
57      }
58    
59    $self->{filter}   = $parm{filter};    $self->{filter}   = $parm{filter};
60    $self->{'name'}   = $parm{'name'};    $self->{'name'}   = $parm{'name'};
61    $self->{records}  = 0;    $self->{records}  = 0;
# Line 44  sub new { Line 71  sub new {
71    bless $self, ref($type) || $type;    bless $self, ref($type) || $type;
72  }  }
73    
74  sub name {$_[0]->{'name'}}  for my $accessor (qw(name maindbfile tablename subname)) {
75      no strict 'refs';
76      *{$accessor} = sub {
77        my($self) = @_;
78        return $self->{$accessor} if $self->{$accessor};
79        require Carp;
80        Carp::confess("accessor $accessor not there");
81      }
82    }
83    
84  sub _split_pos {  sub _split_pos {
85    my ($text, $pos) = @{$_[0]};    my ($text, $pos) = @{$_[0]};
# Line 63  sub _split_pos { Line 98  sub _split_pos {
98  sub _xfiltergen {  sub _xfiltergen {
99    my $filter = pop @_;    my $filter = pop @_;
100    
101    if ($filter eq 'stop') {      # avoid the slow stopword elimination  # Oops, we cannot overrule the user's choice. Other filters may kill
102      return _xfiltergen(@_);            # it's cheaper to look them up afterwards  # stopwords, such as isotr clobbers "isn't" to "isnt".
103    }  
104    #  if ($filter eq 'stop') {      # avoid the slow stopword elimination
105    #    return _xfiltergen(@_);            # it's cheaper to look them up afterwards
106    #  }
107    if (@_) {    if (@_) {
108      if ($filter =~ /^split(\d*)/) {      if ($filter =~ /^split(\d*)/) {
109        if ($1) {        if ($1) {
# Line 130  sub open { Line 168  sub open {
168    } else {    } else {
169      $self->{func}     =      $self->{func}     =
170        eval sprintf("sub {grep /./, %s}", _filtergen(@{$self->{filter}}));        eval sprintf("sub {grep /./, %s}", _filtergen(@{$self->{filter}}));
171      $self->{dbh} = tie(%{$self->{db}}, 'DB_File', $file,      my $flags;
172                         $self->{mode}, 0664, $DB_BTREE);      if ($self->{mode} & O_RDWR) {
173  #    tie(%{$self->{cache}}, 'DB_File', undef,        $flags = DB_CREATE; # | DB_INIT_MPOOL | DB_PRIVATE | DB_INIT_CDB;
174  #        $self->{mode}, 0664, $DB_BTREE)        warn "Flags on inverted $file set to 'writing'";
175        } else {
176          $flags = DB_RDONLY;
177          # warn "Flags on inverted $file set to 'readonly'";
178        }
179        my $filename = $self->maindbfile or die;
180        my $subname  = join("/",$self->tablename || die,$self->subname || die);
181        my $env = $self->{env} || "[undef]";
182        $self->{dbh} = tie(%{$self->{db}}, 'BerkeleyDB::Btree',
183                           # Filename => $file,
184                           Filename => $filename,
185                           $self->{env} ? (Env  => $self->{env}) : (),
186                           Subname => $subname,
187                           Mode => 0664,
188                           Flags => $flags,
189                           $WAIT::Database::Cachesize?(Cachesize => $WAIT::Database::Cachesize):(),
190                           $WAIT::Database::Pagesize?(Pagesize => $WAIT::Database::Pagesize):(),
191                           ) or die "Couldn't tie: $BerkeleyDB::Error; filename=>'$filename', env=>'$env',subname=>'$subname',flags=>'$flags'";
192      $self->{cache} = {}      $self->{cache} = {}
193        if $self->{mode} & O_RDWR;        if $self->{mode} & O_RDWR;
 #    tie(%{$self->{cdict}}, 'DB_File', undef,  
 #        $self->{mode}, 0664, $DB_BTREE)  
194      $self->{cdict} = {}      $self->{cdict} = {}
195        if $self->{mode} & O_RDWR;        if $self->{mode} & O_RDWR;
196      $self->{cached} = 0;      $self->{cached} = 0;
# Line 148  sub insert { Line 201  sub insert {
201    my $self  = shift;    my $self  = shift;
202    my $key   = shift;    my $key   = shift;
203    my %occ;    my %occ;
204      
205    defined $self->{db} or $self->open;    defined $self->{db} or $self->open;
206      defined $self->{db} or die "open didn't help!!!";
207    grep $occ{$_}++, &{$self->{func}}(@_);    grep $occ{$_}++, &{$self->{func}}(@_);
208    my ($word, $noc);    my ($word, $noc);
209    $self->{records}++;    $self->{records}++;
210    while (($word, $noc) = each %occ) {    while (($word, $noc) = each %occ) {
211      if (defined $self->{cache}->{$word}) {      if (defined $self->{cache}->{$word}) {
212        $self->{cdict}->{$O,$word}++;        $self->{cdict}->{$word}++;
213        $self->{cache}->{$word} .= pack 'w2', $key, $noc;        $self->{cache}->{$word} .= pack 'w2', $key, $noc;
214      } else {      } else {
215        $self->{cdict}->{$O,$word} = 1;        $self->{cdict}->{$word} = 1;
216        $self->{cache}->{$word}  = pack 'w2', $key, $noc;        $self->{cache}->{$word}  = pack 'w2', $key, $noc;
217      }      }
218      $self->{cached}++;      $self->{cached}++;
219    }    }
220      # This cache limit should be configurable
221    $self->sync if $self->{cached} > 100_000;    $self->sync if $self->{cached} > 100_000;
222    my $maxtf = 0;    my $maxtf = 0;
223    for (values %occ) {    for (values %occ) {
224      $maxtf = $_ if $_ > $maxtf;      $maxtf = $_ if $_ > $maxtf;
225    }    }
226    $self->{db}->{$M, $key} = $maxtf;    $self->{db}->{MAXTF_M . $key} = $maxtf;
227    }
228    
229    # We sort postings by increasing max term frequency (~ by increasing
230    # document length.  This reduces the quality degradation if we process
231    # only the first part of a posting list.
232    
233    sub sort_postings {
234      my $self = shift;
235      my $post = shift;             # reference to a hash or packed string
236    
237      if (ref $post) {
238        # we skip the sort part, if the index is not sorted
239        return pack('w*', %$post) unless $self->{reorg};
240      } else {
241        $post = { unpack 'w*', $post };
242      }
243    
244      my $r = '';
245    
246      # Sort posting list by increasing ratio of maximum term frequency (~
247      # "document length") and term frequency. This ratio multipied by the
248      # inverse document frequence gives the score for a term.  This sort
249      # order can be exploited for tuning of single term queries.
250    
251      for my $did (keys %$post) { # sanity check
252        unless ($self->{db}->{MAXTF_M . $did}) {
253          warn "WAIT Warning: DIVZERO threat from did[$did]post[$post]post{did}[$post->{$did}]";
254          $self->{db}->{MAXTF_M . $did} = 1; # fails if we have not opened for writing
255        }
256      }
257      for my $did (sort {    $post->{$b} / $self->{db}->{MAXTF_M . $b}
258                                          <=>
259                             $post->{$a} / $self->{db}->{MAXTF_M . $a}
260                        } keys %$post) {
261        $r .= pack 'w2', $did, $post->{$did};
262      }
263      #warn sprintf "reorg %d %s\n", scalar keys %$post, join ' ', unpack 'w*', $r;
264      $r;
265  }  }
266    
267  sub delete {  sub delete {
# Line 176  sub delete { Line 269  sub delete {
269    my $key   = shift;    my $key   = shift;
270    my %occ;    my %occ;
271    
272      my $db;
273    defined $self->{db} or $self->open;    defined $self->{db} or $self->open;
274      $db = $self->{db};
275    $self->sync;    $self->sync;
276    $self->{records}--;    $self->{records}--;
277    
278      # less than zero documents in database?
279      _complain('delete of document', $key) and $self->{records} = 0
280        if $self->{records} < 0;
281    
282    grep $occ{$_}++, &{$self->{func}}(@_);    grep $occ{$_}++, &{$self->{func}}(@_);
283    for (keys %occ) {  
284      # may reorder posting list    # Be prepared for "Odd number of elements in hash assignment"
285      my %post = unpack 'w*', $self->{db}->{$_};    local $SIG{__WARN__} = sub {
286      $self->{db}->{$O,$_}--;      my $warning = shift;
287        chomp $warning;
288        warn "Catching warning[$warning] during delete of key[$key]";
289      };
290      for (keys %occ) {# may reorder posting list
291        my %post = unpack 'w*', $db->{POSTINGLIST_P . $_};
292      delete $post{$key};      delete $post{$key};
293      $self->{db}->{$_} = pack 'w*', %post;      $db->{POSTINGLIST_P . $_}    = $self->sort_postings(\%post);
294        _complain('delete of term', $_) if $db->{DOCFREQ_O . $_}-1 != keys %post;
295        $db->{DOCFREQ_O . $_} = scalar keys %post;
296    }    }
297    delete $self->{db}->{$M, $key};    delete $db->{MAXTF_M . $key};
298  }  }
299    
300  sub intervall {  sub intervall {
301    my ($self, $first, $last) = @_;    my ($self, $first, $last) = @_;
   my $value = '';  
   my $word  = '';  
   my @result;  
302    
303    return unless exists $self->{'intervall'};    die "intervall broken in this version of WAIT: need to fix the
304      R_CURSOR and R_NEXT lines";
305    
306    defined $self->{db} or $self->open;  ####      my $value = '';
307    $self->sync;  ####      my $word  = '';
308    my $dbh = $self->{dbh};       # for convenience  ####      my @result;
309    ####    
310    if (ref $self->{'intervall'}) {  ####      return unless exists $self->{'intervall'};
311      unless (exists $self->{'ifunc'}) {  ####    
312        $self->{'ifunc'} =  ####      defined $self->{db} or $self->open;
313          eval sprintf("sub {grep /./, %s}", _filtergen(@{$self->{intervall}}));  ####      $self->sync;
314      }  ####      my $dbh = $self->{dbh};       # for convenience
315      ($first) = &{$self->{'ifunc'}}($first) if $first;  ####    
316      ($last)  = &{$self->{'ifunc'}}($last) if $last;  ####      if (ref $self->{'intervall'}) {
317    }  ####        unless (exists $self->{'ifunc'}) {
318    if (defined $first and $first ne '') {         # set the cursor to $first  ####          $self->{'ifunc'} =
319      $dbh->seq($first, $value, R_CURSOR);  ####            eval sprintf("sub {grep /./, %s}", _filtergen(@{$self->{intervall}}));
320    } else {  ####        }
321      $dbh->seq($first, $value, R_FIRST);  ####        ($first) = &{$self->{'ifunc'}}($first) if $first;
322    }  ####        ($last)  = &{$self->{'ifunc'}}($last) if $last;
323    # We assume that word do not start with the character \377  ####      }
324    # $last = pack 'C', 0xff unless defined $last and $last ne '';  ####      $first = POSTINGLIST_P . ($first||'');
325    return () if defined $last and $first gt $last; # $first would be after the last word  ####      $last  = (defined $last)?POSTINGLIST_P . $last:'q';
326      ####    
327    push @result, $first;  ####      # set the cursor to $first
328    while (!$dbh->seq($word, $value, R_NEXT)) {  ####      $dbh->seq($first, $value, R_CURSOR);
329      # We should limit this to a "resonable" number of words  ####    
330      last if (defined $last and $word gt $last) or $word =~ /^($M|$O)/o;  ####      # $first would be after the last word
331      push @result, $word;  ####      return () if $first gt $last;
332    }  ####      
333    \@result;                     # speed  ####      push @result, substr($first,1);
334    ####      while (!$dbh->seq($word, $value, R_NEXT)) {
335    ####        # We should limit this to a "resonable" number of words
336    ####        last if $word gt $last;
337    ####        push @result, substr($word,1);
338    ####      }
339    ####      \@result;                     # speed
340  }  }
341    
342  sub prefix {  sub prefix {
343    my ($self, $prefix) = @_;    my ($self, $prefix) = @_;
   my $value = '';  
   my $word  = '';  
   my @result;  
344    
345    return () unless defined $prefix; # Full dictionary requested !!    die "prefix not supported in this version of WAIT: need to fix the R_CURSOR";
   return unless exists $self->{'prefix'};  
   defined $self->{db} or $self->open;  
   $self->sync;  
   my $dbh = $self->{dbh};  
     
   if (ref $self->{'prefix'}) {  
     unless (exists $self->{'pfunc'}) {  
       $self->{'pfunc'} =  
         eval sprintf("sub {grep /./, %s}", _filtergen(@{$self->{prefix}}));  
     }  
     ($prefix) = &{$self->{'pfunc'}}($prefix);  
   }  
346    
   if ($dbh->seq($word = $prefix, $value, R_CURSOR)) {  
     return ();  
   }  
   return () if $word !~ /^$prefix/;  
   push @result, $word;  
347    
348    while (!$dbh->seq($word, $value, R_NEXT)) {  ####      my $value = '';
349      # We should limit this to a "resonable" number of words  ####      my $word  = '';
350      last if $word !~ /^$prefix/;  ####      my @result;
351      push @result, $word;  ####    
352    }  ####      return () unless defined $prefix; # Full dictionary requested !!
353    \@result;                     # speed  ####      return unless exists $self->{'prefix'};
354  }  ####      defined $self->{db} or $self->open;
355    ####      $self->sync;
356    ####      my $dbh = $self->{dbh};
357    ####      
358    ####      if (ref $self->{'prefix'}) {
359    ####        unless (exists $self->{'pfunc'}) {
360    ####          $self->{'pfunc'} =
361    ####            eval sprintf("sub {grep /./, %s}", _filtergen(@{$self->{prefix}}));
362    ####        }
363    ####        ($prefix) = &{$self->{'pfunc'}}($prefix);
364    ####      }
365    ####    
366    ####      if ($dbh->seq($word = POSTINGLIST_P . $prefix, $value, R_CURSOR)) {
367    ####        return ();
368    ####      }
369    ####      return () if $word !~ /^p$prefix/;
370    ####      push @result, substr($word,1);
371    ####    
372    ####      while (!$dbh->seq($word, $value, R_NEXT)) {
373    ####        # We should limit this to a "resonable" number of words
374    ####        last if $word !~ /^p$prefix/;
375    ####        push @result, substr($word,1);
376    ####      }
377    ####      \@result;                     # speed
378    }
379    
380    =head2 search($query)
381    
382    The search method supports a range of search algorithms.  It is
383    recommended to tune the index by calling
384    C<$table-E<gt>set(top=E<gt>1)> B<after> bulk inserting the documents
385    into the table.  This is a computing intense operation and all inserts
386    and deletes after this optimization are slightly more expensive.  Once
387    reorganized, the index is kept sorted automatically until you switch
388    the optimization off by calling C<$table-E<gt>set(top=E<gt>0)>.
389    
390    When searching a tuned index, a query can be processed faster if the
391    caller requests only the topmost documents.  This can be done by
392    passing a C<top =E<gt>> I<n> parameter to the search method.
393    
394    For single term queries, the method returns only the I<n> top ranking
395    documents.  For multi term queries two optimized algorithms are
396    available. The first algorithm computes the top n documents
397    approximately but very fast, sacrificing a little bit of precision for
398    speed.  The second algorithm computes the topmost I<n> documents
399    precisely.  This algorithm is slower and should be used only for small
400    values of I<n>.  It can be requested by passing the query attribute
401    C<picky =E<gt> 1>. Both algorithms may return more than I<n> hits.
402    While the picky version might not be faster than the brute force
403    version on average for modest size databases it uses less memory and
404    the processing time is almost linear in the number of query terms, not
405    in the size of the lists.
406    
407  sub search {  =cut
408    
409    sub search_ref {
410    my $self  = shift;    my $self  = shift;
411      my $query = shift;
412    
413      my $debugtime = 0;
414      my($time,$entertime);
415      our $STARTTIME;
416      if ($debugtime) {
417        $time = time;
418        $STARTTIME ||= $time;
419        if ($time-$STARTTIME > 5) {
420          $STARTTIME = $time;
421          warn "STARTTIME: $STARTTIME\n";
422        }
423        $entertime = time-$STARTTIME;
424        warn sprintf "ENTER TIME: %.4f\n", $entertime;
425      }
426    defined $self->{db} or $self->open;    defined $self->{db} or $self->open;
427    $self->sync;    $self->sync;
428    $self->search_raw(&{$self->{func}}(@_)); # No call to parse() here    my $ref = $self->search_raw_ref($query, &{$self->{func}}(@_)); # No call to parse() there
429      if ($debugtime) {
430        my $leavetime = time-$STARTTIME;
431        warn sprintf "LEAVE TIME: %.4f\n", $leavetime;
432        if ($leavetime-$entertime > .4) {
433          require Data::Dumper;
434          print STDERR "Line " . __LINE__ . ", File: " . __FILE__ . "\n" .
435              Data::Dumper->new([$query,\@_],[qw(query at_)])->Indent(1)->Useqq(1)->Dump; # XXX
436        }
437      }
438      $ref;
439  }  }
440    
441  sub parse {  sub parse {
# Line 282  sub search_prefix { Line 450  sub search_prefix {
450    
451    # print "search_prefix(@_)\n";    # print "search_prefix(@_)\n";
452    defined $self->{db} or $self->open;    defined $self->{db} or $self->open;
453    $self->search_raw(map($self->prefix($_), @_));    $self->search_raw_ref(map($self->prefix($_), @_));
454  }  }
455    
456  sub search_raw {  sub _complain ($$) {
457      my ($action, $term) = @_;
458    
459      require Carp;
460      Carp::cluck
461        (sprintf("WAIT database inconsistency during $action [%s]: ".
462                 "Please rebuild index\n",
463                 $term,));
464    }
465    
466    sub search_raw_ref {
467    my $self  = shift;    my $self  = shift;
468    my %occ;    my $query = shift;
469      # warn "DEBUG WAIT: search_raw_ref args 2..[@_]";
470    my %score;    my %score;
471    
472    return () unless @_;    # Top $top_wanted documents must be correct. Zero means all matching documents.
473      my $top_wanted = $query->{top};
474      my $picky_strict = $query->{picky};
475      # the option is really ignore_excess
476      my $ignore_excess = $query->{ignore_excess};
477    
478      # Return at least $minacc documents. Zero means all matching documents.
479    
480      # my $minacc = $query->{accus} || $top_wanted;
481    
482      # Open index and flush cache if necessary
483    defined $self->{db} or $self->open;    defined $self->{db} or $self->open;
484    $self->sync;    $self->sync;
485    grep $occ{$_}++, @_;  
486    for (keys %occ) {    # We keep duplicates
487      if (defined $self->{db}->{$_}) {    my @terms =
488        my %post = unpack 'w*', $self->{db}->{$_};      # Sort words by decreasing document frequency
489        my $idf = log($self->{records}/$self->{db}->{$O,$_});      sort { $self->{db}->{DOCFREQ_O . $a} <=> $self->{db}->{DOCFREQ_O . $b} }
490        my $did;        # check which words occur in the index.
491        for $did (keys %post) {        grep { $self->{db}->{DOCFREQ_O . $_} } @_;
492          $score{$did} = 0 unless defined $score{$did}; # perl -w  
493          $score{$did} += $post{$did} / $self->{db}->{$M, $did} * $idf    # warn "DEBUG WAIT: wanted[$top_wanted]terms[@terms]";
494            if $self->{db}->{$M, $did}; # db may be broken    return unless @terms;
495    
496      # We special-case one term queries here.  If the index was sorted,
497      # choping off the rest of the list will return the same ranking.
498      if ($top_wanted and @terms == 1) {
499        my $term  = shift @terms;
500        my $idf   = log($self->{records}/$self->{db}->{DOCFREQ_O . $term});
501        my @res;
502    
503        if ($self->{reorg}) { # or not $query->{picky}
504          @res = unpack "w". int(2*$top_wanted), $self->{db}->{POSTINGLIST_P . $term};
505          # warn sprintf "DEBUG WAIT: scalar(\@res)[%d]", scalar(@res);
506        } else {
507          @res = unpack 'w*',                $self->{db}->{POSTINGLIST_P . $term};
508        }
509    
510        for (my $i=1; $i<@res; $i+=2) {
511          # $res[$i] /= $self->{db}->{MAXTF_M . $res[$i-1]} / $idf;
512          # above was written badly, allows two DIV_ZERO problems.
513          my $maxtf = $self->{db}->{MAXTF_M . $res[$i-1]};
514          unless ($maxtf) {
515            warn "WAIT-Warning: Averting DIVZERO for i[$i] \$res[\$i-1][$res[$i-1]] term[$term]";
516            $maxtf = 1;
517          }
518          $res[$i] = ($res[$i] / $maxtf) * $idf;
519        }
520    
521        my %res = @res; # bloed: @res waere schon sortiert gewesen
522        return \%res;
523      }
524    
525      # We separate exhaustive search here to avoid overhead and make the
526      # code more readable. The block can be removed without changing the
527      # result.
528      unless ($top_wanted) {
529        for (@terms) {
530          my $df      = $self->{db}->{DOCFREQ_O . $_};
531    
532          # The frequency *must* be 1 at least since the posting list is nonempty
533          _complain('search for term', $_) and $df = 1 if $df < 1;
534    
535          # Unpack posting list for current query term $_
536          my %post = unpack 'w*', $self->{db}->{POSTINGLIST_P . $_};
537    
538          _complain('search for term', $_) if $self->{db}->{DOCFREQ_O . $_} != keys %post;
539          # This is the inverse document frequency. The log of the inverse
540          # fraction of documents the term occurs in.
541          my $idf = log($self->{records}/$df);
542          for my $did (keys %post) {
543            if (my $freq = $self->{db}->{MAXTF_M . $did}) {
544              $score{$did} += $post{$did} / $freq * $idf;
545            }
546          }
547        }
548        # warn sprintf "Used %d accumulators\n", scalar keys %score;
549        return \%score;
550      }
551    
552      # A sloppy but fast algorithm for multiple term queries.
553      unless ($picky_strict) {
554        for (@terms) {
555          # Unpack posting list for current query term $_
556          my %post;
557          if ($self->{reorg} && $top_wanted && $ignore_excess) {
558            %post = unpack 'w'. int(2*$ignore_excess) , $self->{db}->{POSTINGLIST_P . $_};
559          } else {
560            %post = unpack 'w*',                        $self->{db}->{POSTINGLIST_P . $_};
561          }
562          # warn sprintf "DEBUG WAIT: term[%s] keys %%post[%s]", $_, scalar keys %post;
563    
564          # Lookup the number of documents the term occurs in (document frequency)
565          my $occ  = $self->{db}->{DOCFREQ_O . $_};
566    
567          _complain('search for term', $_) if !$ignore_excess && $occ != keys %post;
568          # The frequency *must* be 1 at least since the posting list is nonempty
569          _complain('search for term', $_) and $occ = 1 if $occ < 1;
570    
571          # This is the inverse document frequency. The log of the inverse fraction
572          # of documents the term occurs in.
573          my $idf = log($self->{records}/$occ);
574    
575          # If we have a reasonable number of accumulators, change the
576          # loop to iterate over the accumulators.  This will compromise
577          # quality for better speed.  The algorithm still computes the
578          # exact weights, but the result is not guaranteed to contain the
579          # *best* results.  The database might contain documents better
580          # than the worst returned document.
581          
582          # We process the lists in order of increasing length.  When the
583          # number of accumulators exceeds $wanted, no new documents are
584          # added, only the ranking/weighting of the seen documents is
585          # improved.  The resulting ranking list must be pruned, since only
586          # the top most documents end up near their "optimal" rank.
587          
588          if (keys %score < $top_wanted) {
589    
590            # Diese folgende Schleife ist (WAR!) der Hammer fuer die Suche "mysql
591            # für dummies bellomo". Sie frisst 3.1+1.7 Sekunden.
592    
593            # Der erste Grund ist, dass 3 Begriffe noch nicht genug gebracht haben,
594            # aber der vierte viel zu viel bringt. Der zweite Grund ist, dass wir
595            # so viele Lookups in $self->{db} machen. Das Rechnen hingegen ist
596            # vermutlich billig.
597    
598            for my $did (keys %post) {
599              if (my $freq = $self->{db}->{MAXTF_M . $did}) {
600                $score{$did} += $post{$did} / $freq * $idf;
601              }
602            }
603          } else {
604            for my $did (keys %score) {
605              next unless exists $post{$did};
606              if (my $freq = $self->{db}->{MAXTF_M . $did}) {
607                $score{$did} += $post{$did} / $freq * $idf;
608              }
609            }
610        }        }
611      }      }
612        warn sprintf("DEBUG WAIT: returning from search_raw_ref at [%.3f] after terms[%s] with keys[%d]",
613                     time,
614                     join(":",@terms),
615                     scalar keys %score,
616                    );
617        return \%score;
618      }
619      my @max; $max[$#terms+1]=0;
620      my @idf;
621    
622      # Preparation loop.  This extra loop makes sense only when "reorg"
623      # and "wanted" are true.  But at the time beeing, keeping the code
624      # for the different search algorithms in one place seems more
625      # desirable than some minor speedup of the brute force version.  We
626      # do cache $idf though.
627    
628      for (my $i = $#terms; $i >=0; $i--) {
629        local $_ = $terms[$i];
630        # Lookup the number of documents the term occurs in (document frequency)
631        my $df      = $self->{db}->{DOCFREQ_O . $_};
632    
633        # The frequency *must* be 1 at least since the posting list is nonempty
634        _complain('search for term', $_) and $df = 1 if $df < 1;
635    
636        # This is the inverse document frequency. The log of the inverse
637        # fraction of documents the term occurs in.
638        $idf[$i] = log($self->{records}/$df);
639    
640        my ($did,$occ);
641        if ($self->{reorg}) {
642          ($did,$occ) = unpack 'w2', $self->{db}->{POSTINGLIST_P . $_};
643        } else {                    # Maybe this costs more than it helps
644          ($did,$occ) = unpack 'w2', $self->sort_postings($self->{db}->{POSTINGLIST_P . $_});
645        }
646        my $freq      = $self->{db}->{MAXTF_M . $did};
647        my $max       = $occ/$freq*$idf[$i];
648        $max[$i]      = $max + $max[$i+1];
649    }    }
650    %score;  
651      # Main loop
652      for my $i (0 .. $#terms) {
653        my $term = $terms[$i];
654        # Unpack posting list for current query term $term. We loose the
655        # sorting order because the assignment to a hash.
656        my %post = unpack 'w*', $self->{db}->{POSTINGLIST_P . $term};
657    
658        _complain('search for term', $term)
659          if $self->{db}->{DOCFREQ_O . $term} != keys %post;
660    
661        my $idf  = $idf[$i];
662        my $full;                   # Need to process all postings
663        my $chop;                   # Score necessary to enter the ranking list
664    
665        if (# We know that wanted is true since we special cased the
666            # exhaustive search.
667    
668            $top_wanted and
669    
670            # We did sort here if necessary in the preparation loop:
671            # $self->{reorg} and
672    
673            scalar keys %score > $top_wanted) {
674          $chop = (sort { $b <=> $a } values %score)[$top_wanted];
675          $full = $max[$i] > $chop;
676        } else {
677          $full = 1;
678        }
679    
680        if ($full) {
681          # We need to inspect the full list. Either $top_wanted is not given,
682          # the index is not sorted, or we don't have enough accumulators
683          # yet.
684          if (defined $chop) {
685            # We might be able to avoid allocating accumulators
686            for my $did (keys %post) {
687              if (my $freq = $self->{db}->{MAXTF_M . $did}) {
688                my $wgt = $post{$did} / $freq * $idf;
689                # We add an accumulator if $wgt exeeds $chop
690                if (exists $score{$did} or $wgt > $chop) {
691                  $score{$did} += $wgt;
692                }
693              }
694            }
695          } else {
696            # Allocate acumulators for each seen document.
697            for my $did (keys %post) {
698              if (my $freq = $self->{db}->{MAXTF_M . $did}) {
699                $score{$did} += $post{$did} / $freq * $idf;
700              }
701            }
702          }
703        } else {
704          # Update existing accumulators
705          for my $did (keys %score) {
706            next unless exists $post{$did};
707            if (my $freq = $self->{db}->{MAXTF_M . $did}) {
708              $score{$did} += $post{$did} / $freq * $idf;
709            }
710          }
711        }
712      }
713      #warn sprintf "Used %d accumulators\n", scalar keys %score;
714      \%score;
715    }
716    
717    sub set {
718      my ($self, $attr, $value) = @_;
719    
720      die "No such index attribute: '$attr'" unless $attr eq 'top';
721    
722      return delete $self->{reorg} if $value == 0;
723    
724      return if     $self->{reorg};     # we are sorted already
725      return unless $self->{mode} & O_RDWR;
726      defined $self->{db} or $self->open;
727    
728      $self->sync;
729      while (my($key, $value) = each %{$self->{db}}) {
730        next if $key !~ /^p/; # some day use PMATCH
731        $self->{db}{$key} = $self->sort_postings($value);
732      }
733      $self->{reorg} = 1;
734  }  }
735    
736  sub sync {  sub sync {
737    my $self = shift;    my $self = shift;
738      return unless $self->{mode} & O_RDWR;
739    if ($self->{mode} & O_RDWR) {    Carp::carp(sprintf "[%s] Flushing %d postings", scalar(localtime), $self->{cached})
740      print STDERR "\aFlushing $self->{cached} postings\n";          if $self->{cached};
741      while (my($key, $value) = each %{$self->{cache}}) {    while (my($key, $value) = each %{$self->{cache}}) {
742        $self->{db}->{$key} .= $value;      $self->{db}{POSTINGLIST_P . $key} ||= "";
743        #delete $self->{cache}->{$key};      if ($self->{reorg}) {
744      }        $self->{db}->{POSTINGLIST_P . $key} =
745      while (my($key, $value) = each %{$self->{cdict}}) {            $self->sort_postings($self->{db}->{POSTINGLIST_P . $key}
746        $self->{db}->{$key} = 0 unless  $self->{db}->{$key};                                 . $value);
747        $self->{db}->{$key} += $value;      } else {
748        #delete $self->{cdict}->{$key};        $self->{db}->{POSTINGLIST_P . $key} .= $value;
749      }      }
     $self->{cache} = {};  
     $self->{cdict} = {};  
     # print STDERR "*** $self->{cache} ", tied(%{$self->{cache}}), "==\n";  
     $self->{cached} = 0;  
     # $self->{dbh}->sync if $self->{dbh};  
750    }    }
751      while (my($key, $value) = each %{$self->{cdict}}) {
752        $self->{db}->{DOCFREQ_O . $key} = 0 unless  $self->{db}->{DOCFREQ_O . $key};
753        $self->{db}->{DOCFREQ_O . $key} += $value;
754      }
755      $self->{cache}  = {};
756      $self->{cdict}  = {};
757      $self->{cached} = 0;
758  }  }
759    
760  sub close {  sub close {
761    my $self = shift;    my $self = shift;
762    
763      delete $self->{env};
764    if ($self->{dbh}) {    if ($self->{dbh}) {
765      $self->sync;      $self->sync;
766      delete $self->{dbh};      delete $self->{dbh};
767      untie %{$self->{db}};      untie %{$self->{db}};
768      delete $self->{db};      for my $att (qw(db func cache cached cdict file maindbfile)) {
769      delete $self->{func};        delete $self->{$att};
770      delete $self->{cache};      }
771      delete $self->{cached};      for my $att (qw(pfunc ifunc xfunc)) {
772      delete $self->{cdict};        delete $self->{$att} if defined $self->{$att};
773      delete $self->{pfunc} if defined $self->{pfunc};      }
     delete $self->{ifunc} if defined $self->{ifunc};  
     delete $self->{xfunc} if defined $self->{xfunc};  
774    }    }
775  }  }
776    
777    sub keys {
778      my $self  = shift;
779    
780      defined $self->{db} or $self->open;
781      keys %{$self->{db}};
782    }
783    
784  1;  1;
785    

Legend:
Removed from v.11  
changed lines
  Added in v.108

  ViewVC Help
Powered by ViewVC 1.1.26