/[wait]/trunk/lib/WAIT/InvertedIndex.pm
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Annotation of /trunk/lib/WAIT/InvertedIndex.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 108 - (hide annotations)
Tue Jul 13 17:41:12 2004 UTC (19 years, 11 months ago) by dpavlin
File size: 25179 byte(s)
beginning of version 2.0 using BerkeleyDB (non-functional for now)

1 dpavlin 108 # -*- Mode: cperl; fill-column: 79 -*-
2 ulpfr 19 # $Basename: InvertedIndex.pm $
3     # $Revision: 1.30 $
4 ulpfr 10 # Author : Ulrich Pfeifer
5     # Created On : Thu Aug 8 13:05:10 1996
6     # Last Modified By: Ulrich Pfeifer
7 dpavlin 89 # Last Modified On: Mon Apr 22 16:52:01 2002
8 ulpfr 10 # Language : CPerl
9 ulpfr 19 #
10 ulpfr 80 # (C) Copyright 1996-2002, Ulrich Pfeifer
11 ulpfr 19 #
12 ulpfr 10
13     package WAIT::InvertedIndex;
14     use strict;
15 dpavlin 108 use BerkeleyDB;
16 ulpfr 10 use Fcntl;
17     use WAIT::Filter;
18     use Carp;
19 laperla 30 use vars qw(%FUNC $VERSION);
20 dpavlin 108 use Time::HiRes qw(time);
21 ulpfr 10
22 dpavlin 108 $VERSION = "2.000"; # others test if we are loaded by checking $VERSION
23 laperla 30
24 dpavlin 108 use constant DOCFREQ_O => "o";
25     use constant MAXTF_M => "m";
26     use constant POSTINGLIST_P => "p";
27     use constant PMATCH => qr/^(??{POSTINGLIST_P()})/;
28    
29 ulpfr 22 # The dictionary has three different key types:
30 dpavlin 108
31 ulpfr 22 # 'o'.$word
32     #
33     # The document frequency is the number of documents a term occurs
34 ulpfr 80 # in. The idea is that a term occuring in a significant portion of the
35 ulpfr 22 # documents is not too significant.
36     #
37     # 'm'.$word
38     #
39     # The maximum term frequency of a document is the frequency of the
40     # most frequent term in the document. It is related to the document
41 laperla 71 # length obviously. A document in which the most frequent term occurs
42 ulpfr 22 # 100 times is probably much longer than a document whichs most
43     # frequent term occurs five time.
44     #
45     # 'p'.$word
46     #
47     # Under this key we store the actual posting list as pairs of
48     # packed integers.
49 ulpfr 10
50     sub new {
51     my $type = shift;
52     my %parm = @_;
53     my $self = {};
54    
55 dpavlin 108 for my $x (qw(file attr subname env maindbfile tablename)) {
56     $self->{$x} = $parm{$x} or croak "No $x specified";
57     }
58    
59 ulpfr 10 $self->{filter} = $parm{filter};
60     $self->{'name'} = $parm{'name'};
61     $self->{records} = 0;
62     for (qw(intervall prefix)) {
63     if (exists $parm{$_}) {
64     if (ref $parm{$_}) {
65     $self->{$_} = [@{$parm{$_}}] # clone
66     } else {
67     $self->{$_} = $parm{$_}
68     }
69     }
70     }
71     bless $self, ref($type) || $type;
72     }
73    
74 dpavlin 108 for my $accessor (qw(name maindbfile tablename subname)) {
75     no strict 'refs';
76     *{$accessor} = sub {
77     my($self) = @_;
78     return $self->{$accessor} if $self->{$accessor};
79     require Carp;
80     Carp::confess("accessor $accessor not there");
81     }
82     }
83 ulpfr 10
84     sub _split_pos {
85     my ($text, $pos) = @{$_[0]};
86     my @result;
87    
88     $text =~ s/(^\s+)// and $pos += length($1);
89     while ($text =~ s/(^\S+)//) {
90     my $word = $1;
91     push @result, [$word, $pos];
92     $pos += length($word);
93     $text =~ s/(^\s+)// and $pos += length($1);
94     }
95     @result;
96     }
97    
98     sub _xfiltergen {
99     my $filter = pop @_;
100    
101 ulpfr 13 # Oops, we cannot overrule the user's choice. Other filters may kill
102     # stopwords, such as isotr clobbers "isn't" to "isnt".
103    
104     # if ($filter eq 'stop') { # avoid the slow stopword elimination
105     # return _xfiltergen(@_); # it's cheaper to look them up afterwards
106     # }
107 ulpfr 10 if (@_) {
108     if ($filter =~ /^split(\d*)/) {
109     if ($1) {
110     "grep(length(\$_->[0])>=$1, map(&WAIT::Filter::split_pos(\$_), " . _xfiltergen(@_) .'))' ;
111     } else {
112     "map(&WAIT::Filter::split_pos(\$_), " . _xfiltergen(@_) .')' ;
113     }
114     } else {
115     "map ([&WAIT::Filter::$filter(\$_->[0]), \$_->[1]]," ._xfiltergen(@_) .')';
116     }
117     } else {
118     if ($filter =~ /^split(\d*)/) {
119     if ($1) {
120     "grep(length(\$_->[0])>=$1, map(&WAIT::Filter::split_pos(\$_), [\$_[0], 0]))" ;
121     } else {
122     "map(&WAIT::Filter::split_pos(\$_), [\$_[0], 0])" ;
123     }
124     } else {
125     "map ([&WAIT::Filter::$filter(\$_->[0]), \$_->[1]], [\$_[0], 0])";
126     }
127     }
128     }
129    
130     sub parse_pos {
131     my $self = shift;
132    
133     unless (exists $self->{xfunc}) {
134     $self->{xfunc} =
135     eval sprintf("sub {%s}", _xfiltergen(@{$self->{filter}}));
136     #printf "\nsub{%s}$@\n", _xfiltergen(@{$self->{filter}});
137     }
138     &{$self->{xfunc}}($_[0]);
139     }
140    
141     sub _filtergen {
142     my $filter = pop @_;
143    
144     if (@_) {
145     "map(&WAIT::Filter::$filter(\$_), " . _filtergen(@_) . ')';
146     } else {
147     "map(&WAIT::Filter::$filter(\$_), \@_)";
148     }
149     }
150    
151     sub drop {
152     my $self = shift;
153     if ((caller)[0] eq 'WAIT::Table') { # Table knows about this
154     my $file = $self->{file};
155    
156     ! (!-e $file or unlink $file);
157     } else { # notify our database
158     croak ref($self)."::drop called directly";
159     }
160     }
161    
162     sub open {
163     my $self = shift;
164     my $file = $self->{file};
165    
166     if (defined $self->{dbh}) {
167     $self->{dbh};
168     } else {
169     $self->{func} =
170     eval sprintf("sub {grep /./, %s}", _filtergen(@{$self->{filter}}));
171 dpavlin 108 my $flags;
172     if ($self->{mode} & O_RDWR) {
173     $flags = DB_CREATE; # | DB_INIT_MPOOL | DB_PRIVATE | DB_INIT_CDB;
174     warn "Flags on inverted $file set to 'writing'";
175     } else {
176     $flags = DB_RDONLY;
177     # warn "Flags on inverted $file set to 'readonly'";
178     }
179     my $filename = $self->maindbfile or die;
180     my $subname = join("/",$self->tablename || die,$self->subname || die);
181     my $env = $self->{env} || "[undef]";
182     $self->{dbh} = tie(%{$self->{db}}, 'BerkeleyDB::Btree',
183     # Filename => $file,
184     Filename => $filename,
185     $self->{env} ? (Env => $self->{env}) : (),
186     Subname => $subname,
187     Mode => 0664,
188     Flags => $flags,
189     $WAIT::Database::Cachesize?(Cachesize => $WAIT::Database::Cachesize):(),
190     $WAIT::Database::Pagesize?(Pagesize => $WAIT::Database::Pagesize):(),
191     ) or die "Couldn't tie: $BerkeleyDB::Error; filename=>'$filename', env=>'$env',subname=>'$subname',flags=>'$flags'";
192 ulpfr 10 $self->{cache} = {}
193     if $self->{mode} & O_RDWR;
194     $self->{cdict} = {}
195     if $self->{mode} & O_RDWR;
196     $self->{cached} = 0;
197     }
198     }
199    
200     sub insert {
201     my $self = shift;
202     my $key = shift;
203     my %occ;
204 ulpfr 13
205 ulpfr 10 defined $self->{db} or $self->open;
206 dpavlin 108 defined $self->{db} or die "open didn't help!!!";
207 ulpfr 10 grep $occ{$_}++, &{$self->{func}}(@_);
208     my ($word, $noc);
209     $self->{records}++;
210     while (($word, $noc) = each %occ) {
211     if (defined $self->{cache}->{$word}) {
212 ulpfr 22 $self->{cdict}->{$word}++;
213 ulpfr 10 $self->{cache}->{$word} .= pack 'w2', $key, $noc;
214 dpavlin 108 } else {
215 ulpfr 22 $self->{cdict}->{$word} = 1;
216 ulpfr 10 $self->{cache}->{$word} = pack 'w2', $key, $noc;
217 ulpfr 13 }
218 ulpfr 10 $self->{cached}++;
219     }
220 ulpfr 19 # This cache limit should be configurable
221 ulpfr 10 $self->sync if $self->{cached} > 100_000;
222     my $maxtf = 0;
223     for (values %occ) {
224     $maxtf = $_ if $_ > $maxtf;
225     }
226 dpavlin 108 $self->{db}->{MAXTF_M . $key} = $maxtf;
227 ulpfr 10 }
228    
229 ulpfr 19 # We sort postings by increasing max term frequency (~ by increasing
230     # document length. This reduces the quality degradation if we process
231     # only the first part of a posting list.
232    
233     sub sort_postings {
234     my $self = shift;
235     my $post = shift; # reference to a hash or packed string
236    
237     if (ref $post) {
238     # we skip the sort part, if the index is not sorted
239     return pack('w*', %$post) unless $self->{reorg};
240     } else {
241     $post = { unpack 'w*', $post };
242     }
243    
244     my $r = '';
245    
246     # Sort posting list by increasing ratio of maximum term frequency (~
247 ulpfr 80 # "document length") and term frequency. This ratio multipied by the
248 ulpfr 19 # inverse document frequence gives the score for a term. This sort
249     # order can be exploited for tuning of single term queries.
250    
251 laperla 30 for my $did (keys %$post) { # sanity check
252 dpavlin 108 unless ($self->{db}->{MAXTF_M . $did}) {
253     warn "WAIT Warning: DIVZERO threat from did[$did]post[$post]post{did}[$post->{$did}]";
254     $self->{db}->{MAXTF_M . $did} = 1; # fails if we have not opened for writing
255 laperla 30 }
256     }
257 dpavlin 108 for my $did (sort { $post->{$b} / $self->{db}->{MAXTF_M . $b}
258 ulpfr 19 <=>
259 dpavlin 108 $post->{$a} / $self->{db}->{MAXTF_M . $a}
260 ulpfr 19 } keys %$post) {
261     $r .= pack 'w2', $did, $post->{$did};
262     }
263     #warn sprintf "reorg %d %s\n", scalar keys %$post, join ' ', unpack 'w*', $r;
264     $r;
265     }
266    
267 ulpfr 10 sub delete {
268     my $self = shift;
269     my $key = shift;
270     my %occ;
271    
272 ulpfr 19 my $db;
273 ulpfr 10 defined $self->{db} or $self->open;
274 ulpfr 19 $db = $self->{db};
275 ulpfr 10 $self->sync;
276     $self->{records}--;
277 ulpfr 19
278     # less than zero documents in database?
279     _complain('delete of document', $key) and $self->{records} = 0
280     if $self->{records} < 0;
281    
282 ulpfr 10 grep $occ{$_}++, &{$self->{func}}(@_);
283 ulpfr 19
284 laperla 30 # Be prepared for "Odd number of elements in hash assignment"
285     local $SIG{__WARN__} = sub {
286     my $warning = shift;
287     chomp $warning;
288     warn "Catching warning[$warning] during delete of key[$key]";
289     };
290 ulpfr 19 for (keys %occ) {# may reorder posting list
291 dpavlin 108 my %post = unpack 'w*', $db->{POSTINGLIST_P . $_};
292 ulpfr 10 delete $post{$key};
293 dpavlin 108 $db->{POSTINGLIST_P . $_} = $self->sort_postings(\%post);
294     _complain('delete of term', $_) if $db->{DOCFREQ_O . $_}-1 != keys %post;
295     $db->{DOCFREQ_O . $_} = scalar keys %post;
296 ulpfr 10 }
297 dpavlin 108 delete $db->{MAXTF_M . $key};
298 ulpfr 10 }
299    
300     sub intervall {
301     my ($self, $first, $last) = @_;
302    
303 dpavlin 108 die "intervall broken in this version of WAIT: need to fix the
304     R_CURSOR and R_NEXT lines";
305 ulpfr 10
306 dpavlin 108 #### my $value = '';
307     #### my $word = '';
308     #### my @result;
309     ####
310     #### return unless exists $self->{'intervall'};
311     ####
312     #### defined $self->{db} or $self->open;
313     #### $self->sync;
314     #### my $dbh = $self->{dbh}; # for convenience
315     ####
316     #### if (ref $self->{'intervall'}) {
317     #### unless (exists $self->{'ifunc'}) {
318     #### $self->{'ifunc'} =
319     #### eval sprintf("sub {grep /./, %s}", _filtergen(@{$self->{intervall}}));
320     #### }
321     #### ($first) = &{$self->{'ifunc'}}($first) if $first;
322     #### ($last) = &{$self->{'ifunc'}}($last) if $last;
323     #### }
324     #### $first = POSTINGLIST_P . ($first||'');
325     #### $last = (defined $last)?POSTINGLIST_P . $last:'q';
326     ####
327     #### # set the cursor to $first
328     #### $dbh->seq($first, $value, R_CURSOR);
329     ####
330     #### # $first would be after the last word
331     #### return () if $first gt $last;
332     ####
333     #### push @result, substr($first,1);
334     #### while (!$dbh->seq($word, $value, R_NEXT)) {
335     #### # We should limit this to a "resonable" number of words
336     #### last if $word gt $last;
337     #### push @result, substr($word,1);
338     #### }
339     #### \@result; # speed
340 ulpfr 10 }
341    
342     sub prefix {
343     my ($self, $prefix) = @_;
344    
345 dpavlin 108 die "prefix not supported in this version of WAIT: need to fix the R_CURSOR";
346 ulpfr 10
347    
348 dpavlin 108 #### my $value = '';
349     #### my $word = '';
350     #### my @result;
351     ####
352     #### return () unless defined $prefix; # Full dictionary requested !!
353     #### return unless exists $self->{'prefix'};
354     #### defined $self->{db} or $self->open;
355     #### $self->sync;
356     #### my $dbh = $self->{dbh};
357     ####
358     #### if (ref $self->{'prefix'}) {
359     #### unless (exists $self->{'pfunc'}) {
360     #### $self->{'pfunc'} =
361     #### eval sprintf("sub {grep /./, %s}", _filtergen(@{$self->{prefix}}));
362     #### }
363     #### ($prefix) = &{$self->{'pfunc'}}($prefix);
364     #### }
365     ####
366     #### if ($dbh->seq($word = POSTINGLIST_P . $prefix, $value, R_CURSOR)) {
367     #### return ();
368     #### }
369     #### return () if $word !~ /^p$prefix/;
370     #### push @result, substr($word,1);
371     ####
372     #### while (!$dbh->seq($word, $value, R_NEXT)) {
373     #### # We should limit this to a "resonable" number of words
374     #### last if $word !~ /^p$prefix/;
375     #### push @result, substr($word,1);
376     #### }
377     #### \@result; # speed
378 ulpfr 10 }
379    
380 ulpfr 19 =head2 search($query)
381    
382     The search method supports a range of search algorithms. It is
383     recommended to tune the index by calling
384     C<$table-E<gt>set(top=E<gt>1)> B<after> bulk inserting the documents
385     into the table. This is a computing intense operation and all inserts
386     and deletes after this optimization are slightly more expensive. Once
387     reorganized, the index is kept sorted automatically until you switch
388     the optimization off by calling C<$table-E<gt>set(top=E<gt>0)>.
389    
390     When searching a tuned index, a query can be processed faster if the
391     caller requests only the topmost documents. This can be done by
392     passing a C<top =E<gt>> I<n> parameter to the search method.
393    
394     For single term queries, the method returns only the I<n> top ranking
395     documents. For multi term queries two optimized algorithms are
396     available. The first algorithm computes the top n documents
397     approximately but very fast, sacrificing a little bit of precision for
398     speed. The second algorithm computes the topmost I<n> documents
399     precisely. This algorithm is slower and should be used only for small
400     values of I<n>. It can be requested by passing the query attribute
401     C<picky =E<gt> 1>. Both algorithms may return more than I<n> hits.
402     While the picky version might not be faster than the brute force
403     version on average for modest size databases it uses less memory and
404     the processing time is almost linear in the number of query terms, not
405     in the size of the lists.
406    
407     =cut
408    
409 dpavlin 108 sub search_ref {
410 ulpfr 10 my $self = shift;
411 ulpfr 19 my $query = shift;
412 ulpfr 10
413 dpavlin 108 my $debugtime = 0;
414     my($time,$entertime);
415     our $STARTTIME;
416     if ($debugtime) {
417     $time = time;
418     $STARTTIME ||= $time;
419     if ($time-$STARTTIME > 5) {
420     $STARTTIME = $time;
421     warn "STARTTIME: $STARTTIME\n";
422     }
423     $entertime = time-$STARTTIME;
424     warn sprintf "ENTER TIME: %.4f\n", $entertime;
425     }
426 ulpfr 10 defined $self->{db} or $self->open;
427     $self->sync;
428 dpavlin 108 my $ref = $self->search_raw_ref($query, &{$self->{func}}(@_)); # No call to parse() there
429     if ($debugtime) {
430     my $leavetime = time-$STARTTIME;
431     warn sprintf "LEAVE TIME: %.4f\n", $leavetime;
432     if ($leavetime-$entertime > .4) {
433     require Data::Dumper;
434     print STDERR "Line " . __LINE__ . ", File: " . __FILE__ . "\n" .
435     Data::Dumper->new([$query,\@_],[qw(query at_)])->Indent(1)->Useqq(1)->Dump; # XXX
436     }
437     }
438     $ref;
439 ulpfr 10 }
440    
441     sub parse {
442     my $self = shift;
443    
444     defined $self->{db} or $self->open;
445     &{$self->{func}}(@_);
446     }
447    
448     sub search_prefix {
449     my $self = shift;
450    
451     # print "search_prefix(@_)\n";
452     defined $self->{db} or $self->open;
453 dpavlin 108 $self->search_raw_ref(map($self->prefix($_), @_));
454 ulpfr 10 }
455    
456 ulpfr 19 sub _complain ($$) {
457     my ($action, $term) = @_;
458    
459     require Carp;
460     Carp::cluck
461     (sprintf("WAIT database inconsistency during $action [%s]: ".
462     "Please rebuild index\n",
463     $term,));
464     }
465    
466 dpavlin 108 sub search_raw_ref {
467 ulpfr 10 my $self = shift;
468 ulpfr 19 my $query = shift;
469 dpavlin 108 # warn "DEBUG WAIT: search_raw_ref args 2..[@_]";
470 ulpfr 10 my %score;
471    
472 dpavlin 108 # Top $top_wanted documents must be correct. Zero means all matching documents.
473     my $top_wanted = $query->{top};
474     my $picky_strict = $query->{picky};
475     # the option is really ignore_excess
476     my $ignore_excess = $query->{ignore_excess};
477 ulpfr 10
478 dpavlin 108 # Return at least $minacc documents. Zero means all matching documents.
479 ulpfr 19
480 dpavlin 108 # my $minacc = $query->{accus} || $top_wanted;
481    
482 ulpfr 19 # Open index and flush cache if necessary
483 ulpfr 10 defined $self->{db} or $self->open;
484     $self->sync;
485 ulpfr 19
486     # We keep duplicates
487     my @terms =
488     # Sort words by decreasing document frequency
489 dpavlin 108 sort { $self->{db}->{DOCFREQ_O . $a} <=> $self->{db}->{DOCFREQ_O . $b} }
490 ulpfr 19 # check which words occur in the index.
491 dpavlin 108 grep { $self->{db}->{DOCFREQ_O . $_} } @_;
492 ulpfr 19
493 dpavlin 108 # warn "DEBUG WAIT: wanted[$top_wanted]terms[@terms]";
494 laperla 40 return unless @terms;
495 ulpfr 19
496     # We special-case one term queries here. If the index was sorted,
497     # choping off the rest of the list will return the same ranking.
498 dpavlin 108 if ($top_wanted and @terms == 1) {
499 ulpfr 19 my $term = shift @terms;
500 dpavlin 108 my $idf = log($self->{records}/$self->{db}->{DOCFREQ_O . $term});
501 ulpfr 19 my @res;
502    
503     if ($self->{reorg}) { # or not $query->{picky}
504 dpavlin 108 @res = unpack "w". int(2*$top_wanted), $self->{db}->{POSTINGLIST_P . $term};
505     # warn sprintf "DEBUG WAIT: scalar(\@res)[%d]", scalar(@res);
506 ulpfr 19 } else {
507 dpavlin 108 @res = unpack 'w*', $self->{db}->{POSTINGLIST_P . $term};
508 ulpfr 19 }
509    
510     for (my $i=1; $i<@res; $i+=2) {
511 dpavlin 108 # $res[$i] /= $self->{db}->{MAXTF_M . $res[$i-1]} / $idf;
512 laperla 30 # above was written badly, allows two DIV_ZERO problems.
513 dpavlin 108 my $maxtf = $self->{db}->{MAXTF_M . $res[$i-1]};
514 laperla 30 unless ($maxtf) {
515     warn "WAIT-Warning: Averting DIVZERO for i[$i] \$res[\$i-1][$res[$i-1]] term[$term]";
516     $maxtf = 1;
517     }
518     $res[$i] = ($res[$i] / $maxtf) * $idf;
519 ulpfr 19 }
520    
521 dpavlin 108 my %res = @res; # bloed: @res waere schon sortiert gewesen
522     return \%res;
523 ulpfr 19 }
524    
525     # We separate exhaustive search here to avoid overhead and make the
526     # code more readable. The block can be removed without changing the
527     # result.
528 dpavlin 108 unless ($top_wanted) {
529 ulpfr 19 for (@terms) {
530 dpavlin 108 my $df = $self->{db}->{DOCFREQ_O . $_};
531 ulpfr 19
532     # The frequency *must* be 1 at least since the posting list is nonempty
533     _complain('search for term', $_) and $df = 1 if $df < 1;
534    
535     # Unpack posting list for current query term $_
536 dpavlin 108 my %post = unpack 'w*', $self->{db}->{POSTINGLIST_P . $_};
537 ulpfr 19
538 dpavlin 108 _complain('search for term', $_) if $self->{db}->{DOCFREQ_O . $_} != keys %post;
539 ulpfr 19 # This is the inverse document frequency. The log of the inverse
540     # fraction of documents the term occurs in.
541     my $idf = log($self->{records}/$df);
542     for my $did (keys %post) {
543 dpavlin 108 if (my $freq = $self->{db}->{MAXTF_M . $did}) {
544 ulpfr 19 $score{$did} += $post{$did} / $freq * $idf;
545     }
546 ulpfr 10 }
547     }
548 ulpfr 19 # warn sprintf "Used %d accumulators\n", scalar keys %score;
549 dpavlin 108 return \%score;
550 ulpfr 10 }
551 ulpfr 19
552     # A sloppy but fast algorithm for multiple term queries.
553 dpavlin 108 unless ($picky_strict) {
554 ulpfr 19 for (@terms) {
555     # Unpack posting list for current query term $_
556 dpavlin 108 my %post;
557     if ($self->{reorg} && $top_wanted && $ignore_excess) {
558     %post = unpack 'w'. int(2*$ignore_excess) , $self->{db}->{POSTINGLIST_P . $_};
559     } else {
560     %post = unpack 'w*', $self->{db}->{POSTINGLIST_P . $_};
561     }
562     # warn sprintf "DEBUG WAIT: term[%s] keys %%post[%s]", $_, scalar keys %post;
563 ulpfr 19
564     # Lookup the number of documents the term occurs in (document frequency)
565 dpavlin 108 my $occ = $self->{db}->{DOCFREQ_O . $_};
566 ulpfr 19
567 dpavlin 108 _complain('search for term', $_) if !$ignore_excess && $occ != keys %post;
568 ulpfr 19 # The frequency *must* be 1 at least since the posting list is nonempty
569     _complain('search for term', $_) and $occ = 1 if $occ < 1;
570    
571 dpavlin 108 # This is the inverse document frequency. The log of the inverse fraction
572     # of documents the term occurs in.
573 ulpfr 19 my $idf = log($self->{records}/$occ);
574    
575     # If we have a reasonable number of accumulators, change the
576     # loop to iterate over the accumulators. This will compromise
577     # quality for better speed. The algorithm still computes the
578     # exact weights, but the result is not guaranteed to contain the
579     # *best* results. The database might contain documents better
580     # than the worst returned document.
581    
582     # We process the lists in order of increasing length. When the
583     # number of accumulators exceeds $wanted, no new documents are
584     # added, only the ranking/weighting of the seen documents is
585     # improved. The resulting ranking list must be pruned, since only
586     # the top most documents end up near their "optimal" rank.
587    
588 dpavlin 108 if (keys %score < $top_wanted) {
589    
590     # Diese folgende Schleife ist (WAR!) der Hammer fuer die Suche "mysql
591     # für dummies bellomo". Sie frisst 3.1+1.7 Sekunden.
592    
593     # Der erste Grund ist, dass 3 Begriffe noch nicht genug gebracht haben,
594     # aber der vierte viel zu viel bringt. Der zweite Grund ist, dass wir
595     # so viele Lookups in $self->{db} machen. Das Rechnen hingegen ist
596     # vermutlich billig.
597    
598 ulpfr 19 for my $did (keys %post) {
599 dpavlin 108 if (my $freq = $self->{db}->{MAXTF_M . $did}) {
600 ulpfr 19 $score{$did} += $post{$did} / $freq * $idf;
601     }
602     }
603     } else {
604     for my $did (keys %score) {
605     next unless exists $post{$did};
606 dpavlin 108 if (my $freq = $self->{db}->{MAXTF_M . $did}) {
607 ulpfr 19 $score{$did} += $post{$did} / $freq * $idf;
608     }
609     }
610     }
611     }
612 dpavlin 108 warn sprintf("DEBUG WAIT: returning from search_raw_ref at [%.3f] after terms[%s] with keys[%d]",
613     time,
614     join(":",@terms),
615     scalar keys %score,
616     );
617     return \%score;
618 ulpfr 19 }
619     my @max; $max[$#terms+1]=0;
620     my @idf;
621    
622     # Preparation loop. This extra loop makes sense only when "reorg"
623     # and "wanted" are true. But at the time beeing, keeping the code
624     # for the different search algorithms in one place seems more
625     # desirable than some minor speedup of the brute force version. We
626     # do cache $idf though.
627    
628     for (my $i = $#terms; $i >=0; $i--) {
629     local $_ = $terms[$i];
630     # Lookup the number of documents the term occurs in (document frequency)
631 dpavlin 108 my $df = $self->{db}->{DOCFREQ_O . $_};
632 ulpfr 19
633     # The frequency *must* be 1 at least since the posting list is nonempty
634     _complain('search for term', $_) and $df = 1 if $df < 1;
635    
636     # This is the inverse document frequency. The log of the inverse
637     # fraction of documents the term occurs in.
638     $idf[$i] = log($self->{records}/$df);
639    
640     my ($did,$occ);
641     if ($self->{reorg}) {
642 dpavlin 108 ($did,$occ) = unpack 'w2', $self->{db}->{POSTINGLIST_P . $_};
643 ulpfr 19 } else { # Maybe this costs more than it helps
644 dpavlin 108 ($did,$occ) = unpack 'w2', $self->sort_postings($self->{db}->{POSTINGLIST_P . $_});
645 ulpfr 19 }
646 dpavlin 108 my $freq = $self->{db}->{MAXTF_M . $did};
647 ulpfr 19 my $max = $occ/$freq*$idf[$i];
648     $max[$i] = $max + $max[$i+1];
649     }
650    
651 dpavlin 108 # Main loop
652 ulpfr 19 for my $i (0 .. $#terms) {
653     my $term = $terms[$i];
654     # Unpack posting list for current query term $term. We loose the
655     # sorting order because the assignment to a hash.
656 dpavlin 108 my %post = unpack 'w*', $self->{db}->{POSTINGLIST_P . $term};
657 ulpfr 19
658     _complain('search for term', $term)
659 dpavlin 108 if $self->{db}->{DOCFREQ_O . $term} != keys %post;
660 ulpfr 19
661     my $idf = $idf[$i];
662     my $full; # Need to process all postings
663     my $chop; # Score necessary to enter the ranking list
664    
665 ulpfr 80 if (# We know that wanted is true since we special cased the
666 ulpfr 19 # exhaustive search.
667    
668 dpavlin 108 $top_wanted and
669 ulpfr 19
670 dpavlin 108 # We did sort here if necessary in the preparation loop:
671 ulpfr 19 # $self->{reorg} and
672    
673 dpavlin 108 scalar keys %score > $top_wanted) {
674     $chop = (sort { $b <=> $a } values %score)[$top_wanted];
675 ulpfr 19 $full = $max[$i] > $chop;
676     } else {
677     $full = 1;
678     }
679    
680     if ($full) {
681 dpavlin 108 # We need to inspect the full list. Either $top_wanted is not given,
682 ulpfr 19 # the index is not sorted, or we don't have enough accumulators
683     # yet.
684     if (defined $chop) {
685     # We might be able to avoid allocating accumulators
686     for my $did (keys %post) {
687 dpavlin 108 if (my $freq = $self->{db}->{MAXTF_M . $did}) {
688 ulpfr 19 my $wgt = $post{$did} / $freq * $idf;
689     # We add an accumulator if $wgt exeeds $chop
690     if (exists $score{$did} or $wgt > $chop) {
691     $score{$did} += $wgt;
692     }
693     }
694     }
695     } else {
696     # Allocate acumulators for each seen document.
697     for my $did (keys %post) {
698 dpavlin 108 if (my $freq = $self->{db}->{MAXTF_M . $did}) {
699 ulpfr 19 $score{$did} += $post{$did} / $freq * $idf;
700     }
701     }
702     }
703     } else {
704     # Update existing accumulators
705     for my $did (keys %score) {
706     next unless exists $post{$did};
707 dpavlin 108 if (my $freq = $self->{db}->{MAXTF_M . $did}) {
708 ulpfr 19 $score{$did} += $post{$did} / $freq * $idf;
709     }
710     }
711     }
712     }
713     #warn sprintf "Used %d accumulators\n", scalar keys %score;
714 dpavlin 108 \%score;
715 ulpfr 10 }
716    
717 ulpfr 19 sub set {
718     my ($self, $attr, $value) = @_;
719    
720 ulpfr 83 die "No such index attribute: '$attr'" unless $attr eq 'top';
721 ulpfr 19
722     return delete $self->{reorg} if $value == 0;
723    
724     return if $self->{reorg}; # we are sorted already
725     return unless $self->{mode} & O_RDWR;
726     defined $self->{db} or $self->open;
727    
728     $self->sync;
729     while (my($key, $value) = each %{$self->{db}}) {
730 dpavlin 108 next if $key !~ /^p/; # some day use PMATCH
731     $self->{db}{$key} = $self->sort_postings($value);
732 ulpfr 19 }
733     $self->{reorg} = 1;
734     }
735    
736 ulpfr 10 sub sync {
737     my $self = shift;
738 dpavlin 108 return unless $self->{mode} & O_RDWR;
739     Carp::carp(sprintf "[%s] Flushing %d postings", scalar(localtime), $self->{cached})
740     if $self->{cached};
741     while (my($key, $value) = each %{$self->{cache}}) {
742     $self->{db}{POSTINGLIST_P . $key} ||= "";
743     if ($self->{reorg}) {
744     $self->{db}->{POSTINGLIST_P . $key} =
745     $self->sort_postings($self->{db}->{POSTINGLIST_P . $key}
746     . $value);
747     } else {
748     $self->{db}->{POSTINGLIST_P . $key} .= $value;
749 ulpfr 10 }
750     }
751 dpavlin 108 while (my($key, $value) = each %{$self->{cdict}}) {
752     $self->{db}->{DOCFREQ_O . $key} = 0 unless $self->{db}->{DOCFREQ_O . $key};
753     $self->{db}->{DOCFREQ_O . $key} += $value;
754     }
755     $self->{cache} = {};
756     $self->{cdict} = {};
757     $self->{cached} = 0;
758 ulpfr 10 }
759    
760     sub close {
761     my $self = shift;
762    
763 dpavlin 108 delete $self->{env};
764 ulpfr 10 if ($self->{dbh}) {
765     $self->sync;
766     delete $self->{dbh};
767     untie %{$self->{db}};
768 dpavlin 108 for my $att (qw(db func cache cached cdict file maindbfile)) {
769     delete $self->{$att};
770     }
771     for my $att (qw(pfunc ifunc xfunc)) {
772     delete $self->{$att} if defined $self->{$att};
773     }
774 ulpfr 10 }
775     }
776    
777 ulpfr 51 sub keys {
778     my $self = shift;
779    
780     defined $self->{db} or $self->open;
781     keys %{$self->{db}};
782     }
783    
784 ulpfr 10 1;
785    

Properties

Name Value
cvs2svn:cvs-rev 1.12

  ViewVC Help
Powered by ViewVC 1.1.26