/[wait]/trunk/lib/WAIT/InvertedIndex.pm
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Contents of /trunk/lib/WAIT/InvertedIndex.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 108 - (show annotations)
Tue Jul 13 17:41:12 2004 UTC (19 years, 9 months ago) by dpavlin
File size: 25179 byte(s)
beginning of version 2.0 using BerkeleyDB (non-functional for now)

1 # -*- Mode: cperl; fill-column: 79 -*-
2 # $Basename: InvertedIndex.pm $
3 # $Revision: 1.30 $
4 # Author : Ulrich Pfeifer
5 # Created On : Thu Aug 8 13:05:10 1996
6 # Last Modified By: Ulrich Pfeifer
7 # Last Modified On: Mon Apr 22 16:52:01 2002
8 # Language : CPerl
9 #
10 # (C) Copyright 1996-2002, Ulrich Pfeifer
11 #
12
13 package WAIT::InvertedIndex;
14 use strict;
15 use BerkeleyDB;
16 use Fcntl;
17 use WAIT::Filter;
18 use Carp;
19 use vars qw(%FUNC $VERSION);
20 use Time::HiRes qw(time);
21
22 $VERSION = "2.000"; # others test if we are loaded by checking $VERSION
23
24 use constant DOCFREQ_O => "o";
25 use constant MAXTF_M => "m";
26 use constant POSTINGLIST_P => "p";
27 use constant PMATCH => qr/^(??{POSTINGLIST_P()})/;
28
29 # The dictionary has three different key types:
30
31 # 'o'.$word
32 #
33 # The document frequency is the number of documents a term occurs
34 # in. The idea is that a term occuring in a significant portion of the
35 # documents is not too significant.
36 #
37 # 'm'.$word
38 #
39 # The maximum term frequency of a document is the frequency of the
40 # most frequent term in the document. It is related to the document
41 # length obviously. A document in which the most frequent term occurs
42 # 100 times is probably much longer than a document whichs most
43 # frequent term occurs five time.
44 #
45 # 'p'.$word
46 #
47 # Under this key we store the actual posting list as pairs of
48 # packed integers.
49
50 sub new {
51 my $type = shift;
52 my %parm = @_;
53 my $self = {};
54
55 for my $x (qw(file attr subname env maindbfile tablename)) {
56 $self->{$x} = $parm{$x} or croak "No $x specified";
57 }
58
59 $self->{filter} = $parm{filter};
60 $self->{'name'} = $parm{'name'};
61 $self->{records} = 0;
62 for (qw(intervall prefix)) {
63 if (exists $parm{$_}) {
64 if (ref $parm{$_}) {
65 $self->{$_} = [@{$parm{$_}}] # clone
66 } else {
67 $self->{$_} = $parm{$_}
68 }
69 }
70 }
71 bless $self, ref($type) || $type;
72 }
73
74 for my $accessor (qw(name maindbfile tablename subname)) {
75 no strict 'refs';
76 *{$accessor} = sub {
77 my($self) = @_;
78 return $self->{$accessor} if $self->{$accessor};
79 require Carp;
80 Carp::confess("accessor $accessor not there");
81 }
82 }
83
84 sub _split_pos {
85 my ($text, $pos) = @{$_[0]};
86 my @result;
87
88 $text =~ s/(^\s+)// and $pos += length($1);
89 while ($text =~ s/(^\S+)//) {
90 my $word = $1;
91 push @result, [$word, $pos];
92 $pos += length($word);
93 $text =~ s/(^\s+)// and $pos += length($1);
94 }
95 @result;
96 }
97
98 sub _xfiltergen {
99 my $filter = pop @_;
100
101 # Oops, we cannot overrule the user's choice. Other filters may kill
102 # stopwords, such as isotr clobbers "isn't" to "isnt".
103
104 # if ($filter eq 'stop') { # avoid the slow stopword elimination
105 # return _xfiltergen(@_); # it's cheaper to look them up afterwards
106 # }
107 if (@_) {
108 if ($filter =~ /^split(\d*)/) {
109 if ($1) {
110 "grep(length(\$_->[0])>=$1, map(&WAIT::Filter::split_pos(\$_), " . _xfiltergen(@_) .'))' ;
111 } else {
112 "map(&WAIT::Filter::split_pos(\$_), " . _xfiltergen(@_) .')' ;
113 }
114 } else {
115 "map ([&WAIT::Filter::$filter(\$_->[0]), \$_->[1]]," ._xfiltergen(@_) .')';
116 }
117 } else {
118 if ($filter =~ /^split(\d*)/) {
119 if ($1) {
120 "grep(length(\$_->[0])>=$1, map(&WAIT::Filter::split_pos(\$_), [\$_[0], 0]))" ;
121 } else {
122 "map(&WAIT::Filter::split_pos(\$_), [\$_[0], 0])" ;
123 }
124 } else {
125 "map ([&WAIT::Filter::$filter(\$_->[0]), \$_->[1]], [\$_[0], 0])";
126 }
127 }
128 }
129
130 sub parse_pos {
131 my $self = shift;
132
133 unless (exists $self->{xfunc}) {
134 $self->{xfunc} =
135 eval sprintf("sub {%s}", _xfiltergen(@{$self->{filter}}));
136 #printf "\nsub{%s}$@\n", _xfiltergen(@{$self->{filter}});
137 }
138 &{$self->{xfunc}}($_[0]);
139 }
140
141 sub _filtergen {
142 my $filter = pop @_;
143
144 if (@_) {
145 "map(&WAIT::Filter::$filter(\$_), " . _filtergen(@_) . ')';
146 } else {
147 "map(&WAIT::Filter::$filter(\$_), \@_)";
148 }
149 }
150
151 sub drop {
152 my $self = shift;
153 if ((caller)[0] eq 'WAIT::Table') { # Table knows about this
154 my $file = $self->{file};
155
156 ! (!-e $file or unlink $file);
157 } else { # notify our database
158 croak ref($self)."::drop called directly";
159 }
160 }
161
162 sub open {
163 my $self = shift;
164 my $file = $self->{file};
165
166 if (defined $self->{dbh}) {
167 $self->{dbh};
168 } else {
169 $self->{func} =
170 eval sprintf("sub {grep /./, %s}", _filtergen(@{$self->{filter}}));
171 my $flags;
172 if ($self->{mode} & O_RDWR) {
173 $flags = DB_CREATE; # | DB_INIT_MPOOL | DB_PRIVATE | DB_INIT_CDB;
174 warn "Flags on inverted $file set to 'writing'";
175 } else {
176 $flags = DB_RDONLY;
177 # warn "Flags on inverted $file set to 'readonly'";
178 }
179 my $filename = $self->maindbfile or die;
180 my $subname = join("/",$self->tablename || die,$self->subname || die);
181 my $env = $self->{env} || "[undef]";
182 $self->{dbh} = tie(%{$self->{db}}, 'BerkeleyDB::Btree',
183 # Filename => $file,
184 Filename => $filename,
185 $self->{env} ? (Env => $self->{env}) : (),
186 Subname => $subname,
187 Mode => 0664,
188 Flags => $flags,
189 $WAIT::Database::Cachesize?(Cachesize => $WAIT::Database::Cachesize):(),
190 $WAIT::Database::Pagesize?(Pagesize => $WAIT::Database::Pagesize):(),
191 ) or die "Couldn't tie: $BerkeleyDB::Error; filename=>'$filename', env=>'$env',subname=>'$subname',flags=>'$flags'";
192 $self->{cache} = {}
193 if $self->{mode} & O_RDWR;
194 $self->{cdict} = {}
195 if $self->{mode} & O_RDWR;
196 $self->{cached} = 0;
197 }
198 }
199
200 sub insert {
201 my $self = shift;
202 my $key = shift;
203 my %occ;
204
205 defined $self->{db} or $self->open;
206 defined $self->{db} or die "open didn't help!!!";
207 grep $occ{$_}++, &{$self->{func}}(@_);
208 my ($word, $noc);
209 $self->{records}++;
210 while (($word, $noc) = each %occ) {
211 if (defined $self->{cache}->{$word}) {
212 $self->{cdict}->{$word}++;
213 $self->{cache}->{$word} .= pack 'w2', $key, $noc;
214 } else {
215 $self->{cdict}->{$word} = 1;
216 $self->{cache}->{$word} = pack 'w2', $key, $noc;
217 }
218 $self->{cached}++;
219 }
220 # This cache limit should be configurable
221 $self->sync if $self->{cached} > 100_000;
222 my $maxtf = 0;
223 for (values %occ) {
224 $maxtf = $_ if $_ > $maxtf;
225 }
226 $self->{db}->{MAXTF_M . $key} = $maxtf;
227 }
228
229 # We sort postings by increasing max term frequency (~ by increasing
230 # document length. This reduces the quality degradation if we process
231 # only the first part of a posting list.
232
233 sub sort_postings {
234 my $self = shift;
235 my $post = shift; # reference to a hash or packed string
236
237 if (ref $post) {
238 # we skip the sort part, if the index is not sorted
239 return pack('w*', %$post) unless $self->{reorg};
240 } else {
241 $post = { unpack 'w*', $post };
242 }
243
244 my $r = '';
245
246 # Sort posting list by increasing ratio of maximum term frequency (~
247 # "document length") and term frequency. This ratio multipied by the
248 # inverse document frequence gives the score for a term. This sort
249 # order can be exploited for tuning of single term queries.
250
251 for my $did (keys %$post) { # sanity check
252 unless ($self->{db}->{MAXTF_M . $did}) {
253 warn "WAIT Warning: DIVZERO threat from did[$did]post[$post]post{did}[$post->{$did}]";
254 $self->{db}->{MAXTF_M . $did} = 1; # fails if we have not opened for writing
255 }
256 }
257 for my $did (sort { $post->{$b} / $self->{db}->{MAXTF_M . $b}
258 <=>
259 $post->{$a} / $self->{db}->{MAXTF_M . $a}
260 } keys %$post) {
261 $r .= pack 'w2', $did, $post->{$did};
262 }
263 #warn sprintf "reorg %d %s\n", scalar keys %$post, join ' ', unpack 'w*', $r;
264 $r;
265 }
266
267 sub delete {
268 my $self = shift;
269 my $key = shift;
270 my %occ;
271
272 my $db;
273 defined $self->{db} or $self->open;
274 $db = $self->{db};
275 $self->sync;
276 $self->{records}--;
277
278 # less than zero documents in database?
279 _complain('delete of document', $key) and $self->{records} = 0
280 if $self->{records} < 0;
281
282 grep $occ{$_}++, &{$self->{func}}(@_);
283
284 # Be prepared for "Odd number of elements in hash assignment"
285 local $SIG{__WARN__} = sub {
286 my $warning = shift;
287 chomp $warning;
288 warn "Catching warning[$warning] during delete of key[$key]";
289 };
290 for (keys %occ) {# may reorder posting list
291 my %post = unpack 'w*', $db->{POSTINGLIST_P . $_};
292 delete $post{$key};
293 $db->{POSTINGLIST_P . $_} = $self->sort_postings(\%post);
294 _complain('delete of term', $_) if $db->{DOCFREQ_O . $_}-1 != keys %post;
295 $db->{DOCFREQ_O . $_} = scalar keys %post;
296 }
297 delete $db->{MAXTF_M . $key};
298 }
299
300 sub intervall {
301 my ($self, $first, $last) = @_;
302
303 die "intervall broken in this version of WAIT: need to fix the
304 R_CURSOR and R_NEXT lines";
305
306 #### my $value = '';
307 #### my $word = '';
308 #### my @result;
309 ####
310 #### return unless exists $self->{'intervall'};
311 ####
312 #### defined $self->{db} or $self->open;
313 #### $self->sync;
314 #### my $dbh = $self->{dbh}; # for convenience
315 ####
316 #### if (ref $self->{'intervall'}) {
317 #### unless (exists $self->{'ifunc'}) {
318 #### $self->{'ifunc'} =
319 #### eval sprintf("sub {grep /./, %s}", _filtergen(@{$self->{intervall}}));
320 #### }
321 #### ($first) = &{$self->{'ifunc'}}($first) if $first;
322 #### ($last) = &{$self->{'ifunc'}}($last) if $last;
323 #### }
324 #### $first = POSTINGLIST_P . ($first||'');
325 #### $last = (defined $last)?POSTINGLIST_P . $last:'q';
326 ####
327 #### # set the cursor to $first
328 #### $dbh->seq($first, $value, R_CURSOR);
329 ####
330 #### # $first would be after the last word
331 #### return () if $first gt $last;
332 ####
333 #### push @result, substr($first,1);
334 #### while (!$dbh->seq($word, $value, R_NEXT)) {
335 #### # We should limit this to a "resonable" number of words
336 #### last if $word gt $last;
337 #### push @result, substr($word,1);
338 #### }
339 #### \@result; # speed
340 }
341
342 sub prefix {
343 my ($self, $prefix) = @_;
344
345 die "prefix not supported in this version of WAIT: need to fix the R_CURSOR";
346
347
348 #### my $value = '';
349 #### my $word = '';
350 #### my @result;
351 ####
352 #### return () unless defined $prefix; # Full dictionary requested !!
353 #### return unless exists $self->{'prefix'};
354 #### defined $self->{db} or $self->open;
355 #### $self->sync;
356 #### my $dbh = $self->{dbh};
357 ####
358 #### if (ref $self->{'prefix'}) {
359 #### unless (exists $self->{'pfunc'}) {
360 #### $self->{'pfunc'} =
361 #### eval sprintf("sub {grep /./, %s}", _filtergen(@{$self->{prefix}}));
362 #### }
363 #### ($prefix) = &{$self->{'pfunc'}}($prefix);
364 #### }
365 ####
366 #### if ($dbh->seq($word = POSTINGLIST_P . $prefix, $value, R_CURSOR)) {
367 #### return ();
368 #### }
369 #### return () if $word !~ /^p$prefix/;
370 #### push @result, substr($word,1);
371 ####
372 #### while (!$dbh->seq($word, $value, R_NEXT)) {
373 #### # We should limit this to a "resonable" number of words
374 #### last if $word !~ /^p$prefix/;
375 #### push @result, substr($word,1);
376 #### }
377 #### \@result; # speed
378 }
379
380 =head2 search($query)
381
382 The search method supports a range of search algorithms. It is
383 recommended to tune the index by calling
384 C<$table-E<gt>set(top=E<gt>1)> B<after> bulk inserting the documents
385 into the table. This is a computing intense operation and all inserts
386 and deletes after this optimization are slightly more expensive. Once
387 reorganized, the index is kept sorted automatically until you switch
388 the optimization off by calling C<$table-E<gt>set(top=E<gt>0)>.
389
390 When searching a tuned index, a query can be processed faster if the
391 caller requests only the topmost documents. This can be done by
392 passing a C<top =E<gt>> I<n> parameter to the search method.
393
394 For single term queries, the method returns only the I<n> top ranking
395 documents. For multi term queries two optimized algorithms are
396 available. The first algorithm computes the top n documents
397 approximately but very fast, sacrificing a little bit of precision for
398 speed. The second algorithm computes the topmost I<n> documents
399 precisely. This algorithm is slower and should be used only for small
400 values of I<n>. It can be requested by passing the query attribute
401 C<picky =E<gt> 1>. Both algorithms may return more than I<n> hits.
402 While the picky version might not be faster than the brute force
403 version on average for modest size databases it uses less memory and
404 the processing time is almost linear in the number of query terms, not
405 in the size of the lists.
406
407 =cut
408
409 sub search_ref {
410 my $self = shift;
411 my $query = shift;
412
413 my $debugtime = 0;
414 my($time,$entertime);
415 our $STARTTIME;
416 if ($debugtime) {
417 $time = time;
418 $STARTTIME ||= $time;
419 if ($time-$STARTTIME > 5) {
420 $STARTTIME = $time;
421 warn "STARTTIME: $STARTTIME\n";
422 }
423 $entertime = time-$STARTTIME;
424 warn sprintf "ENTER TIME: %.4f\n", $entertime;
425 }
426 defined $self->{db} or $self->open;
427 $self->sync;
428 my $ref = $self->search_raw_ref($query, &{$self->{func}}(@_)); # No call to parse() there
429 if ($debugtime) {
430 my $leavetime = time-$STARTTIME;
431 warn sprintf "LEAVE TIME: %.4f\n", $leavetime;
432 if ($leavetime-$entertime > .4) {
433 require Data::Dumper;
434 print STDERR "Line " . __LINE__ . ", File: " . __FILE__ . "\n" .
435 Data::Dumper->new([$query,\@_],[qw(query at_)])->Indent(1)->Useqq(1)->Dump; # XXX
436 }
437 }
438 $ref;
439 }
440
441 sub parse {
442 my $self = shift;
443
444 defined $self->{db} or $self->open;
445 &{$self->{func}}(@_);
446 }
447
448 sub search_prefix {
449 my $self = shift;
450
451 # print "search_prefix(@_)\n";
452 defined $self->{db} or $self->open;
453 $self->search_raw_ref(map($self->prefix($_), @_));
454 }
455
456 sub _complain ($$) {
457 my ($action, $term) = @_;
458
459 require Carp;
460 Carp::cluck
461 (sprintf("WAIT database inconsistency during $action [%s]: ".
462 "Please rebuild index\n",
463 $term,));
464 }
465
466 sub search_raw_ref {
467 my $self = shift;
468 my $query = shift;
469 # warn "DEBUG WAIT: search_raw_ref args 2..[@_]";
470 my %score;
471
472 # Top $top_wanted documents must be correct. Zero means all matching documents.
473 my $top_wanted = $query->{top};
474 my $picky_strict = $query->{picky};
475 # the option is really ignore_excess
476 my $ignore_excess = $query->{ignore_excess};
477
478 # Return at least $minacc documents. Zero means all matching documents.
479
480 # my $minacc = $query->{accus} || $top_wanted;
481
482 # Open index and flush cache if necessary
483 defined $self->{db} or $self->open;
484 $self->sync;
485
486 # We keep duplicates
487 my @terms =
488 # Sort words by decreasing document frequency
489 sort { $self->{db}->{DOCFREQ_O . $a} <=> $self->{db}->{DOCFREQ_O . $b} }
490 # check which words occur in the index.
491 grep { $self->{db}->{DOCFREQ_O . $_} } @_;
492
493 # warn "DEBUG WAIT: wanted[$top_wanted]terms[@terms]";
494 return unless @terms;
495
496 # We special-case one term queries here. If the index was sorted,
497 # choping off the rest of the list will return the same ranking.
498 if ($top_wanted and @terms == 1) {
499 my $term = shift @terms;
500 my $idf = log($self->{records}/$self->{db}->{DOCFREQ_O . $term});
501 my @res;
502
503 if ($self->{reorg}) { # or not $query->{picky}
504 @res = unpack "w". int(2*$top_wanted), $self->{db}->{POSTINGLIST_P . $term};
505 # warn sprintf "DEBUG WAIT: scalar(\@res)[%d]", scalar(@res);
506 } else {
507 @res = unpack 'w*', $self->{db}->{POSTINGLIST_P . $term};
508 }
509
510 for (my $i=1; $i<@res; $i+=2) {
511 # $res[$i] /= $self->{db}->{MAXTF_M . $res[$i-1]} / $idf;
512 # above was written badly, allows two DIV_ZERO problems.
513 my $maxtf = $self->{db}->{MAXTF_M . $res[$i-1]};
514 unless ($maxtf) {
515 warn "WAIT-Warning: Averting DIVZERO for i[$i] \$res[\$i-1][$res[$i-1]] term[$term]";
516 $maxtf = 1;
517 }
518 $res[$i] = ($res[$i] / $maxtf) * $idf;
519 }
520
521 my %res = @res; # bloed: @res waere schon sortiert gewesen
522 return \%res;
523 }
524
525 # We separate exhaustive search here to avoid overhead and make the
526 # code more readable. The block can be removed without changing the
527 # result.
528 unless ($top_wanted) {
529 for (@terms) {
530 my $df = $self->{db}->{DOCFREQ_O . $_};
531
532 # The frequency *must* be 1 at least since the posting list is nonempty
533 _complain('search for term', $_) and $df = 1 if $df < 1;
534
535 # Unpack posting list for current query term $_
536 my %post = unpack 'w*', $self->{db}->{POSTINGLIST_P . $_};
537
538 _complain('search for term', $_) if $self->{db}->{DOCFREQ_O . $_} != keys %post;
539 # This is the inverse document frequency. The log of the inverse
540 # fraction of documents the term occurs in.
541 my $idf = log($self->{records}/$df);
542 for my $did (keys %post) {
543 if (my $freq = $self->{db}->{MAXTF_M . $did}) {
544 $score{$did} += $post{$did} / $freq * $idf;
545 }
546 }
547 }
548 # warn sprintf "Used %d accumulators\n", scalar keys %score;
549 return \%score;
550 }
551
552 # A sloppy but fast algorithm for multiple term queries.
553 unless ($picky_strict) {
554 for (@terms) {
555 # Unpack posting list for current query term $_
556 my %post;
557 if ($self->{reorg} && $top_wanted && $ignore_excess) {
558 %post = unpack 'w'. int(2*$ignore_excess) , $self->{db}->{POSTINGLIST_P . $_};
559 } else {
560 %post = unpack 'w*', $self->{db}->{POSTINGLIST_P . $_};
561 }
562 # warn sprintf "DEBUG WAIT: term[%s] keys %%post[%s]", $_, scalar keys %post;
563
564 # Lookup the number of documents the term occurs in (document frequency)
565 my $occ = $self->{db}->{DOCFREQ_O . $_};
566
567 _complain('search for term', $_) if !$ignore_excess && $occ != keys %post;
568 # The frequency *must* be 1 at least since the posting list is nonempty
569 _complain('search for term', $_) and $occ = 1 if $occ < 1;
570
571 # This is the inverse document frequency. The log of the inverse fraction
572 # of documents the term occurs in.
573 my $idf = log($self->{records}/$occ);
574
575 # If we have a reasonable number of accumulators, change the
576 # loop to iterate over the accumulators. This will compromise
577 # quality for better speed. The algorithm still computes the
578 # exact weights, but the result is not guaranteed to contain the
579 # *best* results. The database might contain documents better
580 # than the worst returned document.
581
582 # We process the lists in order of increasing length. When the
583 # number of accumulators exceeds $wanted, no new documents are
584 # added, only the ranking/weighting of the seen documents is
585 # improved. The resulting ranking list must be pruned, since only
586 # the top most documents end up near their "optimal" rank.
587
588 if (keys %score < $top_wanted) {
589
590 # Diese folgende Schleife ist (WAR!) der Hammer fuer die Suche "mysql
591 # für dummies bellomo". Sie frisst 3.1+1.7 Sekunden.
592
593 # Der erste Grund ist, dass 3 Begriffe noch nicht genug gebracht haben,
594 # aber der vierte viel zu viel bringt. Der zweite Grund ist, dass wir
595 # so viele Lookups in $self->{db} machen. Das Rechnen hingegen ist
596 # vermutlich billig.
597
598 for my $did (keys %post) {
599 if (my $freq = $self->{db}->{MAXTF_M . $did}) {
600 $score{$did} += $post{$did} / $freq * $idf;
601 }
602 }
603 } else {
604 for my $did (keys %score) {
605 next unless exists $post{$did};
606 if (my $freq = $self->{db}->{MAXTF_M . $did}) {
607 $score{$did} += $post{$did} / $freq * $idf;
608 }
609 }
610 }
611 }
612 warn sprintf("DEBUG WAIT: returning from search_raw_ref at [%.3f] after terms[%s] with keys[%d]",
613 time,
614 join(":",@terms),
615 scalar keys %score,
616 );
617 return \%score;
618 }
619 my @max; $max[$#terms+1]=0;
620 my @idf;
621
622 # Preparation loop. This extra loop makes sense only when "reorg"
623 # and "wanted" are true. But at the time beeing, keeping the code
624 # for the different search algorithms in one place seems more
625 # desirable than some minor speedup of the brute force version. We
626 # do cache $idf though.
627
628 for (my $i = $#terms; $i >=0; $i--) {
629 local $_ = $terms[$i];
630 # Lookup the number of documents the term occurs in (document frequency)
631 my $df = $self->{db}->{DOCFREQ_O . $_};
632
633 # The frequency *must* be 1 at least since the posting list is nonempty
634 _complain('search for term', $_) and $df = 1 if $df < 1;
635
636 # This is the inverse document frequency. The log of the inverse
637 # fraction of documents the term occurs in.
638 $idf[$i] = log($self->{records}/$df);
639
640 my ($did,$occ);
641 if ($self->{reorg}) {
642 ($did,$occ) = unpack 'w2', $self->{db}->{POSTINGLIST_P . $_};
643 } else { # Maybe this costs more than it helps
644 ($did,$occ) = unpack 'w2', $self->sort_postings($self->{db}->{POSTINGLIST_P . $_});
645 }
646 my $freq = $self->{db}->{MAXTF_M . $did};
647 my $max = $occ/$freq*$idf[$i];
648 $max[$i] = $max + $max[$i+1];
649 }
650
651 # Main loop
652 for my $i (0 .. $#terms) {
653 my $term = $terms[$i];
654 # Unpack posting list for current query term $term. We loose the
655 # sorting order because the assignment to a hash.
656 my %post = unpack 'w*', $self->{db}->{POSTINGLIST_P . $term};
657
658 _complain('search for term', $term)
659 if $self->{db}->{DOCFREQ_O . $term} != keys %post;
660
661 my $idf = $idf[$i];
662 my $full; # Need to process all postings
663 my $chop; # Score necessary to enter the ranking list
664
665 if (# We know that wanted is true since we special cased the
666 # exhaustive search.
667
668 $top_wanted and
669
670 # We did sort here if necessary in the preparation loop:
671 # $self->{reorg} and
672
673 scalar keys %score > $top_wanted) {
674 $chop = (sort { $b <=> $a } values %score)[$top_wanted];
675 $full = $max[$i] > $chop;
676 } else {
677 $full = 1;
678 }
679
680 if ($full) {
681 # We need to inspect the full list. Either $top_wanted is not given,
682 # the index is not sorted, or we don't have enough accumulators
683 # yet.
684 if (defined $chop) {
685 # We might be able to avoid allocating accumulators
686 for my $did (keys %post) {
687 if (my $freq = $self->{db}->{MAXTF_M . $did}) {
688 my $wgt = $post{$did} / $freq * $idf;
689 # We add an accumulator if $wgt exeeds $chop
690 if (exists $score{$did} or $wgt > $chop) {
691 $score{$did} += $wgt;
692 }
693 }
694 }
695 } else {
696 # Allocate acumulators for each seen document.
697 for my $did (keys %post) {
698 if (my $freq = $self->{db}->{MAXTF_M . $did}) {
699 $score{$did} += $post{$did} / $freq * $idf;
700 }
701 }
702 }
703 } else {
704 # Update existing accumulators
705 for my $did (keys %score) {
706 next unless exists $post{$did};
707 if (my $freq = $self->{db}->{MAXTF_M . $did}) {
708 $score{$did} += $post{$did} / $freq * $idf;
709 }
710 }
711 }
712 }
713 #warn sprintf "Used %d accumulators\n", scalar keys %score;
714 \%score;
715 }
716
717 sub set {
718 my ($self, $attr, $value) = @_;
719
720 die "No such index attribute: '$attr'" unless $attr eq 'top';
721
722 return delete $self->{reorg} if $value == 0;
723
724 return if $self->{reorg}; # we are sorted already
725 return unless $self->{mode} & O_RDWR;
726 defined $self->{db} or $self->open;
727
728 $self->sync;
729 while (my($key, $value) = each %{$self->{db}}) {
730 next if $key !~ /^p/; # some day use PMATCH
731 $self->{db}{$key} = $self->sort_postings($value);
732 }
733 $self->{reorg} = 1;
734 }
735
736 sub sync {
737 my $self = shift;
738 return unless $self->{mode} & O_RDWR;
739 Carp::carp(sprintf "[%s] Flushing %d postings", scalar(localtime), $self->{cached})
740 if $self->{cached};
741 while (my($key, $value) = each %{$self->{cache}}) {
742 $self->{db}{POSTINGLIST_P . $key} ||= "";
743 if ($self->{reorg}) {
744 $self->{db}->{POSTINGLIST_P . $key} =
745 $self->sort_postings($self->{db}->{POSTINGLIST_P . $key}
746 . $value);
747 } else {
748 $self->{db}->{POSTINGLIST_P . $key} .= $value;
749 }
750 }
751 while (my($key, $value) = each %{$self->{cdict}}) {
752 $self->{db}->{DOCFREQ_O . $key} = 0 unless $self->{db}->{DOCFREQ_O . $key};
753 $self->{db}->{DOCFREQ_O . $key} += $value;
754 }
755 $self->{cache} = {};
756 $self->{cdict} = {};
757 $self->{cached} = 0;
758 }
759
760 sub close {
761 my $self = shift;
762
763 delete $self->{env};
764 if ($self->{dbh}) {
765 $self->sync;
766 delete $self->{dbh};
767 untie %{$self->{db}};
768 for my $att (qw(db func cache cached cdict file maindbfile)) {
769 delete $self->{$att};
770 }
771 for my $att (qw(pfunc ifunc xfunc)) {
772 delete $self->{$att} if defined $self->{$att};
773 }
774 }
775 }
776
777 sub keys {
778 my $self = shift;
779
780 defined $self->{db} or $self->open;
781 keys %{$self->{db}};
782 }
783
784 1;
785

Properties

Name Value
cvs2svn:cvs-rev 1.12

  ViewVC Help
Powered by ViewVC 1.1.26