/[wait]/cvs-head/lib/WAIT/InvertedIndex.pm
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Contents of /cvs-head/lib/WAIT/InvertedIndex.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 40 - (show annotations)
Mon Nov 13 10:44:03 2000 UTC (23 years, 6 months ago) by laperla
File size: 22237 byte(s)
seq returns 0 on success. So if there is an error on positioning the
cursor on $O$;, then we can say it is not an old index. The condition
is
    if $dbh->seq, then return
but it was
    $dbh->seq or return.

Besides there are now some outcommented debugging statements that I
needed from time to time. They should be removed some day later but
left for a while to ease the debugging process.

1 # -*- Mode: Perl -*-
2 # $Basename: InvertedIndex.pm $
3 # $Revision: 1.30 $
4 # Author : Ulrich Pfeifer
5 # Created On : Thu Aug 8 13:05:10 1996
6 # Last Modified By: Ulrich Pfeifer
7 # Last Modified On: Sun Nov 12 14:40:21 2000
8 # Language : CPerl
9 #
10 # (C) Copyright 1996-2000, Ulrich Pfeifer
11 #
12
13 package WAIT::InvertedIndex;
14 use strict;
15 use DB_File;
16 use Fcntl;
17 use WAIT::Filter;
18 use Carp;
19 use vars qw(%FUNC $VERSION);
20
21 $VERSION = "1.801"; # others test if we are loaded by checking $VERSION
22
23 # The dictionary has three different key types:
24 # 'o'.$word
25 #
26 # The document frequency is the number of documents a term occurs
27 # in. The idea is that a term occuring in a significant part of the
28 # documents is not too significant.
29 #
30 # 'm'.$word
31 #
32 # The maximum term frequency of a document is the frequency of the
33 # most frequent term in the document. It is related to the document
34 # length obviously. A document in which the most frequnet term occurs
35 # 100 times is probably much longer than a document whichs most
36 # frequent term occurs five time.
37 #
38 # 'p'.$word
39 #
40 # Under this key we store the actual posting list as pairs of
41 # packed integers.
42
43 my $no_old_index_support = 0; # do not check for old indices if set
44
45 sub new {
46 my $type = shift;
47 my %parm = @_;
48 my $self = {};
49
50 $self->{file} = $parm{file} or croak "No file specified";
51 $self->{attr} = $parm{attr} or croak "No attributes specified";
52 $self->{filter} = $parm{filter};
53 $self->{'name'} = $parm{'name'};
54 $self->{records} = 0;
55 for (qw(intervall prefix)) {
56 if (exists $parm{$_}) {
57 if (ref $parm{$_}) {
58 $self->{$_} = [@{$parm{$_}}] # clone
59 } else {
60 $self->{$_} = $parm{$_}
61 }
62 }
63 }
64 bless $self, ref($type) || $type;
65 }
66
67 sub name {$_[0]->{'name'}}
68
69 sub _split_pos {
70 my ($text, $pos) = @{$_[0]};
71 my @result;
72
73 $text =~ s/(^\s+)// and $pos += length($1);
74 while ($text =~ s/(^\S+)//) {
75 my $word = $1;
76 push @result, [$word, $pos];
77 $pos += length($word);
78 $text =~ s/(^\s+)// and $pos += length($1);
79 }
80 @result;
81 }
82
83 sub _xfiltergen {
84 my $filter = pop @_;
85
86 # Oops, we cannot overrule the user's choice. Other filters may kill
87 # stopwords, such as isotr clobbers "isn't" to "isnt".
88
89 # if ($filter eq 'stop') { # avoid the slow stopword elimination
90 # return _xfiltergen(@_); # it's cheaper to look them up afterwards
91 # }
92 if (@_) {
93 if ($filter =~ /^split(\d*)/) {
94 if ($1) {
95 "grep(length(\$_->[0])>=$1, map(&WAIT::Filter::split_pos(\$_), " . _xfiltergen(@_) .'))' ;
96 } else {
97 "map(&WAIT::Filter::split_pos(\$_), " . _xfiltergen(@_) .')' ;
98 }
99 } else {
100 "map ([&WAIT::Filter::$filter(\$_->[0]), \$_->[1]]," ._xfiltergen(@_) .')';
101 }
102 } else {
103 if ($filter =~ /^split(\d*)/) {
104 if ($1) {
105 "grep(length(\$_->[0])>=$1, map(&WAIT::Filter::split_pos(\$_), [\$_[0], 0]))" ;
106 } else {
107 "map(&WAIT::Filter::split_pos(\$_), [\$_[0], 0])" ;
108 }
109 } else {
110 "map ([&WAIT::Filter::$filter(\$_->[0]), \$_->[1]], [\$_[0], 0])";
111 }
112 }
113 }
114
115 sub parse_pos {
116 my $self = shift;
117
118 unless (exists $self->{xfunc}) {
119 $self->{xfunc} =
120 eval sprintf("sub {%s}", _xfiltergen(@{$self->{filter}}));
121 #printf "\nsub{%s}$@\n", _xfiltergen(@{$self->{filter}});
122 }
123 &{$self->{xfunc}}($_[0]);
124 }
125
126 sub _filtergen {
127 my $filter = pop @_;
128
129 if (@_) {
130 "map(&WAIT::Filter::$filter(\$_), " . _filtergen(@_) . ')';
131 } else {
132 "map(&WAIT::Filter::$filter(\$_), \@_)";
133 }
134 }
135
136 sub drop {
137 my $self = shift;
138 if ((caller)[0] eq 'WAIT::Table') { # Table knows about this
139 my $file = $self->{file};
140
141 ! (!-e $file or unlink $file);
142 } else { # notify our database
143 croak ref($self)."::drop called directly";
144 }
145 }
146
147 sub is_an_old_index {
148 my $self = shift;
149
150 return 0 if $no_old_index_support;
151 return $self->{old_index} if exists $self->{old_index};
152
153 # We can only guess if this is an old index. We lookup the first 10
154 # $O entries. If all values are integers, we assume that the index
155 # is an old one.
156
157 defined $self->{db} or $self->open;
158 $self->sync;
159 my $dbh = $self->{dbh}; # for convenience
160
161 my $O = pack('C', 0xff)."o";
162 my ($word, $value) = ($O.$;); # $word and $value are modified by seq!
163 if ( my $ret = $dbh->seq($word, $value, R_CURSOR) ) {
164 # warn "DEBUG: ret[$ret], not an old index, either empty or no \$^O";
165 return $self->{old_index} = 0;
166 }
167 for (my $i=0; $i<10;$i++) {
168 if ($value !~ /^\d+$/) {
169 # warn "DEBUG: word[$word]value[$value], not an old index";
170 return $self->{old_index} = 0;
171 }
172 if (my $ret = $dbh->seq($word, $value, R_NEXT) or # no values left
173 $word !~ /^$O$;/o # no $O values left
174 ) {
175 # we are not sure enough that this is an old index
176 # warn "DEBUG: ret[$ret]word[$word]value[$value], not an old index";
177 return $self->{old_index} = 0;
178 }
179 }
180 # warn "DEBUG: old index";
181 return $self->{old_index} = 1;
182 }
183
184 sub open {
185 my $self = shift;
186 my $file = $self->{file};
187
188 if (defined $self->{dbh}) {
189 $self->{dbh};
190 } else {
191 $self->{func} =
192 eval sprintf("sub {grep /./, %s}", _filtergen(@{$self->{filter}}));
193 $self->{dbh} = tie(%{$self->{db}}, 'DB_File', $file,
194 $self->{mode}, 0664, $DB_BTREE);
195 $self->{cache} = {}
196 if $self->{mode} & O_RDWR;
197 $self->{cdict} = {}
198 if $self->{mode} & O_RDWR;
199 $self->{cached} = 0;
200 if (!$no_old_index_support and $self->is_an_old_index()) {
201 warn "This is an old index, upgrade you database";
202 require WAIT::InvertedIndexOld;
203 bless $self, 'WAIT::InvertedIndexOld';
204 }
205 }
206 }
207
208 sub insert {
209 my $self = shift;
210 my $key = shift;
211 my %occ;
212
213 defined $self->{db} or $self->open;
214 grep $occ{$_}++, &{$self->{func}}(@_);
215 my ($word, $noc);
216 $self->{records}++;
217 while (($word, $noc) = each %occ) {
218 if (defined $self->{cache}->{$word}) {
219 $self->{cdict}->{$word}++;
220 $self->{cache}->{$word} .= pack 'w2', $key, $noc;
221 } else {
222 $self->{cdict}->{$word} = 1;
223 $self->{cache}->{$word} = pack 'w2', $key, $noc;
224 }
225 $self->{cached}++;
226 }
227 # This cache limit should be configurable
228 $self->sync if $self->{cached} > 100_000;
229 my $maxtf = 0;
230 for (values %occ) {
231 $maxtf = $_ if $_ > $maxtf;
232 }
233 $self->{db}->{'m'. $key} = $maxtf;
234 }
235
236 # We sort postings by increasing max term frequency (~ by increasing
237 # document length. This reduces the quality degradation if we process
238 # only the first part of a posting list.
239
240 sub sort_postings {
241 my $self = shift;
242 my $post = shift; # reference to a hash or packed string
243
244 if (ref $post) {
245 # we skip the sort part, if the index is not sorted
246 return pack('w*', %$post) unless $self->{reorg};
247 } else {
248 $post = { unpack 'w*', $post };
249 }
250
251 my $r = '';
252
253 # Sort posting list by increasing ratio of maximum term frequency (~
254 # "document length") and term frequency. This rati multipied by the
255 # inverse document frequence gives the score for a term. This sort
256 # order can be exploited for tuning of single term queries.
257
258 for my $did (keys %$post) { # sanity check
259 unless ($self->{db}->{"m". $did}) {
260 warn "Warning from WAIT: DIVZERO threat from did[$did] post[$post->{$did}]";
261 $self->{db}->{"m". $did} = 1; # fails if we have not opened for writing
262 }
263 }
264 for my $did (sort { $post->{$b} / $self->{db}->{'m'. $b}
265 <=>
266 $post->{$a} / $self->{db}->{'m'. $a}
267 } keys %$post) {
268 $r .= pack 'w2', $did, $post->{$did};
269 }
270 #warn sprintf "reorg %d %s\n", scalar keys %$post, join ' ', unpack 'w*', $r;
271 $r;
272 }
273
274 sub delete {
275 my $self = shift;
276 my $key = shift;
277 my %occ;
278
279 my $db;
280 defined $self->{db} or $self->open;
281 $db = $self->{db};
282 $self->sync;
283 $self->{records}--;
284
285 # less than zero documents in database?
286 _complain('delete of document', $key) and $self->{records} = 0
287 if $self->{records} < 0;
288
289 grep $occ{$_}++, &{$self->{func}}(@_);
290
291 # Be prepared for "Odd number of elements in hash assignment"
292 local $SIG{__WARN__} = sub {
293 my $warning = shift;
294 chomp $warning;
295 warn "Catching warning[$warning] during delete of key[$key]";
296 };
297 for (keys %occ) {# may reorder posting list
298 my %post = unpack 'w*', $db->{'p'.$_};
299 delete $post{$key};
300 $db->{'p'.$_} = $self->sort_postings(\%post);
301 _complain('delete of term', $_) if $db->{'o'.$_}-1 != keys %post;
302 $db->{'o'.$_} = scalar keys %post;
303 }
304 delete $db->{'m'. $key};
305 }
306
307 sub intervall {
308 my ($self, $first, $last) = @_;
309 my $value = '';
310 my $word = '';
311 my @result;
312
313 return unless exists $self->{'intervall'};
314
315 defined $self->{db} or $self->open;
316 $self->sync;
317 my $dbh = $self->{dbh}; # for convenience
318
319 if (ref $self->{'intervall'}) {
320 unless (exists $self->{'ifunc'}) {
321 $self->{'ifunc'} =
322 eval sprintf("sub {grep /./, %s}", _filtergen(@{$self->{intervall}}));
323 }
324 ($first) = &{$self->{'ifunc'}}($first) if $first;
325 ($last) = &{$self->{'ifunc'}}($last) if $last;
326 }
327 $first = 'p'.($first||'');
328 $last = (defined $last)?'p'.$last:'q';
329
330 # set the cursor to $first
331 $dbh->seq($first, $value, R_CURSOR);
332
333 # $first would be after the last word
334 return () if $first gt $last;
335
336 push @result, substr($first,1);
337 while (!$dbh->seq($word, $value, R_NEXT)) {
338 # We should limit this to a "resonable" number of words
339 last if $word gt $last;
340 push @result, substr($word,1);
341 }
342 \@result; # speed
343 }
344
345 sub prefix {
346 my ($self, $prefix) = @_;
347 my $value = '';
348 my $word = '';
349 my @result;
350
351 return () unless defined $prefix; # Full dictionary requested !!
352 return unless exists $self->{'prefix'};
353 defined $self->{db} or $self->open;
354 $self->sync;
355 my $dbh = $self->{dbh};
356
357 if (ref $self->{'prefix'}) {
358 unless (exists $self->{'pfunc'}) {
359 $self->{'pfunc'} =
360 eval sprintf("sub {grep /./, %s}", _filtergen(@{$self->{prefix}}));
361 }
362 ($prefix) = &{$self->{'pfunc'}}($prefix);
363 }
364
365 if ($dbh->seq($word = 'p'.$prefix, $value, R_CURSOR)) {
366 return ();
367 }
368 return () if $word !~ /^p$prefix/;
369 push @result, substr($word,1);
370
371 while (!$dbh->seq($word, $value, R_NEXT)) {
372 # We should limit this to a "resonable" number of words
373 last if $word !~ /^p$prefix/;
374 push @result, substr($word,1);
375 }
376 \@result; # speed
377 }
378
379 =head2 search($query)
380
381 The search method supports a range of search algorithms. It is
382 recommended to tune the index by calling
383 C<$table-E<gt>set(top=E<gt>1)> B<after> bulk inserting the documents
384 into the table. This is a computing intense operation and all inserts
385 and deletes after this optimization are slightly more expensive. Once
386 reorganized, the index is kept sorted automatically until you switch
387 the optimization off by calling C<$table-E<gt>set(top=E<gt>0)>.
388
389 When searching a tuned index, a query can be processed faster if the
390 caller requests only the topmost documents. This can be done by
391 passing a C<top =E<gt>> I<n> parameter to the search method.
392
393 For single term queries, the method returns only the I<n> top ranking
394 documents. For multi term queries two optimized algorithms are
395 available. The first algorithm computes the top n documents
396 approximately but very fast, sacrificing a little bit of precision for
397 speed. The second algorithm computes the topmost I<n> documents
398 precisely. This algorithm is slower and should be used only for small
399 values of I<n>. It can be requested by passing the query attribute
400 C<picky =E<gt> 1>. Both algorithms may return more than I<n> hits.
401 While the picky version might not be faster than the brute force
402 version on average for modest size databases it uses less memory and
403 the processing time is almost linear in the number of query terms, not
404 in the size of the lists.
405
406 =cut
407
408 sub search {
409 my $self = shift;
410 my $query = shift;
411
412 defined $self->{db} or $self->open;
413 $self->sync;
414 $self->search_raw($query, &{$self->{func}}(@_)); # No call to parse() here
415 }
416
417 sub parse {
418 my $self = shift;
419
420 defined $self->{db} or $self->open;
421 &{$self->{func}}(@_);
422 }
423
424 sub keys {
425 my $self = shift;
426
427 defined $self->{db} or $self->open;
428 keys %{$self->{db}};
429 }
430
431 sub search_prefix {
432 my $self = shift;
433
434 # print "search_prefix(@_)\n";
435 defined $self->{db} or $self->open;
436 $self->search_raw(map($self->prefix($_), @_));
437 }
438
439 sub _complain ($$) {
440 my ($action, $term) = @_;
441
442 require Carp;
443 Carp::cluck
444 (sprintf("WAIT database inconsistency during $action [%s]: ".
445 "Please rebuild index\n",
446 $term,));
447 }
448
449 sub search_raw {
450 my $self = shift;
451 my $query = shift;
452 my %score;
453
454 # Top $wanted documents must be correct. Zero means all matching
455 # documents.
456 my $wanted = $query->{top};
457 my $strict = $query->{picky};
458
459 # Return at least $minacc documents. Zero means all matching
460 # documents.
461 # my $minacc = $query->{accus} || $wanted;
462
463 # Open index and flush cache if necessary
464 defined $self->{db} or $self->open;
465 $self->sync;
466
467 # We keep duplicates
468 my @terms =
469 # Sort words by decreasing document frequency
470 sort { $self->{db}->{'o'.$a} <=> $self->{db}->{'o'.$b} }
471 # check which words occur in the index.
472 grep { $self->{db}->{'o'.$_} } @_;
473
474 return unless @terms;
475
476 # We special-case one term queries here. If the index was sorted,
477 # choping off the rest of the list will return the same ranking.
478 if ($wanted and @terms == 1) {
479 my $term = shift @terms;
480 my $idf = log($self->{records}/$self->{db}->{'o'.$term});
481 my @res;
482
483 if ($self->{reorg}) { # or not $query->{picky}
484 @res = unpack "w". int(2*$wanted), $self->{db}->{'p'.$term};
485 } else {
486 @res = unpack 'w*', $self->{db}->{'p'.$term};
487 }
488
489 for (my $i=1; $i<@res; $i+=2) {
490 # $res[$i] /= $self->{db}->{'m'. $res[$i-1]} / $idf;
491 # above was written badly, allows two DIV_ZERO problems.
492 my $maxtf = $self->{db}->{"m". $res[$i-1]};
493 unless ($maxtf) {
494 warn "WAIT-Warning: Averting DIVZERO for i[$i] \$res[\$i-1][$res[$i-1]] term[$term]";
495 $maxtf = 1;
496 }
497 $res[$i] = ($res[$i] / $maxtf) * $idf;
498 }
499
500 return @res
501 }
502
503 # We separate exhaustive search here to avoid overhead and make the
504 # code more readable. The block can be removed without changing the
505 # result.
506 unless ($wanted) {
507 for (@terms) {
508 my $df = $self->{db}->{'o'.$_};
509
510 # The frequency *must* be 1 at least since the posting list is nonempty
511 _complain('search for term', $_) and $df = 1 if $df < 1;
512
513 # Unpack posting list for current query term $_
514 my %post = unpack 'w*', $self->{db}->{'p'.$_};
515
516 _complain('search for term', $_) if $self->{db}->{'o'.$_} != keys %post;
517 # This is the inverse document frequency. The log of the inverse
518 # fraction of documents the term occurs in.
519 my $idf = log($self->{records}/$df);
520 for my $did (keys %post) {
521 if (my $freq = $self->{db}->{'m'. $did}) {
522 $score{$did} += $post{$did} / $freq * $idf;
523 }
524 }
525 }
526 # warn sprintf "Used %d accumulators\n", scalar keys %score;
527 return %score;
528 }
529
530 # A sloppy but fast algorithm for multiple term queries.
531 unless ($strict) {
532 for (@terms) {
533 # Unpack posting list for current query term $_
534 my %post = unpack 'w*', $self->{db}->{'p'.$_};
535
536 # Lookup the number of documents the term occurs in (document frequency)
537 my $occ = $self->{db}->{'o'.$_};
538
539 _complain('search for term', $_) if $self->{db}->{'o'.$_} != keys %post;
540 # The frequency *must* be 1 at least since the posting list is nonempty
541 _complain('search for term', $_) and $occ = 1 if $occ < 1;
542
543 # This is the inverse document frequency. The log of the inverse
544 # fraction of documents the term occurs in.
545 my $idf = log($self->{records}/$occ);
546
547 # If we have a reasonable number of accumulators, change the
548 # loop to iterate over the accumulators. This will compromise
549 # quality for better speed. The algorithm still computes the
550 # exact weights, but the result is not guaranteed to contain the
551 # *best* results. The database might contain documents better
552 # than the worst returned document.
553
554 # We process the lists in order of increasing length. When the
555 # number of accumulators exceeds $wanted, no new documents are
556 # added, only the ranking/weighting of the seen documents is
557 # improved. The resulting ranking list must be pruned, since only
558 # the top most documents end up near their "optimal" rank.
559
560 if (keys %score < $wanted) {
561 for my $did (keys %post) {
562 if (my $freq = $self->{db}->{'m'. $did}) {
563 $score{$did} += $post{$did} / $freq * $idf;
564 }
565 }
566 } else {
567 for my $did (keys %score) {
568 next unless exists $post{$did};
569 if (my $freq = $self->{db}->{'m'. $did}) {
570 $score{$did} += $post{$did} / $freq * $idf;
571 }
572 }
573 }
574 }
575 return %score;
576 }
577 my @max; $max[$#terms+1]=0;
578 my @idf;
579
580 # Preparation loop. This extra loop makes sense only when "reorg"
581 # and "wanted" are true. But at the time beeing, keeping the code
582 # for the different search algorithms in one place seems more
583 # desirable than some minor speedup of the brute force version. We
584 # do cache $idf though.
585
586 for (my $i = $#terms; $i >=0; $i--) {
587 local $_ = $terms[$i];
588 # Lookup the number of documents the term occurs in (document frequency)
589 my $df = $self->{db}->{'o'.$_};
590
591 # The frequency *must* be 1 at least since the posting list is nonempty
592 _complain('search for term', $_) and $df = 1 if $df < 1;
593
594 # This is the inverse document frequency. The log of the inverse
595 # fraction of documents the term occurs in.
596 $idf[$i] = log($self->{records}/$df);
597
598 my ($did,$occ);
599 if ($self->{reorg}) {
600 ($did,$occ) = unpack 'w2', $self->{db}->{'p'.$_};
601 } else { # Maybe this costs more than it helps
602 ($did,$occ) = unpack 'w2', $self->sort_postings($self->{db}->{'p'.$_});
603 }
604 my $freq = $self->{db}->{'m'. $did};
605 my $max = $occ/$freq*$idf[$i];
606 $max[$i] = $max + $max[$i+1];
607 }
608
609 # Main loop
610 for my $i (0 .. $#terms) {
611 my $term = $terms[$i];
612 # Unpack posting list for current query term $term. We loose the
613 # sorting order because the assignment to a hash.
614 my %post = unpack 'w*', $self->{db}->{'p'.$term};
615
616 _complain('search for term', $term)
617 if $self->{db}->{'o'.$term} != keys %post;
618
619 my $idf = $idf[$i];
620 my $full; # Need to process all postings
621 my $chop; # Score necessary to enter the ranking list
622
623 if (# We know that wanted is true since we especial cased the
624 # exhaustive search.
625
626 $wanted and
627
628 # We did sort here if necessary in
629 # the preparation loop
630 # $self->{reorg} and
631
632 scalar keys %score > $wanted) {
633 $chop = (sort { $b <=> $a } values %score)[$wanted];
634 $full = $max[$i] > $chop;
635 } else {
636 $full = 1;
637 }
638
639 if ($full) {
640 # We need to inspect the full list. Either $wanted is not given,
641 # the index is not sorted, or we don't have enough accumulators
642 # yet.
643 if (defined $chop) {
644 # We might be able to avoid allocating accumulators
645 for my $did (keys %post) {
646 if (my $freq = $self->{db}->{'m'. $did}) {
647 my $wgt = $post{$did} / $freq * $idf;
648 # We add an accumulator if $wgt exeeds $chop
649 if (exists $score{$did} or $wgt > $chop) {
650 $score{$did} += $wgt;
651 }
652 }
653 }
654 } else {
655 # Allocate acumulators for each seen document.
656 for my $did (keys %post) {
657 if (my $freq = $self->{db}->{'m'. $did}) {
658 $score{$did} += $post{$did} / $freq * $idf;
659 }
660 }
661 }
662 } else {
663 # Update existing accumulators
664 for my $did (keys %score) {
665 next unless exists $post{$did};
666 if (my $freq = $self->{db}->{'m'. $did}) {
667 $score{$did} += $post{$did} / $freq * $idf;
668 }
669 }
670 }
671 }
672 #warn sprintf "Used %d accumulators\n", scalar keys %score;
673 %score;
674 }
675
676 sub set {
677 my ($self, $attr, $value) = @_;
678
679 die "No such indexy attribute: '$attr'" unless $attr eq 'top';
680
681 return delete $self->{reorg} if $value == 0;
682
683 return if $self->{reorg}; # we are sorted already
684 return unless $self->{mode} & O_RDWR;
685 defined $self->{db} or $self->open;
686
687 $self->sync;
688 while (my($key, $value) = each %{$self->{db}}) {
689 next if $key !~ /^p/;
690 $self->{db}->{$key} = $self->sort_postings($value);
691 }
692 $self->{reorg} = 1;
693 }
694
695 sub sync {
696 my $self = shift;
697
698 if ($self->{mode} & O_RDWR) {
699 print STDERR "Flushing $self->{cached} postings\n" if $self->{cached};
700 while (my($key, $value) = each %{$self->{cache}}) {
701 $self->{db}->{"p". $key} ||= "";
702 if ($self->{reorg}) {
703 $self->{db}->{'p'.$key} = $self->sort_postings($self->{db}->{'p'.$key}
704 . $value);
705 } else {
706 $self->{db}->{'p'.$key} .= $value;
707 }
708 }
709 while (my($key, $value) = each %{$self->{cdict}}) {
710 $self->{db}->{'o'.$key} = 0 unless $self->{db}->{'o'.$key};
711 $self->{db}->{'o'.$key} += $value;
712 }
713 $self->{cache} = {};
714 $self->{cdict} = {};
715 $self->{cached} = 0;
716 }
717 }
718
719 sub close {
720 my $self = shift;
721
722 if ($self->{dbh}) {
723 $self->sync;
724 delete $self->{dbh};
725 untie %{$self->{db}};
726 delete $self->{db};
727 delete $self->{func};
728 delete $self->{cache};
729 delete $self->{cached};
730 delete $self->{cdict};
731 delete $self->{pfunc} if defined $self->{pfunc};
732 delete $self->{ifunc} if defined $self->{ifunc};
733 delete $self->{xfunc} if defined $self->{xfunc};
734 }
735 }
736
737 1;
738

Properties

Name Value
cvs2svn:cvs-rev 1.6

  ViewVC Help
Powered by ViewVC 1.1.26