/[wait]/cvs-head/lib/WAIT/InvertedIndex.pm
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Contents of /cvs-head/lib/WAIT/InvertedIndex.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 22 - (show annotations)
Sat Nov 11 16:58:53 2000 UTC (23 years, 5 months ago) by ulpfr
File size: 21034 byte(s)
Modified structure of inverted indices.  The old one was not able to
cope with words starting with "\377(o|m)".  Prefix and interval
searches have not been tested yet.

1 # -*- Mode: Perl -*-
2 # $Basename: InvertedIndex.pm $
3 # $Revision: 1.30 $
4 # Author : Ulrich Pfeifer
5 # Created On : Thu Aug 8 13:05:10 1996
6 # Last Modified By: Ulrich Pfeifer
7 # Last Modified On: Sat Nov 11 16:32:38 2000
8 # Language : CPerl
9 #
10 # (C) Copyright 1996-2000, Ulrich Pfeifer
11 #
12
13 package WAIT::InvertedIndex;
14 use strict;
15 use DB_File;
16 use Fcntl;
17 use WAIT::Filter;
18 use Carp;
19 use vars qw(%FUNC);
20
21 # The dictionary has three different key types:
22 # 'o'.$word
23 #
24 # The document frequency is the number of documents a term occurs
25 # in. The idea is that a term occuring in a significant part of the
26 # documents is not too significant.
27 #
28 # 'm'.$word
29 #
30 # The maximum term frequency of a document is the frequency of the
31 # most frequent term in the document. It is related to the document
32 # length obviously. A document in which the most frequnet term occurs
33 # 100 times is probably much longer than a document whichs most
34 # frequent term occurs five time.
35 #
36 # 'p'.$word
37 #
38 # Under this key we store the actual posting list as pairs of
39 # packed integers.
40
41 my $no_old_index_support = 0; # do not check for old indices if set
42
43 sub new {
44 my $type = shift;
45 my %parm = @_;
46 my $self = {};
47
48 $self->{file} = $parm{file} or croak "No file specified";
49 $self->{attr} = $parm{attr} or croak "No attributes specified";
50 $self->{filter} = $parm{filter};
51 $self->{'name'} = $parm{'name'};
52 $self->{records} = 0;
53 for (qw(intervall prefix)) {
54 if (exists $parm{$_}) {
55 if (ref $parm{$_}) {
56 $self->{$_} = [@{$parm{$_}}] # clone
57 } else {
58 $self->{$_} = $parm{$_}
59 }
60 }
61 }
62 bless $self, ref($type) || $type;
63 }
64
65 sub name {$_[0]->{'name'}}
66
67 sub _split_pos {
68 my ($text, $pos) = @{$_[0]};
69 my @result;
70
71 $text =~ s/(^\s+)// and $pos += length($1);
72 while ($text =~ s/(^\S+)//) {
73 my $word = $1;
74 push @result, [$word, $pos];
75 $pos += length($word);
76 $text =~ s/(^\s+)// and $pos += length($1);
77 }
78 @result;
79 }
80
81 sub _xfiltergen {
82 my $filter = pop @_;
83
84 # Oops, we cannot overrule the user's choice. Other filters may kill
85 # stopwords, such as isotr clobbers "isn't" to "isnt".
86
87 # if ($filter eq 'stop') { # avoid the slow stopword elimination
88 # return _xfiltergen(@_); # it's cheaper to look them up afterwards
89 # }
90 if (@_) {
91 if ($filter =~ /^split(\d*)/) {
92 if ($1) {
93 "grep(length(\$_->[0])>=$1, map(&WAIT::Filter::split_pos(\$_), " . _xfiltergen(@_) .'))' ;
94 } else {
95 "map(&WAIT::Filter::split_pos(\$_), " . _xfiltergen(@_) .')' ;
96 }
97 } else {
98 "map ([&WAIT::Filter::$filter(\$_->[0]), \$_->[1]]," ._xfiltergen(@_) .')';
99 }
100 } else {
101 if ($filter =~ /^split(\d*)/) {
102 if ($1) {
103 "grep(length(\$_->[0])>=$1, map(&WAIT::Filter::split_pos(\$_), [\$_[0], 0]))" ;
104 } else {
105 "map(&WAIT::Filter::split_pos(\$_), [\$_[0], 0])" ;
106 }
107 } else {
108 "map ([&WAIT::Filter::$filter(\$_->[0]), \$_->[1]], [\$_[0], 0])";
109 }
110 }
111 }
112
113 sub parse_pos {
114 my $self = shift;
115
116 unless (exists $self->{xfunc}) {
117 $self->{xfunc} =
118 eval sprintf("sub {%s}", _xfiltergen(@{$self->{filter}}));
119 #printf "\nsub{%s}$@\n", _xfiltergen(@{$self->{filter}});
120 }
121 &{$self->{xfunc}}($_[0]);
122 }
123
124 sub _filtergen {
125 my $filter = pop @_;
126
127 if (@_) {
128 "map(&WAIT::Filter::$filter(\$_), " . _filtergen(@_) . ')';
129 } else {
130 "map(&WAIT::Filter::$filter(\$_), \@_)";
131 }
132 }
133
134 sub drop {
135 my $self = shift;
136 if ((caller)[0] eq 'WAIT::Table') { # Table knows about this
137 my $file = $self->{file};
138
139 ! (!-e $file or unlink $file);
140 } else { # notify our database
141 croak ref($self)."::drop called directly";
142 }
143 }
144
145 sub is_an_old_index {
146 my $self = shift;
147
148 return 0 if $no_old_index_support;
149 return $self->{old_index} if exists $self->{old_index};
150
151 # We can only guess if this is an old index. We lookup the first 10
152 # $O entries. If all values are integers, we assume that the index
153 # is an old one.
154
155 defined $self->{db} or $self->open;
156 $self->sync;
157 my $dbh = $self->{dbh}; # for convenience
158
159 my $O = pack('C', 0xff)."o";
160 my ($word, $value) = ($O.$;);
161 $dbh->seq($word, $value, R_CURSOR);
162 for (my $i=0; $i<10;$i++) {
163 if ($value !~ /^\d+$/) {
164 return $self->{old_index} = 0;
165 }
166 if ($dbh->seq($word, $value, R_NEXT) or # no values left
167 $word !~ /^$O/o # no $O values left
168 ) {
169 # we are not sure enough that this is an old index
170 return $self->{old_index} = 0;
171 }
172 }
173 return $self->{old_index} = 1;
174 }
175
176 sub open {
177 my $self = shift;
178 my $file = $self->{file};
179
180 if (defined $self->{dbh}) {
181 $self->{dbh};
182 } else {
183 $self->{func} =
184 eval sprintf("sub {grep /./, %s}", _filtergen(@{$self->{filter}}));
185 $self->{dbh} = tie(%{$self->{db}}, 'DB_File', $file,
186 $self->{mode}, 0664, $DB_BTREE);
187 $self->{cache} = {}
188 if $self->{mode} & O_RDWR;
189 $self->{cdict} = {}
190 if $self->{mode} & O_RDWR;
191 $self->{cached} = 0;
192 if (!$no_old_index_support and $self->is_an_old_index()) {
193 warn "This is an old index, upgrade you database";
194 require WAIT::InvertedIndexOld;
195 bless $self, 'WAIT::InvertedIndexOld';
196 }
197 }
198 }
199
200 sub insert {
201 my $self = shift;
202 my $key = shift;
203 my %occ;
204
205 defined $self->{db} or $self->open;
206 grep $occ{$_}++, &{$self->{func}}(@_);
207 my ($word, $noc);
208 $self->{records}++;
209 while (($word, $noc) = each %occ) {
210 if (defined $self->{cache}->{$word}) {
211 $self->{cdict}->{$word}++;
212 $self->{cache}->{$word} .= pack 'w2', $key, $noc;
213 } else {
214 $self->{cdict}->{$word} = 1;
215 $self->{cache}->{$word} = pack 'w2', $key, $noc;
216 }
217 $self->{cached}++;
218 }
219 # This cache limit should be configurable
220 $self->sync if $self->{cached} > 100_000;
221 my $maxtf = 0;
222 for (values %occ) {
223 $maxtf = $_ if $_ > $maxtf;
224 }
225 $self->{db}->{'m'. $key} = $maxtf;
226 }
227
228 # We sort postings by increasing max term frequency (~ by increasing
229 # document length. This reduces the quality degradation if we process
230 # only the first part of a posting list.
231
232 sub sort_postings {
233 my $self = shift;
234 my $post = shift; # reference to a hash or packed string
235
236 if (ref $post) {
237 # we skip the sort part, if the index is not sorted
238 return pack('w*', %$post) unless $self->{reorg};
239 } else {
240 $post = { unpack 'w*', $post };
241 }
242
243 my $r = '';
244
245 # Sort posting list by increasing ratio of maximum term frequency (~
246 # "document length") and term frequency. This rati multipied by the
247 # inverse document frequence gives the score for a term. This sort
248 # order can be exploited for tuning of single term queries.
249
250 for my $did (sort { $post->{$b} / $self->{db}->{'m'. $b}
251 <=>
252 $post->{$a} / $self->{db}->{'m'. $a}
253 } keys %$post) {
254 $r .= pack 'w2', $did, $post->{$did};
255 }
256 #warn sprintf "reorg %d %s\n", scalar keys %$post, join ' ', unpack 'w*', $r;
257 $r;
258 }
259
260 sub delete {
261 my $self = shift;
262 my $key = shift;
263 my %occ;
264
265 my $db;
266 defined $self->{db} or $self->open;
267 $db = $self->{db};
268 $self->sync;
269 $self->{records}--;
270
271 # less than zero documents in database?
272 _complain('delete of document', $key) and $self->{records} = 0
273 if $self->{records} < 0;
274
275 grep $occ{$_}++, &{$self->{func}}(@_);
276
277 for (keys %occ) {# may reorder posting list
278 my %post = unpack 'w*', $db->{'p'.$_};
279 delete $post{$key};
280 $db->{'p'.$_} = $self->sort_postings(\%post);
281 _complain('delete of term', $_) if $db->{'o'.$_}-1 != keys %post;
282 $db->{'o'.$_} = scalar keys %post;
283 }
284 delete $db->{'m'. $key};
285 }
286
287 sub intervall {
288 my ($self, $first, $last) = @_;
289 my $value = '';
290 my $word = '';
291 my @result;
292
293 return unless exists $self->{'intervall'};
294
295 defined $self->{db} or $self->open;
296 $self->sync;
297 my $dbh = $self->{dbh}; # for convenience
298
299 if (ref $self->{'intervall'}) {
300 unless (exists $self->{'ifunc'}) {
301 $self->{'ifunc'} =
302 eval sprintf("sub {grep /./, %s}", _filtergen(@{$self->{intervall}}));
303 }
304 ($first) = &{$self->{'ifunc'}}($first) if $first;
305 ($last) = &{$self->{'ifunc'}}($last) if $last;
306 }
307 $first = 'p'.($first||'');
308 $last = (defined $last)?'p'.$last:'q';
309
310 # set the cursor to $first
311 $dbh->seq($first, $value, R_CURSOR);
312
313 # $first would be after the last word
314 return () if $first gt $last;
315
316 push @result, substr($first,1);
317 while (!$dbh->seq($word, $value, R_NEXT)) {
318 # We should limit this to a "resonable" number of words
319 last if $word gt $last;
320 push @result, substr($word,1);
321 }
322 \@result; # speed
323 }
324
325 sub prefix {
326 my ($self, $prefix) = @_;
327 my $value = '';
328 my $word = '';
329 my @result;
330
331 return () unless defined $prefix; # Full dictionary requested !!
332 return unless exists $self->{'prefix'};
333 defined $self->{db} or $self->open;
334 $self->sync;
335 my $dbh = $self->{dbh};
336
337 if (ref $self->{'prefix'}) {
338 unless (exists $self->{'pfunc'}) {
339 $self->{'pfunc'} =
340 eval sprintf("sub {grep /./, %s}", _filtergen(@{$self->{prefix}}));
341 }
342 ($prefix) = &{$self->{'pfunc'}}($prefix);
343 }
344
345 if ($dbh->seq($word = 'p'.$prefix, $value, R_CURSOR)) {
346 return ();
347 }
348 return () if $word !~ /^p$prefix/;
349 push @result, substr($word,1);
350
351 while (!$dbh->seq($word, $value, R_NEXT)) {
352 # We should limit this to a "resonable" number of words
353 last if $word !~ /^p$prefix/;
354 push @result, substr($word,1);
355 }
356 \@result; # speed
357 }
358
359 =head2 search($query)
360
361 The search method supports a range of search algorithms. It is
362 recommended to tune the index by calling
363 C<$table-E<gt>set(top=E<gt>1)> B<after> bulk inserting the documents
364 into the table. This is a computing intense operation and all inserts
365 and deletes after this optimization are slightly more expensive. Once
366 reorganized, the index is kept sorted automatically until you switch
367 the optimization off by calling C<$table-E<gt>set(top=E<gt>0)>.
368
369 When searching a tuned index, a query can be processed faster if the
370 caller requests only the topmost documents. This can be done by
371 passing a C<top =E<gt>> I<n> parameter to the search method.
372
373 For single term queries, the method returns only the I<n> top ranking
374 documents. For multi term queries two optimized algorithms are
375 available. The first algorithm computes the top n documents
376 approximately but very fast, sacrificing a little bit of precision for
377 speed. The second algorithm computes the topmost I<n> documents
378 precisely. This algorithm is slower and should be used only for small
379 values of I<n>. It can be requested by passing the query attribute
380 C<picky =E<gt> 1>. Both algorithms may return more than I<n> hits.
381 While the picky version might not be faster than the brute force
382 version on average for modest size databases it uses less memory and
383 the processing time is almost linear in the number of query terms, not
384 in the size of the lists.
385
386 =cut
387
388 sub search {
389 my $self = shift;
390 my $query = shift;
391
392 defined $self->{db} or $self->open;
393 $self->sync;
394 $self->search_raw($query, &{$self->{func}}(@_)); # No call to parse() here
395 }
396
397 sub parse {
398 my $self = shift;
399
400 defined $self->{db} or $self->open;
401 &{$self->{func}}(@_);
402 }
403
404 sub keys {
405 my $self = shift;
406
407 defined $self->{db} or $self->open;
408 keys %{$self->{db}};
409 }
410
411 sub search_prefix {
412 my $self = shift;
413
414 # print "search_prefix(@_)\n";
415 defined $self->{db} or $self->open;
416 $self->search_raw(map($self->prefix($_), @_));
417 }
418
419 sub _complain ($$) {
420 my ($action, $term) = @_;
421
422 require Carp;
423 Carp::cluck
424 (sprintf("WAIT database inconsistency during $action [%s]: ".
425 "Please rebuild index\n",
426 $term,));
427 }
428
429 sub search_raw {
430 my $self = shift;
431 my $query = shift;
432 my %score;
433
434 # Top $wanted documents must be correct. Zero means all matching
435 # documents.
436 my $wanted = $query->{top};
437 my $strict = $query->{picky};
438
439 # Return at least $minacc documents. Zero means all matching
440 # documents.
441 # my $minacc = $query->{accus} || $wanted;
442
443 # Open index and flush cache if necessary
444 defined $self->{db} or $self->open;
445 $self->sync;
446
447 # We keep duplicates
448 my @terms =
449 # Sort words by decreasing document frequency
450 sort { $self->{db}->{'o'.$a} <=> $self->{db}->{'o'.$b} }
451 # check which words occur in the index.
452 grep { $self->{db}->{'o'.$_} } @_;
453
454 return () unless @terms; # nothing to search for
455
456 # We special-case one term queries here. If the index was sorted,
457 # choping off the rest of the list will return the same ranking.
458 if ($wanted and @terms == 1) {
459 my $term = shift @terms;
460 my $idf = log($self->{records}/$self->{db}->{'o'.$term});
461 my @res;
462
463 if ($self->{reorg}) { # or not $query->{picky}
464 @res = unpack "w". int(2*$wanted), $self->{db}->{'p'.$term};
465 } else {
466 @res = unpack 'w*', $self->{db}->{'p'.$term};
467 }
468
469 for (my $i=1; $i<@res; $i+=2) {
470 $res[$i] /= $self->{db}->{'m'. $res[$i-1]} / $idf;
471 }
472
473 return @res
474 }
475
476 # We separate exhaustive search here to avoid overhead and make the
477 # code more readable. The block can be removed without changing the
478 # result.
479 unless ($wanted) {
480 for (@terms) {
481 my $df = $self->{db}->{'o'.$_};
482
483 # The frequency *must* be 1 at least since the posting list is nonempty
484 _complain('search for term', $_) and $df = 1 if $df < 1;
485
486 # Unpack posting list for current query term $_
487 my %post = unpack 'w*', $self->{db}->{'p'.$_};
488
489 _complain('search for term', $_) if $self->{db}->{'o'.$_} != keys %post;
490 # This is the inverse document frequency. The log of the inverse
491 # fraction of documents the term occurs in.
492 my $idf = log($self->{records}/$df);
493 for my $did (keys %post) {
494 if (my $freq = $self->{db}->{'m'. $did}) {
495 $score{$did} += $post{$did} / $freq * $idf;
496 }
497 }
498 }
499 # warn sprintf "Used %d accumulators\n", scalar keys %score;
500 return %score;
501 }
502
503 # A sloppy but fast algorithm for multiple term queries.
504 unless ($strict) {
505 for (@terms) {
506 # Unpack posting list for current query term $_
507 my %post = unpack 'w*', $self->{db}->{'p'.$_};
508
509 # Lookup the number of documents the term occurs in (document frequency)
510 my $occ = $self->{db}->{'o'.$_};
511
512 _complain('search for term', $_) if $self->{db}->{'o'.$_} != keys %post;
513 # The frequency *must* be 1 at least since the posting list is nonempty
514 _complain('search for term', $_) and $occ = 1 if $occ < 1;
515
516 # This is the inverse document frequency. The log of the inverse
517 # fraction of documents the term occurs in.
518 my $idf = log($self->{records}/$occ);
519
520 # If we have a reasonable number of accumulators, change the
521 # loop to iterate over the accumulators. This will compromise
522 # quality for better speed. The algorithm still computes the
523 # exact weights, but the result is not guaranteed to contain the
524 # *best* results. The database might contain documents better
525 # than the worst returned document.
526
527 # We process the lists in order of increasing length. When the
528 # number of accumulators exceeds $wanted, no new documents are
529 # added, only the ranking/weighting of the seen documents is
530 # improved. The resulting ranking list must be pruned, since only
531 # the top most documents end up near their "optimal" rank.
532
533 if (keys %score < $wanted) {
534 for my $did (keys %post) {
535 if (my $freq = $self->{db}->{'m'. $did}) {
536 $score{$did} += $post{$did} / $freq * $idf;
537 }
538 }
539 } else {
540 for my $did (keys %score) {
541 next unless exists $post{$did};
542 if (my $freq = $self->{db}->{'m'. $did}) {
543 $score{$did} += $post{$did} / $freq * $idf;
544 }
545 }
546 }
547 }
548 return %score;
549 }
550 my @max; $max[$#terms+1]=0;
551 my @idf;
552
553 # Preparation loop. This extra loop makes sense only when "reorg"
554 # and "wanted" are true. But at the time beeing, keeping the code
555 # for the different search algorithms in one place seems more
556 # desirable than some minor speedup of the brute force version. We
557 # do cache $idf though.
558
559 for (my $i = $#terms; $i >=0; $i--) {
560 local $_ = $terms[$i];
561 # Lookup the number of documents the term occurs in (document frequency)
562 my $df = $self->{db}->{'o'.$_};
563
564 # The frequency *must* be 1 at least since the posting list is nonempty
565 _complain('search for term', $_) and $df = 1 if $df < 1;
566
567 # This is the inverse document frequency. The log of the inverse
568 # fraction of documents the term occurs in.
569 $idf[$i] = log($self->{records}/$df);
570
571 my ($did,$occ);
572 if ($self->{reorg}) {
573 ($did,$occ) = unpack 'w2', $self->{db}->{'p'.$_};
574 } else { # Maybe this costs more than it helps
575 ($did,$occ) = unpack 'w2', $self->sort_postings($self->{db}->{'p'.$_});
576 }
577 my $freq = $self->{db}->{'m'. $did};
578 my $max = $occ/$freq*$idf[$i];
579 $max[$i] = $max + $max[$i+1];
580 }
581
582 # Main loop
583 for my $i (0 .. $#terms) {
584 my $term = $terms[$i];
585 # Unpack posting list for current query term $term. We loose the
586 # sorting order because the assignment to a hash.
587 my %post = unpack 'w*', $self->{db}->{'p'.$term};
588
589 _complain('search for term', $term)
590 if $self->{db}->{'o'.$term} != keys %post;
591
592 my $idf = $idf[$i];
593 my $full; # Need to process all postings
594 my $chop; # Score necessary to enter the ranking list
595
596 if (# We know that wanted is true since we especial cased the
597 # exhaustive search.
598
599 $wanted and
600
601 # We did sort here if necessary in
602 # the preparation loop
603 # $self->{reorg} and
604
605 scalar keys %score > $wanted) {
606 $chop = (sort { $b <=> $a } values %score)[$wanted];
607 $full = $max[$i] > $chop;
608 } else {
609 $full = 1;
610 }
611
612 if ($full) {
613 # We need to inspect the full list. Either $wanted is not given,
614 # the index is not sorted, or we don't have enough accumulators
615 # yet.
616 if (defined $chop) {
617 # We might be able to avoid allocating accumulators
618 for my $did (keys %post) {
619 if (my $freq = $self->{db}->{'m'. $did}) {
620 my $wgt = $post{$did} / $freq * $idf;
621 # We add an accumulator if $wgt exeeds $chop
622 if (exists $score{$did} or $wgt > $chop) {
623 $score{$did} += $wgt;
624 }
625 }
626 }
627 } else {
628 # Allocate acumulators for each seen document.
629 for my $did (keys %post) {
630 if (my $freq = $self->{db}->{'m'. $did}) {
631 $score{$did} += $post{$did} / $freq * $idf;
632 }
633 }
634 }
635 } else {
636 # Update existing accumulators
637 for my $did (keys %score) {
638 next unless exists $post{$did};
639 if (my $freq = $self->{db}->{'m'. $did}) {
640 $score{$did} += $post{$did} / $freq * $idf;
641 }
642 }
643 }
644 }
645 #warn sprintf "Used %d accumulators\n", scalar keys %score;
646 %score;
647 }
648
649 sub set {
650 my ($self, $attr, $value) = @_;
651
652 die "No such indexy attribute: '$attr'" unless $attr eq 'top';
653
654 return delete $self->{reorg} if $value == 0;
655
656 return if $self->{reorg}; # we are sorted already
657 return unless $self->{mode} & O_RDWR;
658 defined $self->{db} or $self->open;
659
660 $self->sync;
661 while (my($key, $value) = each %{$self->{db}}) {
662 next if $key !~ /^p/;
663 $self->{db}->{$key} = $self->sort_postings($value);
664 }
665 $self->{reorg} = 1;
666 }
667
668 sub sync {
669 my $self = shift;
670
671 if ($self->{mode} & O_RDWR) {
672 print STDERR "Flushing $self->{cached} postings\n" if $self->{cached};
673 while (my($key, $value) = each %{$self->{cache}}) {
674 if ($self->{reorg}) {
675 $self->{db}->{'p'.$key} = $self->sort_postings($self->{db}->{'p'.$key}
676 . $value);
677 } else {
678 $self->{db}->{'p'.$key} .= $value;
679 }
680 }
681 while (my($key, $value) = each %{$self->{cdict}}) {
682 $self->{db}->{'o'.$key} = 0 unless $self->{db}->{'o'.$key};
683 $self->{db}->{'o'.$key} += $value;
684 }
685 $self->{cache} = {};
686 $self->{cdict} = {};
687 $self->{cached} = 0;
688 }
689 }
690
691 sub close {
692 my $self = shift;
693
694 if ($self->{dbh}) {
695 $self->sync;
696 delete $self->{dbh};
697 untie %{$self->{db}};
698 delete $self->{db};
699 delete $self->{func};
700 delete $self->{cache};
701 delete $self->{cached};
702 delete $self->{cdict};
703 delete $self->{pfunc} if defined $self->{pfunc};
704 delete $self->{ifunc} if defined $self->{ifunc};
705 delete $self->{xfunc} if defined $self->{xfunc};
706 }
707 }
708
709 1;
710

Properties

Name Value
cvs2svn:cvs-rev 1.2

  ViewVC Help
Powered by ViewVC 1.1.26