/[wait]/branches/unido/lib/WAIT/InvertedIndexOld.pm
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Annotation of /branches/unido/lib/WAIT/InvertedIndexOld.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 106 - (hide annotations)
Tue Jul 13 12:22:09 2004 UTC (19 years, 10 months ago) by dpavlin
File size: 19904 byte(s)
Changes made by Andreas J. Koenig <andreas.koenig(at)anima.de> for Unido project

1 dpavlin 106 # -*- Mode: Perl -*-
2     # $Basename: InvertedIndex.pm $
3     # $Revision: 1.1 $
4     # Author : Ulrich Pfeifer
5     # Created On : Thu Aug 8 13:05:10 1996
6     # Last Modified By: Ulrich Pfeifer
7     # Last Modified On: Sat Nov 11 14:36:39 2000
8     # Language : CPerl
9     #
10     # (C) Copyright 1996-2000, Ulrich Pfeifer
11     #
12    
13     package WAIT::InvertedIndexOld;
14     use strict;
15     use DB_File;
16     use Fcntl;
17     use WAIT::Filter;
18     use Carp;
19     use vars qw(%FUNC);
20    
21     my $O = pack('C', 0xff)."o"; # occurances (document ferquency)
22    
23     # The document frequency is the number of documents a term occurs
24     # in. The idea is that a term occuring in a significant part of the
25     # documents is not too significant.
26    
27     my $M = pack('C', 0xff)."m"; # maxtf (term frequency)
28    
29     # The maximum term frequency of a document is the frequency of the
30     # most frequent term in the document. It is related to the document
31     # length obviously. A document in which the most frequnet term occurs
32     # 100 times is probably much longer than a document whichs most
33     # frequent term occurs five time.
34    
35     sub new {
36     my $type = shift;
37     my %parm = @_;
38     my $self = {};
39    
40     $self->{file} = $parm{file} or croak "No file specified";
41     $self->{attr} = $parm{attr} or croak "No attributes specified";
42     $self->{filter} = $parm{filter};
43     $self->{'name'} = $parm{'name'};
44     $self->{records} = 0;
45     for (qw(intervall prefix)) {
46     if (exists $parm{$_}) {
47     if (ref $parm{$_}) {
48     $self->{$_} = [@{$parm{$_}}] # clone
49     } else {
50     $self->{$_} = $parm{$_}
51     }
52     }
53     }
54     bless $self, ref($type) || $type;
55     }
56    
57     sub name {$_[0]->{'name'}}
58    
59     sub _split_pos {
60     my ($text, $pos) = @{$_[0]};
61     my @result;
62    
63     $text =~ s/(^\s+)// and $pos += length($1);
64     while ($text =~ s/(^\S+)//) {
65     my $word = $1;
66     push @result, [$word, $pos];
67     $pos += length($word);
68     $text =~ s/(^\s+)// and $pos += length($1);
69     }
70     @result;
71     }
72    
73     sub _xfiltergen {
74     my $filter = pop @_;
75    
76     # Oops, we cannot overrule the user's choice. Other filters may kill
77     # stopwords, such as isotr clobbers "isn't" to "isnt".
78    
79     # if ($filter eq 'stop') { # avoid the slow stopword elimination
80     # return _xfiltergen(@_); # it's cheaper to look them up afterwards
81     # }
82     if (@_) {
83     if ($filter =~ /^split(\d*)/) {
84     if ($1) {
85     "grep(length(\$_->[0])>=$1, map(&WAIT::Filter::split_pos(\$_), " . _xfiltergen(@_) .'))' ;
86     } else {
87     "map(&WAIT::Filter::split_pos(\$_), " . _xfiltergen(@_) .')' ;
88     }
89     } else {
90     "map ([&WAIT::Filter::$filter(\$_->[0]), \$_->[1]]," ._xfiltergen(@_) .')';
91     }
92     } else {
93     if ($filter =~ /^split(\d*)/) {
94     if ($1) {
95     "grep(length(\$_->[0])>=$1, map(&WAIT::Filter::split_pos(\$_), [\$_[0], 0]))" ;
96     } else {
97     "map(&WAIT::Filter::split_pos(\$_), [\$_[0], 0])" ;
98     }
99     } else {
100     "map ([&WAIT::Filter::$filter(\$_->[0]), \$_->[1]], [\$_[0], 0])";
101     }
102     }
103     }
104    
105     sub parse_pos {
106     my $self = shift;
107    
108     unless (exists $self->{xfunc}) {
109     $self->{xfunc} =
110     eval sprintf("sub {%s}", _xfiltergen(@{$self->{filter}}));
111     #printf "\nsub{%s}$@\n", _xfiltergen(@{$self->{filter}});
112     }
113     &{$self->{xfunc}}($_[0]);
114     }
115    
116     sub _filtergen {
117     my $filter = pop @_;
118    
119     if (@_) {
120     "map(&WAIT::Filter::$filter(\$_), " . _filtergen(@_) . ')';
121     } else {
122     "map(&WAIT::Filter::$filter(\$_), \@_)";
123     }
124     }
125    
126     sub drop {
127     my $self = shift;
128     if ((caller)[0] eq 'WAIT::Table') { # Table knows about this
129     my $file = $self->{file};
130    
131     ! (!-e $file or unlink $file);
132     } else { # notify our database
133     croak ref($self)."::drop called directly";
134     }
135     }
136    
137     sub open {
138     my $self = shift;
139     my $file = $self->{file};
140    
141     if (defined $self->{dbh}) {
142     $self->{dbh};
143     } else {
144     $self->{func} =
145     eval sprintf("sub {grep /./, %s}", _filtergen(@{$self->{filter}}));
146     $self->{dbh} = tie(%{$self->{db}}, 'DB_File', $file,
147     $self->{mode}, 0664, $DB_BTREE);
148     $self->{cache} = {}
149     if $self->{mode} & O_RDWR;
150     $self->{cdict} = {}
151     if $self->{mode} & O_RDWR;
152     $self->{cached} = 0;
153     }
154     }
155    
156     sub insert {
157     my $self = shift;
158     my $key = shift;
159     my %occ;
160    
161     defined $self->{db} or $self->open;
162     grep $occ{$_}++, &{$self->{func}}(@_);
163     my ($word, $noc);
164     $self->{records}++;
165     while (($word, $noc) = each %occ) {
166     if (defined $self->{cache}->{$word}) {
167     $self->{cdict}->{$O,$word}++;
168     $self->{cache}->{$word} .= pack 'w2', $key, $noc;
169     } else {
170     $self->{cdict}->{$O,$word} = 1;
171     $self->{cache}->{$word} = pack 'w2', $key, $noc;
172     }
173     $self->{cached}++;
174     }
175     # This cache limit should be configurable
176     $self->sync if $self->{cached} > 100_000;
177     my $maxtf = 0;
178     for (values %occ) {
179     $maxtf = $_ if $_ > $maxtf;
180     }
181     $self->{db}->{$M, $key} = $maxtf;
182     }
183    
184     # We sort postings by increasing max term frequency (~ by increasing
185     # document length. This reduces the quality degradation if we process
186     # only the first part of a posting list.
187    
188     sub sort_postings {
189     my $self = shift;
190     my $post = shift; # reference to a hash or packed string
191    
192     if (ref $post) {
193     # we skip the sort part, if the index is not sorted
194     return pack('w*', %$post) unless $self->{reorg};
195     } else {
196     $post = { unpack 'w*', $post };
197     }
198    
199     my $r = '';
200    
201     # Sort posting list by increasing ratio of maximum term frequency (~
202     # "document length") and term frequency. This rati multipied by the
203     # inverse document frequence gives the score for a term. This sort
204     # order can be exploited for tuning of single term queries.
205    
206     for my $did (sort { $post->{$b} / $self->{db}->{$M, $b}
207     <=>
208     $post->{$a} / $self->{db}->{$M, $a}
209     } keys %$post) {
210     $r .= pack 'w2', $did, $post->{$did};
211     }
212     #warn sprintf "reorg %d %s\n", scalar keys %$post, join ' ', unpack 'w*', $r;
213     $r;
214     }
215    
216     sub delete {
217     my $self = shift;
218     my $key = shift;
219     my %occ;
220    
221     my $db;
222     defined $self->{db} or $self->open;
223     $db = $self->{db};
224     $self->sync;
225     $self->{records}--;
226    
227     # less than zero documents in database?
228     _complain('delete of document', $key) and $self->{records} = 0
229     if $self->{records} < 0;
230    
231     grep $occ{$_}++, &{$self->{func}}(@_);
232    
233     for (keys %occ) {# may reorder posting list
234     my %post = unpack 'w*', $db->{$_};
235     delete $post{$key};
236     $db->{$_} = $self->sort_postings(\%post);
237     _complain('delete of term', $_) if $db->{$O,$_}-1 != keys %post;
238     $db->{$O,$_} = scalar keys %post;
239     }
240     delete $db->{$M, $key};
241     }
242    
243     sub intervall {
244     my ($self, $first, $last) = @_;
245     my $value = '';
246     my $word = '';
247     my @result;
248    
249     return unless exists $self->{'intervall'};
250    
251     defined $self->{db} or $self->open;
252     $self->sync;
253     my $dbh = $self->{dbh}; # for convenience
254    
255     if (ref $self->{'intervall'}) {
256     unless (exists $self->{'ifunc'}) {
257     $self->{'ifunc'} =
258     eval sprintf("sub {grep /./, %s}", _filtergen(@{$self->{intervall}}));
259     }
260     ($first) = &{$self->{'ifunc'}}($first) if $first;
261     ($last) = &{$self->{'ifunc'}}($last) if $last;
262     }
263     if (defined $first and $first ne '') { # set the cursor to $first
264     $dbh->seq($first, $value, R_CURSOR);
265     } else {
266     $dbh->seq($first, $value, R_FIRST);
267     }
268     # We assume that word do not start with the character \377
269     # $last = pack 'C', 0xff unless defined $last and $last ne '';
270     return () if defined $last and $first gt $last; # $first would be after the last word
271    
272     push @result, $first;
273     while (!$dbh->seq($word, $value, R_NEXT)) {
274     # We should limit this to a "resonable" number of words
275     last if (defined $last and $word gt $last) or $word =~ /^($M|$O)/o;
276     push @result, $word;
277     }
278     \@result; # speed
279     }
280    
281     sub prefix {
282     my ($self, $prefix) = @_;
283     my $value = '';
284     my $word = '';
285     my @result;
286    
287     return () unless defined $prefix; # Full dictionary requested !!
288     return unless exists $self->{'prefix'};
289     defined $self->{db} or $self->open;
290     $self->sync;
291     my $dbh = $self->{dbh};
292    
293     if (ref $self->{'prefix'}) {
294     unless (exists $self->{'pfunc'}) {
295     $self->{'pfunc'} =
296     eval sprintf("sub {grep /./, %s}", _filtergen(@{$self->{prefix}}));
297     }
298     ($prefix) = &{$self->{'pfunc'}}($prefix);
299     }
300    
301     if ($dbh->seq($word = $prefix, $value, R_CURSOR)) {
302     return ();
303     }
304     return () if $word !~ /^$prefix/;
305     push @result, $word;
306    
307     while (!$dbh->seq($word, $value, R_NEXT)) {
308     # We should limit this to a "resonable" number of words
309     last if $word !~ /^$prefix/;
310     push @result, $word;
311     }
312     \@result; # speed
313     }
314    
315     =head2 search($query)
316    
317     The search method supports a range of search algorithms. It is
318     recommended to tune the index by calling
319     C<$table-E<gt>set(top=E<gt>1)> B<after> bulk inserting the documents
320     into the table. This is a computing intense operation and all inserts
321     and deletes after this optimization are slightly more expensive. Once
322     reorganized, the index is kept sorted automatically until you switch
323     the optimization off by calling C<$table-E<gt>set(top=E<gt>0)>.
324    
325     When searching a tuned index, a query can be processed faster if the
326     caller requests only the topmost documents. This can be done by
327     passing a C<top =E<gt>> I<n> parameter to the search method.
328    
329     For single term queries, the method returns only the I<n> top ranking
330     documents. For multi term queries two optimized algorithms are
331     available. The first algorithm computes the top n documents
332     approximately but very fast, sacrificing a little bit of precision for
333     speed. The second algorithm computes the topmost I<n> documents
334     precisely. This algorithm is slower and should be used only for small
335     values of I<n>. It can be requested by passing the query attribute
336     C<picky =E<gt> 1>. Both algorithms may return more than I<n> hits.
337     While the picky version might not be faster than the brute force
338     version on average for modest size databases it uses less memory and
339     the processing time is almost linear in the number of query terms, not
340     in the size of the lists.
341    
342     =cut
343    
344     sub search {
345     my $self = shift;
346     my $query = shift;
347    
348     defined $self->{db} or $self->open;
349     $self->sync;
350     $self->search_raw($query, &{$self->{func}}(@_)); # No call to parse() here
351     }
352    
353     sub parse {
354     my $self = shift;
355    
356     defined $self->{db} or $self->open;
357     &{$self->{func}}(@_);
358     }
359    
360     sub keys {
361     my $self = shift;
362    
363     defined $self->{db} or $self->open;
364     keys %{$self->{db}};
365     }
366    
367     sub search_prefix {
368     my $self = shift;
369    
370     # print "search_prefix(@_)\n";
371     defined $self->{db} or $self->open;
372     $self->search_raw(map($self->prefix($_), @_));
373     }
374    
375     sub _complain ($$) {
376     my ($action, $term) = @_;
377    
378     require Carp;
379     Carp::cluck
380     (sprintf("WAIT database inconsistency during $action [%s]: ".
381     "Please rebuild index\n",
382     $term,));
383     }
384    
385     sub search_raw {
386     my $self = shift;
387     my $query = shift;
388     my %score;
389    
390     # Top $wanted documents must be correct. Zero means all matching
391     # documents.
392     my $wanted = $query->{top};
393     my $strict = $query->{picky};
394    
395     # Return at least $minacc documents. Zero means all matching
396     # documents.
397     # my $minacc = $query->{accus} || $wanted;
398    
399     # Open index and flush cache if necessary
400     defined $self->{db} or $self->open;
401     $self->sync;
402    
403     # We keep duplicates
404     my @terms =
405     # Sort words by decreasing document frequency
406     sort { $self->{db}->{$O,$a} <=> $self->{db}->{$O,$b} }
407     # check which words occur in the index.
408     grep { $self->{db}->{$O,$_} } @_;
409    
410     return () unless @terms; # nothing to search for
411    
412     # We special-case one term queries here. If the index was sorted,
413     # choping off the rest of the list will return the same ranking.
414     if ($wanted and @terms == 1) {
415     my $term = shift @terms;
416     my $idf = log($self->{records}/$self->{db}->{$O,$term});
417     my @res;
418    
419     if ($self->{reorg}) { # or not $query->{picky}
420     @res = unpack "w". int(2*$wanted), $self->{db}->{$term};
421     } else {
422     @res = unpack 'w*', $self->{db}->{$term};
423     }
424    
425     for (my $i=1; $i<@res; $i+=2) {
426     $res[$i] /= $self->{db}->{$M, $res[$i-1]} / $idf;
427     }
428    
429     return @res
430     }
431    
432     # We separate exhaustive search here to avoid overhead and make the
433     # code more readable. The block can be removed without changing the
434     # result.
435     unless ($wanted) {
436     for (@terms) {
437     my $df = $self->{db}->{$O,$_};
438    
439     # The frequency *must* be 1 at least since the posting list is nonempty
440     _complain('search for term', $_) and $df = 1 if $df < 1;
441    
442     # Unpack posting list for current query term $_
443     my %post = unpack 'w*', $self->{db}->{$_};
444    
445     _complain('search for term', $_) if $self->{db}->{$O,$_} != keys %post;
446     # This is the inverse document frequency. The log of the inverse
447     # fraction of documents the term occurs in.
448     my $idf = log($self->{records}/$df);
449     for my $did (keys %post) {
450     if (my $freq = $self->{db}->{$M, $did}) {
451     $score{$did} += $post{$did} / $freq * $idf;
452     }
453     }
454     }
455     # warn sprintf "Used %d accumulators\n", scalar keys %score;
456     return %score;
457     }
458    
459     # A sloppy but fast algorithm for multiple term queries.
460     unless ($strict) {
461     for (@terms) {
462     # Unpack posting list for current query term $_
463     my %post = unpack 'w*', $self->{db}->{$_};
464    
465     # Lookup the number of documents the term occurs in (document frequency)
466     my $occ = $self->{db}->{$O,$_};
467    
468     _complain('search for term', $_) if $self->{db}->{$O,$_} != keys %post;
469     # The frequency *must* be 1 at least since the posting list is nonempty
470     _complain('search for term', $_) and $occ = 1 if $occ < 1;
471    
472     # This is the inverse document frequency. The log of the inverse
473     # fraction of documents the term occurs in.
474     my $idf = log($self->{records}/$occ);
475    
476     # If we have a reasonable number of accumulators, change the
477     # loop to iterate over the accumulators. This will compromise
478     # quality for better speed. The algorithm still computes the
479     # exact weights, but the result is not guaranteed to contain the
480     # *best* results. The database might contain documents better
481     # than the worst returned document.
482    
483     # We process the lists in order of increasing length. When the
484     # number of accumulators exceeds $wanted, no new documents are
485     # added, only the ranking/weighting of the seen documents is
486     # improved. The resulting ranking list must be pruned, since only
487     # the top most documents end up near their "optimal" rank.
488    
489     if (keys %score < $wanted) {
490     for my $did (keys %post) {
491     if (my $freq = $self->{db}->{$M, $did}) {
492     $score{$did} += $post{$did} / $freq * $idf;
493     }
494     }
495     } else {
496     for my $did (keys %score) {
497     next unless exists $post{$did};
498     if (my $freq = $self->{db}->{$M, $did}) {
499     $score{$did} += $post{$did} / $freq * $idf;
500     }
501     }
502     }
503     }
504     return %score;
505     }
506     my @max; $max[$#terms+1]=0;
507     my @idf;
508    
509     # Preparation loop. This extra loop makes sense only when "reorg"
510     # and "wanted" are true. But at the time beeing, keeping the code
511     # for the different search algorithms in one place seems more
512     # desirable than some minor speedup of the brute force version. We
513     # do cache $idf though.
514    
515     for (my $i = $#terms; $i >=0; $i--) {
516     local $_ = $terms[$i];
517     # Lookup the number of documents the term occurs in (document frequency)
518     my $df = $self->{db}->{$O,$_};
519    
520     # The frequency *must* be 1 at least since the posting list is nonempty
521     _complain('search for term', $_) and $df = 1 if $df < 1;
522    
523     # This is the inverse document frequency. The log of the inverse
524     # fraction of documents the term occurs in.
525     $idf[$i] = log($self->{records}/$df);
526    
527     my ($did,$occ);
528     if ($self->{reorg}) {
529     ($did,$occ) = unpack 'w2', $self->{db}->{$_};
530     } else { # Maybe this costs more than it helps
531     ($did,$occ) = unpack 'w2', $self->sort_postings($self->{db}->{$_});
532     }
533     my $freq = $self->{db}->{$M, $did};
534     my $max = $occ/$freq*$idf[$i];
535     $max[$i] = $max + $max[$i+1];
536     }
537    
538     # Main loop
539     for my $i (0 .. $#terms) {
540     my $term = $terms[$i];
541     # Unpack posting list for current query term $term. We loose the
542     # sorting order because the assignment to a hash.
543     my %post = unpack 'w*', $self->{db}->{$term};
544    
545     _complain('search for term', $term)
546     if $self->{db}->{$O,$term} != keys %post;
547    
548     my $idf = $idf[$i];
549     my $full; # Need to process all postings
550     my $chop; # Score necessary to enter the ranking list
551    
552     if (# We know that wanted is true since we especial cased the
553     # exhaustive search.
554    
555     $wanted and
556    
557     # We did sort here if necessary in
558     # the preparation loop
559     # $self->{reorg} and
560    
561     scalar keys %score > $wanted) {
562     $chop = (sort { $b <=> $a } values %score)[$wanted];
563     $full = $max[$i] > $chop;
564     } else {
565     $full = 1;
566     }
567    
568     if ($full) {
569     # We need to inspect the full list. Either $wanted is not given,
570     # the index is not sorted, or we don't have enough accumulators
571     # yet.
572     if (defined $chop) {
573     # We might be able to avoid allocating accumulators
574     for my $did (keys %post) {
575     if (my $freq = $self->{db}->{$M, $did}) {
576     my $wgt = $post{$did} / $freq * $idf;
577     # We add an accumulator if $wgt exeeds $chop
578     if (exists $score{$did} or $wgt > $chop) {
579     $score{$did} += $wgt;
580     }
581     }
582     }
583     } else {
584     # Allocate acumulators for each seen document.
585     for my $did (keys %post) {
586     if (my $freq = $self->{db}->{$M, $did}) {
587     $score{$did} += $post{$did} / $freq * $idf;
588     }
589     }
590     }
591     } else {
592     # Update existing accumulators
593     for my $did (keys %score) {
594     next unless exists $post{$did};
595     if (my $freq = $self->{db}->{$M, $did}) {
596     $score{$did} += $post{$did} / $freq * $idf;
597     }
598     }
599     }
600     }
601     #warn sprintf "Used %d accumulators\n", scalar keys %score;
602     %score;
603     }
604    
605     sub set {
606     my ($self, $attr, $value) = @_;
607    
608     die "No such indexy attribute: '$attr'" unless $attr eq 'top';
609    
610     return delete $self->{reorg} if $value == 0;
611    
612     return if $self->{reorg}; # we are sorted already
613     return unless $self->{mode} & O_RDWR;
614     defined $self->{db} or $self->open;
615    
616     $self->sync;
617     while (my($key, $value) = each %{$self->{db}}) {
618     next if $key =~ /^\377[om]/;
619     $self->{db}->{$key} = $self->sort_postings($value);
620     }
621     $self->{reorg} = 1;
622     }
623    
624     sub sync {
625     my $self = shift;
626    
627     if ($self->{mode} & O_RDWR) {
628     print STDERR "Flushing $self->{cached} postings\n" if $self->{cached};
629     while (my($key, $value) = each %{$self->{cache}}) {
630     if ($self->{reorg}) {
631     $self->{db}->{$key} = $self->sort_postings($self->{db}->{$key}
632     . $value);
633     } else {
634     $self->{db}->{$key} .= $value;
635     }
636     }
637     while (my($key, $value) = each %{$self->{cdict}}) {
638     $self->{db}->{$key} = 0 unless $self->{db}->{$key};
639     $self->{db}->{$key} += $value;
640     }
641     $self->{cache} = {};
642     $self->{cdict} = {};
643     $self->{cached} = 0;
644     }
645     }
646    
647     sub close {
648     my $self = shift;
649    
650     if ($self->{dbh}) {
651     $self->sync;
652     delete $self->{dbh};
653     untie %{$self->{db}};
654     delete $self->{db};
655     delete $self->{func};
656     delete $self->{cache};
657     delete $self->{cached};
658     delete $self->{cdict};
659     delete $self->{pfunc} if defined $self->{pfunc};
660     delete $self->{ifunc} if defined $self->{ifunc};
661     delete $self->{xfunc} if defined $self->{xfunc};
662     }
663     }
664    
665     1;
666    

  ViewVC Help
Powered by ViewVC 1.1.26