1 |
# -*- Mode: Perl -*- |
# -*- Mode: Cperl -*- |
2 |
# InvertedIndex.pm -- |
# InvertedIndex.pm -- |
3 |
# ITIID : $ITI$ $Header $__Header$ |
# ITIID : $ITI$ $Header $__Header$ |
4 |
# Author : Ulrich Pfeifer |
# Author : Ulrich Pfeifer |
5 |
# Created On : Thu Aug 8 13:05:10 1996 |
# Created On : Thu Aug 8 13:05:10 1996 |
7 |
# Last Modified On: Sun Nov 22 18:44:42 1998 |
# Last Modified On: Sun Nov 22 18:44:42 1998 |
8 |
# Language : CPerl |
# Language : CPerl |
9 |
# Status : Unknown, Use with caution! |
# Status : Unknown, Use with caution! |
10 |
# |
# |
11 |
# Copyright (c) 1996-1997, Ulrich Pfeifer |
# Copyright (c) 1996-1997, Ulrich Pfeifer |
12 |
# |
# |
13 |
|
|
14 |
package WAIT::InvertedIndex; |
package WAIT::InvertedIndex; |
15 |
use strict; |
use strict; |
63 |
sub _xfiltergen { |
sub _xfiltergen { |
64 |
my $filter = pop @_; |
my $filter = pop @_; |
65 |
|
|
66 |
if ($filter eq 'stop') { # avoid the slow stopword elimination |
# Oops, we cannot overrule the user's choice. Other filters may kill |
67 |
return _xfiltergen(@_); # it's cheaper to look them up afterwards |
# stopwords, such as isotr clobbers "isn't" to "isnt". |
68 |
} |
|
69 |
|
# if ($filter eq 'stop') { # avoid the slow stopword elimination |
70 |
|
# return _xfiltergen(@_); # it's cheaper to look them up afterwards |
71 |
|
# } |
72 |
if (@_) { |
if (@_) { |
73 |
if ($filter =~ /^split(\d*)/) { |
if ($filter =~ /^split(\d*)/) { |
74 |
if ($1) { |
if ($1) { |
151 |
my $self = shift; |
my $self = shift; |
152 |
my $key = shift; |
my $key = shift; |
153 |
my %occ; |
my %occ; |
154 |
|
|
155 |
defined $self->{db} or $self->open; |
defined $self->{db} or $self->open; |
156 |
grep $occ{$_}++, &{$self->{func}}(@_); |
grep $occ{$_}++, &{$self->{func}}(@_); |
157 |
my ($word, $noc); |
my ($word, $noc); |
163 |
} else { |
} else { |
164 |
$self->{cdict}->{$O,$word} = 1; |
$self->{cdict}->{$O,$word} = 1; |
165 |
$self->{cache}->{$word} = pack 'w2', $key, $noc; |
$self->{cache}->{$word} = pack 'w2', $key, $noc; |
166 |
} |
} |
167 |
$self->{cached}++; |
$self->{cached}++; |
168 |
} |
} |
169 |
$self->sync if $self->{cached} > 100_000; |
$self->sync if $self->{cached} > 100_000; |
280 |
&{$self->{func}}(@_); |
&{$self->{func}}(@_); |
281 |
} |
} |
282 |
|
|
283 |
|
sub keys { |
284 |
|
my $self = shift; |
285 |
|
|
286 |
|
defined $self->{db} or $self->open; |
287 |
|
keys %{$self->{db}}; |
288 |
|
} |
289 |
|
|
290 |
sub search_prefix { |
sub search_prefix { |
291 |
my $self = shift; |
my $self = shift; |
292 |
|
|
308 |
for (keys %occ) { |
for (keys %occ) { |
309 |
if (defined $self->{db}->{$_}) { |
if (defined $self->{db}->{$_}) { |
310 |
my %post = unpack 'w*', $self->{db}->{$_}; |
my %post = unpack 'w*', $self->{db}->{$_}; |
311 |
my $idf = log($self->{records}/$self->{db}->{$O,$_}); |
my $idf = log($self->{records}/($self->{db}->{$O,$_} || 1)); |
312 |
my $did; |
my $did; |
313 |
for $did (keys %post) { |
for $did (keys %post) { |
314 |
$score{$did} = 0 unless defined $score{$did}; # perl -w |
$score{$did} = 0 unless defined $score{$did}; # perl -w |
324 |
my $self = shift; |
my $self = shift; |
325 |
|
|
326 |
if ($self->{mode} & O_RDWR) { |
if ($self->{mode} & O_RDWR) { |
327 |
print STDERR "\aFlushing $self->{cached} postings\n"; |
print STDERR "Flushing $self->{cached} postings\n"; |
328 |
while (my($key, $value) = each %{$self->{cache}}) { |
while (my($key, $value) = each %{$self->{cache}}) { |
329 |
$self->{db}->{$key} .= $value; |
$self->{db}->{$key} .= $value; |
330 |
#delete $self->{cache}->{$key}; |
#delete $self->{cache}->{$key}; |