/[wait]/cvs-head/lib/WAIT/InvertedIndex.pm
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Annotation of /cvs-head/lib/WAIT/InvertedIndex.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 13 - (hide annotations)
Fri Apr 28 15:42:44 2000 UTC (24 years ago) by ulpfr
Original Path: branches/CPAN/lib/WAIT/InvertedIndex.pm
File size: 9201 byte(s)
Import of WAIT-1.710

1 ulpfr 13 # -*- Mode: Cperl -*-
2     # InvertedIndex.pm --
3 ulpfr 10 # ITIID : $ITI$ $Header $__Header$
4     # Author : Ulrich Pfeifer
5     # Created On : Thu Aug 8 13:05:10 1996
6     # Last Modified By: Ulrich Pfeifer
7     # Last Modified On: Sun Nov 22 18:44:42 1998
8     # Language : CPerl
9     # Status : Unknown, Use with caution!
10 ulpfr 13 #
11 ulpfr 10 # Copyright (c) 1996-1997, Ulrich Pfeifer
12 ulpfr 13 #
13 ulpfr 10
14     package WAIT::InvertedIndex;
15     use strict;
16     use DB_File;
17     use Fcntl;
18     use WAIT::Filter;
19     use Carp;
20     use vars qw(%FUNC);
21    
22     my $O = pack('C', 0xff)."o"; # occurances
23     my $M = pack('C', 0xff)."m"; # maxtf
24    
25     sub new {
26     my $type = shift;
27     my %parm = @_;
28     my $self = {};
29    
30     $self->{file} = $parm{file} or croak "No file specified";
31     $self->{attr} = $parm{attr} or croak "No attributes specified";
32     $self->{filter} = $parm{filter};
33     $self->{'name'} = $parm{'name'};
34     $self->{records} = 0;
35     for (qw(intervall prefix)) {
36     if (exists $parm{$_}) {
37     if (ref $parm{$_}) {
38     $self->{$_} = [@{$parm{$_}}] # clone
39     } else {
40     $self->{$_} = $parm{$_}
41     }
42     }
43     }
44     bless $self, ref($type) || $type;
45     }
46    
47     sub name {$_[0]->{'name'}}
48    
49     sub _split_pos {
50     my ($text, $pos) = @{$_[0]};
51     my @result;
52    
53     $text =~ s/(^\s+)// and $pos += length($1);
54     while ($text =~ s/(^\S+)//) {
55     my $word = $1;
56     push @result, [$word, $pos];
57     $pos += length($word);
58     $text =~ s/(^\s+)// and $pos += length($1);
59     }
60     @result;
61     }
62    
63     sub _xfiltergen {
64     my $filter = pop @_;
65    
66 ulpfr 13 # Oops, we cannot overrule the user's choice. Other filters may kill
67     # stopwords, such as isotr clobbers "isn't" to "isnt".
68    
69     # if ($filter eq 'stop') { # avoid the slow stopword elimination
70     # return _xfiltergen(@_); # it's cheaper to look them up afterwards
71     # }
72 ulpfr 10 if (@_) {
73     if ($filter =~ /^split(\d*)/) {
74     if ($1) {
75     "grep(length(\$_->[0])>=$1, map(&WAIT::Filter::split_pos(\$_), " . _xfiltergen(@_) .'))' ;
76     } else {
77     "map(&WAIT::Filter::split_pos(\$_), " . _xfiltergen(@_) .')' ;
78     }
79     } else {
80     "map ([&WAIT::Filter::$filter(\$_->[0]), \$_->[1]]," ._xfiltergen(@_) .')';
81     }
82     } else {
83     if ($filter =~ /^split(\d*)/) {
84     if ($1) {
85     "grep(length(\$_->[0])>=$1, map(&WAIT::Filter::split_pos(\$_), [\$_[0], 0]))" ;
86     } else {
87     "map(&WAIT::Filter::split_pos(\$_), [\$_[0], 0])" ;
88     }
89     } else {
90     "map ([&WAIT::Filter::$filter(\$_->[0]), \$_->[1]], [\$_[0], 0])";
91     }
92     }
93     }
94    
95     sub parse_pos {
96     my $self = shift;
97    
98     unless (exists $self->{xfunc}) {
99     $self->{xfunc} =
100     eval sprintf("sub {%s}", _xfiltergen(@{$self->{filter}}));
101     #printf "\nsub{%s}$@\n", _xfiltergen(@{$self->{filter}});
102     }
103     &{$self->{xfunc}}($_[0]);
104     }
105    
106     sub _filtergen {
107     my $filter = pop @_;
108    
109     if (@_) {
110     "map(&WAIT::Filter::$filter(\$_), " . _filtergen(@_) . ')';
111     } else {
112     "map(&WAIT::Filter::$filter(\$_), \@_)";
113     }
114     }
115    
116     sub drop {
117     my $self = shift;
118     if ((caller)[0] eq 'WAIT::Table') { # Table knows about this
119     my $file = $self->{file};
120    
121     ! (!-e $file or unlink $file);
122     } else { # notify our database
123     croak ref($self)."::drop called directly";
124     }
125     }
126    
127     sub open {
128     my $self = shift;
129     my $file = $self->{file};
130    
131     if (defined $self->{dbh}) {
132     $self->{dbh};
133     } else {
134     $self->{func} =
135     eval sprintf("sub {grep /./, %s}", _filtergen(@{$self->{filter}}));
136     $self->{dbh} = tie(%{$self->{db}}, 'DB_File', $file,
137     $self->{mode}, 0664, $DB_BTREE);
138     # tie(%{$self->{cache}}, 'DB_File', undef,
139     # $self->{mode}, 0664, $DB_BTREE)
140     $self->{cache} = {}
141     if $self->{mode} & O_RDWR;
142     # tie(%{$self->{cdict}}, 'DB_File', undef,
143     # $self->{mode}, 0664, $DB_BTREE)
144     $self->{cdict} = {}
145     if $self->{mode} & O_RDWR;
146     $self->{cached} = 0;
147     }
148     }
149    
150     sub insert {
151     my $self = shift;
152     my $key = shift;
153     my %occ;
154 ulpfr 13
155 ulpfr 10 defined $self->{db} or $self->open;
156     grep $occ{$_}++, &{$self->{func}}(@_);
157     my ($word, $noc);
158     $self->{records}++;
159     while (($word, $noc) = each %occ) {
160     if (defined $self->{cache}->{$word}) {
161     $self->{cdict}->{$O,$word}++;
162     $self->{cache}->{$word} .= pack 'w2', $key, $noc;
163     } else {
164     $self->{cdict}->{$O,$word} = 1;
165     $self->{cache}->{$word} = pack 'w2', $key, $noc;
166 ulpfr 13 }
167 ulpfr 10 $self->{cached}++;
168     }
169     $self->sync if $self->{cached} > 100_000;
170     my $maxtf = 0;
171     for (values %occ) {
172     $maxtf = $_ if $_ > $maxtf;
173     }
174     $self->{db}->{$M, $key} = $maxtf;
175     }
176    
177     sub delete {
178     my $self = shift;
179     my $key = shift;
180     my %occ;
181    
182     defined $self->{db} or $self->open;
183     $self->sync;
184     $self->{records}--;
185     grep $occ{$_}++, &{$self->{func}}(@_);
186     for (keys %occ) {
187     # may reorder posting list
188     my %post = unpack 'w*', $self->{db}->{$_};
189     $self->{db}->{$O,$_}--;
190     delete $post{$key};
191     $self->{db}->{$_} = pack 'w*', %post;
192     }
193     delete $self->{db}->{$M, $key};
194     }
195    
196     sub intervall {
197     my ($self, $first, $last) = @_;
198     my $value = '';
199     my $word = '';
200     my @result;
201    
202     return unless exists $self->{'intervall'};
203    
204     defined $self->{db} or $self->open;
205     $self->sync;
206     my $dbh = $self->{dbh}; # for convenience
207    
208     if (ref $self->{'intervall'}) {
209     unless (exists $self->{'ifunc'}) {
210     $self->{'ifunc'} =
211     eval sprintf("sub {grep /./, %s}", _filtergen(@{$self->{intervall}}));
212     }
213     ($first) = &{$self->{'ifunc'}}($first) if $first;
214     ($last) = &{$self->{'ifunc'}}($last) if $last;
215     }
216     if (defined $first and $first ne '') { # set the cursor to $first
217     $dbh->seq($first, $value, R_CURSOR);
218     } else {
219     $dbh->seq($first, $value, R_FIRST);
220     }
221     # We assume that word do not start with the character \377
222     # $last = pack 'C', 0xff unless defined $last and $last ne '';
223     return () if defined $last and $first gt $last; # $first would be after the last word
224    
225     push @result, $first;
226     while (!$dbh->seq($word, $value, R_NEXT)) {
227     # We should limit this to a "resonable" number of words
228     last if (defined $last and $word gt $last) or $word =~ /^($M|$O)/o;
229     push @result, $word;
230     }
231     \@result; # speed
232     }
233    
234     sub prefix {
235     my ($self, $prefix) = @_;
236     my $value = '';
237     my $word = '';
238     my @result;
239    
240     return () unless defined $prefix; # Full dictionary requested !!
241     return unless exists $self->{'prefix'};
242     defined $self->{db} or $self->open;
243     $self->sync;
244     my $dbh = $self->{dbh};
245    
246     if (ref $self->{'prefix'}) {
247     unless (exists $self->{'pfunc'}) {
248     $self->{'pfunc'} =
249     eval sprintf("sub {grep /./, %s}", _filtergen(@{$self->{prefix}}));
250     }
251     ($prefix) = &{$self->{'pfunc'}}($prefix);
252     }
253    
254     if ($dbh->seq($word = $prefix, $value, R_CURSOR)) {
255     return ();
256     }
257     return () if $word !~ /^$prefix/;
258     push @result, $word;
259    
260     while (!$dbh->seq($word, $value, R_NEXT)) {
261     # We should limit this to a "resonable" number of words
262     last if $word !~ /^$prefix/;
263     push @result, $word;
264     }
265     \@result; # speed
266     }
267    
268     sub search {
269     my $self = shift;
270    
271     defined $self->{db} or $self->open;
272     $self->sync;
273     $self->search_raw(&{$self->{func}}(@_)); # No call to parse() here
274     }
275    
276     sub parse {
277     my $self = shift;
278    
279     defined $self->{db} or $self->open;
280     &{$self->{func}}(@_);
281     }
282    
283 ulpfr 13 sub keys {
284     my $self = shift;
285    
286     defined $self->{db} or $self->open;
287     keys %{$self->{db}};
288     }
289    
290 ulpfr 10 sub search_prefix {
291     my $self = shift;
292    
293     # print "search_prefix(@_)\n";
294     defined $self->{db} or $self->open;
295     $self->search_raw(map($self->prefix($_), @_));
296     }
297    
298     sub search_raw {
299     my $self = shift;
300     my %occ;
301     my %score;
302    
303     return () unless @_;
304    
305     defined $self->{db} or $self->open;
306     $self->sync;
307     grep $occ{$_}++, @_;
308     for (keys %occ) {
309     if (defined $self->{db}->{$_}) {
310     my %post = unpack 'w*', $self->{db}->{$_};
311 ulpfr 13 my $idf = log($self->{records}/($self->{db}->{$O,$_} || 1));
312 ulpfr 10 my $did;
313     for $did (keys %post) {
314     $score{$did} = 0 unless defined $score{$did}; # perl -w
315     $score{$did} += $post{$did} / $self->{db}->{$M, $did} * $idf
316     if $self->{db}->{$M, $did}; # db may be broken
317     }
318     }
319     }
320     %score;
321     }
322    
323     sub sync {
324     my $self = shift;
325    
326     if ($self->{mode} & O_RDWR) {
327 ulpfr 13 print STDERR "Flushing $self->{cached} postings\n";
328 ulpfr 10 while (my($key, $value) = each %{$self->{cache}}) {
329     $self->{db}->{$key} .= $value;
330     #delete $self->{cache}->{$key};
331     }
332     while (my($key, $value) = each %{$self->{cdict}}) {
333     $self->{db}->{$key} = 0 unless $self->{db}->{$key};
334     $self->{db}->{$key} += $value;
335     #delete $self->{cdict}->{$key};
336     }
337     $self->{cache} = {};
338     $self->{cdict} = {};
339     # print STDERR "*** $self->{cache} ", tied(%{$self->{cache}}), "==\n";
340     $self->{cached} = 0;
341     # $self->{dbh}->sync if $self->{dbh};
342     }
343     }
344    
345     sub close {
346     my $self = shift;
347    
348     if ($self->{dbh}) {
349     $self->sync;
350     delete $self->{dbh};
351     untie %{$self->{db}};
352     delete $self->{db};
353     delete $self->{func};
354     delete $self->{cache};
355     delete $self->{cached};
356     delete $self->{cdict};
357     delete $self->{pfunc} if defined $self->{pfunc};
358     delete $self->{ifunc} if defined $self->{ifunc};
359     delete $self->{xfunc} if defined $self->{xfunc};
360     }
361     }
362    
363     1;
364    

Properties

Name Value
cvs2svn:cvs-rev 1.1.1.2

  ViewVC Help
Powered by ViewVC 1.1.26