/[wait]/trunk/lib/WAIT/InvertedIndex.pm
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Annotation of /trunk/lib/WAIT/InvertedIndex.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 11 - (hide annotations)
Fri Apr 28 15:41:10 2000 UTC (24 years ago) by unknown
Original Path: branches/CPAN/lib/WAIT/InvertedIndex.pm
File size: 8976 byte(s)
This commit was manufactured by cvs2svn to create branch 'CPAN'.
1 ulpfr 10 # -*- Mode: Perl -*-
2     # InvertedIndex.pm --
3     # ITIID : $ITI$ $Header $__Header$
4     # Author : Ulrich Pfeifer
5     # Created On : Thu Aug 8 13:05:10 1996
6     # Last Modified By: Ulrich Pfeifer
7     # Last Modified On: Sun Nov 22 18:44:42 1998
8     # Language : CPerl
9     # Status : Unknown, Use with caution!
10     #
11     # Copyright (c) 1996-1997, Ulrich Pfeifer
12     #
13    
14     package WAIT::InvertedIndex;
15     use strict;
16     use DB_File;
17     use Fcntl;
18     use WAIT::Filter;
19     use Carp;
20     use vars qw(%FUNC);
21    
22     my $O = pack('C', 0xff)."o"; # occurances
23     my $M = pack('C', 0xff)."m"; # maxtf
24    
25     sub new {
26     my $type = shift;
27     my %parm = @_;
28     my $self = {};
29    
30     $self->{file} = $parm{file} or croak "No file specified";
31     $self->{attr} = $parm{attr} or croak "No attributes specified";
32     $self->{filter} = $parm{filter};
33     $self->{'name'} = $parm{'name'};
34     $self->{records} = 0;
35     for (qw(intervall prefix)) {
36     if (exists $parm{$_}) {
37     if (ref $parm{$_}) {
38     $self->{$_} = [@{$parm{$_}}] # clone
39     } else {
40     $self->{$_} = $parm{$_}
41     }
42     }
43     }
44     bless $self, ref($type) || $type;
45     }
46    
47     sub name {$_[0]->{'name'}}
48    
49     sub _split_pos {
50     my ($text, $pos) = @{$_[0]};
51     my @result;
52    
53     $text =~ s/(^\s+)// and $pos += length($1);
54     while ($text =~ s/(^\S+)//) {
55     my $word = $1;
56     push @result, [$word, $pos];
57     $pos += length($word);
58     $text =~ s/(^\s+)// and $pos += length($1);
59     }
60     @result;
61     }
62    
63     sub _xfiltergen {
64     my $filter = pop @_;
65    
66     if ($filter eq 'stop') { # avoid the slow stopword elimination
67     return _xfiltergen(@_); # it's cheaper to look them up afterwards
68     }
69     if (@_) {
70     if ($filter =~ /^split(\d*)/) {
71     if ($1) {
72     "grep(length(\$_->[0])>=$1, map(&WAIT::Filter::split_pos(\$_), " . _xfiltergen(@_) .'))' ;
73     } else {
74     "map(&WAIT::Filter::split_pos(\$_), " . _xfiltergen(@_) .')' ;
75     }
76     } else {
77     "map ([&WAIT::Filter::$filter(\$_->[0]), \$_->[1]]," ._xfiltergen(@_) .')';
78     }
79     } else {
80     if ($filter =~ /^split(\d*)/) {
81     if ($1) {
82     "grep(length(\$_->[0])>=$1, map(&WAIT::Filter::split_pos(\$_), [\$_[0], 0]))" ;
83     } else {
84     "map(&WAIT::Filter::split_pos(\$_), [\$_[0], 0])" ;
85     }
86     } else {
87     "map ([&WAIT::Filter::$filter(\$_->[0]), \$_->[1]], [\$_[0], 0])";
88     }
89     }
90     }
91    
92     sub parse_pos {
93     my $self = shift;
94    
95     unless (exists $self->{xfunc}) {
96     $self->{xfunc} =
97     eval sprintf("sub {%s}", _xfiltergen(@{$self->{filter}}));
98     #printf "\nsub{%s}$@\n", _xfiltergen(@{$self->{filter}});
99     }
100     &{$self->{xfunc}}($_[0]);
101     }
102    
103     sub _filtergen {
104     my $filter = pop @_;
105    
106     if (@_) {
107     "map(&WAIT::Filter::$filter(\$_), " . _filtergen(@_) . ')';
108     } else {
109     "map(&WAIT::Filter::$filter(\$_), \@_)";
110     }
111     }
112    
113     sub drop {
114     my $self = shift;
115     if ((caller)[0] eq 'WAIT::Table') { # Table knows about this
116     my $file = $self->{file};
117    
118     ! (!-e $file or unlink $file);
119     } else { # notify our database
120     croak ref($self)."::drop called directly";
121     }
122     }
123    
124     sub open {
125     my $self = shift;
126     my $file = $self->{file};
127    
128     if (defined $self->{dbh}) {
129     $self->{dbh};
130     } else {
131     $self->{func} =
132     eval sprintf("sub {grep /./, %s}", _filtergen(@{$self->{filter}}));
133     $self->{dbh} = tie(%{$self->{db}}, 'DB_File', $file,
134     $self->{mode}, 0664, $DB_BTREE);
135     # tie(%{$self->{cache}}, 'DB_File', undef,
136     # $self->{mode}, 0664, $DB_BTREE)
137     $self->{cache} = {}
138     if $self->{mode} & O_RDWR;
139     # tie(%{$self->{cdict}}, 'DB_File', undef,
140     # $self->{mode}, 0664, $DB_BTREE)
141     $self->{cdict} = {}
142     if $self->{mode} & O_RDWR;
143     $self->{cached} = 0;
144     }
145     }
146    
147     sub insert {
148     my $self = shift;
149     my $key = shift;
150     my %occ;
151    
152     defined $self->{db} or $self->open;
153     grep $occ{$_}++, &{$self->{func}}(@_);
154     my ($word, $noc);
155     $self->{records}++;
156     while (($word, $noc) = each %occ) {
157     if (defined $self->{cache}->{$word}) {
158     $self->{cdict}->{$O,$word}++;
159     $self->{cache}->{$word} .= pack 'w2', $key, $noc;
160     } else {
161     $self->{cdict}->{$O,$word} = 1;
162     $self->{cache}->{$word} = pack 'w2', $key, $noc;
163     }
164     $self->{cached}++;
165     }
166     $self->sync if $self->{cached} > 100_000;
167     my $maxtf = 0;
168     for (values %occ) {
169     $maxtf = $_ if $_ > $maxtf;
170     }
171     $self->{db}->{$M, $key} = $maxtf;
172     }
173    
174     sub delete {
175     my $self = shift;
176     my $key = shift;
177     my %occ;
178    
179     defined $self->{db} or $self->open;
180     $self->sync;
181     $self->{records}--;
182     grep $occ{$_}++, &{$self->{func}}(@_);
183     for (keys %occ) {
184     # may reorder posting list
185     my %post = unpack 'w*', $self->{db}->{$_};
186     $self->{db}->{$O,$_}--;
187     delete $post{$key};
188     $self->{db}->{$_} = pack 'w*', %post;
189     }
190     delete $self->{db}->{$M, $key};
191     }
192    
193     sub intervall {
194     my ($self, $first, $last) = @_;
195     my $value = '';
196     my $word = '';
197     my @result;
198    
199     return unless exists $self->{'intervall'};
200    
201     defined $self->{db} or $self->open;
202     $self->sync;
203     my $dbh = $self->{dbh}; # for convenience
204    
205     if (ref $self->{'intervall'}) {
206     unless (exists $self->{'ifunc'}) {
207     $self->{'ifunc'} =
208     eval sprintf("sub {grep /./, %s}", _filtergen(@{$self->{intervall}}));
209     }
210     ($first) = &{$self->{'ifunc'}}($first) if $first;
211     ($last) = &{$self->{'ifunc'}}($last) if $last;
212     }
213     if (defined $first and $first ne '') { # set the cursor to $first
214     $dbh->seq($first, $value, R_CURSOR);
215     } else {
216     $dbh->seq($first, $value, R_FIRST);
217     }
218     # We assume that word do not start with the character \377
219     # $last = pack 'C', 0xff unless defined $last and $last ne '';
220     return () if defined $last and $first gt $last; # $first would be after the last word
221    
222     push @result, $first;
223     while (!$dbh->seq($word, $value, R_NEXT)) {
224     # We should limit this to a "resonable" number of words
225     last if (defined $last and $word gt $last) or $word =~ /^($M|$O)/o;
226     push @result, $word;
227     }
228     \@result; # speed
229     }
230    
231     sub prefix {
232     my ($self, $prefix) = @_;
233     my $value = '';
234     my $word = '';
235     my @result;
236    
237     return () unless defined $prefix; # Full dictionary requested !!
238     return unless exists $self->{'prefix'};
239     defined $self->{db} or $self->open;
240     $self->sync;
241     my $dbh = $self->{dbh};
242    
243     if (ref $self->{'prefix'}) {
244     unless (exists $self->{'pfunc'}) {
245     $self->{'pfunc'} =
246     eval sprintf("sub {grep /./, %s}", _filtergen(@{$self->{prefix}}));
247     }
248     ($prefix) = &{$self->{'pfunc'}}($prefix);
249     }
250    
251     if ($dbh->seq($word = $prefix, $value, R_CURSOR)) {
252     return ();
253     }
254     return () if $word !~ /^$prefix/;
255     push @result, $word;
256    
257     while (!$dbh->seq($word, $value, R_NEXT)) {
258     # We should limit this to a "resonable" number of words
259     last if $word !~ /^$prefix/;
260     push @result, $word;
261     }
262     \@result; # speed
263     }
264    
265     sub search {
266     my $self = shift;
267    
268     defined $self->{db} or $self->open;
269     $self->sync;
270     $self->search_raw(&{$self->{func}}(@_)); # No call to parse() here
271     }
272    
273     sub parse {
274     my $self = shift;
275    
276     defined $self->{db} or $self->open;
277     &{$self->{func}}(@_);
278     }
279    
280     sub search_prefix {
281     my $self = shift;
282    
283     # print "search_prefix(@_)\n";
284     defined $self->{db} or $self->open;
285     $self->search_raw(map($self->prefix($_), @_));
286     }
287    
288     sub search_raw {
289     my $self = shift;
290     my %occ;
291     my %score;
292    
293     return () unless @_;
294    
295     defined $self->{db} or $self->open;
296     $self->sync;
297     grep $occ{$_}++, @_;
298     for (keys %occ) {
299     if (defined $self->{db}->{$_}) {
300     my %post = unpack 'w*', $self->{db}->{$_};
301     my $idf = log($self->{records}/$self->{db}->{$O,$_});
302     my $did;
303     for $did (keys %post) {
304     $score{$did} = 0 unless defined $score{$did}; # perl -w
305     $score{$did} += $post{$did} / $self->{db}->{$M, $did} * $idf
306     if $self->{db}->{$M, $did}; # db may be broken
307     }
308     }
309     }
310     %score;
311     }
312    
313     sub sync {
314     my $self = shift;
315    
316     if ($self->{mode} & O_RDWR) {
317     print STDERR "\aFlushing $self->{cached} postings\n";
318     while (my($key, $value) = each %{$self->{cache}}) {
319     $self->{db}->{$key} .= $value;
320     #delete $self->{cache}->{$key};
321     }
322     while (my($key, $value) = each %{$self->{cdict}}) {
323     $self->{db}->{$key} = 0 unless $self->{db}->{$key};
324     $self->{db}->{$key} += $value;
325     #delete $self->{cdict}->{$key};
326     }
327     $self->{cache} = {};
328     $self->{cdict} = {};
329     # print STDERR "*** $self->{cache} ", tied(%{$self->{cache}}), "==\n";
330     $self->{cached} = 0;
331     # $self->{dbh}->sync if $self->{dbh};
332     }
333     }
334    
335     sub close {
336     my $self = shift;
337    
338     if ($self->{dbh}) {
339     $self->sync;
340     delete $self->{dbh};
341     untie %{$self->{db}};
342     delete $self->{db};
343     delete $self->{func};
344     delete $self->{cache};
345     delete $self->{cached};
346     delete $self->{cdict};
347     delete $self->{pfunc} if defined $self->{pfunc};
348     delete $self->{ifunc} if defined $self->{ifunc};
349     delete $self->{xfunc} if defined $self->{xfunc};
350     }
351     }
352    
353     1;
354    

Properties

Name Value
cvs2svn:cvs-rev 1.1

  ViewVC Help
Powered by ViewVC 1.1.26