lib/WAIT/InvertedIndex.pm

#                              -*- Mode: Cperl -*- 
# InvertedIndex.pm --
# ITIID           : $ITI$ $Header $__Header$
# Author          : Ulrich Pfeifer
# Created On      : Thu Aug  8 13:05:10 1996
# Last Modified By: Ulrich Pfeifer
# Last Modified On: Sun Nov 22 18:44:42 1998
# Language        : CPerl
# Status          : Unknown, Use with caution!
#
# Copyright (c) 1996-1997, Ulrich Pfeifer
#

package WAIT::InvertedIndex;
use strict;
use DB_File;
use Fcntl;
use WAIT::Filter;
use Carp;
use vars qw(%FUNC);

my $O = pack('C', 0xff)."o";                  # occurances
my $M = pack('C', 0xff)."m";                  # maxtf

sub new {
  my $type = shift;
  my %parm = @_;
  my $self = {};

  $self->{file}     = $parm{file}     or croak "No file specified";
  $self->{attr}     = $parm{attr}     or croak "No attributes specified";
  $self->{filter}   = $parm{filter};
  $self->{'name'}   = $parm{'name'};
  $self->{records}  = 0;
  for (qw(intervall prefix)) {
    if (exists $parm{$_}) {
      if (ref $parm{$_}) {
        $self->{$_} = [@{$parm{$_}}] # clone
      } else {
        $self->{$_} = $parm{$_}
      }
    }
  }
  bless $self, ref($type) || $type;
}

sub name {$_[0]->{'name'}}

sub _split_pos {
  my ($text, $pos) = @{$_[0]};
  my @result;

  $text =~ s/(^\s+)// and $pos += length($1);
  while ($text =~ s/(^\S+)//) {
    my $word = $1;
    push @result, [$word, $pos];
    $pos += length($word);
    $text =~ s/(^\s+)// and $pos += length($1);
  }
  @result;
}

sub _xfiltergen {
  my $filter = pop @_;

# Oops, we cannot overrule the user's choice. Other filters may kill
# stopwords, such as isotr clobbers "isn't" to "isnt".

#  if ($filter eq 'stop') {      # avoid the slow stopword elimination
#    return _xfiltergen(@_);            # it's cheaper to look them up afterwards
#  }
  if (@_) {
    if ($filter =~ /^split(\d*)/) {
      if ($1) {
        "grep(length(\$_->[0])>=$1, map(&WAIT::Filter::split_pos(\$_), " . _xfiltergen(@_) .'))' ;
      } else {
        "map(&WAIT::Filter::split_pos(\$_), " . _xfiltergen(@_) .')' ;
      }
    } else {
      "map ([&WAIT::Filter::$filter(\$_->[0]), \$_->[1]]," ._xfiltergen(@_) .')';
    }
  } else {
    if ($filter =~ /^split(\d*)/) {
      if ($1) {
        "grep(length(\$_->[0])>=$1, map(&WAIT::Filter::split_pos(\$_), [\$_[0], 0]))" ;
      } else {
        "map(&WAIT::Filter::split_pos(\$_), [\$_[0], 0])" ;
      }
    } else {
      "map ([&WAIT::Filter::$filter(\$_->[0]), \$_->[1]], [\$_[0], 0])";
    }
  }
}

sub parse_pos {
  my $self = shift;

  unless (exists $self->{xfunc}) {
    $self->{xfunc}     =
      eval sprintf("sub {%s}", _xfiltergen(@{$self->{filter}}));
    #printf "\nsub{%s}$@\n", _xfiltergen(@{$self->{filter}});
  }
  &{$self->{xfunc}}($_[0]);
}

sub _filtergen {
  my $filter = pop @_;

  if (@_) {
    "map(&WAIT::Filter::$filter(\$_), " . _filtergen(@_) . ')';
  } else {
    "map(&WAIT::Filter::$filter(\$_), \@_)";
  }
}

sub drop {
  my $self = shift;
  if ((caller)[0] eq 'WAIT::Table') { # Table knows about this
    my $file = $self->{file};

    ! (!-e $file or unlink $file);
  } else {                              # notify our database
    croak ref($self)."::drop called directly";
  }
}

sub open {
  my $self = shift;
  my $file = $self->{file};

  if (defined $self->{dbh}) {
    $self->{dbh};
  } else {
    $self->{func}     =
      eval sprintf("sub {grep /./, %s}", _filtergen(@{$self->{filter}}));
    $self->{dbh} = tie(%{$self->{db}}, 'DB_File', $file,
                       $self->{mode}, 0664, $DB_BTREE);
#    tie(%{$self->{cache}}, 'DB_File', undef,
#        $self->{mode}, 0664, $DB_BTREE)
    $self->{cache} = {}
      if $self->{mode} & O_RDWR;
#    tie(%{$self->{cdict}}, 'DB_File', undef,
#        $self->{mode}, 0664, $DB_BTREE)
    $self->{cdict} = {}
      if $self->{mode} & O_RDWR;
    $self->{cached} = 0;
  }
}

sub insert {
  my $self  = shift;
  my $key   = shift;
  my %occ;

  defined $self->{db} or $self->open;
  grep $occ{$_}++, &{$self->{func}}(@_);
  my ($word, $noc);
  $self->{records}++;
  while (($word, $noc) = each %occ) {
    if (defined $self->{cache}->{$word}) {
      $self->{cdict}->{$O,$word}++;
      $self->{cache}->{$word} .= pack 'w2', $key, $noc;
    } else {
      $self->{cdict}->{$O,$word} = 1;
      $self->{cache}->{$word}  = pack 'w2', $key, $noc;
    }
    $self->{cached}++;
  }
  $self->sync if $self->{cached} > 100_000;
  my $maxtf = 0;
  for (values %occ) {
    $maxtf = $_ if $_ > $maxtf;
  }
  $self->{db}->{$M, $key} = $maxtf;
}

sub delete {
  my $self  = shift;
  my $key   = shift;
  my %occ;

  defined $self->{db} or $self->open;
  $self->sync;
  $self->{records}--;
  grep $occ{$_}++, &{$self->{func}}(@_);
  for (keys %occ) {
    # may reorder posting list
    my %post = unpack 'w*', $self->{db}->{$_};
    $self->{db}->{$O,$_}--;
    delete $post{$key};
    $self->{db}->{$_} = pack 'w*', %post;
  }
  delete $self->{db}->{$M, $key};
}

sub intervall {
  my ($self, $first, $last) = @_;
  my $value = '';
  my $word  = '';
  my @result;

  return unless exists $self->{'intervall'};

  defined $self->{db} or $self->open;
  $self->sync;
  my $dbh = $self->{dbh};       # for convenience

  if (ref $self->{'intervall'}) {
    unless (exists $self->{'ifunc'}) {
      $self->{'ifunc'} =
        eval sprintf("sub {grep /./, %s}", _filtergen(@{$self->{intervall}}));
    }
    ($first) = &{$self->{'ifunc'}}($first) if $first;
    ($last)  = &{$self->{'ifunc'}}($last) if $last;
  }
  if (defined $first and $first ne '') {         # set the cursor to $first
    $dbh->seq($first, $value, R_CURSOR);
  } else {
    $dbh->seq($first, $value, R_FIRST);
  }
  # We assume that word do not start with the character \377
  # $last = pack 'C', 0xff unless defined $last and $last ne '';
  return () if defined $last and $first gt $last; # $first would be after the last word
  
  push @result, $first;
  while (!$dbh->seq($word, $value, R_NEXT)) {
    # We should limit this to a "resonable" number of words
    last if (defined $last and $word gt $last) or $word =~ /^($M|$O)/o;
    push @result, $word;
  }
  \@result;                     # speed
}

sub prefix {
  my ($self, $prefix) = @_;
  my $value = '';
  my $word  = '';
  my @result;

  return () unless defined $prefix; # Full dictionary requested !!
  return unless exists $self->{'prefix'};
  defined $self->{db} or $self->open;
  $self->sync;
  my $dbh = $self->{dbh};
  
  if (ref $self->{'prefix'}) {
    unless (exists $self->{'pfunc'}) {
      $self->{'pfunc'} =
        eval sprintf("sub {grep /./, %s}", _filtergen(@{$self->{prefix}}));
    }
    ($prefix) = &{$self->{'pfunc'}}($prefix);
  }

  if ($dbh->seq($word = $prefix, $value, R_CURSOR)) {
    return ();
  }
  return () if $word !~ /^$prefix/;
  push @result, $word;

  while (!$dbh->seq($word, $value, R_NEXT)) {
    # We should limit this to a "resonable" number of words
    last if $word !~ /^$prefix/;
    push @result, $word;
  }
  \@result;                     # speed
}

sub search {
  my $self  = shift;

  defined $self->{db} or $self->open;
  $self->sync;
  $self->search_raw(&{$self->{func}}(@_)); # No call to parse() here
}

sub parse {
  my $self  = shift;

  defined $self->{db} or $self->open;
  &{$self->{func}}(@_);
}

sub keys {
  my $self  = shift;

  defined $self->{db} or $self->open;
  keys %{$self->{db}};
}

sub search_prefix {
  my $self  = shift;

  # print "search_prefix(@_)\n";
  defined $self->{db} or $self->open;
  $self->search_raw(map($self->prefix($_), @_));
}

sub search_raw {
  my $self  = shift;
  my %occ;
  my %score;

  return () unless @_;

  defined $self->{db} or $self->open;
  $self->sync;
  grep $occ{$_}++, @_;
  for (keys %occ) {
    if (defined $self->{db}->{$_}) {
      my %post = unpack 'w*', $self->{db}->{$_};
      my $idf = log($self->{records}/($self->{db}->{$O,$_} || 1));
      my $did;
      for $did (keys %post) {
        $score{$did} = 0 unless defined $score{$did}; # perl -w 
        $score{$did} += $post{$did} / $self->{db}->{$M, $did} * $idf
          if $self->{db}->{$M, $did}; # db may be broken
      }
    }
  }
  %score;
}

sub sync {
  my $self = shift;

  if ($self->{mode} & O_RDWR) {
    print STDERR "Flushing $self->{cached} postings\n";
    while (my($key, $value) = each %{$self->{cache}}) {
      $self->{db}->{$key} .= $value;
      #delete $self->{cache}->{$key};
    }
    while (my($key, $value) = each %{$self->{cdict}}) {
      $self->{db}->{$key} = 0 unless  $self->{db}->{$key};
      $self->{db}->{$key} += $value;
      #delete $self->{cdict}->{$key};
    }
    $self->{cache} = {};
    $self->{cdict} = {};
    # print STDERR "*** $self->{cache} ", tied(%{$self->{cache}}), "==\n";
    $self->{cached} = 0;
    # $self->{dbh}->sync if $self->{dbh};
  }
}

sub close {
  my $self = shift;

  if ($self->{dbh}) {
    $self->sync;
    delete $self->{dbh};
    untie %{$self->{db}};
    delete $self->{db};
    delete $self->{func};
    delete $self->{cache};
    delete $self->{cached};
    delete $self->{cdict};
    delete $self->{pfunc} if defined $self->{pfunc};
    delete $self->{ifunc} if defined $self->{ifunc};
    delete $self->{xfunc} if defined $self->{xfunc};
  }
}

1;

1	ulpfr	13	# -- Mode: Cperl --
2			# InvertedIndex.pm --
3	ulpfr	10	# ITIID : $ITI$ $Header $__Header$
4			# Author : Ulrich Pfeifer
5			# Created On : Thu Aug 8 13:05:10 1996
6			# Last Modified By: Ulrich Pfeifer
7			# Last Modified On: Sun Nov 22 18:44:42 1998
8			# Language : CPerl
9			# Status : Unknown, Use with caution!
10	ulpfr	13	#
11	ulpfr	10	# Copyright (c) 1996-1997, Ulrich Pfeifer
12	ulpfr	13	#
13	ulpfr	10
14			package WAIT::InvertedIndex;
15			use strict;
16			use DB_File;
17			use Fcntl;
18			use WAIT::Filter;
19			use Carp;
20			use vars qw(%FUNC);
21
22			my $O = pack('C', 0xff)."o"; # occurances
23			my $M = pack('C', 0xff)."m"; # maxtf
24
25			sub new {
26			my $type = shift;
27			my %parm = @_;
28			my $self = {};
29
30			$self->{file} = $parm{file} or croak "No file specified";
31			$self->{attr} = $parm{attr} or croak "No attributes specified";
32			$self->{filter} = $parm{filter};
33			$self->{'name'} = $parm{'name'};
34			$self->{records} = 0;
35			for (qw(intervall prefix)) {
36			if (exists $parm{$_}) {
37			if (ref $parm{$_}) {
38			$self->{$_} = [@{$parm{$_}}] # clone
39			} else {
40			$self->{$_} = $parm{$_}
41			}
42			}
43			}
44			bless $self, ref($type) \|\| $type;
45			}
46
47			sub name {$_[0]->{'name'}}
48
49			sub _split_pos {
50			my ($text, $pos) = @{$_[0]};
51			my @result;
52
53			$text =~ s/(^\s+)// and $pos += length($1);
54			while ($text =~ s/(^\S+)//) {
55			my $word = $1;
56			push @result, [$word, $pos];
57			$pos += length($word);
58			$text =~ s/(^\s+)// and $pos += length($1);
59			}
60			@result;
61			}
62
63			sub _xfiltergen {
64			my $filter = pop @_;
65
66	ulpfr	13	# Oops, we cannot overrule the user's choice. Other filters may kill
67			# stopwords, such as isotr clobbers "isn't" to "isnt".
68
69			# if ($filter eq 'stop') { # avoid the slow stopword elimination
70			# return _xfiltergen(@_); # it's cheaper to look them up afterwards
71			# }
72	ulpfr	10	if (@_) {
73			if ($filter =~ /^split(\d*)/) {
74			if ($1) {
75			"grep(length(\$_->[0])>=$1, map(&WAIT::Filter::split_pos(\$_), " . _xfiltergen(@_) .'))' ;
76			} else {
77			"map(&WAIT::Filter::split_pos(\$_), " . _xfiltergen(@_) .')' ;
78			}
79			} else {
80			"map ([&WAIT::Filter::$filter(\$_->[0]), \$_->[1]]," ._xfiltergen(@_) .')';
81			}
82			} else {
83			if ($filter =~ /^split(\d*)/) {
84			if ($1) {
85			"grep(length(\$_->[0])>=$1, map(&WAIT::Filter::split_pos(\$_), [\$_[0], 0]))" ;
86			} else {
87			"map(&WAIT::Filter::split_pos(\$_), [\$_[0], 0])" ;
88			}
89			} else {
90			"map ([&WAIT::Filter::$filter(\$_->[0]), \$_->[1]], [\$_[0], 0])";
91			}
92			}
93			}
94
95			sub parse_pos {
96			my $self = shift;
97
98			unless (exists $self->{xfunc}) {
99			$self->{xfunc} =
100			eval sprintf("sub {%s}", _xfiltergen(@{$self->{filter}}));
101			#printf "\nsub{%s}$@\n", _xfiltergen(@{$self->{filter}});
102			}
103			&{$self->{xfunc}}($_[0]);
104			}
105
106			sub _filtergen {
107			my $filter = pop @_;
108
109			if (@_) {
110			"map(&WAIT::Filter::$filter(\$_), " . _filtergen(@_) . ')';
111			} else {
112			"map(&WAIT::Filter::$filter(\$_), \@_)";
113			}
114			}
115
116			sub drop {
117			my $self = shift;
118			if ((caller)[0] eq 'WAIT::Table') { # Table knows about this
119			my $file = $self->{file};
120
121			! (!-e $file or unlink $file);
122			} else { # notify our database
123			croak ref($self)."::drop called directly";
124			}
125			}
126
127			sub open {
128			my $self = shift;
129			my $file = $self->{file};
130
131			if (defined $self->{dbh}) {
132			$self->{dbh};
133			} else {
134			$self->{func} =
135			eval sprintf("sub {grep /./, %s}", _filtergen(@{$self->{filter}}));
136			$self->{dbh} = tie(%{$self->{db}}, 'DB_File', $file,
137			$self->{mode}, 0664, $DB_BTREE);
138			# tie(%{$self->{cache}}, 'DB_File', undef,
139			# $self->{mode}, 0664, $DB_BTREE)
140			$self->{cache} = {}
141			if $self->{mode} & O_RDWR;
142			# tie(%{$self->{cdict}}, 'DB_File', undef,
143			# $self->{mode}, 0664, $DB_BTREE)
144			$self->{cdict} = {}
145			if $self->{mode} & O_RDWR;
146			$self->{cached} = 0;
147			}
148			}
149
150			sub insert {
151			my $self = shift;
152			my $key = shift;
153			my %occ;
154	ulpfr	13
155	ulpfr	10	defined $self->{db} or $self->open;
156			grep $occ{$_}++, &{$self->{func}}(@_);
157			my ($word, $noc);
158			$self->{records}++;
159			while (($word, $noc) = each %occ) {
160			if (defined $self->{cache}->{$word}) {
161			$self->{cdict}->{$O,$word}++;
162			$self->{cache}->{$word} .= pack 'w2', $key, $noc;
163			} else {
164			$self->{cdict}->{$O,$word} = 1;
165			$self->{cache}->{$word} = pack 'w2', $key, $noc;
166	ulpfr	13	}
167	ulpfr	10	$self->{cached}++;
168			}
169			$self->sync if $self->{cached} > 100_000;
170			my $maxtf = 0;
171			for (values %occ) {
172			$maxtf = $_ if $_ > $maxtf;
173			}
174			$self->{db}->{$M, $key} = $maxtf;
175			}
176
177			sub delete {
178			my $self = shift;
179			my $key = shift;
180			my %occ;
181
182			defined $self->{db} or $self->open;
183			$self->sync;
184			$self->{records}--;
185			grep $occ{$_}++, &{$self->{func}}(@_);
186			for (keys %occ) {
187			# may reorder posting list
188			my %post = unpack 'w*', $self->{db}->{$_};
189			$self->{db}->{$O,$_}--;
190			delete $post{$key};
191			$self->{db}->{$_} = pack 'w*', %post;
192			}
193			delete $self->{db}->{$M, $key};
194			}
195
196			sub intervall {
197			my ($self, $first, $last) = @_;
198			my $value = '';
199			my $word = '';
200			my @result;
201
202			return unless exists $self->{'intervall'};
203
204			defined $self->{db} or $self->open;
205			$self->sync;
206			my $dbh = $self->{dbh}; # for convenience
207
208			if (ref $self->{'intervall'}) {
209			unless (exists $self->{'ifunc'}) {
210			$self->{'ifunc'} =
211			eval sprintf("sub {grep /./, %s}", _filtergen(@{$self->{intervall}}));
212			}
213			($first) = &{$self->{'ifunc'}}($first) if $first;
214			($last) = &{$self->{'ifunc'}}($last) if $last;
215			}
216			if (defined $first and $first ne '') { # set the cursor to $first
217			$dbh->seq($first, $value, R_CURSOR);
218			} else {
219			$dbh->seq($first, $value, R_FIRST);
220			}
221			# We assume that word do not start with the character \377
222			# $last = pack 'C', 0xff unless defined $last and $last ne '';
223			return () if defined $last and $first gt $last; # $first would be after the last word
224
225			push @result, $first;
226			while (!$dbh->seq($word, $value, R_NEXT)) {
227			# We should limit this to a "resonable" number of words
228			last if (defined $last and $word gt $last) or $word =~ /^($M\|$O)/o;
229			push @result, $word;
230			}
231			\@result; # speed
232			}
233
234			sub prefix {
235			my ($self, $prefix) = @_;
236			my $value = '';
237			my $word = '';
238			my @result;
239
240			return () unless defined $prefix; # Full dictionary requested !!
241			return unless exists $self->{'prefix'};
242			defined $self->{db} or $self->open;
243			$self->sync;
244			my $dbh = $self->{dbh};
245
246			if (ref $self->{'prefix'}) {
247			unless (exists $self->{'pfunc'}) {
248			$self->{'pfunc'} =
249			eval sprintf("sub {grep /./, %s}", _filtergen(@{$self->{prefix}}));
250			}
251			($prefix) = &{$self->{'pfunc'}}($prefix);
252			}
253
254			if ($dbh->seq($word = $prefix, $value, R_CURSOR)) {
255			return ();
256			}
257			return () if $word !~ /^$prefix/;
258			push @result, $word;
259
260			while (!$dbh->seq($word, $value, R_NEXT)) {
261			# We should limit this to a "resonable" number of words
262			last if $word !~ /^$prefix/;
263			push @result, $word;
264			}
265			\@result; # speed
266			}
267
268			sub search {
269			my $self = shift;
270
271			defined $self->{db} or $self->open;
272			$self->sync;
273			$self->search_raw(&{$self->{func}}(@_)); # No call to parse() here
274			}
275
276			sub parse {
277			my $self = shift;
278
279			defined $self->{db} or $self->open;
280			&{$self->{func}}(@_);
281			}
282
283	ulpfr	13	sub keys {
284			my $self = shift;
285
286			defined $self->{db} or $self->open;
287			keys %{$self->{db}};
288			}
289
290	ulpfr	10	sub search_prefix {
291			my $self = shift;
292
293			# print "search_prefix(@_)\n";
294			defined $self->{db} or $self->open;
295			$self->search_raw(map($self->prefix($_), @_));
296			}
297
298			sub search_raw {
299			my $self = shift;
300			my %occ;
301			my %score;
302
303			return () unless @_;
304
305			defined $self->{db} or $self->open;
306			$self->sync;
307			grep $occ{$_}++, @_;
308			for (keys %occ) {
309			if (defined $self->{db}->{$_}) {
310			my %post = unpack 'w*', $self->{db}->{$_};
311	ulpfr	13	my $idf = log($self->{records}/($self->{db}->{$O,$_} \|\| 1));
312	ulpfr	10	my $did;
313			for $did (keys %post) {
314			$score{$did} = 0 unless defined $score{$did}; # perl -w
315			$score{$did} += $post{$did} / $self->{db}->{$M, $did} * $idf
316			if $self->{db}->{$M, $did}; # db may be broken
317			}
318			}
319			}
320			%score;
321			}
322
323			sub sync {
324			my $self = shift;
325
326			if ($self->{mode} & O_RDWR) {
327	ulpfr	13	print STDERR "Flushing $self->{cached} postings\n";
328	ulpfr	10	while (my($key, $value) = each %{$self->{cache}}) {
329			$self->{db}->{$key} .= $value;
330			#delete $self->{cache}->{$key};
331			}
332			while (my($key, $value) = each %{$self->{cdict}}) {
333			$self->{db}->{$key} = 0 unless $self->{db}->{$key};
334			$self->{db}->{$key} += $value;
335			#delete $self->{cdict}->{$key};
336			}
337			$self->{cache} = {};
338			$self->{cdict} = {};
339			# print STDERR "*** $self->{cache} ", tied(%{$self->{cache}}), "==\n";
340			$self->{cached} = 0;
341			# $self->{dbh}->sync if $self->{dbh};
342			}
343			}
344
345			sub close {
346			my $self = shift;
347
348			if ($self->{dbh}) {
349			$self->sync;
350			delete $self->{dbh};
351			untie %{$self->{db}};
352			delete $self->{db};
353			delete $self->{func};
354			delete $self->{cache};
355			delete $self->{cached};
356			delete $self->{cdict};
357			delete $self->{pfunc} if defined $self->{pfunc};
358			delete $self->{ifunc} if defined $self->{ifunc};
359			delete $self->{xfunc} if defined $self->{xfunc};
360			}
361			}
362
363			1;
364