/[wait]/cvs-head/lib/WAIT/Table.pm
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Annotation of /cvs-head/lib/WAIT/Table.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 10 - (hide annotations)
Fri Apr 28 15:40:52 2000 UTC (24 years ago) by ulpfr
File size: 17340 byte(s)
Initial revision

1 ulpfr 10 # -*- Mode: Perl -*-
2     # Table.pm --
3     # ITIID : $ITI$ $Header $__Header$
4     # Author : Ulrich Pfeifer
5     # Created On : Thu Aug 8 13:05:10 1996
6     # Last Modified By: Ulrich Pfeifer
7     # Last Modified On: Sun Nov 22 18:44:37 1998
8     # Language : CPerl
9     # Update Count : 51
10     # Status : Unknown, Use with caution!
11     #
12     # Copyright (c) 1996-1997, Ulrich Pfeifer
13     #
14    
15     =head1 NAME
16    
17     WAIT::Table -- Module for maintaining Tables / Relations
18    
19     =head1 SYNOPSIS
20    
21     require WAIT::Table;
22    
23     =head1 DESCRIPTION
24    
25     =cut
26    
27     package WAIT::Table;
28     require WAIT::Parse::Base;
29     use strict;
30     use Carp;
31     use DB_File;
32     use Fcntl;
33    
34     my $USE_RECNO = 0;
35    
36     =head2 Creating a Table.
37    
38     The constructor WAIT::Table-<gt>new is normally called via the
39     create_table method of a database handle. This is not enforced, but
40     creating a table doesn not make any sense unless the table is
41     registered by the database because the latter implements persistence
42     of the meta data. Registering is done automatically by letting the
43     database handle create a table.
44    
45     my $db = create WAIT::Database name => 'sample';
46     my $tb = $db->create_table (name => 'test',
47     attr => ['docid', 'headline'],
48     layout => $layout,
49     access => $access,
50     );
51    
52     The constructor returns a handle for the table. This handle is hidden by the
53     table module, to prevent direct access if called via Table.
54    
55     =over 10
56    
57     =item C<access> => I<accesobj>
58    
59     A reference to a acces object for the external parts (attributes) of
60     tuples. As you may remember, the WAIT System does not enforce that
61     objects are completely stored inside the system to avoid duplication.
62     There is no (strong) point in storing all you HTML-Documents inside
63     the system when indexing your WWW-Server.
64    
65     =item C<file> => I<fname>
66    
67     The filename of the records file. Files for indexes will have I<fname>
68     as prefix. I<Mandatory>
69    
70     =item C<name> => I<name>
71    
72     The name of this table. I<Mandatory>
73    
74     =item C<attr> => [ I<attr> ... ]
75    
76     A reference to an array of attribute names. I<Mandatory>
77    
78     =item C<djk> => [ I<attr> ... ]
79    
80     A reference to an array of attribute names which make up the
81     I<disjointness key>. Don't think about it - i's of no use yet;
82    
83     =item C<layout> => I<layoutobj>
84    
85     A reference to an external parser object. Defaults to anew instance of
86     C<WAIT::Parse::Base>
87    
88     =item C<access> => I<accesobj>
89    
90     A reference to a acces object for the external parts of tuples.
91    
92     =back
93    
94     =cut
95    
96     sub new {
97     my $type = shift;
98     my %parm = @_;
99     my $self = {};
100    
101     # Do that before we eventually add '_weight' to attributes.
102     $self->{keyset} = $parm{keyset} || [[@{$parm{attr}}]];
103     $self->{mode} = O_CREAT | O_RDWR;
104     # Determine and set up subclass
105     $type = ref($type) || $type;
106     if (defined $parm{djk}) {
107     if (@{$parm{djk}} == @{$parm{attr}}) {
108     # All attributes in DK (sloppy test here!)
109     $type .= '::Independent';
110     require WAIT::Table::Independent;
111     } else {
112     $type .= '::Disjoint';
113     require WAIT::Table::Disjoint;
114     }
115     # Add '_weight' to attributes
116     my %attr;
117     @attr{@{$parm{attr}}} = (1) x @{$parm{attr}};
118     unshift @{$parm{attr}}, '_weight' unless $attr{'_weight'};
119     }
120    
121     $self->{file} = $parm{file} or croak "No file specified";
122     if (-d $self->{file} or !mkdir($self->{file}, 0775)) {
123     croak "Could not 'mkdir $self->{file}': $!\n";
124     }
125     $self->{name} = $parm{name} or croak "No name specified";
126     $self->{attr} = $parm{attr} or croak "No attributes specified";
127     $self->{djk} = $parm{djk} if defined $parm{djk};
128     $self->{layout} = $parm{layout} || new WAIT::Parse::Base;
129     $self->{access} = $parm{access} if defined $parm{access};
130     $self->{nextk} = 1; # next record to insert; first record unused
131     $self->{deleted} = {}; # no deleted records yet
132     $self->{indexes} = {};
133    
134     bless $self, $type;
135     # Call create_index() and create_index() for compatibility
136     for (@{$self->{keyset}||[]}) {
137     #carp "Specification of indexes at table create time is deprecated";
138     $self->create_index(@$_);
139     }
140     while (@{$parm{invindex}||[]}) {
141     # carp "Specification of inverted indexes at table create time is deprecated";
142     my $att = shift @{$parm{invindex}};
143     my @spec = @{shift @{$parm{invindex}}};
144     my @opt;
145    
146     if (ref($spec[0])) {
147     carp "Secondary pipelines are deprecated\n";
148     @opt = %{shift @spec};
149     }
150     $self->create_inverted_index(attribute => $att, pipeline => \@spec, @opt);
151     }
152     $self;
153     # end of backwarn compatibility stuff
154     }
155    
156     =head2 Creating an index
157    
158     $tb->create_index('docid');
159    
160     =item C<create_index>
161    
162     must be called with a list of attributes. This must be a subset of the
163     attributes specified when the table was created. Currently this
164     method must be called before the first tuple is inserted in the
165     table!
166    
167     =cut
168    
169     sub create_index {
170     my $self= shift;
171    
172     croak "Cannot create index for table aready populated"
173     if $self->{nextk} > 1;
174    
175     require WAIT::Index;
176    
177     my $name = join '-', @_;
178     $self->{indexes}->{$name} =
179     new WAIT::Index file => $self->{file}.'/'.$name, attr => $_;
180     }
181    
182     =head2 Creating an inverted index
183    
184     $tb->create_inverted_index
185     (attribute => 'au',
186     pipeline => ['detex', 'isotr', 'isolc', 'split2', 'stop'],
187     predicate => 'plain',
188     );
189    
190     =over 5
191    
192     =item C<attribute>
193    
194     The attribute to build the index on. This attribute may not be in the
195     set attributes specified when the table was created.
196    
197     =item C<pipeline>
198    
199     A piplines specification is a reference to and array of method names
200     (from package C<WAIT::Filter>) which are to applied in sequence to the
201     contents of the named attribute. The attribute name may not be in the
202     attribute list.
203    
204     =item C<predicate>
205    
206     An indication which predicate the index implements. This may be
207     e.g. 'plain', 'stemming' or 'soundex'. The indicator will be used for
208     query processing. Currently there is no standard set of predicate
209     names. The predicate defaults to the last member of the ppline if
210     omitted.
211    
212     =back
213    
214     Currently this method must be called before the first tuple is
215     inserted in the table!
216    
217     =cut
218    
219     sub create_inverted_index {
220     my $self = shift;
221     my %parm = @_;
222    
223     croak "No attribute specified" unless $parm{attribute};
224     croak "No pipeline specified" unless $parm{pipeline};
225    
226     $parm{predicate} ||= $parm{pipeline}->[-1];
227    
228     croak "Cannot create index for table aready populated"
229     if $self->{nextk} > 1;
230    
231     require WAIT::InvertedIndex;
232    
233     # backward compatibility stuff
234     my %opt = %parm;
235     for (qw(attribute pipeline predicate)) {
236     delete $opt{$_};
237     }
238    
239     my $name = join '_', ($parm{attribute}, @{$parm{pipeline}});
240     my $idx = new WAIT::InvertedIndex(file => $self->{file}.'/'.$name,
241     filter => [@{$parm{pipeline}}], # clone
242     name => $name,
243     attr => $parm{attribute},
244     %opt, # backward compatibility stuff
245     );
246     # We will have to use $parm{predicate} here
247     push @{$self->{inverted}->{$parm{attribute}}}, $idx;
248     }
249    
250     sub dir {
251     $_[0]->{file};
252     }
253    
254     =head2 C<$tb-E<gt>layout>
255    
256     Returns the reference to the associated parser object.
257    
258     =cut
259    
260     sub layout { $_[0]->{layout} }
261    
262     =head2 C<$tb-E<gt>fields>
263    
264     Returns the array of attribute names.
265    
266     =cut
267    
268    
269     sub fields { keys %{$_[0]->{inverted}}}
270    
271     =head2 C<$tb-E<gt>drop>
272    
273     Must be called via C<WAIT::Database::drop_table>
274    
275     =cut
276    
277     sub drop {
278     my $self = shift;
279     if ((caller)[0] eq 'WAIT::Database') { # database knows about this
280     $self->close; # just make sure
281     my $file = $self->{file};
282    
283     for (values %{$self->{indexes}}) {
284     $_->drop;
285     }
286     unlink "$file/records";
287     ! (!-e $file or rmdir $file);
288     } else {
289     croak ref($self)."::drop called directly";
290     }
291     }
292    
293     sub mrequire ($) {
294     my $module = shift;
295    
296     $module =~ s{::}{/}g;
297     $module .= '.pm';
298     require $module;
299     }
300    
301     sub open {
302     my $self = shift;
303     my $file = $self->{file} . '/records';
304    
305     mrequire ref($self); # that's tricky eh?
306     if (defined $self->{'layout'}) {
307     mrequire ref($self->{'layout'});
308     }
309     if (defined $self->{'access'}) {
310     mrequire ref($self->{'access'});
311     }
312     if (exists $self->{indexes}) {
313     require WAIT::Index;
314     for (values %{$self->{indexes}}) {
315     $_->{mode} = $self->{mode};
316     }
317     }
318     if (exists $self->{inverted}) {
319     my ($att, $idx);
320     for $att (keys %{$self->{inverted}}) {
321     for $idx (@{$self->{inverted}->{$att}}) {
322     $idx->{mode} = $self->{mode};
323     }
324     }
325     require WAIT::InvertedIndex;
326     }
327     unless (defined $self->{dbh}) {
328     if ($USE_RECNO) {
329     $self->{dbh} = tie(@{$self->{db}}, 'DB_File', $file,
330     $self->{mode}, 0664, $DB_RECNO);
331     } else {
332     $self->{dbh} =
333     tie(%{$self->{db}}, 'DB_File', $file,
334     $self->{mode}, 0664, $DB_BTREE);
335     }
336     }
337     $self;
338     }
339    
340     sub fetch_extern {
341     my $self = shift;
342    
343     print "#@_", $self->{'access'}->{Mode}, "\n";
344     if (exists $self->{'access'}) {
345     mrequire ref($self->{'access'});
346     $self->{'access'}->FETCH(@_);
347     }
348     }
349    
350     sub fetch_extern_by_id {
351     my $self = shift;
352    
353     $self->fetch_extern($self->fetch(@_));
354     }
355    
356     sub _find_index {
357     my $self = shift;
358     my (@att) = @_;
359     my %att;
360     my $name;
361    
362     @att{@att} = @att;
363    
364     KEY: for $name (keys %{$self->{indexes}}) {
365     my @iat = split /-/, $name;
366     for (@iat) {
367     next KEY unless exists $att{$_};
368     }
369     return $self->{indexes}->{$name};
370     }
371     return undef;
372     }
373    
374     sub have {
375     my $self = shift;
376     my %parm = @_;
377    
378     my $index = $self->_find_index(keys %parm);
379     croak "No index found" unless $index;
380     defined $self->{db} or $self->open;
381     return $index->have(@_);
382     }
383    
384     sub insert {
385     my $self = shift;
386     my %parm = @_;
387    
388     defined $self->{db} or $self->open;
389    
390     my $tuple = join($;, map($parm{$_} || '', @{$self->{attr}}));
391     my $key;
392     my @deleted = keys %{$self->{deleted}};
393    
394     if (@deleted) {
395     $key = pop @deleted;
396     delete $self->{deleted}->{$key};
397     } else {
398     $key = $self->{nextk}++;
399     }
400     if ($USE_RECNO) {
401     $self->{db}->[$key] = $tuple;
402     } else {
403     $self->{db}->{$key} = $tuple;
404     }
405     for (values %{$self->{indexes}}) {
406     unless ($_->insert($key, %parm)) {
407     # duplicate key, undo changes
408     if ($key == $self->{nextk}-1) {
409     $self->{nextk}--;
410     } else {
411     $self->{deleted}->{$key}=1;
412     }
413     my $idx;
414     for $idx (values %{$self->{indexes}}) {
415     last if $idx eq $_;
416     $idx->remove($key, %parm);
417     }
418     return undef;
419     }
420     }
421     if (defined $self->{inverted}) {
422     my $att;
423     for $att (keys %{$self->{inverted}}) {
424     if (defined $parm{$att}) {
425     map $_->insert($key, $parm{$att}), @{$self->{inverted}->{$att}};
426     #map $_->sync, @{$self->{inverted}->{$att}}
427     }
428     }
429     }
430     $key
431     }
432    
433     sub sync {
434     my $self = shift;
435    
436     for (values %{$self->{indexes}}) {
437     map $_->sync, $_;
438     }
439     if (defined $self->{inverted}) {
440     my $att;
441     for $att (keys %{$self->{inverted}}) {
442     map $_->sync, @{$self->{inverted}->{$att}}
443     }
444     }
445     }
446    
447     sub fetch {
448     my $self = shift;
449     my $key = shift;
450    
451     return () if exists $self->{deleted}->{$key};
452    
453     defined $self->{db} or $self->open;
454     if ($USE_RECNO) {
455     $self->unpack($self->{db}->[$key]);
456     } else {
457     $self->unpack($self->{db}->{$key});
458     }
459     }
460    
461     sub delete_by_key {
462     my $self = shift;
463     my $key = shift;
464    
465     return $self->{deleted}->{$key} if defined $self->{deleted}->{$key};
466     my %tuple = $self->fetch($key);
467     for (values %{$self->{indexes}}) {
468     $_->delete($key, %tuple);
469     }
470     if (defined $self->{inverted}) {
471     # User *must* provide the full record for this or the entries
472     # in the inverted index will not be removed
473     %tuple = (%tuple, @_);
474     my $att;
475     for $att (keys %{$self->{inverted}}) {
476     if (defined $tuple{$att}) {
477     map $_->delete($key, $tuple{$att}), @{$self->{inverted}->{$att}}
478     }
479     }
480     }
481     ++$self->{deleted}->{$key};
482     }
483    
484     sub delete {
485     my $self = shift;
486     my $tkey = $self->have(@_);
487    
488     defined $tkey && $self->delete_by_key($tkey, @_);
489     }
490    
491     sub unpack {
492     my $self = shift;
493     my $tuple = shift;
494    
495     my $att;
496     my @result;
497     my @tuple = split /$;/, $tuple;
498    
499     for $att (@{$self->{attr}}) {
500     push @result, $att, shift @tuple;
501     }
502     @result;
503     }
504    
505     sub close {
506     my $self = shift;
507    
508     if (exists $self->{'access'}) {
509     eval {$self->{'access'}->close}; # dont bother if not opened
510     }
511     for (values %{$self->{indexes}}) {
512     $_->close();
513     }
514     if (defined $self->{inverted}) {
515     my $att;
516     for $att (keys %{$self->{inverted}}) {
517     if ($] > 5.003) { # avoid bug in perl up to 5.003_05
518     my $idx;
519     for $idx (@{$self->{inverted}->{$att}}) {
520     $idx->close;
521     }
522     } else {
523     map $_->close(), @{$self->{inverted}->{$att}};
524     }
525     }
526     }
527     if ($self->{dbh}) {
528     delete $self->{dbh};
529    
530     if ($USE_RECNO) {
531     untie @{$self->{db}};
532     } else {
533     untie %{$self->{db}};
534     }
535     delete $self->{db};
536     }
537    
538     1;
539     }
540    
541     sub open_scan {
542     my $self = shift;
543     my $code = shift;
544    
545     $self->{dbh} or $self->open;
546     require WAIT::Scan;
547     new WAIT::Scan $self, $self->{nextk}-1, $code;
548     }
549    
550     sub open_index_scan {
551     my $self = shift;
552     my $attr = shift;
553     my $code = shift;
554     my $name = join '-', @$attr;
555    
556     if (defined $self->{indexes}->{$name}) {
557     $self->{indexes}->{$name}->open_scan($code);
558     } else {
559     croak "No such index '$name'";
560     }
561     }
562    
563     eval {sub WAIT::Query::Raw::new} unless defined \&WAIT::Query::Raw::new;
564    
565     sub prefix {
566     my ($self , $attr, $prefix) = @_;
567     my %result;
568    
569     defined $self->{db} or $self->open; # require layout
570    
571     for (@{$self->{inverted}->{$attr}}) {
572     my $result = $_->prefix($prefix);
573     if (defined $result) {
574     $result{$_->name} = $result;
575     }
576     }
577     bless \%result, 'WAIT::Query::Raw';
578     }
579    
580     sub intervall {
581     my ($self, $attr, $lb, $ub) = @_;
582     my %result;
583    
584     defined $self->{db} or $self->open; # require layout
585    
586     for (@{$self->{inverted}->{$attr}}) {
587     my $result = $_->intervall($lb, $ub);
588     if (defined $result) {
589     $result{$_->name} = $result;
590     }
591     }
592     bless \%result, 'WAIT::Query::Raw';
593     }
594    
595     sub search {
596     my $self = shift;
597     my $attr = shift;
598     my $cont = shift;
599     my $raw = shift;
600     my %result;
601    
602     defined $self->{db} or $self->open; # require layout
603    
604     if ($raw) {
605     for (@{$self->{inverted}->{$attr}}) {
606     my $name = $_->name;
607     if (exists $raw->{$name} and @{$raw->{$name}}) {
608     my $scale = 1/scalar(@{$raw->{$name}});
609     my %r = $_->search_raw(@{$raw->{$name}});
610     my ($key, $val);
611     while (($key, $val) = each %r) {
612     if (exists $result{$key}) {
613     $result{$key} += $val*$scale;
614     } else {
615     $result{$key} = $val*$scale;
616     }
617     }
618     }
619     }
620     }
621     if (defined $cont and $cont ne '') {
622     for (@{$self->{inverted}->{$attr}}) {
623     my %r = $_->search($cont);
624     my ($key, $val);
625     while (($key, $val) = each %r) {
626     if (exists $result{$key}) {
627     $result{$key} += $val;
628     } else {
629     $result{$key} = $val;
630     }
631     }
632     }
633     }
634     # sanity check for deleted documents.
635     # this should not be necessary !@#$
636     for (keys %result) {
637     delete $result{$_} if $self->{deleted}->{$_}
638     }
639     %result;
640     }
641    
642     sub hilight_positions {
643     my ($self, $attr, $text, $query, $raw) = @_;
644     my %pos;
645    
646     if (defined $raw) {
647     for (@{$self->{inverted}->{$attr}}) {
648     my $name = $_->name;
649     if (exists $raw->{$name}) {
650     my %qt;
651     grep $qt{$_}++, @{$raw->{$name}};
652     for ($_->parse_pos($text)) {
653     if (exists $qt{$_->[0]}) {
654     $pos{$_->[1]} = max($pos{$_->[1]}, length($_->[0]));
655     }
656     }
657     }
658     }
659     }
660     if (defined $query) {
661     for (@{$self->{inverted}->{$attr}}) {
662     my %qt;
663    
664     grep $qt{$_}++, $_->parse($query);
665     for ($_->parse_pos($text)) {
666     if (exists $qt{$_->[0]}) {
667     if (exists $pos{$_->[1]}) { # perl -w ;-)
668     $pos{$_->[1]} = max($pos{$_->[1]}, length($_->[0]));
669     } else {
670     $pos{$_->[1]} = length($_->[0]);
671     }
672     }
673     }
674     }
675     }
676    
677     \%pos;
678     }
679    
680     sub hilight {
681     my ($tb, $text, $query, $raw) = @_;
682     my $type = $tb->layout();
683     my @result;
684    
685     $query ||= {};
686     $raw ||= {};
687     my @ttxt = $type->tag($text);
688     while (@ttxt) {
689     no strict 'refs';
690     my %tag = %{shift @ttxt};
691     my $txt = shift @ttxt;
692     my $fld;
693    
694     my %hl;
695     for $fld (grep defined $tag{$_}, keys %$query, keys %$raw) {
696     my $hp = $tb->hilight_positions($fld, $txt,
697     $query->{$fld}, $raw->{$fld});
698     for (keys %$hp) {
699     if (exists $hl{$_}) { # -w ;-(
700     $hl{$_} = max($hl{$_}, $hp->{$_});
701     } else {
702     $hl{$_} = $hp->{$_};
703     }
704     }
705     }
706     my $pos;
707     my $qt = {_qt => 1, %tag};
708     my $pl = \%tag;
709     my $last = length($txt);
710     my @tmp;
711     for $pos (sort {$b <=> $a} keys %hl) {
712     unshift @tmp, $pl, substr($txt,$pos+$hl{$pos},$last-$pos-$hl{$pos});
713     unshift @tmp, $qt, substr($txt,$pos,$hl{$pos});
714     $last = $pos;
715     }
716     push @result, $pl, substr($txt,0,$last);
717     push @result, @tmp;
718     }
719     @result; # no speed necessary
720     }
721    
722     1;
723    

Properties

Name Value
cvs2svn:cvs-rev 1.1

  ViewVC Help
Powered by ViewVC 1.1.26