/[wait]/cvs-head/lib/WAIT/Table.pm
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Contents of /cvs-head/lib/WAIT/Table.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 10 - (show annotations)
Fri Apr 28 15:40:52 2000 UTC (24 years ago) by ulpfr
File size: 17340 byte(s)
Initial revision

1 # -*- Mode: Perl -*-
2 # Table.pm --
3 # ITIID : $ITI$ $Header $__Header$
4 # Author : Ulrich Pfeifer
5 # Created On : Thu Aug 8 13:05:10 1996
6 # Last Modified By: Ulrich Pfeifer
7 # Last Modified On: Sun Nov 22 18:44:37 1998
8 # Language : CPerl
9 # Update Count : 51
10 # Status : Unknown, Use with caution!
11 #
12 # Copyright (c) 1996-1997, Ulrich Pfeifer
13 #
14
15 =head1 NAME
16
17 WAIT::Table -- Module for maintaining Tables / Relations
18
19 =head1 SYNOPSIS
20
21 require WAIT::Table;
22
23 =head1 DESCRIPTION
24
25 =cut
26
27 package WAIT::Table;
28 require WAIT::Parse::Base;
29 use strict;
30 use Carp;
31 use DB_File;
32 use Fcntl;
33
34 my $USE_RECNO = 0;
35
36 =head2 Creating a Table.
37
38 The constructor WAIT::Table-<gt>new is normally called via the
39 create_table method of a database handle. This is not enforced, but
40 creating a table doesn not make any sense unless the table is
41 registered by the database because the latter implements persistence
42 of the meta data. Registering is done automatically by letting the
43 database handle create a table.
44
45 my $db = create WAIT::Database name => 'sample';
46 my $tb = $db->create_table (name => 'test',
47 attr => ['docid', 'headline'],
48 layout => $layout,
49 access => $access,
50 );
51
52 The constructor returns a handle for the table. This handle is hidden by the
53 table module, to prevent direct access if called via Table.
54
55 =over 10
56
57 =item C<access> => I<accesobj>
58
59 A reference to a acces object for the external parts (attributes) of
60 tuples. As you may remember, the WAIT System does not enforce that
61 objects are completely stored inside the system to avoid duplication.
62 There is no (strong) point in storing all you HTML-Documents inside
63 the system when indexing your WWW-Server.
64
65 =item C<file> => I<fname>
66
67 The filename of the records file. Files for indexes will have I<fname>
68 as prefix. I<Mandatory>
69
70 =item C<name> => I<name>
71
72 The name of this table. I<Mandatory>
73
74 =item C<attr> => [ I<attr> ... ]
75
76 A reference to an array of attribute names. I<Mandatory>
77
78 =item C<djk> => [ I<attr> ... ]
79
80 A reference to an array of attribute names which make up the
81 I<disjointness key>. Don't think about it - i's of no use yet;
82
83 =item C<layout> => I<layoutobj>
84
85 A reference to an external parser object. Defaults to anew instance of
86 C<WAIT::Parse::Base>
87
88 =item C<access> => I<accesobj>
89
90 A reference to a acces object for the external parts of tuples.
91
92 =back
93
94 =cut
95
96 sub new {
97 my $type = shift;
98 my %parm = @_;
99 my $self = {};
100
101 # Do that before we eventually add '_weight' to attributes.
102 $self->{keyset} = $parm{keyset} || [[@{$parm{attr}}]];
103 $self->{mode} = O_CREAT | O_RDWR;
104 # Determine and set up subclass
105 $type = ref($type) || $type;
106 if (defined $parm{djk}) {
107 if (@{$parm{djk}} == @{$parm{attr}}) {
108 # All attributes in DK (sloppy test here!)
109 $type .= '::Independent';
110 require WAIT::Table::Independent;
111 } else {
112 $type .= '::Disjoint';
113 require WAIT::Table::Disjoint;
114 }
115 # Add '_weight' to attributes
116 my %attr;
117 @attr{@{$parm{attr}}} = (1) x @{$parm{attr}};
118 unshift @{$parm{attr}}, '_weight' unless $attr{'_weight'};
119 }
120
121 $self->{file} = $parm{file} or croak "No file specified";
122 if (-d $self->{file} or !mkdir($self->{file}, 0775)) {
123 croak "Could not 'mkdir $self->{file}': $!\n";
124 }
125 $self->{name} = $parm{name} or croak "No name specified";
126 $self->{attr} = $parm{attr} or croak "No attributes specified";
127 $self->{djk} = $parm{djk} if defined $parm{djk};
128 $self->{layout} = $parm{layout} || new WAIT::Parse::Base;
129 $self->{access} = $parm{access} if defined $parm{access};
130 $self->{nextk} = 1; # next record to insert; first record unused
131 $self->{deleted} = {}; # no deleted records yet
132 $self->{indexes} = {};
133
134 bless $self, $type;
135 # Call create_index() and create_index() for compatibility
136 for (@{$self->{keyset}||[]}) {
137 #carp "Specification of indexes at table create time is deprecated";
138 $self->create_index(@$_);
139 }
140 while (@{$parm{invindex}||[]}) {
141 # carp "Specification of inverted indexes at table create time is deprecated";
142 my $att = shift @{$parm{invindex}};
143 my @spec = @{shift @{$parm{invindex}}};
144 my @opt;
145
146 if (ref($spec[0])) {
147 carp "Secondary pipelines are deprecated\n";
148 @opt = %{shift @spec};
149 }
150 $self->create_inverted_index(attribute => $att, pipeline => \@spec, @opt);
151 }
152 $self;
153 # end of backwarn compatibility stuff
154 }
155
156 =head2 Creating an index
157
158 $tb->create_index('docid');
159
160 =item C<create_index>
161
162 must be called with a list of attributes. This must be a subset of the
163 attributes specified when the table was created. Currently this
164 method must be called before the first tuple is inserted in the
165 table!
166
167 =cut
168
169 sub create_index {
170 my $self= shift;
171
172 croak "Cannot create index for table aready populated"
173 if $self->{nextk} > 1;
174
175 require WAIT::Index;
176
177 my $name = join '-', @_;
178 $self->{indexes}->{$name} =
179 new WAIT::Index file => $self->{file}.'/'.$name, attr => $_;
180 }
181
182 =head2 Creating an inverted index
183
184 $tb->create_inverted_index
185 (attribute => 'au',
186 pipeline => ['detex', 'isotr', 'isolc', 'split2', 'stop'],
187 predicate => 'plain',
188 );
189
190 =over 5
191
192 =item C<attribute>
193
194 The attribute to build the index on. This attribute may not be in the
195 set attributes specified when the table was created.
196
197 =item C<pipeline>
198
199 A piplines specification is a reference to and array of method names
200 (from package C<WAIT::Filter>) which are to applied in sequence to the
201 contents of the named attribute. The attribute name may not be in the
202 attribute list.
203
204 =item C<predicate>
205
206 An indication which predicate the index implements. This may be
207 e.g. 'plain', 'stemming' or 'soundex'. The indicator will be used for
208 query processing. Currently there is no standard set of predicate
209 names. The predicate defaults to the last member of the ppline if
210 omitted.
211
212 =back
213
214 Currently this method must be called before the first tuple is
215 inserted in the table!
216
217 =cut
218
219 sub create_inverted_index {
220 my $self = shift;
221 my %parm = @_;
222
223 croak "No attribute specified" unless $parm{attribute};
224 croak "No pipeline specified" unless $parm{pipeline};
225
226 $parm{predicate} ||= $parm{pipeline}->[-1];
227
228 croak "Cannot create index for table aready populated"
229 if $self->{nextk} > 1;
230
231 require WAIT::InvertedIndex;
232
233 # backward compatibility stuff
234 my %opt = %parm;
235 for (qw(attribute pipeline predicate)) {
236 delete $opt{$_};
237 }
238
239 my $name = join '_', ($parm{attribute}, @{$parm{pipeline}});
240 my $idx = new WAIT::InvertedIndex(file => $self->{file}.'/'.$name,
241 filter => [@{$parm{pipeline}}], # clone
242 name => $name,
243 attr => $parm{attribute},
244 %opt, # backward compatibility stuff
245 );
246 # We will have to use $parm{predicate} here
247 push @{$self->{inverted}->{$parm{attribute}}}, $idx;
248 }
249
250 sub dir {
251 $_[0]->{file};
252 }
253
254 =head2 C<$tb-E<gt>layout>
255
256 Returns the reference to the associated parser object.
257
258 =cut
259
260 sub layout { $_[0]->{layout} }
261
262 =head2 C<$tb-E<gt>fields>
263
264 Returns the array of attribute names.
265
266 =cut
267
268
269 sub fields { keys %{$_[0]->{inverted}}}
270
271 =head2 C<$tb-E<gt>drop>
272
273 Must be called via C<WAIT::Database::drop_table>
274
275 =cut
276
277 sub drop {
278 my $self = shift;
279 if ((caller)[0] eq 'WAIT::Database') { # database knows about this
280 $self->close; # just make sure
281 my $file = $self->{file};
282
283 for (values %{$self->{indexes}}) {
284 $_->drop;
285 }
286 unlink "$file/records";
287 ! (!-e $file or rmdir $file);
288 } else {
289 croak ref($self)."::drop called directly";
290 }
291 }
292
293 sub mrequire ($) {
294 my $module = shift;
295
296 $module =~ s{::}{/}g;
297 $module .= '.pm';
298 require $module;
299 }
300
301 sub open {
302 my $self = shift;
303 my $file = $self->{file} . '/records';
304
305 mrequire ref($self); # that's tricky eh?
306 if (defined $self->{'layout'}) {
307 mrequire ref($self->{'layout'});
308 }
309 if (defined $self->{'access'}) {
310 mrequire ref($self->{'access'});
311 }
312 if (exists $self->{indexes}) {
313 require WAIT::Index;
314 for (values %{$self->{indexes}}) {
315 $_->{mode} = $self->{mode};
316 }
317 }
318 if (exists $self->{inverted}) {
319 my ($att, $idx);
320 for $att (keys %{$self->{inverted}}) {
321 for $idx (@{$self->{inverted}->{$att}}) {
322 $idx->{mode} = $self->{mode};
323 }
324 }
325 require WAIT::InvertedIndex;
326 }
327 unless (defined $self->{dbh}) {
328 if ($USE_RECNO) {
329 $self->{dbh} = tie(@{$self->{db}}, 'DB_File', $file,
330 $self->{mode}, 0664, $DB_RECNO);
331 } else {
332 $self->{dbh} =
333 tie(%{$self->{db}}, 'DB_File', $file,
334 $self->{mode}, 0664, $DB_BTREE);
335 }
336 }
337 $self;
338 }
339
340 sub fetch_extern {
341 my $self = shift;
342
343 print "#@_", $self->{'access'}->{Mode}, "\n";
344 if (exists $self->{'access'}) {
345 mrequire ref($self->{'access'});
346 $self->{'access'}->FETCH(@_);
347 }
348 }
349
350 sub fetch_extern_by_id {
351 my $self = shift;
352
353 $self->fetch_extern($self->fetch(@_));
354 }
355
356 sub _find_index {
357 my $self = shift;
358 my (@att) = @_;
359 my %att;
360 my $name;
361
362 @att{@att} = @att;
363
364 KEY: for $name (keys %{$self->{indexes}}) {
365 my @iat = split /-/, $name;
366 for (@iat) {
367 next KEY unless exists $att{$_};
368 }
369 return $self->{indexes}->{$name};
370 }
371 return undef;
372 }
373
374 sub have {
375 my $self = shift;
376 my %parm = @_;
377
378 my $index = $self->_find_index(keys %parm);
379 croak "No index found" unless $index;
380 defined $self->{db} or $self->open;
381 return $index->have(@_);
382 }
383
384 sub insert {
385 my $self = shift;
386 my %parm = @_;
387
388 defined $self->{db} or $self->open;
389
390 my $tuple = join($;, map($parm{$_} || '', @{$self->{attr}}));
391 my $key;
392 my @deleted = keys %{$self->{deleted}};
393
394 if (@deleted) {
395 $key = pop @deleted;
396 delete $self->{deleted}->{$key};
397 } else {
398 $key = $self->{nextk}++;
399 }
400 if ($USE_RECNO) {
401 $self->{db}->[$key] = $tuple;
402 } else {
403 $self->{db}->{$key} = $tuple;
404 }
405 for (values %{$self->{indexes}}) {
406 unless ($_->insert($key, %parm)) {
407 # duplicate key, undo changes
408 if ($key == $self->{nextk}-1) {
409 $self->{nextk}--;
410 } else {
411 $self->{deleted}->{$key}=1;
412 }
413 my $idx;
414 for $idx (values %{$self->{indexes}}) {
415 last if $idx eq $_;
416 $idx->remove($key, %parm);
417 }
418 return undef;
419 }
420 }
421 if (defined $self->{inverted}) {
422 my $att;
423 for $att (keys %{$self->{inverted}}) {
424 if (defined $parm{$att}) {
425 map $_->insert($key, $parm{$att}), @{$self->{inverted}->{$att}};
426 #map $_->sync, @{$self->{inverted}->{$att}}
427 }
428 }
429 }
430 $key
431 }
432
433 sub sync {
434 my $self = shift;
435
436 for (values %{$self->{indexes}}) {
437 map $_->sync, $_;
438 }
439 if (defined $self->{inverted}) {
440 my $att;
441 for $att (keys %{$self->{inverted}}) {
442 map $_->sync, @{$self->{inverted}->{$att}}
443 }
444 }
445 }
446
447 sub fetch {
448 my $self = shift;
449 my $key = shift;
450
451 return () if exists $self->{deleted}->{$key};
452
453 defined $self->{db} or $self->open;
454 if ($USE_RECNO) {
455 $self->unpack($self->{db}->[$key]);
456 } else {
457 $self->unpack($self->{db}->{$key});
458 }
459 }
460
461 sub delete_by_key {
462 my $self = shift;
463 my $key = shift;
464
465 return $self->{deleted}->{$key} if defined $self->{deleted}->{$key};
466 my %tuple = $self->fetch($key);
467 for (values %{$self->{indexes}}) {
468 $_->delete($key, %tuple);
469 }
470 if (defined $self->{inverted}) {
471 # User *must* provide the full record for this or the entries
472 # in the inverted index will not be removed
473 %tuple = (%tuple, @_);
474 my $att;
475 for $att (keys %{$self->{inverted}}) {
476 if (defined $tuple{$att}) {
477 map $_->delete($key, $tuple{$att}), @{$self->{inverted}->{$att}}
478 }
479 }
480 }
481 ++$self->{deleted}->{$key};
482 }
483
484 sub delete {
485 my $self = shift;
486 my $tkey = $self->have(@_);
487
488 defined $tkey && $self->delete_by_key($tkey, @_);
489 }
490
491 sub unpack {
492 my $self = shift;
493 my $tuple = shift;
494
495 my $att;
496 my @result;
497 my @tuple = split /$;/, $tuple;
498
499 for $att (@{$self->{attr}}) {
500 push @result, $att, shift @tuple;
501 }
502 @result;
503 }
504
505 sub close {
506 my $self = shift;
507
508 if (exists $self->{'access'}) {
509 eval {$self->{'access'}->close}; # dont bother if not opened
510 }
511 for (values %{$self->{indexes}}) {
512 $_->close();
513 }
514 if (defined $self->{inverted}) {
515 my $att;
516 for $att (keys %{$self->{inverted}}) {
517 if ($] > 5.003) { # avoid bug in perl up to 5.003_05
518 my $idx;
519 for $idx (@{$self->{inverted}->{$att}}) {
520 $idx->close;
521 }
522 } else {
523 map $_->close(), @{$self->{inverted}->{$att}};
524 }
525 }
526 }
527 if ($self->{dbh}) {
528 delete $self->{dbh};
529
530 if ($USE_RECNO) {
531 untie @{$self->{db}};
532 } else {
533 untie %{$self->{db}};
534 }
535 delete $self->{db};
536 }
537
538 1;
539 }
540
541 sub open_scan {
542 my $self = shift;
543 my $code = shift;
544
545 $self->{dbh} or $self->open;
546 require WAIT::Scan;
547 new WAIT::Scan $self, $self->{nextk}-1, $code;
548 }
549
550 sub open_index_scan {
551 my $self = shift;
552 my $attr = shift;
553 my $code = shift;
554 my $name = join '-', @$attr;
555
556 if (defined $self->{indexes}->{$name}) {
557 $self->{indexes}->{$name}->open_scan($code);
558 } else {
559 croak "No such index '$name'";
560 }
561 }
562
563 eval {sub WAIT::Query::Raw::new} unless defined \&WAIT::Query::Raw::new;
564
565 sub prefix {
566 my ($self , $attr, $prefix) = @_;
567 my %result;
568
569 defined $self->{db} or $self->open; # require layout
570
571 for (@{$self->{inverted}->{$attr}}) {
572 my $result = $_->prefix($prefix);
573 if (defined $result) {
574 $result{$_->name} = $result;
575 }
576 }
577 bless \%result, 'WAIT::Query::Raw';
578 }
579
580 sub intervall {
581 my ($self, $attr, $lb, $ub) = @_;
582 my %result;
583
584 defined $self->{db} or $self->open; # require layout
585
586 for (@{$self->{inverted}->{$attr}}) {
587 my $result = $_->intervall($lb, $ub);
588 if (defined $result) {
589 $result{$_->name} = $result;
590 }
591 }
592 bless \%result, 'WAIT::Query::Raw';
593 }
594
595 sub search {
596 my $self = shift;
597 my $attr = shift;
598 my $cont = shift;
599 my $raw = shift;
600 my %result;
601
602 defined $self->{db} or $self->open; # require layout
603
604 if ($raw) {
605 for (@{$self->{inverted}->{$attr}}) {
606 my $name = $_->name;
607 if (exists $raw->{$name} and @{$raw->{$name}}) {
608 my $scale = 1/scalar(@{$raw->{$name}});
609 my %r = $_->search_raw(@{$raw->{$name}});
610 my ($key, $val);
611 while (($key, $val) = each %r) {
612 if (exists $result{$key}) {
613 $result{$key} += $val*$scale;
614 } else {
615 $result{$key} = $val*$scale;
616 }
617 }
618 }
619 }
620 }
621 if (defined $cont and $cont ne '') {
622 for (@{$self->{inverted}->{$attr}}) {
623 my %r = $_->search($cont);
624 my ($key, $val);
625 while (($key, $val) = each %r) {
626 if (exists $result{$key}) {
627 $result{$key} += $val;
628 } else {
629 $result{$key} = $val;
630 }
631 }
632 }
633 }
634 # sanity check for deleted documents.
635 # this should not be necessary !@#$
636 for (keys %result) {
637 delete $result{$_} if $self->{deleted}->{$_}
638 }
639 %result;
640 }
641
642 sub hilight_positions {
643 my ($self, $attr, $text, $query, $raw) = @_;
644 my %pos;
645
646 if (defined $raw) {
647 for (@{$self->{inverted}->{$attr}}) {
648 my $name = $_->name;
649 if (exists $raw->{$name}) {
650 my %qt;
651 grep $qt{$_}++, @{$raw->{$name}};
652 for ($_->parse_pos($text)) {
653 if (exists $qt{$_->[0]}) {
654 $pos{$_->[1]} = max($pos{$_->[1]}, length($_->[0]));
655 }
656 }
657 }
658 }
659 }
660 if (defined $query) {
661 for (@{$self->{inverted}->{$attr}}) {
662 my %qt;
663
664 grep $qt{$_}++, $_->parse($query);
665 for ($_->parse_pos($text)) {
666 if (exists $qt{$_->[0]}) {
667 if (exists $pos{$_->[1]}) { # perl -w ;-)
668 $pos{$_->[1]} = max($pos{$_->[1]}, length($_->[0]));
669 } else {
670 $pos{$_->[1]} = length($_->[0]);
671 }
672 }
673 }
674 }
675 }
676
677 \%pos;
678 }
679
680 sub hilight {
681 my ($tb, $text, $query, $raw) = @_;
682 my $type = $tb->layout();
683 my @result;
684
685 $query ||= {};
686 $raw ||= {};
687 my @ttxt = $type->tag($text);
688 while (@ttxt) {
689 no strict 'refs';
690 my %tag = %{shift @ttxt};
691 my $txt = shift @ttxt;
692 my $fld;
693
694 my %hl;
695 for $fld (grep defined $tag{$_}, keys %$query, keys %$raw) {
696 my $hp = $tb->hilight_positions($fld, $txt,
697 $query->{$fld}, $raw->{$fld});
698 for (keys %$hp) {
699 if (exists $hl{$_}) { # -w ;-(
700 $hl{$_} = max($hl{$_}, $hp->{$_});
701 } else {
702 $hl{$_} = $hp->{$_};
703 }
704 }
705 }
706 my $pos;
707 my $qt = {_qt => 1, %tag};
708 my $pl = \%tag;
709 my $last = length($txt);
710 my @tmp;
711 for $pos (sort {$b <=> $a} keys %hl) {
712 unshift @tmp, $pl, substr($txt,$pos+$hl{$pos},$last-$pos-$hl{$pos});
713 unshift @tmp, $qt, substr($txt,$pos,$hl{$pos});
714 $last = $pos;
715 }
716 push @result, $pl, substr($txt,0,$last);
717 push @result, @tmp;
718 }
719 @result; # no speed necessary
720 }
721
722 1;
723

Properties

Name Value
cvs2svn:cvs-rev 1.1

  ViewVC Help
Powered by ViewVC 1.1.26