/[wait]/trunk/lib/WAIT/Table.pm

This is a repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!

Diff of /trunk/lib/WAIT/Table.pm

Parent Directory | Revision Log | View Patch Patch

-revision 10 by ulpfr,
Fri Apr 28 15:40:52 2000 UTC
+revision 85 by ulpfr,
Fri May  3 16:16:10 2002 UTC
 Line 1
- #                              -*- Mode: Perl -*-
+ #                              -*- Mode: Cperl -*-
  # Table.pm --
  # ITIID           : $ITI$ $Header $__Header$
  # Author          : Ulrich Pfeifer
  # Created On      : Thu Aug  8 13:05:10 1996
  # Last Modified By: Ulrich Pfeifer
- # Last Modified On: Sun Nov 22 18:44:37 1998
+ # Last Modified On: Sat Apr 27 17:20:31 2002
  # Language        : CPerl
- # Update Count    : 51
+ # Update Count    : 172
  # Status          : Unknown, Use with caution!
  #
  # Copyright (c) 1996-1997, Ulrich Pfeifer
  #
  =head1 NAME
 Line 25 
 WAIT::Table -- Module for maintaining Ta
  =cut
  package WAIT::Table;
+ use WAIT::Table::Handle ();
  require WAIT::Parse::Base;
  use strict;
  use Carp;
- use DB_File;
+ # use autouse Carp => qw( croak($) );
+ use BerkeleyDB;
  use Fcntl;
+ use LockFile::Simple ();
  my $USE_RECNO = 0;
  =head2 Creating a Table.
- The constructor WAIT::Table-<gt>new is normally called via the
+ The constructor WAIT::Table-E<gt>new is normally called via the
  create_table method of a database handle. This is not enforced, but
- creating a table doesn not make any sense unless the table is
+ creating a table does not make any sense unless the table is
  registered by the database because the latter implements persistence
  of the meta data. Registering is done automatically by letting the
- database handle create a table.
+ database handle the creation of a table.
-   my $db = create WAIT::Database name => 'sample';
+   my $db = WAIT::Database->create(name => 'sample');
-   my $tb = $db->create_table (name     => 'test',
+   my $tb = $db->create_table(name     => 'test',
-                               attr     => ['docid', 'headline'],
+                              access   => $access,
-                               layout   => $layout,
+                              layout   => $layout,
-                               access   => $access,
+                              attr     => ['docid', 'headline'],
-                              );
+                             );
  The constructor returns a handle for the table. This handle is hidden by the
  table module, to prevent direct access if called via Table.
  =over 10
- =item C<access> => I<accesobj>
+ =item C<access> => I<accessobj>
- A reference to a acces object for the external parts (attributes) of
+ A reference to an access object for the external parts (attributes) of
  tuples. As you may remember, the WAIT System does not enforce that
  objects are completely stored inside the system to avoid duplication.
- There is no (strong) point in storing all you HTML-Documents inside
+ There is no (strong) point in storing all your HTML documents inside
  the system when indexing your WWW-Server.
+ The access object is designed to work like a tied hash. You pass
+ the reference to the object, not the tied hash though. An example
+ implementation of an access class that works for manpages is
+ WAIT::Document::Nroff.
+ The implementation needs to take into account that WAIT will keep this
+ object in a Data::Dumper or Storable database and re-use it when sman
+ is run. So it is not good enough if we can produce the index with it
+ now, when we create or actively access the table, WAIT also must be
+ able to retrieve documents on its own, when we are in a different
+ context. This happens specifically in a retrieval. To get this working
+ seamlessly, the access-defining class must implement a close method.
+ This method will be called before the Data::Dumper dump takes place.
+ In that moment the access-defining class must get rid of all data
+ structures that cannot be reconstructed via the Data::Dumper dump,
+ such as database handles or C pointers.
  =item C<file> => I<fname>
  The filename of the records file. Files for indexes will have I<fname>
- as prefix. I<Mandatory>
+ as prefix. I<Mandatory>, but usually taken care of by the
+ WAIT::Database handle when the constructor is called via
+ WAIT::Database::create_table().
  =item C<name> => I<name>
-Line 73 
 The name of this table. I<Mandatory>
+Line 97 
 The name of this table. I<Mandatory>
  =item C<attr> => [ I<attr> ... ]
- A reference to an array of attribute names. I<Mandatory>
+ A reference to an array of attribute names. WAIT will keep the
+ contents of these attributes in its table. I<Mandatory>
  =item C<djk> => [ I<attr> ... ]
  A reference to an array of attribute names which make up the
- I<disjointness key>. Don't think about it - i's of no use yet;
+ I<disjointness key>. Don't think about it - it's of no use yet;
  =item C<layout> => I<layoutobj>
- A reference to an external parser object. Defaults to anew instance of
+ A reference to an external parser object. Defaults to a new instance
- C<WAIT::Parse::Base>
+ of C<WAIT::Parse::Base>. For an example implementation see
+ WAIT::Parse::Nroff. A layout class can be implemented as a singleton
+ class if you so like.
+ =item C<keyset> => I<keyset>
- =item C<access> => I<accesobj>
+ The set of attributes needed to identify a record. Defaults to all
+ attributes.
- A reference to a acces object for the external parts of tuples.
+ =item C<invindex> => I<inverted index>
+ A reference to an anon array defining attributes of each record that
+ need to be indexed. See the source of smakewhatis for how to set this
+ up.
  =back
-Line 98 
 sub new {
+Line 132 
 sub new {
    my %parm = @_;
    my $self = {};
+   # Check for mandatory attrs early
+   $self->{name}     = $parm{name}     or croak "No name specified";
+   $self->{attr}     = $parm{attr}     or croak "No attributes specified";
    # Do that before we eventually add '_weight' to attributes.
    $self->{keyset}   = $parm{keyset}   || [[@{$parm{attr}}]];
    $self->{mode}     = O_CREAT | O_RDWR;
    # Determine and set up subclass
    $type = ref($type) || $type;
    if (defined $parm{djk}) {
-Line 119 
 sub new {
+Line 159 
 sub new {
    }
    $self->{file}     = $parm{file}     or croak "No file specified";
-   if (-d  $self->{file} or !mkdir($self->{file}, 0775)) {
+   if (-e  $self->{file}){
-     croak "Could not 'mkdir $self->{file}': $!\n";
+     warn "Warning: file '$self->{file}' already exists\n";
    }
-   $self->{name}     = $parm{name}     or croak "No name specified";
-   $self->{attr}     = $parm{attr}     or croak "No attributes specified";
    $self->{djk}      = $parm{djk}      if defined $parm{djk};
    $self->{layout}   = $parm{layout} || new WAIT::Parse::Base;
    $self->{access}   = $parm{access} if defined $parm{access};
-Line 132 
 sub new {
+Line 171 
 sub new {
    $self->{indexes}  = {};
    bless $self, $type;
+   # Checking for readers is not necessary, but let's go with the
+   # generic method.
+   $self->getlock(O_RDWR|O_CREAT); # dies when failing
    # Call create_index() and create_inverted_index() for compatibility
    for (@{$self->{keyset}||[]}) {
      #carp "Specification of indexes at table create time is deprecated";
-Line 142 
 sub new {
+Line 186 
 sub new {
      my $att  = shift @{$parm{invindex}};
      my @spec = @{shift @{$parm{invindex}}};
      my @opt;
      if (ref($spec[0])) {
        carp "Secondary pipelines are deprecated\n";
        @opt = %{shift @spec};
      }
      $self->create_inverted_index(attribute => $att, pipeline  => \@spec, @opt);
    }
    $self;
    # end of backward compatibility stuff
  }
-Line 168 
 table!
+Line 213 
 table!
  sub create_index {
    my $self= shift;
    croak "Cannot create index for table aready populated"
      if $self->{nextk} > 1;
    require WAIT::Index;
    my $name = join '-', @_;
    $self->{indexes}->{$name} =
-     new WAIT::Index file => $self->{file}.'/'.$name, attr => $_;
+     new WAIT::Index file => $self->{file}, name => $name, attr => $_;
  }
  =head2 Creating an inverted index
-Line 196 
 set attributes specified when the table
+Line 241 
 set attributes specified when the table
  =item C<pipeline>
- A piplines specification is a reference to and array of method names
+ A pipeline specification is a reference to an array of method names
- (from package C<WAIT::Filter>) which are to applied in sequence to the
+ (from package C<WAIT::Filter>) which are to be applied in sequence to
- contents of the named attribute. The attribute name may not be in the
+ the contents of the named attribute. The attribute name may not be in
- attribute list.
+ the attribute list.
  =item C<predicate>
  An indication which predicate the index implements. This may be
  e.g. 'plain', 'stemming' or 'soundex'. The indicator will be used for
  query processing. Currently there is no standard set of predicate
- names. The predicate defaults to the last member of the ppline if
+ names. The predicate defaults to the last member of the pipeline if
  omitted.
  =back
-Line 224 
 sub create_inverted_index {
+Line 269 
 sub create_inverted_index {
    croak "No pipeline specified"  unless $parm{pipeline};
    $parm{predicate} ||= $parm{pipeline}->[-1];
    croak "Cannot create index for table aready populated"
      if $self->{nextk} > 1;
    require WAIT::InvertedIndex;
    # backward compatibility stuff
-Line 235 
 sub create_inverted_index {
+Line 280 
 sub create_inverted_index {
    for (qw(attribute pipeline predicate)) {
      delete $opt{$_};
    }
    my $name = join '_', ($parm{attribute}, @{$parm{pipeline}});
    my $idx = new WAIT::InvertedIndex(file   => $self->{file}.'/'.$name,
                                      filter => [@{$parm{pipeline}}], # clone
-Line 276 
 Must be called via C<WAIT::Database::dro
+Line 321 
 Must be called via C<WAIT::Database::dro
  sub drop {
    my $self = shift;
+   unless ($self->{write_lock}){
+     warn "Cannot drop table without write lock. Nothing done";
+     return;
+   }
    if ((caller)[0] eq 'WAIT::Database') { # database knows about this
      $self->close;               # just make sure
      my $file = $self->{file};
      for (values %{$self->{indexes}}) {
        $_->drop;
      }
-     unlink "$file/records";
+     rmdir "$file.read" or warn "Could not rmdir '$file/read'";
-     ! (!-e $file or rmdir $file);
+     unlink "$file";
    } else {
      croak ref($self)."::drop called directly";
    }
-Line 324 
 sub open {
+Line 377 
 sub open {
      }
      require WAIT::InvertedIndex;
    }
+   $self->getlock($self->{mode});
+   my $dbmode = ($self->{mode} & O_CREAT) ? DB_CREATE : 0;
    unless (defined $self->{dbh}) {
      if ($USE_RECNO) {
-       $self->{dbh} = tie(@{$self->{db}}, 'DB_File', $file,
+       tie(%{$self->{db}}, 'BerkeleyDB::Recno',
-                          $self->{mode}, 0664, $DB_RECNO);
+           -Filename => $self->{file},
+           -Subname  => 'records',
+           -Flags    => $dbmode);
      } else {
        $self->{dbh} =
-         tie(%{$self->{db}}, 'DB_File', $file,
+         tie(%{$self->{db}}, 'BerkeleyDB::Btree',
-                          $self->{mode}, 0664, $DB_BTREE);
+             -Filename => $self->{file},
+             -Subname  => 'records',
+             -Mode     => 0664,
+             -Flags    => $dbmode);
      }
    }
    $self;
  }
  sub fetch_extern {
    my $self  = shift;
-   print "#@_", $self->{'access'}->{Mode}, "\n";
+   # print "#@_", $self->{'access'}->{Mode}, "\n"; # DEBUGGING?
    if (exists $self->{'access'}) {
      mrequire ref($self->{'access'});
      $self->{'access'}->FETCH(@_);
-Line 358 
 sub _find_index {
+Line 422 
 sub _find_index {
    my (@att) = @_;
    my %att;
    my $name;
    @att{@att} = @att;
    KEY: for $name (keys %{$self->{indexes}}) {
-Line 375 
 sub have {
+Line 439 
 sub have {
    my $self  = shift;
    my %parm  = @_;
-   my $index = $self->_find_index(keys %parm);
+   my $index = $self->_find_index(keys %parm) or return; # no index-no have
-   croak "No index found" unless $index;
    defined $self->{db} or $self->open;
    return $index->have(@_);
  }
-Line 387 
 sub insert {
+Line 451 
 sub insert {
    defined $self->{db} or $self->open;
+   # We should move all writing methods to a subclass to check only once
+   $self->{mode} & O_RDWR or croak "Cannot insert into table opened in RD_ONLY mode";
    my $tuple = join($;, map($parm{$_} || '', @{$self->{attr}}));
    my $key;
    my @deleted = keys %{$self->{deleted}};
+   my $gotkey = 0;
    if (@deleted) {
      $key = pop @deleted;
      delete $self->{deleted}->{$key};
+     # Sanity check
+     if ($key && $key>0) {
+       $gotkey=1;
    } else {
+       warn(sprintf("WAIT database inconsistency during insert ".
+                    "key[%s]: Please rebuild index\n",
+                    $key
+                   ));
+     }
+   }
+   unless ($gotkey) {
      $key = $self->{nextk}++;
    }
    if ($USE_RECNO) {
-Line 408 
 sub insert {
+Line 486 
 sub insert {
        if ($key == $self->{nextk}-1) {
          $self->{nextk}--;
        } else {
+         # warn "setting key[$key] deleted during insert";
          $self->{deleted}->{$key}=1;
        }
        my $idx;
-Line 416 
 sub insert {
+Line 495 
 sub insert {
          $idx->remove($key, %parm);
        }
        return undef;
      }
    }
    if (defined $self->{inverted}) {
      my $att;
-Line 432 
 sub insert {
+Line 511 
 sub insert {
  sub sync {
    my $self  = shift;
    for (values %{$self->{indexes}}) {
      map $_->sync, $_;
    }
-Line 449 
 sub fetch {
+Line 528 
 sub fetch {
    my $key   = shift;
    return () if exists $self->{deleted}->{$key};
    defined $self->{db} or $self->open;
    if ($USE_RECNO) {
      $self->unpack($self->{db}->[$key]);
-Line 462 
 sub delete_by_key {
+Line 541 
 sub delete_by_key {
    my $self  = shift;
    my $key   = shift;
+   unless ($key) {
+     Carp::cluck "Warning: delete_by_key called without key. Looks like a bug in WAIT?";
+     return;
+   }
    return $self->{deleted}->{$key} if defined $self->{deleted}->{$key};
    my %tuple = $self->fetch($key);
    for (values %{$self->{indexes}}) {
-Line 478 
 sub delete_by_key {
+Line 562 
 sub delete_by_key {
        }
      }
    }
+   # warn "setting key[$key] deleted during delete_by_key";
    ++$self->{deleted}->{$key};
  }
  sub delete {
    my $self  = shift;
    my $tkey = $self->have(@_);
+   # warn "tkey[$tkey]\@_[@_]";
    defined $tkey && $self->delete_by_key($tkey, @_);
  }
  sub unpack {
-   my $self = shift;
+   my($self, $tuple) = @_;
-   my $tuple = shift;
+   unless (defined $tuple){
+     # require Carp; # unfortunately gives us "bizarre copy...." :-(((((
+     warn("Debug: somebody called unpack without argument tuple!");
+     return;
+   }
    my $att;
    my @result;
-Line 502 
 sub unpack {
+Line 592 
 sub unpack {
    @result;
  }
+ sub set {
+   my ($self, $iattr, $value) = @_;
+   unless ($self->{write_lock}){
+     warn "Cannot set iattr[$iattr] without write lock. Nothing done";
+     return;
+   }
+   # In the rare case that no record has been written yet, we make
+   # sure the inverted indexes inherit our $self->{mode}:
+   defined $self->{db} or $self->open;
+   for my $att (keys %{$self->{inverted}}) {
+     if ($] > 5.003) {         # avoid bug in perl up to 5.003_05
+       my $idx;
+       for $idx (@{$self->{inverted}->{$att}}) {
+         $idx->set($iattr, $value);
+       }
+     } else {
+       map $_->set($iattr, $value), @{$self->{inverted}->{$att}};
+     }
+   }
+;
+ }
  sub close {
    my $self = shift;
    if (exists $self->{'access'}) {
      eval {$self->{'access'}->close}; # dont bother if not opened
    }
-   for (values %{$self->{indexes}}) {
+   if ($WAIT::Index::VERSION) {
-     $_->close();
+     for (values %{$self->{indexes}}) {
+       $_->close();
+     }
    }
-   if (defined $self->{inverted}) {
+   if (defined $self->{inverted} && $WAIT::InvertedIndex::VERSION) {
+     # require WAIT::InvertedIndex; Uli: we can avoid closing indexes:
+     # if WAIT::InvertedIndex has not been loaded, they cannot have
+     # been altered so far
      my $att;
      for $att (keys %{$self->{inverted}}) {
        if ($] > 5.003) {         # avoid bug in perl up to 5.003_05
-Line 535 
 sub close {
+Line 656 
 sub close {
      delete $self->{db};
    }
+   $self->unlock;
 ;
  }
+ # Locking
+ #
+ # We allow multiple readers to coexist.  But write access excludes
+ # all read access and vice versa.  In practice read access on tables
+ # open for writing will mostly work ;-)
+ # If a "write" lock is requested, an existing "read" lock will be
+ # released.  If a "read" lock is requested, an existing "write" lock
+ # will be released.  Requesting a lock already held has no effect.
+ sub getlock {
+   my ($self, $mode) = @_;
+   # autoclean cleans on DESTROY, stale sends SIGZERO to the owner
+   #
+   my $lockmgr = LockFile::Simple->make(-autoclean => 1, -stale => 1);
+   my $file    = $self->{file};
+   my $lockdir = $self->{file} . '.read';
+   unless (-d $lockdir) {
+     mkdir $lockdir, 0755 or die "Could not mkdir $lockdir: $!";
+   }
+   if ($mode & O_RDWR) {         # Get a write lock.  Release it again
+                                 # and die if there is any valid
+                                 # readers.
+     # Have a write lock already
+     return $self if $self->{write_lock};
+     if ($self->{read_lock}) {   # We are becoming a writer now. So
+                                 # we release the read lock to avoid
+                                 # blocking ourselves.
+       $self->{read_lock}->release;
+       delete $self->{read_lock};
+     }
+     # Get the preliminary write lock
+     $self->{write_lock} = $lockmgr->lock($self->{file} . '.write')
+       or die "Can't lock '$self->{file}.write'";
+     # If we actually want to write we must check if there are any
+     # readers.  The write lock is confirmed if we cannot find any
+     # valid readers.
+     local *DIR;
+     opendir DIR, $lockdir or
+       die "Could not opendir '$lockdir': $!";
+     for my $lockfile (grep { -f "$lockdir/$_" } readdir DIR) {
+       # Check if the locks are still valid.  Since we are protected by
+       # a write lock, we could use a plain file.  But we want to use
+       # the stale testing from LockFile::Simple.
+       if (my $lck = $lockmgr->trylock("$lockdir/$lockfile")) {
+         warn "Removing stale lockfile '$lockdir/$lockfile'";
+         $lck->release;
+       } else {                  # Found an active reader, rats!
+         $self->{write_lock}->release;
+         die "Cannot write table '$file' while it's in use";
+       }
+     }
+     closedir DIR;
+   } else {
+     # Have a read lock already
+     return $self if $self->{read_lock};
+     # Get the preliminary write lock to protect the directory
+     # operations.
+     my $write_lock = $lockmgr->lock($self->{file} . '.read/write')
+       or die "Can't lock '$self->{file}.read/write'";
+     # Find a new read slot.  Maybe the plain file would be better?
+     my $id = time;
+     while (-f "$lockdir/$id.lock") { # here assume ".lock" format!
+       $id++;
+     }
+     $self->{read_lock} = $lockmgr->lock("$lockdir/$id")
+       or die "Can't lock '$lockdir/$id'";
+     # We are a reader now. So we release the write lock
+     $write_lock->release;
+   }
+   return $self;
+ }
+ sub unlock {
+   my $self = shift;
+   # Either we have a read or a write lock (or we closed the table already)
+   # unless ($self->{read_lock} || $self->{write_lock}) {
+   #   warn "WAIT::Table::unlock: Table aparently hold's no lock"
+   # }
+   if ($self->{write_lock}) {
+     $self->{write_lock}->release();
+     delete $self->{write_lock};
+   }
+   if ($self->{read_lock}) {
+     $self->{read_lock}->release();
+     delete $self->{read_lock};
+   }
+ }
+ sub DESTROY {
+   my $self = shift;
+   if ($self->{write_lock} || $self->{read_lock}) {
+     warn "Table handle destroyed without closing it first";
+     $self->unlock;
+   }
+ }
  sub open_scan {
    my $self = shift;
    my $code = shift;
-Line 593 
 sub intervall {
+Line 829 
 sub intervall {
  }
  sub search {
-   my $self = shift;
+   my $self  = shift;
-   my $attr = shift;
+   my ($query, $attr, $cont, $raw);
-   my $cont = shift;
+   if (ref $_[0]) {
-   my $raw  = shift;
+     $query = shift;
+     $attr = $query->{attr};
+     $cont = $query->{cont};
+     $raw  = $query->{raw};
+   } else {
+     require Carp;
+     Carp::cluck("Using three argument search interface is deprecated, use hashref interface instead");
+     $attr = shift;
+     $cont = shift;
+     $raw  = shift;
+     $query = {
+               attr => $attr,
+               cont => $cont,
+               raw  => $raw,
+              };
+   }
    my %result;
    defined $self->{db} or $self->open; # require layout
-Line 606 
 sub search {
+Line 859 
 sub search {
        my $name = $_->name;
        if (exists $raw->{$name} and @{$raw->{$name}}) {
          my $scale = 1/scalar(@{$raw->{$name}});
-         my %r = $_->search_raw(@{$raw->{$name}});
+         my %r = $_->search_raw($query, @{$raw->{$name}});
          my ($key, $val);
          while (($key, $val) = each %r) {
            if (exists $result{$key}) {
-Line 620 
 sub search {
+Line 873 
 sub search {
    }
    if (defined $cont and $cont ne '') {
      for (@{$self->{inverted}->{$attr}}) {
-       my %r = $_->search($cont);
+       my %r = $_->search($query, $cont);
        my ($key, $val);
        while (($key, $val) = each %r) {
          if (exists $result{$key}) {
-Line 644 
 sub hilight_positions {
+Line 897 
 sub hilight_positions {
    my %pos;
    if (defined $raw) {
-     for (@{$self->{inverted}->{$attr}}) {
+     for (@{$self->{inverted}->{$attr}}) { # objects of type
+                                           # WAIT::InvertedIndex for
+                                           # this index field $attr
        my $name = $_->name;
        if (exists $raw->{$name}) {
          my %qt;
-Line 678 
 sub hilight_positions {
+Line 933 
 sub hilight_positions {
  }
  sub hilight {
-   my ($tb, $text, $query, $raw) = @_;
+   my ($tb, $buf, $qplain, $qraw) = @_;
-   my $type = $tb->layout();
+   my $layout = $tb->layout();
    my @result;
-   $query ||= {};
+   $qplain ||= {};
-   $raw   ||= {};
+   $qraw   ||= {};
-   my @ttxt = $type->tag($text);
+   my @ttxt = $layout->tag($buf);
    while (@ttxt) {
      no strict 'refs';
      my %tag = %{shift @ttxt};
-Line 692 
 sub hilight {
+Line 948 
 sub hilight {
      my $fld;
      my %hl;
-     for $fld (grep defined $tag{$_}, keys %$query, keys %$raw) {
+     for $fld (grep defined $tag{$_}, keys %$qplain, keys %$qraw) {
        my $hp = $tb->hilight_positions($fld, $txt,
-                                       $query->{$fld}, $raw->{$fld});
+                                       $qplain->{$fld}, $qraw->{$fld});
        for (keys %$hp) {
          if (exists $hl{$_}) {   # -w ;-(
            $hl{$_} = max($hl{$_}, $hp->{$_});
-Line 720 
 sub hilight {
+Line 976 
 sub hilight {
  }
 ;

 Legend:



Removed from v.10
 


changed lines


 
Added in v.85
 Legend:



Removed from v.10
 


changed lines


 
Added in v.85
-Removed from v.10
+Added in v.85

	ViewVC Help
Powered by ViewVC 1.1.26