/[wait]/trunk/lib/WAIT/Table.pm

This is a repository of my old source code, which isn't updated any more. Go to git.rot13.org for current projects!

Diff of /trunk/lib/WAIT/Table.pm

Parent Directory | Revision Log | View Patch Patch

-cvs-head/lib/WAIT/Table.pm
revision 10 by ulpfr,
Fri Apr 28 15:40:52 2000 UTC
+branches/CPAN/lib/WAIT/Table.pm
revision 19 by ulpfr,
Tue May  9 11:29:45 2000 UTC
 Line 1
- #                              -*- Mode: Perl -*-
+ #                              -*- Mode: Cperl -*-
  # Table.pm --
  # ITIID           : $ITI$ $Header $__Header$
  # Author          : Ulrich Pfeifer
  # Created On      : Thu Aug  8 13:05:10 1996
  # Last Modified By: Ulrich Pfeifer
- # Last Modified On: Sun Nov 22 18:44:37 1998
+ # Last Modified On: Mon May  8 20:20:58 2000
  # Language        : CPerl
- # Update Count    : 51
+ # Update Count    : 131
  # Status          : Unknown, Use with caution!
  #
  # Copyright (c) 1996-1997, Ulrich Pfeifer
  #
  =head1 NAME
 Line 25 
 WAIT::Table -- Module for maintaining Ta
  =cut
  package WAIT::Table;
+ use WAIT::Table::Handle ();
  require WAIT::Parse::Base;
  use strict;
  use Carp;
+ # use autouse Carp => qw( croak($) );
  use DB_File;
  use Fcntl;
+ use LockFile::Simple ();
  my $USE_RECNO = 0;
  =head2 Creating a Table.
- The constructor WAIT::Table-<gt>new is normally called via the
+ The constructor WAIT::Table-E<gt>new is normally called via the
  create_table method of a database handle. This is not enforced, but
- creating a table doesn not make any sense unless the table is
+ creating a table does not make any sense unless the table is
  registered by the database because the latter implements persistence
  of the meta data. Registering is done automatically by letting the
- database handle create a table.
+ database handle the creation of a table.
-   my $db = create WAIT::Database name => 'sample';
+   my $db = WAIT::Database->create(name => 'sample');
-   my $tb = $db->create_table (name     => 'test',
+   my $tb = $db->create_table(name     => 'test',
-                               attr     => ['docid', 'headline'],
+                              access   => $access,
-                               layout   => $layout,
+                              layout   => $layout,
-                               access   => $access,
+                              attr     => ['docid', 'headline'],
-                              );
+                             );
  The constructor returns a handle for the table. This handle is hidden by the
  table module, to prevent direct access if called via Table.
  =over 10
- =item C<access> => I<accesobj>
+ =item C<access> => I<accessobj>
- A reference to a acces object for the external parts (attributes) of
+ A reference to an access object for the external parts (attributes) of
  tuples. As you may remember, the WAIT System does not enforce that
  objects are completely stored inside the system to avoid duplication.
- There is no (strong) point in storing all you HTML-Documents inside
+ There is no (strong) point in storing all your HTML documents inside
  the system when indexing your WWW-Server.
+ The access object is designed to work like a tied hash. You pass
+ the reference to the object, not the tied hash though. An example
+ implementation of an access class that works for manpages is
+ WAIT::Document::Nroff.
+ The implementation needs to take into account that WAIT will keep this
+ object in a Data::Dumper or Storable database and re-use it when sman
+ is run. So it is not good enough if we can produce the index with it
+ now, when we create or actively access the table, WAIT also must be
+ able to retrieve documents on its own, when we are in a different
+ context. This happens specifically in a retrieval. To get this working
+ seamlessly, the access-defining class must implement a close method.
+ This method will be called before the Data::Dumper dump takes place.
+ In that moment the access-defining class must get rid of all data
+ structures that cannot be reconstructed via the Data::Dumper dump,
+ such as database handles or C pointers.
  =item C<file> => I<fname>
  The filename of the records file. Files for indexes will have I<fname>
- as prefix. I<Mandatory>
+ as prefix. I<Mandatory>, but usually taken care of by the
+ WAIT::Database handle when the constructor is called via
+ WAIT::Database::create_table().
  =item C<name> => I<name>
-Line 73 
 The name of this table. I<Mandatory>
+Line 97 
 The name of this table. I<Mandatory>
  =item C<attr> => [ I<attr> ... ]
- A reference to an array of attribute names. I<Mandatory>
+ A reference to an array of attribute names. WAIT will keep the
+ contents of these attributes in its table. I<Mandatory>
  =item C<djk> => [ I<attr> ... ]
  A reference to an array of attribute names which make up the
- I<disjointness key>. Don't think about it - i's of no use yet;
+ I<disjointness key>. Don't think about it - it's of no use yet;
  =item C<layout> => I<layoutobj>
- A reference to an external parser object. Defaults to anew instance of
+ A reference to an external parser object. Defaults to a new instance
- C<WAIT::Parse::Base>
+ of C<WAIT::Parse::Base>. For an example implementation see
+ WAIT::Parse::Nroff. A layout class can be implemented as a singleton
+ class if you so like.
- =item C<access> => I<accesobj>
+ =item C<keyset> => I<keyset>
- A reference to a acces object for the external parts of tuples.
+ The set of attributes needed to identify a record. Defaults to all
+ attributes.
+ =item C<invindex> => I<inverted index>
+ A reference to an anon array defining attributes of each record that
+ need to be indexed. See the source of smakewhatis for how to set this
+ up.
  =back
-Line 98 
 sub new {
+Line 132 
 sub new {
    my %parm = @_;
    my $self = {};
+   # Check for mandatory attrs early
+   $self->{name}     = $parm{name}     or croak "No name specified";
+   $self->{attr}     = $parm{attr}     or croak "No attributes specified";
    # Do that before we eventually add '_weight' to attributes.
    $self->{keyset}   = $parm{keyset}   || [[@{$parm{attr}}]];
    $self->{mode}     = O_CREAT | O_RDWR;
    # Determine and set up subclass
    $type = ref($type) || $type;
    if (defined $parm{djk}) {
-Line 119 
 sub new {
+Line 159 
 sub new {
    }
    $self->{file}     = $parm{file}     or croak "No file specified";
-   if (-d  $self->{file} or !mkdir($self->{file}, 0775)) {
+   if (-d  $self->{file}){
+     warn "Warning: Directory '$self->{file}' already exists\n";
+   } elsif (!mkdir($self->{file}, 0775)) {
      croak "Could not 'mkdir $self->{file}': $!\n";
    }
-   $self->{name}     = $parm{name}     or croak "No name specified";
-   $self->{attr}     = $parm{attr}     or croak "No attributes specified";
+   my $lockmgr = LockFile::Simple->make(-autoclean => 1);
+   # acquire a write lock
+   $self->{write_lock} = $lockmgr->lock($self->{file} . '/write')
+     or die "Can't lock '$self->{file}/write'";
    $self->{djk}      = $parm{djk}      if defined $parm{djk};
    $self->{layout}   = $parm{layout} || new WAIT::Parse::Base;
    $self->{access}   = $parm{access} if defined $parm{access};
-Line 142 
 sub new {
+Line 188 
 sub new {
      my $att  = shift @{$parm{invindex}};
      my @spec = @{shift @{$parm{invindex}}};
      my @opt;
      if (ref($spec[0])) {
        carp "Secondary pipelines are deprecated\n";
        @opt = %{shift @spec};
      }
      $self->create_inverted_index(attribute => $att, pipeline  => \@spec, @opt);
    }
    $self;
    # end of backward compatibility stuff
  }
-Line 168 
 table!
+Line 215 
 table!
  sub create_index {
    my $self= shift;
    croak "Cannot create index for table aready populated"
      if $self->{nextk} > 1;
    require WAIT::Index;
    my $name = join '-', @_;
    $self->{indexes}->{$name} =
      new WAIT::Index file => $self->{file}.'/'.$name, attr => $_;
-Line 196 
 set attributes specified when the table
+Line 243 
 set attributes specified when the table
  =item C<pipeline>
- A piplines specification is a reference to and array of method names
+ A pipeline specification is a reference to an array of method names
- (from package C<WAIT::Filter>) which are to applied in sequence to the
+ (from package C<WAIT::Filter>) which are to be applied in sequence to
- contents of the named attribute. The attribute name may not be in the
+ the contents of the named attribute. The attribute name may not be in
- attribute list.
+ the attribute list.
  =item C<predicate>
  An indication which predicate the index implements. This may be
  e.g. 'plain', 'stemming' or 'soundex'. The indicator will be used for
  query processing. Currently there is no standard set of predicate
- names. The predicate defaults to the last member of the ppline if
+ names. The predicate defaults to the last member of the pipeline if
  omitted.
  =back
-Line 224 
 sub create_inverted_index {
+Line 271 
 sub create_inverted_index {
    croak "No pipeline specified"  unless $parm{pipeline};
    $parm{predicate} ||= $parm{pipeline}->[-1];
    croak "Cannot create index for table aready populated"
      if $self->{nextk} > 1;
    require WAIT::InvertedIndex;
    # backward compatibility stuff
-Line 235 
 sub create_inverted_index {
+Line 282 
 sub create_inverted_index {
    for (qw(attribute pipeline predicate)) {
      delete $opt{$_};
    }
    my $name = join '_', ($parm{attribute}, @{$parm{pipeline}});
    my $idx = new WAIT::InvertedIndex(file   => $self->{file}.'/'.$name,
                                      filter => [@{$parm{pipeline}}], # clone
-Line 284 
 sub drop {
+Line 331 
 sub drop {
        $_->drop;
      }
      unlink "$file/records";
+     # $self->unlock;
      ! (!-e $file or rmdir $file);
    } else {
      croak ref($self)."::drop called directly";
-Line 334 
 sub open {
+Line 382 
 sub open {
                           $self->{mode}, 0664, $DB_BTREE);
      }
    }
+   # Locking
+   #
+   # We allow multiple readers to coexist.  But write access excludes
+   # all read access and vice versa.  In practice read access on tables
+   # open for writing will mostly work ;-)
+   my $lockmgr = LockFile::Simple->make(-autoclean => 1);
+   # acquire a write lock. We might hold one acquired in create() already
+   $self->{write_lock} ||= $lockmgr->lock($self->{file} . '/write')
+     or die "Can't lock '$self->{file}/write'";
+   my $lockdir = $self->{file} . '/read';
+   unless (-d $lockdir) {
+     mkdir $lockdir, 0755 or die "Could not mkdir $lockdir: $!";
+   }
+   if ($self->{mode} & O_RDWR) {
+     # this is a hack.  We do not check for reopening ...
+     return $self if $self->{write_lock};
+     # If we actually want to write we must check if there are any readers
+     opendir DIR, $lockdir or
+       die "Could not opendir '$lockdir': $!";
+     for my $lockfile (grep { -f "$lockdir/$_" } readdir DIR) {
+       # check if the locks are still valid.
+       # Since we are protected by a write lock, we could use a plain file.
+       # But we want to use the stale testing from LockFile::Simple.
+       if (my $lck = $lockmgr->trylock("$lockdir/$lockfile")) {
+         warn "Removing stale lockfile '$lockdir/$lockfile'";
+         $lck->release;
+       } else {
+         $self->{write_lock}->release;
+         die "Cannot write table '$file' while it's in use";
+       }
+     }
+   } else {
+     # this is a hack.  We do not check for reopening ...
+     return $self if $self->{read_lock};
+     # We are a reader. So we release the write lock
+     my $id = time;
+     while (-f "$lockdir/$id.lock") { # here assume ".lock" format!
+       $id++;
+     }
+     $self->{read_lock} = $lockmgr->lock("$lockdir/$id");
+     $self->{write_lock}->release;
+     delete $self->{write_lock};
+   }
    $self;
  }
  sub fetch_extern {
    my $self  = shift;
-   print "#@_", $self->{'access'}->{Mode}, "\n";
+   # print "#@_", $self->{'access'}->{Mode}, "\n"; # DEBUGGING?
    if (exists $self->{'access'}) {
      mrequire ref($self->{'access'});
      $self->{'access'}->FETCH(@_);
-Line 358 
 sub _find_index {
+Line 457 
 sub _find_index {
    my (@att) = @_;
    my %att;
    my $name;
    @att{@att} = @att;
    KEY: for $name (keys %{$self->{indexes}}) {
-Line 375 
 sub have {
+Line 474 
 sub have {
    my $self  = shift;
    my %parm  = @_;
-   my $index = $self->_find_index(keys %parm);
+   my $index = $self->_find_index(keys %parm) or return; # no index-no have
-   croak "No index found" unless $index;
    defined $self->{db} or $self->open;
    return $index->have(@_);
  }
-Line 387 
 sub insert {
+Line 486 
 sub insert {
    defined $self->{db} or $self->open;
+   # We should move all writing methods to a subclass to check only once
+   $self->{mode} & O_RDWR or croak "Cannot insert into table opened in RD_ONLY mode";
    my $tuple = join($;, map($parm{$_} || '', @{$self->{attr}}));
    my $key;
    my @deleted = keys %{$self->{deleted}};
+   my $gotkey = 0;
    if (@deleted) {
      $key = pop @deleted;
      delete $self->{deleted}->{$key};
+     # Sanity check
+     if ($key && $key>0) {
+       $gotkey=1;
    } else {
+       warn(sprintf("WAIT database inconsistency during insert ".
+                    "key[%s]: Please rebuild index\n",
+                    $key
+                   ));
+     }
+   }
+   unless ($gotkey) {
      $key = $self->{nextk}++;
    }
    if ($USE_RECNO) {
-Line 408 
 sub insert {
+Line 521 
 sub insert {
        if ($key == $self->{nextk}-1) {
          $self->{nextk}--;
        } else {
+         # warn "setting key[$key] deleted during insert";
          $self->{deleted}->{$key}=1;
        }
        my $idx;
-Line 416 
 sub insert {
+Line 530 
 sub insert {
          $idx->remove($key, %parm);
        }
        return undef;
      }
    }
    if (defined $self->{inverted}) {
      my $att;
-Line 432 
 sub insert {
+Line 546 
 sub insert {
  sub sync {
    my $self  = shift;
    for (values %{$self->{indexes}}) {
      map $_->sync, $_;
    }
-Line 449 
 sub fetch {
+Line 563 
 sub fetch {
    my $key   = shift;
    return () if exists $self->{deleted}->{$key};
    defined $self->{db} or $self->open;
    if ($USE_RECNO) {
      $self->unpack($self->{db}->[$key]);
-Line 462 
 sub delete_by_key {
+Line 576 
 sub delete_by_key {
    my $self  = shift;
    my $key   = shift;
+   unless ($key) {
+     Carp::cluck "Warning: delete_by_key called without key. Looks like a bug in WAIT?";
+     return;
+   }
    return $self->{deleted}->{$key} if defined $self->{deleted}->{$key};
    my %tuple = $self->fetch($key);
    for (values %{$self->{indexes}}) {
-Line 478 
 sub delete_by_key {
+Line 597 
 sub delete_by_key {
        }
      }
    }
+   # warn "setting key[$key] deleted during delete_by_key";
    ++$self->{deleted}->{$key};
  }
  sub delete {
    my $self  = shift;
    my $tkey = $self->have(@_);
+   # warn "tkey[$tkey]\@_[@_]";
    defined $tkey && $self->delete_by_key($tkey, @_);
  }
  sub unpack {
    my $self = shift;
    my $tuple = shift;
+   return unless defined $tuple;
    my $att;
    my @result;
-Line 502 
 sub unpack {
+Line 623 
 sub unpack {
    @result;
  }
+ sub set {
+   my ($self, $iattr, $value) = @_;
+   return unless $self->{write_lock};
+   for my $att (keys %{$self->{inverted}}) {
+     if ($] > 5.003) {         # avoid bug in perl up to 5.003_05
+       my $idx;
+       for $idx (@{$self->{inverted}->{$att}}) {
+         $idx->set($iattr, $value);
+       }
+     } else {
+       map $_->set($iattr, $value), @{$self->{inverted}->{$att}};
+     }
+   }
+;
+ }
  sub close {
    my $self = shift;
-Line 509 
 sub close {
+Line 648 
 sub close {
      eval {$self->{'access'}->close}; # dont bother if not opened
    }
    for (values %{$self->{indexes}}) {
+     require WAIT::Index;
      $_->close();
    }
    if (defined $self->{inverted}) {
-Line 535 
 sub close {
+Line 675 
 sub close {
      delete $self->{db};
    }
+   $self->unlock;
 ;
  }
+ sub unlock {
+   my $self = shift;
+   # Either we have a read or a write lock (or we closed the table already)
+   # unless ($self->{read_lock} || $self->{write_lock}) {
+   #   warn "WAIT::Table::unlock: Table aparently hold's no lock"
+   # }
+   if ($self->{write_lock}) {
+     $self->{write_lock}->release();
+     delete $self->{write_lock};
+   }
+   if ($self->{read_lock}) {
+     $self->{read_lock}->release();
+     delete $self->{read_lock};
+   }
+ }
+ sub DESTROY {
+   my $self = shift;
+   warn "Table handle destroyed without closing it first"
+     if $self->{write_lock} || $self->{read_lock};
+ }
  sub open_scan {
    my $self = shift;
    my $code = shift;
-Line 593 
 sub intervall {
+Line 760 
 sub intervall {
  }
  sub search {
-   my $self = shift;
+   my $self  = shift;
-   my $attr = shift;
+   my ($query, $attr, $cont, $raw);
-   my $cont = shift;
+   if (ref $_[0]) {
-   my $raw  = shift;
+     $query = shift;
+     $attr = $query->{attr};
+     $cont = $query->{cont};
+     $raw  = $query->{raw};
+   } else {
+     require Carp;
+     Carp::cluck("Using three argument search interface is deprecated, use hashref interface instead");
+     $attr = shift;
+     $cont = shift;
+     $raw  = shift;
+     $query = {
+               attr => $attr,
+               cont => $cont,
+               raw  => $raw,
+              };
+   }
    my %result;
    defined $self->{db} or $self->open; # require layout
-Line 606 
 sub search {
+Line 790 
 sub search {
        my $name = $_->name;
        if (exists $raw->{$name} and @{$raw->{$name}}) {
          my $scale = 1/scalar(@{$raw->{$name}});
-         my %r = $_->search_raw(@{$raw->{$name}});
+         my %r = $_->search_raw($query, @{$raw->{$name}});
          my ($key, $val);
          while (($key, $val) = each %r) {
            if (exists $result{$key}) {
-Line 620 
 sub search {
+Line 804 
 sub search {
    }
    if (defined $cont and $cont ne '') {
      for (@{$self->{inverted}->{$attr}}) {
-       my %r = $_->search($cont);
+       my %r = $_->search($query, $cont);
        my ($key, $val);
        while (($key, $val) = each %r) {
          if (exists $result{$key}) {
-Line 644 
 sub hilight_positions {
+Line 828 
 sub hilight_positions {
    my %pos;
    if (defined $raw) {
-     for (@{$self->{inverted}->{$attr}}) {
+     for (@{$self->{inverted}->{$attr}}) { # objects of type
+                                           # WAIT::InvertedIndex for
+                                           # this index field $attr
        my $name = $_->name;
        if (exists $raw->{$name}) {
          my %qt;
-Line 678 
 sub hilight_positions {
+Line 864 
 sub hilight_positions {
  }
  sub hilight {
-   my ($tb, $text, $query, $raw) = @_;
+   my ($tb, $buf, $qplain, $qraw) = @_;
-   my $type = $tb->layout();
+   my $layout = $tb->layout();
    my @result;
-   $query ||= {};
+   $qplain ||= {};
-   $raw   ||= {};
+   $qraw   ||= {};
-   my @ttxt = $type->tag($text);
+   my @ttxt = $layout->tag($buf);
    while (@ttxt) {
      no strict 'refs';
      my %tag = %{shift @ttxt};
-Line 692 
 sub hilight {
+Line 879 
 sub hilight {
      my $fld;
      my %hl;
-     for $fld (grep defined $tag{$_}, keys %$query, keys %$raw) {
+     for $fld (grep defined $tag{$_}, keys %$qplain, keys %$qraw) {
        my $hp = $tb->hilight_positions($fld, $txt,
-                                       $query->{$fld}, $raw->{$fld});
+                                       $qplain->{$fld}, $qraw->{$fld});
        for (keys %$hp) {
          if (exists $hl{$_}) {   # -w ;-(
            $hl{$_} = max($hl{$_}, $hp->{$_});
-Line 720 
 sub hilight {
+Line 907 
 sub hilight {
  }
 ;

 Legend:



Removed from v.10
 


changed lines


 
Added in v.19
 Legend:



Removed from v.10
 


changed lines


 
Added in v.19
-Removed from v.10
+Added in v.19

	ViewVC Help
Powered by ViewVC 1.1.26