/[pgestraier]/trunk/bin/pgest-index.pl

This is a repository of my old source code, which isn't updated any more. Go to git.rot13.org for current projects!

Diff of /trunk/bin/pgest-index.pl

Parent Directory | Revision Log | View Patch Patch

-revision 62 by dpavlin,
Mon Aug  7 13:24:49 2006 UTC
+revision 88 by dpavlin,
Sun Mar 16 20:55:56 2008 UTC
 Line 1
  #!/usr/bin/perl -w
  use strict;
- use Search::Estraier 0.06;
+ use Search::Estraier 0.07;
  use DBI;
  use Data::Dumper;
  use Encode qw/from_to/;
 Line 15 
 pgest-index.pl - create full-text index
  =cut
  my $c = {
-         name => 'imenik',
+         debug => 0,
-         node_url => 'http://localhost:1978/node/imenik',
-         dbi => 'Pg:dbname=vip',
-         sql => qq{
-                 select ime,tel from imenik
-         },
-         pk_col => 'tel',
-         db_encoding => 'iso-8859-2',
-         debug => 1,
-         user => 'admin',
-         passwd => 'admin',
  };
- GetOptions($c, qw/node_url=s sql=s pk_col=s eb_encoding=s debug+ user=s passwd=s/);
+ =head1 SYNOPSIS
+   pgest-index.pl --create movies --sql "select id,title,year from movies"
+   pgest-index.pl --drop movies
+ Options:
+ =over 4
+ =item --create name
+ Create index C<name> and create triggers on table with same name
+ =item --drop name
+ Remove triggers from table C<name> and node with same name
+ =item --node_url http://localhost:1978/node/name
+ Full URI to node. If it's not specified, it's assumed that you are using
+ Hyper Estraier on C<http://localhost:1978/>.
+ =item --sql "select col1,col2 from name"
+ SQL query which will return names of columns which are included in the full-text
+ index. Keep in mind that you can't use aliases (as I<something>) in this SQL
+ query (or triggers will be created with wrong fields).
+ If an SQL query isn't specified, a default one (e.g. C<< select * from movies >>
+ for table C<movies>) will be used. That will be a B<serious performance hit>
+ if not all columns are needed for search.
+ =item --pk id
+ Specify name of primary key column in SQL query. If you already have a primary
+ key on the table or a unique index and it consists of a single column
+ (compound keys are not supported) it will be picked up automatically.
+ If you specify a value which is not unique, you will get just the last occurrence
+ of that item in the index (which might be what you want). That's because the specified
+ C<pk> column will be used for C<@uri> in Hyper Estraier.
+ If the name of the primary key begins with C<_> it will not be added into text
+ indexing (so you won't be able to find the primary key value, but it will still
+ be available as an attribute value).
+ =item --user login
+ =item --passwd password
+ Username and password to use when connecting to Hyper Estraier. If not specified,
+ C<admin> and C<admin> will be used.
+ =item --debug
+ Dump debugging output. It may be specified multiple times for more verbose
+ debugging.
+ =back
+ =cut
+ my $usage = "$0 database_name (--create|--drop) table_name [--sql='select id,foo,bar from table'] [--pk=id]\n";
+ GetOptions($c, qw/create=s drop=s node_url=s sql=s pk=s user=s passwd=s debug+/);
+ my $dbname = shift @ARGV || die $usage;
+ $c->{dbi} = 'Pg:dbname=' . $dbname;
  warn "# c: ", Dumper($c) if ($c->{debug});
+ my $table = $c->{create} || $c->{drop} || die $usage;
+ $c->{node_url} = 'http://localhost:1978/node/' . $table;
+ $c->{user} ||= 'admin';
+ $c->{passwd} ||= 'admin';
  # create and configure node
  my $node = new Search::Estraier::Node(
          url => $c->{node_url},
-Line 45 
 my $node = new Search::Estraier::Node(
+Line 111 
 my $node = new Search::Estraier::Node(
  # create DBI connection
  my $dbh = DBI->connect("DBI:$c->{dbi}","","") || die $DBI::errstr;
+ # drop existing triggers
+ # Drop every trigger whose name matches pgest_trigger_% on the given table.
+ #
+ # Arguments: name of the table (required; dies with "no table?" if missing).
+ # Uses the file-level $dbh handle. Dies with the DBI error string on any
+ # database failure instead of silently discarding it.
+ # Warns when the number of triggers found is not the expected 3, and
+ # reports how many were removed when at least one was found.
+ sub drop_triggers {
+         my $table = shift || die "no table?";
+         # pg_trigger.tgrelid references pg_class.oid; relfilenode is the on-disk
+         # file number and may differ from oid after the table has been rewritten.
+         # Rows are fetched up front so the count does not rely on $sth->rows,
+         # which DBI does not guarantee for SELECT before all rows are fetched.
+         my $triggers = $dbh->selectall_arrayref(qq{
+                 SELECT relname,tgname
+                 FROM pg_trigger JOIN pg_class ON pg_class.oid = tgrelid
+                 WHERE tgname LIKE 'pgest_trigger_%' AND relname = ?
+         }, { Slice => {} }, $table) || die $dbh->errstr;
+         my $count = scalar @{ $triggers };
+         warn "there are ", $count, " triggers instead of just 3, dropping all\n" if ($count != 3);
+         foreach my $row (@{ $triggers }) {
+                 # quote identifiers so mixed-case or unusual names are dropped correctly
+                 my $sql = sprintf(qq{ DROP TRIGGER %s ON %s },
+                         $dbh->quote_identifier( $row->{tgname} ),
+                         $dbh->quote_identifier( $row->{relname} ),
+                 );
+                 #warn "# $sql\n";
+                 $dbh->do( $sql ) || die $dbh->errstr;
+         }
+         warn "removed ", $count, " triggers from $table\n" if ($count);
+ }
+ if ($c->{drop}) {
+         drop_triggers( $table );
+         warn "removing node $table\n";
+         $node->master(
+                 action => 'nodedel',
+                 name => $table,
+         );
+         exit;
+ }
+ # clear existing node
+ $node->master(
+         action => 'nodeclr',
+         name => $table,
+ );
+ # create PostgreSQL functions
+ $dbh->do(qq{
+ CREATE OR REPLACE FUNCTION pgest(text, text, text, int, text, text, text, int, int, text[])
+         RETURNS setof record
+         AS 'pgest','pgest_node'
+         LANGUAGE 'C' IMMUTABLE CALLED ON NULL INPUT;
+ CREATE OR REPLACE FUNCTION pgest_trigger() RETURNS TRIGGER
+         AS 'pgest', 'pgest_trigger'
+         LANGUAGE 'C' STRICT;
+ }) || die $dbh->errstr();
+ drop_triggers( $table );
+ if (! $c->{pk}) {
+         warn "# finding primary key for $table\n" if ($c->{debug});
+         my $index_fmt = qq{
+                 SELECT
+                         a.attname, t.typname
+                 FROM pg_type t, pg_attribute a
+                 WHERE t.oid = a.atttypid AND attrelid = (
+                         SELECT indexrelid
+                         FROM pg_class c, pg_index i
+                         WHERE c.relname = '%s'
+                                 AND c.oid = i.indrelid
+                                 AND %s
+                                 AND indnatts = 1
+                 )
+         };
+         $c->{pk} = $dbh->selectrow_array( sprintf($index_fmt, $table, 'indisprimary') );
+         $c->{pk} ||= $dbh->selectrow_array( sprintf($index_fmt, $table, 'indisunique') );
+ }
+ die "$0: can't find single column primary key for table ${table}. Please specify column with --pk\n" unless ($c->{pk});
+ warn "using primary key $c->{pk}", $c->{pk} =~ m/^_/ ? " (not indexed)" : "", "\n";
+ $dbh->begin_work;
+ $c->{sql} ||= "select * from $table";
  my $sth = $dbh->prepare($c->{sql}) || die $dbh->errstr();
- $sth->execute() || die $sth->errstr();
+ $sth->execute() || die $sth->errstr;
  my @cols = @{ $sth->{NAME} };
+ die "SQL '$c->{sql}' didn't include primary key $c->{pk}\n" unless grep(/^\Q$c->{pk}\E$/, @cols);
  warn "# columns: ",join(",", @cols),"\n" if ($c->{debug});
  my $total = $sth->rows;
  my $i = 1;
  my $t = time();
- my $pk_col = $c->{pk_col} || 'id';
+ my $pk = $c->{pk} || 'id';
+ warn "indexing existing ",$sth->rows," rows\n";
  while (my $row = $sth->fetchrow_hashref() ) {
-Line 65 
 while (my $row = $sth->fetchrow_hashref(
+Line 224 
 while (my $row = $sth->fetchrow_hashref(
          # create document
          my $doc = new Search::Estraier::Document;
-         if (my $id = $row->{$pk_col}) {
+         if (my $id = $row->{$pk}) {
                  $doc->add_attr('@uri', $id);
          } else {
-                 die "can't find pk_col column '$pk_col' in results\n";
+                 die "can't find pk column '$pk' in results\n";
          }
-         printf "%4d ",$i;
+         my $log = sprintf "%4d ",$i;
          while (my ($col,$val) = each %{$row}) {
                  if ($val) {
-                         # change encoding?
-                         from_to($val, ($c->{db_encoding} || 'ISO-8859-1'), 'UTF-8');
                          # add attributes (make column usable from attribute search)
                          $doc->add_attr($col, $val);
                          # add body text to document (make it searchable using full-text index)
-                         $doc->add_text($val);
+                         $doc->add_text($val) unless ($col =~ m/^_/);
-                         print "R";
+                         $log .= "R";
                  } else {
-                         print ".";
+                         $log .= ".";
                  }
          }
-Line 96 
 while (my $row = $sth->fetchrow_hashref(
+Line 252 
 while (my $row = $sth->fetchrow_hashref(
          die "error: ", $node->status,"\n" unless (eval { $node->put_doc($doc) });
-         printf (" %d%% %.1f/s\n", int(( $i++ / $total) * 100), ( $i / (time() - $t) ) );
+         $log .= sprintf(" %d%% %.1f/s\r", int(( $i++ / $total) * 100), ( $i / (time() - $t) ) );
- }
+         print STDERR $log;
- my $table = $c->{name} || die "no name?";
+ }
- my $cols = "'" . join("','", @cols) . "'";
+ my $cols = "'" . join("', '", @cols) . "'";
  foreach my $t (qw/UPDATE INSERT DELETE/) {
          my $lc_t = lc($t);
-         $dbh->do(qq{ DROP TRIGGER pgest_trigger_${lc_t} ON ${table} });
          my $sql = qq{
                  CREATE TRIGGER pgest_trigger_${lc_t} AFTER ${t}
                          ON ${table} FOR EACH ROW
                          EXECUTE PROCEDURE pgest_trigger('$c->{node_url}','$c->{user}','$c->{passwd}',
-                                 '$c->{pk_col}', $cols
+                                 '$c->{pk}', $cols
                          )
          };
-         warn "$sql\n";
+         #warn "$sql\n";
-         $dbh->do( $sql ) || die $dbh->errstr();
+         $dbh->do( $sql ) || die $dbh->errstr;
  }
+ warn "created consistency triggers\n";
+ $dbh->commit;
+ =head1 SEARCHING
+ At the end of each run, this script will output an example search SQL query on STDOUT.
+ You can use it to quickly construct queries for your application.
+ =cut
+ my $col_names = join(', ', @cols);
+ my $col_def = join(', ', map { "$_ text" } @cols);
+ print "
+ -- example SQL search query:
+ SELECT $col_names
+ FROM pgest(
+         -- node, login, passwd, depth
+         '$c->{node_url}', '$c->{user}', '$c->{passwd}', 0,
+         -- full text search
+         'foo bar',
+         -- attribute filter, order, limit, offset
+         null, null, null, null,
+         -- return columns
+         array[$cols]
+ ) as ($col_def);
+ ";
+ __END__
+ =head1 AUTHOR
+ Dobrica Pavlinusic <dpavlin@rot13.org>
+ L<http://www.rot13.org/~dpavlin/>
+ =head1 LICENSE
+ This product is licensed under the GNU General Public License (GPL) v2 or later.
+ =cut

 Legend:



Removed from v.62
 


changed lines


 
Added in v.88
 Legend:



Removed from v.62
 


changed lines


 
Added in v.88
-Removed from v.62
+Added in v.88

	ViewVC Help
Powered by ViewVC 1.1.26