Annotation of /trunk/script/smakewhatis

#!/usr/bin/perl -w
######################### -*- Mode: Cperl -*- #########################
##
## $Basename: smakewhatis $
## $Revision: 1.11 $
##
## Author           : Ulrich Pfeifer
## Created On       : Mon Sep  2 12:57:12 1996
##
## Last Modified By : Ulrich Pfeifer
## Last Modified On : Tue May  9 08:52:03 2000
##
## Copyright (c) 1996-1997, Ulrich Pfeifer
##
##
######################################################################

use strict;

use FileHandle;
use File::Path;
use DB_File;
use Getopt::Long;

use lib '/data/wait/lib';

require WAIT::Database;
require WAIT::Config;
require WAIT::Parse::Nroff;
require WAIT::Document::Nroff;


my %OPT = (database => 'DB',
           dir      => $WAIT::Config->{WAIT_home} || '/tmp',
           table    => 'man',
           clean    => 0,
           remove   => 0,
          );

GetOptions(\%OPT,
           'database=s',
           'dir=s',
           'table=s',
           'clean!',
           'remove',
          ) || die "Usage: ...\n";

if ($OPT{clean}) {
  if (-d "$OPT{dir}/$OPT{database}") {
    eval {
      my $tmp = WAIT::Database->open(name        => $OPT{database},
                                     'directory' => $OPT{dir})
          or die "Could not open table $OPT{table}: $@";
      my $tbl = $tmp->table(name => $OPT{table});
      $tbl->drop if $tbl;
      $tmp->close;
      rmtree("$OPT{dir}/$OPT{database}/$OPT{table}",1,1)
          if -d "$OPT{dir}/$OPT{database}/$OPT{table}";
    };
    die $@ if $@;
  } else {
    die "Database $OPT{dir}/$OPT{database} doesn't exist,
nothing to clean, nothing done.\n";
  }
  exit;
}

my $db = WAIT::Database->open(
                              name        => $OPT{database},
                              'directory' => $OPT{dir},
                             )
    ||
    WAIT::Database->create(
                           name        => $OPT{database},
                           'directory' => $OPT{dir},
                          );
unless ($db) {
  require Carp;
  Carp::croak("Could not open/create database '$OPT{dir}/$OPT{database}': $@");
}


# We need a class that allows the index to access each document.
# Remember, all documents in this collection are values of a single
# tied hash. An especially cool feature is that the tie may return the
# whole document as a single string or as an object or anything that
# fits into a scalar. WAIT::Document::Nroff illustrates how the tieing
# class can work. See WAIT::Table for a manpage (W:D:Nroff has none).

my %D;
my $access = tie %D, 'WAIT::Document::Nroff', 'nroff -man';
die $@ unless defined $access;

# While WAIT::Document::Nroff ignored the contents of the scalar it
# accessed, WAIT::Parse::Nroff knows how to understand it. So bear in
# mind:

#    access    =>   Document
#    layout    =>   Parse

# The access to a document is provided by a Document class just as
# the layout of a document is provided by a Parser class. Makes sense?

my $layout= WAIT::Parse::Nroff->new;

# The definition of filters is something that will be tought in the
# advanced techniques course. For now, just copy and paste the
# something from here and try out alternatives.
my $stem = [{
             'prefix'    => ['unroff', 'isotr', 'isolc'],
             'intervall' => ['unroff', 'isotr', 'isolc'],
            },'unroff', 'isolc', 'stop', 'isotr', 'split2', 'Stem'];
# unroff it as the first because nroff markup isn't very helpful for
# indexing, turn into lowercase, eliminate the stopwords before isotr
# because our stopwords contain ticks (isn't, i'm, wouldn't, etc.),
# replace line noise ith space, eliminate anything left with less than
# 2 letters, find the word's stem.
my $text = [{
             'prefix'    => ['unroff', 'isotr', 'isolc'],
             'intervall' => ['unroff', 'isotr', 'isolc'],
            },
             'unroff', 'isolc', 'stop', 'isotr', 'split2'];
my $sound = ['unroff', 'isotr', 'isolc', 'split2', 'Soundex'],;

my $tb;
eval { $tb = $db->table(name => $OPT{table}) };
$tb ||=
  $db->create_table
  (

   name     => $OPT{table},
   # mandatory argument like a tablename in a relational database

   access   => $access,
   # see above

   layout   => $layout,
   # see above

   attr     => ['docid', 'headline', 'size'],
   # the attr argument determines which attributes WAIT will store for
   # us for later retrieval. A docid is a must, of course, so that we
   # can retrieve the document later. The more attributes you name
   # here, the bigger gets the database. For your first experiences it
   # is highly recommended to have the two items C<docid> and
   # C<headline> here, so that you can use sman for debugging as soon
   # as you are through smakewhatis. In the sman program these two
   # column names are hardcoded. You have the opportunity to create
   # the two attributes for every record in the Layout/Parser class

   keyset   => [['docid']],
   # which keys are necessary to unambiguously identify a record and
   # access it through $access?

   invindex =>
   [
    'name'         => $stem,
    'synopsis'     => $stem,
    'bugs'         => $stem,
    'description'  => $stem,
    'text'         => $stem,
    'environment'  => $text,
    'example'      => $text,  'example' => $stem,
    'author'       => $sound, 'author'  => $stem,
   ]
   # without this argument, WAIT will be able to run a pass through
   # the indexer but it won't do anything useful. This argument is the
   # heart of your indexing task and the place where you will start
   # tuning once your indexes are working. For the impatent user, it's
   # recommended to just have them all be text.

  );

die unless $tb;

my @DIRS;
if (@ARGV) {
  @DIRS = @ARGV;
} else {
  @DIRS  = @{$WAIT::Config->{manpath}};
}

$tb->set(top=>1);
my $mandir;
for $mandir (grep -d $_, @DIRS) {
  opendir(DIR, $mandir) or warn "Could not open dir '$mandir': $!";
  my @mdir =  grep -d "$mandir/$_", grep /^man/, readdir(DIR);
  closedir DIR;
  my $section;
  for $section (@mdir) {
    my $file;
    print STDERR "Scanning '$mandir/$section' ...\n";
    opendir(DIR, "$mandir/$section")
      or warn "Could not open dir '$mandir/section': $!";
    my @files =  grep -f "$mandir/$section/$_", grep $_ !~ /^\./, readdir(DIR);
    closedir DIR;
    for $file ( @files ) {
      print STDERR "Indexing '$mandir/$section/$file' ... ";
      &index("$mandir/$section/$file");
    }
  }
}
my $now = time;
warn "Starting reorg\n";
$tb->set(top=>1);
warn sprintf "Finished reorg %d seconds\n", time - $now;

# Do not forget to close the database after the extreme job you just finished.

$db->close();
exit;

# Now that you have created a database, lean back. To verify that it
# sort of worked and to understand what you actually did, I'd
# recommend to run sman through the debugger. Sman has options to
# choose databases and tables unrelated to its original task. You can
# run e.g.

# perl -Sd sman -dir /usr/local/yourwaitdir -database yourdatabase -table yourtable

# Step through the debugger to the place where a query object is
# created. Expect huge, self-referential datastrucures if you dump any
# of these object with the x command. It's quite instructive to watch
# the debugger print them for several minutes or hours.

# Once you have established a working querying with sman, you will
# want to write your own sman.

my $NO;
sub index {
  my $did = shift;

  if ($tb->have('docid' => $did)) {
    #die "$@" if $2 ne '';
    if (!$OPT{remove}) {
      print "duplicate\n";
      return;
    }
  } elsif ($OPT{remove}) {
    print "missing\n";
    return;
  }

  if (-s $did < 100) {
    print "too small\n";
    return;
  }

  my $value = $D{$did};
  unless (defined $value) {
    print "unavailable\n";
  }
  printf STDERR "ok [%d]\n", ++$NO;

  my $record = $layout->split($value);
  $record->{size} = length($value);
  my $headline = $record->{name} || $did;
  $headline =~ s/\s+/ /g; $headline =~ s/^\s+//;
  printf "%s\n", substr($headline,0,80);
  if ($OPT{remove}) {
    $tb->delete('docid' => $did, headline => $headline, %{$record});
  } else {
    $tb->insert('docid' => $did, headline => $headline, %{$record});
  }
}


__END__
## ###################################################################
## pod
## ###################################################################

=head1 NAME

smakewhatis - generate a manual database for sman

=head1 SYNOPSIS

B<smakewhatis>
[B<-database> I<database name>]
[B<-dir> I<database directory>]
[B<-table> I<name>]
[B<-remove>]
[I<mandir> ...]

=head1 DESCRIPTION

B<Smakewhatis> generates/updates databases for B<sman>(1). If
I<mandir>s are specified, these are used. Otherwise the confiigured
default directories are indexed.

=head2 OPTIONS

=over 10

=item B<-database> I<database name>

Change the default database name to I<database name>.

=item B<-dir> I<database directory>

Change the default database directory to I<database directory>.

=item B<-table> I<name>

Use I<name> instead of C<man> as table name.

=item B<-clean>

Clean B<database> before indexing.

=item B<-remove>

Remove the selected directories from the database instead of
adding/updating. This works only for the manuals which are unchanged
since the indexing.

=head1 SEE ALSO

L<sman>.

=head1 AUTHOR

Ulrich Pfeifer E<lt>F<pfeifer@ls6.informatik.uni-dortmund.de>E<gt>
1	ulpfr	10	#!/usr/bin/perl -w
2			######################### -- Mode: Cperl -- #########################
3			##
4			## $Basename: smakewhatis $
5	ulpfr	19	## $Revision: 1.11 $
6	ulpfr	10	##
7			## Author : Ulrich Pfeifer
8			## Created On : Mon Sep 2 12:57:12 1996
9			##
10			## Last Modified By : Ulrich Pfeifer
11	ulpfr	19	## Last Modified On : Tue May 9 08:52:03 2000
12	ulpfr	10	##
13			## Copyright (c) 1996-1997, Ulrich Pfeifer
14			##
15			##
16			######################################################################
17
18			use strict;
19
20			use FileHandle;
21			use File::Path;
22			use DB_File;
23			use Getopt::Long;
24
25	dpavlin	95	use lib '/data/wait/lib';
26
27	ulpfr	10	require WAIT::Database;
28			require WAIT::Config;
29			require WAIT::Parse::Nroff;
30			require WAIT::Document::Nroff;
31
32
33			my %OPT = (database => 'DB',
34			dir => $WAIT::Config->{WAIT_home} \|\| '/tmp',
35			table => 'man',
36			clean => 0,
37			remove => 0,
38			);
39
40			GetOptions(\%OPT,
41			'database=s',
42			'dir=s',
43			'table=s',
44			'clean!',
45			'remove',
46			) \|\| die "Usage: ...\n";
47
48			if ($OPT{clean}) {
49			if (-d "$OPT{dir}/$OPT{database}") {
50			eval {
51			my $tmp = WAIT::Database->open(name => $OPT{database},
52			'directory' => $OPT{dir})
53			or die "Could not open table $OPT{table}: $@";
54			my $tbl = $tmp->table(name => $OPT{table});
55			$tbl->drop if $tbl;
56			$tmp->close;
57			rmtree("$OPT{dir}/$OPT{database}/$OPT{table}",1,1)
58			if -d "$OPT{dir}/$OPT{database}/$OPT{table}";
59			};
60			die $@ if $@;
61			} else {
62			die "Database $OPT{dir}/$OPT{database} doesn't exist,
63			nothing to clean, nothing done.\n";
64			}
65			exit;
66			}
67
68			my $db = WAIT::Database->open(
69			name => $OPT{database},
70			'directory' => $OPT{dir},
71			)
72			\|\|
73			WAIT::Database->create(
74			name => $OPT{database},
75			'directory' => $OPT{dir},
76			);
77			unless ($db) {
78			require Carp;
79			Carp::croak("Could not open/create database '$OPT{dir}/$OPT{database}': $@");
80			}
81
82
83			# We need a class that allows the index to access each document.
84			# Remember, all documents in this collection are values of a single
85			# tied hash. An especially cool feature is that the tie may return the
86			# whole document as a single string or as an object or anything that
87			# fits into a scalar. WAIT::Document::Nroff illustrates how the tieing
88			# class can work. See WAIT::Table for a manpage (W:D:Nroff has none).
89
90			my %D;
91			my $access = tie %D, 'WAIT::Document::Nroff', 'nroff -man';
92			die $@ unless defined $access;
93
94			# While WAIT::Document::Nroff ignored the contents of the scalar it
95			# accessed, WAIT::Parse::Nroff knows how to understand it. So bear in
96			# mind:
97
98			# access => Document
99			# layout => Parse
100
101			# The access to a document is provided by a Document class just as
102			# the layout of a document is provided by a Parser class. Makes sense?
103
104			my $layout= WAIT::Parse::Nroff->new;
105
106			# The definition of filters is something that will be tought in the
107			# advanced techniques course. For now, just copy and paste the
108			# something from here and try out alternatives.
109			my $stem = [{
110			'prefix' => ['unroff', 'isotr', 'isolc'],
111			'intervall' => ['unroff', 'isotr', 'isolc'],
112			},'unroff', 'isolc', 'stop', 'isotr', 'split2', 'Stem'];
113			# unroff it as the first because nroff markup isn't very helpful for
114			# indexing, turn into lowercase, eliminate the stopwords before isotr
115			# because our stopwords contain ticks (isn't, i'm, wouldn't, etc.),
116			# replace line noise ith space, eliminate anything left with less than
117			# 2 letters, find the word's stem.
118			my $text = [{
119			'prefix' => ['unroff', 'isotr', 'isolc'],
120			'intervall' => ['unroff', 'isotr', 'isolc'],
121			},
122			'unroff', 'isolc', 'stop', 'isotr', 'split2'];
123			my $sound = ['unroff', 'isotr', 'isolc', 'split2', 'Soundex'],;
124
125			my $tb;
126			eval { $tb = $db->table(name => $OPT{table}) };
127			$tb \|\|=
128			$db->create_table
129			(
130
131			name => $OPT{table},
132			# mandatory argument like a tablename in a relational database
133
134			access => $access,
135			# see above
136
137			layout => $layout,
138			# see above
139
140			attr => ['docid', 'headline', 'size'],
141			# the attr argument determines which attributes WAIT will store for
142			# us for later retrieval. A docid is a must, of course, so that we
143			# can retrieve the document later. The more attributes you name
144			# here, the bigger gets the database. For your first experiences it
145			# is highly recommended to have the two items C<docid> and
146			# C<headline> here, so that you can use sman for debugging as soon
147			# as you are through smakewhatis. In the sman program these two
148			# column names are hardcoded. You have the opportunity to create
149			# the two attributes for every record in the Layout/Parser class
150
151			keyset => [['docid']],
152			# which keys are necessary to unambiguously identify a record and
153			# access it through $access?
154
155			invindex =>
156			[
157			'name' => $stem,
158			'synopsis' => $stem,
159			'bugs' => $stem,
160			'description' => $stem,
161			'text' => $stem,
162			'environment' => $text,
163			'example' => $text, 'example' => $stem,
164			'author' => $sound, 'author' => $stem,
165			]
166			# without this argument, WAIT will be able to run a pass through
167			# the indexer but it won't do anything useful. This argument is the
168			# heart of your indexing task and the place where you will start
169			# tuning once your indexes are working. For the impatent user, it's
170			# recommended to just have them all be text.
171
172			);
173
174			die unless $tb;
175
176			my @DIRS;
177			if (@ARGV) {
178			@DIRS = @ARGV;
179			} else {
180			@DIRS = @{$WAIT::Config->{manpath}};
181			}
182
183	ulpfr	19	$tb->set(top=>1);
184	ulpfr	10	my $mandir;
185			for $mandir (grep -d $_, @DIRS) {
186			opendir(DIR, $mandir) or warn "Could not open dir '$mandir': $!";
187			my @mdir = grep -d "$mandir/$_", grep /^man/, readdir(DIR);
188			closedir DIR;
189			my $section;
190			for $section (@mdir) {
191			my $file;
192			print STDERR "Scanning '$mandir/$section' ...\n";
193			opendir(DIR, "$mandir/$section")
194			or warn "Could not open dir '$mandir/section': $!";
195			my @files = grep -f "$mandir/$section/$_", grep $_ !~ /^\./, readdir(DIR);
196			closedir DIR;
197			for $file ( @files ) {
198			print STDERR "Indexing '$mandir/$section/$file' ... ";
199			&index("$mandir/$section/$file");
200			}
201			}
202			}
203	ulpfr	19	my $now = time;
204			warn "Starting reorg\n";
205			$tb->set(top=>1);
206			warn sprintf "Finished reorg %d seconds\n", time - $now;
207	ulpfr	10
208			# Do not forget to close the database after the extreme job you just finished.
209
210			$db->close();
211			exit;
212
213			# Now that you have created a database, lean back. To verify that it
214			# sort of worked and to understand what you actually did, I'd
215			# recommend to run sman through the debugger. Sman has options to
216			# choose databases and tables unrelated to its original task. You can
217			# run e.g.
218
219			# perl -Sd sman -dir /usr/local/yourwaitdir -database yourdatabase -table yourtable
220
221			# Step through the debugger to the place where a query object is
222			# created. Expect huge, self-referential datastrucures if you dump any
223			# of these object with the x command. It's quite instructive to watch
224			# the debugger print them for several minutes or hours.
225
226			# Once you have established a working querying with sman, you will
227			# want to write your own sman.
228
229			my $NO;
230			sub index {
231			my $did = shift;
232
233			if ($tb->have('docid' => $did)) {
234			#die "$@" if $2 ne '';
235			if (!$OPT{remove}) {
236			print "duplicate\n";
237			return;
238			}
239			} elsif ($OPT{remove}) {
240			print "missing\n";
241			return;
242			}
243
244			if (-s $did < 100) {
245			print "too small\n";
246			return;
247			}
248
249			my $value = $D{$did};
250			unless (defined $value) {
251			print "unavailable\n";
252			}
253			printf STDERR "ok [%d]\n", ++$NO;
254
255			my $record = $layout->split($value);
256			$record->{size} = length($value);
257			my $headline = $record->{name} \|\| $did;
258			$headline =~ s/\s+/ /g; $headline =~ s/^\s+//;
259			printf "%s\n", substr($headline,0,80);
260			if ($OPT{remove}) {
261			$tb->delete('docid' => $did, headline => $headline, %{$record});
262			} else {
263			$tb->insert('docid' => $did, headline => $headline, %{$record});
264			}
265			}
266
267
268			__END__
269			## ###################################################################
270			## pod
271			## ###################################################################
272
273			=head1 NAME
274
275			smakewhatis - generate a manual database for sman
276
277			=head1 SYNOPSIS
278
279			B<smakewhatis>
280			[B<-database> I<database name>]
281			[B<-dir> I<database directory>]
282			[B<-table> I<name>]
283			[B<-remove>]
284			[I<mandir> ...]
285
286			=head1 DESCRIPTION
287
288			B<Smakewhatis> generates/updates databases for B<sman>(1). If
289			I<mandir>s are specified, these are used. Otherwise the confiigured
290			default directories are indexed.
291
292			=head2 OPTIONS
293
294			=over 10
295
296			=item B<-database> I<database name>
297
298			Change the default database name to I<database name>.
299
300			=item B<-dir> I<database directory>
301
302			Change the default database directory to I<database directory>.
303
304			=item B<-table> I<name>
305
306			Use I<name> instead of C<man> as table name.
307
308			=item B<-clean>
309
310			Clean B<database> before indexing.
311
312			=item B<-remove>
313
314			Remove the selected directories from the database instead of
315			adding/updating. This works only for the manuals which are unchanged
316			since the indexing.
317
318			=head1 SEE ALSO
319
320			L<sman>.
321
322			=head1 AUTHOR
323
324			Ulrich Pfeifer E<lt>F<pfeifer@ls6.informatik.uni-dortmund.de>E<gt>