/[wait]/cvs-head/script/smakewhatis
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Annotation of /cvs-head/script/smakewhatis

Parent Directory Parent Directory | Revision Log Revision Log


Revision 20 - (hide annotations)
Tue May 9 11:29:45 2000 UTC (24 years ago) by cvs2svn
File size: 9119 byte(s)
This commit was generated by cvs2svn to compensate for changes in r10,
which included commits to RCS files with non-trunk default branches.

1 ulpfr 10 #!/usr/bin/perl -w
2     ######################### -*- Mode: Cperl -*- #########################
3     ##
4     ## $Basename: smakewhatis $
5 ulpfr 19 ## $Revision: 1.11 $
6 ulpfr 10 ##
7     ## Author : Ulrich Pfeifer
8     ## Created On : Mon Sep 2 12:57:12 1996
9     ##
10     ## Last Modified By : Ulrich Pfeifer
11 ulpfr 19 ## Last Modified On : Tue May 9 08:52:03 2000
12 ulpfr 10 ##
13     ## Copyright (c) 1996-1997, Ulrich Pfeifer
14     ##
15     ##
16     ######################################################################
17    
18     use strict;
19    
20     use FileHandle;
21     use File::Path;
22     use DB_File;
23     use Getopt::Long;
24    
25     require WAIT::Database;
26     require WAIT::Config;
27     require WAIT::Parse::Nroff;
28     require WAIT::Document::Nroff;
29    
30    
31     my %OPT = (database => 'DB',
32     dir => $WAIT::Config->{WAIT_home} || '/tmp',
33     table => 'man',
34     clean => 0,
35     remove => 0,
36     );
37    
38     GetOptions(\%OPT,
39     'database=s',
40     'dir=s',
41     'table=s',
42     'clean!',
43     'remove',
44     ) || die "Usage: ...\n";
45    
46     if ($OPT{clean}) {
47     if (-d "$OPT{dir}/$OPT{database}") {
48     eval {
49     my $tmp = WAIT::Database->open(name => $OPT{database},
50     'directory' => $OPT{dir})
51     or die "Could not open table $OPT{table}: $@";
52     my $tbl = $tmp->table(name => $OPT{table});
53     $tbl->drop if $tbl;
54     $tmp->close;
55     rmtree("$OPT{dir}/$OPT{database}/$OPT{table}",1,1)
56     if -d "$OPT{dir}/$OPT{database}/$OPT{table}";
57     };
58     die $@ if $@;
59     } else {
60     die "Database $OPT{dir}/$OPT{database} doesn't exist,
61     nothing to clean, nothing done.\n";
62     }
63     exit;
64     }
65    
66     my $db = WAIT::Database->open(
67     name => $OPT{database},
68     'directory' => $OPT{dir},
69     )
70     ||
71     WAIT::Database->create(
72     name => $OPT{database},
73     'directory' => $OPT{dir},
74     );
75     unless ($db) {
76     require Carp;
77     Carp::croak("Could not open/create database '$OPT{dir}/$OPT{database}': $@");
78     }
79    
80    
81     # We need a class that allows the index to access each document.
82     # Remember, all documents in this collection are values of a single
83     # tied hash. An especially cool feature is that the tie may return the
84     # whole document as a single string or as an object or anything that
85     # fits into a scalar. WAIT::Document::Nroff illustrates how the tieing
86     # class can work. See WAIT::Table for a manpage (W:D:Nroff has none).
87    
88     my %D;
89     my $access = tie %D, 'WAIT::Document::Nroff', 'nroff -man';
90     die $@ unless defined $access;
91    
92     # While WAIT::Document::Nroff ignored the contents of the scalar it
93     # accessed, WAIT::Parse::Nroff knows how to understand it. So bear in
94     # mind:
95    
96     # access => Document
97     # layout => Parse
98    
99     # The access to a document is provided by a Document class just as
100     # the layout of a document is provided by a Parser class. Makes sense?
101    
102     my $layout= WAIT::Parse::Nroff->new;
103    
104     # The definition of filters is something that will be tought in the
105     # advanced techniques course. For now, just copy and paste the
106     # something from here and try out alternatives.
107     my $stem = [{
108     'prefix' => ['unroff', 'isotr', 'isolc'],
109     'intervall' => ['unroff', 'isotr', 'isolc'],
110     },'unroff', 'isolc', 'stop', 'isotr', 'split2', 'Stem'];
111     # unroff it as the first because nroff markup isn't very helpful for
112     # indexing, turn into lowercase, eliminate the stopwords before isotr
113     # because our stopwords contain ticks (isn't, i'm, wouldn't, etc.),
114     # replace line noise ith space, eliminate anything left with less than
115     # 2 letters, find the word's stem.
116     my $text = [{
117     'prefix' => ['unroff', 'isotr', 'isolc'],
118     'intervall' => ['unroff', 'isotr', 'isolc'],
119     },
120     'unroff', 'isolc', 'stop', 'isotr', 'split2'];
121     my $sound = ['unroff', 'isotr', 'isolc', 'split2', 'Soundex'],;
122    
123     my $tb;
124     eval { $tb = $db->table(name => $OPT{table}) };
125     $tb ||=
126     $db->create_table
127     (
128    
129     name => $OPT{table},
130     # mandatory argument like a tablename in a relational database
131    
132     access => $access,
133     # see above
134    
135     layout => $layout,
136     # see above
137    
138     attr => ['docid', 'headline', 'size'],
139     # the attr argument determines which attributes WAIT will store for
140     # us for later retrieval. A docid is a must, of course, so that we
141     # can retrieve the document later. The more attributes you name
142     # here, the bigger gets the database. For your first experiences it
143     # is highly recommended to have the two items C<docid> and
144     # C<headline> here, so that you can use sman for debugging as soon
145     # as you are through smakewhatis. In the sman program these two
146     # column names are hardcoded. You have the opportunity to create
147     # the two attributes for every record in the Layout/Parser class
148    
149     keyset => [['docid']],
150     # which keys are necessary to unambiguously identify a record and
151     # access it through $access?
152    
153     invindex =>
154     [
155     'name' => $stem,
156     'synopsis' => $stem,
157     'bugs' => $stem,
158     'description' => $stem,
159     'text' => $stem,
160     'environment' => $text,
161     'example' => $text, 'example' => $stem,
162     'author' => $sound, 'author' => $stem,
163     ]
164     # without this argument, WAIT will be able to run a pass through
165     # the indexer but it won't do anything useful. This argument is the
166     # heart of your indexing task and the place where you will start
167     # tuning once your indexes are working. For the impatent user, it's
168     # recommended to just have them all be text.
169    
170     );
171    
172     die unless $tb;
173    
174     my @DIRS;
175     if (@ARGV) {
176     @DIRS = @ARGV;
177     } else {
178     @DIRS = @{$WAIT::Config->{manpath}};
179     }
180    
181 ulpfr 19 $tb->set(top=>1);
182 ulpfr 10 my $mandir;
183     for $mandir (grep -d $_, @DIRS) {
184     opendir(DIR, $mandir) or warn "Could not open dir '$mandir': $!";
185     my @mdir = grep -d "$mandir/$_", grep /^man/, readdir(DIR);
186     closedir DIR;
187     my $section;
188     for $section (@mdir) {
189     my $file;
190     print STDERR "Scanning '$mandir/$section' ...\n";
191     opendir(DIR, "$mandir/$section")
192     or warn "Could not open dir '$mandir/section': $!";
193     my @files = grep -f "$mandir/$section/$_", grep $_ !~ /^\./, readdir(DIR);
194     closedir DIR;
195     for $file ( @files ) {
196     print STDERR "Indexing '$mandir/$section/$file' ... ";
197     &index("$mandir/$section/$file");
198     }
199     }
200     }
201 ulpfr 19 my $now = time;
202     warn "Starting reorg\n";
203     $tb->set(top=>1);
204     warn sprintf "Finished reorg %d seconds\n", time - $now;
205 ulpfr 10
206     # Do not forget to close the database after the extreme job you just finished.
207    
208     $db->close();
209     exit;
210    
211     # Now that you have created a database, lean back. To verify that it
212     # sort of worked and to understand what you actually did, I'd
213     # recommend to run sman through the debugger. Sman has options to
214     # choose databases and tables unrelated to its original task. You can
215     # run e.g.
216    
217     # perl -Sd sman -dir /usr/local/yourwaitdir -database yourdatabase -table yourtable
218    
219     # Step through the debugger to the place where a query object is
220     # created. Expect huge, self-referential datastrucures if you dump any
221     # of these object with the x command. It's quite instructive to watch
222     # the debugger print them for several minutes or hours.
223    
224     # Once you have established a working querying with sman, you will
225     # want to write your own sman.
226    
227     my $NO;
228     sub index {
229     my $did = shift;
230    
231     if ($tb->have('docid' => $did)) {
232     #die "$@" if $2 ne '';
233     if (!$OPT{remove}) {
234     print "duplicate\n";
235     return;
236     }
237     } elsif ($OPT{remove}) {
238     print "missing\n";
239     return;
240     }
241    
242     if (-s $did < 100) {
243     print "too small\n";
244     return;
245     }
246    
247     my $value = $D{$did};
248     unless (defined $value) {
249     print "unavailable\n";
250     }
251     printf STDERR "ok [%d]\n", ++$NO;
252    
253     my $record = $layout->split($value);
254     $record->{size} = length($value);
255     my $headline = $record->{name} || $did;
256     $headline =~ s/\s+/ /g; $headline =~ s/^\s+//;
257     printf "%s\n", substr($headline,0,80);
258     if ($OPT{remove}) {
259     $tb->delete('docid' => $did, headline => $headline, %{$record});
260     } else {
261     $tb->insert('docid' => $did, headline => $headline, %{$record});
262     }
263     }
264    
265    
266     __END__
267     ## ###################################################################
268     ## pod
269     ## ###################################################################
270    
271     =head1 NAME
272    
273     smakewhatis - generate a manual database for sman
274    
275     =head1 SYNOPSIS
276    
277     B<smakewhatis>
278     [B<-database> I<database name>]
279     [B<-dir> I<database directory>]
280     [B<-table> I<name>]
281     [B<-remove>]
282     [I<mandir> ...]
283    
284     =head1 DESCRIPTION
285    
286     B<Smakewhatis> generates/updates databases for B<sman>(1). If
287     I<mandir>s are specified, these are used. Otherwise the confiigured
288     default directories are indexed.
289    
290     =head2 OPTIONS
291    
292     =over 10
293    
294     =item B<-database> I<database name>
295    
296     Change the default database name to I<database name>.
297    
298     =item B<-dir> I<database directory>
299    
300     Change the default database directory to I<database directory>.
301    
302     =item B<-table> I<name>
303    
304     Use I<name> instead of C<man> as table name.
305    
306     =item B<-clean>
307    
308     Clean B<database> before indexing.
309    
310     =item B<-remove>
311    
312     Remove the selected directories from the database instead of
313     adding/updating. This works only for the manuals which are unchanged
314     since the indexing.
315    
316     =head1 SEE ALSO
317    
318     L<sman>.
319    
320     =head1 AUTHOR
321    
322     Ulrich Pfeifer E<lt>F<pfeifer@ls6.informatik.uni-dortmund.de>E<gt>

Properties

Name Value
cvs2svn:cvs-rev 1.1.1.2
svn:executable *

  ViewVC Help
Powered by ViewVC 1.1.26