/[wait]/trunk/script/smakewhatis
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Annotation of /trunk/script/smakewhatis

Parent Directory Parent Directory | Revision Log Revision Log


Revision 95 - (hide annotations)
Wed May 26 19:01:15 2004 UTC (19 years, 11 months ago) by dpavlin
File size: 9146 byte(s)
add local development path to libs

1 ulpfr 10 #!/usr/bin/perl -w
2     ######################### -*- Mode: Cperl -*- #########################
3     ##
4     ## $Basename: smakewhatis $
5 ulpfr 19 ## $Revision: 1.11 $
6 ulpfr 10 ##
7     ## Author : Ulrich Pfeifer
8     ## Created On : Mon Sep 2 12:57:12 1996
9     ##
10     ## Last Modified By : Ulrich Pfeifer
11 ulpfr 19 ## Last Modified On : Tue May 9 08:52:03 2000
12 ulpfr 10 ##
13     ## Copyright (c) 1996-1997, Ulrich Pfeifer
14     ##
15     ##
16     ######################################################################
17    
18     use strict;
19    
20     use FileHandle;
21     use File::Path;
22     use DB_File;
23     use Getopt::Long;
24    
25 dpavlin 95 use lib '/data/wait/lib';
26    
27 ulpfr 10 require WAIT::Database;
28     require WAIT::Config;
29     require WAIT::Parse::Nroff;
30     require WAIT::Document::Nroff;
31    
32    
33     my %OPT = (database => 'DB',
34     dir => $WAIT::Config->{WAIT_home} || '/tmp',
35     table => 'man',
36     clean => 0,
37     remove => 0,
38     );
39    
40     GetOptions(\%OPT,
41     'database=s',
42     'dir=s',
43     'table=s',
44     'clean!',
45     'remove',
46     ) || die "Usage: ...\n";
47    
48     if ($OPT{clean}) {
49     if (-d "$OPT{dir}/$OPT{database}") {
50     eval {
51     my $tmp = WAIT::Database->open(name => $OPT{database},
52     'directory' => $OPT{dir})
53     or die "Could not open table $OPT{table}: $@";
54     my $tbl = $tmp->table(name => $OPT{table});
55     $tbl->drop if $tbl;
56     $tmp->close;
57     rmtree("$OPT{dir}/$OPT{database}/$OPT{table}",1,1)
58     if -d "$OPT{dir}/$OPT{database}/$OPT{table}";
59     };
60     die $@ if $@;
61     } else {
62     die "Database $OPT{dir}/$OPT{database} doesn't exist,
63     nothing to clean, nothing done.\n";
64     }
65     exit;
66     }
67    
68     my $db = WAIT::Database->open(
69     name => $OPT{database},
70     'directory' => $OPT{dir},
71     )
72     ||
73     WAIT::Database->create(
74     name => $OPT{database},
75     'directory' => $OPT{dir},
76     );
77     unless ($db) {
78     require Carp;
79     Carp::croak("Could not open/create database '$OPT{dir}/$OPT{database}': $@");
80     }
81    
82    
83     # We need a class that allows the index to access each document.
84     # Remember, all documents in this collection are values of a single
85     # tied hash. An especially cool feature is that the tie may return the
86     # whole document as a single string or as an object or anything that
87     # fits into a scalar. WAIT::Document::Nroff illustrates how the tieing
88     # class can work. See WAIT::Table for a manpage (W:D:Nroff has none).
89    
90     my %D;
91     my $access = tie %D, 'WAIT::Document::Nroff', 'nroff -man';
92     die $@ unless defined $access;
93    
94     # While WAIT::Document::Nroff ignored the contents of the scalar it
95     # accessed, WAIT::Parse::Nroff knows how to understand it. So bear in
96     # mind:
97    
98     # access => Document
99     # layout => Parse
100    
101     # The access to a document is provided by a Document class just as
102     # the layout of a document is provided by a Parser class. Makes sense?
103    
104     my $layout= WAIT::Parse::Nroff->new;
105    
106     # The definition of filters is something that will be tought in the
107     # advanced techniques course. For now, just copy and paste the
108     # something from here and try out alternatives.
109     my $stem = [{
110     'prefix' => ['unroff', 'isotr', 'isolc'],
111     'intervall' => ['unroff', 'isotr', 'isolc'],
112     },'unroff', 'isolc', 'stop', 'isotr', 'split2', 'Stem'];
113     # unroff it as the first because nroff markup isn't very helpful for
114     # indexing, turn into lowercase, eliminate the stopwords before isotr
115     # because our stopwords contain ticks (isn't, i'm, wouldn't, etc.),
116     # replace line noise ith space, eliminate anything left with less than
117     # 2 letters, find the word's stem.
118     my $text = [{
119     'prefix' => ['unroff', 'isotr', 'isolc'],
120     'intervall' => ['unroff', 'isotr', 'isolc'],
121     },
122     'unroff', 'isolc', 'stop', 'isotr', 'split2'];
123     my $sound = ['unroff', 'isotr', 'isolc', 'split2', 'Soundex'],;
124    
125     my $tb;
126     eval { $tb = $db->table(name => $OPT{table}) };
127     $tb ||=
128     $db->create_table
129     (
130    
131     name => $OPT{table},
132     # mandatory argument like a tablename in a relational database
133    
134     access => $access,
135     # see above
136    
137     layout => $layout,
138     # see above
139    
140     attr => ['docid', 'headline', 'size'],
141     # the attr argument determines which attributes WAIT will store for
142     # us for later retrieval. A docid is a must, of course, so that we
143     # can retrieve the document later. The more attributes you name
144     # here, the bigger gets the database. For your first experiences it
145     # is highly recommended to have the two items C<docid> and
146     # C<headline> here, so that you can use sman for debugging as soon
147     # as you are through smakewhatis. In the sman program these two
148     # column names are hardcoded. You have the opportunity to create
149     # the two attributes for every record in the Layout/Parser class
150    
151     keyset => [['docid']],
152     # which keys are necessary to unambiguously identify a record and
153     # access it through $access?
154    
155     invindex =>
156     [
157     'name' => $stem,
158     'synopsis' => $stem,
159     'bugs' => $stem,
160     'description' => $stem,
161     'text' => $stem,
162     'environment' => $text,
163     'example' => $text, 'example' => $stem,
164     'author' => $sound, 'author' => $stem,
165     ]
166     # without this argument, WAIT will be able to run a pass through
167     # the indexer but it won't do anything useful. This argument is the
168     # heart of your indexing task and the place where you will start
169     # tuning once your indexes are working. For the impatent user, it's
170     # recommended to just have them all be text.
171    
172     );
173    
174     die unless $tb;
175    
176     my @DIRS;
177     if (@ARGV) {
178     @DIRS = @ARGV;
179     } else {
180     @DIRS = @{$WAIT::Config->{manpath}};
181     }
182    
183 ulpfr 19 $tb->set(top=>1);
184 ulpfr 10 my $mandir;
185     for $mandir (grep -d $_, @DIRS) {
186     opendir(DIR, $mandir) or warn "Could not open dir '$mandir': $!";
187     my @mdir = grep -d "$mandir/$_", grep /^man/, readdir(DIR);
188     closedir DIR;
189     my $section;
190     for $section (@mdir) {
191     my $file;
192     print STDERR "Scanning '$mandir/$section' ...\n";
193     opendir(DIR, "$mandir/$section")
194     or warn "Could not open dir '$mandir/section': $!";
195     my @files = grep -f "$mandir/$section/$_", grep $_ !~ /^\./, readdir(DIR);
196     closedir DIR;
197     for $file ( @files ) {
198     print STDERR "Indexing '$mandir/$section/$file' ... ";
199     &index("$mandir/$section/$file");
200     }
201     }
202     }
203 ulpfr 19 my $now = time;
204     warn "Starting reorg\n";
205     $tb->set(top=>1);
206     warn sprintf "Finished reorg %d seconds\n", time - $now;
207 ulpfr 10
208     # Do not forget to close the database after the extreme job you just finished.
209    
210     $db->close();
211     exit;
212    
213     # Now that you have created a database, lean back. To verify that it
214     # sort of worked and to understand what you actually did, I'd
215     # recommend to run sman through the debugger. Sman has options to
216     # choose databases and tables unrelated to its original task. You can
217     # run e.g.
218    
219     # perl -Sd sman -dir /usr/local/yourwaitdir -database yourdatabase -table yourtable
220    
221     # Step through the debugger to the place where a query object is
222     # created. Expect huge, self-referential datastrucures if you dump any
223     # of these object with the x command. It's quite instructive to watch
224     # the debugger print them for several minutes or hours.
225    
226     # Once you have established a working querying with sman, you will
227     # want to write your own sman.
228    
229     my $NO;
230     sub index {
231     my $did = shift;
232    
233     if ($tb->have('docid' => $did)) {
234     #die "$@" if $2 ne '';
235     if (!$OPT{remove}) {
236     print "duplicate\n";
237     return;
238     }
239     } elsif ($OPT{remove}) {
240     print "missing\n";
241     return;
242     }
243    
244     if (-s $did < 100) {
245     print "too small\n";
246     return;
247     }
248    
249     my $value = $D{$did};
250     unless (defined $value) {
251     print "unavailable\n";
252     }
253     printf STDERR "ok [%d]\n", ++$NO;
254    
255     my $record = $layout->split($value);
256     $record->{size} = length($value);
257     my $headline = $record->{name} || $did;
258     $headline =~ s/\s+/ /g; $headline =~ s/^\s+//;
259     printf "%s\n", substr($headline,0,80);
260     if ($OPT{remove}) {
261     $tb->delete('docid' => $did, headline => $headline, %{$record});
262     } else {
263     $tb->insert('docid' => $did, headline => $headline, %{$record});
264     }
265     }
266    
267    
268     __END__
269     ## ###################################################################
270     ## pod
271     ## ###################################################################
272    
273     =head1 NAME
274    
275     smakewhatis - generate a manual database for sman
276    
277     =head1 SYNOPSIS
278    
279     B<smakewhatis>
280     [B<-database> I<database name>]
281     [B<-dir> I<database directory>]
282     [B<-table> I<name>]
283     [B<-remove>]
284     [I<mandir> ...]
285    
286     =head1 DESCRIPTION
287    
288     B<Smakewhatis> generates/updates databases for B<sman>(1). If
289     I<mandir>s are specified, these are used. Otherwise the confiigured
290     default directories are indexed.
291    
292     =head2 OPTIONS
293    
294     =over 10
295    
296     =item B<-database> I<database name>
297    
298     Change the default database name to I<database name>.
299    
300     =item B<-dir> I<database directory>
301    
302     Change the default database directory to I<database directory>.
303    
304     =item B<-table> I<name>
305    
306     Use I<name> instead of C<man> as table name.
307    
308     =item B<-clean>
309    
310     Clean B<database> before indexing.
311    
312     =item B<-remove>
313    
314     Remove the selected directories from the database instead of
315     adding/updating. This works only for the manuals which are unchanged
316     since the indexing.
317    
318     =head1 SEE ALSO
319    
320     L<sman>.
321    
322     =head1 AUTHOR
323    
324     Ulrich Pfeifer E<lt>F<pfeifer@ls6.informatik.uni-dortmund.de>E<gt>

Properties

Name Value
cvs2svn:cvs-rev 1.1.1.2
svn:executable *

  ViewVC Help
Powered by ViewVC 1.1.26