1 |
/* |
2 |
** SWISH++ |
3 |
** config.h |
4 |
** |
5 |
** Copyright (C) 1998 Paul J. Lucas |
6 |
** |
7 |
** This program is free software; you can redistribute it and/or modify |
8 |
** it under the terms of the GNU General Public License as published by |
9 |
** the Free Software Foundation; either version 2 of the License, or |
10 |
** (at your option) any later version. |
11 |
** |
12 |
** This program is distributed in the hope that it will be useful, |
13 |
** but WITHOUT ANY WARRANTY; without even the implied warranty of |
14 |
** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
15 |
** GNU General Public License for more details. |
16 |
** |
17 |
** You should have received a copy of the GNU General Public License |
18 |
** along with this program; if not, write to the Free Software |
19 |
** Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. |
20 |
*/ |
21 |
|
22 |
#ifndef config_H |
23 |
#define config_H |
24 |
|
25 |
////////// Word determination ///////////////////////////////////////////////// |
26 |
|
27 |
const int Word_Hard_Min_Size = 1;
const int Word_Hard_Max_Size = 9999;
	// Hard lower and upper bounds on the length of a word: a word
	// whose length lies outside this range is not even subjected to
	// the more aggressive checks that decide whether it should be
	// indexed.
32 |
|
33 |
const int Word_Min_Size = 1;
	// Shortest length a word that is not an acronym may have and
	// still be considered for indexing.
36 |
|
37 |
const int Word_Min_Vowels = 0;
	// Fewest vowels a word must contain in order to be indexed.
40 |
|
41 |
const int Word_Hex_Max_Size = 9999;
	// Longest a string made up solely of hexadecimal digits (i.e.,
	// ASCII hex data) may be before it is discarded.  Bear in mind
	// that "cafe" is a legitimate English word that consists only of
	// hexadecimal digits.  Only extract(1), in extract.c, uses this
	// parameter.
47 |
|
48 |
// Limits on runs of similar characters within a word; the thinking is that
// no word in English has more than...

const int Word_Max_Consec_Consonants = 9999;
	// ...this many consonants in a row (consider "symphysis"),

const int Word_Max_Consec_Vowels = 9999;
	// ...this many vowels in a row (consider "queueing"),

const int Word_Max_Consec_Same = 9999;
	// ...this many occurrences of the same alphabetic character in a
	// row, or

const int Word_Max_Consec_Puncts = 9999;
	// ...this many punctuation characters in a row.
61 |
|
62 |
// The set of characters allowed in words.  Only lower case letters are
// listed: letters must be lower case, so listing the upper case ones as well
// would be redundant.
//
const char Word_Chars[] = "&'-0123456789abcdefghijklmnopqrstuvwxyz_";
	// Characters that may appear in a word.  '&' is included so that
	// an acronym such as "AT&T" is kept together as one word.  In
	// contrast to SWISH-E, ';' need not be listed for character
	// entity references to be recognized and converted.

#define OPTIMIZE_WORD_CHARS 1
	// Leave this macro set to 1 when Word_Chars is the default set
	// (the alphanumerics plus the "&'-_" characters): doing so
	// optimizes the is_word_char() function, which is worth about a
	// 10% performance improvement.  If you are not using the default
	// set, you can alternatively edit that function so that the
	// optimization is kept.  See word_util.h for details.
78 |
|
79 |
const char Word_Begin_Chars[] = "0123456789abcdefghijklmnopqrstuvwxyz";
	// Characters a word is allowed to start with; this should be a
	// subset of Word_Chars.

#define OPTIMIZE_WORD_BEGIN_CHARS 1
	// Plays the same role for Word_Begin_Chars that
	// OPTIMIZE_WORD_CHARS plays for Word_Chars.
85 |
|
86 |
const char Word_End_Chars[] = "0123456789abcdefghijklmnopqrstuvwxyz";
	// Characters a word is allowed to finish with; usually the same
	// set as Word_Begin_Chars.

#define OPTIMIZE_WORD_END_CHARS 1
	// Plays the same role for Word_End_Chars that
	// OPTIMIZE_WORD_CHARS plays for Word_Chars.
91 |
|
92 |
#ifdef SEARCH_DAEMON |
93 |
////////// Search server daemon parameters //////////////////////////////////// |
94 |
|
95 |
const char SocketFile_Default[] = "/tmp/search.socket";
	// Name used for the Unix domain socket file unless a config. file
	// or the command line overrides it.
98 |
|
99 |
const int SocketPort_Default = 1967;
	// Port number used for the TCP socket unless a config. file or
	// the command line overrides it.
102 |
|
103 |
const int SocketQueueSize_Default = 511;
	// Default for the maximum number of connections that may be
	// queued for a socket; a config. file or the command line can
	// override it.  [Stevens 1998], p. 96, has this to say:
	//
	//	Historically, sample code always shows a backlog of 5,
	//	as that was the maximum value supported by 4.2BSD.
	//	This was adequate in the 1980s when busy servers would
	//	handle only a few hundred connections per day.  But
	//	with the growth of the World Wide Web (WWW), where busy
	//	servers handle millions of connections per day, this
	//	small number is completely inadequate.  Busy HTTP
	//	servers must specify a much larger backlog, and newer
	//	kernels must support larger values.
	//
	// Stevens, unfortunately, does not go on to say what a good value
	// would be, so the default of 511 is borrowed from httpd.h in
	// Apache, which explains:
	//
	//	It defaults to 511 instead of 512 because some systems
	//	store it as an 8-bit datatype; 512 truncated to 8-bits
	//	is 0, while 511 is 255 when truncated.
	//
	// What is good enough for Apache is good enough for us.
127 |
|
128 |
const int SocketTimeout_Default = 10;	// seconds
	// How long, in seconds, a client is given to complete a search
	// request before it is disconnected.  Without this limit, a
	// client could connect, never finish its request, and leave the
	// thread servicing it waiting forever.  A config. file or the
	// command line can override this value.
134 |
|
135 |
const int ThreadsMin_Default = 5;
	// Default minimum number of simultaneous threads; a config. file
	// or the command line can override it.

const int ThreadsMax_Default = 100;
	// Default maximum number of simultaneous threads; a config. file
	// or the command line can override it.
142 |
|
143 |
const int ThreadTimeout_Default = 30;	// seconds
	// How long, in seconds, a spare thread may sit idle before it
	// times out and destroys itself.  A config. file or the command
	// line can override this value.
147 |
|
148 |
const char User_Default[] = "nobody";
const char Group_Default[] = "nobody";
	// User and group that are switched to once initialization is
	// done (when started as root).  A config. file or the command
	// line can override either one.
153 |
#endif |
154 |
|
155 |
////////// Miscellaneous parameters /////////////////////////////////////////// |
156 |
|
157 |
const char ConfigFile_Default[] = "swish++.conf";
	// Name of the configuration file used unless the command line
	// specifies another.
160 |
|
161 |
const char ExtractExtension_Default[] = "txt";
	// Extension that is appended to filenames during extraction
	// unless a config. file or the command line overrides it.
165 |
|
166 |
const int FilesGrow_Default = 100;
	// When indexing incrementally, the default number of files by
	// which reserved space is grown.  A config. file or the command
	// line can override this value.
170 |
|
171 |
const int FilesReserve_Default = 1000;
	// Default for the maximum number of files that space is reserved
	// for; file_info.c has the details.  A config. file or the
	// command line can override this value.
175 |
|
176 |
const int Fork_Attempts = 5;
	// How many times a fork is attempted before giving up.  Only
	// filter.c uses this parameter.

const int Fork_Sleep = 5;		// seconds
	// How many seconds to sleep before trying to fork again.  Only
	// filter.c uses this parameter.
183 |
|
184 |
const char IndexFile_Default[] = "swish++.index";
	// Name of the index file that is generated/searched unless a
	// config. file or the command line overrides it.
187 |
|
188 |
const int ResultsMax_Default = 999999;
	// Default for the maximum number of search results; a config.
	// file or the command line can override it.
191 |
|
192 |
const char ShellFilenameDelimChars[] = " \t&;<>|";
	// The characters that delimit file names within a Unix shell
	// command.  Mind the wording: "file" names, not "path" names.
195 |
|
196 |
char const ShellFilenameEscapeChars[] = " \t!\"#$&'()*/;<>?[\\]^`{|}~";
	// Characters in a file name that must be escaped when passed to a
	// Unix shell.  This is a superset of what are commonly referred
	// to as "meta-characters" because the space and tab characters
	// are included: both are shell field separators, so an unescaped
	// one would split a file name into two words.  (The tab used to
	// be missing from the string even though it was documented as
	// being part of the set.)  Note again that this says "file" (not
	// "path") name.
201 |
|
202 |
#ifdef __CYGWIN__
const char TempDirectory_Default[] = "/temp";
#else
const char TempDirectory_Default[] = "/tmp";
#endif
	// Directory in which temporary files are placed during indexing
	// by default; it must exist.  On an OS that mounts swap space via
	// /tmp (e.g., Solaris), the more files are created in /tmp as
	// indexing progresses, the less swap space remains: indexing
	// slows down and you may run out of memory.  Should that apply to
	// you, either change the default here for all users (preferred)
	// or override it in a config. file or on the command line so
	// that a directory on a real filesystem, i.e., one on a physical
	// disk, is used, e.g., /var/tmp on some OSs.
217 |
|
218 |
const int TitleLines_Default = 12;
	// Default for the maximum number of lines into a file within
	// which its "title" (whatever that means for a given file format)
	// is looked for; a config. file or the command line can override
	// it.
222 |
|
223 |
const int Title_Max_Size = 200;
	// Longest a file's "title" (whatever that means for a given file
	// format) may be.
226 |
|
227 |
#ifdef FEATURE_word_pos |
228 |
const int WordsNear_Default = 10;
	// Two words count as being "near" each other when they are at
	// most this many words apart; a config. file or the command line
	// can override this value.
232 |
#endif |
233 |
|
234 |
const int WordPercentMax_Default = 100;
	// Default for the maximum percentage of files a word may occur
	// in before it is discarded as being too frequent; a config.
	// file or the command line can override it.
238 |
|
239 |
const int WordThreshold_Default = 250000;
	// Once the word count exceeds this value, partial indices are
	// generated and later merged, because all the words together are
	// too big to fit into memory at the same time.  Lower this value
	// if your machine begins to swap like mad while indexing.  The
	// value above works OK in a 64MB machine; as a rule of thumb, add
	// 250000 words for each additional 64MB of RAM you have.  These
	// numbers are for a SPARC machine running Solaris; other machines
	// running other operating systems use memory differently, so you
	// simply have to experiment.  Only the super-user can increase
	// this, either in a config. file or on the command line.
250 |
|
251 |
#endif /* config_H */ |
252 |
/* vim:set noet sw=8 ts=8: */ |