1 |
/* WIDE AREA INFORMATION SERVER SOFTWARE: |
2 |
No guarantees or restrictions. See the readme file for the full standard |
3 |
disclaimer. |
4 |
|
5 |
francois@welchgate.welch.jhu.edu |
6 |
*/ |
7 |
|
8 |
/* |
9 |
Title: COPYRIGHT freeWAIS-0.2 |
10 |
Author: Jane Smith |
11 |
Copyright: Copyright 1993 CNIDR |
12 |
Last update: 10.01.93 |
13 |
Abstract: This file contains the copyright statement for freeWAIS 0.2 |
14 |
|
15 |
Copyright Statement for freeWAIS 0.2 and subsquent freeWAIS |
16 |
releases: |
17 |
|
18 |
Copyright (c) MCNC, Clearinghouse for Networked Information Discovery |
19 |
and Retrieval, 1993. |
20 |
|
21 |
Permission to use, copy, modify, distribute, and sell this software |
22 |
and its documentation, in whole or in part, for any purpose is hereby |
23 |
granted without fee, provided that |
24 |
|
25 |
1. The above copyright notice and this permission notice appear in all |
26 |
copies of the software and related documentation. Notices of copyright |
27 |
and/or attribution which appear at the beginning of any file included |
28 |
in this distribution must remain intact. |
29 |
|
30 |
2. Users of this software agree to make their best efforts (a) to |
31 |
return to MCNC any improvements or extensions that they make, so that |
32 |
these may be included in future releases; and (b) to inform MCNC/CNIDR |
33 |
of noteworthy uses of this software. |
34 |
|
35 |
3. The names of MCNC and Clearinghouse for Networked Information |
36 |
Discovery and Retrieval may not be used in any advertising or publicity |
37 |
relating to the software without the specific, prior written permission |
38 |
of MCNC/CNIDR. |
39 |
|
40 |
THE SOFTWARE IS PROVIDED "AS-IS" AND WITHOUT WARRANTY OF ANY KIND, |
41 |
EXPRESS, IMPLIED OR OTHERWISE, INCLUDING WITHOUT LIMITATION, ANY |
42 |
WARRANTY OF MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE. |
43 |
|
44 |
IN NO EVENT SHALL MCNC/CNIDR BE LIABLE FOR ANY SPECIAL, INCIDENTAL, |
45 |
INDIRECT OR CONSEQUENTIAL DAMAGES OF ANY KIND, OR ANY DAMAGES WHATSOEVER |
46 |
RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER OR NOT ADVISED OF |
47 |
THE POSSIBILITY OF DAMAGE, AND ON ANY THEORY OF LIABILITY, ARISING OUT |
48 |
OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. |
49 |
*/ |
50 |
|
51 |
#ifdef __cplusplus |
52 |
extern "C" { |
53 |
#endif |
54 |
#include "EXTERN.h" |
55 |
#include "perl.h" |
56 |
#include "XSUB.h" |
57 |
#ifdef __cplusplus |
58 |
} |
59 |
|
60 |
#endif |
61 |
#include "stemmer.h" |
62 |
|
63 |
/* |
64 |
|
65 |
Purpose: Implementation of the Porter stemming algorithm documented |
66 |
in: Porter, M.F., "An Algorithm For Suffix Stripping," |
67 |
Program 14 (3), July 1980, pp. 130-137. |
68 |
|
69 |
Provenance: Written by B. Frakes and C. Cox, 1986. |
70 |
Changed by C. Fox, 1990. |
71 |
- made measure function a DFA |
72 |
- restructured structs |
73 |
- renamed functions and variables |
74 |
- restricted function and variable scopes |
75 |
Changed by C. Fox, July, 1991. |
76 |
- added ANSI C declarations |
77 |
- branch tested to 90% coverage |
78 |
|
79 |
Notes: This code will make little sense without the the Porter |
80 |
article. The stemming function converts its input to |
81 |
lower case. |
82 |
*/ |
83 |
|
84 |
#define EOS '\0' |
85 |
#define IsVowel(c) ('a'==(c)||'e'==(c)||'i'==(c)||'o'==(c)||'u'==(c)) |
86 |
|
87 |
typedef struct { |
88 |
int id; /* returned if rule fired */ |
89 |
char *old_end; /* suffix replaced */ |
90 |
char *new_end; /* suffix replacement */ |
91 |
int old_offset; /* from end of word to start of suffix */ |
92 |
int new_offset; /* from beginning to end of new suffix */ |
93 |
int min_root_size; /* min root word size for replacement */ |
94 |
int (*condition) (); /* the replacement test function */ |
95 |
} RuleList; |
96 |
|
97 |
static char LAMBDA[1] = ""; /* the constant empty string */ |
98 |
static char *end; /* pointer to the end of the word */ |
99 |
|
100 |
/*****************************************************************************/ |
101 |
/******************** Private Function Declarations **********************/ |
102 |
|
103 |
static int WordSize _((char *word)); |
104 |
static int ContainsVowel _((char *word)); |
105 |
static int EndsWithCVC _((char *word)); |
106 |
static int AddAnE _((char *word)); |
107 |
static int RemoveAnE _((char *word)); |
108 |
static int ReplaceEnd _((char *word, RuleList * rule)); |
109 |
|
110 |
/******************************************************************************/ |
111 |
/***************** Initialized Private Data Structures ********************/ |
112 |
|
113 |
static RuleList step1a_rules[] = |
114 |
{ |
115 |
101, "sses", "ss", 3, 1, -1, NULL, |
116 |
102, "ies", "i", 2, 0, -1, NULL, |
117 |
103, "ss", "ss", 1, 1, -1, NULL, |
118 |
104, "s", LAMBDA, 0, -1, -1, NULL, |
119 |
000, NULL, NULL, 0, 0, 0, NULL, |
120 |
}; |
121 |
|
122 |
static RuleList step1b_rules[] = |
123 |
{ |
124 |
105, "eed", "ee", 2, 1, 0, NULL, |
125 |
106, "ed", LAMBDA, 1, -1, -1, ContainsVowel, |
126 |
107, "ing", LAMBDA, 2, -1, -1, ContainsVowel, |
127 |
000, NULL, NULL, 0, 0, 0, NULL, |
128 |
}; |
129 |
|
130 |
static RuleList step1b1_rules[] = |
131 |
{ |
132 |
108, "at", "ate", 1, 2, -1, NULL, |
133 |
109, "bl", "ble", 1, 2, -1, NULL, |
134 |
110, "iz", "ize", 1, 2, -1, NULL, |
135 |
111, "bb", "b", 1, 0, -1, NULL, |
136 |
112, "dd", "d", 1, 0, -1, NULL, |
137 |
113, "ff", "f", 1, 0, -1, NULL, |
138 |
114, "gg", "g", 1, 0, -1, NULL, |
139 |
115, "mm", "m", 1, 0, -1, NULL, |
140 |
116, "nn", "n", 1, 0, -1, NULL, |
141 |
117, "pp", "p", 1, 0, -1, NULL, |
142 |
118, "rr", "r", 1, 0, -1, NULL, |
143 |
119, "tt", "t", 1, 0, -1, NULL, |
144 |
120, "ww", "w", 1, 0, -1, NULL, |
145 |
121, "xx", "x", 1, 0, -1, NULL, |
146 |
122, LAMBDA, "e", -1, 0, -1, AddAnE, |
147 |
000, NULL, NULL, 0, 0, 0, NULL, |
148 |
}; |
149 |
|
150 |
static RuleList step1c_rules[] = |
151 |
{ |
152 |
123, "y", "i", 0, 0, -1, ContainsVowel, |
153 |
000, NULL, NULL, 0, 0, 0, NULL, |
154 |
}; |
155 |
|
156 |
static RuleList step2_rules[] = |
157 |
{ |
158 |
203, "ational", "ate", 6, 2, 0, NULL, |
159 |
204, "tional", "tion", 5, 3, 0, NULL, |
160 |
205, "enci", "ence", 3, 3, 0, NULL, |
161 |
206, "anci", "ance", 3, 3, 0, NULL, |
162 |
207, "izer", "ize", 3, 2, 0, NULL, |
163 |
208, "abli", "able", 3, 3, 0, NULL, |
164 |
209, "alli", "al", 3, 1, 0, NULL, |
165 |
210, "entli", "ent", 4, 2, 0, NULL, |
166 |
211, "eli", "e", 2, 0, 0, NULL, |
167 |
213, "ousli", "ous", 4, 2, 0, NULL, |
168 |
214, "ization", "ize", 6, 2, 0, NULL, |
169 |
215, "ation", "ate", 4, 2, 0, NULL, |
170 |
216, "ator", "ate", 3, 2, 0, NULL, |
171 |
217, "alism", "al", 4, 1, 0, NULL, |
172 |
218, "iveness", "ive", 6, 2, 0, NULL, |
173 |
219, "fulnes", "ful", 5, 2, 0, NULL, |
174 |
220, "ousness", "ous", 6, 2, 0, NULL, |
175 |
221, "aliti", "al", 4, 1, 0, NULL, |
176 |
222, "iviti", "ive", 4, 2, 0, NULL, |
177 |
223, "biliti", "ble", 5, 2, 0, NULL, |
178 |
000, NULL, NULL, 0, 0, 0, NULL, |
179 |
}; |
180 |
|
181 |
static RuleList step3_rules[] = |
182 |
{ |
183 |
301, "icate", "ic", 4, 1, 0, NULL, |
184 |
302, "ative", LAMBDA, 4, -1, 0, NULL, |
185 |
303, "alize", "al", 4, 1, 0, NULL, |
186 |
304, "iciti", "ic", 4, 1, 0, NULL, |
187 |
305, "ical", "ic", 3, 1, 0, NULL, |
188 |
308, "ful", LAMBDA, 2, -1, 0, NULL, |
189 |
309, "ness", LAMBDA, 3, -1, 0, NULL, |
190 |
000, NULL, NULL, 0, 0, 0, NULL, |
191 |
}; |
192 |
|
193 |
static RuleList step4_rules[] = |
194 |
{ |
195 |
401, "al", LAMBDA, 1, -1, 1, NULL, |
196 |
402, "ance", LAMBDA, 3, -1, 1, NULL, |
197 |
403, "ence", LAMBDA, 3, -1, 1, NULL, |
198 |
405, "er", LAMBDA, 1, -1, 1, NULL, |
199 |
406, "ic", LAMBDA, 1, -1, 1, NULL, |
200 |
407, "able", LAMBDA, 3, -1, 1, NULL, |
201 |
408, "ible", LAMBDA, 3, -1, 1, NULL, |
202 |
409, "ant", LAMBDA, 2, -1, 1, NULL, |
203 |
410, "ement", LAMBDA, 4, -1, 1, NULL, |
204 |
411, "ment", LAMBDA, 3, -1, 1, NULL, |
205 |
412, "ent", LAMBDA, 2, -1, 1, NULL, |
206 |
423, "sion", "s", 3, 0, 1, NULL, |
207 |
424, "tion", "t", 3, 0, 1, NULL, |
208 |
415, "ou", LAMBDA, 1, -1, 1, NULL, |
209 |
416, "ism", LAMBDA, 2, -1, 1, NULL, |
210 |
417, "ate", LAMBDA, 2, -1, 1, NULL, |
211 |
418, "iti", LAMBDA, 2, -1, 1, NULL, |
212 |
419, "ous", LAMBDA, 2, -1, 1, NULL, |
213 |
420, "ive", LAMBDA, 2, -1, 1, NULL, |
214 |
421, "ize", LAMBDA, 2, -1, 1, NULL, |
215 |
000, NULL, NULL, 0, 0, 0, NULL, |
216 |
}; |
217 |
|
218 |
static RuleList step5a_rules[] = |
219 |
{ |
220 |
501, "e", LAMBDA, 0, -1, 1, NULL, |
221 |
502, "e", LAMBDA, 0, -1, -1, RemoveAnE, |
222 |
000, NULL, NULL, 0, 0, 0, NULL, |
223 |
}; |
224 |
|
225 |
static RuleList step5b_rules[] = |
226 |
{ |
227 |
503, "ll", "l", 1, 0, 1, NULL, |
228 |
000, NULL, NULL, 0, 0, 0, NULL, |
229 |
}; |
230 |
|
231 |
|
232 |
/* |
233 |
|
234 |
WordSize( word ) |
235 |
|
236 |
Returns: int -- a weird count of word size in adjusted syllables |
237 |
|
238 |
Purpose: Count syllables in a special way: count the number |
239 |
vowel-consonant pairs in a word, disregarding initial |
240 |
consonants and final vowels. The letter "y" counts as a |
241 |
consonant at the beginning of a word and when it has a vowel |
242 |
in front of it; otherwise (when it follows a consonant) it |
243 |
is treated as a vowel. For example, the WordSize of "cat" |
244 |
is 1, of "any" is 1, of "amount" is 2, of "anything" is 3. |
245 |
|
246 |
Plan: Run a DFA to compute the word size |
247 |
|
248 |
Notes: The easiest and fastest way to compute this funny measure is |
249 |
with a finite state machine. The initial state 0 checks |
250 |
the first letter. If it is a vowel, then the machine changes |
251 |
to state 1, which is the "last letter was a vowel" state. |
252 |
If the first letter is a consonant or y, then it changes |
253 |
to state 2, the "last letter was a consonant state". In |
254 |
state 1, a y is treated as a consonant (since it follows |
255 |
a vowel), but in state 2, y is treated as a vowel (since |
256 |
it follows a consonant. The result counter is incremented |
257 |
on the transition from state 1 to state 2, since this |
258 |
transition only occurs after a vowel-consonant pair, which |
259 |
is what we are counting. |
260 |
*/ |
261 |
|
262 |
static int |
263 |
WordSize(word) |
264 |
char *word; /* in: word having its WordSize taken */ |
265 |
{ |
266 |
register int result; /* WordSize of the word */ |
267 |
register int state; /* current state in machine */ |
268 |
|
269 |
result = 0; |
270 |
state = 0; |
271 |
|
272 |
/* Run a DFA to compute the word size */ |
273 |
while (EOS != *word) { |
274 |
switch (state) { |
275 |
case 0: |
276 |
state = (IsVowel(*word)) ? 1 : 2; |
277 |
break; |
278 |
case 1: |
279 |
state = (IsVowel(*word)) ? 1 : 2; |
280 |
if (2 == state) |
281 |
result++; |
282 |
break; |
283 |
case 2: |
284 |
state = (IsVowel(*word) || ('y' == *word)) ? 1 : 2; |
285 |
break; |
286 |
} |
287 |
word++; |
288 |
} |
289 |
|
290 |
return (result); |
291 |
|
292 |
} |
293 |
|
294 |
/* |
295 |
ContainsVowel( word ) |
296 |
|
297 |
Returns: int -- TRUE (1) if the word parameter contains a vowel, |
298 |
FALSE (0) otherwise. |
299 |
|
300 |
Purpose: Some of the rewrite rules apply only to a root containing |
301 |
a vowel, where a vowel is one of "aeiou" or y with a |
302 |
consonant in front of it. |
303 |
|
304 |
Plan: Obviously, under the definition of a vowel, a word contains |
305 |
a vowel iff either its first letter is one of "aeiou", or |
306 |
any of its other letters are "aeiouy". The plan is to |
307 |
test this condition. |
308 |
|
309 |
Notes: None |
310 |
*/ |
311 |
|
312 |
static int |
313 |
ContainsVowel(word) |
314 |
char *word; /* in: buffer with word checked */ |
315 |
{ |
316 |
|
317 |
if (EOS == *word) |
318 |
return (FALSE); |
319 |
else |
320 |
return (IsVowel(*word) || (NULL != strpbrk(word + 1, "aeiouy"))); |
321 |
|
322 |
} /* ContainsVowel */ |
323 |
|
324 |
/* |
325 |
|
326 |
EndsWithCVC( word ) |
327 |
|
328 |
Returns: int -- TRUE (1) if the current word ends with a |
329 |
consonant-vowel-consonant combination, and the second |
330 |
consonant is not w, x, or y, FALSE (0) otherwise. |
331 |
|
332 |
Purpose: Some of the rewrite rules apply only to a root with |
333 |
this characteristic. |
334 |
|
335 |
Plan: Look at the last three characters. |
336 |
|
337 |
Notes: None |
338 |
*/ |
339 |
|
340 |
static int |
341 |
EndsWithCVC(word) |
342 |
char *word; /* in: buffer with the word checked */ |
343 |
{ |
344 |
int length; /* for finding the last three characters */ |
345 |
|
346 |
if ((length = strlen(word)) < 2) |
347 |
return (FALSE); |
348 |
else { |
349 |
end = word + length - 1; |
350 |
return ((NULL == strchr("aeiouwxy", *end--)) /* consonant */ |
351 |
&&(NULL != strchr("aeiouy", *end--)) /* vowel */ |
352 |
&&(NULL == strchr("aeiou", *end))); /* consonant */ |
353 |
} |
354 |
|
355 |
} |
356 |
|
357 |
/* |
358 |
AddAnE( word ) |
359 |
|
360 |
Returns: int -- TRUE (1) if the current word meets special conditions |
361 |
for adding an e. |
362 |
|
363 |
Purpose: Rule 122 applies only to a root with this characteristic. |
364 |
|
365 |
Plan: Check for size of 1 and a consonant-vowel-consonant ending. |
366 |
|
367 |
Notes: None |
368 |
*/ |
369 |
|
370 |
static int |
371 |
AddAnE(word) |
372 |
char *word; |
373 |
{ |
374 |
return ((1 == WordSize(word)) && EndsWithCVC(word)); |
375 |
} |
376 |
|
377 |
/* |
378 |
RemoveAnE( word ) |
379 |
|
380 |
Returns: int -- TRUE (1) if the current word meets special conditions |
381 |
for removing an e. |
382 |
|
383 |
Purpose: Rule 502 applies only to a root with this characteristic. |
384 |
|
385 |
Plan: Check for size of 1 and no consonant-vowel-consonant ending. |
386 |
|
387 |
Notes: None |
388 |
*/ |
389 |
|
390 |
static int |
391 |
RemoveAnE(word) |
392 |
char *word; |
393 |
{ |
394 |
return ((1 == WordSize(word)) && !EndsWithCVC(word)); |
395 |
} /* RemoveAnE */ |
396 |
|
397 |
/* |
398 |
ReplaceEnd( word, rule ) |
399 |
|
400 |
Returns: int -- the id for the rule fired, 0 is none is fired |
401 |
|
402 |
Purpose: Apply a set of rules to replace the suffix of a word |
403 |
|
404 |
Plan: Loop through the rule set until a match meeting all conditions |
405 |
is found. If a rule fires, return its id, otherwise return 0. |
406 |
Connditions on the length of the root are checked as part of this |
407 |
function's processing because this check is so often made. |
408 |
|
409 |
Notes: This is the main routine driving the stemmer. It goes through |
410 |
a set of suffix replacement rules looking for a match on the |
411 |
current suffix. When it finds one, if the root of the word |
412 |
is long enough, and it meets whatever other conditions are |
413 |
required, then the suffix is replaced, and the function returns. |
414 |
* */ |
415 |
|
416 |
static int |
417 |
ReplaceEnd(word, rule) |
418 |
char *word; /* in/out: buffer with the stemmed word */ |
419 |
RuleList *rule; /* in: data structure with replacement rules */ |
420 |
{ |
421 |
register char *ending; /* set to start of possible stemmed suffix */ |
422 |
char tmp_ch; /* save replaced character when testing */ |
423 |
|
424 |
while (0 != rule->id) { |
425 |
ending = end - rule->old_offset; |
426 |
if (word <= ending) |
427 |
if ((*ending == *rule->old_end) && /* for speed */ |
428 |
!strcmp(ending, rule->old_end)) { |
429 |
tmp_ch = *ending; |
430 |
*ending = EOS; |
431 |
if (rule->min_root_size < WordSize(word)) |
432 |
if (!rule->condition || (*rule->condition) (word)) { |
433 |
(void) strcat(word, rule->new_end); |
434 |
end = ending + rule->new_offset; |
435 |
break; |
436 |
} |
437 |
*ending = tmp_ch; |
438 |
} |
439 |
rule++; |
440 |
} |
441 |
|
442 |
return (rule->id); |
443 |
} |
444 |
|
445 |
/* |
446 |
Stem( word ) |
447 |
|
448 |
Returns: int -- FALSE (0) if the word contains non-alphabetic characters |
449 |
and hence is not stemmed, TRUE (1) otherwise |
450 |
|
451 |
Purpose: Stem a word |
452 |
|
453 |
Plan: Part 1: Check to ensure the word is all alphabetic |
454 |
Part 2: Run through the Porter algorithm |
455 |
Part 3: Return an indication of successful stemming |
456 |
|
457 |
Notes: This function implements the Porter stemming algorithm, with |
458 |
a few additions here and there. See: |
459 |
|
460 |
Porter, M.F., "An Algorithm For Suffix Stripping," |
461 |
Program 14 (3), July 1980, pp. 130-137. |
462 |
|
463 |
Porter's algorithm is an ad hoc set of rewrite rules with |
464 |
various conditions on rule firing. The terminology of |
465 |
"step 1a" and so on, is taken directly from Porter's |
466 |
article, which unfortunately gives almost no justification |
467 |
for the various steps. Thus this function more or less |
468 |
faithfully refects the opaque presentation in the article. |
469 |
Changes from the article amount to a few additions to the |
470 |
rewrite rules; these are marked in the RuleList data |
471 |
structures with comments. |
472 |
*/ |
473 |
|
474 |
int |
475 |
Stem(word) |
476 |
char *word; /* in/out: the word stemmed */ |
477 |
{ |
478 |
int rule; /* which rule is fired in replacing an end */ |
479 |
|
480 |
/* Part 1: Check to ensure the word is all alphabetic */ |
481 |
for (end = word; *end != EOS; end++) |
482 |
if (!isalpha(*end)) |
483 |
return (FALSE); |
484 |
else |
485 |
*end = tolower(*end); |
486 |
end--; |
487 |
|
488 |
/* Part 2: Run through the Porter algorithm */ |
489 |
(void) ReplaceEnd(word, step1a_rules); |
490 |
rule = ReplaceEnd(word, step1b_rules); |
491 |
if ((106 == rule) || (107 == rule)) |
492 |
(void) ReplaceEnd(word, step1b1_rules); |
493 |
(void) ReplaceEnd(word, step1c_rules); |
494 |
|
495 |
(void) ReplaceEnd(word, step2_rules); |
496 |
|
497 |
(void) ReplaceEnd(word, step3_rules); |
498 |
|
499 |
(void) ReplaceEnd(word, step4_rules); |
500 |
|
501 |
(void) ReplaceEnd(word, step5a_rules); |
502 |
(void) ReplaceEnd(word, step5b_rules); |
503 |
|
504 |
/* Part 3: Return an indication of successful stemming */ |
505 |
return (TRUE); |
506 |
} |