/[wait]/trunk/stemmer.c
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Contents of /trunk/stemmer.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 88 - (show annotations)
Mon May 24 13:44:01 2004 UTC (19 years, 11 months ago) by dpavlin
File MIME type: text/plain
File size: 17701 byte(s)
move cvs-head to trunk

1 /* WIDE AREA INFORMATION SERVER SOFTWARE:
2 No guarantees or restrictions. See the readme file for the full standard
3 disclaimer.
4
5 francois@welchgate.welch.jhu.edu
6 */
7
8 /*
9 Title: COPYRIGHT freeWAIS-0.2
10 Author: Jane Smith
11 Copyright: Copyright 1993 CNIDR
12 Last update: 10.01.93
13 Abstract: This file contains the copyright statement for freeWAIS 0.2
14
15 Copyright Statement for freeWAIS 0.2 and subsquent freeWAIS
16 releases:
17
18 Copyright (c) MCNC, Clearinghouse for Networked Information Discovery
19 and Retrieval, 1993.
20
21 Permission to use, copy, modify, distribute, and sell this software
22 and its documentation, in whole or in part, for any purpose is hereby
23 granted without fee, provided that
24
25 1. The above copyright notice and this permission notice appear in all
26 copies of the software and related documentation. Notices of copyright
27 and/or attribution which appear at the beginning of any file included
28 in this distribution must remain intact.
29
30 2. Users of this software agree to make their best efforts (a) to
31 return to MCNC any improvements or extensions that they make, so that
32 these may be included in future releases; and (b) to inform MCNC/CNIDR
33 of noteworthy uses of this software.
34
35 3. The names of MCNC and Clearinghouse for Networked Information
36 Discovery and Retrieval may not be used in any advertising or publicity
37 relating to the software without the specific, prior written permission
38 of MCNC/CNIDR.
39
40 THE SOFTWARE IS PROVIDED "AS-IS" AND WITHOUT WARRANTY OF ANY KIND,
41 EXPRESS, IMPLIED OR OTHERWISE, INCLUDING WITHOUT LIMITATION, ANY
42 WARRANTY OF MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE.
43
44 IN NO EVENT SHALL MCNC/CNIDR BE LIABLE FOR ANY SPECIAL, INCIDENTAL,
45 INDIRECT OR CONSEQUENTIAL DAMAGES OF ANY KIND, OR ANY DAMAGES WHATSOEVER
46 RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER OR NOT ADVISED OF
47 THE POSSIBILITY OF DAMAGE, AND ON ANY THEORY OF LIABILITY, ARISING OUT
48 OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
49 */
50
51 #ifdef __cplusplus
52 extern "C" {
53 #endif
54 #include "EXTERN.h"
55 #include "perl.h"
56 #include "XSUB.h"
57 #ifdef __cplusplus
58 }
59
60 #endif
61 #include "stemmer.h"
62
63 /*
64
65 Purpose: Implementation of the Porter stemming algorithm documented
66 in: Porter, M.F., "An Algorithm For Suffix Stripping,"
67 Program 14 (3), July 1980, pp. 130-137.
68
69 Provenance: Written by B. Frakes and C. Cox, 1986.
70 Changed by C. Fox, 1990.
71 - made measure function a DFA
72 - restructured structs
73 - renamed functions and variables
74 - restricted function and variable scopes
75 Changed by C. Fox, July, 1991.
76 - added ANSI C declarations
77 - branch tested to 90% coverage
78
79 Notes: This code will make little sense without the the Porter
80 article. The stemming function converts its input to
81 lower case.
82 */
83
84 #define EOS '\0'
85 #define IsVowel(c) ('a'==(c)||'e'==(c)||'i'==(c)||'o'==(c)||'u'==(c))
86
87 typedef struct {
88 int id; /* returned if rule fired */
89 char *old_end; /* suffix replaced */
90 char *new_end; /* suffix replacement */
91 int old_offset; /* from end of word to start of suffix */
92 int new_offset; /* from beginning to end of new suffix */
93 int min_root_size; /* min root word size for replacement */
94 int (*condition) (); /* the replacement test function */
95 } RuleList;
96
97 static char LAMBDA[1] = ""; /* the constant empty string */
98 static char *end; /* pointer to the end of the word */
99
100 /*****************************************************************************/
101 /******************** Private Function Declarations **********************/
102
103 static int WordSize _((char *word));
104 static int ContainsVowel _((char *word));
105 static int EndsWithCVC _((char *word));
106 static int AddAnE _((char *word));
107 static int RemoveAnE _((char *word));
108 static int ReplaceEnd _((char *word, RuleList * rule));
109
110 /******************************************************************************/
111 /***************** Initialized Private Data Structures ********************/
112
113 static RuleList step1a_rules[] =
114 {
115 101, "sses", "ss", 3, 1, -1, NULL,
116 102, "ies", "i", 2, 0, -1, NULL,
117 103, "ss", "ss", 1, 1, -1, NULL,
118 104, "s", LAMBDA, 0, -1, -1, NULL,
119 000, NULL, NULL, 0, 0, 0, NULL,
120 };
121
122 static RuleList step1b_rules[] =
123 {
124 105, "eed", "ee", 2, 1, 0, NULL,
125 106, "ed", LAMBDA, 1, -1, -1, ContainsVowel,
126 107, "ing", LAMBDA, 2, -1, -1, ContainsVowel,
127 000, NULL, NULL, 0, 0, 0, NULL,
128 };
129
130 static RuleList step1b1_rules[] =
131 {
132 108, "at", "ate", 1, 2, -1, NULL,
133 109, "bl", "ble", 1, 2, -1, NULL,
134 110, "iz", "ize", 1, 2, -1, NULL,
135 111, "bb", "b", 1, 0, -1, NULL,
136 112, "dd", "d", 1, 0, -1, NULL,
137 113, "ff", "f", 1, 0, -1, NULL,
138 114, "gg", "g", 1, 0, -1, NULL,
139 115, "mm", "m", 1, 0, -1, NULL,
140 116, "nn", "n", 1, 0, -1, NULL,
141 117, "pp", "p", 1, 0, -1, NULL,
142 118, "rr", "r", 1, 0, -1, NULL,
143 119, "tt", "t", 1, 0, -1, NULL,
144 120, "ww", "w", 1, 0, -1, NULL,
145 121, "xx", "x", 1, 0, -1, NULL,
146 122, LAMBDA, "e", -1, 0, -1, AddAnE,
147 000, NULL, NULL, 0, 0, 0, NULL,
148 };
149
150 static RuleList step1c_rules[] =
151 {
152 123, "y", "i", 0, 0, -1, ContainsVowel,
153 000, NULL, NULL, 0, 0, 0, NULL,
154 };
155
156 static RuleList step2_rules[] =
157 {
158 203, "ational", "ate", 6, 2, 0, NULL,
159 204, "tional", "tion", 5, 3, 0, NULL,
160 205, "enci", "ence", 3, 3, 0, NULL,
161 206, "anci", "ance", 3, 3, 0, NULL,
162 207, "izer", "ize", 3, 2, 0, NULL,
163 208, "abli", "able", 3, 3, 0, NULL,
164 209, "alli", "al", 3, 1, 0, NULL,
165 210, "entli", "ent", 4, 2, 0, NULL,
166 211, "eli", "e", 2, 0, 0, NULL,
167 213, "ousli", "ous", 4, 2, 0, NULL,
168 214, "ization", "ize", 6, 2, 0, NULL,
169 215, "ation", "ate", 4, 2, 0, NULL,
170 216, "ator", "ate", 3, 2, 0, NULL,
171 217, "alism", "al", 4, 1, 0, NULL,
172 218, "iveness", "ive", 6, 2, 0, NULL,
173 219, "fulnes", "ful", 5, 2, 0, NULL,
174 220, "ousness", "ous", 6, 2, 0, NULL,
175 221, "aliti", "al", 4, 1, 0, NULL,
176 222, "iviti", "ive", 4, 2, 0, NULL,
177 223, "biliti", "ble", 5, 2, 0, NULL,
178 000, NULL, NULL, 0, 0, 0, NULL,
179 };
180
181 static RuleList step3_rules[] =
182 {
183 301, "icate", "ic", 4, 1, 0, NULL,
184 302, "ative", LAMBDA, 4, -1, 0, NULL,
185 303, "alize", "al", 4, 1, 0, NULL,
186 304, "iciti", "ic", 4, 1, 0, NULL,
187 305, "ical", "ic", 3, 1, 0, NULL,
188 308, "ful", LAMBDA, 2, -1, 0, NULL,
189 309, "ness", LAMBDA, 3, -1, 0, NULL,
190 000, NULL, NULL, 0, 0, 0, NULL,
191 };
192
193 static RuleList step4_rules[] =
194 {
195 401, "al", LAMBDA, 1, -1, 1, NULL,
196 402, "ance", LAMBDA, 3, -1, 1, NULL,
197 403, "ence", LAMBDA, 3, -1, 1, NULL,
198 405, "er", LAMBDA, 1, -1, 1, NULL,
199 406, "ic", LAMBDA, 1, -1, 1, NULL,
200 407, "able", LAMBDA, 3, -1, 1, NULL,
201 408, "ible", LAMBDA, 3, -1, 1, NULL,
202 409, "ant", LAMBDA, 2, -1, 1, NULL,
203 410, "ement", LAMBDA, 4, -1, 1, NULL,
204 411, "ment", LAMBDA, 3, -1, 1, NULL,
205 412, "ent", LAMBDA, 2, -1, 1, NULL,
206 423, "sion", "s", 3, 0, 1, NULL,
207 424, "tion", "t", 3, 0, 1, NULL,
208 415, "ou", LAMBDA, 1, -1, 1, NULL,
209 416, "ism", LAMBDA, 2, -1, 1, NULL,
210 417, "ate", LAMBDA, 2, -1, 1, NULL,
211 418, "iti", LAMBDA, 2, -1, 1, NULL,
212 419, "ous", LAMBDA, 2, -1, 1, NULL,
213 420, "ive", LAMBDA, 2, -1, 1, NULL,
214 421, "ize", LAMBDA, 2, -1, 1, NULL,
215 000, NULL, NULL, 0, 0, 0, NULL,
216 };
217
218 static RuleList step5a_rules[] =
219 {
220 501, "e", LAMBDA, 0, -1, 1, NULL,
221 502, "e", LAMBDA, 0, -1, -1, RemoveAnE,
222 000, NULL, NULL, 0, 0, 0, NULL,
223 };
224
225 static RuleList step5b_rules[] =
226 {
227 503, "ll", "l", 1, 0, 1, NULL,
228 000, NULL, NULL, 0, 0, 0, NULL,
229 };
230
231
232 /*
233
234 WordSize( word )
235
236 Returns: int -- a weird count of word size in adjusted syllables
237
238 Purpose: Count syllables in a special way: count the number
239 vowel-consonant pairs in a word, disregarding initial
240 consonants and final vowels. The letter "y" counts as a
241 consonant at the beginning of a word and when it has a vowel
242 in front of it; otherwise (when it follows a consonant) it
243 is treated as a vowel. For example, the WordSize of "cat"
244 is 1, of "any" is 1, of "amount" is 2, of "anything" is 3.
245
246 Plan: Run a DFA to compute the word size
247
248 Notes: The easiest and fastest way to compute this funny measure is
249 with a finite state machine. The initial state 0 checks
250 the first letter. If it is a vowel, then the machine changes
251 to state 1, which is the "last letter was a vowel" state.
252 If the first letter is a consonant or y, then it changes
253 to state 2, the "last letter was a consonant state". In
254 state 1, a y is treated as a consonant (since it follows
255 a vowel), but in state 2, y is treated as a vowel (since
256 it follows a consonant. The result counter is incremented
257 on the transition from state 1 to state 2, since this
258 transition only occurs after a vowel-consonant pair, which
259 is what we are counting.
260 */
261
262 static int
263 WordSize(word)
264 char *word; /* in: word having its WordSize taken */
265 {
266 register int result; /* WordSize of the word */
267 register int state; /* current state in machine */
268
269 result = 0;
270 state = 0;
271
272 /* Run a DFA to compute the word size */
273 while (EOS != *word) {
274 switch (state) {
275 case 0:
276 state = (IsVowel(*word)) ? 1 : 2;
277 break;
278 case 1:
279 state = (IsVowel(*word)) ? 1 : 2;
280 if (2 == state)
281 result++;
282 break;
283 case 2:
284 state = (IsVowel(*word) || ('y' == *word)) ? 1 : 2;
285 break;
286 }
287 word++;
288 }
289
290 return (result);
291
292 }
293
294 /*
295 ContainsVowel( word )
296
297 Returns: int -- TRUE (1) if the word parameter contains a vowel,
298 FALSE (0) otherwise.
299
300 Purpose: Some of the rewrite rules apply only to a root containing
301 a vowel, where a vowel is one of "aeiou" or y with a
302 consonant in front of it.
303
304 Plan: Obviously, under the definition of a vowel, a word contains
305 a vowel iff either its first letter is one of "aeiou", or
306 any of its other letters are "aeiouy". The plan is to
307 test this condition.
308
309 Notes: None
310 */
311
312 static int
313 ContainsVowel(word)
314 char *word; /* in: buffer with word checked */
315 {
316
317 if (EOS == *word)
318 return (FALSE);
319 else
320 return (IsVowel(*word) || (NULL != strpbrk(word + 1, "aeiouy")));
321
322 } /* ContainsVowel */
323
324 /*
325
326 EndsWithCVC( word )
327
328 Returns: int -- TRUE (1) if the current word ends with a
329 consonant-vowel-consonant combination, and the second
330 consonant is not w, x, or y, FALSE (0) otherwise.
331
332 Purpose: Some of the rewrite rules apply only to a root with
333 this characteristic.
334
335 Plan: Look at the last three characters.
336
337 Notes: None
338 */
339
340 static int
341 EndsWithCVC(word)
342 char *word; /* in: buffer with the word checked */
343 {
344 int length; /* for finding the last three characters */
345
346 if ((length = strlen(word)) < 2)
347 return (FALSE);
348 else {
349 end = word + length - 1;
350 return ((NULL == strchr("aeiouwxy", *end--)) /* consonant */
351 &&(NULL != strchr("aeiouy", *end--)) /* vowel */
352 &&(NULL == strchr("aeiou", *end))); /* consonant */
353 }
354
355 }
356
357 /*
358 AddAnE( word )
359
360 Returns: int -- TRUE (1) if the current word meets special conditions
361 for adding an e.
362
363 Purpose: Rule 122 applies only to a root with this characteristic.
364
365 Plan: Check for size of 1 and a consonant-vowel-consonant ending.
366
367 Notes: None
368 */
369
370 static int
371 AddAnE(word)
372 char *word;
373 {
374 return ((1 == WordSize(word)) && EndsWithCVC(word));
375 }
376
377 /*
378 RemoveAnE( word )
379
380 Returns: int -- TRUE (1) if the current word meets special conditions
381 for removing an e.
382
383 Purpose: Rule 502 applies only to a root with this characteristic.
384
385 Plan: Check for size of 1 and no consonant-vowel-consonant ending.
386
387 Notes: None
388 */
389
390 static int
391 RemoveAnE(word)
392 char *word;
393 {
394 return ((1 == WordSize(word)) && !EndsWithCVC(word));
395 } /* RemoveAnE */
396
397 /*
398 ReplaceEnd( word, rule )
399
400 Returns: int -- the id for the rule fired, 0 is none is fired
401
402 Purpose: Apply a set of rules to replace the suffix of a word
403
404 Plan: Loop through the rule set until a match meeting all conditions
405 is found. If a rule fires, return its id, otherwise return 0.
406 Connditions on the length of the root are checked as part of this
407 function's processing because this check is so often made.
408
409 Notes: This is the main routine driving the stemmer. It goes through
410 a set of suffix replacement rules looking for a match on the
411 current suffix. When it finds one, if the root of the word
412 is long enough, and it meets whatever other conditions are
413 required, then the suffix is replaced, and the function returns.
414 * */
415
416 static int
417 ReplaceEnd(word, rule)
418 char *word; /* in/out: buffer with the stemmed word */
419 RuleList *rule; /* in: data structure with replacement rules */
420 {
421 register char *ending; /* set to start of possible stemmed suffix */
422 char tmp_ch; /* save replaced character when testing */
423
424 while (0 != rule->id) {
425 ending = end - rule->old_offset;
426 if (word <= ending)
427 if ((*ending == *rule->old_end) && /* for speed */
428 !strcmp(ending, rule->old_end)) {
429 tmp_ch = *ending;
430 *ending = EOS;
431 if (rule->min_root_size < WordSize(word))
432 if (!rule->condition || (*rule->condition) (word)) {
433 (void) strcat(word, rule->new_end);
434 end = ending + rule->new_offset;
435 break;
436 }
437 *ending = tmp_ch;
438 }
439 rule++;
440 }
441
442 return (rule->id);
443 }
444
445 /*
446 Stem( word )
447
448 Returns: int -- FALSE (0) if the word contains non-alphabetic characters
449 and hence is not stemmed, TRUE (1) otherwise
450
451 Purpose: Stem a word
452
453 Plan: Part 1: Check to ensure the word is all alphabetic
454 Part 2: Run through the Porter algorithm
455 Part 3: Return an indication of successful stemming
456
457 Notes: This function implements the Porter stemming algorithm, with
458 a few additions here and there. See:
459
460 Porter, M.F., "An Algorithm For Suffix Stripping,"
461 Program 14 (3), July 1980, pp. 130-137.
462
463 Porter's algorithm is an ad hoc set of rewrite rules with
464 various conditions on rule firing. The terminology of
465 "step 1a" and so on, is taken directly from Porter's
466 article, which unfortunately gives almost no justification
467 for the various steps. Thus this function more or less
468 faithfully refects the opaque presentation in the article.
469 Changes from the article amount to a few additions to the
470 rewrite rules; these are marked in the RuleList data
471 structures with comments.
472 */
473
474 int
475 Stem(word)
476 char *word; /* in/out: the word stemmed */
477 {
478 int rule; /* which rule is fired in replacing an end */
479
480 /* Part 1: Check to ensure the word is all alphabetic */
481 for (end = word; *end != EOS; end++)
482 if (!isalpha(*end))
483 return (FALSE);
484 else
485 *end = tolower(*end);
486 end--;
487
488 /* Part 2: Run through the Porter algorithm */
489 (void) ReplaceEnd(word, step1a_rules);
490 rule = ReplaceEnd(word, step1b_rules);
491 if ((106 == rule) || (107 == rule))
492 (void) ReplaceEnd(word, step1b1_rules);
493 (void) ReplaceEnd(word, step1c_rules);
494
495 (void) ReplaceEnd(word, step2_rules);
496
497 (void) ReplaceEnd(word, step3_rules);
498
499 (void) ReplaceEnd(word, step4_rules);
500
501 (void) ReplaceEnd(word, step5a_rules);
502 (void) ReplaceEnd(word, step5b_rules);
503
504 /* Part 3: Return an indication of successful stemming */
505 return (TRUE);
506 }

Properties

Name Value
cvs2svn:cvs-rev 1.1

  ViewVC Help
Powered by ViewVC 1.1.26