/[Lingua-Spelling-Alternative]/Alternative.pm
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Contents of /Alternative.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.4 - (show annotations)
Tue Feb 12 12:41:31 2002 UTC (22 years, 1 month ago) by dpavlin
Branch: MAIN
Changes since 1.3: +38 -7 lines
new minimal function which returns alternative with minimal length (it's
a poor man's version or normalizing a word)

1 # Documentation and Copyright exist after __END__
2
3 package Lingua::Spelling::Alternative;
4 require 5.001;
5
6 use strict;
7 use vars qw($VERSION @ISA @EXPORT @EXPORT_OK);
8
9 use Exporter;
10 $VERSION = '0.01';
11 @ISA = ('Exporter');
12
13 #@EXPORT = qw();
14 @EXPORT_OK = qw(
15 &alternatives
16 );
17
18 my $debug=0;
19
20 #
21 # make new instance of language, get args
22 #
23 sub new {
24 my $class = shift;
25 my $self = {};
26 bless($self, $class);
27 $self->{ARGS} = {@_};
28 $debug = $self->{ARGS}->{DEBUG};
29 @{$self->{affix_regexp}} = ();
30 @{$self->{affix_add}} = ();
31 @{$self->{affix_sub}} = ();
32 $self ? return $self : return undef;
33 }
34
35
36 #
37 # load affix file in internal structures
38 #
39
40 sub load_affix {
41 my $self = shift;
42 my $filename = shift;
43
44 my $suffixes=0;
45
46 my ($regexp,$add,$sub);
47
48 print STDERR "reading affix file $filename\n" if ($debug);
49
50 open (A,$filename) || die "Can't open affix file $filename: $!";
51 while(<A>) {
52 chomp;
53 next if (/^#|^[\s\t\n\r]*$/);
54
55 if (/^suffixes/i) {
56 $suffixes++;
57 next;
58 }
59
60 next if (! $suffixes);
61
62 if (/^flag[\s\t]+\*{0,1}(.):/i) {
63 undef $regexp;
64 undef $add;
65 undef $sub;
66 next;
67 }
68
69 if (/^[\s\t]*([^>#]+)>[\s\t]+-([^\,\s\t]+),([^\s\t]+)/) {
70 $regexp = $1;
71 $add = $2;
72 $sub = $3 if ($3 ne "-");
73 } elsif (/^[\s\t]*([^>#]+)>[\s\t]+([^\s\t\#]+)/) {
74 $regexp = $1;
75 $sub = $2;
76 }
77
78 sub nuke_s {
79 my $tmp = $_[0];
80 return if (!$tmp);
81 # $tmp=~s/^\s+//g;
82 # $tmp=~s/\s+$//g;
83 $tmp=~s/\s+//g;
84 return $tmp;
85 }
86
87 push @{$self->{affix_regexp}},nuke_s($regexp);
88 push @{$self->{affix_add}},nuke_s($add);
89 push @{$self->{affix_sub}},nuke_s($sub);
90 }
91 return 1;
92 }
93
94 #
95 # function for reading raw findaffix output
96 #
97
98 sub load_findaffix {
99 my $self = shift;
100 my $filename = shift;
101
102 print STDERR "reading findaffix output $filename\n" if ($debug);
103
104 open (A,$filename) || die "Can't open findaffix output $filename: $!";
105 while(<A>) {
106 chomp;
107 my @line=split(m;/;,$_,4);
108 if ($#line > 2) {
109 push @{$self->{affix_regexp}},'.';
110 push @{$self->{affix_sub}},$line[0];
111 push @{$self->{affix_add}},$line[1];
112 }
113 }
114 return 1;
115 }
116
117 #
118 # function which returns original word and all alternatives
119 #
120
121 sub alternatives {
122 my $self = shift;
123 my @out;
124 foreach my $word (@_) {
125 push @out,$word; # save original word
126 next if (length($word) < 3); # cludge: preskoci kratke
127 for(my $i=0; $i<=$#{$self->{affix_regexp}}; $i++) {
128 my $regexp = $self->{affix_regexp}[$i];
129 my $add = $self->{affix_add}[$i];
130 my $sub = $self->{affix_sub}[$i];
131 print STDERR "r:'$regexp'\t-'",$sub||'',"'\t+'",$add||'',"'\n" if ($debug);
132 next if length($word) < length($sub);
133 my $tmp_word = $word;
134 if ($sub) {
135 next if ($word !~ m/$sub$/i);
136 if ($add) {
137 $tmp_word =~ s/$sub$/$add/i;
138 } else {
139 $tmp_word =~ s/$sub$//i;
140 }
141 } else {
142 $tmp_word = $word.$add;
143 }
144 print STDERR "\t ?:$tmp_word\n" if ($debug);
145 if ($tmp_word =~ m/$regexp/ix) {
146 # print "$word -> $tmp_word\t-$sub, +$add, regexp: $regexp\n";
147 push @out,lc($tmp_word);
148 }
149 }
150 }
151 return @out;
152 }
153
154 #
155 # function which return minimal word of all alternatives
156 #
157
158 sub minimal {
159 my $self = shift;
160 my @out;
161 foreach my $word (@_) {
162 my @alt = $self->alternatives($word);
163 my $minimal = shift @alt;
164 foreach (@alt) {
165 $minimal=$_ if (length($_) < length($minimal));
166 }
167 push @out,$minimal;
168 }
169 return @out;
170 }
171
172 ###############################################################################
173 1;
174 __END__
175
176 =head1 NAME
177
178 Alternative.pm - alternative spelling of a given word in a given language
179
180 =head1 SYNOPSIS
181
182 use Lingua::Spelling:Alternative;
183
184 my $en = new Alternative;
185 $en->load_affix('/usr/lib/ispell/english.aff') or die $!;
186 print $en->alternatives("cars");
187
188 =head1 DESCRIPTION
189
190 This module is designed to return all valid forms of a given word
191 (for example when you want to see all possible forms of some word
192 entered in search engine)
193
194 =head1 PUBLIC METHODS
195
196 =over 4
197
198 =item new
199
200 The new() constructor (without parameters) create container for new language.
201 Only parametar it supports is DEBUG which turns on (some) debugging output.
202
203 =item load_affix
204
205 Function load_affix loads ispell's affix file for later usage.
206
207 =item load_findaffix
208
209 This function loads output of findaffix program from ispell package.
210 This is better idea (if you are creating affix file for particular language
211 yourself) because affix file from ispell (which is created from data returned
212 by findaffix) is limited to 26 entries (because each entry is denoted by
213 single character).
214
215 =item alternatives
216
217 Function alternatives return all alternative spellings of particular
218 word(s). It will also return spelling which are not correct if there is
219 rule like that in affix file.
220
221 =item minimal
222
223 This function returns minimal of all alternatives of a given word(s). It's
224 a poor man's version of normalize (because we don't know gramatic of
225 particular language, just some spelling rules).
226
227 =head1 PRIVATE METHODS
228
229 Documented as being not documented.
230
231 =head1 EXAMPLES
232
233 Please see the test.pl program in distribution which exercises some
234 aspects of Alternative.pm.
235
236 =head1 BUGS
237
238 There are no known bugs.
239
240 =head1 CONTACT AND COPYRIGHT
241
242 Copyright 2002 Dobrica Pavlinusic (dpavlin@rot13.org). All
243 rights reserved. This program is free software; you can redistribute
244 it and/or modify it under the same terms as Perl itself.
245
246 =cut

  ViewVC Help
Powered by ViewVC 1.1.26