/[Lingua-Spelling-Alternative]/Alternative.pm
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Contents of /Alternative.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.9 - (show annotations)
Tue Nov 18 13:54:14 2003 UTC (19 years ago) by dpavlin
Branch: MAIN
CVS Tags: HEAD
Changes since 1.8: +7 -6 lines
added README file, spelling fixes

1 # Documentation and Copyright exist after __END__
2
3 package Lingua::Spelling::Alternative;
4 require 5.001;
5
6 use strict;
7 use vars qw($VERSION @ISA @EXPORT @EXPORT_OK);
8
9 use Exporter;
10 $VERSION = '0.02';
11 @ISA = ('Exporter');
12
13 #@EXPORT = qw();
14 @EXPORT_OK = qw(
15 &alternatives
16 );
17
18 my $debug=0;
19
20 #
21 # make new instance of language, get args
22 #
23 sub new {
24 my $class = shift;
25 my $self = {};
26 bless($self, $class);
27 $self->{ARGS} = {@_};
28 $debug = $self->{ARGS}->{DEBUG};
29 @{$self->{affix_regexp}} = ();
30 @{$self->{affix_add}} = ();
31 @{$self->{affix_sub}} = ();
32 $self ? return $self : return undef;
33 }
34
35
36 #
37 # load affix file in internal structures
38 #
39
40 sub load_affix {
41 my $self = shift;
42 my $filename = shift;
43
44 my $suffixes=0;
45
46 my ($regexp,$add,$sub);
47
48 print STDERR "reading affix file $filename\n" if ($debug);
49
50 open (A,$filename) || die "Can't open affix file $filename: $!";
51 while(<A>) {
52 chomp;
53 next if (/^#|^[\s\t\n\r]*$/);
54
55 if (/^suffixes/i) {
56 $suffixes++;
57 next;
58 }
59
60 next if (! $suffixes);
61
62 if (/^flag[\s\t]+\*{0,1}(.):/i) {
63 undef $regexp;
64 undef $add;
65 undef $sub;
66 next;
67 }
68
69 if (/^[\s\t]*([^>#]+)>[\s\t]+-([^\,\s\t]+),([^\s\t]+)/) {
70 $regexp = $1;
71 $add = $2;
72 $sub = $3 if ($3 ne "-");
73 } elsif (/^[\s\t]*([^>#]+)>[\s\t]+([^\s\t\#]+)/) {
74 $regexp = $1;
75 $sub = $2;
76 }
77
78 sub nuke_s {
79 my $tmp = $_[0];
80 return if (!$tmp);
81 # $tmp=~s/^\s+//g;
82 # $tmp=~s/\s+$//g;
83 $tmp=~s/\s+//g;
84 return $tmp;
85 }
86
87 push @{$self->{affix_regexp}},nuke_s($regexp);
88 push @{$self->{affix_add}},nuke_s($add);
89 push @{$self->{affix_sub}},nuke_s($sub);
90 }
91 return 1;
92 }
93
94 #
95 # function for reading raw findaffix output
96 #
97
98 sub load_findaffix {
99 my $self = shift;
100 my $filename = shift;
101
102 print STDERR "reading findaffix output $filename\n" if ($debug);
103
104 open (A,$filename) || die "Can't open findaffix output $filename: $!";
105 while(<A>) {
106 chomp;
107 my @line=split(m;/;,$_,4);
108 if ($#line > 2) {
109 push @{$self->{affix_regexp}},'.';
110 push @{$self->{affix_sub}},$line[0];
111 push @{$self->{affix_add}},$line[1];
112 }
113 }
114 return 1;
115 }
116
117 #
118 # function which returns original word and all alternatives
119 #
120
121 sub alternatives {
122 my $self = shift;
123 my @out;
124 foreach my $word (@_) {
125 push @out,$word; # save original word
126 next if (length($word) < 3); # cludge: preskoci kratke
127 for(my $i=0; $i<=$#{$self->{affix_regexp}}; $i++) {
128 my $regexp = $self->{affix_regexp}[$i];
129 my $add = $self->{affix_add}[$i];
130 my $sub = $self->{affix_sub}[$i];
131 print STDERR "r:'$regexp'\t-'",$sub||'',"'\t+'",$add||'',"'\n" if ($debug);
132 next if length($word) < length($sub);
133 my $tmp_word = $word;
134 if ($sub) {
135 next if ($word !~ m/$sub$/i);
136 if ($add) {
137 $tmp_word =~ s/$sub$/$add/i;
138 } else {
139 $tmp_word =~ s/$sub$//i;
140 }
141 } else {
142 $tmp_word = $word.$add;
143 }
144 print STDERR "\t ?:$tmp_word\n" if ($debug);
145 if ($tmp_word =~ m/$regexp/ix) {
146 # print "$word -> $tmp_word\t-$sub, +$add, regexp: $regexp\n";
147 push @out,lc($tmp_word);
148 }
149 }
150 }
151 return @out;
152 }
153
154 #
155 # function which return minimal word of all alternatives
156 #
157
158 sub minimal {
159 my $self = shift;
160 my @out;
161 foreach my $word (@_) {
162 my @alt = $self->alternatives($word);
163 my $minimal = shift @alt;
164 foreach (@alt) {
165 $minimal=$_ if (length($_) < length($minimal));
166 }
167 push @out,$minimal;
168 }
169 return @out;
170 }
171
172 ###############################################################################
173 1;
174 __END__
175
176 =head1 NAME
177
178 Alternative.pm - alternative spelling of a given word in a given language
179
180 =head1 SYNOPSIS
181
182 use Lingua::Spelling::Alternative;
183
184 my $en = new Lingua::Spelling::Alternative;
185 $en->load_affix('/usr/lib/ispell/default.aff') or die $!;
186 print join(" ",$en->alternatives("cars")),"\n";
187
188 =head1 DESCRIPTION
189
190 This module is designed to return all forms of a given word
191 (for example when you want to see all possible forms of some word
192 entered in search engine) which can be generated using affix file (from
193 ispell) or using findaffix output file (also part of ispell package)
194
195 =head1 PUBLIC METHODS
196
197 =over 4
198
199 =item new
200
201 The new() constructor (without parameters) create container for new language.
202 Only parameter it supports is DEBUG which turns on (some) debugging output.
203
204 =item load_affix
205
206 Function load_affix() loads ispell's affix file for later usage.
207
208 =item load_findaffix
209
210 This function loads output of findaffix program from ispell package.
211 This is better idea (if you are creating affix file for particular language
212 yourself or you can get your hands on one) because affix file from ispell
213 is limited to 26 entries (because each entry is denoted by single character).
214
215 =item alternatives
216
217 Function alternatives return array of all alternative spellings of particular
218 word(s). It will also return spelling which are not correct if there is
219 rule like that in affix file.
220
221 =item minimal
222
223 This function returns minimal of all alternatives of a given word(s). It's
224 a poor man's version of normalize (because we don't know grammatic of
225 particular language, just some spelling rules).
226
227 =back
228
229 =head1 PRIVATE METHODS
230
231 Documented as being not documented.
232
233 =head1 EXAMPLES
234
235 Please see the test.pl program in distribution which exercises some
236 aspects of Alternative.pm.
237
238 =head1 BUGS
239
240 There are no known bugs. If you find any, please report it in CPAN's
241 request tracker at: http://rt.cpan.org/
242
243 =head1 CONTACT AND COPYRIGHT
244
245 Copyright 2002-2003 Dobrica Pavlinusic (dpavlin@rot13.org). All
246 rights reserved. This program is free software; you can redistribute
247 it and/or modify it under the same terms as Perl itself.
248
249 =cut

  ViewVC Help
Powered by ViewVC 1.1.26