/[Lingua-Spelling-Alternative]/Alternative.pm
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Annotation of /Alternative.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.9 - (hide annotations)
Tue Nov 18 13:54:14 2003 UTC (20 years, 4 months ago) by dpavlin
Branch: MAIN
CVS Tags: HEAD
Changes since 1.8: +7 -6 lines
added README file, spelling fixes

1 dpavlin 1.1 # Documentation and Copyright exist after __END__
2    
3     package Lingua::Spelling::Alternative;
4     require 5.001;
5    
6     use strict;
7     use vars qw($VERSION @ISA @EXPORT @EXPORT_OK);
8    
9     use Exporter;
10 dpavlin 1.9 $VERSION = '0.02';
11 dpavlin 1.1 @ISA = ('Exporter');
12    
13     #@EXPORT = qw();
14     @EXPORT_OK = qw(
15     &alternatives
16     );
17    
18 dpavlin 1.2 my $debug=0;
19 dpavlin 1.1
20 dpavlin 1.3 #
21     # make new instance of language, get args
22     #
23 dpavlin 1.1 sub new {
24     my $class = shift;
25     my $self = {};
26     bless($self, $class);
27     $self->{ARGS} = {@_};
28 dpavlin 1.2 $debug = $self->{ARGS}->{DEBUG};
29 dpavlin 1.3 @{$self->{affix_regexp}} = ();
30     @{$self->{affix_add}} = ();
31     @{$self->{affix_sub}} = ();
32 dpavlin 1.1 $self ? return $self : return undef;
33     }
34    
35    
36     #
37     # load affix file in internal structures
38     #
39    
40     sub load_affix {
41     my $self = shift;
42     my $filename = shift;
43    
44     my $suffixes=0;
45    
46     my ($regexp,$add,$sub);
47    
48     print STDERR "reading affix file $filename\n" if ($debug);
49    
50     open (A,$filename) || die "Can't open affix file $filename: $!";
51     while(<A>) {
52     chomp;
53     next if (/^#|^[\s\t\n\r]*$/);
54    
55     if (/^suffixes/i) {
56     $suffixes++;
57     next;
58     }
59    
60     next if (! $suffixes);
61    
62     if (/^flag[\s\t]+\*{0,1}(.):/i) {
63     undef $regexp;
64     undef $add;
65     undef $sub;
66     next;
67     }
68    
69     if (/^[\s\t]*([^>#]+)>[\s\t]+-([^\,\s\t]+),([^\s\t]+)/) {
70     $regexp = $1;
71     $add = $2;
72     $sub = $3 if ($3 ne "-");
73     } elsif (/^[\s\t]*([^>#]+)>[\s\t]+([^\s\t\#]+)/) {
74     $regexp = $1;
75     $sub = $2;
76     }
77    
78     sub nuke_s {
79     my $tmp = $_[0];
80     return if (!$tmp);
81 dpavlin 1.3 # $tmp=~s/^\s+//g;
82     # $tmp=~s/\s+$//g;
83     $tmp=~s/\s+//g;
84 dpavlin 1.1 return $tmp;
85     }
86    
87     push @{$self->{affix_regexp}},nuke_s($regexp);
88     push @{$self->{affix_add}},nuke_s($add);
89     push @{$self->{affix_sub}},nuke_s($sub);
90     }
91     return 1;
92     }
93    
94     #
95 dpavlin 1.3 # function for reading raw findaffix output
96     #
97    
98     sub load_findaffix {
99     my $self = shift;
100     my $filename = shift;
101    
102     print STDERR "reading findaffix output $filename\n" if ($debug);
103    
104     open (A,$filename) || die "Can't open findaffix output $filename: $!";
105     while(<A>) {
106     chomp;
107     my @line=split(m;/;,$_,4);
108     if ($#line > 2) {
109     push @{$self->{affix_regexp}},'.';
110     push @{$self->{affix_sub}},$line[0];
111     push @{$self->{affix_add}},$line[1];
112     }
113     }
114     return 1;
115     }
116    
117     #
118 dpavlin 1.1 # function which returns original word and all alternatives
119     #
120    
121     sub alternatives {
122     my $self = shift;
123     my @out;
124     foreach my $word (@_) {
125     push @out,$word; # save original word
126     next if (length($word) < 3); # cludge: preskoci kratke
127     for(my $i=0; $i<=$#{$self->{affix_regexp}}; $i++) {
128     my $regexp = $self->{affix_regexp}[$i];
129     my $add = $self->{affix_add}[$i];
130     my $sub = $self->{affix_sub}[$i];
131 dpavlin 1.3 print STDERR "r:'$regexp'\t-'",$sub||'',"'\t+'",$add||'',"'\n" if ($debug);
132 dpavlin 1.1 next if length($word) < length($sub);
133     my $tmp_word = $word;
134     if ($sub) {
135     next if ($word !~ m/$sub$/i);
136     if ($add) {
137     $tmp_word =~ s/$sub$/$add/i;
138     } else {
139     $tmp_word =~ s/$sub$//i;
140     }
141     } else {
142     $tmp_word = $word.$add;
143     }
144 dpavlin 1.3 print STDERR "\t ?:$tmp_word\n" if ($debug);
145 dpavlin 1.1 if ($tmp_word =~ m/$regexp/ix) {
146     # print "$word -> $tmp_word\t-$sub, +$add, regexp: $regexp\n";
147     push @out,lc($tmp_word);
148     }
149     }
150     }
151     return @out;
152     }
153    
154 dpavlin 1.4 #
155     # function which return minimal word of all alternatives
156     #
157    
158     sub minimal {
159     my $self = shift;
160     my @out;
161     foreach my $word (@_) {
162     my @alt = $self->alternatives($word);
163     my $minimal = shift @alt;
164     foreach (@alt) {
165     $minimal=$_ if (length($_) < length($minimal));
166     }
167     push @out,$minimal;
168     }
169     return @out;
170     }
171    
172 dpavlin 1.1 ###############################################################################
173     1;
174     __END__
175    
176     =head1 NAME
177    
178 dpavlin 1.3 Alternative.pm - alternative spelling of a given word in a given language
179 dpavlin 1.1
180     =head1 SYNOPSIS
181    
182 dpavlin 1.6 use Lingua::Spelling::Alternative;
183 dpavlin 1.1
184 dpavlin 1.8 my $en = new Lingua::Spelling::Alternative;
185     $en->load_affix('/usr/lib/ispell/default.aff') or die $!;
186     print join(" ",$en->alternatives("cars")),"\n";
187 dpavlin 1.1
188     =head1 DESCRIPTION
189    
190 dpavlin 1.9 This module is designed to return all forms of a given word
191 dpavlin 1.1 (for example when you want to see all possible forms of some word
192 dpavlin 1.9 entered in search engine) which can be generated using affix file (from
193     ispell) or using findaffix output file (also part of ispell package)
194 dpavlin 1.1
195     =head1 PUBLIC METHODS
196    
197     =over 4
198    
199     =item new
200    
201 dpavlin 1.4 The new() constructor (without parameters) create container for new language.
202 dpavlin 1.9 Only parameter it supports is DEBUG which turns on (some) debugging output.
203 dpavlin 1.1
204     =item load_affix
205    
206 dpavlin 1.5 Function load_affix() loads ispell's affix file for later usage.
207 dpavlin 1.4
208     =item load_findaffix
209    
210     This function loads output of findaffix program from ispell package.
211     This is better idea (if you are creating affix file for particular language
212 dpavlin 1.9 yourself or you can get your hands on one) because affix file from ispell
213 dpavlin 1.5 is limited to 26 entries (because each entry is denoted by single character).
214 dpavlin 1.1
215     =item alternatives
216    
217 dpavlin 1.8 Function alternatives return array of all alternative spellings of particular
218 dpavlin 1.4 word(s). It will also return spelling which are not correct if there is
219     rule like that in affix file.
220    
221     =item minimal
222    
223     This function returns minimal of all alternatives of a given word(s). It's
224 dpavlin 1.9 a poor man's version of normalize (because we don't know grammatic of
225 dpavlin 1.4 particular language, just some spelling rules).
226 dpavlin 1.1
227 dpavlin 1.7 =back
228    
229 dpavlin 1.1 =head1 PRIVATE METHODS
230    
231     Documented as being not documented.
232    
233     =head1 EXAMPLES
234    
235     Please see the test.pl program in distribution which exercises some
236     aspects of Alternative.pm.
237    
238     =head1 BUGS
239    
240 dpavlin 1.8 There are no known bugs. If you find any, please report it in CPAN's
241     request tracker at: http://rt.cpan.org/
242 dpavlin 1.1
243     =head1 CONTACT AND COPYRIGHT
244    
245 dpavlin 1.8 Copyright 2002-2003 Dobrica Pavlinusic (dpavlin@rot13.org). All
246 dpavlin 1.1 rights reserved. This program is free software; you can redistribute
247     it and/or modify it under the same terms as Perl itself.
248    
249     =cut

  ViewVC Help
Powered by ViewVC 1.1.26