--- Alternative.pm 2002/02/11 14:26:22 1.1 +++ Alternative.pm 2003/11/18 13:54:14 1.9 @@ -7,28 +7,28 @@ use vars qw($VERSION @ISA @EXPORT @EXPORT_OK); use Exporter; -$VERSION = '0.01'; +$VERSION = '0.02'; @ISA = ('Exporter'); #@EXPORT = qw(); @EXPORT_OK = qw( &alternatives - &load_affix ); -#my @affix_regexp; -#my @affix_add; -#my @affix_sub; - -my $debug=1; - -# stub +my $debug=0; +# +# make new instance of language, get args +# sub new { my $class = shift; my $self = {}; bless($self, $class); $self->{ARGS} = {@_}; + $debug = $self->{ARGS}->{DEBUG}; + @{$self->{affix_regexp}} = (); + @{$self->{affix_add}} = (); + @{$self->{affix_sub}} = (); $self ? return $self : return undef; } @@ -78,9 +78,9 @@ sub nuke_s { my $tmp = $_[0]; return if (!$tmp); - $tmp=~s/^ *//g; - $tmp=~s/ *$//g; - $tmp=~s/ *//g; + # $tmp=~s/^\s+//g; + # $tmp=~s/\s+$//g; + $tmp=~s/\s+//g; return $tmp; } @@ -92,6 +92,29 @@ } # +# function for reading raw findaffix output +# + +sub load_findaffix { + my $self = shift; + my $filename = shift; + + print STDERR "reading findaffix output $filename\n" if ($debug); + + open (A,$filename) || die "Can't open findaffix output $filename: $!"; + while() { + chomp; + my @line=split(m;/;,$_,4); + if ($#line > 2) { + push @{$self->{affix_regexp}},'.'; + push @{$self->{affix_sub}},$line[0]; + push @{$self->{affix_add}},$line[1]; + } + } + return 1; +} + +# # function which returns original word and all alternatives # @@ -105,6 +128,7 @@ my $regexp = $self->{affix_regexp}[$i]; my $add = $self->{affix_add}[$i]; my $sub = $self->{affix_sub}[$i]; + print STDERR "r:'$regexp'\t-'",$sub||'',"'\t+'",$add||'',"'\n" if ($debug); next if length($word) < length($sub); my $tmp_word = $word; if ($sub) { @@ -117,6 +141,7 @@ } else { $tmp_word = $word.$add; } + print STDERR "\t ?:$tmp_word\n" if ($debug); if ($tmp_word =~ m/$regexp/ix) { # print "$word -> $tmp_word\t-$sub, +$add, regexp: $regexp\n"; push @out,lc($tmp_word); @@ -126,27 +151,46 @@ return @out; } +# +# function which return minimal word of all alternatives +# + +sub minimal { + my $self = shift; + my @out; + foreach my $word (@_) { + my @alt = $self->alternatives($word); + my $minimal = shift @alt; + foreach (@alt) { + $minimal=$_ if (length($_) < length($minimal)); + } + push @out,$minimal; + } + return @out; +} + ############################################################################### 1; __END__ =head1 NAME -Alternative.pm - see all alternatives of a given word in a given language +Alternative.pm - alternative spelling of a given word in a given language =head1 SYNOPSIS - use Lingua::Spelling:Alternative; + use Lingua::Spelling::Alternative; - my $en = new Alternative; - $en->load_affix('/usr/lib/ispell/english.aff') or die $!; - print $en->alternatives("cars"); + my $en = new Lingua::Spelling::Alternative; + $en->load_affix('/usr/lib/ispell/default.aff') or die $!; + print join(" ",$en->alternatives("cars")),"\n"; =head1 DESCRIPTION -This module is designed to return all valid forms of a given word +This module is designed to return all forms of a given word (for example when you want to see all possible forms of some word -entered in search engine) +entered in search engine) which can be generated using affix file (from +ispell) or using findaffix output file (also part of ispell package) =head1 PUBLIC METHODS @@ -154,16 +198,33 @@ =item new -The new() constructor (without parameters) create container for new -language. +The new() constructor (without parameters) create container for new language. +Only parameter it supports is DEBUG which turns on (some) debugging output. =item load_affix -Function load_affix loads ispell's affix file. +Function load_affix() loads ispell's affix file for later usage. + +=item load_findaffix + +This function loads output of findaffix program from ispell package. +This is better idea (if you are creating affix file for particular language +yourself or you can get your hands on one) because affix file from ispell +is limited to 26 entries (because each entry is denoted by single character). =item alternatives -Function alternatives +Function alternatives return array of all alternative spellings of particular +word(s). It will also return spelling which are not correct if there is +rule like that in affix file. + +=item minimal + +This function returns minimal of all alternatives of a given word(s). It's +a poor man's version of normalize (because we don't know grammatic of +particular language, just some spelling rules). + +=back =head1 PRIVATE METHODS @@ -176,11 +237,12 @@ =head1 BUGS -There are no known bugs. +There are no known bugs. If you find any, please report it in CPAN's +request tracker at: http://rt.cpan.org/ =head1 CONTACT AND COPYRIGHT -Copyright 2002 Dobrica Pavlinusic (dpavlin@rot13.org). All +Copyright 2002-2003 Dobrica Pavlinusic (dpavlin@rot13.org). All rights reserved. This program is free software; you can redistribute it and/or modify it under the same terms as Perl itself.