--- Alternative.pm 2002/02/11 14:26:22 1.1 +++ Alternative.pm 2002/02/12 14:31:15 1.5 @@ -13,22 +13,22 @@ #@EXPORT = qw(); @EXPORT_OK = qw( &alternatives - &load_affix ); -#my @affix_regexp; -#my @affix_add; -#my @affix_sub; - -my $debug=1; - -# stub +my $debug=0; +# +# make new instance of language, get args +# sub new { my $class = shift; my $self = {}; bless($self, $class); $self->{ARGS} = {@_}; + $debug = $self->{ARGS}->{DEBUG}; + @{$self->{affix_regexp}} = (); + @{$self->{affix_add}} = (); + @{$self->{affix_sub}} = (); $self ? return $self : return undef; } @@ -78,9 +78,9 @@ sub nuke_s { my $tmp = $_[0]; return if (!$tmp); - $tmp=~s/^ *//g; - $tmp=~s/ *$//g; - $tmp=~s/ *//g; + # $tmp=~s/^\s+//g; + # $tmp=~s/\s+$//g; + $tmp=~s/\s+//g; return $tmp; } @@ -92,6 +92,29 @@ } # +# function for reading raw findaffix output +# + +sub load_findaffix { + my $self = shift; + my $filename = shift; + + print STDERR "reading findaffix output $filename\n" if ($debug); + + open (A,$filename) || die "Can't open findaffix output $filename: $!"; + while() { + chomp; + my @line=split(m;/;,$_,4); + if ($#line > 2) { + push @{$self->{affix_regexp}},'.'; + push @{$self->{affix_sub}},$line[0]; + push @{$self->{affix_add}},$line[1]; + } + } + return 1; +} + +# # function which returns original word and all alternatives # @@ -105,6 +128,7 @@ my $regexp = $self->{affix_regexp}[$i]; my $add = $self->{affix_add}[$i]; my $sub = $self->{affix_sub}[$i]; + print STDERR "r:'$regexp'\t-'",$sub||'',"'\t+'",$add||'',"'\n" if ($debug); next if length($word) < length($sub); my $tmp_word = $word; if ($sub) { @@ -117,6 +141,7 @@ } else { $tmp_word = $word.$add; } + print STDERR "\t ?:$tmp_word\n" if ($debug); if ($tmp_word =~ m/$regexp/ix) { # print "$word -> $tmp_word\t-$sub, +$add, regexp: $regexp\n"; push @out,lc($tmp_word); @@ -126,13 +151,31 @@ return @out; } +# +# function which return minimal word of all alternatives +# + +sub minimal { + my $self = shift; + my @out; + foreach my $word (@_) { + my @alt = $self->alternatives($word); + my $minimal = shift @alt; + foreach (@alt) { + $minimal=$_ if (length($_) < length($minimal)); + } + push @out,$minimal; + } + return @out; +} + ############################################################################### 1; __END__ =head1 NAME -Alternative.pm - see all alternatives of a given word in a given language +Alternative.pm - alternative spelling of a given word in a given language =head1 SYNOPSIS @@ -154,16 +197,31 @@ =item new -The new() constructor (without parameters) create container for new -language. +The new() constructor (without parameters) create container for new language. +Only parametar it supports is DEBUG which turns on (some) debugging output. =item load_affix -Function load_affix loads ispell's affix file. +Function load_affix() loads ispell's affix file for later usage. + +=item load_findaffix + +This function loads output of findaffix program from ispell package. +This is better idea (if you are creating affix file for particular language +yourself or you can get your hands on one) because affix file from ispel +is limited to 26 entries (because each entry is denoted by single character). =item alternatives -Function alternatives +Function alternatives return all alternative spellings of particular +word(s). It will also return spelling which are not correct if there is +rule like that in affix file. + +=item minimal + +This function returns minimal of all alternatives of a given word(s). It's +a poor man's version of normalize (because we don't know gramatic of +particular language, just some spelling rules). =head1 PRIVATE METHODS