--- stem-hr.pl 2005/02/25 23:14:44 1 +++ stem-hr.pl 2005/02/26 00:37:18 4 @@ -18,16 +18,14 @@ # - uzvici # -sub rm_a { - s/a//g; -} +use strict; sub kgh { my ($pre,$replace,$post) = @_; $replace =~ s/[cè]/k/g; $replace =~ s/[z¾]/g/g; $replace =~ s/[s¹]/h/g; - return $pre . $replace . $post . "\tkgh"; + return $pre . $replace . $post; } # samoglasnici @@ -35,8 +33,19 @@ # suglasnici my $su = '[^aeiou]'; +my %rules; +my %stem_words; +my $words = 0; +my $stems = 0; + while(<>) { chomp; + if (/^$/) { + print "\n"; + next; + } + + $words++; my $orig = $_; @@ -44,21 +53,37 @@ # vrsta a unless ( + # kgh + s/(${sa})([è¾¹czs])(i|e|ima)$/kgh($1,$2,' 1')/gex || + s/(${sa}[kgh])(a|u|om)$/$1 2/g || + s/(${su})([è¾¹czs])(i|e|ima)$/kgh($1,"a$2",' 3')/gex || # nepostojano a - s/(${su}a{$su})a/$1 1/g || - s/(${su})(${su})(a|u|i|e|om|ima)$/$1a$2\t3/g + s/(${su}a{$su})a/$1 4/g || + s/(${su})(${su})(a|u|i|e|om|ima)$/$1a$2 5/g ) { - # kgh - s/(${su}${sa})([è¾¹czs])(i|e|ima)$/kgh($1,$2,'')/gex; -# s/${su}${sa}[c](i|ima) 4//g; - s/(${su}${sa}[kgh])(a|u|om)\t5/$1/g; # mno¾ina - s/(${su})(ov|ev)*(i|a|ima|e|in|ina)$/$1\t6/g; + s/(${su})(ov|ev)*(i|a|ima|e|in|ina)$/$1 6/g; # jednina - s/(o|e|a|u|om|em)$/\t7/g; + s/(o|e|a|u|om|em)$/ 7/g; } + if (s/^(.+)\s(\d+)$/$1\t$2/g) { + $rules{$2}++; + $stems++; + $stem_words{$1}++; + } printf("%-15s %s\n",$orig,$_); + +} +my $nr_stems = keys(%stem_words); +printf "\n# %d words, %d stems in %d ops, %.2f%% size\n",$words,$nr_stems,$stems,($nr_stems*100/$words); + +foreach my $s (keys %stem_words) { + print "#stem $stem_words{$s} $s\n"; +} + +foreach my $r (keys %rules) { + print "#rule $rules{$r} $r\n"; }