31 |
isouc disouc |
isouc disouc |
32 |
isotr disotr |
isotr disotr |
33 |
stop grundform |
stop grundform |
34 |
utf8iso |
utf8iso |
35 |
); |
); |
36 |
# (most implemented in WAIT.xs) |
# (most implemented in WAIT.xs) |
37 |
|
|
78 |
*decode_entities = HTML::Entities->can('decode_entities'); |
*decode_entities = HTML::Entities->can('decode_entities'); |
79 |
goto &decode_entities; |
goto &decode_entities; |
80 |
} elsif ($func =~ /^d?utf8iso$/) { |
} elsif ($func =~ /^d?utf8iso$/) { |
|
require WAIT::Filter::utf8iso; |
|
|
croak "Your perl version must at least be 5.00556 to use '$func'" |
|
|
if $] < 5.00556; |
|
81 |
no strict 'refs'; |
no strict 'refs'; |
82 |
*$func = \&{"WAIT::Filter::utf8iso::$func"}; |
*$func = sub { |
83 |
goto &utf8iso; |
# Courtesy JHI |
84 |
|
my $s = shift; |
85 |
|
$s =~ s{([\xC0-\xDF])([\x80-\xBF])} |
86 |
|
{chr(ord($1)<<6&0xC0|ord($2)&0x3F)}eg; |
87 |
|
$s; |
88 |
|
}; |
89 |
|
goto \&$func; |
90 |
} |
} |
91 |
Carp::confess "Class WAIT::Filter::$func not found"; |
Carp::confess "Class WAIT::Filter::$func not found"; |
92 |
} |
} |
97 |
next if /^\s*#/; # there's a comment |
next if /^\s*#/; # there's a comment |
98 |
$STOP{$_}++; |
$STOP{$_}++; |
99 |
} |
} |
100 |
|
close DATA; |
101 |
|
|
102 |
sub stop { |
sub stop { |
103 |
if (exists $STOP{$_[0]}) { |
if (exists $STOP{$_[0]}) { |
115 |
} |
} |
116 |
|
|
117 |
1; |
1; |
118 |
|
|
119 |
__DATA__ |
__DATA__ |
120 |
a |
a |
121 |
about |
about |
575 |
with |
with |
576 |
you |
you |
577 |
__END__ |
__END__ |
|
# Below is the stub of documentation for your module. You better edit it! |
|
578 |
|
|
579 |
=head1 NAME |
=head1 NAME |
580 |
|
|
583 |
=head1 SYNOPSIS |
=head1 SYNOPSIS |
584 |
|
|
585 |
use WAIT::Filter qw(Stem Soundex Phonix isolc disolc isouc disouc |
use WAIT::Filter qw(Stem Soundex Phonix isolc disolc isouc disouc |
586 |
isotr disotr stop grundform utf8iso); |
isotr disotr stop grundform); |
587 |
|
|
588 |
$stem = Stem($word); |
$stem = Stem($word); |
589 |
$scode = Soundex($word); |
$scode = Soundex($word); |
649 |
PY: 1990 |
PY: 1990 |
650 |
PM: OCT |
PM: OCT |
651 |
|
|
652 |
|
=back |
653 |
|
|
654 |
=head1 ISO character case functions |
=head1 ISO character case functions |
655 |
|
|
656 |
There are some additional functions which transpose some/most ISOlatin1 |
There are some additional functions which transpose some/most ISOlatin1 |
669 |
abcdefghijklmnopqrstuvwxyzàáâãäåæçèéêëìíîïñòóôõöøùúûüýß |
abcdefghijklmnopqrstuvwxyzàáâãäåæçèéêëìíîïñòóôõöøùúûüýß |
670 |
ABCDEFGHIJKLMNOPQRSTUVWXYZÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÑÒÓÔÕÖØÙÚÛÜÝß |
ABCDEFGHIJKLMNOPQRSTUVWXYZÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÑÒÓÔÕÖØÙÚÛÜÝß |
671 |
|
|
672 |
|
=over 5 |
673 |
|
|
674 |
=item C<$new = >B<isolc>C<($word)> |
=item C<$new = >B<isolc>C<($word)> |
675 |
|
|
676 |
=item B<disolc>C<($word)> |
=item B<disolc>C<($word)> |
699 |
|
|
700 |
=item C<$new = >B<utf8iso>C<($word)> |
=item C<$new = >B<utf8iso>C<($word)> |
701 |
|
|
702 |
Convert UTF8 encoded strings to ISO-8859-1. WAIT currently is |
Deprecated due to flux in perl versions between 5.005 and 5.8. The |
703 |
internally based on the Latin1 character set, so if you process |
function converts UTF8 encoded strings to ISO-8859-1. WAIT is |
704 |
|
internally still based on the Latin1 character set, so if you process |
705 |
anything in a different encoding, you should convert to Latin1 as the |
anything in a different encoding, you should convert to Latin1 as the |
706 |
first filter. |
first filter or refrain from using the iso-latin-1 based filter |
707 |
|
functions. It is recommended that you use your own converter based on |
708 |
|
the perl version you're using. |
709 |
|
|
710 |
=item split, split2, split3, ... |
=item split, split2, split3, ... |
711 |
|
|
714 |
words from the list that are shorter than 2 characters (bytes), split3 |
words from the list that are shorter than 2 characters (bytes), split3 |
715 |
eliminates those shorter than 3 characters (bytes) and so on. |
eliminates those shorter than 3 characters (bytes) and so on. |
716 |
|
|
717 |
|
=back |
718 |
|
|
719 |
=head1 AUTHOR |
=head1 AUTHOR |
720 |
|
|
721 |
Ulrich Pfeifer E<lt>F<pfeifer@ls6.informatik.uni-dortmund.de>E<gt> |
Ulrich Pfeifer E<lt>F<pfeifer@ls6.informatik.uni-dortmund.de>E<gt> |