1 |
# -*- Mode: Cperl -*- |
# -*- Mode: Cperl -*- |
2 |
# $Basename: Filter.pm $ |
# $Basename: Filter.pm $ |
3 |
# $Revision: 1.8 $ |
# $Revision: 1.9 $ |
4 |
# ITIID : $ITI$ $Header $__Header$ |
# ITIID : $ITI$ $Header $__Header$ |
5 |
# Author : Ulrich Pfeifer |
# Author : Ulrich Pfeifer |
6 |
# Created On : Thu Aug 15 18:09:51 1996 |
# Created On : Thu Aug 15 18:09:51 1996 |
16 |
require WAIT; |
require WAIT; |
17 |
use strict; |
use strict; |
18 |
use Carp; |
use Carp; |
19 |
use vars qw($VERSION @ISA @EXPORT_OK %STOP $SPLIT $AUTOLOAD); |
use vars qw($VERSION @ISA @EXPORT_OK %STOP $SPLIT $UNAC $ICONV $AUTOLOAD); |
20 |
use subs qw(grundform); |
use subs qw(grundform); |
21 |
|
|
22 |
|
use Text::Unaccent; |
23 |
|
use Text::Iconv; |
24 |
|
|
25 |
require Exporter; |
require Exporter; |
26 |
|
|
27 |
@ISA = qw(Exporter); |
@ISA = qw(Exporter); |
34 |
isouc disouc |
isouc disouc |
35 |
isotr disotr |
isotr disotr |
36 |
stop grundform |
stop grundform |
37 |
utf8iso |
utf8iso |
38 |
); |
); |
39 |
# (most implemented in WAIT.xs) |
# (most implemented in WAIT.xs) |
40 |
|
|
41 |
$VERSION = substr q$Revision: 1.8 $, 10; |
$VERSION = substr q$Revision: 1.9 $, 10; |
42 |
|
|
43 |
sub split { |
sub split { |
44 |
map split(' ', $_), @_; |
map split(' ', $_), @_; |
50 |
} |
} |
51 |
]; |
]; |
52 |
|
|
53 |
|
$UNAC = q[ |
54 |
|
sub unac_CHARSET { |
55 |
|
map split(' ',unac_string('CHARSET', $_) || $_), @_; |
56 |
|
} |
57 |
|
]; |
58 |
|
|
59 |
|
my $iconv; |
60 |
|
|
61 |
|
$ICONV = q[ |
62 |
|
sub iconv_CHARSETfrom_CHARSETto { |
63 |
|
my $ic = $iconv->{'CHARSETfrom_CHARSETto'}); |
64 |
|
$ic ||= $iconv->{'CHARSETfrom_CHARSETto'} = Text::Iconv->new('CHARSETfrom','CHARSETto'); |
65 |
|
map split(' ',$ic->convert($_) || $_), @_; |
66 |
|
} |
67 |
|
]; |
68 |
|
|
69 |
|
|
70 |
sub AUTOLOAD { |
sub AUTOLOAD { |
71 |
my $func = $AUTOLOAD; $func =~ s/.*:://; |
my $func = $AUTOLOAD; $func =~ s/.*:://; |
72 |
|
|
93 |
goto \&date; |
goto \&date; |
94 |
} elsif ($func eq 'decode_entities') { |
} elsif ($func eq 'decode_entities') { |
95 |
eval {require HTML::Entities;}; |
eval {require HTML::Entities;}; |
96 |
croak "You must have HTML::Entities to use 'date'" |
croak "You must have HTML::Entities to use 'decode_entities'" |
97 |
if $@ ne ''; |
if $@ ne ''; |
98 |
*decode_entities = HTML::Entities->can('decode_entities'); |
*decode_entities = HTML::Entities->can('decode_entities'); |
99 |
goto &decode_entities; |
goto &decode_entities; |
100 |
} elsif ($func =~ /^d?utf8iso$/) { |
} elsif ($func =~ /^d?utf8iso$/) { |
|
require WAIT::Filter::utf8iso; |
|
|
croak "Your perl version must at least be 5.00556 to use '$func'" |
|
|
if $] < 5.00556; |
|
101 |
no strict 'refs'; |
no strict 'refs'; |
102 |
*$func = \&{"WAIT::Filter::utf8iso::$func"}; |
*$func = sub { |
103 |
goto &utf8iso; |
# Courtesy JHI |
104 |
|
my $s = shift; |
105 |
|
$s =~ s{([\xC0-\xDF])([\x80-\xBF])} |
106 |
|
{chr(ord($1)<<6&0xC0|ord($2)&0x3F)}eg; |
107 |
|
$s; |
108 |
|
}; |
109 |
|
goto \&$func; |
110 |
|
} elsif ($func =~ /unac_(.+)/) { |
111 |
|
my $charset = $1; |
112 |
|
my $unac = $UNAC; |
113 |
|
$unac =~ s/CHARSET/$charset/g; |
114 |
|
print "### $unac ###\n"; |
115 |
|
eval $unac; |
116 |
|
if ($@ eq '') { |
117 |
|
goto &$func; |
118 |
|
} |
119 |
|
} elsif ($func =~ /iconv_([^_]+)_([^_]+)/) { |
120 |
|
my ($cf,$ct) = ($1,$2); |
121 |
|
my $iconv = $ICONV; |
122 |
|
print "### $cf -> $ct\n"; |
123 |
|
$iconv =~ s/CHARSETfrom/$cf/gs; |
124 |
|
$iconv =~ s/CHARSETto/$ct/gs; |
125 |
|
print "### $iconv ###\n"; |
126 |
|
eval $iconv; |
127 |
|
if ($@ eq '') { |
128 |
|
goto &$func; |
129 |
|
} |
130 |
} |
} |
131 |
Carp::confess "Class WAIT::Filter::$func not found"; |
Carp::confess "Class WAIT::Filter::$func not found"; |
132 |
} |
} |
134 |
while (<DATA>) { |
while (<DATA>) { |
135 |
chomp; |
chomp; |
136 |
last if /__END__/; |
last if /__END__/; |
137 |
|
next if /^\s*#/; # there's a comment |
138 |
$STOP{$_}++; |
$STOP{$_}++; |
139 |
} |
} |
140 |
|
close DATA; |
141 |
|
|
142 |
sub stop { |
sub stop { |
143 |
if (exists $STOP{$_[0]}) { |
if (exists $STOP{$_[0]}) { |
155 |
} |
} |
156 |
|
|
157 |
1; |
1; |
158 |
|
|
159 |
__DATA__ |
__DATA__ |
160 |
a |
a |
161 |
about |
about |
615 |
with |
with |
616 |
you |
you |
617 |
__END__ |
__END__ |
|
# Below is the stub of documentation for your module. You better edit it! |
|
618 |
|
|
619 |
=head1 NAME |
=head1 NAME |
620 |
|
|
623 |
=head1 SYNOPSIS |
=head1 SYNOPSIS |
624 |
|
|
625 |
use WAIT::Filter qw(Stem Soundex Phonix isolc disolc isouc disouc |
use WAIT::Filter qw(Stem Soundex Phonix isolc disolc isouc disouc |
626 |
isotr disotr stop grundform utf8iso); |
isotr disotr stop grundform); |
627 |
|
|
628 |
$stem = Stem($word); |
$stem = Stem($word); |
629 |
$scode = Soundex($word); |
$scode = Soundex($word); |
689 |
PY: 1990 |
PY: 1990 |
690 |
PM: OCT |
PM: OCT |
691 |
|
|
692 |
|
=back |
693 |
|
|
694 |
=head1 ISO charcater case functions |
=head1 ISO charcater case functions |
695 |
|
|
696 |
There are some additional function which transpose some/most ISOlatin1 |
There are some additional function which transpose some/most ISOlatin1 |
709 |
abcdefghijklmnopqrstuvwxyzàáâãäåæçèéêëìíîïñòóôõöøùúûüýß |
abcdefghijklmnopqrstuvwxyzàáâãäåæçèéêëìíîïñòóôõöøùúûüýß |
710 |
ABCDEFGHIJKLMNOPQRSTUVWXYZÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÑÒÓÔÕÖØÙÚÛÜÝß |
ABCDEFGHIJKLMNOPQRSTUVWXYZÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÑÒÓÔÕÖØÙÚÛÜÝß |
711 |
|
|
712 |
|
=over 5 |
713 |
|
|
714 |
=item C<$new = >B<isolc>C<($word)> |
=item C<$new = >B<isolc>C<($word)> |
715 |
|
|
716 |
=item B<disolc>C<($word)> |
=item B<disolc>C<($word)> |
739 |
|
|
740 |
=item C<$new = >B<utf8iso>C<($word)> |
=item C<$new = >B<utf8iso>C<($word)> |
741 |
|
|
742 |
Convert UTF8 encoded strings to ISO-8859-1. WAIT currently is |
Deprecated due to flux in perl versions between 5.005 and 5.8. The |
743 |
internally based on the Latin1 character set, so if you process |
function converts UTF8 encoded strings to ISO-8859-1. WAIT is |
744 |
|
internally still based on the Latin1 character set, so if you process |
745 |
anything in a different encoding, you should convert to Latin1 as the |
anything in a different encoding, you should convert to Latin1 as the |
746 |
first filter. |
first filter or refrain from using the iso-latin-1 based filter |
747 |
|
functions. It is recommended that you use your own converter based on |
748 |
|
the perl version you're using. |
749 |
|
|
750 |
=item split, split2, split3, ... |
=item split, split2, split3, ... |
751 |
|
|
754 |
words from the list that are shorter than 2 characters (bytes), split3 |
words from the list that are shorter than 2 characters (bytes), split3 |
755 |
eliminates those shorter than 3 characters (bytes) and so on. |
eliminates those shorter than 3 characters (bytes) and so on. |
756 |
|
|
757 |
|
=back |
758 |
|
|
759 |
=head1 AUTHOR |
=head1 AUTHOR |
760 |
|
|
761 |
Ulrich Pfeifer E<lt>F<pfeifer@ls6.informatik.uni-dortmund.de>E<gt> |
Ulrich Pfeifer E<lt>F<pfeifer@ls6.informatik.uni-dortmund.de>E<gt> |