1 |
# -*- Mode: Perl -*- |
# -*- Mode: Cperl -*- |
2 |
# $Basename: Filter.pm $ |
# $Basename: Filter.pm $ |
3 |
# $Revision: 1.7 $ |
# $Revision: 1.9 $ |
4 |
# ITIID : $ITI$ $Header $__Header$ |
# ITIID : $ITI$ $Header $__Header$ |
5 |
# Author : Ulrich Pfeifer |
# Author : Ulrich Pfeifer |
6 |
# Created On : Thu Aug 15 18:09:51 1996 |
# Created On : Thu Aug 15 18:09:51 1996 |
9 |
# Language : CPerl |
# Language : CPerl |
10 |
# Update Count : 105 |
# Update Count : 105 |
11 |
# Status : Unknown, Use with caution! |
# Status : Unknown, Use with caution! |
12 |
# |
# |
13 |
# Copyright (c) 1996-1997, Ulrich Pfeifer |
# Copyright (c) 1996-1997, Ulrich Pfeifer |
14 |
# |
# |
15 |
package WAIT::Filter; |
package WAIT::Filter; |
16 |
require WAIT; |
require WAIT; |
17 |
use strict; |
use strict; |
31 |
isouc disouc |
isouc disouc |
32 |
isotr disotr |
isotr disotr |
33 |
stop grundform |
stop grundform |
34 |
|
utf8iso |
35 |
); |
); |
36 |
|
# (most implemented in WAIT.xs) |
37 |
|
|
38 |
$VERSION = substr q$Revision: 1.7 $, 10; |
$VERSION = substr q$Revision: 1.9 $, 10; |
39 |
|
|
40 |
sub split { |
sub split { |
41 |
map split(' ', $_), @_; |
map split(' ', $_), @_; |
77 |
if $@ ne ''; |
if $@ ne ''; |
78 |
*decode_entities = HTML::Entities->can('decode_entities'); |
*decode_entities = HTML::Entities->can('decode_entities'); |
79 |
goto &decode_entities; |
goto &decode_entities; |
80 |
|
} elsif ($func =~ /^d?utf8iso$/) { |
81 |
|
require WAIT::Filter::utf8iso; |
82 |
|
croak "Your perl version must at least be 5.00556 to use '$func'" |
83 |
|
if $] < 5.00556; |
84 |
|
no strict 'refs'; |
85 |
|
*$func = \&{"WAIT::Filter::utf8iso::$func"}; |
86 |
|
goto &utf8iso; |
87 |
} |
} |
88 |
croak "Your vendor has not defined WAIT::Filter::$func"; |
Carp::confess "Class WAIT::Filter::$func not found"; |
89 |
} |
} |
90 |
|
|
91 |
while (<DATA>) { |
while (<DATA>) { |
92 |
chomp; |
chomp; |
93 |
last if /__END__/; |
last if /__END__/; |
94 |
|
next if /^\s*#/; # there's a comment |
95 |
$STOP{$_}++; |
$STOP{$_}++; |
96 |
} |
} |
97 |
|
|
214 |
former |
former |
215 |
formerly |
formerly |
216 |
forty |
forty |
217 |
found " |
found |
218 |
four |
four |
219 |
from |
from |
220 |
further |
further |
578 |
|
|
579 |
=head1 SYNOPSIS |
=head1 SYNOPSIS |
580 |
|
|
581 |
use WAIT::Filter qw(Stem Soundex Phonix isolc isouc disolc disouc); |
use WAIT::Filter qw(Stem Soundex Phonix isolc disolc isouc disouc |
582 |
|
isotr disotr stop grundform utf8iso); |
583 |
|
|
584 |
$stem = Stem($word); |
$stem = Stem($word); |
585 |
$scode = Soundex($word); |
$scode = Soundex($word); |
586 |
$pcode = Phonix($word); |
$pcode = Phonix($word); |
587 |
$lword = isolc($word); |
$lword = isolc($word); |
|
$uword = isouc($word); |
|
588 |
disolc($word); |
disolc($word); |
589 |
|
$uword = isouc($word); |
590 |
disouc($word); |
disouc($word); |
591 |
|
$trword = isotr($word); |
592 |
|
disotr($word); |
593 |
|
$word = stop($word); |
594 |
|
$word = grundform($word); |
595 |
|
|
596 |
|
@words = WAIT::Filter::split($word); |
597 |
|
@words = WAIT::Filter::split2($word); |
598 |
|
@words = WAIT::Filter::split3($word); |
599 |
|
@words = WAIT::Filter::split4($word); # arbitrary numbers allowed |
600 |
|
|
601 |
=head1 DESCRIPTION |
=head1 DESCRIPTION |
602 |
|
|
651 |
characters to upper and lower case. To allow for maximum speed there |
characters to upper and lower case. To allow for maximum speed there |
652 |
are also I<destructive> versions which change the argument instead of |
are also I<destructive> versions which change the argument instead of |
653 |
allocating a copy which is returned. For convenience, the destructive |
allocating a copy which is returned. For convenience, the destructive |
654 |
version also B<returns> the argument. So both of the following is |
version also B<returns> the argument. So all of the following is |
655 |
valid and C<$word> will contain the lowercased string. |
valid and C<$word> will contain the lowercased string. |
656 |
|
|
657 |
|
$word = isolc($word); |
658 |
$word = disolc($word); |
$word = disolc($word); |
659 |
disolc($word); |
disolc($word); |
|
|
|
660 |
|
|
661 |
Here are the hardcoded characters which are recognized: |
Here are the hardcoded characters which are recognized: |
662 |
|
|
675 |
|
|
676 |
transposes to upper case. |
transposes to upper case. |
677 |
|
|
678 |
|
=item C<$new = >B<isotr>C<($word)> |
679 |
|
|
680 |
|
=item B<disotr>C<($word)> |
681 |
|
|
682 |
|
Remove non-letters according to the above table. |
683 |
|
|
684 |
|
=item C<$new = >B<stop>C<($word)> |
685 |
|
|
686 |
|
Returns an empty string if $word is a stopword. |
687 |
|
|
688 |
|
=item C<$new = >B<grundform>C<($word)> |
689 |
|
|
690 |
|
Calls Text::German::reduce |
691 |
|
|
692 |
|
=item C<$new = >B<utf8iso>C<($word)> |
693 |
|
|
694 |
|
Convert UTF8 encoded strings to ISO-8859-1. WAIT currently is |
695 |
|
internally based on the Latin1 character set, so if you process |
696 |
|
anything in a different encoding, you should convert to Latin1 as the |
697 |
|
first filter. |
698 |
|
|
699 |
|
=item split, split2, split3, ... |
700 |
|
|
701 |
|
The splitN funtions all take a scalar as input and return a list of |
702 |
|
words. Split acts just like the perl split(' '). Split2 eliminates all |
703 |
|
words from the list that are shorter than 2 characters (bytes), split3 |
704 |
|
eliminates those shorter than 3 characters (bytes) and so on. |
705 |
|
|
706 |
=head1 AUTHOR |
=head1 AUTHOR |
707 |
|
|
708 |
Ulrich Pfeifer E<lt>F<pfeifer@ls6.informatik.uni-dortmund.de>E<gt> |
Ulrich Pfeifer E<lt>F<pfeifer@ls6.informatik.uni-dortmund.de>E<gt> |