1 |
# -*- Mode: Perl -*- |
# -*- Mode: Cperl -*- |
2 |
# $Basename: Filter.pm $ |
# $Basename: Filter.pm $ |
3 |
# $Revision: 1.7 $ |
# $Revision: 1.8 $ |
4 |
# ITIID : $ITI$ $Header $__Header$ |
# ITIID : $ITI$ $Header $__Header$ |
5 |
# Author : Ulrich Pfeifer |
# Author : Ulrich Pfeifer |
6 |
# Created On : Thu Aug 15 18:09:51 1996 |
# Created On : Thu Aug 15 18:09:51 1996 |
9 |
# Language : CPerl |
# Language : CPerl |
10 |
# Update Count : 105 |
# Update Count : 105 |
11 |
# Status : Unknown, Use with caution! |
# Status : Unknown, Use with caution! |
12 |
# |
# |
13 |
# Copyright (c) 1996-1997, Ulrich Pfeifer |
# Copyright (c) 1996-1997, Ulrich Pfeifer |
14 |
# |
# |
15 |
package WAIT::Filter; |
package WAIT::Filter; |
16 |
require WAIT; |
require WAIT; |
17 |
use strict; |
use strict; |
31 |
isouc disouc |
isouc disouc |
32 |
isotr disotr |
isotr disotr |
33 |
stop grundform |
stop grundform |
34 |
|
utf8iso |
35 |
); |
); |
36 |
|
# (most implemented in WAIT.xs) |
37 |
|
|
38 |
$VERSION = substr q$Revision: 1.7 $, 10; |
$VERSION = substr q$Revision: 1.8 $, 10; |
39 |
|
|
40 |
sub split { |
sub split { |
41 |
map split(' ', $_), @_; |
map split(' ', $_), @_; |
77 |
if $@ ne ''; |
if $@ ne ''; |
78 |
*decode_entities = HTML::Entities->can('decode_entities'); |
*decode_entities = HTML::Entities->can('decode_entities'); |
79 |
goto &decode_entities; |
goto &decode_entities; |
80 |
|
} elsif ($func =~ /^d?utf8iso$/) { |
81 |
|
require WAIT::Filter::utf8iso; |
82 |
|
croak "Your perl version must at least be 5.00556 to use '$func'" |
83 |
|
if $] < 5.00556; |
84 |
|
no strict 'refs'; |
85 |
|
*$func = \&{"WAIT::Filter::utf8iso::$func"}; |
86 |
|
goto &utf8iso; |
87 |
} |
} |
88 |
croak "Your vendor has not defined WAIT::Filter::$func"; |
Carp::confess "Class WAIT::Filter::$func not found"; |
89 |
} |
} |
90 |
|
|
91 |
while (<DATA>) { |
while (<DATA>) { |
213 |
former |
former |
214 |
formerly |
formerly |
215 |
forty |
forty |
216 |
found " |
found |
217 |
four |
four |
218 |
from |
from |
219 |
further |
further |
577 |
|
|
578 |
=head1 SYNOPSIS |
=head1 SYNOPSIS |
579 |
|
|
580 |
use WAIT::Filter qw(Stem Soundex Phonix isolc isouc disolc disouc); |
use WAIT::Filter qw(Stem Soundex Phonix isolc disolc isouc disouc |
581 |
|
isotr disotr stop grundform utf8iso); |
582 |
|
|
583 |
$stem = Stem($word); |
$stem = Stem($word); |
584 |
$scode = Soundex($word); |
$scode = Soundex($word); |
585 |
$pcode = Phonix($word); |
$pcode = Phonix($word); |
586 |
$lword = isolc($word); |
$lword = isolc($word); |
|
$uword = isouc($word); |
|
587 |
disolc($word); |
disolc($word); |
588 |
|
$uword = isouc($word); |
589 |
disouc($word); |
disouc($word); |
590 |
|
$trword = isotr($word); |
591 |
|
disotr($word); |
592 |
|
$word = stop($word); |
593 |
|
$word = grundform($word); |
594 |
|
|
595 |
|
@words = WAIT::Filter::split($word); |
596 |
|
@words = WAIT::Filter::split2($word); |
597 |
|
@words = WAIT::Filter::split3($word); |
598 |
|
@words = WAIT::Filter::split4($word); # arbitrary numbers allowed |
599 |
|
|
600 |
=head1 DESCRIPTION |
=head1 DESCRIPTION |
601 |
|
|
650 |
characters to upper and lower case. To allow for maximum speed there |
characters to upper and lower case. To allow for maximum speed there |
651 |
are also I<destructive> versions which change the argument instead of |
are also I<destructive> versions which change the argument instead of |
652 |
allocating a copy which is returned. For convenience, the destructive |
allocating a copy which is returned. For convenience, the destructive |
653 |
version also B<returns> the argument. So both of the following are |
version also B<returns> the argument. So all of the following are |
654 |
valid and C<$word> will contain the lowercased string. |
valid and C<$word> will contain the lowercased string. |
655 |
|
|
656 |
|
$word = isolc($word); |
657 |
$word = disolc($word); |
$word = disolc($word); |
658 |
disolc($word); |
disolc($word); |
|
|
|
659 |
|
|
660 |
Here are the hardcoded characters which are recognized: |
Here are the hardcoded characters which are recognized: |
661 |
|
|
674 |
|
|
675 |
transposes to upper case. |
transposes to upper case. |
676 |
|
|
677 |
|
=item C<$new = >B<isotr>C<($word)> |
678 |
|
|
679 |
|
=item B<disotr>C<($word)> |
680 |
|
|
681 |
|
Remove non-letters according to the above table. |
682 |
|
|
683 |
|
=item C<$new = >B<stop>C<($word)> |
684 |
|
|
685 |
|
Returns an empty string if $word is a stopword. |
686 |
|
|
687 |
|
=item C<$new = >B<grundform>C<($word)> |
688 |
|
|
689 |
|
Calls Text::German::reduce |
690 |
|
|
691 |
|
=item C<$new = >B<utf8iso>C<($word)> |
692 |
|
|
693 |
|
Convert UTF8 encoded strings to ISO-8859-1. WAIT currently is |
694 |
|
internally based on the Latin1 character set, so if you process |
695 |
|
anything in a different encoding, you should convert to Latin1 as the |
696 |
|
first filter. |
697 |
|
|
698 |
|
=item split, split2, split3, ... |
699 |
|
|
700 |
|
The splitN functions all take a scalar as input and return a list of |
701 |
|
words. Split acts just like the perl split(' '). Split2 eliminates all |
702 |
|
words from the list that are shorter than 2 characters (bytes), split3 |
703 |
|
eliminates those shorter than 3 characters (bytes) and so on. |
704 |
|
|
705 |
=head1 AUTHOR |
=head1 AUTHOR |
706 |
|
|
707 |
Ulrich Pfeifer E<lt>F<pfeifer@ls6.informatik.uni-dortmund.de>E<gt> |
Ulrich Pfeifer E<lt>F<pfeifer@ls6.informatik.uni-dortmund.de>E<gt> |