/[wait]/cvs-head/lib/WAIT/Filter.pm
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Diff of /cvs-head/lib/WAIT/Filter.pm

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 11 by unknown, Fri Apr 28 15:41:10 2000 UTC revision 19 by ulpfr, Tue May 9 11:29:45 2000 UTC
# Line 1  Line 1 
1  #                              -*- Mode: Perl -*-  #                              -*- Mode: Cperl -*-
2  # $Basename: Filter.pm $  # $Basename: Filter.pm $
3  # $Revision: 1.7 $  # $Revision: 1.9 $
4  # ITIID           : $ITI$ $Header $__Header$  # ITIID           : $ITI$ $Header $__Header$
5  # Author          : Ulrich Pfeifer  # Author          : Ulrich Pfeifer
6  # Created On      : Thu Aug 15 18:09:51 1996  # Created On      : Thu Aug 15 18:09:51 1996
# Line 9  Line 9 
9  # Language        : CPerl  # Language        : CPerl
10  # Update Count    : 105  # Update Count    : 105
11  # Status          : Unknown, Use with caution!  # Status          : Unknown, Use with caution!
12  #  #
13  # Copyright (c) 1996-1997, Ulrich Pfeifer  # Copyright (c) 1996-1997, Ulrich Pfeifer
14  #  #
15  package WAIT::Filter;  package WAIT::Filter;
16  require WAIT;  require WAIT;
17  use strict;  use strict;
# Line 31  require Exporter; Line 31  require Exporter;
31                  isouc disouc                  isouc disouc
32                  isotr disotr                  isotr disotr
33                  stop grundform                  stop grundform
34                    utf8iso
35                 );                 );
36    # (most implemented in WAIT.xs)
37    
38  $VERSION = substr q$Revision: 1.7 $, 10;  $VERSION = substr q$Revision: 1.9 $, 10;
39    
40  sub split {  sub split {
41    map split(' ', $_), @_;    map split(' ', $_), @_;
# Line 75  sub AUTOLOAD { Line 77  sub AUTOLOAD {
77        if $@ ne '';        if $@ ne '';
78      *decode_entities = HTML::Entities->can('decode_entities');      *decode_entities = HTML::Entities->can('decode_entities');
79      goto &decode_entities;      goto &decode_entities;
80      } elsif ($func =~ /^d?utf8iso$/) {
81        require WAIT::Filter::utf8iso;
82        croak "Your perl version must at least be 5.00556 to use '$func'"
83            if $] < 5.00556;
84        no strict 'refs';
85        *$func = \&{"WAIT::Filter::utf8iso::$func"};
86        goto &utf8iso;
87    }    }
88    croak "Your vendor has not defined WAIT::Filter::$func";    Carp::confess "Class WAIT::Filter::$func not found";
89  }  }
90    
91  while (<DATA>) {  while (<DATA>) {
92    chomp;    chomp;
93    last if /__END__/;    last if /__END__/;
94      next if /^\s*#/; # there's a comment
95    $STOP{$_}++;    $STOP{$_}++;
96  }  }
97    
# Line 204  vfor Line 214  vfor
214  former  former
215  formerly  formerly
216  forty  forty
217  found "  found
218  four  four
219  from  from
220  further  further
# Line 568  WAIT::Filter - Perl extension providing Line 578  WAIT::Filter - Perl extension providing
578    
579  =head1 SYNOPSIS  =head1 SYNOPSIS
580    
581    use WAIT::Filter qw(Stem Soundex Phonix isolc isouc disolc disouc);    use WAIT::Filter qw(Stem Soundex Phonix isolc disolc isouc disouc
582                          isotr disotr stop grundform utf8iso);
583    
584    $stem  = Stem($word);    $stem   = Stem($word);
585    $scode = Soundex($word);    $scode  = Soundex($word);
586    $pcode = Phonix($word);    $pcode  = Phonix($word);
587    $lword = isolc($word);    $lword  = isolc($word);
   $uword = isouc($word);  
588    disolc($word);    disolc($word);
589      $uword  = isouc($word);
590    disouc($word);    disouc($word);
591      $trword = isotr($word);
592      disotr($word);
593      $word   = stop($word);
594      $word   = grundform($word);
595    
596      @words = WAIT::Filter::split($word);
597      @words = WAIT::Filter::split2($word);
598      @words = WAIT::Filter::split3($word);
599      @words = WAIT::Filter::split4($word); # arbitrary numbers allowed
600    
601  =head1 DESCRIPTION  =head1 DESCRIPTION
602    
# Line 631  There are some additional function which Line 651  There are some additional function which
651  characters to upper and lower case. To allow for maximum speed there  characters to upper and lower case. To allow for maximum speed there
652  are also I<destructive> versions which change the argument instead of  are also I<destructive> versions which change the argument instead of
653  allocating a copy which is returned. For convenience, the destructive  allocating a copy which is returned. For convenience, the destructive
654  version also B<returns> the argument. So both of the following is  version also B<returns> the argument. So all of the following is
655  valid and C<$word> will contain the lowercased string.  valid and C<$word> will contain the lowercased string.
656    
657      $word = isolc($word);
658    $word = disolc($word);    $word = disolc($word);
659    disolc($word);    disolc($word);
     
660    
661  Here are the hardcoded characters which are recognized:  Here are the hardcoded characters which are recognized:
662    
# Line 655  transposes to lower case. Line 675  transposes to lower case.
675    
676  transposes to upper case.  transposes to upper case.
677    
678    =item C<$new = >B<isotr>C<($word)>
679    
680    =item  B<disotr>C<($word)>
681    
682    Remove non-letters according to the above table.
683    
684    =item C<$new = >B<stop>C<($word)>
685    
686    Returns an empty string if $word is a stopword.
687    
688    =item C<$new = >B<grundform>C<($word)>
689    
690    Calls Text::German::reduce
691    
692    =item C<$new = >B<utf8iso>C<($word)>
693    
694    Convert UTF8 encoded strings to ISO-8859-1. WAIT currently is
695    internally based on the Latin1 character set, so if you process
696    anything in a different encoding, you should convert to Latin1 as the
697    first filter.
698    
699    =item split, split2, split3, ...
700    
701    The splitN funtions all take a scalar as input and return a list of
702    words. Split acts just like the perl split(' '). Split2 eliminates all
703    words from the list that are shorter than 2 characters (bytes), split3
704    eliminates those shorter than 3 characters (bytes) and so on.
705    
706  =head1 AUTHOR  =head1 AUTHOR
707    
708  Ulrich Pfeifer E<lt>F<pfeifer@ls6.informatik.uni-dortmund.de>E<gt>  Ulrich Pfeifer E<lt>F<pfeifer@ls6.informatik.uni-dortmund.de>E<gt>

Legend:
Removed from v.11  
changed lines
  Added in v.19

  ViewVC Help
Powered by ViewVC 1.1.26