/[wait]/trunk/lib/WAIT/Filter.pm
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Diff of /trunk/lib/WAIT/Filter.pm

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

branches/CPAN/lib/WAIT/Filter.pm revision 11 by unknown, Fri Apr 28 15:41:10 2000 UTC cvs-head/lib/WAIT/Filter.pm revision 50 by laperla, Sat Mar 3 11:24:59 2001 UTC
# Line 1  Line 1 
1  #                              -*- Mode: Perl -*-  #                              -*- Mode: Cperl -*-
2  # $Basename: Filter.pm $  # $Basename: Filter.pm $
3  # $Revision: 1.7 $  # $Revision: 1.9 $
4  # ITIID           : $ITI$ $Header $__Header$  # ITIID           : $ITI$ $Header $__Header$
5  # Author          : Ulrich Pfeifer  # Author          : Ulrich Pfeifer
6  # Created On      : Thu Aug 15 18:09:51 1996  # Created On      : Thu Aug 15 18:09:51 1996
# Line 9  Line 9 
9  # Language        : CPerl  # Language        : CPerl
10  # Update Count    : 105  # Update Count    : 105
11  # Status          : Unknown, Use with caution!  # Status          : Unknown, Use with caution!
12  #  #
13  # Copyright (c) 1996-1997, Ulrich Pfeifer  # Copyright (c) 1996-1997, Ulrich Pfeifer
14  #  #
15  package WAIT::Filter;  package WAIT::Filter;
16  require WAIT;  require WAIT;
17  use strict;  use strict;
# Line 31  require Exporter; Line 31  require Exporter;
31                  isouc disouc                  isouc disouc
32                  isotr disotr                  isotr disotr
33                  stop grundform                  stop grundform
34                    utf8iso
35                 );                 );
36    # (most implemented in WAIT.xs)
37    
38  $VERSION = substr q$Revision: 1.7 $, 10;  $VERSION = substr q$Revision: 1.9 $, 10;
39    
40  sub split {  sub split {
41    map split(' ', $_), @_;    map split(' ', $_), @_;
# Line 75  sub AUTOLOAD { Line 77  sub AUTOLOAD {
77        if $@ ne '';        if $@ ne '';
78      *decode_entities = HTML::Entities->can('decode_entities');      *decode_entities = HTML::Entities->can('decode_entities');
79      goto &decode_entities;      goto &decode_entities;
80      } elsif ($func =~ /^d?utf8iso$/) {
81        no strict 'refs';
82        *$func = sub {
83          # Courtesy JHI
84          my $s = shift;
85          $s =~ s{([\xC0-\xDF])([\x80-\xBF])}
86                 {chr(ord($1)<<6&0xC0|ord($2)&0x3F)}eg;
87          $s;
88        };
89        goto \&$func;
90    }    }
91    croak "Your vendor has not defined WAIT::Filter::$func";    Carp::confess "Class WAIT::Filter::$func not found";
92  }  }
93    
94  while (<DATA>) {  while (<DATA>) {
95    chomp;    chomp;
96    last if /__END__/;    last if /__END__/;
97      next if /^\s*#/; # there's a comment
98    $STOP{$_}++;    $STOP{$_}++;
99  }  }
100    close DATA;
101    
102  sub stop {  sub stop {
103    if (exists $STOP{$_[0]}) {    if (exists $STOP{$_[0]}) {
# Line 204  vfor Line 218  vfor
218  former  former
219  formerly  formerly
220  forty  forty
221  found "  found
222  four  four
223  from  from
224  further  further
# Line 568  WAIT::Filter - Perl extension providing Line 582  WAIT::Filter - Perl extension providing
582    
583  =head1 SYNOPSIS  =head1 SYNOPSIS
584    
585    use WAIT::Filter qw(Stem Soundex Phonix isolc isouc disolc disouc);    use WAIT::Filter qw(Stem Soundex Phonix isolc disolc isouc disouc
586                          isotr disotr stop grundform);
587    
588    $stem  = Stem($word);    $stem   = Stem($word);
589    $scode = Soundex($word);    $scode  = Soundex($word);
590    $pcode = Phonix($word);    $pcode  = Phonix($word);
591    $lword = isolc($word);    $lword  = isolc($word);
   $uword = isouc($word);  
592    disolc($word);    disolc($word);
593      $uword  = isouc($word);
594    disouc($word);    disouc($word);
595      $trword = isotr($word);
596      disotr($word);
597      $word   = stop($word);
598      $word   = grundform($word);
599    
600      @words = WAIT::Filter::split($word);
601      @words = WAIT::Filter::split2($word);
602      @words = WAIT::Filter::split3($word);
603      @words = WAIT::Filter::split4($word); # arbitrary numbers allowed
604    
605  =head1 DESCRIPTION  =head1 DESCRIPTION
606    
# Line 631  There are some additional function which Line 655  There are some additional function which
655  characters to upper and lower case. To allow for maximum speed there  characters to upper and lower case. To allow for maximum speed there
656  are also I<destructive> versions which change the argument instead of  are also I<destructive> versions which change the argument instead of
657  allocating a copy which is returned. For convenience, the destructive  allocating a copy which is returned. For convenience, the destructive
658  version also B<returns> the argument. So both of the following is  version also B<returns> the argument. So all of the following is
659  valid and C<$word> will contain the lowercased string.  valid and C<$word> will contain the lowercased string.
660    
661      $word = isolc($word);
662    $word = disolc($word);    $word = disolc($word);
663    disolc($word);    disolc($word);
     
664    
665  Here are the hardcoded characters which are recognized:  Here are the hardcoded characters which are recognized:
666    
# Line 655  transposes to lower case. Line 679  transposes to lower case.
679    
680  transposes to upper case.  transposes to upper case.
681    
682    =item C<$new = >B<isotr>C<($word)>
683    
684    =item  B<disotr>C<($word)>
685    
686    Remove non-letters according to the above table.
687    
688    =item C<$new = >B<stop>C<($word)>
689    
690    Returns an empty string if $word is a stopword.
691    
692    =item C<$new = >B<grundform>C<($word)>
693    
694    Calls Text::German::reduce
695    
696    =item C<$new = >B<utf8iso>C<($word)>
697    
698    Deprecated due to flux in perl versions between 5.005 and 5.8. The
699    function converts UTF8 encoded strings to ISO-8859-1. WAIT is
700    internally still based on the Latin1 character set, so if you process
701    anything in a different encoding, you should convert to Latin1 as the
702    first filter or refrain from using the iso-latin-1 based filter
703    functions. It is recommended that you use your own converter based on
704    the perl version you're using.
705    
706    =item split, split2, split3, ...
707    
708    The splitN funtions all take a scalar as input and return a list of
709    words. Split acts just like the perl split(' '). Split2 eliminates all
710    words from the list that are shorter than 2 characters (bytes), split3
711    eliminates those shorter than 3 characters (bytes) and so on.
712    
713  =head1 AUTHOR  =head1 AUTHOR
714    
715  Ulrich Pfeifer E<lt>F<pfeifer@ls6.informatik.uni-dortmund.de>E<gt>  Ulrich Pfeifer E<lt>F<pfeifer@ls6.informatik.uni-dortmund.de>E<gt>

Legend:
Removed from v.11  
changed lines
  Added in v.50

  ViewVC Help
Powered by ViewVC 1.1.26