/[wait]/trunk/lib/WAIT/Filter.pm

This is a repository of my old source code, which is no longer updated. Go to git.rot13.org for current projects!

Diff of /trunk/lib/WAIT/Filter.pm

Parent Directory | Revision Log | View Patch Patch

-branches/CPAN/lib/WAIT/Filter.pm
revision 11 by unknown,
Fri Apr 28 15:41:10 2000 UTC
+cvs-head/lib/WAIT/Filter.pm
revision 50 by laperla,
Sat Mar  3 11:24:59 2001 UTC
 Line 1
- #                              -*- Mode: Perl -*-
+ #                              -*- Mode: Cperl -*-
  # $Basename: Filter.pm $
- # $Revision: 1.7 $
+ # $Revision: 1.9 $
  # ITIID           : $ITI$ $Header $__Header$
  # Author          : Ulrich Pfeifer
  # Created On      : Thu Aug 15 18:09:51 1996
 Line 9
  # Language        : CPerl
  # Update Count    : 105
  # Status          : Unknown, Use with caution!
  #
  # Copyright (c) 1996-1997, Ulrich Pfeifer
  #
  package WAIT::Filter;
  require WAIT;
  use strict;
 Line 31 
 require Exporter;
                  isouc disouc
                  isotr disotr
                  stop grundform
+                 utf8iso
                 );
+ # (most of these are implemented in WAIT.xs)
- $VERSION = substr q$Revision: 1.7 $, 10;
+ $VERSION = substr q$Revision: 1.9 $, 10;
  sub split {
    map split(' ', $_), @_;
-Line 75 
 sub AUTOLOAD {
+Line 77 
 sub AUTOLOAD {
        if $@ ne '';
      *decode_entities = HTML::Entities->can('decode_entities');
      goto &decode_entities;
+   } elsif ($func =~ /^d?utf8iso$/) {
+     no strict 'refs';
+     *$func = sub {
+       # Courtesy JHI
+       my $s = shift;
+       $s =~ s{([\xC0-\xDF])([\x80-\xBF])}
+              {chr(ord($1)<<6&0xC0|ord($2)&0x3F)}eg;
+       $s;
+     };
+     goto \&$func;
    }
-   croak "Your vendor has not defined WAIT::Filter::$func";
+   Carp::confess "Class WAIT::Filter::$func not found";
  }
  while (<DATA>) {
    chomp;
    last if /__END__/;
+   next if /^\s*#/; # there's a comment
    $STOP{$_}++;
  }
+ close DATA;
  sub stop {
    if (exists $STOP{$_[0]}) {
-Line 204 
 vfor
+Line 218 
 vfor
  former
  formerly
  forty
- found "
+ found
  four
  from
  further
-Line 568 
 WAIT::Filter - Perl extension providing
+Line 582 
 WAIT::Filter - Perl extension providing
  =head1 SYNOPSIS
-   use WAIT::Filter qw(Stem Soundex Phonix isolc isouc disolc disouc);
+   use WAIT::Filter qw(Stem Soundex Phonix isolc disolc isouc disouc
+                       isotr disotr stop grundform);
-   $stem  = Stem($word);
+   $stem   = Stem($word);
-   $scode = Soundex($word);
+   $scode  = Soundex($word);
-   $pcode = Phonix($word);
+   $pcode  = Phonix($word);
-   $lword = isolc($word);
+   $lword  = isolc($word);
-   $uword = isouc($word);
    disolc($word);
+   $uword  = isouc($word);
    disouc($word);
+   $trword = isotr($word);
+   disotr($word);
+   $word   = stop($word);
+   $word   = grundform($word);
+   @words = WAIT::Filter::split($word);
+   @words = WAIT::Filter::split2($word);
+   @words = WAIT::Filter::split3($word);
+   @words = WAIT::Filter::split4($word); # arbitrary numbers allowed
  =head1 DESCRIPTION
-Line 631 
 There are some additional functions which
+Line 655 
 There are some additional functions which
  characters to upper and lower case. To allow for maximum speed there
  are also I<destructive> versions which change the argument instead of
  allocating a copy which is returned. For convenience, the destructive
- version also B<returns> the argument. So both of the following is
+ version also B<returns> the argument. So all of the following are
  valid and C<$word> will contain the lowercased string.
+   $word = isolc($word);
    $word = disolc($word);
    disolc($word);
  Here are the hardcoded characters which are recognized:
-Line 655 
 transposes to lower case.
+Line 679 
 transposes to lower case.
  transposes to upper case.
+ =item C<$new = >B<isotr>C<($word)>
+ =item  B<disotr>C<($word)>
+ Remove non-letters according to the above table.
+ =item C<$new = >B<stop>C<($word)>
+ Returns an empty string if $word is a stopword.
+ =item C<$new = >B<grundform>C<($word)>
+ Calls Text::German::reduce
+ =item C<$new = >B<utf8iso>C<($word)>
+ Deprecated due to flux in perl versions between 5.005 and 5.8. The
+ function converts UTF8 encoded strings to ISO-8859-1. WAIT is
+ internally still based on the Latin1 character set, so if you process
+ anything in a different encoding, you should convert to Latin1 as the
+ first filter or refrain from using the iso-latin-1 based filter
+ functions. It is recommended that you use your own converter based on
+ the perl version you're using.
+ =item split, split2, split3, ...
+ The splitN functions all take a scalar as input and return a list of
+ words. Split acts just like the perl split(' '). Split2 eliminates all
+ words from the list that are shorter than 2 characters (bytes), split3
+ eliminates those shorter than 3 characters (bytes) and so on.
  =head1 AUTHOR
  Ulrich Pfeifer E<lt>F<pfeifer@ls6.informatik.uni-dortmund.de>E<gt>

 Legend:



Removed from v.11
 


changed lines


 
Added in v.50
 Legend:



Removed from v.11
 


changed lines


 
Added in v.50
-Removed from v.11
+Added in v.50

	ViewVC Help
Powered by ViewVC 1.1.26