/[wait]/trunk/lib/WAIT/Filter.pm

This is a repository of my old source code, which isn't updated any more. Go to git.rot13.org for current projects!

Diff of /trunk/lib/WAIT/Filter.pm

Parent Directory | Revision Log | View Patch Patch

-branches/CPAN/lib/WAIT/Filter.pm
revision 13 by ulpfr,
Fri Apr 28 15:42:44 2000 UTC
+trunk/lib/WAIT/Filter.pm
revision 118 by dpavlin,
Fri Jul 15 18:59:10 2005 UTC
 Line 1
  #                              -*- Mode: Cperl -*-
  # $Basename: Filter.pm $
- # $Revision: 1.8 $
+ # $Revision: 1.9 $
  # ITIID           : $ITI$ $Header $__Header$
  # Author          : Ulrich Pfeifer
  # Created On      : Thu Aug 15 18:09:51 1996
 Line 16 
 package WAIT::Filter;
  require WAIT;
  use strict;
  use Carp;
- use vars qw($VERSION @ISA @EXPORT_OK %STOP $SPLIT $AUTOLOAD);
+ use vars qw($VERSION @ISA @EXPORT_OK %STOP $SPLIT $UNAC $ICONV $AUTOLOAD);
  use subs qw(grundform);
+ use Text::Unaccent;
+ use Text::Iconv;
  require Exporter;
  @ISA = qw(Exporter);
-Line 31 
 require Exporter;
+Line 34 
 require Exporter;
                  isouc disouc
                  isotr disotr
                  stop grundform
                  utf8iso
                 );
  # (most implemented in WAIT.xs)
- $VERSION = substr q$Revision: 1.8 $, 10;
+ $VERSION = substr q$Revision: 1.9 $, 10;
  sub split {
    map split(' ', $_), @_;
-Line 47 
 $SPLIT = q[
+Line 50 
 $SPLIT = q[
                           }
            ];
+ $UNAC = q[
+         sub unac_CHARSET {
+                 map split(' ',unac_string('CHARSET', $_) || $_), @_;
+         }
+ ];
+ my $iconv;
+ $ICONV = q[
+         sub iconv_CHARSETfrom_CHARSETto {
+                 my $ic = $iconv->{'CHARSETfrom_CHARSETto'});
+                 $ic ||= $iconv->{'CHARSETfrom_CHARSETto'} = Text::Iconv->new('CHARSETfrom','CHARSETto');
+                 map split(' ',$ic->convert($_) || $_), @_;
+         }
+ ];
  sub AUTOLOAD {
    my $func = $AUTOLOAD; $func =~ s/.*:://;
-Line 73 
 sub AUTOLOAD {
+Line 93 
 sub AUTOLOAD {
      goto \&date;
    } elsif ($func eq 'decode_entities') {
      eval {require HTML::Entities;};
-     croak "You must have HTML::Entities to use 'date'"
+     croak "You must have HTML::Entities to use 'decode_entities'"
        if $@ ne '';
      *decode_entities = HTML::Entities->can('decode_entities');
      goto &decode_entities;
    } elsif ($func =~ /^d?utf8iso$/) {
-     require WAIT::Filter::utf8iso;
-     croak "Your perl version must at least be 5.00556 to use '$func'"
-         if $] < 5.00556;
      no strict 'refs';
-     *$func = \&{"WAIT::Filter::utf8iso::$func"};
+     *$func = sub {
-     goto &utf8iso;
+       # Courtesy JHI
+       my $s = shift;
+       $s =~ s{([\xC0-\xDF])([\x80-\xBF])}
+              {chr(ord($1)<<6&0xC0|ord($2)&0x3F)}eg;
+       $s;
+     };
+     goto \&$func;
+   } elsif ($func =~ /unac_(.+)/) {
+     my $charset = $1;
+     my $unac = $UNAC;
+     $unac =~ s/CHARSET/$charset/g;
+ print "### $unac ###\n";
+     eval $unac;
+     if ($@ eq '') {
+      goto &$func;
+     }
+   } elsif ($func =~ /iconv_([^_]+)_([^_]+)/) {
+     my ($cf,$ct) = ($1,$2);
+     my $iconv = $ICONV;
+ print "### $cf -> $ct\n";
+     $iconv =~ s/CHARSETfrom/$cf/gs;
+     $iconv =~ s/CHARSETto/$ct/gs;
+ print "### $iconv ###\n";
+     eval $iconv;
+     if ($@ eq '') {
+      goto &$func;
+     }
    }
    Carp::confess "Class WAIT::Filter::$func not found";
  }
-Line 91 
 sub AUTOLOAD {
+Line 134 
 sub AUTOLOAD {
  while (<DATA>) {
    chomp;
    last if /__END__/;
+   next if /^\s*#/; # there's a comment
    $STOP{$_}++;
  }
+ close DATA;
  sub stop {
    if (exists $STOP{$_[0]}) {
-Line 110 
 sub gdate {
+Line 155 
 sub gdate {
  }
 ;
  __DATA__
  a
  about
-Line 569 
 will
+Line 615 
 will
  with
  you
  __END__
- # Below is the stub of documentation for your module. You better edit it!
  =head1 NAME
-Line 578 
 WAIT::Filter - Perl extension providing
+Line 623 
 WAIT::Filter - Perl extension providing
  =head1 SYNOPSIS
    use WAIT::Filter qw(Stem Soundex Phonix isolc disolc isouc disouc
-                       isotr disotr stop grundform utf8iso);
+                       isotr disotr stop grundform);
    $stem   = Stem($word);
    $scode  = Soundex($word);
-Line 644 
 computes the 8 byte B<Phonix> code for I
+Line 689 
 computes the 8 byte B<Phonix> code for I
    PY: 1990
    PM: OCT
+ =back
  =head1 ISO character case functions
  There are some additional functions which transpose some/most ISOlatin1
-Line 662 
 Here are the hardcoded characters which
+Line 709 
 Here are the hardcoded characters which
    abcdefghijklmnopqrstuvwxyz�����������������������������
    ABCDEFGHIJKLMNOPQRSTUVWXYZ�����������������������������
+ =over 5
  =item C<$new = >B<isolc>C<($word)>
  =item B<disolc>C<($word)>
-Line 690 
 Calls Text::German::reduce
+Line 739 
 Calls Text::German::reduce
  =item C<$new = >B<utf8iso>C<($word)>
- Convert UTF8 encoded strings to ISO-8859-1. WAIT currently is
+ Deprecated due to flux in perl versions between 5.005 and 5.8. The
- internally based on the Latin1 character set, so if you process
+ function converts UTF8 encoded strings to ISO-8859-1. WAIT is
+ internally still based on the Latin1 character set, so if you process
  anything in a different encoding, you should convert to Latin1 as the
- first filter.
+ first filter or refrain from using the iso-latin-1 based filter
+ functions. It is recommended that you use your own converter based on
+ the perl version you're using.
  =item split, split2, split3, ...
-Line 702 
 words. Split acts just like the perl spl
+Line 754 
 words. Split acts just like the perl spl
  words from the list that are shorter than 2 characters (bytes), split3
  eliminates those shorter than 3 characters (bytes) and so on.
+ =back
  =head1 AUTHOR
  Ulrich Pfeifer E<lt>F<pfeifer@ls6.informatik.uni-dortmund.de>E<gt>

 Legend:



Removed from v.13
 


changed lines


 
Added in v.118
 Legend:



Removed from v.13
 


changed lines


 
Added in v.118
-Removed from v.13
+Added in v.118

	ViewVC Help
Powered by ViewVC 1.1.26