/[wait]/trunk/lib/WAIT/Filter.pm
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Diff of /trunk/lib/WAIT/Filter.pm

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

cvs-head/lib/WAIT/Filter.pm revision 20 by cvs2svn, Tue May 9 11:29:45 2000 UTC trunk/lib/WAIT/Filter.pm revision 118 by dpavlin, Fri Jul 15 18:59:10 2005 UTC
# Line 16  package WAIT::Filter; Line 16  package WAIT::Filter;
16  require WAIT;  require WAIT;
17  use strict;  use strict;
18  use Carp;  use Carp;
19  use vars qw($VERSION @ISA @EXPORT_OK %STOP $SPLIT $AUTOLOAD);  use vars qw($VERSION @ISA @EXPORT_OK %STOP $SPLIT $UNAC $ICONV $AUTOLOAD);
20  use subs qw(grundform);  use subs qw(grundform);
21    
22    use Text::Unaccent;
23    use Text::Iconv;
24    
25  require Exporter;  require Exporter;
26    
27  @ISA = qw(Exporter);  @ISA = qw(Exporter);
# Line 31  require Exporter; Line 34  require Exporter;
34                  isouc disouc                  isouc disouc
35                  isotr disotr                  isotr disotr
36                  stop grundform                  stop grundform
37                  utf8iso                  utf8iso
38                 );                 );
39  # (most implemented in WAIT.xs)  # (most implemented in WAIT.xs)
40    
# Line 47  $SPLIT = q[ Line 50  $SPLIT = q[
50                           }                           }
51            ];            ];
52    
53    $UNAC = q[
54            sub unac_CHARSET {
55                    map split(' ',unac_string('CHARSET', $_) || $_), @_;
56            }
57    ];
58    
59    my $iconv;
60    
61    $ICONV = q[
62            sub iconv_CHARSETfrom_CHARSETto {
63                    my $ic = $iconv->{'CHARSETfrom_CHARSETto'});
64                    $ic ||= $iconv->{'CHARSETfrom_CHARSETto'} = Text::Iconv->new('CHARSETfrom','CHARSETto');
65                    map split(' ',$ic->convert($_) || $_), @_;
66            }
67    ];
68                    
69    
70  sub AUTOLOAD {  sub AUTOLOAD {
71    my $func = $AUTOLOAD; $func =~ s/.*:://;    my $func = $AUTOLOAD; $func =~ s/.*:://;
72    
# Line 73  sub AUTOLOAD { Line 93  sub AUTOLOAD {
93      goto \&date;      goto \&date;
94    } elsif ($func eq 'decode_entities') {    } elsif ($func eq 'decode_entities') {
95      eval {require HTML::Entities;};      eval {require HTML::Entities;};
96      croak "You must have HTML::Entities to use 'date'"      croak "You must have HTML::Entities to use 'decode_entities'"
97        if $@ ne '';        if $@ ne '';
98      *decode_entities = HTML::Entities->can('decode_entities');      *decode_entities = HTML::Entities->can('decode_entities');
99      goto &decode_entities;      goto &decode_entities;
100    } elsif ($func =~ /^d?utf8iso$/) {    } elsif ($func =~ /^d?utf8iso$/) {
     require WAIT::Filter::utf8iso;  
     croak "Your perl version must at least be 5.00556 to use '$func'"  
         if $] < 5.00556;  
101      no strict 'refs';      no strict 'refs';
102      *$func = \&{"WAIT::Filter::utf8iso::$func"};      *$func = sub {
103      goto &utf8iso;        # Courtesy JHI
104          my $s = shift;
105          $s =~ s{([\xC0-\xDF])([\x80-\xBF])}
106                 {chr(ord($1)<<6&0xC0|ord($2)&0x3F)}eg;
107          $s;
108        };
109        goto \&$func;
110      } elsif ($func =~ /unac_(.+)/) {
111        my $charset = $1;
112        my $unac = $UNAC;
113        $unac =~ s/CHARSET/$charset/g;
114    print "### $unac ###\n";
115        eval $unac;
116        if ($@ eq '') {
117         goto &$func;
118        }
119      } elsif ($func =~ /iconv_([^_]+)_([^_]+)/) {
120        my ($cf,$ct) = ($1,$2);
121        my $iconv = $ICONV;
122    print "### $cf -> $ct\n";
123        $iconv =~ s/CHARSETfrom/$cf/gs;
124        $iconv =~ s/CHARSETto/$ct/gs;
125    print "### $iconv ###\n";
126        eval $iconv;
127        if ($@ eq '') {
128         goto &$func;
129        }
130    }    }
131    Carp::confess "Class WAIT::Filter::$func not found";    Carp::confess "Class WAIT::Filter::$func not found";
132  }  }
# Line 94  while (<DATA>) { Line 137  while (<DATA>) {
137    next if /^\s*#/; # there's a comment    next if /^\s*#/; # there's a comment
138    $STOP{$_}++;    $STOP{$_}++;
139  }  }
140    close DATA;
141    
142  sub stop {  sub stop {
143    if (exists $STOP{$_[0]}) {    if (exists $STOP{$_[0]}) {
# Line 111  sub gdate { Line 155  sub gdate {
155  }  }
156    
157  1;  1;
158    
159  __DATA__  __DATA__
160  a  a
161  about  about
# Line 570  will Line 615  will
615  with  with
616  you  you
617  __END__  __END__
 # Below is the stub of documentation for your module. You better edit it!  
618    
619  =head1 NAME  =head1 NAME
620    
# Line 579  WAIT::Filter - Perl extension providing Line 623  WAIT::Filter - Perl extension providing
623  =head1 SYNOPSIS  =head1 SYNOPSIS
624    
625    use WAIT::Filter qw(Stem Soundex Phonix isolc disolc isouc disouc    use WAIT::Filter qw(Stem Soundex Phonix isolc disolc isouc disouc
626                        isotr disotr stop grundform utf8iso);                        isotr disotr stop grundform);
627    
628    $stem   = Stem($word);    $stem   = Stem($word);
629    $scode  = Soundex($word);    $scode  = Soundex($word);
# Line 645  computes the 8 byte B<Phonix> code for I Line 689  computes the 8 byte B<Phonix> code for I
689    PY: 1990    PY: 1990
690    PM: OCT    PM: OCT
691    
692    =back
693    
694  =head1 ISO charcater case functions  =head1 ISO charcater case functions
695    
696  There are some additional function which transpose some/most ISOlatin1  There are some additional function which transpose some/most ISOlatin1
# Line 663  Here are the hardcoded characters which Line 709  Here are the hardcoded characters which
709    abcdefghijklmnopqrstuvwxyzàáâãäåæçèéêëìíîïñòóôõöøùúûüýß    abcdefghijklmnopqrstuvwxyzàáâãäåæçèéêëìíîïñòóôõöøùúûüýß
710    ABCDEFGHIJKLMNOPQRSTUVWXYZÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÑÒÓÔÕÖØÙÚÛÜÝß    ABCDEFGHIJKLMNOPQRSTUVWXYZÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÑÒÓÔÕÖØÙÚÛÜÝß
711    
712    =over 5
713    
714  =item C<$new = >B<isolc>C<($word)>  =item C<$new = >B<isolc>C<($word)>
715    
716  =item B<disolc>C<($word)>  =item B<disolc>C<($word)>
# Line 691  Calls Text::German::reduce Line 739  Calls Text::German::reduce
739    
740  =item C<$new = >B<utf8iso>C<($word)>  =item C<$new = >B<utf8iso>C<($word)>
741    
742  Convert UTF8 encoded strings to ISO-8859-1. WAIT currently is  Deprecated due to flux in perl versions between 5.005 and 5.8. The
743  internally based on the Latin1 character set, so if you process  function converts UTF8 encoded strings to ISO-8859-1. WAIT is
744    internally still based on the Latin1 character set, so if you process
745  anything in a different encoding, you should convert to Latin1 as the  anything in a different encoding, you should convert to Latin1 as the
746  first filter.  first filter or refrain from using the iso-latin-1 based filter
747    functions. It is recommended that you use your own converter based on
748    the perl version you're using.
749    
750  =item split, split2, split3, ...  =item split, split2, split3, ...
751    
# Line 703  words. Split acts just like the perl spl Line 754  words. Split acts just like the perl spl
754  words from the list that are shorter than 2 characters (bytes), split3  words from the list that are shorter than 2 characters (bytes), split3
755  eliminates those shorter than 3 characters (bytes) and so on.  eliminates those shorter than 3 characters (bytes) and so on.
756    
757    =back
758    
759  =head1 AUTHOR  =head1 AUTHOR
760    
761  Ulrich Pfeifer E<lt>F<pfeifer@ls6.informatik.uni-dortmund.de>E<gt>  Ulrich Pfeifer E<lt>F<pfeifer@ls6.informatik.uni-dortmund.de>E<gt>

Legend:
Removed from v.20  
changed lines
  Added in v.118

  ViewVC Help
Powered by ViewVC 1.1.26