/[wait]/cvs-head/lib/WAIT/Filter.pm
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Contents of /cvs-head/lib/WAIT/Filter.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 32 - (show annotations)
Sun Nov 12 09:31:04 2000 UTC (23 years, 5 months ago) by laperla
File size: 8957 byte(s)
UTF-8 conversion deprecated and for the time being replaced by a
perl-only UTF-8 converter

1 # -*- Mode: Cperl -*-
2 # $Basename: Filter.pm $
3 # $Revision: 1.9 $
4 # ITIID : $ITI$ $Header $__Header$
5 # Author : Ulrich Pfeifer
6 # Created On : Thu Aug 15 18:09:51 1996
7 # Last Modified By: Ulrich Pfeifer
8 # Last Modified On: Sun Nov 22 18:44:46 1998
9 # Language : CPerl
10 # Update Count : 105
11 # Status : Unknown, Use with caution!
12 #
13 # Copyright (c) 1996-1997, Ulrich Pfeifer
14 #
15 package WAIT::Filter;
16 require WAIT;
17 use strict;
18 use Carp;
19 use vars qw($VERSION @ISA @EXPORT_OK %STOP $SPLIT $AUTOLOAD);
20 use subs qw(grundform);
21
22 require Exporter;
23
24 @ISA = qw(Exporter);
25 @EXPORT_OK = qw(
26 Stem
27 Soundex
28 Phonix
29 Metaphone
30 isolc disolc
31 isouc disouc
32 isotr disotr
33 stop grundform
34 utf8iso
35 );
36 # (most implemented in WAIT.xs)
37
38 $VERSION = substr q$Revision: 1.9 $, 10;
39
40 sub split {
41 map split(' ', $_), @_;
42 }
43
44 $SPLIT = q[
45 sub splitXXX {
46 grep length($_)>=XXX, map split(' ', $_), @_;
47 }
48 ];
49
50 sub AUTOLOAD {
51 my $func = $AUTOLOAD; $func =~ s/.*:://;
52
53 if ($func =~ /split(\d+)/) {
54 my $num = $1;
55 my $split = $SPLIT;
56
57 $split =~ s/XXX/$num/g;
58 eval $split;
59 if ($@ eq '') {
60 goto &$AUTOLOAD;
61 }
62 } elsif ($func eq 'grundform') {
63 eval {require Text::German;};
64 croak "You must have Text::German to use 'grundform'"
65 if $@ ne '';
66 *grundform = Text::German->can('reduce');
67 goto &grundform;
68 } elsif ($func eq 'date') {
69 eval {require Time::ParseDate;};
70 croak "You must have Time::ParseDate to use 'date'"
71 if $@ ne '';
72 *date = Time::ParseDate->can('parsedate');
73 goto \&date;
74 } elsif ($func eq 'decode_entities') {
75 eval {require HTML::Entities;};
76 croak "You must have HTML::Entities to use 'date'"
77 if $@ ne '';
78 *decode_entities = HTML::Entities->can('decode_entities');
79 goto &decode_entities;
80 } elsif ($func =~ /^d?utf8iso$/) {
81 no strict 'refs';
82 *$func = sub {
83 # Courtesy JHI
84 $_[0] =~ s{([\xC0-\xDF])([\x80-\xBF])}{chr(ord($1)<<6&0xC0|ord($2)&0x3F)}eg; #};
85 };
86 goto \&$func;
87 }
88 Carp::confess "Class WAIT::Filter::$func not found";
89 }
90
91 while (<DATA>) {
92 chomp;
93 last if /__END__/;
94 next if /^\s*#/; # there's a comment
95 $STOP{$_}++;
96 }
97
98 sub stop {
99 if (exists $STOP{$_[0]}) {
100 ''
101 } else {
102 $_[0];
103 }
104 }
105
106 sub gdate {
107 my $date = shift;
108
109 $date =~ s:(\d+)\.(\d+)\.(d+):$2/$1/$3:;
110 date($date);
111 }
112
113 1;
114 __DATA__
115 a
116 about
117 above
118 according
119 across
120 actually
121 adj
122 after
123 afterwards
124 again
125 against
126 all
127 almost
128 alone
129 along
130 already
131 also
132 although
133 always
134 among
135 amongst
136 an
137 and
138 another
139 any
140 anyhow
141 anyone
142 anything
143 anywhere
144 are
145 aren't
146 around
147 as
148 at
149 b
150 be
151 became
152 because
153 become
154 becomes
155 becoming
156 been
157 before
158 beforehand
159 begin
160 beginning
161 behind
162 being
163 below
164 beside
165 besides
166 between
167 beyond
168 billion
169 both
170 but
171 by
172 c
173 can
174 can't
175 cannot
176 caption
177 co
178 co.
179 could
180 couldn't
181 d
182 did
183 didn't
184 do
185 does
186 doesn't
187 don't
188 down
189 during
190 e
191 eg
192 eight
193 eighty
194 either
195 else
196 elsewhere
197 end
198 ending
199 enough
200 etc
201 even
202 ever
203 every
204 everyone
205 everything
206 everywhere
207 except
208 f
209 few
210 fifty
211 first
212 five
213 vfor
214 former
215 formerly
216 forty
217 found
218 four
219 from
220 further
221 g
222 h
223 had
224 has
225 hasn't
226 have
227 haven't
228 he
229 he'd
230 he'll
231 he's
232 hence
233 her
234 here
235 here's
236 hereafter
237 hereby
238 herein
239 hereupon
240 hers
241 herself
242 him
243 himself
244 his
245 how
246 however
247 hundred
248 i
249 i'd
250 i'll
251 i'm
252 i've
253 ie
254 if
255 in
256 inc.
257 indeed
258 instead
259 into
260 is
261 isn't
262 it
263 it's
264 its
265 itself
266 j
267 k
268 l
269 last
270 later
271 latter
272 latterly
273 least
274 less
275 let
276 let's
277 like
278 likely
279 ltd
280 m
281 made
282 make
283 makes
284 many
285 maybe
286 me
287 meantime
288 meanwhile
289 might
290 million
291 miss
292 more
293 moreover
294 most
295 mostly
296 mr
297 mrs
298 much
299 must
300 my
301 myself
302 n
303 namely
304 neither
305 never
306 nevertheless
307 next
308 nine
309 ninety
310 no
311 nobody
312 none
313 nonetheless
314 noone
315 nor
316 not
317 nothing
318 now
319 nowhere
320 o
321 of
322 off
323 often
324 on
325 once
326 one
327 one's
328 only
329 onto
330 or
331 other
332 others
333 otherwise
334 our
335 ours
336 ourselves
337 out
338 over
339 overall
340 own
341 p
342 per
343 perhaps
344 q
345 r
346 rather
347 recent
348 recently
349 s
350 same
351 seem
352 seemed
353 seeming
354 seems
355 seven
356 seventy
357 several
358 she
359 she'd
360 she'll
361 she's
362 should
363 shouldn't
364 since
365 six
366 sixty
367 so
368 some
369 somehow
370 someone
371 something
372 sometime
373 sometimes
374 somewhere
375 still
376 stop
377 such
378 t
379 taking
380 ten
381 than
382 that
383 that'll
384 that's
385 that've
386 the
387 their
388 them
389 themselves
390 then
391 thence
392 there
393 there'd
394 there'll
395 there're
396 there's
397 there've
398 thereafter
399 thereby
400 therefore
401 therein
402 thereupon
403 these
404 they
405 they'd
406 they'll
407 they're
408 they've
409 thirty
410 this
411 those
412 though
413 thousand
414 three
415 through
416 throughout
417 thru
418 thus
419 to
420 together
421 too
422 toward
423 towards
424 trillion
425 twenty
426 two
427 u
428 under
429 unless
430 unlike
431 unlikely
432 until
433 up
434 upon
435 us
436 used
437 using
438 v
439 very
440 via
441 w
442 was
443 wasn't
444 we
445 we'd
446 we'll
447 we're
448 we've
449 well
450 were
451 weren't
452 what
453 what'll
454 what's
455 what've
456 whatever
457 when
458 whence
459 whenever
460 where
461 where's
462 whereafter
463 whereas
464 whereby
465 wherein
466 whereupon
467 wherever
468 whether
469 which
470 while
471 whither
472 who
473 who'd
474 who'll
475 who's
476 whoever
477 whole
478 whom
479 whomever
480 whose
481 why
482 will
483 with
484 within
485 without
486 won't
487 would
488 wouldn't
489 x
490 y
491 yes
492 yet
493 you
494 you'd
495 you'll
496 you're
497 you've
498 your
499 yours
500 yourself
501 yourselves
502 z
503 # occuring in more than 100 files
504 acc
505 accent
506 accents
507 and
508 are
509 bell
510 can
511 character
512 corrections
513 crt
514 daisy
515 dash
516 date
517 defined
518 definitions
519 description
520 devices
521 diablo
522 dummy
523 factors
524 following
525 font
526 for
527 from
528 fudge
529 give
530 have
531 header
532 holds
533 log
534 logo
535 low
536 lpr
537 mark
538 name
539 nroff
540 out
541 output
542 pitch
543 put
544 rcsfile
545 reference
546 resolution
547 revision
548 see
549 set
550 simple
551 smi
552 some
553 string
554 synopsis
555 system
556 that
557 the
558 this
559 translation
560 troff
561 typewriter
562 ucb
563 unbreakable
564 use
565 used
566 user
567 vroff
568 wheel
569 will
570 with
571 you
572 __END__
573 # Below is the stub of documentation for your module. You better edit it!
574
575 =head1 NAME
576
577 WAIT::Filter - Perl extension providing the basic freeWAIS-sf reduction functions
578
579 =head1 SYNOPSIS
580
581 use WAIT::Filter qw(Stem Soundex Phonix isolc disolc isouc disouc
582 isotr disotr stop grundform);
583
584 $stem = Stem($word);
585 $scode = Soundex($word);
586 $pcode = Phonix($word);
587 $lword = isolc($word);
588 disolc($word);
589 $uword = isouc($word);
590 disouc($word);
591 $trword = isotr($word);
592 disotr($word);
593 $word = stop($word);
594 $word = grundform($word);
595
596 @words = WAIT::Filter::split($word);
597 @words = WAIT::Filter::split2($word);
598 @words = WAIT::Filter::split3($word);
599 @words = WAIT::Filter::split4($word); # arbitrary numbers allowed
600
601 =head1 DESCRIPTION
602
603 This tiny modules gives access to the basic reduction functions build
604 in B<freeWAIS-sf>.
605
606 =over 5
607
608 =item B<Stem>(I<word>)
609
610 reduces I<word> using the well know Porter algorithm.
611
612 AU: Porter, M.F.
613 TI: An Algorithm for Suffix Stripping
614 JT: Program
615 VO: 14
616 PP: 130-137
617 PY: 1980
618 PM: JUL
619
620 =item B<Soundex>(I<word>)
621
622
623 computes the 4 byte B<Soundex> code for I<word>.
624
625 AU: Gadd, T.N.
626 TI: 'Fisching for Werds'. Phonetic Retrieval of written text in
627 Information Retrieval Systems
628 JT: Program
629 VO: 22
630 NO: 3
631 PP: 222-237
632 PY: 1988
633
634
635 =item B<Phonix>(I<word>)
636
637 computes the 8 byte B<Phonix> code for I<word>.
638
639 AU: Gadd, T.N.
640 TI: PHONIX: The Algorithm
641 JT: Program
642 VO: 24
643 NO: 4
644 PP: 363-366
645 PY: 1990
646 PM: OCT
647
648 =head1 ISO charcater case functions
649
650 There are some additional function which transpose some/most ISOlatin1
651 characters to upper and lower case. To allow for maximum speed there
652 are also I<destructive> versions which change the argument instead of
653 allocating a copy which is returned. For convenience, the destructive
654 version also B<returns> the argument. So all of the following is
655 valid and C<$word> will contain the lowercased string.
656
657 $word = isolc($word);
658 $word = disolc($word);
659 disolc($word);
660
661 Here are the hardcoded characters which are recognized:
662
663 abcdefghijklmnopqrstuvwxyzàáâãäåæçèéêëìíîïñòóôõöøùúûüýß
664 ABCDEFGHIJKLMNOPQRSTUVWXYZÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÑÒÓÔÕÖØÙÚÛÜÝß
665
666 =item C<$new = >B<isolc>C<($word)>
667
668 =item B<disolc>C<($word)>
669
670 transposes to lower case.
671
672 =item C<$new = >B<isouc>C<($word)>
673
674 =item B<disouc>C<($word)>
675
676 transposes to upper case.
677
678 =item C<$new = >B<isotr>C<($word)>
679
680 =item B<disotr>C<($word)>
681
682 Remove non-letters according to the above table.
683
684 =item C<$new = >B<stop>C<($word)>
685
686 Returns an empty string if $word is a stopword.
687
688 =item C<$new = >B<grundform>C<($word)>
689
690 Calls Text::German::reduce
691
692 =item C<$new = >B<utf8iso>C<($word)>
693
694 Deprecated due to flux in perl versions between 5.005 and 5.8. The
695 function converts UTF8 encoded strings to ISO-8859-1. WAIT is
696 internally still based on the Latin1 character set, so if you process
697 anything in a different encoding, you should convert to Latin1 as the
698 first filter or refrain from using the iso-latin-1 based filter
699 functions. It is recommended that you use your own converter based on
700 the perl version you're using.
701
702 =item split, split2, split3, ...
703
704 The splitN funtions all take a scalar as input and return a list of
705 words. Split acts just like the perl split(' '). Split2 eliminates all
706 words from the list that are shorter than 2 characters (bytes), split3
707 eliminates those shorter than 3 characters (bytes) and so on.
708
709 =head1 AUTHOR
710
711 Ulrich Pfeifer E<lt>F<pfeifer@ls6.informatik.uni-dortmund.de>E<gt>
712
713 =head1 SEE ALSO
714
715 perl(1).
716
717 =cut
718

Properties

Name Value
cvs2svn:cvs-rev 1.2

  ViewVC Help
Powered by ViewVC 1.1.26