lib/WAIT/Filter.pm

#                              -*- Mode: Cperl -*- 
# $Basename: Filter.pm $
# $Revision: 1.8 $
# ITIID           : $ITI$ $Header $__Header$
# Author          : Ulrich Pfeifer
# Created On      : Thu Aug 15 18:09:51 1996
# Last Modified By: Ulrich Pfeifer
# Last Modified On: Sun Nov 22 18:44:46 1998
# Language        : CPerl
# Update Count    : 105
# Status          : Unknown, Use with caution!
#
# Copyright (c) 1996-1997, Ulrich Pfeifer
#
package WAIT::Filter;
require WAIT;
use strict;
use Carp;
use vars qw($VERSION @ISA @EXPORT_OK %STOP $SPLIT $AUTOLOAD);
use subs qw(grundform);

require Exporter;

@ISA = qw(Exporter);
@EXPORT_OK = qw(
                Stem
                Soundex
                Phonix
                Metaphone
                isolc disolc
                isouc disouc
                isotr disotr
                stop grundform
                utf8iso
               );
# (most implemented in WAIT.xs)

$VERSION = substr q$Revision: 1.8 $, 10;

sub split {
  map split(' ', $_), @_;
}

$SPLIT = q[
           sub splitXXX {
                          grep length($_)>=XXX, map split(' ', $_), @_;
                         }
          ];

sub AUTOLOAD {
  my $func = $AUTOLOAD; $func =~ s/.*:://;

  if ($func =~ /split(\d+)/) {
    my $num = $1;
    my $split = $SPLIT;

    $split =~ s/XXX/$num/g;
    eval $split;
    if ($@ eq '') {
      goto &$AUTOLOAD;
    }
  } elsif ($func eq 'grundform') {
    eval {require Text::German;};
    croak "You must have Text::German to use 'grundform'"
      if $@ ne '';
    *grundform = Text::German->can('reduce');
    goto &grundform;
  } elsif ($func eq 'date') {
    eval {require Time::ParseDate;};
    croak "You must have Time::ParseDate to use 'date'"
      if $@ ne '';
    *date = Time::ParseDate->can('parsedate');
    goto \&date;
  } elsif ($func eq 'decode_entities') {
    eval {require HTML::Entities;};
    croak "You must have HTML::Entities to use 'date'"
      if $@ ne '';
    *decode_entities = HTML::Entities->can('decode_entities');
    goto &decode_entities;
  } elsif ($func =~ /^d?utf8iso$/) {
    require WAIT::Filter::utf8iso;
    croak "Your perl version must at least be 5.00556 to use '$func'"
        if $] < 5.00556;
    no strict 'refs';
    *$func = \&{"WAIT::Filter::utf8iso::$func"};
    goto &utf8iso;
  }
  Carp::confess "Class WAIT::Filter::$func not found";
}

while (<DATA>) {
  chomp;
  last if /__END__/;
  $STOP{$_}++;
}

sub stop {
  if (exists $STOP{$_[0]}) {
    ''
  } else {
    $_[0];
  }
}

sub gdate {
  my $date = shift;

  $date =~ s:(\d+)\.(\d+)\.(d+):$2/$1/$3:;
  date($date);
}

1;
__DATA__
a
about
above
according
across
actually
adj
after
afterwards
again
against
all
almost
alone
along
already
also
although
always
among
amongst
an
and
another
any
anyhow
anyone
anything
anywhere
are
aren't
around
as
at
b
be
became
because
become
becomes
becoming
been
before
beforehand
begin
beginning
behind
being
below
beside
besides
between
beyond
billion
both
but
by
c
can
can't
cannot
caption
co
co.
could
couldn't
d
did
didn't
do
does
doesn't
don't
down
during
e
eg
eight
eighty
either
else
elsewhere
end
ending
enough
etc
even
ever
every
everyone
everything
everywhere
except
f
few
fifty
first
five
vfor
former
formerly
forty
found
four
from
further
g
h
had
has
hasn't
have
haven't
he
he'd
he'll
he's
hence
her
here
here's
hereafter
hereby
herein
hereupon
hers
herself
him
himself
his
how
however
hundred
i
i'd
i'll
i'm
i've
ie
if
in
inc.
indeed
instead
into
is
isn't
it
it's
its
itself
j
k
l
last
later
latter
latterly
least
less
let
let's
like
likely
ltd
m
made
make
makes
many
maybe
me
meantime
meanwhile
might
million
miss
more
moreover
most
mostly
mr
mrs
much
must
my
myself
n
namely
neither
never
nevertheless
next
nine
ninety
no
nobody
none
nonetheless
noone
nor
not
nothing
now
nowhere
o
of
off
often
on
once
one
one's
only
onto
or
other
others
otherwise
our
ours
ourselves
out
over
overall
own
p
per
perhaps
q
r
rather
recent
recently
s
same
seem
seemed
seeming
seems
seven
seventy
several
she
she'd
she'll
she's
should
shouldn't
since
six
sixty
so
some
somehow
someone
something
sometime
sometimes
somewhere
still
stop
such
t
taking
ten
than
that
that'll
that's
that've
the
their
them
themselves
then
thence
there
there'd
there'll
there're
there's
there've
thereafter
thereby
therefore
therein
thereupon
these
they
they'd
they'll
they're
they've
thirty
this
those
though
thousand
three
through
throughout
thru
thus
to
together
too
toward
towards
trillion
twenty
two
u
under
unless
unlike
unlikely
until
up
upon
us
used
using
v
very
via
w
was
wasn't
we
we'd
we'll
we're
we've
well
were
weren't
what
what'll
what's
what've
whatever
when
whence
whenever
where
where's
whereafter
whereas
whereby
wherein
whereupon
wherever
whether
which
while
whither
who
who'd
who'll
who's
whoever
whole
whom
whomever
whose
why
will
with
within
without
won't
would
wouldn't
x
y
yes
yet
you
you'd
you'll
you're
you've
your
yours
yourself
yourselves
z
# occuring in more than 100 files
acc
accent
accents
and
are
bell
can
character
corrections
crt
daisy
dash
date
defined
definitions
description
devices
diablo
dummy
factors
following
font
for
from
fudge
give
have
header
holds
log
logo
low
lpr
mark
name
nroff
out
output
pitch
put
rcsfile
reference
resolution
revision
see
set
simple
smi
some
string
synopsis
system
that
the
this
translation
troff
typewriter
ucb
unbreakable
use
used
user
vroff
wheel
will
with
you
__END__
# Below is the stub of documentation for your module. You better edit it!

=head1 NAME

WAIT::Filter - Perl extension providing the basic freeWAIS-sf reduction functions

=head1 SYNOPSIS

  use WAIT::Filter qw(Stem Soundex Phonix isolc disolc isouc disouc
                      isotr disotr stop grundform utf8iso);

  $stem   = Stem($word);
  $scode  = Soundex($word);
  $pcode  = Phonix($word);
  $lword  = isolc($word);
  disolc($word);
  $uword  = isouc($word);
  disouc($word);
  $trword = isotr($word);
  disotr($word);
  $word   = stop($word);
  $word   = grundform($word);

  @words = WAIT::Filter::split($word);
  @words = WAIT::Filter::split2($word);
  @words = WAIT::Filter::split3($word);
  @words = WAIT::Filter::split4($word); # arbitrary numbers allowed

=head1 DESCRIPTION

This tiny modules gives access to the basic reduction functions build
in B<freeWAIS-sf>.

=over 5

=item B<Stem>(I<word>)

reduces I<word> using the well know Porter algorithm.

  AU: Porter, M.F.
  TI: An Algorithm for Suffix Stripping
  JT: Program
  VO: 14
  PP: 130-137
  PY: 1980
  PM: JUL

=item B<Soundex>(I<word>)


computes the 4 byte B<Soundex> code for I<word>.

  AU: Gadd, T.N.
  TI: 'Fisching for Werds'. Phonetic Retrieval of written text in
      Information Retrieval Systems
  JT: Program
  VO: 22
  NO: 3
  PP: 222-237
  PY: 1988


=item B<Phonix>(I<word>)

computes the 8 byte B<Phonix> code for I<word>.

  AU: Gadd, T.N.
  TI: PHONIX: The Algorithm
  JT: Program
  VO: 24
  NO: 4
  PP: 363-366
  PY: 1990
  PM: OCT

=head1 ISO charcater case functions

There are some additional function which transpose some/most ISOlatin1
characters to upper and lower case. To allow for maximum speed there
are also I<destructive> versions which change the argument instead of
allocating a copy which is returned. For convenience, the destructive
version also B<returns> the argument. So all of the following is
valid and C<$word> will contain the lowercased string.

  $word = isolc($word);
  $word = disolc($word);
  disolc($word);

Here are the hardcoded characters which are recognized:

  abcdefghijklmnopqrstuvwxyzàáâãäåæçèéêëìíîïñòóôõöøùúûüýß
  ABCDEFGHIJKLMNOPQRSTUVWXYZÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÑÒÓÔÕÖØÙÚÛÜÝß

=item C<$new = >B<isolc>C<($word)>

=item B<disolc>C<($word)>

transposes to lower case.

=item C<$new = >B<isouc>C<($word)>

=item  B<disouc>C<($word)>

transposes to upper case.

=item C<$new = >B<isotr>C<($word)>

=item  B<disotr>C<($word)>

Remove non-letters according to the above table.

=item C<$new = >B<stop>C<($word)>

Returns an empty string if $word is a stopword.

=item C<$new = >B<grundform>C<($word)>

Calls Text::German::reduce

=item C<$new = >B<utf8iso>C<($word)>

Convert UTF8 encoded strings to ISO-8859-1. WAIT currently is
internally based on the Latin1 character set, so if you process
anything in a different encoding, you should convert to Latin1 as the
first filter.

=item split, split2, split3, ...

The splitN funtions all take a scalar as input and return a list of
words. Split acts just like the perl split(' '). Split2 eliminates all
words from the list that are shorter than 2 characters (bytes), split3
eliminates those shorter than 3 characters (bytes) and so on.

=head1 AUTHOR

Ulrich Pfeifer E<lt>F<pfeifer@ls6.informatik.uni-dortmund.de>E<gt>

=head1 SEE ALSO

perl(1).

=cut

1	# -- Mode: Cperl --
2	# $Basename: Filter.pm $
3	# $Revision: 1.8 $
4	# ITIID : $ITI$ $Header $__Header$
5	# Author : Ulrich Pfeifer
6	# Created On : Thu Aug 15 18:09:51 1996
7	# Last Modified By: Ulrich Pfeifer
8	# Last Modified On: Sun Nov 22 18:44:46 1998
9	# Language : CPerl
10	# Update Count : 105
11	# Status : Unknown, Use with caution!
12	#
13	# Copyright (c) 1996-1997, Ulrich Pfeifer
14	#
15	package WAIT::Filter;
16	require WAIT;
17	use strict;
18	use Carp;
19	use vars qw($VERSION @ISA @EXPORT_OK %STOP $SPLIT $AUTOLOAD);
20	use subs qw(grundform);
21
22	require Exporter;
23
24	@ISA = qw(Exporter);
25	@EXPORT_OK = qw(
26	Stem
27	Soundex
28	Phonix
29	Metaphone
30	isolc disolc
31	isouc disouc
32	isotr disotr
33	stop grundform
34	utf8iso
35	);
36	# (most implemented in WAIT.xs)
37
38	$VERSION = substr q$Revision: 1.8 $, 10;
39
40	sub split {
41	map split(' ', $_), @_;
42	}
43
44	$SPLIT = q[
45	sub splitXXX {
46	grep length($_)>=XXX, map split(' ', $_), @_;
47	}
48	];
49
50	sub AUTOLOAD {
51	my $func = $AUTOLOAD; $func =~ s/.*:://;
52
53	if ($func =~ /split(\d+)/) {
54	my $num = $1;
55	my $split = $SPLIT;
56
57	$split =~ s/XXX/$num/g;
58	eval $split;
59	if ($@ eq '') {
60	goto &$AUTOLOAD;
61	}
62	} elsif ($func eq 'grundform') {
63	eval {require Text::German;};
64	croak "You must have Text::German to use 'grundform'"
65	if $@ ne '';
66	*grundform = Text::German->can('reduce');
67	goto &grundform;
68	} elsif ($func eq 'date') {
69	eval {require Time::ParseDate;};
70	croak "You must have Time::ParseDate to use 'date'"
71	if $@ ne '';
72	*date = Time::ParseDate->can('parsedate');
73	goto \&date;
74	} elsif ($func eq 'decode_entities') {
75	eval {require HTML::Entities;};
76	croak "You must have HTML::Entities to use 'date'"
77	if $@ ne '';
78	*decode_entities = HTML::Entities->can('decode_entities');
79	goto &decode_entities;
80	} elsif ($func =~ /^d?utf8iso$/) {
81	require WAIT::Filter::utf8iso;
82	croak "Your perl version must at least be 5.00556 to use '$func'"
83	if $] < 5.00556;
84	no strict 'refs';
85	*$func = \&{"WAIT::Filter::utf8iso::$func"};
86	goto &utf8iso;
87	}
88	Carp::confess "Class WAIT::Filter::$func not found";
89	}
90
91	while (<DATA>) {
92	chomp;
93	last if /__END__/;
94	$STOP{$_}++;
95	}
96
97	sub stop {
98	if (exists $STOP{$_[0]}) {
99	''
100	} else {
101	$_[0];
102	}
103	}
104
105	sub gdate {
106	my $date = shift;
107
108	$date =~ s:(\d+)\.(\d+)\.(d+):$2/$1/$3:;
109	date($date);
110	}
111
112	1;
113	__DATA__
114	a
115	about
116	above
117	according
118	across
119	actually
120	adj
121	after
122	afterwards
123	again
124	against
125	all
126	almost
127	alone
128	along
129	already
130	also
131	although
132	always
133	among
134	amongst
135	an
136	and
137	another
138	any
139	anyhow
140	anyone
141	anything
142	anywhere
143	are
144	aren't
145	around
146	as
147	at
148	b
149	be
150	became
151	because
152	become
153	becomes
154	becoming
155	been
156	before
157	beforehand
158	begin
159	beginning
160	behind
161	being
162	below
163	beside
164	besides
165	between
166	beyond
167	billion
168	both
169	but
170	by
171	c
172	can
173	can't
174	cannot
175	caption
176	co
177	co.
178	could
179	couldn't
180	d
181	did
182	didn't
183	do
184	does
185	doesn't
186	don't
187	down
188	during
189	e
190	eg
191	eight
192	eighty
193	either
194	else
195	elsewhere
196	end
197	ending
198	enough
199	etc
200	even
201	ever
202	every
203	everyone
204	everything
205	everywhere
206	except
207	f
208	few
209	fifty
210	first
211	five
212	vfor
213	former
214	formerly
215	forty
216	found
217	four
218	from
219	further
220	g
221	h
222	had
223	has
224	hasn't
225	have
226	haven't
227	he
228	he'd
229	he'll
230	he's
231	hence
232	her
233	here
234	here's
235	hereafter
236	hereby
237	herein
238	hereupon
239	hers
240	herself
241	him
242	himself
243	his
244	how
245	however
246	hundred
247	i
248	i'd
249	i'll
250	i'm
251	i've
252	ie
253	if
254	in
255	inc.
256	indeed
257	instead
258	into
259	is
260	isn't
261	it
262	it's
263	its
264	itself
265	j
266	k
267	l
268	last
269	later
270	latter
271	latterly
272	least
273	less
274	let
275	let's
276	like
277	likely
278	ltd
279	m
280	made
281	make
282	makes
283	many
284	maybe
285	me
286	meantime
287	meanwhile
288	might
289	million
290	miss
291	more
292	moreover
293	most
294	mostly
295	mr
296	mrs
297	much
298	must
299	my
300	myself
301	n
302	namely
303	neither
304	never
305	nevertheless
306	next
307	nine
308	ninety
309	no
310	nobody
311	none
312	nonetheless
313	noone
314	nor
315	not
316	nothing
317	now
318	nowhere
319	o
320	of
321	off
322	often
323	on
324	once
325	one
326	one's
327	only
328	onto
329	or
330	other
331	others
332	otherwise
333	our
334	ours
335	ourselves
336	out
337	over
338	overall
339	own
340	p
341	per
342	perhaps
343	q
344	r
345	rather
346	recent
347	recently
348	s
349	same
350	seem
351	seemed
352	seeming
353	seems
354	seven
355	seventy
356	several
357	she
358	she'd
359	she'll
360	she's
361	should
362	shouldn't
363	since
364	six
365	sixty
366	so
367	some
368	somehow
369	someone
370	something
371	sometime
372	sometimes
373	somewhere
374	still
375	stop
376	such
377	t
378	taking
379	ten
380	than
381	that
382	that'll
383	that's
384	that've
385	the
386	their
387	them
388	themselves
389	then
390	thence
391	there
392	there'd
393	there'll
394	there're
395	there's
396	there've
397	thereafter
398	thereby
399	therefore
400	therein
401	thereupon
402	these
403	they
404	they'd
405	they'll
406	they're
407	they've
408	thirty
409	this
410	those
411	though
412	thousand
413	three
414	through
415	throughout
416	thru
417	thus
418	to
419	together
420	too
421	toward
422	towards
423	trillion
424	twenty
425	two
426	u
427	under
428	unless
429	unlike
430	unlikely
431	until
432	up
433	upon
434	us
435	used
436	using
437	v
438	very
439	via
440	w
441	was
442	wasn't
443	we
444	we'd
445	we'll
446	we're
447	we've
448	well
449	were
450	weren't
451	what
452	what'll
453	what's
454	what've
455	whatever
456	when
457	whence
458	whenever
459	where
460	where's
461	whereafter
462	whereas
463	whereby
464	wherein
465	whereupon
466	wherever
467	whether
468	which
469	while
470	whither
471	who
472	who'd
473	who'll
474	who's
475	whoever
476	whole
477	whom
478	whomever
479	whose
480	why
481	will
482	with
483	within
484	without
485	won't
486	would
487	wouldn't
488	x
489	y
490	yes
491	yet
492	you
493	you'd
494	you'll
495	you're
496	you've
497	your
498	yours
499	yourself
500	yourselves
501	z
502	# occuring in more than 100 files
503	acc
504	accent
505	accents
506	and
507	are
508	bell
509	can
510	character
511	corrections
512	crt
513	daisy
514	dash
515	date
516	defined
517	definitions
518	description
519	devices
520	diablo
521	dummy
522	factors
523	following
524	font
525	for
526	from
527	fudge
528	give
529	have
530	header
531	holds
532	log
533	logo
534	low
535	lpr
536	mark
537	name
538	nroff
539	out
540	output
541	pitch
542	put
543	rcsfile
544	reference
545	resolution
546	revision
547	see
548	set
549	simple
550	smi
551	some
552	string
553	synopsis
554	system
555	that
556	the
557	this
558	translation
559	troff
560	typewriter
561	ucb
562	unbreakable
563	use
564	used
565	user
566	vroff
567	wheel
568	will
569	with
570	you
571	__END__
572	# Below is the stub of documentation for your module. You better edit it!
573
574	=head1 NAME
575
576	WAIT::Filter - Perl extension providing the basic freeWAIS-sf reduction functions
577
578	=head1 SYNOPSIS
579
580	use WAIT::Filter qw(Stem Soundex Phonix isolc disolc isouc disouc
581	isotr disotr stop grundform utf8iso);
582
583	$stem = Stem($word);
584	$scode = Soundex($word);
585	$pcode = Phonix($word);
586	$lword = isolc($word);
587	disolc($word);
588	$uword = isouc($word);
589	disouc($word);
590	$trword = isotr($word);
591	disotr($word);
592	$word = stop($word);
593	$word = grundform($word);
594
595	@words = WAIT::Filter::split($word);
596	@words = WAIT::Filter::split2($word);
597	@words = WAIT::Filter::split3($word);
598	@words = WAIT::Filter::split4($word); # arbitrary numbers allowed
599
600	=head1 DESCRIPTION
601
602	This tiny modules gives access to the basic reduction functions build
603	in B<freeWAIS-sf>.
604
605	=over 5
606
607	=item B<Stem>(I<word>)
608
609	reduces I<word> using the well know Porter algorithm.
610
611	AU: Porter, M.F.
612	TI: An Algorithm for Suffix Stripping
613	JT: Program
614	VO: 14
615	PP: 130-137
616	PY: 1980
617	PM: JUL
618
619	=item B<Soundex>(I<word>)
620
621
622	computes the 4 byte B<Soundex> code for I<word>.
623
624	AU: Gadd, T.N.
625	TI: 'Fisching for Werds'. Phonetic Retrieval of written text in
626	Information Retrieval Systems
627	JT: Program
628	VO: 22
629	NO: 3
630	PP: 222-237
631	PY: 1988
632
633
634	=item B<Phonix>(I<word>)
635
636	computes the 8 byte B<Phonix> code for I<word>.
637
638	AU: Gadd, T.N.
639	TI: PHONIX: The Algorithm
640	JT: Program
641	VO: 24
642	NO: 4
643	PP: 363-366
644	PY: 1990
645	PM: OCT
646
647	=head1 ISO charcater case functions
648
649	There are some additional function which transpose some/most ISOlatin1
650	characters to upper and lower case. To allow for maximum speed there
651	are also I<destructive> versions which change the argument instead of
652	allocating a copy which is returned. For convenience, the destructive
653	version also B<returns> the argument. So all of the following is
654	valid and C<$word> will contain the lowercased string.
655
656	$word = isolc($word);
657	$word = disolc($word);
658	disolc($word);
659
660	Here are the hardcoded characters which are recognized:
661
662	abcdefghijklmnopqrstuvwxyzàáâãäåæçèéêëìíîïñòóôõöøùúûüýß
663	ABCDEFGHIJKLMNOPQRSTUVWXYZÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÑÒÓÔÕÖØÙÚÛÜÝß
664
665	=item C<$new = >B<isolc>C<($word)>
666
667	=item B<disolc>C<($word)>
668
669	transposes to lower case.
670
671	=item C<$new = >B<isouc>C<($word)>
672
673	=item B<disouc>C<($word)>
674
675	transposes to upper case.
676
677	=item C<$new = >B<isotr>C<($word)>
678
679	=item B<disotr>C<($word)>
680
681	Remove non-letters according to the above table.
682
683	=item C<$new = >B<stop>C<($word)>
684
685	Returns an empty string if $word is a stopword.
686
687	=item C<$new = >B<grundform>C<($word)>
688
689	Calls Text::German::reduce
690
691	=item C<$new = >B<utf8iso>C<($word)>
692
693	Convert UTF8 encoded strings to ISO-8859-1. WAIT currently is
694	internally based on the Latin1 character set, so if you process
695	anything in a different encoding, you should convert to Latin1 as the
696	first filter.
697
698	=item split, split2, split3, ...
699
700	The splitN funtions all take a scalar as input and return a list of
701	words. Split acts just like the perl split(' '). Split2 eliminates all
702	words from the list that are shorter than 2 characters (bytes), split3
703	eliminates those shorter than 3 characters (bytes) and so on.
704
705	=head1 AUTHOR
706
707	Ulrich Pfeifer E<lt>F<pfeifer@ls6.informatik.uni-dortmund.de>E<gt>
708
709	=head1 SEE ALSO
710
711	perl(1).
712
713	=cut
714