lib/WAIT/Filter.pm

#                              -*- Mode: Cperl -*- 
# $Basename: Filter.pm $
# $Revision: 1.9 $
# ITIID           : $ITI$ $Header $__Header$
# Author          : Ulrich Pfeifer
# Created On      : Thu Aug 15 18:09:51 1996
# Last Modified By: Ulrich Pfeifer
# Last Modified On: Sun Nov 22 18:44:46 1998
# Language        : CPerl
# Update Count    : 105
# Status          : Unknown, Use with caution!
#
# Copyright (c) 1996-1997, Ulrich Pfeifer
#
package WAIT::Filter;
require WAIT;
use strict;
use Carp;
use vars qw($VERSION @ISA @EXPORT_OK %STOP $SPLIT $AUTOLOAD);
use subs qw(grundform);

require Exporter;

@ISA = qw(Exporter);
@EXPORT_OK = qw(
                Stem
                Soundex
                Phonix
                Metaphone
                isolc disolc
                isouc disouc
                isotr disotr
                stop grundform
                utf8iso
               );
# (most implemented in WAIT.xs)

$VERSION = substr q$Revision: 1.9 $, 10;

sub split {
  map split(' ', $_), @_;
}

$SPLIT = q[
           sub splitXXX {
                          grep length($_)>=XXX, map split(' ', $_), @_;
                         }
          ];

sub AUTOLOAD {
  my $func = $AUTOLOAD; $func =~ s/.*:://;

  if ($func =~ /split(\d+)/) {
    my $num = $1;
    my $split = $SPLIT;

    $split =~ s/XXX/$num/g;
    eval $split;
    if ($@ eq '') {
      goto &$AUTOLOAD;
    }
  } elsif ($func eq 'grundform') {
    eval {require Text::German;};
    croak "You must have Text::German to use 'grundform'"
      if $@ ne '';
    *grundform = Text::German->can('reduce');
    goto &grundform;
  } elsif ($func eq 'date') {
    eval {require Time::ParseDate;};
    croak "You must have Time::ParseDate to use 'date'"
      if $@ ne '';
    *date = Time::ParseDate->can('parsedate');
    goto \&date;
  } elsif ($func eq 'decode_entities') {
    eval {require HTML::Entities;};
    croak "You must have HTML::Entities to use 'date'"
      if $@ ne '';
    *decode_entities = HTML::Entities->can('decode_entities');
    goto &decode_entities;
  } elsif ($func =~ /^d?utf8iso$/) {
    no strict 'refs';
    *$func = sub {
      # Courtesy JHI
      $_[0] =~ s{([\xC0-\xDF])([\x80-\xBF])}{chr(ord($1)<<6&0xC0|ord($2)&0x3F)}eg; #};
    };
    goto \&$func;
  }
  Carp::confess "Class WAIT::Filter::$func not found";
}

while (<DATA>) {
  chomp;
  last if /__END__/;
  next if /^\s*#/; # there's a comment
  $STOP{$_}++;
}

sub stop {
  if (exists $STOP{$_[0]}) {
    ''
  } else {
    $_[0];
  }
}

sub gdate {
  my $date = shift;

  $date =~ s:(\d+)\.(\d+)\.(d+):$2/$1/$3:;
  date($date);
}

1;
__DATA__
a
about
above
according
across
actually
adj
after
afterwards
again
against
all
almost
alone
along
already
also
although
always
among
amongst
an
and
another
any
anyhow
anyone
anything
anywhere
are
aren't
around
as
at
b
be
became
because
become
becomes
becoming
been
before
beforehand
begin
beginning
behind
being
below
beside
besides
between
beyond
billion
both
but
by
c
can
can't
cannot
caption
co
co.
could
couldn't
d
did
didn't
do
does
doesn't
don't
down
during
e
eg
eight
eighty
either
else
elsewhere
end
ending
enough
etc
even
ever
every
everyone
everything
everywhere
except
f
few
fifty
first
five
vfor
former
formerly
forty
found
four
from
further
g
h
had
has
hasn't
have
haven't
he
he'd
he'll
he's
hence
her
here
here's
hereafter
hereby
herein
hereupon
hers
herself
him
himself
his
how
however
hundred
i
i'd
i'll
i'm
i've
ie
if
in
inc.
indeed
instead
into
is
isn't
it
it's
its
itself
j
k
l
last
later
latter
latterly
least
less
let
let's
like
likely
ltd
m
made
make
makes
many
maybe
me
meantime
meanwhile
might
million
miss
more
moreover
most
mostly
mr
mrs
much
must
my
myself
n
namely
neither
never
nevertheless
next
nine
ninety
no
nobody
none
nonetheless
noone
nor
not
nothing
now
nowhere
o
of
off
often
on
once
one
one's
only
onto
or
other
others
otherwise
our
ours
ourselves
out
over
overall
own
p
per
perhaps
q
r
rather
recent
recently
s
same
seem
seemed
seeming
seems
seven
seventy
several
she
she'd
she'll
she's
should
shouldn't
since
six
sixty
so
some
somehow
someone
something
sometime
sometimes
somewhere
still
stop
such
t
taking
ten
than
that
that'll
that's
that've
the
their
them
themselves
then
thence
there
there'd
there'll
there're
there's
there've
thereafter
thereby
therefore
therein
thereupon
these
they
they'd
they'll
they're
they've
thirty
this
those
though
thousand
three
through
throughout
thru
thus
to
together
too
toward
towards
trillion
twenty
two
u
under
unless
unlike
unlikely
until
up
upon
us
used
using
v
very
via
w
was
wasn't
we
we'd
we'll
we're
we've
well
were
weren't
what
what'll
what's
what've
whatever
when
whence
whenever
where
where's
whereafter
whereas
whereby
wherein
whereupon
wherever
whether
which
while
whither
who
who'd
who'll
who's
whoever
whole
whom
whomever
whose
why
will
with
within
without
won't
would
wouldn't
x
y
yes
yet
you
you'd
you'll
you're
you've
your
yours
yourself
yourselves
z
# occuring in more than 100 files
acc
accent
accents
and
are
bell
can
character
corrections
crt
daisy
dash
date
defined
definitions
description
devices
diablo
dummy
factors
following
font
for
from
fudge
give
have
header
holds
log
logo
low
lpr
mark
name
nroff
out
output
pitch
put
rcsfile
reference
resolution
revision
see
set
simple
smi
some
string
synopsis
system
that
the
this
translation
troff
typewriter
ucb
unbreakable
use
used
user
vroff
wheel
will
with
you
__END__
# Below is the stub of documentation for your module. You better edit it!

=head1 NAME

WAIT::Filter - Perl extension providing the basic freeWAIS-sf reduction functions

=head1 SYNOPSIS

  use WAIT::Filter qw(Stem Soundex Phonix isolc disolc isouc disouc
                      isotr disotr stop grundform);

  $stem   = Stem($word);
  $scode  = Soundex($word);
  $pcode  = Phonix($word);
  $lword  = isolc($word);
  disolc($word);
  $uword  = isouc($word);
  disouc($word);
  $trword = isotr($word);
  disotr($word);
  $word   = stop($word);
  $word   = grundform($word);

  @words = WAIT::Filter::split($word);
  @words = WAIT::Filter::split2($word);
  @words = WAIT::Filter::split3($word);
  @words = WAIT::Filter::split4($word); # arbitrary numbers allowed

=head1 DESCRIPTION

This tiny modules gives access to the basic reduction functions build
in B<freeWAIS-sf>.

=over 5

=item B<Stem>(I<word>)

reduces I<word> using the well know Porter algorithm.

  AU: Porter, M.F.
  TI: An Algorithm for Suffix Stripping
  JT: Program
  VO: 14
  PP: 130-137
  PY: 1980
  PM: JUL

=item B<Soundex>(I<word>)


computes the 4 byte B<Soundex> code for I<word>.

  AU: Gadd, T.N.
  TI: 'Fisching for Werds'. Phonetic Retrieval of written text in
      Information Retrieval Systems
  JT: Program
  VO: 22
  NO: 3
  PP: 222-237
  PY: 1988


=item B<Phonix>(I<word>)

computes the 8 byte B<Phonix> code for I<word>.

  AU: Gadd, T.N.
  TI: PHONIX: The Algorithm
  JT: Program
  VO: 24
  NO: 4
  PP: 363-366
  PY: 1990
  PM: OCT

=head1 ISO charcater case functions

There are some additional function which transpose some/most ISOlatin1
characters to upper and lower case. To allow for maximum speed there
are also I<destructive> versions which change the argument instead of
allocating a copy which is returned. For convenience, the destructive
version also B<returns> the argument. So all of the following is
valid and C<$word> will contain the lowercased string.

  $word = isolc($word);
  $word = disolc($word);
  disolc($word);

Here are the hardcoded characters which are recognized:

  abcdefghijklmnopqrstuvwxyzàáâãäåæçèéêëìíîïñòóôõöøùúûüýß
  ABCDEFGHIJKLMNOPQRSTUVWXYZÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÑÒÓÔÕÖØÙÚÛÜÝß

=item C<$new = >B<isolc>C<($word)>

=item B<disolc>C<($word)>

transposes to lower case.

=item C<$new = >B<isouc>C<($word)>

=item  B<disouc>C<($word)>

transposes to upper case.

=item C<$new = >B<isotr>C<($word)>

=item  B<disotr>C<($word)>

Remove non-letters according to the above table.

=item C<$new = >B<stop>C<($word)>

Returns an empty string if $word is a stopword.

=item C<$new = >B<grundform>C<($word)>

Calls Text::German::reduce

=item C<$new = >B<utf8iso>C<($word)>

Deprecated due to flux in perl versions between 5.005 and 5.8. The
function converts UTF8 encoded strings to ISO-8859-1. WAIT is
internally still based on the Latin1 character set, so if you process
anything in a different encoding, you should convert to Latin1 as the
first filter or refrain from using the iso-latin-1 based filter
functions. It is recommended that you use your own converter based on
the perl version you're using.

=item split, split2, split3, ...

The splitN funtions all take a scalar as input and return a list of
words. Split acts just like the perl split(' '). Split2 eliminates all
words from the list that are shorter than 2 characters (bytes), split3
eliminates those shorter than 3 characters (bytes) and so on.

=head1 AUTHOR

Ulrich Pfeifer E<lt>F<pfeifer@ls6.informatik.uni-dortmund.de>E<gt>

=head1 SEE ALSO

perl(1).

=cut

1	# -- Mode: Cperl --
2	# $Basename: Filter.pm $
3	# $Revision: 1.9 $
4	# ITIID : $ITI$ $Header $__Header$
5	# Author : Ulrich Pfeifer
6	# Created On : Thu Aug 15 18:09:51 1996
7	# Last Modified By: Ulrich Pfeifer
8	# Last Modified On: Sun Nov 22 18:44:46 1998
9	# Language : CPerl
10	# Update Count : 105
11	# Status : Unknown, Use with caution!
12	#
13	# Copyright (c) 1996-1997, Ulrich Pfeifer
14	#
15	package WAIT::Filter;
16	require WAIT;
17	use strict;
18	use Carp;
19	use vars qw($VERSION @ISA @EXPORT_OK %STOP $SPLIT $AUTOLOAD);
20	use subs qw(grundform);
21
22	require Exporter;
23
24	@ISA = qw(Exporter);
25	@EXPORT_OK = qw(
26	Stem
27	Soundex
28	Phonix
29	Metaphone
30	isolc disolc
31	isouc disouc
32	isotr disotr
33	stop grundform
34	utf8iso
35	);
36	# (most implemented in WAIT.xs)
37
38	$VERSION = substr q$Revision: 1.9 $, 10;
39
40	sub split {
41	map split(' ', $_), @_;
42	}
43
44	$SPLIT = q[
45	sub splitXXX {
46	grep length($_)>=XXX, map split(' ', $_), @_;
47	}
48	];
49
50	sub AUTOLOAD {
51	my $func = $AUTOLOAD; $func =~ s/.*:://;
52
53	if ($func =~ /split(\d+)/) {
54	my $num = $1;
55	my $split = $SPLIT;
56
57	$split =~ s/XXX/$num/g;
58	eval $split;
59	if ($@ eq '') {
60	goto &$AUTOLOAD;
61	}
62	} elsif ($func eq 'grundform') {
63	eval {require Text::German;};
64	croak "You must have Text::German to use 'grundform'"
65	if $@ ne '';
66	*grundform = Text::German->can('reduce');
67	goto &grundform;
68	} elsif ($func eq 'date') {
69	eval {require Time::ParseDate;};
70	croak "You must have Time::ParseDate to use 'date'"
71	if $@ ne '';
72	*date = Time::ParseDate->can('parsedate');
73	goto \&date;
74	} elsif ($func eq 'decode_entities') {
75	eval {require HTML::Entities;};
76	croak "You must have HTML::Entities to use 'date'"
77	if $@ ne '';
78	*decode_entities = HTML::Entities->can('decode_entities');
79	goto &decode_entities;
80	} elsif ($func =~ /^d?utf8iso$/) {
81	no strict 'refs';
82	*$func = sub {
83	# Courtesy JHI
84	$_[0] =~ s{([\xC0-\xDF])([\x80-\xBF])}{chr(ord($1)<<6&0xC0\|ord($2)&0x3F)}eg; #};
85	};
86	goto \&$func;
87	}
88	Carp::confess "Class WAIT::Filter::$func not found";
89	}
90
91	while (<DATA>) {
92	chomp;
93	last if /__END__/;
94	next if /^\s*#/; # there's a comment
95	$STOP{$_}++;
96	}
97
98	sub stop {
99	if (exists $STOP{$_[0]}) {
100	''
101	} else {
102	$_[0];
103	}
104	}
105
106	sub gdate {
107	my $date = shift;
108
109	$date =~ s:(\d+)\.(\d+)\.(d+):$2/$1/$3:;
110	date($date);
111	}
112
113	1;
114	__DATA__
115	a
116	about
117	above
118	according
119	across
120	actually
121	adj
122	after
123	afterwards
124	again
125	against
126	all
127	almost
128	alone
129	along
130	already
131	also
132	although
133	always
134	among
135	amongst
136	an
137	and
138	another
139	any
140	anyhow
141	anyone
142	anything
143	anywhere
144	are
145	aren't
146	around
147	as
148	at
149	b
150	be
151	became
152	because
153	become
154	becomes
155	becoming
156	been
157	before
158	beforehand
159	begin
160	beginning
161	behind
162	being
163	below
164	beside
165	besides
166	between
167	beyond
168	billion
169	both
170	but
171	by
172	c
173	can
174	can't
175	cannot
176	caption
177	co
178	co.
179	could
180	couldn't
181	d
182	did
183	didn't
184	do
185	does
186	doesn't
187	don't
188	down
189	during
190	e
191	eg
192	eight
193	eighty
194	either
195	else
196	elsewhere
197	end
198	ending
199	enough
200	etc
201	even
202	ever
203	every
204	everyone
205	everything
206	everywhere
207	except
208	f
209	few
210	fifty
211	first
212	five
213	vfor
214	former
215	formerly
216	forty
217	found
218	four
219	from
220	further
221	g
222	h
223	had
224	has
225	hasn't
226	have
227	haven't
228	he
229	he'd
230	he'll
231	he's
232	hence
233	her
234	here
235	here's
236	hereafter
237	hereby
238	herein
239	hereupon
240	hers
241	herself
242	him
243	himself
244	his
245	how
246	however
247	hundred
248	i
249	i'd
250	i'll
251	i'm
252	i've
253	ie
254	if
255	in
256	inc.
257	indeed
258	instead
259	into
260	is
261	isn't
262	it
263	it's
264	its
265	itself
266	j
267	k
268	l
269	last
270	later
271	latter
272	latterly
273	least
274	less
275	let
276	let's
277	like
278	likely
279	ltd
280	m
281	made
282	make
283	makes
284	many
285	maybe
286	me
287	meantime
288	meanwhile
289	might
290	million
291	miss
292	more
293	moreover
294	most
295	mostly
296	mr
297	mrs
298	much
299	must
300	my
301	myself
302	n
303	namely
304	neither
305	never
306	nevertheless
307	next
308	nine
309	ninety
310	no
311	nobody
312	none
313	nonetheless
314	noone
315	nor
316	not
317	nothing
318	now
319	nowhere
320	o
321	of
322	off
323	often
324	on
325	once
326	one
327	one's
328	only
329	onto
330	or
331	other
332	others
333	otherwise
334	our
335	ours
336	ourselves
337	out
338	over
339	overall
340	own
341	p
342	per
343	perhaps
344	q
345	r
346	rather
347	recent
348	recently
349	s
350	same
351	seem
352	seemed
353	seeming
354	seems
355	seven
356	seventy
357	several
358	she
359	she'd
360	she'll
361	she's
362	should
363	shouldn't
364	since
365	six
366	sixty
367	so
368	some
369	somehow
370	someone
371	something
372	sometime
373	sometimes
374	somewhere
375	still
376	stop
377	such
378	t
379	taking
380	ten
381	than
382	that
383	that'll
384	that's
385	that've
386	the
387	their
388	them
389	themselves
390	then
391	thence
392	there
393	there'd
394	there'll
395	there're
396	there's
397	there've
398	thereafter
399	thereby
400	therefore
401	therein
402	thereupon
403	these
404	they
405	they'd
406	they'll
407	they're
408	they've
409	thirty
410	this
411	those
412	though
413	thousand
414	three
415	through
416	throughout
417	thru
418	thus
419	to
420	together
421	too
422	toward
423	towards
424	trillion
425	twenty
426	two
427	u
428	under
429	unless
430	unlike
431	unlikely
432	until
433	up
434	upon
435	us
436	used
437	using
438	v
439	very
440	via
441	w
442	was
443	wasn't
444	we
445	we'd
446	we'll
447	we're
448	we've
449	well
450	were
451	weren't
452	what
453	what'll
454	what's
455	what've
456	whatever
457	when
458	whence
459	whenever
460	where
461	where's
462	whereafter
463	whereas
464	whereby
465	wherein
466	whereupon
467	wherever
468	whether
469	which
470	while
471	whither
472	who
473	who'd
474	who'll
475	who's
476	whoever
477	whole
478	whom
479	whomever
480	whose
481	why
482	will
483	with
484	within
485	without
486	won't
487	would
488	wouldn't
489	x
490	y
491	yes
492	yet
493	you
494	you'd
495	you'll
496	you're
497	you've
498	your
499	yours
500	yourself
501	yourselves
502	z
503	# occuring in more than 100 files
504	acc
505	accent
506	accents
507	and
508	are
509	bell
510	can
511	character
512	corrections
513	crt
514	daisy
515	dash
516	date
517	defined
518	definitions
519	description
520	devices
521	diablo
522	dummy
523	factors
524	following
525	font
526	for
527	from
528	fudge
529	give
530	have
531	header
532	holds
533	log
534	logo
535	low
536	lpr
537	mark
538	name
539	nroff
540	out
541	output
542	pitch
543	put
544	rcsfile
545	reference
546	resolution
547	revision
548	see
549	set
550	simple
551	smi
552	some
553	string
554	synopsis
555	system
556	that
557	the
558	this
559	translation
560	troff
561	typewriter
562	ucb
563	unbreakable
564	use
565	used
566	user
567	vroff
568	wheel
569	will
570	with
571	you
572	__END__
573	# Below is the stub of documentation for your module. You better edit it!
574
575	=head1 NAME
576
577	WAIT::Filter - Perl extension providing the basic freeWAIS-sf reduction functions
578
579	=head1 SYNOPSIS
580
581	use WAIT::Filter qw(Stem Soundex Phonix isolc disolc isouc disouc
582	isotr disotr stop grundform);
583
584	$stem = Stem($word);
585	$scode = Soundex($word);
586	$pcode = Phonix($word);
587	$lword = isolc($word);
588	disolc($word);
589	$uword = isouc($word);
590	disouc($word);
591	$trword = isotr($word);
592	disotr($word);
593	$word = stop($word);
594	$word = grundform($word);
595
596	@words = WAIT::Filter::split($word);
597	@words = WAIT::Filter::split2($word);
598	@words = WAIT::Filter::split3($word);
599	@words = WAIT::Filter::split4($word); # arbitrary numbers allowed
600
601	=head1 DESCRIPTION
602
603	This tiny modules gives access to the basic reduction functions build
604	in B<freeWAIS-sf>.
605
606	=over 5
607
608	=item B<Stem>(I<word>)
609
610	reduces I<word> using the well know Porter algorithm.
611
612	AU: Porter, M.F.
613	TI: An Algorithm for Suffix Stripping
614	JT: Program
615	VO: 14
616	PP: 130-137
617	PY: 1980
618	PM: JUL
619
620	=item B<Soundex>(I<word>)
621
622
623	computes the 4 byte B<Soundex> code for I<word>.
624
625	AU: Gadd, T.N.
626	TI: 'Fisching for Werds'. Phonetic Retrieval of written text in
627	Information Retrieval Systems
628	JT: Program
629	VO: 22
630	NO: 3
631	PP: 222-237
632	PY: 1988
633
634
635	=item B<Phonix>(I<word>)
636
637	computes the 8 byte B<Phonix> code for I<word>.
638
639	AU: Gadd, T.N.
640	TI: PHONIX: The Algorithm
641	JT: Program
642	VO: 24
643	NO: 4
644	PP: 363-366
645	PY: 1990
646	PM: OCT
647
648	=head1 ISO charcater case functions
649
650	There are some additional function which transpose some/most ISOlatin1
651	characters to upper and lower case. To allow for maximum speed there
652	are also I<destructive> versions which change the argument instead of
653	allocating a copy which is returned. For convenience, the destructive
654	version also B<returns> the argument. So all of the following is
655	valid and C<$word> will contain the lowercased string.
656
657	$word = isolc($word);
658	$word = disolc($word);
659	disolc($word);
660
661	Here are the hardcoded characters which are recognized:
662
663	abcdefghijklmnopqrstuvwxyzàáâãäåæçèéêëìíîïñòóôõöøùúûüýß
664	ABCDEFGHIJKLMNOPQRSTUVWXYZÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÑÒÓÔÕÖØÙÚÛÜÝß
665
666	=item C<$new = >B<isolc>C<($word)>
667
668	=item B<disolc>C<($word)>
669
670	transposes to lower case.
671
672	=item C<$new = >B<isouc>C<($word)>
673
674	=item B<disouc>C<($word)>
675
676	transposes to upper case.
677
678	=item C<$new = >B<isotr>C<($word)>
679
680	=item B<disotr>C<($word)>
681
682	Remove non-letters according to the above table.
683
684	=item C<$new = >B<stop>C<($word)>
685
686	Returns an empty string if $word is a stopword.
687
688	=item C<$new = >B<grundform>C<($word)>
689
690	Calls Text::German::reduce
691
692	=item C<$new = >B<utf8iso>C<($word)>
693
694	Deprecated due to flux in perl versions between 5.005 and 5.8. The
695	function converts UTF8 encoded strings to ISO-8859-1. WAIT is
696	internally still based on the Latin1 character set, so if you process
697	anything in a different encoding, you should convert to Latin1 as the
698	first filter or refrain from using the iso-latin-1 based filter
699	functions. It is recommended that you use your own converter based on
700	the perl version you're using.
701
702	=item split, split2, split3, ...
703
704	The splitN funtions all take a scalar as input and return a list of
705	words. Split acts just like the perl split(' '). Split2 eliminates all
706	words from the list that are shorter than 2 characters (bytes), split3
707	eliminates those shorter than 3 characters (bytes) and so on.
708
709	=head1 AUTHOR
710
711	Ulrich Pfeifer E<lt>F<pfeifer@ls6.informatik.uni-dortmund.de>E<gt>
712
713	=head1 SEE ALSO
714
715	perl(1).
716
717	=cut
718