lib/WAIT/Filter.pm

#                              -*- Mode: Cperl -*- 
# $Basename: Filter.pm $
# $Revision: 1.9 $
# ITIID           : $ITI$ $Header $__Header$
# Author          : Ulrich Pfeifer
# Created On      : Thu Aug 15 18:09:51 1996
# Last Modified By: Ulrich Pfeifer
# Last Modified On: Sun Nov 22 18:44:46 1998
# Language        : CPerl
# Update Count    : 105
# Status          : Unknown, Use with caution!
#
# Copyright (c) 1996-1997, Ulrich Pfeifer
#
package WAIT::Filter;
require WAIT;
use strict;
use Carp;
use vars qw($VERSION @ISA @EXPORT_OK %STOP $SPLIT $AUTOLOAD);
use subs qw(grundform);

require Exporter;

@ISA = qw(Exporter);
@EXPORT_OK = qw(
                Stem
                Soundex
                Phonix
                Metaphone
                isolc disolc
                isouc disouc
                isotr disotr
                stop grundform
                utf8iso
               );
# (most implemented in WAIT.xs)

$VERSION = substr q$Revision: 1.9 $, 10;

sub split {
  map split(' ', $_), @_;
}

$SPLIT = q[
           sub splitXXX {
                          grep length($_)>=XXX, map split(' ', $_), @_;
                         }
          ];

sub AUTOLOAD {
  my $func = $AUTOLOAD; $func =~ s/.*:://;

  if ($func =~ /split(\d+)/) {
    my $num = $1;
    my $split = $SPLIT;

    $split =~ s/XXX/$num/g;
    eval $split;
    if ($@ eq '') {
      goto &$AUTOLOAD;
    }
  } elsif ($func eq 'grundform') {
    eval {require Text::German;};
    croak "You must have Text::German to use 'grundform'"
      if $@ ne '';
    *grundform = Text::German->can('reduce');
    goto &grundform;
  } elsif ($func eq 'date') {
    eval {require Time::ParseDate;};
    croak "You must have Time::ParseDate to use 'date'"
      if $@ ne '';
    *date = Time::ParseDate->can('parsedate');
    goto \&date;
  } elsif ($func eq 'decode_entities') {
    eval {require HTML::Entities;};
    croak "You must have HTML::Entities to use 'date'"
      if $@ ne '';
    *decode_entities = HTML::Entities->can('decode_entities');
    goto &decode_entities;
  } elsif ($func =~ /^d?utf8iso$/) {
    no strict 'refs';
    *$func = sub {
      # Courtesy JHI
      my $s = shift;
      $s =~ s{([\xC0-\xDF])([\x80-\xBF])}
             {chr(ord($1)<<6&0xC0|ord($2)&0x3F)}eg;
      $s;
    };
    goto \&$func;
  }
  Carp::confess "Class WAIT::Filter::$func not found";
}

while (<DATA>) {
  chomp;
  last if /__END__/;
  next if /^\s*#/; # there's a comment
  $STOP{$_}++;
}
close DATA;

sub stop {
  if (exists $STOP{$_[0]}) {
    ''
  } else {
    $_[0];
  }
}

sub gdate {
  my $date = shift;

  $date =~ s:(\d+)\.(\d+)\.(d+):$2/$1/$3:;
  date($date);
}

1;
__DATA__
a
about
above
according
across
actually
adj
after
afterwards
again
against
all
almost
alone
along
already
also
although
always
among
amongst
an
and
another
any
anyhow
anyone
anything
anywhere
are
aren't
around
as
at
b
be
became
because
become
becomes
becoming
been
before
beforehand
begin
beginning
behind
being
below
beside
besides
between
beyond
billion
both
but
by
c
can
can't
cannot
caption
co
co.
could
couldn't
d
did
didn't
do
does
doesn't
don't
down
during
e
eg
eight
eighty
either
else
elsewhere
end
ending
enough
etc
even
ever
every
everyone
everything
everywhere
except
f
few
fifty
first
five
vfor
former
formerly
forty
found
four
from
further
g
h
had
has
hasn't
have
haven't
he
he'd
he'll
he's
hence
her
here
here's
hereafter
hereby
herein
hereupon
hers
herself
him
himself
his
how
however
hundred
i
i'd
i'll
i'm
i've
ie
if
in
inc.
indeed
instead
into
is
isn't
it
it's
its
itself
j
k
l
last
later
latter
latterly
least
less
let
let's
like
likely
ltd
m
made
make
makes
many
maybe
me
meantime
meanwhile
might
million
miss
more
moreover
most
mostly
mr
mrs
much
must
my
myself
n
namely
neither
never
nevertheless
next
nine
ninety
no
nobody
none
nonetheless
noone
nor
not
nothing
now
nowhere
o
of
off
often
on
once
one
one's
only
onto
or
other
others
otherwise
our
ours
ourselves
out
over
overall
own
p
per
perhaps
q
r
rather
recent
recently
s
same
seem
seemed
seeming
seems
seven
seventy
several
she
she'd
she'll
she's
should
shouldn't
since
six
sixty
so
some
somehow
someone
something
sometime
sometimes
somewhere
still
stop
such
t
taking
ten
than
that
that'll
that's
that've
the
their
them
themselves
then
thence
there
there'd
there'll
there're
there's
there've
thereafter
thereby
therefore
therein
thereupon
these
they
they'd
they'll
they're
they've
thirty
this
those
though
thousand
three
through
throughout
thru
thus
to
together
too
toward
towards
trillion
twenty
two
u
under
unless
unlike
unlikely
until
up
upon
us
used
using
v
very
via
w
was
wasn't
we
we'd
we'll
we're
we've
well
were
weren't
what
what'll
what's
what've
whatever
when
whence
whenever
where
where's
whereafter
whereas
whereby
wherein
whereupon
wherever
whether
which
while
whither
who
who'd
who'll
who's
whoever
whole
whom
whomever
whose
why
will
with
within
without
won't
would
wouldn't
x
y
yes
yet
you
you'd
you'll
you're
you've
your
yours
yourself
yourselves
z
# occuring in more than 100 files
acc
accent
accents
and
are
bell
can
character
corrections
crt
daisy
dash
date
defined
definitions
description
devices
diablo
dummy
factors
following
font
for
from
fudge
give
have
header
holds
log
logo
low
lpr
mark
name
nroff
out
output
pitch
put
rcsfile
reference
resolution
revision
see
set
simple
smi
some
string
synopsis
system
that
the
this
translation
troff
typewriter
ucb
unbreakable
use
used
user
vroff
wheel
will
with
you
__END__
# Below is the stub of documentation for your module. You better edit it!

=head1 NAME

WAIT::Filter - Perl extension providing the basic freeWAIS-sf reduction functions

=head1 SYNOPSIS

  use WAIT::Filter qw(Stem Soundex Phonix isolc disolc isouc disouc
                      isotr disotr stop grundform);

  $stem   = Stem($word);
  $scode  = Soundex($word);
  $pcode  = Phonix($word);
  $lword  = isolc($word);
  disolc($word);
  $uword  = isouc($word);
  disouc($word);
  $trword = isotr($word);
  disotr($word);
  $word   = stop($word);
  $word   = grundform($word);

  @words = WAIT::Filter::split($word);
  @words = WAIT::Filter::split2($word);
  @words = WAIT::Filter::split3($word);
  @words = WAIT::Filter::split4($word); # arbitrary numbers allowed

=head1 DESCRIPTION

This tiny modules gives access to the basic reduction functions build
in B<freeWAIS-sf>.

=over 5

=item B<Stem>(I<word>)

reduces I<word> using the well know Porter algorithm.

  AU: Porter, M.F.
  TI: An Algorithm for Suffix Stripping
  JT: Program
  VO: 14
  PP: 130-137
  PY: 1980
  PM: JUL

=item B<Soundex>(I<word>)


computes the 4 byte B<Soundex> code for I<word>.

  AU: Gadd, T.N.
  TI: 'Fisching for Werds'. Phonetic Retrieval of written text in
      Information Retrieval Systems
  JT: Program
  VO: 22
  NO: 3
  PP: 222-237
  PY: 1988


=item B<Phonix>(I<word>)

computes the 8 byte B<Phonix> code for I<word>.

  AU: Gadd, T.N.
  TI: PHONIX: The Algorithm
  JT: Program
  VO: 24
  NO: 4
  PP: 363-366
  PY: 1990
  PM: OCT

=head1 ISO charcater case functions

There are some additional function which transpose some/most ISOlatin1
characters to upper and lower case. To allow for maximum speed there
are also I<destructive> versions which change the argument instead of
allocating a copy which is returned. For convenience, the destructive
version also B<returns> the argument. So all of the following is
valid and C<$word> will contain the lowercased string.

  $word = isolc($word);
  $word = disolc($word);
  disolc($word);

Here are the hardcoded characters which are recognized:

  abcdefghijklmnopqrstuvwxyzàáâãäåæçèéêëìíîïñòóôõöøùúûüýß
  ABCDEFGHIJKLMNOPQRSTUVWXYZÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÑÒÓÔÕÖØÙÚÛÜÝß

=item C<$new = >B<isolc>C<($word)>

=item B<disolc>C<($word)>

transposes to lower case.

=item C<$new = >B<isouc>C<($word)>

=item  B<disouc>C<($word)>

transposes to upper case.

=item C<$new = >B<isotr>C<($word)>

=item  B<disotr>C<($word)>

Remove non-letters according to the above table.

=item C<$new = >B<stop>C<($word)>

Returns an empty string if $word is a stopword.

=item C<$new = >B<grundform>C<($word)>

Calls Text::German::reduce

=item C<$new = >B<utf8iso>C<($word)>

Deprecated due to flux in perl versions between 5.005 and 5.8. The
function converts UTF8 encoded strings to ISO-8859-1. WAIT is
internally still based on the Latin1 character set, so if you process
anything in a different encoding, you should convert to Latin1 as the
first filter or refrain from using the iso-latin-1 based filter
functions. It is recommended that you use your own converter based on
the perl version you're using.

=item split, split2, split3, ...

The splitN funtions all take a scalar as input and return a list of
words. Split acts just like the perl split(' '). Split2 eliminates all
words from the list that are shorter than 2 characters (bytes), split3
eliminates those shorter than 3 characters (bytes) and so on.

=head1 AUTHOR

Ulrich Pfeifer E<lt>F<pfeifer@ls6.informatik.uni-dortmund.de>E<gt>

=head1 SEE ALSO

perl(1).

=cut

1	# -- Mode: Cperl --
2	# $Basename: Filter.pm $
3	# $Revision: 1.9 $
4	# ITIID : $ITI$ $Header $__Header$
5	# Author : Ulrich Pfeifer
6	# Created On : Thu Aug 15 18:09:51 1996
7	# Last Modified By: Ulrich Pfeifer
8	# Last Modified On: Sun Nov 22 18:44:46 1998
9	# Language : CPerl
10	# Update Count : 105
11	# Status : Unknown, Use with caution!
12	#
13	# Copyright (c) 1996-1997, Ulrich Pfeifer
14	#
15	package WAIT::Filter;
16	require WAIT;
17	use strict;
18	use Carp;
19	use vars qw($VERSION @ISA @EXPORT_OK %STOP $SPLIT $AUTOLOAD);
20	use subs qw(grundform);
21
22	require Exporter;
23
24	@ISA = qw(Exporter);
25	@EXPORT_OK = qw(
26	Stem
27	Soundex
28	Phonix
29	Metaphone
30	isolc disolc
31	isouc disouc
32	isotr disotr
33	stop grundform
34	utf8iso
35	);
36	# (most implemented in WAIT.xs)
37
38	$VERSION = substr q$Revision: 1.9 $, 10;
39
40	sub split {
41	map split(' ', $_), @_;
42	}
43
44	$SPLIT = q[
45	sub splitXXX {
46	grep length($_)>=XXX, map split(' ', $_), @_;
47	}
48	];
49
50	sub AUTOLOAD {
51	my $func = $AUTOLOAD; $func =~ s/.*:://;
52
53	if ($func =~ /split(\d+)/) {
54	my $num = $1;
55	my $split = $SPLIT;
56
57	$split =~ s/XXX/$num/g;
58	eval $split;
59	if ($@ eq '') {
60	goto &$AUTOLOAD;
61	}
62	} elsif ($func eq 'grundform') {
63	eval {require Text::German;};
64	croak "You must have Text::German to use 'grundform'"
65	if $@ ne '';
66	*grundform = Text::German->can('reduce');
67	goto &grundform;
68	} elsif ($func eq 'date') {
69	eval {require Time::ParseDate;};
70	croak "You must have Time::ParseDate to use 'date'"
71	if $@ ne '';
72	*date = Time::ParseDate->can('parsedate');
73	goto \&date;
74	} elsif ($func eq 'decode_entities') {
75	eval {require HTML::Entities;};
76	croak "You must have HTML::Entities to use 'date'"
77	if $@ ne '';
78	*decode_entities = HTML::Entities->can('decode_entities');
79	goto &decode_entities;
80	} elsif ($func =~ /^d?utf8iso$/) {
81	no strict 'refs';
82	*$func = sub {
83	# Courtesy JHI
84	my $s = shift;
85	$s =~ s{([\xC0-\xDF])([\x80-\xBF])}
86	{chr(ord($1)<<6&0xC0\|ord($2)&0x3F)}eg;
87	$s;
88	};
89	goto \&$func;
90	}
91	Carp::confess "Class WAIT::Filter::$func not found";
92	}
93
94	while (<DATA>) {
95	chomp;
96	last if /__END__/;
97	next if /^\s*#/; # there's a comment
98	$STOP{$_}++;
99	}
100	close DATA;
101
102	sub stop {
103	if (exists $STOP{$_[0]}) {
104	''
105	} else {
106	$_[0];
107	}
108	}
109
110	sub gdate {
111	my $date = shift;
112
113	$date =~ s:(\d+)\.(\d+)\.(d+):$2/$1/$3:;
114	date($date);
115	}
116
117	1;
118	__DATA__
119	a
120	about
121	above
122	according
123	across
124	actually
125	adj
126	after
127	afterwards
128	again
129	against
130	all
131	almost
132	alone
133	along
134	already
135	also
136	although
137	always
138	among
139	amongst
140	an
141	and
142	another
143	any
144	anyhow
145	anyone
146	anything
147	anywhere
148	are
149	aren't
150	around
151	as
152	at
153	b
154	be
155	became
156	because
157	become
158	becomes
159	becoming
160	been
161	before
162	beforehand
163	begin
164	beginning
165	behind
166	being
167	below
168	beside
169	besides
170	between
171	beyond
172	billion
173	both
174	but
175	by
176	c
177	can
178	can't
179	cannot
180	caption
181	co
182	co.
183	could
184	couldn't
185	d
186	did
187	didn't
188	do
189	does
190	doesn't
191	don't
192	down
193	during
194	e
195	eg
196	eight
197	eighty
198	either
199	else
200	elsewhere
201	end
202	ending
203	enough
204	etc
205	even
206	ever
207	every
208	everyone
209	everything
210	everywhere
211	except
212	f
213	few
214	fifty
215	first
216	five
217	vfor
218	former
219	formerly
220	forty
221	found
222	four
223	from
224	further
225	g
226	h
227	had
228	has
229	hasn't
230	have
231	haven't
232	he
233	he'd
234	he'll
235	he's
236	hence
237	her
238	here
239	here's
240	hereafter
241	hereby
242	herein
243	hereupon
244	hers
245	herself
246	him
247	himself
248	his
249	how
250	however
251	hundred
252	i
253	i'd
254	i'll
255	i'm
256	i've
257	ie
258	if
259	in
260	inc.
261	indeed
262	instead
263	into
264	is
265	isn't
266	it
267	it's
268	its
269	itself
270	j
271	k
272	l
273	last
274	later
275	latter
276	latterly
277	least
278	less
279	let
280	let's
281	like
282	likely
283	ltd
284	m
285	made
286	make
287	makes
288	many
289	maybe
290	me
291	meantime
292	meanwhile
293	might
294	million
295	miss
296	more
297	moreover
298	most
299	mostly
300	mr
301	mrs
302	much
303	must
304	my
305	myself
306	n
307	namely
308	neither
309	never
310	nevertheless
311	next
312	nine
313	ninety
314	no
315	nobody
316	none
317	nonetheless
318	noone
319	nor
320	not
321	nothing
322	now
323	nowhere
324	o
325	of
326	off
327	often
328	on
329	once
330	one
331	one's
332	only
333	onto
334	or
335	other
336	others
337	otherwise
338	our
339	ours
340	ourselves
341	out
342	over
343	overall
344	own
345	p
346	per
347	perhaps
348	q
349	r
350	rather
351	recent
352	recently
353	s
354	same
355	seem
356	seemed
357	seeming
358	seems
359	seven
360	seventy
361	several
362	she
363	she'd
364	she'll
365	she's
366	should
367	shouldn't
368	since
369	six
370	sixty
371	so
372	some
373	somehow
374	someone
375	something
376	sometime
377	sometimes
378	somewhere
379	still
380	stop
381	such
382	t
383	taking
384	ten
385	than
386	that
387	that'll
388	that's
389	that've
390	the
391	their
392	them
393	themselves
394	then
395	thence
396	there
397	there'd
398	there'll
399	there're
400	there's
401	there've
402	thereafter
403	thereby
404	therefore
405	therein
406	thereupon
407	these
408	they
409	they'd
410	they'll
411	they're
412	they've
413	thirty
414	this
415	those
416	though
417	thousand
418	three
419	through
420	throughout
421	thru
422	thus
423	to
424	together
425	too
426	toward
427	towards
428	trillion
429	twenty
430	two
431	u
432	under
433	unless
434	unlike
435	unlikely
436	until
437	up
438	upon
439	us
440	used
441	using
442	v
443	very
444	via
445	w
446	was
447	wasn't
448	we
449	we'd
450	we'll
451	we're
452	we've
453	well
454	were
455	weren't
456	what
457	what'll
458	what's
459	what've
460	whatever
461	when
462	whence
463	whenever
464	where
465	where's
466	whereafter
467	whereas
468	whereby
469	wherein
470	whereupon
471	wherever
472	whether
473	which
474	while
475	whither
476	who
477	who'd
478	who'll
479	who's
480	whoever
481	whole
482	whom
483	whomever
484	whose
485	why
486	will
487	with
488	within
489	without
490	won't
491	would
492	wouldn't
493	x
494	y
495	yes
496	yet
497	you
498	you'd
499	you'll
500	you're
501	you've
502	your
503	yours
504	yourself
505	yourselves
506	z
507	# occuring in more than 100 files
508	acc
509	accent
510	accents
511	and
512	are
513	bell
514	can
515	character
516	corrections
517	crt
518	daisy
519	dash
520	date
521	defined
522	definitions
523	description
524	devices
525	diablo
526	dummy
527	factors
528	following
529	font
530	for
531	from
532	fudge
533	give
534	have
535	header
536	holds
537	log
538	logo
539	low
540	lpr
541	mark
542	name
543	nroff
544	out
545	output
546	pitch
547	put
548	rcsfile
549	reference
550	resolution
551	revision
552	see
553	set
554	simple
555	smi
556	some
557	string
558	synopsis
559	system
560	that
561	the
562	this
563	translation
564	troff
565	typewriter
566	ucb
567	unbreakable
568	use
569	used
570	user
571	vroff
572	wheel
573	will
574	with
575	you
576	__END__
577	# Below is the stub of documentation for your module. You better edit it!
578
579	=head1 NAME
580
581	WAIT::Filter - Perl extension providing the basic freeWAIS-sf reduction functions
582
583	=head1 SYNOPSIS
584
585	use WAIT::Filter qw(Stem Soundex Phonix isolc disolc isouc disouc
586	isotr disotr stop grundform);
587
588	$stem = Stem($word);
589	$scode = Soundex($word);
590	$pcode = Phonix($word);
591	$lword = isolc($word);
592	disolc($word);
593	$uword = isouc($word);
594	disouc($word);
595	$trword = isotr($word);
596	disotr($word);
597	$word = stop($word);
598	$word = grundform($word);
599
600	@words = WAIT::Filter::split($word);
601	@words = WAIT::Filter::split2($word);
602	@words = WAIT::Filter::split3($word);
603	@words = WAIT::Filter::split4($word); # arbitrary numbers allowed
604
605	=head1 DESCRIPTION
606
607	This tiny modules gives access to the basic reduction functions build
608	in B<freeWAIS-sf>.
609
610	=over 5
611
612	=item B<Stem>(I<word>)
613
614	reduces I<word> using the well know Porter algorithm.
615
616	AU: Porter, M.F.
617	TI: An Algorithm for Suffix Stripping
618	JT: Program
619	VO: 14
620	PP: 130-137
621	PY: 1980
622	PM: JUL
623
624	=item B<Soundex>(I<word>)
625
626
627	computes the 4 byte B<Soundex> code for I<word>.
628
629	AU: Gadd, T.N.
630	TI: 'Fisching for Werds'. Phonetic Retrieval of written text in
631	Information Retrieval Systems
632	JT: Program
633	VO: 22
634	NO: 3
635	PP: 222-237
636	PY: 1988
637
638
639	=item B<Phonix>(I<word>)
640
641	computes the 8 byte B<Phonix> code for I<word>.
642
643	AU: Gadd, T.N.
644	TI: PHONIX: The Algorithm
645	JT: Program
646	VO: 24
647	NO: 4
648	PP: 363-366
649	PY: 1990
650	PM: OCT
651
652	=head1 ISO charcater case functions
653
654	There are some additional function which transpose some/most ISOlatin1
655	characters to upper and lower case. To allow for maximum speed there
656	are also I<destructive> versions which change the argument instead of
657	allocating a copy which is returned. For convenience, the destructive
658	version also B<returns> the argument. So all of the following is
659	valid and C<$word> will contain the lowercased string.
660
661	$word = isolc($word);
662	$word = disolc($word);
663	disolc($word);
664
665	Here are the hardcoded characters which are recognized:
666
667	abcdefghijklmnopqrstuvwxyzàáâãäåæçèéêëìíîïñòóôõöøùúûüýß
668	ABCDEFGHIJKLMNOPQRSTUVWXYZÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÑÒÓÔÕÖØÙÚÛÜÝß
669
670	=item C<$new = >B<isolc>C<($word)>
671
672	=item B<disolc>C<($word)>
673
674	transposes to lower case.
675
676	=item C<$new = >B<isouc>C<($word)>
677
678	=item B<disouc>C<($word)>
679
680	transposes to upper case.
681
682	=item C<$new = >B<isotr>C<($word)>
683
684	=item B<disotr>C<($word)>
685
686	Remove non-letters according to the above table.
687
688	=item C<$new = >B<stop>C<($word)>
689
690	Returns an empty string if $word is a stopword.
691
692	=item C<$new = >B<grundform>C<($word)>
693
694	Calls Text::German::reduce
695
696	=item C<$new = >B<utf8iso>C<($word)>
697
698	Deprecated due to flux in perl versions between 5.005 and 5.8. The
699	function converts UTF8 encoded strings to ISO-8859-1. WAIT is
700	internally still based on the Latin1 character set, so if you process
701	anything in a different encoding, you should convert to Latin1 as the
702	first filter or refrain from using the iso-latin-1 based filter
703	functions. It is recommended that you use your own converter based on
704	the perl version you're using.
705
706	=item split, split2, split3, ...
707
708	The splitN funtions all take a scalar as input and return a list of
709	words. Split acts just like the perl split(' '). Split2 eliminates all
710	words from the list that are shorter than 2 characters (bytes), split3
711	eliminates those shorter than 3 characters (bytes) and so on.
712
713	=head1 AUTHOR
714
715	Ulrich Pfeifer E<lt>F<pfeifer@ls6.informatik.uni-dortmund.de>E<gt>
716
717	=head1 SEE ALSO
718
719	perl(1).
720
721	=cut
722