1 |
ulpfr |
10 |
#!/usr/bin/perl |
2 |
|
|
# -*- Mode: Perl -*- |
3 |
|
|
# $Basename: WAIT.pm $ |
4 |
|
|
# $Revision: 1.4 $ |
5 |
|
|
# Author : Ulrich Pfeifer |
6 |
|
|
# Created On : Wed Nov 5 16:59:32 1997 |
7 |
|
|
# Last Modified By: Ulrich Pfeifer |
8 |
|
|
# Last Modified On: Wed Nov 12 18:26:44 1997 |
9 |
|
|
# Language : CPerl |
10 |
|
|
# Update Count : 4 |
11 |
|
|
# Status : Unknown, Use with caution! |
12 |
|
|
# |
13 |
|
|
# (C) Copyright 1997, Ulrich Pfeifer, all rights reserved. |
14 |
|
|
# |
15 |
|
|
# |
16 |
|
|
|
17 |
|
|
# Module preamble for WAIT: declares the package, makes it a DynaLoader
# subclass, computes $VERSION, and finally bootstraps the module.
# The order matters: @ISA and $VERSION are assigned at run time and must
# be set before the bootstrap call at the end of this block.
package WAIT; |
18 |
|
|
require DynaLoader; |
19 |
|
|
# Declare the package globals assigned below.
use vars qw($VERSION @ISA); |
20 |
|
|
# Inherit from DynaLoader so that `bootstrap` resolves as a method of WAIT.
@ISA = qw(DynaLoader); |
21 |
|
|
|
22 |
|
|
# Derive $VERSION from the $ProjectVersion$ keyword string: the regex
# captures the first run of digits and dots ("16.2"), the map divides it
# by 10, and sprintf formats the result with three decimals (1.620).
$VERSION = sprintf '%5.3f', map $_/10,'$ProjectVersion: 16.2 $ ' =~ /([\d.]+)/; |
23 |
|
|
|
24 |
|
|
# Invoke DynaLoader's bootstrap for this package, handing it $VERSION.
# NOTE(review): presumably this loads the compiled (XS) half of WAIT and
# checks the version against it -- confirm against the XS sources.
bootstrap WAIT $VERSION; |
25 |
|
|
|
26 |
|
|
__END__ |
27 |
|
|
|
28 |
|
|
=head1 NAME |
29 |
|
|
|
30 |
|
|
WAIT - a rewrite of the freeWAIS-sf engine in Perl |
31 |
|
|
|
32 |
|
|
=head1 Status of this document |
33 |
|
|
|
34 |
|
|
I started writing down some information about the implementation |
35 |
|
|
before I forget it in my spare time. The stuff is incomplete at |
36 |
|
|
least. Any additions, corrections, ... welcome. |
37 |
|
|
|
38 |
|
|
=head1 PURPOSE |
39 |
|
|
|
40 |
|
|
As you might know, I developed and maintained B<freeWAIS-sf> (with the |
41 |
|
|
help of many people in The Net). FreeWAIS-sf is based on B<freeWAIS> |
42 |
|
|
maintained by the Clearing House for Network Information Retrieval |
43 |
|
|
(CNIDR) which in turn is based on B<wais-8-b5> implemented by Thinking |
44 |
|
|
Machines et al. During this long history - implementation started about |
45 |
|
|
1989 - many people contributed to the distribution and added features |
46 |
|
|
not foreseen by the original design. While the system fulfills its |
47 |
|
|
task now, the code has reached a state where adding new features is |
48 |
|
|
nearly impossible and even fixing longstanding bugs and removing |
49 |
|
|
limitations has become a very time consuming task. |
50 |
|
|
|
51 |
|
|
Therefore I decided to pass the maintenance to WSC Inc. and built a |
52 |
|
|
new system from scratch. For obvious reasons I chose Perl as |
53 |
|
|
implementation language. |
54 |
|
|
|
55 |
|
|
=head1 DESCRIPTION |
56 |
|
|
|
57 |
|
|
The central idea of the system is to provide a framework and the |
58 |
|
|
building blocks for any indexing and search system the users might |
59 |
|
|
want to build. Obviously the framework limits the class of systems |
60 |
|
|
which can be built. |
61 |
|
|
|
62 |
|
|
+------+ +-----+ +------+ |
63 |
|
|
==> |Access| ==> |Parse| ==> | | |
64 |
|
|
+------+ +-----+ | | |
65 |
|
|
|| | | +-----+ |
66 |
|
|
|| |Filter| ==> |Index| |
67 |
|
|
\/ | | +-----+ |
68 |
|
|
+-------+ +-----+ | | |
69 |
|
|
<= |Display| <== |Query| <-> | | |
70 |
|
|
+-------+ +-----+ +------+ |
71 |
|
|
|
72 |
|
|
A collection (aka table) is defined by the instances of the B<access> |
73 |
|
|
and B<parse> module together with the B<filter definitions>. At query |
74 |
|
|
time in addition a B<query> and a B<display> module must be chosen. |
75 |
|
|
|
76 |
|
|
=head2 Access |
77 |
|
|
|
78 |
|
|
The access module defines which documents are members of a |
79 |
|
|
database. Usually an access module is a tied hash, whose keys are the |
80 |
|
|
Ids of the documents (did = document id) and whose values are the |
81 |
|
|
documents themselves. The indexing process loops over the keys using |
82 |
|
|
C<FIRSTKEY> and C<NEXTKEY>. Documents are retrieved with C<FETCH>. |
83 |
|
|
|
84 |
|
|
By convention access modules should be members of the |
85 |
|
|
C<WAIT::Document> hierarchy. Have a look at the |
86 |
|
|
C<WAIT::Document::Split> module to get the idea. |
87 |
|
|
|
88 |
|
|
|
89 |
|
|
=head2 Parse |
90 |
|
|
|
91 |
|
|
The task of the parse module is to split the documents into logical parts |
92 |
|
|
via the C<split> method. E.g. the C<WAIT::Parse::Nroff> splits |
93 |
|
|
manuals piped through B<nroff>(1) into the sections I<name>, |
94 |
|
|
I<synopsis>, I<options>, I<description>, I<author>, I<example>, |
95 |
|
|
I<bugs>, I<text>, I<see>, and I<environment>. Here is the |
96 |
|
|
implementation of C<WAIT::Parse::Base> which handles documents with a |
97 |
|
|
pretty simple tagged format: |
98 |
|
|
|
99 |
|
|
AU: Pfeifer, U.; Fuhr, N.; Huynh, T. |
100 |
|
|
TI: Searching Structured Documents with the Enhanced Retrieval |
101 |
|
|
Functionality of freeWAIS-sf and SFgate |
102 |
|
|
ER: D. Kroemker |
103 |
|
|
BT: Computer Networks and ISDN Systems; Proceedings of the third |
104 |
|
|
International World-Wide Web Conference |
105 |
|
|
PN: Elsevier |
106 |
|
|
PA: Amsterdam - Lausanne - New York - Oxford - Shannon - Tokyo |
107 |
|
|
PP: 1027-1036 |
108 |
|
|
PY: 1995 |
109 |
|
|
|
110 |
|
|
sub split { # called as method |
111 |
|
|
my %result; |
112 |
|
|
my $fld; |
113 |
|
|
|
114 |
|
|
for (split /\n/, $_[1]) { |
115 |
|
|
if (s/^(\S+):\s*//) { |
116 |
|
|
$fld = lc $1; |
117 |
|
|
} |
118 |
|
|
$result{$fld} .= $_ if defined $fld; |
119 |
|
|
} |
120 |
|
|
return \%result; |
121 |
|
|
} |
122 |
|
|
|
123 |
|
|
Since the original document cannot be reconstructed from its |
124 |
|
|
attributes, we need a second method (I<tag>) which marks the regions |
125 |
|
|
of the document with tags for the different attributes. This tagged |
126 |
|
|
form is used by the display module to highlight search terms in the |
127 |
|
|
documents. Besides the tags for the attributes, the method might assign |
128 |
|
|
the special tags C<_b> and C<_i> for indicating bold and italic |
129 |
|
|
regions. |
130 |
|
|
|
131 |
|
|
sub tag { |
132 |
|
|
my @result; |
133 |
|
|
my $tag; |
134 |
|
|
|
135 |
|
|
for (split /\n/, $_[1]) { |
136 |
|
|
next if /^\w\w:\s*$/; |
137 |
|
|
if (s/^(\S+)://) { |
138 |
|
|
push @result, {_b => 1}, "$1:"; |
139 |
|
|
$tag = lc $1; |
140 |
|
|
} |
141 |
|
|
if (defined $tag) { |
142 |
|
|
push @result, {$tag => 1}, "$_\n"; |
143 |
|
|
} else { |
144 |
|
|
push @result, {}, "$_\n"; |
145 |
|
|
} |
146 |
|
|
} |
147 |
|
|
return @result; # we don't go for speed |
148 |
|
|
} |
149 |
|
|
|
150 |
|
|
Obviously one could implement C<split> via C<tag>. The reason for |
151 |
|
|
having two functions is speed. We need to call C<split> for each |
152 |
|
|
document when indexing a collection. Therefore speed is essential. On |
153 |
|
|
the other hand, C<tag> is called in order to display a single document |
154 |
|
|
and may be a little slower. It may care about tagging bold and italic |
155 |
|
|
regions. See C<WAIT::Parse::Nroff> for how this might decrease |
156 |
|
|
performance. |
157 |
|
|
|
158 |
|
|
|
159 |
|
|
=head2 Filter definition |
160 |
|
|
|
161 |
|
|
From the Information Retrieval perspective, the hardest part of the |
162 |
|
|
system is the filter module. The database administrator defines for |
163 |
|
|
each attribute, how the contents should be processed before it is |
164 |
|
|
stored in the index. Usually the processing contains steps to restrict |
165 |
|
|
the character set, case transformation, splitting to words and |
166 |
|
|
transforming to word stems. In WAIT these steps are defined naturally |
167 |
|
|
as a pipeline of processing steps. The pipelines are made up by |
168 |
|
|
functions in the package B<WAIT::Filter> which is pre-populated by the |
169 |
|
|
most common functions but may be extended any time. |
170 |
|
|
|
171 |
|
|
The equivalent for a typical freeWAIS-sf processing would be this |
172 |
|
|
pipeline: |
173 |
|
|
|
174 |
|
|
[ 'isotr', 'isolc', 'split2', 'stop', 'Stem'] |
175 |
|
|
|
176 |
|
|
The function C<isotr> replaces unknown characters by blanks. C<isolc> |
177 |
|
|
transforms to lower case. C<split2> splits into words and removes |
178 |
|
|
words shorter than two characters. C<stop> removes the freeWAIS-sf |
179 |
|
|
stopwords and C<Stem> applies the Porter algorithm for computing the |
180 |
|
|
stem of the words. |
181 |
|
|
|
182 |
|
|
The filter definition for a collection defines a set of pipelines for |
183 |
|
|
the attributes and modifies the pipelines which should be used for |
184 |
|
|
prefix and interval searches. |
185 |
|
|
|
186 |
|
|
Here is a complete example: |
187 |
|
|
|
188 |
|
|
|
189 |
|
|
my $stem = [{ |
190 |
|
|
'prefix' => ['unroff', 'isotr', 'isolc'], |
191 |
|
|
'intervall' => ['unroff', 'isotr', 'isolc'], |
192 |
|
|
},'unroff', 'isotr', 'isolc', 'split2', 'stop', 'Stem']; |
193 |
|
|
my $text = [{ |
194 |
|
|
'prefix' => ['unroff', 'isotr', 'isolc'], |
195 |
|
|
'intervall' => ['unroff', 'isotr', 'isolc'], |
196 |
|
|
}, |
197 |
|
|
'unroff', 'isotr', 'isolc', 'split2', 'stop']; |
198 |
|
|
my $sound = ['unroff', 'isotr', 'isolc', 'split2', 'Soundex']; |
199 |
|
|
|
200 |
|
|
my $spec = [ |
201 |
|
|
'name' => $stem, |
202 |
|
|
'synopsis' => $stem, |
203 |
|
|
'bugs' => $stem, |
204 |
|
|
'description' => $stem, |
205 |
|
|
'text' => $stem, |
206 |
|
|
'environment' => $text, |
207 |
|
|
'example' => $text, 'example' => $stem, |
208 |
|
|
'author' => $sound, 'author' => $stem, |
209 |
|
|
] |
210 |
|
|
|