/[mws]/trunk/MWS/SWISH.pm
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Contents of /trunk/MWS/SWISH.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 41 - (show annotations)
Mon May 10 20:26:17 2004 UTC (20 years ago) by dpavlin
File size: 6426 byte(s)
major code re-structuring: separation of indexer code into target independent
and depended, documentation improvements

1 #!/usr/bin/perl -w
2
3 package MWS::SWISH;
4 use strict;
5 use warnings;
6
7 use MWS::Indexer;
8 our @ISA=qw(MWS::Indexer);
9
10 our $VERSION = '1.00';
11
12 use SWISH::API;
13 use Text::Iconv;
14 use File::Temp qw/ :mktemp /;
15 use Text::Soundex;
16 use Carp;
17
18 my $iso2utf = Text::Iconv->new('ISO-8859-2','UTF-8');
19 my $utf2iso = Text::Iconv->new('UTF-8','ISO-8859-2');
20
21 =head1 NAME
22
23 MWS::SWISH - index your data using swish-e
24
25 =head1 DESCRIPTION
26
27 This is simple implementation to use SWISH-e with queryies like
28 Lucene (subject:something)
29
30 =head1 METHODS
31
32 =head2 open_index
33
34 This will open index in directory C<index_dir> in configuration file
35 with name which is same as name of configuration.
36
37 my $index = $self->open_index;
38
39 It saves index handle into $self->{index} and returns it.
40
41 =cut
42
43 sub open_index {
44 my $self = shift;
45
46 my $swish = $self->{index};
47
48 if (! $swish) {
49
50 my $index_file = $self->{index_dir}."/".$self->{config_name};
51 print STDERR "opening index '$index_file'\n";
52 $swish = SWISH::API->new($index_file);
53 $swish->AbortLastError if $swish->Error;
54
55 $self->{index} = $swish;
56 }
57
58 return $swish;
59 }
60
61 =head2 search_index
62
63 Takes array of terms and operators like this:
64
65 my @results = $self->search_index('field:search');
66 my @results = $self->search_index('fld1:word1', 'and', 'fld2:word2' ...);
67
68 It returns array of hases with results.
69
70 =cut
71
72 sub search_index {
73 my $self = shift;
74
75 croak "search_index needs query" if (! @_);
76
77 my $index = $self->open_index;
78
79 my $sw;
80
81 my $sort_map = {
82 'date' => 'date_utime',
83 'from' => 'from_phrase',
84 'to' => 'to_phrase',
85 'rank' => 'swishrank',
86 };
87
88 my $sort_by;
89
90 foreach my $s (@_) {
91
92 if ($s =~ /^\s*(\w+):(.+)\s*$/) {
93 my ($f,$v) = ($1,$2);
94 if (lc($f) eq "sort") {
95 my ($sf,$sv) = split(/ /,$v,2);
96 $sort_by = $sort_map->{$sf} || croak "unsupported sort by field $v - fix sort_map";
97 $sort_by .= " $sv";
98 } else {
99 $sw .= "$f=($v)";
100 }
101 } else {
102 # and/or/not operators
103 $sw .= " $s ";
104 }
105 }
106
107 print STDERR "swish search: $sw\n";
108
109 # convert to UTF-8
110 $sw = $iso2utf->convert($sw) || $sw;
111
112 my $search = $index->New_Search_Object;
113 $search->SetSort( $sort_by );
114 my $results = $search->Execute($sw);
115
116 # store total number of hits
117 $self->{'total_hits'} = $results->Hits;
118
119 my @res_ids;
120
121 my $count = 1;
122
123 while ( my $r = $results->NextResult ) {
124
125 sub p($$) {
126 my ($r,$prop) = @_;
127 $prop = $r->Property($prop) || return;
128 $prop =~ s/##lf##/\n/gs;
129 return $utf2iso->convert($prop);
130 }
131
132 my $id = p($r,"swishdocpath");
133 push @res_ids, $id;
134
135 foreach my $p (qw(from to cc bcc)) {
136 @{$self->{cache}->{$id}->{$p}} = ();
137 my $props = p($r,$p.'_phrase') || last;
138 foreach my $v (split(/##/, $props)) {
139 push @{$self->{cache}->{$id}->{$p}}, $v;
140 $self->add_counter($p,$v);
141 }
142 }
143
144 foreach my $p (qw(subject body date date_utime)) {
145 $self->{cache}->{$id}->{$p} = p($r,$p);
146 }
147
148 $self->add_counter_calendar(p($r,'date_utime'));
149
150 # this is redundant, but needed for templates later...
151 $self->{cache}->{$id}->{'id'} = $id;
152
153 last if (++$count > $self->{max_results});
154 }
155
156 return @res_ids;
157 }
158
159 =head2 create_index
160
161 This function can be null for indexes which doesn't need special
162 setup before add_index is called. however, swish-e support will
163 fork swish binary to create index at this point.
164
165 =cut
166
167 sub create_index {
168 my $self = shift;
169
170 my $index_prog = $0 || die "can't deduce my own name!";
171 my $config_file = $self->{config_file} || die "no self->config_file";
172 my $index_file = $self->{index_dir} || die "no self->index_dir";
173 $index_file .= "/";
174 $index_file .= $self->{config_name} || die "no self->config_name";
175
176 my ($tmp_fh, $swish_config_file) = mkstemp("/tmp/swishXXXXX");
177
178 print STDERR "creating swish-e configuration file $swish_config_file\n";
179
180 my $swish_config = qq{
181 # swish-e config file
182
183 IndexDir $index_prog
184 SwishProgParameters --recursive $config_file
185
186 # input file definition
187 DefaultContents XML2
188
189 # indexed metatags
190 MetaNames xml swishdocpath
191
192 # stored metatags
193 PropertyNames from_phrase from_address
194 PropertyNames to_phrase to_address
195 PropertyNames cc_phrase cc_address
196 PropertyNames subject body
197 #PropertyNamesDate date
198 PropertyNamesNumeric date_utime
199 PropertyNames date
200
201 #XMLClassAttributes type
202 UndefinedMetaTags auto
203 UndefinedXMLAttributes auto
204
205 IndexFile $index_file
206
207 # Croatian ISO-8859-2 characters to unaccented equivalents
208 TranslateCharacters ¹©ðÐèÈæƾ® ssddcccczz
209
210
211 # debug
212 ParserWarnLevel 3
213 IndexReport 1
214
215 };
216
217 print $tmp_fh $swish_config;
218 close($tmp_fh);
219
220 exec "swish-e -S prog -c $swish_config_file" || die "can't fork swish with $swish_config_file";
221 exit 0;
222
223 }
224
225 =head2 add_index
226
227 This function will add document to index.
228
229 $self->add_index('mailbox messageid', $document);
230
231 =cut
232
233 sub add_index {
234 my $self = shift;
235
236 my $mbox_id = shift || croak "add_index needs mbox_id";
237 my $document = shift || croak "add_index needs document";
238
239 my ($mbox,$id) = split(/\s/,$mbox_id,2);
240
241 my $xml = qq{<message>};
242 foreach my $tag (keys %$document) {
243 my $data = $document->{$tag};
244 next if (! $data || $data eq '');
245 # save [cr/]lf before conversion to XML
246 $data =~ s/\n\r/##lf##/gs;
247 $data =~ s/\n/##lf##/gs;
248 $xml .= "<$tag><![CDATA[".$data."]]></$tag>\n";
249 }
250 $xml .= qq{</message>};
251
252 $xml = $iso2utf->convert($xml);
253 use bytes; # as opposed to chars
254 print "Path-Name: $mbox $id\n";
255 print "Content-Length: ".(length($xml)+1)."\n";
256 print "Document-Type: XML\n\n$xml\n";
257
258 }
259
260 =head2 close_index
261
262 Close index at end (dummy for swish-e).
263
264 =cut
265
266 sub close_index {
267 my $self = shift;
268
269 }
270
271 =head2 apropos_index
272
273 This method is optional (it can return undef) and it returns words which
274 sound like word specified.
275
276 my @words = $self->apropos_index('word')
277
278 This implementation uses L<Text::Soundex>.
279
280 =cut
281
282 sub apropos_index {
283 my $self = shift;
284
285 my $fld = shift || croak "apropos_index need field";
286 my $words = shift || return;
287
288 my @a;
289
290 foreach my $word (split(/\s+/,$words)) {
291
292 my $hash = soundex($word);
293 my $c = substr($word,0,1);
294
295 my $index = $self->open_index;
296 my $index_file = $self->{index_dir}."/".$self->{config_name};
297
298 open(SWISH,"swish-e -f $index_file -k $c |") || die "can't start swish-e";
299 my @k_arr;
300 while(<SWISH>) {
301 next if (/^#/);
302 s/^.+?:\s+//;
303 @k_arr = split(/\s+/);
304 }
305
306 foreach my $k (@k_arr) {
307 push @a, $k if (soundex($k) eq $hash);
308 }
309 }
310
311 # print STDERR "apropos_index($fld,$word) [$hash]: ",join(" ",@a),"\n";
312 return @a;
313
314 }
315
316 1;

  ViewVC Help
Powered by ViewVC 1.1.26