1 |
#!/usr/bin/perl -w |
2 |
|
3 |
package MWS::SWISH; |
4 |
use strict; |
5 |
use warnings; |
6 |
|
7 |
use MWS::Indexer; |
8 |
our @ISA=qw(MWS::Indexer); |
9 |
|
10 |
our $VERSION = '1.00'; |
11 |
|
12 |
use SWISH::API; |
13 |
use Text::Iconv; |
14 |
use File::Temp qw/ :mktemp /; |
15 |
use Text::Soundex; |
16 |
use Carp; |
17 |
|
18 |
my $iso2utf = Text::Iconv->new('ISO-8859-2','UTF-8'); |
19 |
my $utf2iso = Text::Iconv->new('UTF-8','ISO-8859-2'); |
20 |
|
21 |
=head1 NAME |
22 |
|
23 |
MWS::SWISH - index your data using swish-e |
24 |
|
25 |
=head1 DESCRIPTION |
26 |
|
27 |
This is simple implementation to use SWISH-e with queryies like |
28 |
Lucene (subject:something) |
29 |
|
30 |
=head1 METHODS |
31 |
|
32 |
=head2 open_index |
33 |
|
34 |
This will open index in directory C<index_dir> in configuration file |
35 |
with name which is same as name of configuration. |
36 |
|
37 |
my $index = $self->open_index; |
38 |
|
39 |
It saves index handle into $self->{index} and returns it. |
40 |
|
41 |
=cut |
42 |
|
43 |
sub open_index { |
44 |
my $self = shift; |
45 |
|
46 |
my $swish = $self->{index}; |
47 |
|
48 |
if (! $swish) { |
49 |
|
50 |
my $index_file = $self->{index_dir}."/".$self->{config_name}; |
51 |
print STDERR "opening index '$index_file'\n"; |
52 |
$swish = SWISH::API->new($index_file); |
53 |
$swish->AbortLastError if $swish->Error; |
54 |
|
55 |
$self->{index} = $swish; |
56 |
} |
57 |
|
58 |
return $swish; |
59 |
} |
60 |
|
61 |
=head2 search_index |
62 |
|
63 |
Takes array of terms and operators like this: |
64 |
|
65 |
my @results = $self->search_index('field:search'); |
66 |
my @results = $self->search_index('fld1:word1', 'and', 'fld2:word2' ...); |
67 |
|
68 |
It returns array of hases with results. |
69 |
|
70 |
=cut |
71 |
|
72 |
sub search_index { |
73 |
my $self = shift; |
74 |
|
75 |
croak "search_index needs query" if (! @_); |
76 |
|
77 |
my $index = $self->open_index; |
78 |
|
79 |
my $sw; |
80 |
|
81 |
my $sort_map = { |
82 |
'date' => 'date_utime', |
83 |
'from' => 'from_phrase', |
84 |
'to' => 'to_phrase', |
85 |
'rank' => 'swishrank', |
86 |
}; |
87 |
|
88 |
my $sort_by; |
89 |
|
90 |
foreach my $s (@_) { |
91 |
|
92 |
if ($s =~ /^\s*(\w+):(.+)\s*$/) { |
93 |
my ($f,$v) = ($1,$2); |
94 |
if (lc($f) eq "sort") { |
95 |
my ($sf,$sv) = split(/ /,$v,2); |
96 |
$sort_by = $sort_map->{$sf} || croak "unsupported sort by field $v - fix sort_map"; |
97 |
$sort_by .= " $sv"; |
98 |
} else { |
99 |
$sw .= "$f=($v)"; |
100 |
} |
101 |
} else { |
102 |
# and/or/not operators |
103 |
$sw .= " $s "; |
104 |
} |
105 |
} |
106 |
|
107 |
print STDERR "swish search: $sw\n"; |
108 |
|
109 |
# convert to UTF-8 |
110 |
$sw = $iso2utf->convert($sw) || $sw; |
111 |
|
112 |
my $search = $index->New_Search_Object; |
113 |
$search->SetSort( $sort_by ); |
114 |
my $results = $search->Execute($sw); |
115 |
|
116 |
# store total number of hits |
117 |
$self->{'total_hits'} = $results->Hits; |
118 |
|
119 |
my @res_ids; |
120 |
|
121 |
my $count = 1; |
122 |
|
123 |
while ( my $r = $results->NextResult ) { |
124 |
|
125 |
sub p($$) { |
126 |
my ($r,$prop) = @_; |
127 |
eval { $prop = $r->Property($prop) } || return; |
128 |
$prop =~ s/##lf##/\n/gs; |
129 |
return $utf2iso->convert($prop); |
130 |
} |
131 |
|
132 |
my $id = p($r,"swishdocpath"); |
133 |
push @res_ids, $id; |
134 |
|
135 |
foreach my $p (qw(from to cc bcc)) { |
136 |
@{$self->{cache}->{$id}->{$p}} = (); |
137 |
my $props = p($r,$p.'_phrase') || last; |
138 |
foreach my $v (split(/##/, $props)) { |
139 |
push @{$self->{cache}->{$id}->{$p}}, $v; |
140 |
$self->add_counter($p,$v); |
141 |
} |
142 |
} |
143 |
|
144 |
foreach my $p (qw(subject body date date_utime)) { |
145 |
$self->{cache}->{$id}->{$p} = p($r,$p); |
146 |
} |
147 |
|
148 |
$self->add_counter_calendar(p($r,'date_utime')); |
149 |
|
150 |
# this is redundant, but needed for templates later... |
151 |
$self->{cache}->{$id}->{'id'} = $id; |
152 |
|
153 |
last if (++$count > $self->{max_results}); |
154 |
} |
155 |
|
156 |
return @res_ids; |
157 |
} |
158 |
|
159 |
=head2 create_index |
160 |
|
161 |
This function can be null for indexes which doesn't need special |
162 |
setup before add_index is called. however, swish-e support will |
163 |
exec swish binary to create index at this point. |
164 |
|
165 |
Exec is used to free memory of parent process (because swish-e will |
166 |
again start this script, but with --recursive) and free it's memory. |
167 |
|
168 |
This has unfortunate efect that all output from swish-e will also be displayed. |
169 |
If C<STDOUT> of swish-e is recirected to C</dev/null> we will loose errors |
170 |
from this module (allthough they are directed to STDERR). |
171 |
|
172 |
=cut |
173 |
|
174 |
sub create_index { |
175 |
my $self = shift; |
176 |
|
177 |
my $index_prog = $0 || die "can't deduce my own name!"; |
178 |
my $config_file = $self->{config_file} || die "no self->config_file"; |
179 |
my $index_file = $self->{index_dir} || die "no self->index_dir"; |
180 |
$index_file .= "/"; |
181 |
$index_file .= $self->{config_name} || die "no self->config_name"; |
182 |
|
183 |
my ($tmp_fh, $swish_config_file) = mkstemp("/tmp/swishXXXXX"); |
184 |
|
185 |
print STDERR "creating swish-e configuration file $swish_config_file\n"; |
186 |
|
187 |
my $swish_config = qq{ |
188 |
# swish-e config file |
189 |
|
190 |
IndexDir $index_prog |
191 |
SwishProgParameters --recursive $config_file |
192 |
|
193 |
# input file definition |
194 |
DefaultContents XML2 |
195 |
|
196 |
# indexed metatags |
197 |
MetaNames xml swishdocpath |
198 |
|
199 |
# stored metatags |
200 |
PropertyNames from_phrase from_address |
201 |
PropertyNames to_phrase to_address |
202 |
PropertyNames cc_phrase cc_address |
203 |
PropertyNames subject body |
204 |
#PropertyNamesDate date |
205 |
PropertyNamesNumeric date_utime |
206 |
PropertyNames date |
207 |
|
208 |
#XMLClassAttributes type |
209 |
UndefinedMetaTags auto |
210 |
UndefinedXMLAttributes auto |
211 |
|
212 |
IndexFile $index_file |
213 |
|
214 |
# Croatian ISO-8859-2 characters to unaccented equivalents |
215 |
TranslateCharacters ¹©ðÐèÈæƾ® ssddcccczz |
216 |
|
217 |
|
218 |
# disable output |
219 |
ParserWarnLevel 0 |
220 |
IndexReport 0 |
221 |
|
222 |
}; |
223 |
|
224 |
print $tmp_fh $swish_config; |
225 |
close($tmp_fh); |
226 |
|
227 |
exec "swish-e -S prog -c $swish_config_file" || die "can't fork swish with $swish_config_file"; |
228 |
exit 0; |
229 |
|
230 |
} |
231 |
|
232 |
=head2 add_index |
233 |
|
234 |
This function will add document to index. |
235 |
|
236 |
$self->add_index('mailbox messageid', $document); |
237 |
|
238 |
=cut |
239 |
|
240 |
sub add_index { |
241 |
my $self = shift; |
242 |
|
243 |
my $mbox_id = shift || croak "add_index needs mbox_id"; |
244 |
my $document = shift || croak "add_index needs document"; |
245 |
|
246 |
my ($mbox,$id) = split(/\s/,$mbox_id,2); |
247 |
|
248 |
my $xml = qq{<message>}; |
249 |
foreach my $tag (keys %$document) { |
250 |
my $data = $document->{$tag}; |
251 |
next if (! $data || $data eq ''); |
252 |
# save [cr/]lf before conversion to XML |
253 |
$data =~ s/\n\r/##lf##/gs; |
254 |
$data =~ s/\n/##lf##/gs; |
255 |
$xml .= "<$tag><![CDATA[".$data."]]></$tag>\n"; |
256 |
} |
257 |
$xml .= qq{</message>}; |
258 |
|
259 |
$xml = $iso2utf->convert($xml); |
260 |
use bytes; # as opposed to chars |
261 |
print "Path-Name: $mbox $id\n"; |
262 |
print "Content-Length: ".(length($xml)+1)."\n"; |
263 |
print "Document-Type: XML\n\n$xml\n"; |
264 |
|
265 |
} |
266 |
|
267 |
=head2 close_index |
268 |
|
269 |
Close index at end (dummy for swish-e). |
270 |
|
271 |
=cut |
272 |
|
273 |
sub close_index { |
274 |
my $self = shift; |
275 |
|
276 |
} |
277 |
|
278 |
=head2 apropos_index |
279 |
|
280 |
This method is optional (it can return undef) and it returns words which |
281 |
sound like word specified. |
282 |
|
283 |
my @words = $self->apropos_index('word') |
284 |
|
285 |
This implementation uses L<Text::Soundex>. |
286 |
|
287 |
=cut |
288 |
|
289 |
sub apropos_index { |
290 |
my $self = shift; |
291 |
|
292 |
my $fld = shift || croak "apropos_index need field"; |
293 |
my $words = shift || return; |
294 |
|
295 |
my @a; |
296 |
|
297 |
foreach my $word (split(/\s+/,$words)) { |
298 |
|
299 |
my $hash = soundex($word); |
300 |
my $c = substr($word,0,1); |
301 |
|
302 |
my $index = $self->open_index; |
303 |
my $index_file = $self->{index_dir}."/".$self->{config_name}; |
304 |
|
305 |
open(SWISH,"swish-e -f $index_file -k $c |") || die "can't start swish-e"; |
306 |
my @k_arr; |
307 |
while(<SWISH>) { |
308 |
next if (/^#/); |
309 |
s/^.+?:\s+//; |
310 |
@k_arr = split(/\s+/); |
311 |
} |
312 |
|
313 |
foreach my $k (@k_arr) { |
314 |
push @a, $k if (soundex($k) eq $hash); |
315 |
} |
316 |
} |
317 |
|
318 |
# print STDERR "apropos_index($fld,$word) [$hash]: ",join(" ",@a),"\n"; |
319 |
return @a; |
320 |
|
321 |
} |
322 |
|
323 |
1; |