1 |
dpavlin |
12 |
#!/usr/bin/perl -w |
2 |
|
|
|
3 |
|
|
use strict; |
4 |
|
|
|
5 |
|
|
# |
6 |
|
|
# simple implementation to use SWISH-e with queryies like |
7 |
|
|
# Lucene (subject:something) |
8 |
|
|
# |
9 |
|
|
|
10 |
|
|
use SWISH::API; |
11 |
dpavlin |
14 |
use Text::Iconv; |
12 |
|
|
use Data::Dumper; |
13 |
dpavlin |
27 |
use File::Temp qw/ :mktemp /; |
14 |
dpavlin |
30 |
use Text::Soundex; |
15 |
dpavlin |
12 |
|
16 |
dpavlin |
14 |
my $iso2utf = Text::Iconv->new('ISO-8859-2','UTF-8'); |
17 |
|
|
my $utf2iso = Text::Iconv->new('UTF-8','ISO-8859-2'); |
18 |
|
|
|
19 |
dpavlin |
12 |
sub open_index { |
20 |
|
|
my $self = shift; |
21 |
|
|
|
22 |
|
|
my $swish = $self->{index}; |
23 |
|
|
|
24 |
|
|
if (! $swish) { |
25 |
|
|
|
26 |
dpavlin |
27 |
my $index_file = $self->{index_dir}."/".$self->{config_name}; |
27 |
dpavlin |
12 |
print STDERR "opening index '$index_file'\n"; |
28 |
|
|
$swish = SWISH::API->new($index_file); |
29 |
|
|
$swish->AbortLastError if $swish->Error; |
30 |
|
|
|
31 |
|
|
$self->{index} = $swish; |
32 |
|
|
} |
33 |
|
|
|
34 |
|
|
return $swish; |
35 |
|
|
} |
36 |
|
|
|
37 |
|
|
sub search_index { |
38 |
|
|
my $self = shift; |
39 |
|
|
|
40 |
dpavlin |
21 |
croak "search_index needs query" if (! @_); |
41 |
dpavlin |
12 |
|
42 |
|
|
my $index = $self->open_index; |
43 |
|
|
|
44 |
dpavlin |
21 |
my $sw; |
45 |
|
|
|
46 |
dpavlin |
24 |
my $sort_map = { |
47 |
|
|
'date' => 'date_utime', |
48 |
|
|
'from' => 'from_phrase', |
49 |
|
|
'to' => 'to_phrase', |
50 |
|
|
'rank' => 'swishrank', |
51 |
|
|
}; |
52 |
|
|
|
53 |
|
|
my $sort_by; |
54 |
|
|
|
55 |
dpavlin |
21 |
foreach my $s (@_) { |
56 |
|
|
|
57 |
|
|
if ($s =~ /^\s*(\w+):(.+)\s*$/) { |
58 |
dpavlin |
24 |
my ($f,$v) = ($1,$2); |
59 |
|
|
if (lc($f) eq "sort") { |
60 |
|
|
my ($sf,$sv) = split(/ /,$v,2); |
61 |
|
|
$sort_by = $sort_map->{$sf} || croak "unsupported sort by field $v - fix sort_map"; |
62 |
|
|
$sort_by .= " $sv"; |
63 |
|
|
} else { |
64 |
|
|
$sw .= "$f=($v)"; |
65 |
|
|
} |
66 |
dpavlin |
21 |
} else { |
67 |
|
|
# and/or/not operators |
68 |
|
|
$sw .= " $s "; |
69 |
|
|
} |
70 |
dpavlin |
12 |
} |
71 |
|
|
|
72 |
dpavlin |
21 |
print STDERR "swish search: $sw\n"; |
73 |
dpavlin |
18 |
|
74 |
|
|
# convert to UTF-8 |
75 |
dpavlin |
21 |
$sw = $iso2utf->convert($sw) || $sw; |
76 |
dpavlin |
12 |
|
77 |
dpavlin |
24 |
my $search = $index->New_Search_Object; |
78 |
|
|
$search->SetSort( $sort_by ); |
79 |
|
|
my $results = $search->Execute($sw); |
80 |
|
|
|
81 |
dpavlin |
17 |
# store total number of hits |
82 |
|
|
$self->{'total_hits'} = $results->Hits; |
83 |
|
|
|
84 |
dpavlin |
14 |
my @res_ids; |
85 |
dpavlin |
12 |
|
86 |
dpavlin |
18 |
my $count = 1; |
87 |
dpavlin |
17 |
|
88 |
dpavlin |
14 |
while ( my $r = $results->NextResult ) { |
89 |
dpavlin |
13 |
|
90 |
dpavlin |
14 |
sub p($$) { |
91 |
|
|
my ($r,$prop) = @_; |
92 |
|
|
$prop = $r->Property($prop); |
93 |
|
|
$prop =~ s/##lf##/\n/gs; |
94 |
|
|
return $utf2iso->convert($prop); |
95 |
|
|
} |
96 |
dpavlin |
13 |
|
97 |
dpavlin |
14 |
my $id = p($r,"swishdocpath"); |
98 |
|
|
push @res_ids, $id; |
99 |
|
|
|
100 |
dpavlin |
13 |
foreach my $p (qw(from to cc bcc)) { |
101 |
dpavlin |
19 |
@{$self->{cache}->{$id}->{$p}} = (); |
102 |
|
|
foreach my $v (split(/##/, p($r,$p.'_phrase'))) { |
103 |
|
|
push @{$self->{cache}->{$id}->{$p}}, $v; |
104 |
|
|
$self->add_counter($p,$v); |
105 |
|
|
} |
106 |
dpavlin |
13 |
} |
107 |
|
|
|
108 |
dpavlin |
23 |
foreach my $p (qw(subject body date date_utime)) { |
109 |
dpavlin |
14 |
$self->{cache}->{$id}->{$p} = p($r,$p); |
110 |
dpavlin |
13 |
} |
111 |
|
|
|
112 |
dpavlin |
20 |
$self->add_counter_calendar(p($r,'date_utime')); |
113 |
|
|
|
114 |
dpavlin |
13 |
# this is redundant, but needed for templates later... |
115 |
|
|
$self->{cache}->{$id}->{'id'} = $id; |
116 |
dpavlin |
17 |
|
117 |
dpavlin |
18 |
last if (++$count > $self->{max_results}); |
118 |
dpavlin |
12 |
} |
119 |
|
|
|
120 |
dpavlin |
14 |
return @res_ids; |
121 |
dpavlin |
12 |
} |
122 |
|
|
|
123 |
dpavlin |
27 |
# this function can be null for indexes which doesn't need special |
124 |
|
|
# setup before add_index is called. however, swish-e support will |
125 |
|
|
# fork swish binary to create index at this point |
126 |
|
|
sub create_index { |
127 |
|
|
my $self = shift; |
128 |
|
|
|
129 |
|
|
my $index_prog = $0 || die "can't deduce my own name!"; |
130 |
|
|
my $config_file = $self->{config_file} || die "no self->config_file"; |
131 |
|
|
my $index_file = $self->{index_dir} || die "no self->index_dir"; |
132 |
|
|
$index_file .= "/"; |
133 |
|
|
$index_file .= $self->{config_name} || die "no self->config_name"; |
134 |
|
|
|
135 |
|
|
my ($tmp_fh, $swish_config_file) = mkstemp("/tmp/swishXXXXX"); |
136 |
|
|
|
137 |
|
|
print STDERR "creating swish-e configuration file $swish_config_file\n"; |
138 |
|
|
|
139 |
|
|
my $swish_config = qq{ |
140 |
|
|
# swish-e config file |
141 |
|
|
|
142 |
|
|
IndexDir $index_prog |
143 |
|
|
SwishProgParameters --recursive $config_file |
144 |
|
|
|
145 |
|
|
# input file definition |
146 |
|
|
DefaultContents XML2 |
147 |
|
|
|
148 |
|
|
# indexed metatags |
149 |
|
|
MetaNames xml swishdocpath |
150 |
|
|
|
151 |
|
|
# stored metatags |
152 |
|
|
PropertyNames from_phrase from_address |
153 |
|
|
PropertyNames to_phrase to_address |
154 |
|
|
PropertyNames cc_phrase cc_address |
155 |
|
|
PropertyNames subject body |
156 |
|
|
#PropertyNamesDate date |
157 |
|
|
PropertyNamesNumeric date_utime |
158 |
|
|
PropertyNames date |
159 |
|
|
|
160 |
|
|
#XMLClassAttributes type |
161 |
|
|
UndefinedMetaTags auto |
162 |
|
|
UndefinedXMLAttributes auto |
163 |
|
|
|
164 |
|
|
IndexFile $index_file |
165 |
|
|
|
166 |
|
|
# Croatian ISO-8859-2 characters to unaccented equivalents |
167 |
|
|
TranslateCharacters ¹©ðÐèÈæƾ® ssddcccczz |
168 |
|
|
|
169 |
|
|
|
170 |
|
|
# debug |
171 |
|
|
ParserWarnLevel 3 |
172 |
|
|
IndexReport 1 |
173 |
|
|
|
174 |
|
|
}; |
175 |
|
|
|
176 |
|
|
print $tmp_fh $swish_config; |
177 |
|
|
close($tmp_fh); |
178 |
|
|
|
179 |
|
|
exec "swish-e -S prog -c $swish_config_file" || die "can't fork swish with $swish_config_file"; |
180 |
|
|
exit 0; |
181 |
|
|
|
182 |
|
|
} |
183 |
|
|
|
184 |
dpavlin |
12 |
sub add_index { |
185 |
|
|
my $self = shift; |
186 |
dpavlin |
14 |
|
187 |
|
|
my $mbox_id = shift || croak "add_index needs mbox_id"; |
188 |
|
|
my $document = shift || croak "add_index needs document"; |
189 |
|
|
|
190 |
|
|
my ($mbox,$id) = split(/\s/,$mbox_id,2); |
191 |
|
|
|
192 |
|
|
my $xml = qq{<message>}; |
193 |
|
|
foreach my $tag (keys %$document) { |
194 |
|
|
my $data = $document->{$tag}; |
195 |
dpavlin |
16 |
next if (! $data || $data eq ''); |
196 |
dpavlin |
14 |
# save [cr/]lf before conversion to XML |
197 |
|
|
$data =~ s/\n\r/##lf##/gs; |
198 |
|
|
$data =~ s/\n/##lf##/gs; |
199 |
dpavlin |
16 |
$xml .= "<$tag><![CDATA[".$data."]]></$tag>\n"; |
200 |
dpavlin |
14 |
} |
201 |
|
|
$xml .= qq{</message>}; |
202 |
|
|
|
203 |
|
|
$xml = $iso2utf->convert($xml); |
204 |
|
|
use bytes; # as opposed to chars |
205 |
|
|
print "Path-Name: $mbox $id\n"; |
206 |
|
|
print "Content-Length: ".(length($xml)+1)."\n"; |
207 |
|
|
print "Document-Type: XML\n\n$xml\n"; |
208 |
|
|
|
209 |
dpavlin |
12 |
} |
210 |
|
|
|
211 |
|
|
sub close_index { |
212 |
|
|
my $self = shift; |
213 |
|
|
|
214 |
|
|
} |
215 |
|
|
|
216 |
dpavlin |
30 |
# this is optional function which return words which sound like |
217 |
|
|
sub apropos_index { |
218 |
|
|
my $self = shift; |
219 |
|
|
|
220 |
|
|
my $fld = shift || croak "apropos_index need field"; |
221 |
|
|
my $words = shift || return; |
222 |
|
|
|
223 |
|
|
my @a; |
224 |
|
|
|
225 |
|
|
foreach my $word (split(/\s+/,$words)) { |
226 |
|
|
|
227 |
|
|
my $hash = soundex($word); |
228 |
|
|
my $c = substr($word,0,1); |
229 |
|
|
|
230 |
|
|
my $index = $self->open_index; |
231 |
|
|
my $index_file = $self->{index_dir}."/".$self->{config_name}; |
232 |
|
|
|
233 |
|
|
open(SWISH,"swish-e -f $index_file -k $c |") || die "can't start swish-e"; |
234 |
|
|
my @k_arr; |
235 |
|
|
while(<SWISH>) { |
236 |
|
|
next if (/^#/); |
237 |
|
|
s/^.+?:\s+//; |
238 |
|
|
@k_arr = split(/\s+/); |
239 |
|
|
} |
240 |
|
|
|
241 |
|
|
foreach my $k (@k_arr) { |
242 |
|
|
push @a, $k if (soundex($k) eq $hash); |
243 |
|
|
} |
244 |
|
|
} |
245 |
|
|
|
246 |
|
|
# print STDERR "apropos_index($fld,$word) [$hash]: ",join(" ",@a),"\n"; |
247 |
|
|
return @a; |
248 |
|
|
|
249 |
|
|
} |
250 |
|
|
|
251 |
dpavlin |
12 |
1; |