/[mws]/trunk/lib/MWS/SWISH.pm
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Contents of /trunk/lib/MWS/SWISH.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 55 - (show annotations)
Fri Aug 6 22:37:40 2004 UTC (19 years, 8 months ago) by dpavlin
File size: 6796 byte(s)
don't die on unknown properties

1 #!/usr/bin/perl -w
2
3 package MWS::SWISH;
4 use strict;
5 use warnings;
6
7 use MWS::Indexer;
8 our @ISA=qw(MWS::Indexer);
9
10 our $VERSION = '1.00';
11
12 use SWISH::API;
13 use Text::Iconv;
14 use File::Temp qw/ :mktemp /;
15 use Text::Soundex;
16 use Carp;
17
18 my $iso2utf = Text::Iconv->new('ISO-8859-2','UTF-8');
19 my $utf2iso = Text::Iconv->new('UTF-8','ISO-8859-2');
20
21 =head1 NAME
22
23 MWS::SWISH - index your data using swish-e
24
25 =head1 DESCRIPTION
26
27 This is simple implementation to use SWISH-e with queryies like
28 Lucene (subject:something)
29
30 =head1 METHODS
31
32 =head2 open_index
33
34 This will open index in directory C<index_dir> in configuration file
35 with name which is same as name of configuration.
36
37 my $index = $self->open_index;
38
39 It saves index handle into $self->{index} and returns it.
40
41 =cut
42
43 sub open_index {
44 my $self = shift;
45
46 my $swish = $self->{index};
47
48 if (! $swish) {
49
50 my $index_file = $self->{index_dir}."/".$self->{config_name};
51 print STDERR "opening index '$index_file'\n";
52 $swish = SWISH::API->new($index_file);
53 $swish->AbortLastError if $swish->Error;
54
55 $self->{index} = $swish;
56 }
57
58 return $swish;
59 }
60
61 =head2 search_index
62
63 Takes array of terms and operators like this:
64
65 my @results = $self->search_index('field:search');
66 my @results = $self->search_index('fld1:word1', 'and', 'fld2:word2' ...);
67
68 It returns array of hases with results.
69
70 =cut
71
72 sub search_index {
73 my $self = shift;
74
75 croak "search_index needs query" if (! @_);
76
77 my $index = $self->open_index;
78
79 my $sw;
80
81 my $sort_map = {
82 'date' => 'date_utime',
83 'from' => 'from_phrase',
84 'to' => 'to_phrase',
85 'rank' => 'swishrank',
86 };
87
88 my $sort_by;
89
90 foreach my $s (@_) {
91
92 if ($s =~ /^\s*(\w+):(.+)\s*$/) {
93 my ($f,$v) = ($1,$2);
94 if (lc($f) eq "sort") {
95 my ($sf,$sv) = split(/ /,$v,2);
96 $sort_by = $sort_map->{$sf} || croak "unsupported sort by field $v - fix sort_map";
97 $sort_by .= " $sv";
98 } else {
99 $sw .= "$f=($v)";
100 }
101 } else {
102 # and/or/not operators
103 $sw .= " $s ";
104 }
105 }
106
107 print STDERR "swish search: $sw\n";
108
109 # convert to UTF-8
110 $sw = $iso2utf->convert($sw) || $sw;
111
112 my $search = $index->New_Search_Object;
113 $search->SetSort( $sort_by );
114 my $results = $search->Execute($sw);
115
116 # store total number of hits
117 $self->{'total_hits'} = $results->Hits;
118
119 my @res_ids;
120
121 my $count = 1;
122
123 while ( my $r = $results->NextResult ) {
124
125 sub p($$) {
126 my ($r,$prop) = @_;
127 eval { $prop = $r->Property($prop) } || return;
128 $prop =~ s/##lf##/\n/gs;
129 return $utf2iso->convert($prop);
130 }
131
132 my $id = p($r,"swishdocpath");
133 push @res_ids, $id;
134
135 foreach my $p (qw(from to cc bcc)) {
136 @{$self->{cache}->{$id}->{$p}} = ();
137 my $props = p($r,$p.'_phrase') || last;
138 foreach my $v (split(/##/, $props)) {
139 push @{$self->{cache}->{$id}->{$p}}, $v;
140 $self->add_counter($p,$v);
141 }
142 }
143
144 foreach my $p (qw(subject body date date_utime)) {
145 $self->{cache}->{$id}->{$p} = p($r,$p);
146 }
147
148 $self->add_counter_calendar(p($r,'date_utime'));
149
150 # this is redundant, but needed for templates later...
151 $self->{cache}->{$id}->{'id'} = $id;
152
153 last if (++$count > $self->{max_results});
154 }
155
156 return @res_ids;
157 }
158
159 =head2 create_index
160
161 This function can be null for indexes which doesn't need special
162 setup before add_index is called. however, swish-e support will
163 exec swish binary to create index at this point.
164
165 Exec is used to free memory of parent process (because swish-e will
166 again start this script, but with --recursive) and free it's memory.
167
168 This has unfortunate efect that all output from swish-e will also be displayed.
169 If C<STDOUT> of swish-e is recirected to C</dev/null> we will loose errors
170 from this module (allthough they are directed to STDERR).
171
172 =cut
173
174 sub create_index {
175 my $self = shift;
176
177 my $index_prog = $0 || die "can't deduce my own name!";
178 my $config_file = $self->{config_file} || die "no self->config_file";
179 my $index_file = $self->{index_dir} || die "no self->index_dir";
180 $index_file .= "/";
181 $index_file .= $self->{config_name} || die "no self->config_name";
182
183 my ($tmp_fh, $swish_config_file) = mkstemp("/tmp/swishXXXXX");
184
185 print STDERR "creating swish-e configuration file $swish_config_file\n";
186
187 my $swish_config = qq{
188 # swish-e config file
189
190 IndexDir $index_prog
191 SwishProgParameters --recursive $config_file
192
193 # input file definition
194 DefaultContents XML2
195
196 # indexed metatags
197 MetaNames xml swishdocpath
198
199 # stored metatags
200 PropertyNames from_phrase from_address
201 PropertyNames to_phrase to_address
202 PropertyNames cc_phrase cc_address
203 PropertyNames subject body
204 #PropertyNamesDate date
205 PropertyNamesNumeric date_utime
206 PropertyNames date
207
208 #XMLClassAttributes type
209 UndefinedMetaTags auto
210 UndefinedXMLAttributes auto
211
212 IndexFile $index_file
213
214 # Croatian ISO-8859-2 characters to unaccented equivalents
215 TranslateCharacters ¹©ðÐèÈæƾ® ssddcccczz
216
217
218 # disable output
219 ParserWarnLevel 0
220 IndexReport 0
221
222 };
223
224 print $tmp_fh $swish_config;
225 close($tmp_fh);
226
227 exec "swish-e -S prog -c $swish_config_file" || die "can't fork swish with $swish_config_file";
228 exit 0;
229
230 }
231
232 =head2 add_index
233
234 This function will add document to index.
235
236 $self->add_index('mailbox messageid', $document);
237
238 =cut
239
240 sub add_index {
241 my $self = shift;
242
243 my $mbox_id = shift || croak "add_index needs mbox_id";
244 my $document = shift || croak "add_index needs document";
245
246 my ($mbox,$id) = split(/\s/,$mbox_id,2);
247
248 my $xml = qq{<message>};
249 foreach my $tag (keys %$document) {
250 my $data = $document->{$tag};
251 next if (! $data || $data eq '');
252 # save [cr/]lf before conversion to XML
253 $data =~ s/\n\r/##lf##/gs;
254 $data =~ s/\n/##lf##/gs;
255 $xml .= "<$tag><![CDATA[".$data."]]></$tag>\n";
256 }
257 $xml .= qq{</message>};
258
259 $xml = $iso2utf->convert($xml);
260 use bytes; # as opposed to chars
261 print "Path-Name: $mbox $id\n";
262 print "Content-Length: ".(length($xml)+1)."\n";
263 print "Document-Type: XML\n\n$xml\n";
264
265 }
266
267 =head2 close_index
268
269 Close index at end (dummy for swish-e).
270
271 =cut
272
273 sub close_index {
274 my $self = shift;
275
276 }
277
278 =head2 apropos_index
279
280 This method is optional (it can return undef) and it returns words which
281 sound like word specified.
282
283 my @words = $self->apropos_index('word')
284
285 This implementation uses L<Text::Soundex>.
286
287 =cut
288
289 sub apropos_index {
290 my $self = shift;
291
292 my $fld = shift || croak "apropos_index need field";
293 my $words = shift || return;
294
295 my @a;
296
297 foreach my $word (split(/\s+/,$words)) {
298
299 my $hash = soundex($word);
300 my $c = substr($word,0,1);
301
302 my $index = $self->open_index;
303 my $index_file = $self->{index_dir}."/".$self->{config_name};
304
305 open(SWISH,"swish-e -f $index_file -k $c |") || die "can't start swish-e";
306 my @k_arr;
307 while(<SWISH>) {
308 next if (/^#/);
309 s/^.+?:\s+//;
310 @k_arr = split(/\s+/);
311 }
312
313 foreach my $k (@k_arr) {
314 push @a, $k if (soundex($k) eq $hash);
315 }
316 }
317
318 # print STDERR "apropos_index($fld,$word) [$hash]: ",join(" ",@a),"\n";
319 return @a;
320
321 }
322
323 1;

  ViewVC Help
Powered by ViewVC 1.1.26