/[mws]/trunk/MWS/SWISH.pm
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Annotation of /trunk/MWS/SWISH.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 47 - (hide annotations)
Tue May 11 22:59:27 2004 UTC (20 years ago) by dpavlin
File size: 6787 byte(s)
Major code update: 0.9-rc3 if no serious problems are found, this will
become first public version (0.9).

- search.pl is working again
- fixed Mail::Box problem with unimplemented lock_type => 'none' on Maildir
- documented Mozilla 1.5 problem with sidebar float: right
- don't output anything from swish-e while indexing
- remove (e-mail) from addresses (it seems that Exchange like to add those)
- added progress report while indexing
- documented all command-line utilities

1 dpavlin 12 #!/usr/bin/perl -w
2    
3 dpavlin 41 package MWS::SWISH;
4 dpavlin 12 use strict;
5 dpavlin 41 use warnings;
6 dpavlin 12
7 dpavlin 41 use MWS::Indexer;
8     our @ISA=qw(MWS::Indexer);
9 dpavlin 12
10 dpavlin 41 our $VERSION = '1.00';
11    
12 dpavlin 12 use SWISH::API;
13 dpavlin 14 use Text::Iconv;
14 dpavlin 27 use File::Temp qw/ :mktemp /;
15 dpavlin 30 use Text::Soundex;
16 dpavlin 41 use Carp;
17 dpavlin 12
18 dpavlin 14 my $iso2utf = Text::Iconv->new('ISO-8859-2','UTF-8');
19     my $utf2iso = Text::Iconv->new('UTF-8','ISO-8859-2');
20    
21 dpavlin 41 =head1 NAME
22    
23     MWS::SWISH - index your data using swish-e
24    
25     =head1 DESCRIPTION
26    
27     This is simple implementation to use SWISH-e with queryies like
28     Lucene (subject:something)
29    
30     =head1 METHODS
31    
32     =head2 open_index
33    
34     This will open index in directory C<index_dir> in configuration file
35     with name which is same as name of configuration.
36    
37     my $index = $self->open_index;
38    
39     It saves index handle into $self->{index} and returns it.
40    
41     =cut
42    
43 dpavlin 12 sub open_index {
44     my $self = shift;
45    
46     my $swish = $self->{index};
47    
48     if (! $swish) {
49    
50 dpavlin 27 my $index_file = $self->{index_dir}."/".$self->{config_name};
51 dpavlin 12 print STDERR "opening index '$index_file'\n";
52     $swish = SWISH::API->new($index_file);
53     $swish->AbortLastError if $swish->Error;
54    
55     $self->{index} = $swish;
56     }
57    
58     return $swish;
59     }
60    
61 dpavlin 41 =head2 search_index
62    
63     Takes array of terms and operators like this:
64    
65     my @results = $self->search_index('field:search');
66     my @results = $self->search_index('fld1:word1', 'and', 'fld2:word2' ...);
67    
68     It returns array of hases with results.
69    
70     =cut
71    
72 dpavlin 12 sub search_index {
73     my $self = shift;
74    
75 dpavlin 21 croak "search_index needs query" if (! @_);
76 dpavlin 12
77     my $index = $self->open_index;
78    
79 dpavlin 21 my $sw;
80    
81 dpavlin 24 my $sort_map = {
82     'date' => 'date_utime',
83     'from' => 'from_phrase',
84     'to' => 'to_phrase',
85     'rank' => 'swishrank',
86     };
87    
88     my $sort_by;
89    
90 dpavlin 21 foreach my $s (@_) {
91    
92     if ($s =~ /^\s*(\w+):(.+)\s*$/) {
93 dpavlin 24 my ($f,$v) = ($1,$2);
94     if (lc($f) eq "sort") {
95     my ($sf,$sv) = split(/ /,$v,2);
96     $sort_by = $sort_map->{$sf} || croak "unsupported sort by field $v - fix sort_map";
97     $sort_by .= " $sv";
98     } else {
99     $sw .= "$f=($v)";
100     }
101 dpavlin 21 } else {
102     # and/or/not operators
103     $sw .= " $s ";
104     }
105 dpavlin 12 }
106    
107 dpavlin 21 print STDERR "swish search: $sw\n";
108 dpavlin 18
109     # convert to UTF-8
110 dpavlin 21 $sw = $iso2utf->convert($sw) || $sw;
111 dpavlin 12
112 dpavlin 24 my $search = $index->New_Search_Object;
113     $search->SetSort( $sort_by );
114     my $results = $search->Execute($sw);
115    
116 dpavlin 17 # store total number of hits
117     $self->{'total_hits'} = $results->Hits;
118    
119 dpavlin 14 my @res_ids;
120 dpavlin 12
121 dpavlin 18 my $count = 1;
122 dpavlin 17
123 dpavlin 14 while ( my $r = $results->NextResult ) {
124 dpavlin 13
125 dpavlin 14 sub p($$) {
126     my ($r,$prop) = @_;
127 dpavlin 41 $prop = $r->Property($prop) || return;
128 dpavlin 14 $prop =~ s/##lf##/\n/gs;
129     return $utf2iso->convert($prop);
130     }
131 dpavlin 13
132 dpavlin 14 my $id = p($r,"swishdocpath");
133     push @res_ids, $id;
134    
135 dpavlin 13 foreach my $p (qw(from to cc bcc)) {
136 dpavlin 19 @{$self->{cache}->{$id}->{$p}} = ();
137 dpavlin 41 my $props = p($r,$p.'_phrase') || last;
138     foreach my $v (split(/##/, $props)) {
139 dpavlin 19 push @{$self->{cache}->{$id}->{$p}}, $v;
140     $self->add_counter($p,$v);
141     }
142 dpavlin 13 }
143    
144 dpavlin 23 foreach my $p (qw(subject body date date_utime)) {
145 dpavlin 14 $self->{cache}->{$id}->{$p} = p($r,$p);
146 dpavlin 13 }
147    
148 dpavlin 20 $self->add_counter_calendar(p($r,'date_utime'));
149    
150 dpavlin 13 # this is redundant, but needed for templates later...
151     $self->{cache}->{$id}->{'id'} = $id;
152 dpavlin 17
153 dpavlin 18 last if (++$count > $self->{max_results});
154 dpavlin 12 }
155    
156 dpavlin 14 return @res_ids;
157 dpavlin 12 }
158    
159 dpavlin 41 =head2 create_index
160    
161     This function can be null for indexes which doesn't need special
162     setup before add_index is called. however, swish-e support will
163 dpavlin 47 exec swish binary to create index at this point.
164 dpavlin 41
165 dpavlin 47 Exec is used to free memory of parent process (because swish-e will
166     again start this script, but with --recursive) and free it's memory.
167    
168     This has unfortunate efect that all output from swish-e will also be displayed.
169     If C<STDOUT> of swish-e is recirected to C</dev/null> we will loose errors
170     from this module (allthough they are directed to STDERR).
171    
172 dpavlin 41 =cut
173    
174 dpavlin 27 sub create_index {
175     my $self = shift;
176    
177     my $index_prog = $0 || die "can't deduce my own name!";
178     my $config_file = $self->{config_file} || die "no self->config_file";
179     my $index_file = $self->{index_dir} || die "no self->index_dir";
180     $index_file .= "/";
181     $index_file .= $self->{config_name} || die "no self->config_name";
182    
183     my ($tmp_fh, $swish_config_file) = mkstemp("/tmp/swishXXXXX");
184    
185     print STDERR "creating swish-e configuration file $swish_config_file\n";
186    
187     my $swish_config = qq{
188     # swish-e config file
189    
190     IndexDir $index_prog
191     SwishProgParameters --recursive $config_file
192    
193     # input file definition
194     DefaultContents XML2
195    
196     # indexed metatags
197     MetaNames xml swishdocpath
198    
199     # stored metatags
200     PropertyNames from_phrase from_address
201     PropertyNames to_phrase to_address
202     PropertyNames cc_phrase cc_address
203     PropertyNames subject body
204     #PropertyNamesDate date
205     PropertyNamesNumeric date_utime
206     PropertyNames date
207    
208     #XMLClassAttributes type
209     UndefinedMetaTags auto
210     UndefinedXMLAttributes auto
211    
212     IndexFile $index_file
213    
214     # Croatian ISO-8859-2 characters to unaccented equivalents
215     TranslateCharacters ¹©ðÐèÈæƾ® ssddcccczz
216    
217    
218 dpavlin 47 # disable output
219     ParserWarnLevel 0
220     IndexReport 0
221 dpavlin 27
222     };
223    
224     print $tmp_fh $swish_config;
225     close($tmp_fh);
226    
227     exec "swish-e -S prog -c $swish_config_file" || die "can't fork swish with $swish_config_file";
228     exit 0;
229    
230     }
231    
232 dpavlin 41 =head2 add_index
233    
234     This function will add document to index.
235    
236     $self->add_index('mailbox messageid', $document);
237    
238     =cut
239    
240 dpavlin 12 sub add_index {
241     my $self = shift;
242 dpavlin 14
243     my $mbox_id = shift || croak "add_index needs mbox_id";
244     my $document = shift || croak "add_index needs document";
245    
246     my ($mbox,$id) = split(/\s/,$mbox_id,2);
247    
248     my $xml = qq{<message>};
249     foreach my $tag (keys %$document) {
250     my $data = $document->{$tag};
251 dpavlin 16 next if (! $data || $data eq '');
252 dpavlin 14 # save [cr/]lf before conversion to XML
253     $data =~ s/\n\r/##lf##/gs;
254     $data =~ s/\n/##lf##/gs;
255 dpavlin 16 $xml .= "<$tag><![CDATA[".$data."]]></$tag>\n";
256 dpavlin 14 }
257     $xml .= qq{</message>};
258    
259     $xml = $iso2utf->convert($xml);
260     use bytes; # as opposed to chars
261     print "Path-Name: $mbox $id\n";
262     print "Content-Length: ".(length($xml)+1)."\n";
263     print "Document-Type: XML\n\n$xml\n";
264    
265 dpavlin 12 }
266    
267 dpavlin 41 =head2 close_index
268    
269     Close index at end (dummy for swish-e).
270    
271     =cut
272    
273 dpavlin 12 sub close_index {
274     my $self = shift;
275    
276     }
277    
278 dpavlin 41 =head2 apropos_index
279    
280     This method is optional (it can return undef) and it returns words which
281     sound like word specified.
282    
283     my @words = $self->apropos_index('word')
284    
285     This implementation uses L<Text::Soundex>.
286    
287     =cut
288    
289 dpavlin 30 sub apropos_index {
290     my $self = shift;
291    
292     my $fld = shift || croak "apropos_index need field";
293     my $words = shift || return;
294    
295     my @a;
296    
297     foreach my $word (split(/\s+/,$words)) {
298    
299     my $hash = soundex($word);
300     my $c = substr($word,0,1);
301    
302     my $index = $self->open_index;
303     my $index_file = $self->{index_dir}."/".$self->{config_name};
304    
305     open(SWISH,"swish-e -f $index_file -k $c |") || die "can't start swish-e";
306     my @k_arr;
307     while(<SWISH>) {
308     next if (/^#/);
309     s/^.+?:\s+//;
310     @k_arr = split(/\s+/);
311     }
312    
313     foreach my $k (@k_arr) {
314     push @a, $k if (soundex($k) eq $hash);
315     }
316     }
317    
318     # print STDERR "apropos_index($fld,$word) [$hash]: ",join(" ",@a),"\n";
319     return @a;
320    
321     }
322    
323 dpavlin 12 1;

  ViewVC Help
Powered by ViewVC 1.1.26