/[mws]/trunk/MWS/SWISH.pm
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Annotation of /trunk/MWS/SWISH.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 41 - (hide annotations)
Mon May 10 20:26:17 2004 UTC (20 years ago) by dpavlin
File size: 6426 byte(s)
major code re-structuring: separation of indexer code into target independent
and depended, documentation improvements

1 dpavlin 12 #!/usr/bin/perl -w
2    
3 dpavlin 41 package MWS::SWISH;
4 dpavlin 12 use strict;
5 dpavlin 41 use warnings;
6 dpavlin 12
7 dpavlin 41 use MWS::Indexer;
8     our @ISA=qw(MWS::Indexer);
9 dpavlin 12
10 dpavlin 41 our $VERSION = '1.00';
11    
12 dpavlin 12 use SWISH::API;
13 dpavlin 14 use Text::Iconv;
14 dpavlin 27 use File::Temp qw/ :mktemp /;
15 dpavlin 30 use Text::Soundex;
16 dpavlin 41 use Carp;
17 dpavlin 12
18 dpavlin 14 my $iso2utf = Text::Iconv->new('ISO-8859-2','UTF-8');
19     my $utf2iso = Text::Iconv->new('UTF-8','ISO-8859-2');
20    
21 dpavlin 41 =head1 NAME
22    
23     MWS::SWISH - index your data using swish-e
24    
25     =head1 DESCRIPTION
26    
27     This is simple implementation to use SWISH-e with queryies like
28     Lucene (subject:something)
29    
30     =head1 METHODS
31    
32     =head2 open_index
33    
34     This will open index in directory C<index_dir> in configuration file
35     with name which is same as name of configuration.
36    
37     my $index = $self->open_index;
38    
39     It saves index handle into $self->{index} and returns it.
40    
41     =cut
42    
43 dpavlin 12 sub open_index {
44     my $self = shift;
45    
46     my $swish = $self->{index};
47    
48     if (! $swish) {
49    
50 dpavlin 27 my $index_file = $self->{index_dir}."/".$self->{config_name};
51 dpavlin 12 print STDERR "opening index '$index_file'\n";
52     $swish = SWISH::API->new($index_file);
53     $swish->AbortLastError if $swish->Error;
54    
55     $self->{index} = $swish;
56     }
57    
58     return $swish;
59     }
60    
61 dpavlin 41 =head2 search_index
62    
63     Takes array of terms and operators like this:
64    
65     my @results = $self->search_index('field:search');
66     my @results = $self->search_index('fld1:word1', 'and', 'fld2:word2' ...);
67    
68     It returns array of hases with results.
69    
70     =cut
71    
72 dpavlin 12 sub search_index {
73     my $self = shift;
74    
75 dpavlin 21 croak "search_index needs query" if (! @_);
76 dpavlin 12
77     my $index = $self->open_index;
78    
79 dpavlin 21 my $sw;
80    
81 dpavlin 24 my $sort_map = {
82     'date' => 'date_utime',
83     'from' => 'from_phrase',
84     'to' => 'to_phrase',
85     'rank' => 'swishrank',
86     };
87    
88     my $sort_by;
89    
90 dpavlin 21 foreach my $s (@_) {
91    
92     if ($s =~ /^\s*(\w+):(.+)\s*$/) {
93 dpavlin 24 my ($f,$v) = ($1,$2);
94     if (lc($f) eq "sort") {
95     my ($sf,$sv) = split(/ /,$v,2);
96     $sort_by = $sort_map->{$sf} || croak "unsupported sort by field $v - fix sort_map";
97     $sort_by .= " $sv";
98     } else {
99     $sw .= "$f=($v)";
100     }
101 dpavlin 21 } else {
102     # and/or/not operators
103     $sw .= " $s ";
104     }
105 dpavlin 12 }
106    
107 dpavlin 21 print STDERR "swish search: $sw\n";
108 dpavlin 18
109     # convert to UTF-8
110 dpavlin 21 $sw = $iso2utf->convert($sw) || $sw;
111 dpavlin 12
112 dpavlin 24 my $search = $index->New_Search_Object;
113     $search->SetSort( $sort_by );
114     my $results = $search->Execute($sw);
115    
116 dpavlin 17 # store total number of hits
117     $self->{'total_hits'} = $results->Hits;
118    
119 dpavlin 14 my @res_ids;
120 dpavlin 12
121 dpavlin 18 my $count = 1;
122 dpavlin 17
123 dpavlin 14 while ( my $r = $results->NextResult ) {
124 dpavlin 13
125 dpavlin 14 sub p($$) {
126     my ($r,$prop) = @_;
127 dpavlin 41 $prop = $r->Property($prop) || return;
128 dpavlin 14 $prop =~ s/##lf##/\n/gs;
129     return $utf2iso->convert($prop);
130     }
131 dpavlin 13
132 dpavlin 14 my $id = p($r,"swishdocpath");
133     push @res_ids, $id;
134    
135 dpavlin 13 foreach my $p (qw(from to cc bcc)) {
136 dpavlin 19 @{$self->{cache}->{$id}->{$p}} = ();
137 dpavlin 41 my $props = p($r,$p.'_phrase') || last;
138     foreach my $v (split(/##/, $props)) {
139 dpavlin 19 push @{$self->{cache}->{$id}->{$p}}, $v;
140     $self->add_counter($p,$v);
141     }
142 dpavlin 13 }
143    
144 dpavlin 23 foreach my $p (qw(subject body date date_utime)) {
145 dpavlin 14 $self->{cache}->{$id}->{$p} = p($r,$p);
146 dpavlin 13 }
147    
148 dpavlin 20 $self->add_counter_calendar(p($r,'date_utime'));
149    
150 dpavlin 13 # this is redundant, but needed for templates later...
151     $self->{cache}->{$id}->{'id'} = $id;
152 dpavlin 17
153 dpavlin 18 last if (++$count > $self->{max_results});
154 dpavlin 12 }
155    
156 dpavlin 14 return @res_ids;
157 dpavlin 12 }
158    
159 dpavlin 41 =head2 create_index
160    
161     This function can be null for indexes which doesn't need special
162     setup before add_index is called. however, swish-e support will
163     fork swish binary to create index at this point.
164    
165     =cut
166    
167 dpavlin 27 sub create_index {
168     my $self = shift;
169    
170     my $index_prog = $0 || die "can't deduce my own name!";
171     my $config_file = $self->{config_file} || die "no self->config_file";
172     my $index_file = $self->{index_dir} || die "no self->index_dir";
173     $index_file .= "/";
174     $index_file .= $self->{config_name} || die "no self->config_name";
175    
176     my ($tmp_fh, $swish_config_file) = mkstemp("/tmp/swishXXXXX");
177    
178     print STDERR "creating swish-e configuration file $swish_config_file\n";
179    
180     my $swish_config = qq{
181     # swish-e config file
182    
183     IndexDir $index_prog
184     SwishProgParameters --recursive $config_file
185    
186     # input file definition
187     DefaultContents XML2
188    
189     # indexed metatags
190     MetaNames xml swishdocpath
191    
192     # stored metatags
193     PropertyNames from_phrase from_address
194     PropertyNames to_phrase to_address
195     PropertyNames cc_phrase cc_address
196     PropertyNames subject body
197     #PropertyNamesDate date
198     PropertyNamesNumeric date_utime
199     PropertyNames date
200    
201     #XMLClassAttributes type
202     UndefinedMetaTags auto
203     UndefinedXMLAttributes auto
204    
205     IndexFile $index_file
206    
207     # Croatian ISO-8859-2 characters to unaccented equivalents
208     TranslateCharacters ¹©ðÐèÈæƾ® ssddcccczz
209    
210    
211     # debug
212     ParserWarnLevel 3
213     IndexReport 1
214    
215     };
216    
217     print $tmp_fh $swish_config;
218     close($tmp_fh);
219    
220     exec "swish-e -S prog -c $swish_config_file" || die "can't fork swish with $swish_config_file";
221     exit 0;
222    
223     }
224    
225 dpavlin 41 =head2 add_index
226    
227     This function will add document to index.
228    
229     $self->add_index('mailbox messageid', $document);
230    
231     =cut
232    
233 dpavlin 12 sub add_index {
234     my $self = shift;
235 dpavlin 14
236     my $mbox_id = shift || croak "add_index needs mbox_id";
237     my $document = shift || croak "add_index needs document";
238    
239     my ($mbox,$id) = split(/\s/,$mbox_id,2);
240    
241     my $xml = qq{<message>};
242     foreach my $tag (keys %$document) {
243     my $data = $document->{$tag};
244 dpavlin 16 next if (! $data || $data eq '');
245 dpavlin 14 # save [cr/]lf before conversion to XML
246     $data =~ s/\n\r/##lf##/gs;
247     $data =~ s/\n/##lf##/gs;
248 dpavlin 16 $xml .= "<$tag><![CDATA[".$data."]]></$tag>\n";
249 dpavlin 14 }
250     $xml .= qq{</message>};
251    
252     $xml = $iso2utf->convert($xml);
253     use bytes; # as opposed to chars
254     print "Path-Name: $mbox $id\n";
255     print "Content-Length: ".(length($xml)+1)."\n";
256     print "Document-Type: XML\n\n$xml\n";
257    
258 dpavlin 12 }
259    
260 dpavlin 41 =head2 close_index
261    
262     Close index at end (dummy for swish-e).
263    
264     =cut
265    
266 dpavlin 12 sub close_index {
267     my $self = shift;
268    
269     }
270    
271 dpavlin 41 =head2 apropos_index
272    
273     This method is optional (it can return undef) and it returns words which
274     sound like word specified.
275    
276     my @words = $self->apropos_index('word')
277    
278     This implementation uses L<Text::Soundex>.
279    
280     =cut
281    
282 dpavlin 30 sub apropos_index {
283     my $self = shift;
284    
285     my $fld = shift || croak "apropos_index need field";
286     my $words = shift || return;
287    
288     my @a;
289    
290     foreach my $word (split(/\s+/,$words)) {
291    
292     my $hash = soundex($word);
293     my $c = substr($word,0,1);
294    
295     my $index = $self->open_index;
296     my $index_file = $self->{index_dir}."/".$self->{config_name};
297    
298     open(SWISH,"swish-e -f $index_file -k $c |") || die "can't start swish-e";
299     my @k_arr;
300     while(<SWISH>) {
301     next if (/^#/);
302     s/^.+?:\s+//;
303     @k_arr = split(/\s+/);
304     }
305    
306     foreach my $k (@k_arr) {
307     push @a, $k if (soundex($k) eq $hash);
308     }
309     }
310    
311     # print STDERR "apropos_index($fld,$word) [$hash]: ",join(" ",@a),"\n";
312     return @a;
313    
314     }
315    
316 dpavlin 12 1;

  ViewVC Help
Powered by ViewVC 1.1.26