/[mws]/trunk/MWS_swish.pm
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Contents of /trunk/MWS_swish.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 30 - (show annotations)
Sun May 9 00:09:32 2004 UTC (19 years, 11 months ago) by dpavlin
File size: 5355 byte(s)
this is 0.9-rc2:
- added apropos functionality based on Text::Soundex (activated if no results
  are found)
- cleanup of debugging and minor other improvements in preparation for 0.9

1 #!/usr/bin/perl -w
2
3 use strict;
4
5 #
6 # simple implementation to use SWISH-e with queryies like
7 # Lucene (subject:something)
8 #
9
10 use SWISH::API;
11 use Text::Iconv;
12 use Data::Dumper;
13 use File::Temp qw/ :mktemp /;
14 use Text::Soundex;
15
16 my $iso2utf = Text::Iconv->new('ISO-8859-2','UTF-8');
17 my $utf2iso = Text::Iconv->new('UTF-8','ISO-8859-2');
18
19 sub open_index {
20 my $self = shift;
21
22 my $swish = $self->{index};
23
24 if (! $swish) {
25
26 my $index_file = $self->{index_dir}."/".$self->{config_name};
27 print STDERR "opening index '$index_file'\n";
28 $swish = SWISH::API->new($index_file);
29 $swish->AbortLastError if $swish->Error;
30
31 $self->{index} = $swish;
32 }
33
34 return $swish;
35 }
36
37 sub search_index {
38 my $self = shift;
39
40 croak "search_index needs query" if (! @_);
41
42 my $index = $self->open_index;
43
44 my $sw;
45
46 my $sort_map = {
47 'date' => 'date_utime',
48 'from' => 'from_phrase',
49 'to' => 'to_phrase',
50 'rank' => 'swishrank',
51 };
52
53 my $sort_by;
54
55 foreach my $s (@_) {
56
57 if ($s =~ /^\s*(\w+):(.+)\s*$/) {
58 my ($f,$v) = ($1,$2);
59 if (lc($f) eq "sort") {
60 my ($sf,$sv) = split(/ /,$v,2);
61 $sort_by = $sort_map->{$sf} || croak "unsupported sort by field $v - fix sort_map";
62 $sort_by .= " $sv";
63 } else {
64 $sw .= "$f=($v)";
65 }
66 } else {
67 # and/or/not operators
68 $sw .= " $s ";
69 }
70 }
71
72 print STDERR "swish search: $sw\n";
73
74 # convert to UTF-8
75 $sw = $iso2utf->convert($sw) || $sw;
76
77 my $search = $index->New_Search_Object;
78 $search->SetSort( $sort_by );
79 my $results = $search->Execute($sw);
80
81 # store total number of hits
82 $self->{'total_hits'} = $results->Hits;
83
84 my @res_ids;
85
86 my $count = 1;
87
88 while ( my $r = $results->NextResult ) {
89
90 sub p($$) {
91 my ($r,$prop) = @_;
92 $prop = $r->Property($prop);
93 $prop =~ s/##lf##/\n/gs;
94 return $utf2iso->convert($prop);
95 }
96
97 my $id = p($r,"swishdocpath");
98 push @res_ids, $id;
99
100 foreach my $p (qw(from to cc bcc)) {
101 @{$self->{cache}->{$id}->{$p}} = ();
102 foreach my $v (split(/##/, p($r,$p.'_phrase'))) {
103 push @{$self->{cache}->{$id}->{$p}}, $v;
104 $self->add_counter($p,$v);
105 }
106 }
107
108 foreach my $p (qw(subject body date date_utime)) {
109 $self->{cache}->{$id}->{$p} = p($r,$p);
110 }
111
112 $self->add_counter_calendar(p($r,'date_utime'));
113
114 # this is redundant, but needed for templates later...
115 $self->{cache}->{$id}->{'id'} = $id;
116
117 last if (++$count > $self->{max_results});
118 }
119
120 return @res_ids;
121 }
122
123 # this function can be null for indexes which doesn't need special
124 # setup before add_index is called. however, swish-e support will
125 # fork swish binary to create index at this point
126 sub create_index {
127 my $self = shift;
128
129 my $index_prog = $0 || die "can't deduce my own name!";
130 my $config_file = $self->{config_file} || die "no self->config_file";
131 my $index_file = $self->{index_dir} || die "no self->index_dir";
132 $index_file .= "/";
133 $index_file .= $self->{config_name} || die "no self->config_name";
134
135 my ($tmp_fh, $swish_config_file) = mkstemp("/tmp/swishXXXXX");
136
137 print STDERR "creating swish-e configuration file $swish_config_file\n";
138
139 my $swish_config = qq{
140 # swish-e config file
141
142 IndexDir $index_prog
143 SwishProgParameters --recursive $config_file
144
145 # input file definition
146 DefaultContents XML2
147
148 # indexed metatags
149 MetaNames xml swishdocpath
150
151 # stored metatags
152 PropertyNames from_phrase from_address
153 PropertyNames to_phrase to_address
154 PropertyNames cc_phrase cc_address
155 PropertyNames subject body
156 #PropertyNamesDate date
157 PropertyNamesNumeric date_utime
158 PropertyNames date
159
160 #XMLClassAttributes type
161 UndefinedMetaTags auto
162 UndefinedXMLAttributes auto
163
164 IndexFile $index_file
165
166 # Croatian ISO-8859-2 characters to unaccented equivalents
167 TranslateCharacters ¹©ðÐèÈæƾ® ssddcccczz
168
169
170 # debug
171 ParserWarnLevel 3
172 IndexReport 1
173
174 };
175
176 print $tmp_fh $swish_config;
177 close($tmp_fh);
178
179 exec "swish-e -S prog -c $swish_config_file" || die "can't fork swish with $swish_config_file";
180 exit 0;
181
182 }
183
184 sub add_index {
185 my $self = shift;
186
187 my $mbox_id = shift || croak "add_index needs mbox_id";
188 my $document = shift || croak "add_index needs document";
189
190 my ($mbox,$id) = split(/\s/,$mbox_id,2);
191
192 my $xml = qq{<message>};
193 foreach my $tag (keys %$document) {
194 my $data = $document->{$tag};
195 next if (! $data || $data eq '');
196 # save [cr/]lf before conversion to XML
197 $data =~ s/\n\r/##lf##/gs;
198 $data =~ s/\n/##lf##/gs;
199 $xml .= "<$tag><![CDATA[".$data."]]></$tag>\n";
200 }
201 $xml .= qq{</message>};
202
203 $xml = $iso2utf->convert($xml);
204 use bytes; # as opposed to chars
205 print "Path-Name: $mbox $id\n";
206 print "Content-Length: ".(length($xml)+1)."\n";
207 print "Document-Type: XML\n\n$xml\n";
208
209 }
210
211 sub close_index {
212 my $self = shift;
213
214 }
215
216 # this is optional function which return words which sound like
217 sub apropos_index {
218 my $self = shift;
219
220 my $fld = shift || croak "apropos_index need field";
221 my $words = shift || return;
222
223 my @a;
224
225 foreach my $word (split(/\s+/,$words)) {
226
227 my $hash = soundex($word);
228 my $c = substr($word,0,1);
229
230 my $index = $self->open_index;
231 my $index_file = $self->{index_dir}."/".$self->{config_name};
232
233 open(SWISH,"swish-e -f $index_file -k $c |") || die "can't start swish-e";
234 my @k_arr;
235 while(<SWISH>) {
236 next if (/^#/);
237 s/^.+?:\s+//;
238 @k_arr = split(/\s+/);
239 }
240
241 foreach my $k (@k_arr) {
242 push @a, $k if (soundex($k) eq $hash);
243 }
244 }
245
246 # print STDERR "apropos_index($fld,$word) [$hash]: ",join(" ",@a),"\n";
247 return @a;
248
249 }
250
251 1;

  ViewVC Help
Powered by ViewVC 1.1.26