/[mws]/trunk/MWS_swish.pm
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Annotation of /trunk/MWS_swish.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 30 - (hide annotations)
Sun May 9 00:09:32 2004 UTC (20 years ago) by dpavlin
File size: 5355 byte(s)
this is 0.9-rc2:
- added apropos functionality based on Text::Soundex (activated if no results
  are found)
- cleanup of debugging and minor other improvements in preparation for 0.9

1 dpavlin 12 #!/usr/bin/perl -w
2    
3     use strict;
4    
5     #
6     # simple implementation to use SWISH-e with queryies like
7     # Lucene (subject:something)
8     #
9    
10     use SWISH::API;
11 dpavlin 14 use Text::Iconv;
12     use Data::Dumper;
13 dpavlin 27 use File::Temp qw/ :mktemp /;
14 dpavlin 30 use Text::Soundex;
15 dpavlin 12
16 dpavlin 14 my $iso2utf = Text::Iconv->new('ISO-8859-2','UTF-8');
17     my $utf2iso = Text::Iconv->new('UTF-8','ISO-8859-2');
18    
19 dpavlin 12 sub open_index {
20     my $self = shift;
21    
22     my $swish = $self->{index};
23    
24     if (! $swish) {
25    
26 dpavlin 27 my $index_file = $self->{index_dir}."/".$self->{config_name};
27 dpavlin 12 print STDERR "opening index '$index_file'\n";
28     $swish = SWISH::API->new($index_file);
29     $swish->AbortLastError if $swish->Error;
30    
31     $self->{index} = $swish;
32     }
33    
34     return $swish;
35     }
36    
37     sub search_index {
38     my $self = shift;
39    
40 dpavlin 21 croak "search_index needs query" if (! @_);
41 dpavlin 12
42     my $index = $self->open_index;
43    
44 dpavlin 21 my $sw;
45    
46 dpavlin 24 my $sort_map = {
47     'date' => 'date_utime',
48     'from' => 'from_phrase',
49     'to' => 'to_phrase',
50     'rank' => 'swishrank',
51     };
52    
53     my $sort_by;
54    
55 dpavlin 21 foreach my $s (@_) {
56    
57     if ($s =~ /^\s*(\w+):(.+)\s*$/) {
58 dpavlin 24 my ($f,$v) = ($1,$2);
59     if (lc($f) eq "sort") {
60     my ($sf,$sv) = split(/ /,$v,2);
61     $sort_by = $sort_map->{$sf} || croak "unsupported sort by field $v - fix sort_map";
62     $sort_by .= " $sv";
63     } else {
64     $sw .= "$f=($v)";
65     }
66 dpavlin 21 } else {
67     # and/or/not operators
68     $sw .= " $s ";
69     }
70 dpavlin 12 }
71    
72 dpavlin 21 print STDERR "swish search: $sw\n";
73 dpavlin 18
74     # convert to UTF-8
75 dpavlin 21 $sw = $iso2utf->convert($sw) || $sw;
76 dpavlin 12
77 dpavlin 24 my $search = $index->New_Search_Object;
78     $search->SetSort( $sort_by );
79     my $results = $search->Execute($sw);
80    
81 dpavlin 17 # store total number of hits
82     $self->{'total_hits'} = $results->Hits;
83    
84 dpavlin 14 my @res_ids;
85 dpavlin 12
86 dpavlin 18 my $count = 1;
87 dpavlin 17
88 dpavlin 14 while ( my $r = $results->NextResult ) {
89 dpavlin 13
90 dpavlin 14 sub p($$) {
91     my ($r,$prop) = @_;
92     $prop = $r->Property($prop);
93     $prop =~ s/##lf##/\n/gs;
94     return $utf2iso->convert($prop);
95     }
96 dpavlin 13
97 dpavlin 14 my $id = p($r,"swishdocpath");
98     push @res_ids, $id;
99    
100 dpavlin 13 foreach my $p (qw(from to cc bcc)) {
101 dpavlin 19 @{$self->{cache}->{$id}->{$p}} = ();
102     foreach my $v (split(/##/, p($r,$p.'_phrase'))) {
103     push @{$self->{cache}->{$id}->{$p}}, $v;
104     $self->add_counter($p,$v);
105     }
106 dpavlin 13 }
107    
108 dpavlin 23 foreach my $p (qw(subject body date date_utime)) {
109 dpavlin 14 $self->{cache}->{$id}->{$p} = p($r,$p);
110 dpavlin 13 }
111    
112 dpavlin 20 $self->add_counter_calendar(p($r,'date_utime'));
113    
114 dpavlin 13 # this is redundant, but needed for templates later...
115     $self->{cache}->{$id}->{'id'} = $id;
116 dpavlin 17
117 dpavlin 18 last if (++$count > $self->{max_results});
118 dpavlin 12 }
119    
120 dpavlin 14 return @res_ids;
121 dpavlin 12 }
122    
123 dpavlin 27 # this function can be null for indexes which doesn't need special
124     # setup before add_index is called. however, swish-e support will
125     # fork swish binary to create index at this point
126     sub create_index {
127     my $self = shift;
128    
129     my $index_prog = $0 || die "can't deduce my own name!";
130     my $config_file = $self->{config_file} || die "no self->config_file";
131     my $index_file = $self->{index_dir} || die "no self->index_dir";
132     $index_file .= "/";
133     $index_file .= $self->{config_name} || die "no self->config_name";
134    
135     my ($tmp_fh, $swish_config_file) = mkstemp("/tmp/swishXXXXX");
136    
137     print STDERR "creating swish-e configuration file $swish_config_file\n";
138    
139     my $swish_config = qq{
140     # swish-e config file
141    
142     IndexDir $index_prog
143     SwishProgParameters --recursive $config_file
144    
145     # input file definition
146     DefaultContents XML2
147    
148     # indexed metatags
149     MetaNames xml swishdocpath
150    
151     # stored metatags
152     PropertyNames from_phrase from_address
153     PropertyNames to_phrase to_address
154     PropertyNames cc_phrase cc_address
155     PropertyNames subject body
156     #PropertyNamesDate date
157     PropertyNamesNumeric date_utime
158     PropertyNames date
159    
160     #XMLClassAttributes type
161     UndefinedMetaTags auto
162     UndefinedXMLAttributes auto
163    
164     IndexFile $index_file
165    
166     # Croatian ISO-8859-2 characters to unaccented equivalents
167     TranslateCharacters ¹©ðÐèÈæƾ® ssddcccczz
168    
169    
170     # debug
171     ParserWarnLevel 3
172     IndexReport 1
173    
174     };
175    
176     print $tmp_fh $swish_config;
177     close($tmp_fh);
178    
179     exec "swish-e -S prog -c $swish_config_file" || die "can't fork swish with $swish_config_file";
180     exit 0;
181    
182     }
183    
184 dpavlin 12 sub add_index {
185     my $self = shift;
186 dpavlin 14
187     my $mbox_id = shift || croak "add_index needs mbox_id";
188     my $document = shift || croak "add_index needs document";
189    
190     my ($mbox,$id) = split(/\s/,$mbox_id,2);
191    
192     my $xml = qq{<message>};
193     foreach my $tag (keys %$document) {
194     my $data = $document->{$tag};
195 dpavlin 16 next if (! $data || $data eq '');
196 dpavlin 14 # save [cr/]lf before conversion to XML
197     $data =~ s/\n\r/##lf##/gs;
198     $data =~ s/\n/##lf##/gs;
199 dpavlin 16 $xml .= "<$tag><![CDATA[".$data."]]></$tag>\n";
200 dpavlin 14 }
201     $xml .= qq{</message>};
202    
203     $xml = $iso2utf->convert($xml);
204     use bytes; # as opposed to chars
205     print "Path-Name: $mbox $id\n";
206     print "Content-Length: ".(length($xml)+1)."\n";
207     print "Document-Type: XML\n\n$xml\n";
208    
209 dpavlin 12 }
210    
211     sub close_index {
212     my $self = shift;
213    
214     }
215    
216 dpavlin 30 # this is optional function which return words which sound like
217     sub apropos_index {
218     my $self = shift;
219    
220     my $fld = shift || croak "apropos_index need field";
221     my $words = shift || return;
222    
223     my @a;
224    
225     foreach my $word (split(/\s+/,$words)) {
226    
227     my $hash = soundex($word);
228     my $c = substr($word,0,1);
229    
230     my $index = $self->open_index;
231     my $index_file = $self->{index_dir}."/".$self->{config_name};
232    
233     open(SWISH,"swish-e -f $index_file -k $c |") || die "can't start swish-e";
234     my @k_arr;
235     while(<SWISH>) {
236     next if (/^#/);
237     s/^.+?:\s+//;
238     @k_arr = split(/\s+/);
239     }
240    
241     foreach my $k (@k_arr) {
242     push @a, $k if (soundex($k) eq $hash);
243     }
244     }
245    
246     # print STDERR "apropos_index($fld,$word) [$hash]: ",join(" ",@a),"\n";
247     return @a;
248    
249     }
250    
251 dpavlin 12 1;

  ViewVC Help
Powered by ViewVC 1.1.26