/[mws]/trunk/lib/MWS/SWISH.pm
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Diff of /trunk/lib/MWS/SWISH.pm

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

trunk/MWS_swish.pm revision 20 by dpavlin, Fri May 7 23:35:39 2004 UTC trunk/lib/MWS/SWISH.pm revision 55 by dpavlin, Fri Aug 6 22:37:40 2004 UTC
# Line 1  Line 1 
1  #!/usr/bin/perl -w  #!/usr/bin/perl -w
2    
3    package MWS::SWISH;
4  use strict;  use strict;
5    use warnings;
6    
7  #  use MWS::Indexer;
8  # simple implementation to use SWISH-e with queryies like  our @ISA=qw(MWS::Indexer);
9  # Lucene (subject:something)  
10  #  our $VERSION = '1.00';
11    
12  use SWISH::API;  use SWISH::API;
13  use Text::Iconv;  use Text::Iconv;
14  use Data::Dumper;  use File::Temp qw/ :mktemp /;
15    use Text::Soundex;
16    use Carp;
17    
18  my $iso2utf = Text::Iconv->new('ISO-8859-2','UTF-8');  my $iso2utf = Text::Iconv->new('ISO-8859-2','UTF-8');
19  my $utf2iso = Text::Iconv->new('UTF-8','ISO-8859-2');  my $utf2iso = Text::Iconv->new('UTF-8','ISO-8859-2');
20    
21    =head1 NAME
22    
23    MWS::SWISH - index your data using swish-e
24    
25    =head1 DESCRIPTION
26    
27    This is simple implementation to use SWISH-e with queryies like
28    Lucene (subject:something)
29    
30    =head1 METHODS
31    
32    =head2 open_index
33    
34    This will open index in directory C<index_dir> in configuration file
35    with name which is same as name of configuration.
36    
37     my $index = $self->open_index;
38    
39    It saves index handle into $self->{index} and returns it.
40    
41    =cut
42    
43  sub open_index {  sub open_index {
44          my $self = shift;          my $self = shift;
45    
# Line 21  sub open_index { Line 47  sub open_index {
47    
48          if (! $swish) {          if (! $swish) {
49    
50                  my $index_file = $self->{index_file} || croak "open_index needs index filename";                  my $index_file = $self->{index_dir}."/".$self->{config_name};
                 $index_file .= "/swish-e";  
51                  print STDERR "opening index '$index_file'\n";                  print STDERR "opening index '$index_file'\n";
52                  $swish = SWISH::API->new($index_file);                  $swish = SWISH::API->new($index_file);
53                  $swish->AbortLastError if $swish->Error;                  $swish->AbortLastError if $swish->Error;
# Line 33  sub open_index { Line 58  sub open_index {
58          return $swish;          return $swish;
59  }  }
60    
61    =head2 search_index
62    
63    Takes array of terms and operators like this:
64    
65     my @results = $self->search_index('field:search');
66     my @results = $self->search_index('fld1:word1', 'and', 'fld2:word2' ...);
67    
68    It returns array of hases with results.
69    
70    =cut
71    
72  sub search_index {  sub search_index {
73          my $self = shift;          my $self = shift;
74    
75          my $s = shift || croak "search_index needs query";          croak "search_index needs query" if (! @_);
76    
77          my $index = $self->open_index;          my $index = $self->open_index;
78    
79          if ($s =~ /:/) {          my $sw;
80                  my ($fld,$val) = split(/:/,$s,2);  
81                  $s = "$fld=($val)";          my $sort_map = {
82                    'date' => 'date_utime',
83                    'from' => 'from_phrase',
84                    'to' => 'to_phrase',
85                    'rank' => 'swishrank',
86            };
87    
88            my $sort_by;
89    
90            foreach my $s (@_) {
91    
92                    if ($s =~ /^\s*(\w+):(.+)\s*$/) {
93                            my ($f,$v) = ($1,$2);
94                            if (lc($f) eq "sort") {
95                                    my ($sf,$sv) = split(/ /,$v,2);
96                                    $sort_by = $sort_map->{$sf} || croak "unsupported sort by field $v - fix sort_map";
97                                    $sort_by .= " $sv";
98                            } else {
99                                    $sw .= "$f=($v)";
100                            }
101                    } else {
102                            # and/or/not operators
103                            $sw .= " $s ";
104                    }
105          }          }
106    
107          print STDERR "swish search: $s\n";          print STDERR "swish search: $sw\n";
108    
109          # convert to UTF-8          # convert to UTF-8
110          $s = $iso2utf->convert($s) || $s;          $sw = $iso2utf->convert($sw) || $sw;
111          my $results = $index->Query($s);  
112            my $search = $index->New_Search_Object;
113            $search->SetSort( $sort_by );
114            my $results = $search->Execute($sw);
115    
116          # store total number of hits          # store total number of hits
117          $self->{'total_hits'} = $results->Hits;          $self->{'total_hits'} = $results->Hits;
# Line 62  sub search_index { Line 124  sub search_index {
124    
125                  sub p($$) {                  sub p($$) {
126                          my ($r,$prop) = @_;                          my ($r,$prop) = @_;
127                          $prop = $r->Property($prop);                          eval { $prop = $r->Property($prop) } || return;
128                          $prop =~ s/##lf##/\n/gs;                          $prop =~ s/##lf##/\n/gs;
129                          return $utf2iso->convert($prop);                          return $utf2iso->convert($prop);
130                  }                  }
# Line 72  sub search_index { Line 134  sub search_index {
134    
135                  foreach my $p (qw(from to cc bcc)) {                  foreach my $p (qw(from to cc bcc)) {
136                          @{$self->{cache}->{$id}->{$p}} = ();                          @{$self->{cache}->{$id}->{$p}} = ();
137                          foreach my $v (split(/##/, p($r,$p.'_phrase'))) {                          my $props = p($r,$p.'_phrase') || last;
138                            foreach my $v (split(/##/, $props)) {
139                                  push @{$self->{cache}->{$id}->{$p}}, $v;                                  push @{$self->{cache}->{$id}->{$p}}, $v;
140                                  $self->add_counter($p,$v);                                  $self->add_counter($p,$v);
141                          }                          }
142                  }                  }
143    
144                  foreach my $p (qw(subject body date)) {                  foreach my $p (qw(subject body date date_utime)) {
145                          $self->{cache}->{$id}->{$p} = p($r,$p);                          $self->{cache}->{$id}->{$p} = p($r,$p);
146                  }                  }
147    
# Line 93  sub search_index { Line 156  sub search_index {
156          return @res_ids;          return @res_ids;
157  }  }
158    
159    =head2 create_index
160    
161    This function can be null for indexes which doesn't need special
162    setup before add_index is called. however, swish-e support will
163    exec swish binary to create index at this point.
164    
165    Exec is used to free memory of parent process (because swish-e will
166    again start this script, but with --recursive) and free it's memory.
167    
168    This has unfortunate efect that all output from swish-e will also be displayed.
169    If C<STDOUT> of swish-e is recirected to C</dev/null> we will loose errors
170    from this module (allthough they are directed to STDERR).
171    
172    =cut
173    
174    sub create_index {
175            my $self = shift;
176    
177            my $index_prog = $0 || die "can't deduce my own name!";
178            my $config_file = $self->{config_file} || die "no self->config_file";
179            my $index_file = $self->{index_dir} || die "no self->index_dir";
180            $index_file .= "/";
181            $index_file .= $self->{config_name} || die "no self->config_name";
182    
183            my ($tmp_fh, $swish_config_file) = mkstemp("/tmp/swishXXXXX");
184    
185            print STDERR "creating swish-e configuration file $swish_config_file\n";
186    
187            my $swish_config = qq{
188    # swish-e config file
189    
190    IndexDir $index_prog
191    SwishProgParameters --recursive $config_file
192    
193    # input file definition
194    DefaultContents XML2
195    
196    # indexed metatags
197    MetaNames xml swishdocpath
198    
199    # stored metatags
200    PropertyNames from_phrase from_address
201    PropertyNames to_phrase to_address
202    PropertyNames cc_phrase cc_address
203    PropertyNames subject body
204    #PropertyNamesDate date
205    PropertyNamesNumeric date_utime
206    PropertyNames date
207    
208    #XMLClassAttributes type
209    UndefinedMetaTags auto
210    UndefinedXMLAttributes auto
211    
212    IndexFile $index_file
213    
214    # Croatian ISO-8859-2 characters to unaccented equivalents
215    TranslateCharacters ¹©ðÐèÈæƾ® ssddcccczz
216    
217    
218    # disable output
219    ParserWarnLevel 0
220    IndexReport 0
221    
222    };
223    
224            print $tmp_fh $swish_config;
225            close($tmp_fh);
226    
227            exec "swish-e -S prog -c $swish_config_file" || die "can't fork swish with $swish_config_file";
228            exit 0;
229    
230    }
231    
232    =head2 add_index
233    
234    This function will add document to index.
235    
236     $self->add_index('mailbox messageid', $document);
237    
238    =cut
239    
240  sub add_index {  sub add_index {
241          my $self = shift;          my $self = shift;
242    
# Line 120  sub add_index { Line 264  sub add_index {
264    
265  }  }
266    
267    =head2 close_index
268    
269    Close index at end (dummy for swish-e).
270    
271    =cut
272    
273  sub close_index {  sub close_index {
274          my $self = shift;          my $self = shift;
275    
276  }  }
277    
278    =head2 apropos_index
279    
280    This method is optional (it can return undef) and it returns words which
281    sound like word specified.
282    
283     my @words = $self->apropos_index('word')
284    
285    This implementation uses L<Text::Soundex>.
286    
287    =cut
288    
289    sub apropos_index {
290            my $self = shift;
291    
292            my $fld = shift || croak "apropos_index need field";
293            my $words = shift || return;
294    
295            my @a;
296    
297            foreach my $word (split(/\s+/,$words)) {
298    
299                    my $hash = soundex($word);
300                    my $c = substr($word,0,1);
301    
302                    my $index = $self->open_index;
303                    my $index_file = $self->{index_dir}."/".$self->{config_name};
304    
305                    open(SWISH,"swish-e -f $index_file -k $c |") || die "can't start swish-e";
306                    my @k_arr;
307                    while(<SWISH>) {
308                            next if (/^#/);
309                            s/^.+?:\s+//;
310                            @k_arr = split(/\s+/);
311                    }
312    
313                    foreach my $k (@k_arr) {
314                            push @a, $k if (soundex($k) eq $hash);
315                    }
316            }
317    
318    #       print STDERR "apropos_index($fld,$word) [$hash]: ",join(" ",@a),"\n";
319            return @a;
320    
321    }
322    
323  1;  1;

Legend:
Removed from v.20  
changed lines
  Added in v.55

  ViewVC Help
Powered by ViewVC 1.1.26