--- trunk/MWS_swish.pm 2004/05/07 23:35:39 20 +++ trunk/MWS_swish.pm 2004/05/09 00:09:32 30 @@ -10,6 +10,8 @@ use SWISH::API; use Text::Iconv; use Data::Dumper; +use File::Temp qw/ :mktemp /; +use Text::Soundex; my $iso2utf = Text::Iconv->new('ISO-8859-2','UTF-8'); my $utf2iso = Text::Iconv->new('UTF-8','ISO-8859-2'); @@ -21,8 +23,7 @@ if (! $swish) { - my $index_file = $self->{index_file} || croak "open_index needs index filename"; - $index_file .= "/swish-e"; + my $index_file = $self->{index_dir}."/".$self->{config_name}; print STDERR "opening index '$index_file'\n"; $swish = SWISH::API->new($index_file); $swish->AbortLastError if $swish->Error; @@ -36,20 +37,46 @@ sub search_index { my $self = shift; - my $s = shift || croak "search_index needs query"; + croak "search_index needs query" if (! @_); my $index = $self->open_index; - if ($s =~ /:/) { - my ($fld,$val) = split(/:/,$s,2); - $s = "$fld=($val)"; + my $sw; + + my $sort_map = { + 'date' => 'date_utime', + 'from' => 'from_phrase', + 'to' => 'to_phrase', + 'rank' => 'swishrank', + }; + + my $sort_by; + + foreach my $s (@_) { + + if ($s =~ /^\s*(\w+):(.+)\s*$/) { + my ($f,$v) = ($1,$2); + if (lc($f) eq "sort") { + my ($sf,$sv) = split(/ /,$v,2); + $sort_by = $sort_map->{$sf} || croak "unsupported sort by field $v - fix sort_map"; + $sort_by .= " $sv"; + } else { + $sw .= "$f=($v)"; + } + } else { + # and/or/not operators + $sw .= " $s "; + } } - print STDERR "swish search: $s\n"; + print STDERR "swish search: $sw\n"; # convert to UTF-8 - $s = $iso2utf->convert($s) || $s; - my $results = $index->Query($s); + $sw = $iso2utf->convert($sw) || $sw; + + my $search = $index->New_Search_Object; + $search->SetSort( $sort_by ); + my $results = $search->Execute($sw); # store total number of hits $self->{'total_hits'} = $results->Hits; @@ -78,7 +105,7 @@ } } - foreach my $p (qw(subject body date)) { + foreach my $p (qw(subject body date date_utime)) { $self->{cache}->{$id}->{$p} = p($r,$p); } @@ -93,6 +120,67 @@ return @res_ids; } +# this function can be null for indexes which doesn't need special +# setup before add_index is called. however, swish-e support will +# fork swish binary to create index at this point +sub create_index { + my $self = shift; + + my $index_prog = $0 || die "can't deduce my own name!"; + my $config_file = $self->{config_file} || die "no self->config_file"; + my $index_file = $self->{index_dir} || die "no self->index_dir"; + $index_file .= "/"; + $index_file .= $self->{config_name} || die "no self->config_name"; + + my ($tmp_fh, $swish_config_file) = mkstemp("/tmp/swishXXXXX"); + + print STDERR "creating swish-e configuration file $swish_config_file\n"; + + my $swish_config = qq{ +# swish-e config file + +IndexDir $index_prog +SwishProgParameters --recursive $config_file + +# input file definition +DefaultContents XML2 + +# indexed metatags +MetaNames xml swishdocpath + +# stored metatags +PropertyNames from_phrase from_address +PropertyNames to_phrase to_address +PropertyNames cc_phrase cc_address +PropertyNames subject body +#PropertyNamesDate date +PropertyNamesNumeric date_utime +PropertyNames date + +#XMLClassAttributes type +UndefinedMetaTags auto +UndefinedXMLAttributes auto + +IndexFile $index_file + +# Croatian ISO-8859-2 characters to unaccented equivalents +TranslateCharacters ¹©ðÐèÈæƾ® ssddcccczz + + +# debug +ParserWarnLevel 3 +IndexReport 1 + +}; + + print $tmp_fh $swish_config; + close($tmp_fh); + + exec "swish-e -S prog -c $swish_config_file" || die "can't fork swish with $swish_config_file"; + exit 0; + +} + sub add_index { my $self = shift; @@ -125,4 +213,39 @@ } +# this is optional function which return words which sound like +sub apropos_index { + my $self = shift; + + my $fld = shift || croak "apropos_index need field"; + my $words = shift || return; + + my @a; + + foreach my $word (split(/\s+/,$words)) { + + my $hash = soundex($word); + my $c = substr($word,0,1); + + my $index = $self->open_index; + my $index_file = $self->{index_dir}."/".$self->{config_name}; + + open(SWISH,"swish-e -f $index_file -k $c |") || die "can't start swish-e"; + my @k_arr; + while() { + next if (/^#/); + s/^.+?:\s+//; + @k_arr = split(/\s+/); + } + + foreach my $k (@k_arr) { + push @a, $k if (soundex($k) eq $hash); + } + } + +# print STDERR "apropos_index($fld,$word) [$hash]: ",join(" ",@a),"\n"; + return @a; + +} + 1;