/[mws]/trunk/MWS_swish.pm
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Annotation of /trunk/MWS_swish.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 27 - (hide annotations)
Sat May 8 20:34:26 2004 UTC (20 years ago) by dpavlin
File size: 4597 byte(s)
this is 0.9-rc1:
- scripts now accept configuration file as parametar, if none is specified,
  they will use global.conf in current directory
- more css design
- how to install using PAR
- mbox2index can now call swish-e by itself, swish-e configuration
  moved to MWS_swish.pm
- httpd server now shows 30 newest messages in this year when accessed
  through root URL

1 dpavlin 12 #!/usr/bin/perl -w
2    
3     use strict;
4    
5     #
6     # simple implementation to use SWISH-e with queryies like
7     # Lucene (subject:something)
8     #
9    
10     use SWISH::API;
11 dpavlin 14 use Text::Iconv;
12     use Data::Dumper;
13 dpavlin 27 use File::Temp qw/ :mktemp /;
14 dpavlin 12
15 dpavlin 14 my $iso2utf = Text::Iconv->new('ISO-8859-2','UTF-8');
16     my $utf2iso = Text::Iconv->new('UTF-8','ISO-8859-2');
17    
18 dpavlin 12 sub open_index {
19     my $self = shift;
20    
21     my $swish = $self->{index};
22    
23     if (! $swish) {
24    
25 dpavlin 27 my $index_file = $self->{index_dir}."/".$self->{config_name};
26 dpavlin 12 print STDERR "opening index '$index_file'\n";
27     $swish = SWISH::API->new($index_file);
28     $swish->AbortLastError if $swish->Error;
29    
30     $self->{index} = $swish;
31     }
32    
33     return $swish;
34     }
35    
36     sub search_index {
37     my $self = shift;
38    
39 dpavlin 21 croak "search_index needs query" if (! @_);
40 dpavlin 12
41     my $index = $self->open_index;
42    
43 dpavlin 21 my $sw;
44    
45 dpavlin 24 my $sort_map = {
46     'date' => 'date_utime',
47     'from' => 'from_phrase',
48     'to' => 'to_phrase',
49     'rank' => 'swishrank',
50     };
51    
52     my $sort_by;
53    
54 dpavlin 21 foreach my $s (@_) {
55    
56     if ($s =~ /^\s*(\w+):(.+)\s*$/) {
57 dpavlin 24 my ($f,$v) = ($1,$2);
58     if (lc($f) eq "sort") {
59     my ($sf,$sv) = split(/ /,$v,2);
60     $sort_by = $sort_map->{$sf} || croak "unsupported sort by field $v - fix sort_map";
61     $sort_by .= " $sv";
62     } else {
63     $sw .= "$f=($v)";
64     }
65 dpavlin 21 } else {
66     # and/or/not operators
67     $sw .= " $s ";
68     }
69 dpavlin 12 }
70    
71 dpavlin 21 print STDERR "swish search: $sw\n";
72 dpavlin 18
73     # convert to UTF-8
74 dpavlin 21 $sw = $iso2utf->convert($sw) || $sw;
75 dpavlin 12
76 dpavlin 24 my $search = $index->New_Search_Object;
77     $search->SetSort( $sort_by );
78     my $results = $search->Execute($sw);
79    
80 dpavlin 17 # store total number of hits
81     $self->{'total_hits'} = $results->Hits;
82    
83 dpavlin 14 my @res_ids;
84 dpavlin 12
85 dpavlin 18 my $count = 1;
86 dpavlin 17
87 dpavlin 14 while ( my $r = $results->NextResult ) {
88 dpavlin 13
89 dpavlin 14 sub p($$) {
90     my ($r,$prop) = @_;
91     $prop = $r->Property($prop);
92     $prop =~ s/##lf##/\n/gs;
93     return $utf2iso->convert($prop);
94     }
95 dpavlin 13
96 dpavlin 14 my $id = p($r,"swishdocpath");
97     push @res_ids, $id;
98    
99 dpavlin 13 foreach my $p (qw(from to cc bcc)) {
100 dpavlin 19 @{$self->{cache}->{$id}->{$p}} = ();
101     foreach my $v (split(/##/, p($r,$p.'_phrase'))) {
102     push @{$self->{cache}->{$id}->{$p}}, $v;
103     $self->add_counter($p,$v);
104     }
105 dpavlin 13 }
106    
107 dpavlin 23 foreach my $p (qw(subject body date date_utime)) {
108 dpavlin 14 $self->{cache}->{$id}->{$p} = p($r,$p);
109 dpavlin 13 }
110    
111 dpavlin 20 $self->add_counter_calendar(p($r,'date_utime'));
112    
113 dpavlin 13 # this is redundant, but needed for templates later...
114     $self->{cache}->{$id}->{'id'} = $id;
115 dpavlin 17
116 dpavlin 18 last if (++$count > $self->{max_results});
117 dpavlin 12 }
118    
119 dpavlin 14 return @res_ids;
120 dpavlin 12 }
121    
122 dpavlin 27 # this function can be null for indexes which doesn't need special
123     # setup before add_index is called. however, swish-e support will
124     # fork swish binary to create index at this point
125     sub create_index {
126     my $self = shift;
127    
128     my $index_prog = $0 || die "can't deduce my own name!";
129     my $config_file = $self->{config_file} || die "no self->config_file";
130     my $index_file = $self->{index_dir} || die "no self->index_dir";
131     $index_file .= "/";
132     $index_file .= $self->{config_name} || die "no self->config_name";
133    
134     my ($tmp_fh, $swish_config_file) = mkstemp("/tmp/swishXXXXX");
135    
136     print STDERR "creating swish-e configuration file $swish_config_file\n";
137    
138     my $swish_config = qq{
139     # swish-e config file
140    
141     IndexDir $index_prog
142     SwishProgParameters --recursive $config_file
143    
144     # input file definition
145     DefaultContents XML2
146    
147     # indexed metatags
148     MetaNames xml swishdocpath
149    
150     # stored metatags
151     PropertyNames from_phrase from_address
152     PropertyNames to_phrase to_address
153     PropertyNames cc_phrase cc_address
154     PropertyNames subject body
155     #PropertyNamesDate date
156     PropertyNamesNumeric date_utime
157     PropertyNames date
158    
159     #XMLClassAttributes type
160     UndefinedMetaTags auto
161     UndefinedXMLAttributes auto
162    
163     IndexFile $index_file
164    
165     # Croatian ISO-8859-2 characters to unaccented equivalents
166     TranslateCharacters ¹©ðÐèÈæƾ® ssddcccczz
167    
168    
169     # debug
170     ParserWarnLevel 3
171     IndexReport 1
172    
173     };
174    
175     print $tmp_fh $swish_config;
176     close($tmp_fh);
177    
178     exec "swish-e -S prog -c $swish_config_file" || die "can't fork swish with $swish_config_file";
179     exit 0;
180    
181     }
182    
183 dpavlin 12 sub add_index {
184     my $self = shift;
185 dpavlin 14
186     my $mbox_id = shift || croak "add_index needs mbox_id";
187     my $document = shift || croak "add_index needs document";
188    
189     my ($mbox,$id) = split(/\s/,$mbox_id,2);
190    
191     my $xml = qq{<message>};
192     foreach my $tag (keys %$document) {
193     my $data = $document->{$tag};
194 dpavlin 16 next if (! $data || $data eq '');
195 dpavlin 14 # save [cr/]lf before conversion to XML
196     $data =~ s/\n\r/##lf##/gs;
197     $data =~ s/\n/##lf##/gs;
198 dpavlin 16 $xml .= "<$tag><![CDATA[".$data."]]></$tag>\n";
199 dpavlin 14 }
200     $xml .= qq{</message>};
201    
202     $xml = $iso2utf->convert($xml);
203     use bytes; # as opposed to chars
204     print "Path-Name: $mbox $id\n";
205     print "Content-Length: ".(length($xml)+1)."\n";
206     print "Document-Type: XML\n\n$xml\n";
207    
208 dpavlin 12 }
209    
210     sub close_index {
211     my $self = shift;
212    
213     }
214    
215     1;

  ViewVC Help
Powered by ViewVC 1.1.26