1 |
package SWISH::Split; |
2 |
|
3 |
use 5.008; |
4 |
use strict; |
5 |
use warnings; |
6 |
|
7 |
our $VERSION = '0.00'; |
8 |
|
9 |
use SWISH::API; |
10 |
use Text::Iconv; |
11 |
use File::Temp qw/ :mktemp /; |
12 |
use Carp; |
13 |
use Digest::MD5 qw(md5_hex); |
14 |
use Memoize; |
15 |
|
16 |
use Data::Dumper; |
17 |
|
18 |
=head1 NAME |
19 |
|
20 |
SWISH::Split - Perl interface to split index variant of Swish-e |
21 |
|
22 |
=head1 SYNOPSIS |
23 |
|
24 |
use SWISH::Split; |
25 |
|
26 |
|
27 |
=head1 DESCRIPTION |
28 |
|
29 |
This is an alternative interface for indexing data with swish-e. It is
designed to split indexes over multiple files, so that records in the index
can be updated by reindexing just the parts that changed.
32 |
|
33 |
Data is stored in the index using an interface which is somewhat similar to
L<Plucene::Simple>. This could make your migration (or supporting two index
engines) easier.
36 |
|
37 |
In the background, it will fork swish-e binaries (one for each index slice)
and produce UTF-8 encoded XML files. So, if your input charset isn't
C<ISO-8859-1>, you will have to specify it.
40 |
|
41 |
=head1 Methods used for indexing |
42 |
|
43 |
=head2 open |
44 |
|
45 |
Create new object for index. |
46 |
|
47 |
    my $i = SWISH::Split->open(
        index => '/path/to/index',
        slice_name => \&slice_on_path,
        slices => 30,
        merge => 1,
        codepage => 'ISO-8859-2'
    );
54 |
|
55 |
    # split index on first component of path
    sub slice_on_path {
        return (split(/\//, $_[0]))[0];
    }
59 |
|
60 |
|
61 |
C<slices> is the maximum number of index slices. See L<"in_slice"> for
more explanation.
63 |
|
64 |
=cut |
65 |
|
66 |
# Converter from the input codepage to UTF-8, shared by the whole module.
# It defaults to ISO-8859-1 and is replaced by open() when a codepage is
# given, so the most recently opened index decides the conversion in use.
my $iso2utf = Text::Iconv->new('ISO-8859-1','UTF-8');

# Create a new index object.
#
# Options may be passed either as a flat key/value list or as a single hash
# reference:
#
#   index      - path to an existing directory holding the index (required)
#   slice_name - reference to the function mapping a path to a slice name
#                (required)
#   slices     - maximum number of index slices (required)
#   codepage   - input charset, converted to UTF-8 (default ISO-8859-1)
#
# Any other options are stored in the object unchanged.
#
# Returns the blessed object; croaks when a required option is missing or
# when the index directory does not exist.
sub open {
    my $class = shift;

    # Copy the options so that a hash passed by reference is never blessed
    # or modified behind the caller's back.
    my %args = (@_ == 1 && ref($_[0]) eq 'HASH') ? %{ $_[0] } : @_;
    my $self = bless { %args }, $class;

    croak "need slice_name coderef" unless ref $self->{'slice_name'};
    croak "need slices" unless $self->{'slices'};

    croak "need index" unless $self->{'index'};
    croak "index '",$self->{'index'},"' doesn't exist" unless -e $self->{'index'};
    croak "index '",$self->{'index'},"' is not directory" unless -d $self->{'index'};

    $iso2utf = Text::Iconv->new($self->{'codepage'},'UTF-8') if ($self->{'codepage'});

    # Memoize in_slice only once per process: calling memoize() again would
    # wrap the already memoized function in yet another caching layer.
    our $in_slice_memoized;
    memoize('in_slice') unless $in_slice_memoized++;

    return $self;
}
89 |
|
90 |
=head2 add |
91 |
|
92 |
Add document to index. |
93 |
|
94 |
$i->add($swishpath, { |
95 |
headline => 'foobar result', |
96 |
property => 'data', |
97 |
}) |
98 |
|
99 |
=cut |
100 |
|
101 |
# Add a document to the index.
#
#   $swishpath - path identifying the document
#   $data      - hash reference with the properties of the document
#
# Returns 1 on success and nothing when either argument is missing or false.
# NOTE: currently only validates its arguments; nothing is stored yet.
sub add {
    my ($self, $swishpath, $data) = @_;

    return unless $swishpath;
    return unless $data;

    return 1;
}
109 |
|
110 |
=head2 delete |
111 |
|
112 |
Delete documents from index. |
113 |
|
114 |
$i->delete(@swishpath); |
115 |
|
116 |
=cut |
117 |
|
118 |
# Delete documents from the index.
#
#   @paths - swishpaths of the documents to delete
#
# Returns nothing when called without paths.
# TODO: deletion itself is not implemented yet; 42 is a placeholder result.
sub delete {
    my $self = shift;

    # Check for emptiness before copying: `@_ || return` would evaluate @_
    # in scalar context and store the argument count instead of the paths.
    return unless @_;
    my @paths = @_;

    return 42;
}
125 |
|
126 |
|
127 |
=head2 close |
128 |
|
129 |
Close index file and finish indexing. |
130 |
|
131 |
$i->close; |
132 |
|
133 |
This is the most time-consuming operation. When it's called, it will re-index
all entries which haven't changed in all slices.
135 |
|
136 |
=cut |
137 |
|
138 |
# Close the index and finish indexing.
#
# Always returns 1.
# NOTE: currently a placeholder; no re-indexing is performed here yet.
sub close {
    my ($self) = @_;

    return 1;
}
143 |
|
144 |
|
145 |
|
146 |
=head1 Reporting methods |
147 |
|
148 |
These methods return statistics about your index.
149 |
|
150 |
=head2 swishpaths |
151 |
|
152 |
Return array of C<swishpath>s in index. |
153 |
|
154 |
my @p = $i->swishpaths; |
155 |
|
156 |
=cut |
157 |
|
158 |
# Return the list of swishpaths in the index.
#
# TODO: not implemented yet. An explicit empty return is used so that the
# object itself does not leak out as the (only) "path" through the implicit
# return value of the last statement.
sub swishpaths {
    my $self = shift;

    return;
}
161 |
|
162 |
=head2 swishpaths_updated |
163 |
|
164 |
Return array with updated C<swishpath>s. |
165 |
|
166 |
my @d = $i->swishpaths_updated; |
167 |
|
168 |
=cut |
169 |
|
170 |
# Return the list of swishpaths which were updated.
#
# TODO: not implemented yet. An explicit empty return is used so that the
# object itself does not leak out as the (only) "path" through the implicit
# return value of the last statement.
sub swishpaths_updated {
    my $self = shift;

    return;
}
173 |
|
174 |
|
175 |
=head2 swishpaths_deleted |
176 |
|
177 |
Return array with deleted C<swishpath>s. |
178 |
|
179 |
my $n = $i->swishpaths_deleted; |
180 |
|
181 |
=cut |
182 |
|
183 |
# Return the list of swishpaths which were deleted.
#
# TODO: not implemented yet. An explicit empty return is used so that the
# object itself does not leak out as the (only) "path" through the implicit
# return value of the last statement.
sub swishpaths_deleted {
    my $self = shift;

    return;
}
186 |
|
187 |
|
188 |
=head2 slices |
189 |
|
190 |
Return array with all slice names. |
191 |
|
192 |
my @s = $i->slices; |
193 |
|
194 |
=cut |
195 |
|
196 |
# Return the list of all slice names.
#
# TODO: not implemented yet. An explicit empty return is used so that the
# object itself does not leak out as the (only) "slice name" through the
# implicit return value of the last statement.
sub slices {
    my $self = shift;

    return;
}
199 |
|
200 |
=head1 Helper methods |
201 |
|
202 |
These methods are used internally, but they might be useful.
203 |
|
204 |
=head2 in_slice |
205 |
|
206 |
Takes a path and returns the slice to which this path belongs.
207 |
|
208 |
my $s = $i->in_slice('path/to/document/in/index'); |
209 |
|
210 |
If the C<slices> parameter was given to L<"open">, an MD5 hash is used to
spread documents across slices. That produces a random distribution of your
documents over slices, which might or might not be best for your data. If
you have to re-index a large number of slices on each run, think about
creating your own C<slice_name> function and distributing documents manually
across slices.
216 |
|
217 |
This function is C<Memoize>ed for performance reasons. |
218 |
|
219 |
=cut |
220 |
|
221 |
# Map a document path to the slice it belongs to.
#
#   $path - swishpath of the document (required)
#
# The path is first passed through the slice_name function. When the object
# has a true 'slices' value, the result is hashed with MD5, the first 8 hex
# digits of the digest are taken as a number and reduced modulo 'slices',
# giving an integer in the range 0 .. slices-1. Otherwise the result of the
# slice_name function is returned as it is.
#
# Confesses when the path or the slice_name function is missing.
sub in_slice {
    my $self = shift;

    # use defined/length rather than truth, so that a path of "0" is valid
    my $path = shift;
    confess "need path" unless defined $path && length $path;

    my $slice_name = $self->{'slice_name'};
    confess "need slice_name function" unless ref ($slice_name);

    # without a slice limit the slice_name function alone picks the slice
    return $slice_name->($path) unless $self->{'slices'};

    # first, pass path through slice_name function
    my $slice = $slice_name->($path);
    # then calculate MD5 hash and take first 8 chars to produce number
    # FIXME how random is this?
    my $slice_nr = hex(substr(md5_hex($slice),0,8));

    return ($slice_nr % $self->{'slices'});
}
244 |
|
245 |
=head2 find_paths |
246 |
|
247 |
Return array of C<swishpath>s for given C<swish-e> query. |
248 |
|
249 |
my @p = $i->find_paths("headline=test*"); |
250 |
|
251 |
Useful for combining with L<"delete"> to delete documents
which haven't changed in a while (and so have expired).
253 |
|
254 |
=cut |
255 |
|
256 |
# Return the list of swishpaths matching the given swish-e query.
#
#   $s - swish-e query string, e.g. "headline=test*"
#
# Returns nothing when no query is given.
# TODO: searching is not implemented yet. An explicit empty return is used so
# that the query string itself is not handed back as if it were a result.
sub find_paths {
    my $self = shift;

    my $s = shift || return;

    return;
}
261 |
|
262 |
|
263 |
|
264 |
1; |
265 |
__END__ |
266 |
|
267 |
|
268 |
=head2 Searching |
269 |
|
270 |
Searching is still conducted using L<SWISH::API>, but you have to glob |
271 |
index names. |
272 |
|
273 |
use SWISH::API; |
274 |
|
275 |
my $swish = SWISH::API->new( glob('index.swish-e/*') ); |
276 |
|
277 |
Alternatively, you can create a merged index (using the C<merge> option) and
not change your source code at all.
279 |
|
280 |
That would also benefit performance, but it increases indexing time |
281 |
because merged indexes must be re-created on each indexing run. |
282 |
|
283 |
=head2 EXPORT |
284 |
|
285 |
None by default. |
286 |
|
287 |
|
288 |
|
289 |
=head1 SEE ALSO |
290 |
|
291 |
L<SWISH::API>, |
292 |
L<http://www.swish-e.org/> |
293 |
|
294 |
=head1 AUTHOR |
295 |
|
296 |
Dobrica Pavlinusic, E<lt>dpavlin@rot13.orgE<gt> |
297 |
|
298 |
=head1 COPYRIGHT AND LICENSE |
299 |
|
300 |
Copyright (C) 2004 by Dobrica Pavlinusic |
301 |
|
302 |
This library is free software; you can redistribute it and/or modify |
303 |
it under the same terms as Perl itself, either Perl version 5.8.4 or, |
304 |
at your option, any later version of Perl 5 you may have available. |
305 |
|
306 |
|
307 |
=cut |