1 |
package SWISH::Split; |
2 |
|
3 |
use 5.008; |
4 |
use strict; |
5 |
use warnings; |
6 |
|
7 |
our $VERSION = '0.00'; |
8 |
|
9 |
use SWISH::API; |
10 |
use Text::Iconv; |
11 |
use File::Temp qw/ :mktemp /; |
12 |
use Carp; |
13 |
use Digest::MD5 qw(md5_hex); |
14 |
use Memoize; |
15 |
|
16 |
use Data::Dumper; |
17 |
|
18 |
=head1 NAME |
19 |
|
20 |
SWISH::Split - Perl interface to split index variant of Swish-e |
21 |
|
22 |
=head1 SYNOPSIS |
23 |
|
24 |
use SWISH::Split; |
25 |
|
26 |
|
27 |
=head1 DESCRIPTION |
28 |
|
29 |
This is an alternative interface for indexing data with swish-e. It's designed
to split indexes over multiple files to allow updates of records in the index
by reindexing just the changed parts.
32 |
|
33 |
Data is stored in the index using an interface which is somewhat similar to
L<Plucene::Simple>. This could make your migration (or supporting two index
engines) easier.
36 |
|
37 |
In the background, it will fork swish-e binaries (one for each index slice)
and produce UTF-8 encoded XML files. So, if your input charset isn't
C<ISO-8859-1> you will have to specify it.
40 |
|
41 |
=head1 Methods used for indexing |
42 |
|
43 |
=head2 open |
44 |
|
45 |
Create new object for index. |
46 |
|
47 |
 my $i = SWISH::Split->open(
     index      => '/path/to/index',
     slice_name => \&slice_on_path,
     slices     => 30,
     merge      => 1,
     codepage   => 'ISO-8859-2',
 );
54 |
|
55 |
 # split index on first component of path
 sub slice_on_path {
     return (split(/\//, $_[0]))[0];
 }
59 |
|
60 |
|
61 |
C<slices> is maximum number of index slices. See L<"in_slice"> for |
62 |
more explanation. |
63 |
|
64 |
=cut |
65 |
|
66 |
# Module-wide converter from the input charset to UTF-8. It starts out as
# ISO-8859-1 and is replaced by open() when a 'codepage' option is given, so
# the most recent open() call determines the charset for the whole module.
my $iso2utf = Text::Iconv->new('ISO-8859-1','UTF-8');
67 |
|
68 |
sub open {
    # Constructor. Options may be given either as a flat key/value list or
    # as a single hash reference:
    #
    #   slice_name  (required) reference to a function mapping path -> slice name
    #   slices      (required) maximum number of index slices
    #   index       (required) path to an existing index directory
    #   codepage    (optional) input charset; replaces the module-wide
    #               converter to UTF-8
    #
    # Returns the new object; croaks when a required option is missing or
    # the index directory does not exist.
    my $class = shift;

    # Copy a hash-ref argument so the caller's own hash is never blessed.
    my $self = ( @_ == 1 && ref( $_[0] ) eq 'HASH' ) ? { %{ $_[0] } } : {@_};
    bless( $self, $class );

    croak "need slice_name coderef" unless ref $self->{'slice_name'};
    croak "need slices" unless $self->{'slices'};

    croak "need index" unless $self->{'index'};
    croak "index '", $self->{'index'}, "' doesn't exist"
        unless -e $self->{'index'};
    croak "index '", $self->{'index'}, "' is not directory"
        unless -d $self->{'index'};

    # The converter is shared by every object created from this module.
    $iso2utf = Text::Iconv->new( $self->{'codepage'}, 'UTF-8' )
        if $self->{'codepage'};

    # Install the memoizing wrapper only once; calling memoize() again on
    # each open() would wrap the already-wrapped function another time.
    our $IN_SLICE_MEMOIZED;
    memoize('in_slice') unless $IN_SLICE_MEMOIZED++;

    return $self;
}
89 |
|
90 |
=head2 add |
91 |
|
92 |
Add document to index. |
93 |
|
94 |
$i->add($swishpath, { |
95 |
headline => 'foobar result', |
96 |
property => 'data', |
97 |
}) |
98 |
|
99 |
=cut |
100 |
|
101 |
sub add {
    # Stub: nothing is indexed yet. Arguments after the invocant are
    # ignored and the invocant itself is returned.
    my ($self) = @_;
    return $self;
}
104 |
|
105 |
=head2 delete |
106 |
|
107 |
Delete document from index. |
108 |
|
109 |
$i->delete($swishpath); |
110 |
|
111 |
=cut |
112 |
|
113 |
sub delete {
    # Stub: nothing is removed yet. Arguments after the invocant are
    # ignored and the invocant itself is returned.
    my ($self) = @_;
    return $self;
}
116 |
|
117 |
|
118 |
=head2 close |
119 |
|
120 |
Close index file and finish indexing. |
121 |
|
122 |
$i->close; |
123 |
|
124 |
This is the most time-consuming operation. When it's called, it will re-index
all entries which haven't changed in all slices.
126 |
|
127 |
=cut |
128 |
|
129 |
sub close {
    # Stub: no indexing work is performed yet; the invocant is returned.
    my ($self) = @_;
    return $self;
}
132 |
|
133 |
|
134 |
|
135 |
=head1 Reporting methods |
136 |
|
137 |
These methods return statistics about your index.
138 |
|
139 |
=head2 swishpaths |
140 |
|
141 |
Return array of C<swishpath>s in index. |
142 |
|
143 |
my @p = $i->swishpaths; |
144 |
|
145 |
=cut |
146 |
|
147 |
sub swishpaths {
    # Stub: no path list is produced yet; the invocant is returned.
    my ($self) = @_;
    return $self;
}
150 |
|
151 |
=head2 swishpaths_updated |
152 |
|
153 |
Return array with updated C<swishpath>s. |
154 |
|
155 |
my @d = $i->swishpaths_updated; |
156 |
|
157 |
=cut |
158 |
|
159 |
sub swishpaths_updated {
    # Stub: no list of updated paths is produced yet; the invocant is
    # returned.
    my ($self) = @_;
    return $self;
}
162 |
|
163 |
|
164 |
=head2 swishpaths_deleted |
165 |
|
166 |
Return array with deleted C<swishpath>s. |
167 |
|
168 |
my $n = $i->swishpaths_deleted; |
169 |
|
170 |
=cut |
171 |
|
172 |
sub swishpaths_deleted {
    # Stub: no list of deleted paths is produced yet; the invocant is
    # returned.
    my ($self) = @_;
    return $self;
}
175 |
|
176 |
|
177 |
=head2 slices |
178 |
|
179 |
Return array with all slice names. |
180 |
|
181 |
my @s = $i->slices; |
182 |
|
183 |
=cut |
184 |
|
185 |
sub slices {
    # Stub: no slice names are produced yet; the invocant is returned.
    my ($self) = @_;
    return $self;
}
188 |
|
189 |
=head1 Helper methods |
190 |
|
191 |
These methods are used internally, but they might be useful.
192 |
|
193 |
=head2 in_slice |
194 |
|
195 |
Takes a path and returns the slice to which this path belongs.
196 |
|
197 |
my $s = $i->in_slice('path/to/document/in/index'); |
198 |
|
199 |
If the C<slices> parameter was given to L<"open">, it will use an
MD5 hash to spread documents across slices. That will produce a random
distribution of your documents in slices, which might or might not be best
for your data. If you have to re-index a large number of slices on each
run, think about creating your own C<slice_name> function and distributing
documents manually across slices.
205 |
|
206 |
This function is C<Memoize>ed for performance reasons. |
207 |
|
208 |
=cut |
209 |
|
210 |
sub in_slice {
    # Map a document path to the slice it belongs to.
    #
    #   $self->in_slice($path)
    #
    # With a 'slices' limit the slice name produced by the 'slice_name'
    # function is hashed with MD5 and reduced to a number in the range
    # 0 .. slices-1. Without a limit the slice name itself is returned.
    # Confesses when the path or the 'slice_name' function is missing.
    my $self = shift;

    # Test definedness and length rather than truth, so a path of "0" is
    # accepted.
    my $path = shift;
    confess "need path" unless defined $path && length $path;

    my $slice_name = $self->{'slice_name'};
    confess "need slice_name function" unless ref $slice_name;

    if ( $self->{'slices'} ) {
        # first, pass path through slice_name function
        my $name = $slice_name->($path);

        # md5_hex() croaks on characters above 255. Encode only in that
        # case, so names that hashed successfully before keep their slice.
        utf8::encode($name) if defined $name && $name =~ /[^\x00-\xFF]/;

        # then calculate MD5 hash and take first 8 hex chars as a number
        # FIXME how random is this?
        my $number = hex( substr( md5_hex($name), 0, 8 ) );

        return $number % $self->{'slices'};
    }

    # No slice limit. An explicit 'split' function is still honoured for
    # backward compatibility; otherwise the slice name is the slice.
    my $split = ref $self->{'split'} ? $self->{'split'} : $slice_name;
    return $split->($path);
}
233 |
|
234 |
|
235 |
|
236 |
1; |
237 |
__END__ |
238 |
|
239 |
|
240 |
=head2 Searching |
241 |
|
242 |
Searching is still conducted using L<SWISH::API>, but you have to glob |
243 |
index names. |
244 |
|
245 |
use SWISH::API; |
246 |
|
247 |
my $swish = SWISH::API->new( glob('index.swish-e/*') ); |
248 |
|
249 |
You can also alternatively create a merged index (using the C<merge> option) and
not change your source code at all.
251 |
|
252 |
That would also benefit performance, but it increases indexing time |
253 |
because merged indexes must be re-created on each indexing run. |
254 |
|
255 |
=head2 EXPORT |
256 |
|
257 |
None by default. |
258 |
|
259 |
|
260 |
|
261 |
=head1 SEE ALSO |
262 |
|
263 |
L<SWISH::API>, |
264 |
L<http://www.swish-e.org/> |
265 |
|
266 |
=head1 AUTHOR |
267 |
|
268 |
Dobrica Pavlinusic, E<lt>dpavlin@rot13.orgE<gt> |
269 |
|
270 |
=head1 COPYRIGHT AND LICENSE |
271 |
|
272 |
Copyright (C) 2004 by Dobrica Pavlinusic |
273 |
|
274 |
This library is free software; you can redistribute it and/or modify |
275 |
it under the same terms as Perl itself, either Perl version 5.8.4 or, |
276 |
at your option, any later version of Perl 5 you may have available. |
277 |
|
278 |
|
279 |
=cut |