/[SWISH-Split]/trunk/Split.pm
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Contents of /trunk/Split.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 3 - (show annotations)
Sun Aug 8 10:53:04 2004 UTC (17 years, 3 months ago) by dpavlin
File size: 5721 byte(s)
one more planned call: find_paths

1 package SWISH::Split;
2
3 use 5.008;
4 use strict;
5 use warnings;
6
7 our $VERSION = '0.00';
8
9 use SWISH::API;
10 use Text::Iconv;
11 use File::Temp qw/ :mktemp /;
12 use Carp;
13 use Digest::MD5 qw(md5_hex);
14 use Memoize;
15
16 use Data::Dumper;
17
18 =head1 NAME
19
20 SWISH::Split - Perl interface to split index variant of Swish-e
21
22 =head1 SYNOPSIS
23
24 use SWISH::Split;
25
26
27 =head1 DESCRIPTION
28
29 This is an alternative interface for indexing data with swish-e. It's designed
30 to split indexes over multiple files to allow updates of records in index
31 by reindexing just changed parts.
32
33 Data is stored in index using an interface which is somewhat similar to
34 L<Plucene::Simple>. This could make your migration (or supporting two index
35 engines) easier.
36
37 In the background, it will fork swish-e binaries (one for each index slice)
38 and produce UTF-8 encoded XML files. So, if your input charset isn't
39 C<ISO-8859-1> you will have to specify it.
40
41 =head1 Methods used for indexing
42
43 =head2 open
44
45 Create new object for index.
46
47 my $i = SWISH::Split->open(
48 index => '/path/to/index',
49 slice_name => \&slice_on_path,
50 slices => 30,
51 merge => 1,
52 codepage => 'ISO-8859-2'
53 );
54
55 # split index on first component of path
56 sub slice_on_path {
57 return (split(/\//, $_[0]))[0];
58 }
59
60
61 C<slices> is maximum number of index slices. See L<"in_slice"> for
62 more explanation.
63
64 =cut
65
66 my $iso2utf = Text::Iconv->new('ISO-8859-1','UTF-8');
67
{
    # Set once in_slice() has been wrapped by Memoize, so that repeated
    # open() calls do not stack another caching layer on top of it.
    my $in_slice_memoized;

    # Constructor: build and validate a SWISH::Split index object.
    #
    # Options may be passed either as a flat key/value list or as a single
    # hash reference (which is copied, never blessed in place):
    #   index      - path of the index directory; must already exist
    #   slice_name - code reference, called by in_slice() with a path
    #   slices     - number of slices; must be a true value
    #   codepage   - optional input charset; when given, the module-wide
    #                $iso2utf converter is replaced with codepage -> UTF-8
    # Any other keys are stored on the object untouched.
    #
    # Returns the blessed object. Croaks when a required option is missing
    # or when the index path does not exist or is not a directory.
    sub open {
        my $class = shift;

        my %args = (@_ == 1 && ref($_[0]) eq 'HASH') ? %{ $_[0] } : @_;
        my $self = bless { %args }, $class;

        croak "need slice_name coderef"
            unless ref($self->{'slice_name'}) eq 'CODE';
        croak "need slices" unless $self->{'slices'};

        croak "need index" unless $self->{'index'};
        croak "index '$self->{index}' doesn't exist"
            unless -e $self->{'index'};
        croak "index '$self->{index}' is not directory"
            unless -d $self->{'index'};

        # NOTE: $iso2utf is a single file-level converter, so the codepage
        # of the most recently opened object applies to every object.
        $iso2utf = Text::Iconv->new($self->{'codepage'}, 'UTF-8')
            if $self->{'codepage'};

        unless ($in_slice_memoized) {
            memoize('in_slice');
            $in_slice_memoized = 1;
        }

        return $self;
    }
}
89
90 =head2 add
91
92 Add document to index.
93
94 $i->add($swishpath, {
95 headline => 'foobar result',
96 property => 'data',
97 })
98
99 =cut
100
# Add a document to the index (stub: nothing is stored yet).
#
#   $swishpath - path identifying the document
#   $data      - document fields
#
# Returns 1 when both arguments are true values; otherwise a bare return
# (undef in scalar context, empty list in list context).
sub add {
    my ($self, $swishpath, $data) = @_;

    return unless $swishpath;
    return unless $data;

    return 1;
}
109
110 =head2 delete
111
112 Delete documents from index.
113
114 $i->delete(@swishpath);
115
116 =cut
117
# Delete documents from the index (stub: nothing is removed yet).
#
# Takes the list of swishpaths to delete. The paths are unpacked in list
# context; the earlier "@_ || return" form evaluated @_ as a scalar and
# therefore stored only the argument count.
#
# Returns 42 (looks like a placeholder value) when at least one path is
# given; otherwise a bare return (undef / empty list).
sub delete {
    my ($self, @paths) = @_;

    return unless @paths;

    # TODO: remove @paths from their index slices.
    return 42;
}
125
126
127 =head2 close
128
129 Close index file and finish indexing.
130
131 $i->close;
132
133 This is most time-consuming operation. When it's called, it will re-index
134 all entries which haven't changed in all slices.
135
136 =cut
137
# Close the index and finish indexing (stub: no work is done yet).
#
# Always returns 1.
sub close {
    my ($self) = @_;

    return 1;
}
143
144
145
146 =head1 Reporting methods
147
148 These methods return statistics about your index.
149
150 =head2 swishpaths
151
152 Return array of C<swishpath>s in index.
153
154 my @p = $i->swishpaths;
155
156 =cut
157
# Return the swishpaths in the index.
#
# Not implemented yet: always returns an empty array (empty list in list
# context, 0 in scalar context). The explicit return stops the invocant
# from leaking out as the sub's implicit last-expression value.
sub swishpaths {
    my ($self) = @_;

    my @swishpaths;    # TODO: collect paths from the index slices
    return @swishpaths;
}
161
162 =head2 swishpaths_updated
163
164 Return array with updated C<swishpath>s.
165
166 my @d = $i->swishpaths_updated;
167
168 =cut
169
# Return the swishpaths that were updated.
#
# Not implemented yet: always returns an empty array (empty list in list
# context, 0 in scalar context). The explicit return stops the invocant
# from leaking out as the sub's implicit last-expression value.
sub swishpaths_updated {
    my ($self) = @_;

    my @updated;    # TODO: track updated paths
    return @updated;
}
173
174
175 =head2 swishpaths_deleted
176
177 Return array with deleted C<swishpath>s.
178
179 my $n = $i->swishpaths_deleted;
180
181 =cut
182
# Return the swishpaths that were deleted.
#
# Not implemented yet: always returns an empty array, so list context
# yields no paths and scalar context (as in the POD example) yields 0.
# The explicit return stops the invocant from leaking out as the sub's
# implicit last-expression value.
sub swishpaths_deleted {
    my ($self) = @_;

    my @deleted;    # TODO: track deleted paths
    return @deleted;
}
186
187
188 =head2 slices
189
190 Return array with all slice names.
191
192 my @s = $i->slices;
193
194 =cut
195
# Return the names of all slices.
#
# Not implemented yet: always returns an empty array (empty list in list
# context, 0 in scalar context). The explicit return stops the invocant
# from leaking out as the sub's implicit last-expression value.
sub slices {
    my ($self) = @_;

    my @slice_names;    # TODO: enumerate the slices of the index
    return @slice_names;
}
199
200 =head1 Helper methods
201
202 These methods are used internally, but they might be useful.
203
204 =head2 in_slice
205
206 Takes path and return slice in which this path belongs.
207
208 my $s = $i->in_slice('path/to/document/in/index');
209
210 If the C<slices> parameter was given to L<"open">, it will use an
211 MD5 hash to spread documents across slices. That will produce random
212 distribution of your documents in slices, which might or might not be best
213 for your data. If you have to re-index large number of slices on each
214 run, think about creating your own C<slice> function and distributing
215 documents manually across slices.
216
217 This function is C<Memoize>ed for performance reasons.
218
219 =cut
220
# Map a document path to the slice it belongs to.
#
#   $path - document path; must be defined and non-empty
#
# The path is first passed through the object's slice_name callback.
# When the object has a true "slices" value, the callback result is
# MD5-hashed, the first 8 hex digits are read as a number, and that
# number modulo "slices" is returned. Without "slices", the callback
# result is returned as-is.
#
# Confesses when the path is missing or slice_name is not a reference.
# Writes nothing to the selected output handle.
sub in_slice {
    my ($self, $path) = @_;

    # Test defined/length rather than truth so a path of '0' is accepted.
    confess "need path" unless defined $path && length $path;

    my $slice_name = $self->{'slice_name'};
    confess "need slice_name function" unless ref $slice_name;

    if ($self->{'slices'}) {
        # Call the callback in scalar context and hash its result.
        my $name   = $slice_name->($path);
        my $digest = md5_hex($name);

        # FIXME how random is this?
        my $number = hex(substr($digest, 0, 8));

        return $number % $self->{'slices'};
    }

    # Older objects may carry the callback under the undocumented "split"
    # key; honour it when present, otherwise use the validated slice_name.
    my $namer = ref($self->{'split'}) ? $self->{'split'} : $slice_name;
    return $namer->($path);
}
244
245 =head2 find_paths
246
247 Return array of C<swishpath>s for given C<swish-e> query.
248
249 my @p = $i->find_paths("headline=test*");
250
251 Useful for combining with L<"delete"> to delete documents
252 which haven't changed in a while (so, expired).
253
254 =cut
255
# Return the swishpaths matching a swish-e query.
#
#   $query - swish-e query string
#
# Not implemented yet: with a query it returns an empty array (empty list
# in list context, 0 in scalar context); without one it does a bare
# return. The explicit returns stop the query string itself from leaking
# out as the sub's implicit last-expression value, where a caller could
# mistake it for a matching path.
sub find_paths {
    my ($self, $query) = @_;

    return unless defined $query && length $query;

    my @paths;    # TODO: run $query against the index slices
    return @paths;
}
261
262
263
264 1;
265 __END__
266
267
268 =head2 Searching
269
270 Searching is still conducted using L<SWISH::API>, but you have to glob
271 index names.
272
273 use SWISH::API;
274
275 my $swish = SWISH::API->new( glob('index.swish-e/*') );
276
277 You can also alternatively create a merged index (using C<merge> option) and
278 not change your source code at all.
279
280 That would also benefit performance, but it increases indexing time
281 because merged indexes must be re-created on each indexing run.
282
283 =head2 EXPORT
284
285 None by default.
286
287
288
289 =head1 SEE ALSO
290
291 L<SWISH::API>,
292 L<http://www.swish-e.org/>
293
294 =head1 AUTHOR
295
296 Dobrica Pavlinusic, E<lt>dpavlin@rot13.orgE<gt>
297
298 =head1 COPYRIGHT AND LICENSE
299
300 Copyright (C) 2004 by Dobrica Pavlinusic
301
302 This library is free software; you can redistribute it and/or modify
303 it under the same terms as Perl itself, either Perl version 5.8.4 or,
304 at your option, any later version of Perl 5 you may have available.
305
306
307 =cut

  ViewVC Help
Powered by ViewVC 1.1.26