/[SWISH-Split]/trunk/Split.pm
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Contents of /trunk/Split.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1 - (show annotations)
Sun Aug 8 10:09:55 2004 UTC (19 years, 7 months ago) by dpavlin
File size: 5283 byte(s)
initial import of SWISH::Split. Lot of documentation, less code.

1 package SWISH::Split;
2
3 use 5.008;
4 use strict;
5 use warnings;
6
7 our $VERSION = '0.00';
8
9 use SWISH::API;
10 use Text::Iconv;
11 use File::Temp qw/ :mktemp /;
12 use Carp;
13 use Digest::MD5 qw(md5_hex);
14 use Memoize;
15
16 use Data::Dumper;
17
18 =head1 NAME
19
20 SWISH::Split - Perl interface to split index variant of Swish-e
21
22 =head1 SYNOPSIS
23
24 use SWISH::Split;
25
26
27 =head1 DESCRIPTION
28
29 This is alternative interface for indexing data with swish-e. It's designed
30 to split indexes over multiple files to allow updates of records in index
31 by reindexing just changed parts.
32
33 Data is stored in index using an interface which is somewhat similar to
34 L<Plucene::Simple>. This could make your migration (or supporting two index
35 engines) easier.
36
37 In the background, it will fork swish-e binaries (one for each index slice)
38 and produce UTF-8 encoded XML files. So, if your input charset isn't
39 C<ISO-8859-1> you will have to specify it.
40
41 =head1 Methods used for indexing
42
43 =head2 open
44
45 Create new object for index.
46
47 my $i = SWISH::Split->open({
48 index => '/path/to/index',
49 slice_name => \&slice_on_path,
50 slices => 30,
51 merge => 1,
52 codepage => 'ISO-8859-2'
53 });
54
55 # split index on first component of path
56 sub slice_on_path {
57 return (split(/\//,$_[0]))[0];
58 }
59
60
61 C<slices> is maximum number of index slices. See L<"in_slice"> for
62 more explanation.
63
64 =cut
65
# Shared converter from the input codepage to UTF-8. It starts out expecting
# ISO-8859-1 input and is replaced by open() when a 'codepage' option is given.
my $iso2utf = Text::Iconv->new('ISO-8859-1','UTF-8');

# Set once in_slice() has been wrapped by Memoize, so that opening several
# index objects does not wrap the function again and again.
my $in_slice_memoized = 0;

# open - constructor: validate the options and return a blessed index object.
#
# Options may be passed either as a single hash reference (the form shown in
# the POD) or as a flat key/value list:
#   slice_name - code reference, required
#   slices     - number of slices, required (must be true)
#   index      - path of an existing directory, required
#   codepage   - optional input charset; when set, the shared converter is
#                rebuilt to convert from it to UTF-8
#
# Croaks if a required option is missing or invalid.
sub open {
    my $class = shift;

    # Copy the caller's options so that the object never aliases their hash.
    my $self = (@_ == 1 && ref($_[0]) eq 'HASH') ? { %{ $_[0] } } : { @_ };
    bless($self, $class);

    croak "need slice_name coderef" unless ref($self->{'slice_name'}) eq 'CODE';
    croak "need slices" unless $self->{'slices'};

    croak "need index" unless $self->{'index'};
    croak "index '",$self->{'index'},"' doesn't exist" unless -e $self->{'index'};
    croak "index '",$self->{'index'},"' is not directory" unless -d $self->{'index'};

    $iso2utf = Text::Iconv->new($self->{'codepage'},'UTF-8') if ($self->{'codepage'});

    # memoize() replaces the named function with a caching wrapper; doing it
    # on every open() would stack one wrapper on top of another.
    if (!$in_slice_memoized) {
        memoize('in_slice');
        $in_slice_memoized = 1;
    }

    return $self;
}
89
90 =head2 add
91
92 Add document to index.
93
94 $i->add($swishpath, {
95 headline => 'foobar result',
96 property => 'data',
97 })
98
99 =cut
100
# add - placeholder, no indexing logic yet.
# Unpacks the invocant and hands it back unchanged.
sub add {
    my ($self) = @_;
    return $self;
}
104
105 =head2 delete
106
107 Delete document from index.
108
109 $i->delete($swishpath);
110
111 =cut
112
# delete - placeholder, no deletion logic yet.
# Unpacks the invocant and hands it back unchanged.
sub delete {
    my ($self) = @_;
    return $self;
}
116
117
118 =head2 close
119
120 Close index file and finish indexing.
121
122 $i->close;
123
124 This is most time-consuming operation. When it's called, it will re-index
125 all entries which haven't changed in all slices.
126
127 =cut
128
# close - placeholder, no finishing/re-indexing logic yet.
# Unpacks the invocant and hands it back unchanged.
sub close {
    my ($self) = @_;
    return $self;
}
132
133
134
135 =head1 Reporting methods
136
137 These methods return statistics about your index.
138
139 =head2 swishpaths
140
141 Return array of C<swishpath>s in index.
142
143 my @p = $i->swishpaths;
144
145 =cut
146
# swishpaths - placeholder, no reporting logic yet.
# Unpacks the invocant and hands it back unchanged.
sub swishpaths {
    my ($self) = @_;
    return $self;
}
150
151 =head2 swishpaths_updated
152
153 Return array with updated C<swishpath>s.
154
155 my @d = $i->swishpaths_updated;
156
157 =cut
158
# swishpaths_updated - placeholder, no reporting logic yet.
# Unpacks the invocant and hands it back unchanged.
sub swishpaths_updated {
    my ($self) = @_;
    return $self;
}
162
163
164 =head2 swishpaths_deleted
165
166 Return array with deleted C<swishpath>s.
167
168 my $n = $i->swishpaths_deleted;
169
170 =cut
171
# swishpaths_deleted - placeholder, no reporting logic yet.
# Unpacks the invocant and hands it back unchanged.
sub swishpaths_deleted {
    my ($self) = @_;
    return $self;
}
175
176
177 =head2 slices
178
179 Return array with all slice names.
180
181 my @s = $i->slices;
182
183 =cut
184
# slices - placeholder, no reporting logic yet.
# Unpacks the invocant and hands it back unchanged.
sub slices {
    my ($self) = @_;
    return $self;
}
188
189 =head1 Helper methods
190
191 These methods are used internally, but they might be useful.
192
193 =head2 in_slice
194
195 Takes a path and returns the slice to which this path belongs.
196
197 my $s = $i->in_slice('path/to/document/in/index');
198
199 If the C<slices> parameter was given to L<"open">, it will use an
200 MD5 hash to spread documents across slices. That will produce a random
201 distribution of your documents in slices, which might or might not be best
202 for your data. If you have to re-index a large number of slices on each
203 run, think about creating your own C<slice_name> function and distributing
204 documents manually across slices.
205
206 This function is C<Memoize>ed for performance reasons.
207
208 =cut
209
# in_slice - return the slice to which a path belongs.
#
# Arguments:
#   $self - object (hash) holding 'slice_name' and optionally 'slices'
#   $path - non-empty path of the document
#
# When 'slices' is true, the path is passed through the 'slice_name' callback,
# the result is hashed with MD5, the first 8 hex digits are turned into a
# number and that number modulo 'slices' is returned. Otherwise the callback's
# result is returned as is.
#
# Confesses when the path is missing/empty or 'slice_name' is not a reference.
sub in_slice {
    my $self = shift;
    my $path = shift;

    # Test for definedness and length, not truth, so that the path '0' is
    # accepted.
    confess "need path" unless defined $path && length $path;

    confess "need slice_name function" unless ref($self->{'slice_name'});

    if ($self->{'slices'}) {
        # first, pass path through slice_name function
        my $name = $self->{'slice_name'}->($path);
        # then calculate MD5 hash and take its first 8 hex chars as a number
        # FIXME how random is this?
        my $number = hex(substr(md5_hex($name), 0, 8));

        return $number % $self->{'slices'};
    }

    # No slice count: the callback's answer is the slice. A code reference
    # stored under the older 'split' key is still honoured for compatibility;
    # otherwise the validated 'slice_name' callback is used.
    my $namer = ref($self->{'split'}) eq 'CODE'
        ? $self->{'split'}
        : $self->{'slice_name'};

    return $namer->($path);
}
233
234
235
236 1;
237 __END__
238
239
240 =head2 Searching
241
242 Searching is still conducted using L<SWISH::API>, but you have to glob
243 index names.
244
245 use SWISH::API;
246
247 my $swish = SWISH::API->new( glob('index.swish-e/*') );
248
249 You can also alternatively create a merged index (using the C<merge> option) and
250 not change your source code at all.
251
252 That would also benefit performance, but it increases indexing time
253 because merged indexes must be re-created on each indexing run.
254
255 =head2 EXPORT
256
257 None by default.
258
259
260
261 =head1 SEE ALSO
262
263 L<SWISH::API>,
264 L<http://www.swish-e.org/>
265
266 =head1 AUTHOR
267
268 Dobrica Pavlinusic, E<lt>dpavlin@rot13.orgE<gt>
269
270 =head1 COPYRIGHT AND LICENSE
271
272 Copyright (C) 2004 by Dobrica Pavlinusic
273
274 This library is free software; you can redistribute it and/or modify
275 it under the same terms as Perl itself, either Perl version 5.8.4 or,
276 at your option, any later version of Perl 5 you may have available.
277
278
279 =cut

  ViewVC Help
Powered by ViewVC 1.1.26