/[SWISH-Split]/trunk/Split.pm
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Annotation of /trunk/Split.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 3 - (hide annotations)
Sun Aug 8 10:53:04 2004 UTC (19 years, 7 months ago) by dpavlin
File size: 5721 byte(s)
one more planned call: find_paths

1 dpavlin 1 package SWISH::Split;
2    
3     use 5.008;
4     use strict;
5     use warnings;
6    
7     our $VERSION = '0.00';
8    
9     use SWISH::API;
10     use Text::Iconv;
11     use File::Temp qw/ :mktemp /;
12     use Carp;
13     use Digest::MD5 qw(md5_hex);
14     use Memoize;
15    
16     use Data::Dumper;
17    
18     =head1 NAME
19    
20     SWISH::Split - Perl interface to split index variant of Swish-e
21    
22     =head1 SYNOPSIS
23    
24     use SWISH::Split;
25    
26    
27     =head1 DESCRIPTION
28    
29     This is an alternative interface for indexing data with swish-e. It's designed
30     to split indexes over multiple files to allow updates of records in index
31     by reindexing just changed parts.
32    
33     Data is stored in index using interface which is somewhat similar to
34     L<Plucene::Simple>. This could make your migration (or supporting two index
35     engines) easier.
36    
37     In the background, it will fork swish-e binaries (one for each index slice)
38     and produce UTF-8 encoded XML files. So, if your input charset isn't
39     C<ISO-8859-1> you will have to specify it.
40    
41     =head1 Methods used for indexing
42    
43     =head2 open
44    
45     Create new object for index.
46    
47     my $i = SWISH::Split->open(
48     index => '/path/to/index',
49     slice_name => \&slice_on_path,
50     slices => 30,
51     merge => 1,
52     codepage => 'ISO-8859-2'
53     );
54    
55     # split index on first component of path
56     sub slice_on_path {
57     return (split(/\//,$_[0]))[0];
58     }
59    
60    
61     C<slices> is maximum number of index slices. See L<"in_slice"> for
62     more explanation.
63    
64     =cut
65    
# File-scoped converter from the input charset to UTF-8. It starts out as
# ISO-8859-1 -> UTF-8; open() replaces it when a codepage option is given.
66     my $iso2utf = Text::Iconv->new('ISO-8859-1','UTF-8');
67    
# Constructor: create a new SWISH::Split object from named options.
#
# Options may be given as a flat key/value list or as a single hash
# reference:
#   index      - existing directory which holds the index (required)
#   slice_name - reference to function called with a path (required)
#   slices     - maximum number of index slices (required)
#   codepage   - input charset; when set, the file-level converter is
#                re-created to convert from it to UTF-8
#
# Croaks when a required option is missing or when index is not an
# existing directory. Returns the blessed object.
sub open {
    my $class = shift;

    # accept both open(key => value, ...) and open({ key => value, ... })
    my $self = (@_ == 1 && ref($_[0]) eq 'HASH') ? { %{ $_[0] } } : { @_ };
    bless($self, $class);

    croak "need slice_name coderef" unless ref $self->{'slice_name'};
    croak "need slices" unless $self->{'slices'};

    croak "need index" unless $self->{'index'};
    croak "index '",$self->{'index'},"' doesn't exist" unless -e $self->{'index'};
    croak "index '",$self->{'index'},"' is not directory" unless -d $self->{'index'};

    $iso2utf = Text::Iconv->new($self->{'codepage'},'UTF-8') if ($self->{'codepage'});

    # memoize() installs a new wrapper every time it is called, so make
    # sure in_slice is wrapped only once no matter how many indexes are
    # opened
    our $in_slice_memoized;
    memoize('in_slice') unless $in_slice_memoized++;

    return $self;
}
89    
90     =head2 add
91    
92     Add document to index.
93    
94     $i->add($swishpath, {
95     headline => 'foobar result',
96     property => 'data',
97     })
98    
99     =cut
100    
# Add document to index (placeholder: nothing is stored yet).
#
# Takes the swishpath of the document and a hash reference with its data.
# Returns nothing when either argument is missing or false, 1 otherwise.
sub add {
    my ($self, $swishpath, $data) = @_;

    return unless $swishpath;
    return unless $data;

    return 1;
}
109    
110     =head2 delete
111    
112 dpavlin 3 Delete documents from index.
113 dpavlin 1
114 dpavlin 3 $i->delete(@swishpath);
115 dpavlin 1
116     =cut
117    
# Delete documents from index (placeholder: nothing is removed yet).
#
# Takes a list of swishpaths. Returns nothing when the list is empty,
# 42 otherwise.
sub delete {
    my $self = shift;

    # Assign first, then test the result with low-precedence "or":
    # "@_ || return" would evaluate @_ in scalar context and store the
    # number of paths instead of the paths themselves.
    my @paths = @_ or return;

    return 42;
}
125    
126    
127     =head2 close
128    
129     Close index file and finish indexing.
130    
131     $i->close;
132    
133     This is most time-consuming operation. When it's called, it will re-index
134     all entries which haven't changed in all slices.
135    
136     =cut
137    
# Close index and finish indexing (placeholder: does no work yet).
# Always returns 1.
sub close {
    my ($self) = @_;

    return 1;
}
143    
144    
145    
146     =head1 Reporting methods
147    
148     These methods return statistics about your index.
149    
150     =head2 swishpaths
151    
152     Return array of C<swishpath>s in index.
153    
154     my @p = $i->swishpaths;
155    
156     =cut
157    
# Return list of swishpaths in index.
#
# Not implemented yet: always returns an empty list (0 in scalar context).
sub swishpaths {
    my $self = shift;

    # Return explicitly; without it the sub would return the value of the
    # assignment above, i.e. the object itself, as if it were a path.
    my @paths = ();
    return @paths;
}
161    
162     =head2 swishpaths_updated
163    
164     Return array with updated C<swishpath>s.
165    
166     my @d = $i->swishpaths_updated;
167    
168     =cut
169    
# Return list of updated swishpaths.
#
# Not implemented yet: always returns an empty list (0 in scalar context).
sub swishpaths_updated {
    my $self = shift;

    # Return explicitly; without it the sub would return the value of the
    # assignment above, i.e. the object itself, as if it were a path.
    my @paths = ();
    return @paths;
}
173    
174    
175     =head2 swishpaths_deleted
176    
177     Return array with deleted C<swishpath>s.
178    
179     my $n = $i->swishpaths_deleted;
180    
181     =cut
182    
# Return list of deleted swishpaths.
#
# Not implemented yet: always returns an empty list (0 in scalar context).
sub swishpaths_deleted {
    my $self = shift;

    # Return explicitly; without it the sub would return the value of the
    # assignment above, i.e. the object itself, as if it were a path.
    my @paths = ();
    return @paths;
}
186    
187    
188     =head2 slices
189    
190     Return array with all slice names.
191    
192     my @s = $i->slices;
193    
194     =cut
195    
# Return list of all slice names.
#
# Not implemented yet: always returns an empty list (0 in scalar context).
sub slices {
    my $self = shift;

    # Return explicitly; without it the sub would return the value of the
    # assignment above, i.e. the object itself, as if it were a slice name.
    my @names = ();
    return @names;
}
199    
200     =head1 Helper methods
201    
202     These methods are used internally, but they might be useful.
203    
204     =head2 in_slice
205    
206     Takes a path and returns the slice to which this path belongs.
207    
208     my $s = $i->in_slice('path/to/document/in/index');
209    
210     If there is a C<slices> parameter to L<"open">, it will use
211     MD5 hash to spread documents across slices. That will produce random
212     distribution of your documents in slices, which might or might not be best
213     for your data. If you have to re-index large number of slices on each
214     run, think about creating your own C<slice_name> function and distributing
215     documents manually across slices.
216    
217     This function is C<Memoize>ed for performance reasons.
218    
219     =cut
220    
# Return slice to which given path belongs.
#
# The path is first passed through the slice_name function. When the
# object has a slices limit, the MD5 hash of that name is reduced to a
# number in the range 0 .. slices-1; otherwise the name itself is
# returned.
#
# Confesses when path is missing (or false) or when the object has no
# slice_name reference.
sub in_slice {
    my $self = shift;

    my $path = shift || confess "need path";

    my $slice_name = $self->{'slice_name'};
    confess "need slice_name function" unless ref($slice_name);

    unless ($self->{'slices'}) {
        # No limit on number of slices: the name is the slice. A 'split'
        # function is still honoured when the caller supplied one, but
        # the validated slice_name is used instead of calling an
        # undefined reference.
        my $namer = $self->{'split'} || $slice_name;
        return $namer->($path);
    }

    # first, pass path through slice_name function (in scalar context)
    my $name = $slice_name->($path);

    # then calculate MD5 hash and take first 8 hex chars to produce number
    # FIXME how random is this?
    my $number = hex(substr(md5_hex($name), 0, 8));

    return ($number % $self->{'slices'});
}
244    
245 dpavlin 3 =head2 find_paths
246 dpavlin 1
247 dpavlin 3 Return array of C<swishpath>s for given C<swish-e> query.
248 dpavlin 1
249 dpavlin 3 my @p = $i->find_paths("headline=test*");
250    
251     Useful for combining with L<"delete"> to delete documents
252     which haven't changed for a while (so, expired).
253    
254     =cut
255    
# Return list of swishpaths matching given swish-e query.
#
# Not implemented yet: returns nothing when the query is missing (or
# false) and an empty list (0 in scalar context) otherwise.
sub find_paths {
    my $self = shift;

    my $s = shift || return;

    # Return explicitly; without it the sub would return the value of the
    # assignment above, i.e. the query string, as if it were a result.
    my @paths = ();
    return @paths;
}
261    
262    
263    
264 dpavlin 1 1;
265     __END__
266    
267    
268     =head2 Searching
269    
270     Searching is still conducted using L<SWISH::API>, but you have to glob
271     index names.
272    
273     use SWISH::API;
274    
275     my $swish = SWISH::API->new( glob('index.swish-e/*') );
276    
277     You can also alternatively create a merged index (using C<merge> option) and
278     not change your source code at all.
279    
280     That would also benefit performance, but it increases indexing time
281     because merged indexes must be re-created on each indexing run.
282    
283     =head2 EXPORT
284    
285     None by default.
286    
287    
288    
289     =head1 SEE ALSO
290    
291     L<SWISH::API>,
292     L<http://www.swish-e.org/>
293    
294     =head1 AUTHOR
295    
296     Dobrica Pavlinusic, E<lt>dpavlin@rot13.orgE<gt>
297    
298     =head1 COPYRIGHT AND LICENSE
299    
300     Copyright (C) 2004 by Dobrica Pavlinusic
301    
302     This library is free software; you can redistribute it and/or modify
303     it under the same terms as Perl itself, either Perl version 5.8.4 or,
304     at your option, any later version of Perl 5 you may have available.
305    
306    
307     =cut

  ViewVC Help
Powered by ViewVC 1.1.26