/[SWISH-Split]/trunk/Split.pm
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Annotation of /trunk/Split.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1 - (hide annotations)
Sun Aug 8 10:09:55 2004 UTC (18 years, 1 month ago) by dpavlin
File size: 5283 byte(s)
initial import of SWISH::Split. Lot of documentation, less code.

1 dpavlin 1 package SWISH::Split;
2    
3     use 5.008;
4     use strict;
5     use warnings;
6    
7     our $VERSION = '0.00';
8    
9     use SWISH::API;
10     use Text::Iconv;
11     use File::Temp qw/ :mktemp /;
12     use Carp;
13     use Digest::MD5 qw(md5_hex);
14     use Memoize;
15    
16     use Data::Dumper;
17    
18     =head1 NAME
19    
20     SWISH::Split - Perl interface to split index variant of Swish-e
21    
22     =head1 SYNOPSIS
23    
24     use SWISH::Split;
25    
26    
27     =head1 DESCRIPTION
28    
29     This is an alternative interface for indexing data with swish-e. It's designed
30     to split indexes over multiple files to allow updates of records in the index
31     by reindexing just the changed parts.
32    
33     Data is stored in the index using an interface which is somewhat similar to
34     L<Plucene::Simple>. This could make your migration (or supporting two index
35     engines) easier.
36    
37     In the background, it will fork swish-e binaries (one for each index slice)
38     and produce UTF-8 encoded XML files. So, if your input charset isn't
39     C<ISO-8859-1> you will have to specify it.
40    
41     =head1 Methods used for indexing
42    
43     =head2 open
44    
45     Create new object for index.
46    
47     my $i = SWISH::Split->open(
48     index => '/path/to/index',
49     slice_name => \&slice_on_path,
50     slices => 30,
51     merge => 1,
52     codepage => 'ISO-8859-2'
53     );
54    
55     # split index on first component of path
56     sub slice_on_path {
57     return (split(/\//, $_[0]))[0];
58     }
59    
60    
61     C<slices> is the maximum number of index slices. See L<"in_slice"> for
62     more explanation.
63    
64     =cut
65    
# Converter used to recode input text to UTF-8. It starts out assuming
# ISO-8859-1 input and is replaced by open() when a 'codepage' option is
# given. It is a single file-level variable, so the most recent open() call
# with a 'codepage' decides the converter for the whole process.
my $iso2utf = Text::Iconv->new('ISO-8859-1', 'UTF-8');

# True once in_slice() has been wrapped by Memoize, so that calling open()
# several times does not stack one memoization wrapper on top of another.
my $in_slice_memoized = 0;

# open - constructor; creates a new index object.
#
# Arguments (after the class name): either a flat key/value list or a single
# hash reference holding the same keys:
#   slice_name - reference to the function that names the slice for a path
#                (required)
#   slices     - number of slices (required, must be true)
#   index      - path to an existing directory (required)
#   codepage   - optional input charset; when set, the shared converter is
#                rebuilt to convert from it to UTF-8
# Any other keys are stored in the object unchanged.
#
# Returns the blessed object. Croaks when a required option is missing or
# when 'index' does not exist or is not a directory.
sub open {
    my $class = shift;

    # The POD synopsis passes a hash reference while older callers pass a
    # flat list; accept both.
    my %args = (@_ == 1 && ref($_[0]) eq 'HASH') ? %{ $_[0] } : @_;
    my $self = bless { %args }, $class;

    croak "need slice_name coderef" unless ref $self->{'slice_name'};
    croak "need slices" unless $self->{'slices'};

    croak "need index" unless $self->{'index'};
    croak "index '", $self->{'index'}, "' doesn't exist"
        unless -e $self->{'index'};
    croak "index '", $self->{'index'}, "' is not directory"
        unless -d $self->{'index'};

    $iso2utf = Text::Iconv->new($self->{'codepage'}, 'UTF-8')
        if $self->{'codepage'};

    # Wrap in_slice() only on the first call to open().
    memoize('in_slice') unless $in_slice_memoized++;

    return $self;
}
89    
90     =head2 add
91    
92     Add document to index.
93    
94     $i->add($swishpath, {
95     headline => 'foobar result',
96     property => 'data',
97     })
98    
99     =cut
100    
# add - placeholder for adding a document to the index.
#
# Not implemented yet: any arguments after the invocant are ignored and
# nothing is indexed. The call simply evaluates to the invocant.
sub add {
    my ($self) = @_;
    return $self;
}
104    
105     =head2 delete
106    
107     Delete document from index.
108    
109     $i->delete($swishpath);
110    
111     =cut
112    
# delete - placeholder for removing a document from the index.
#
# Not implemented yet: the path argument is ignored and nothing is removed.
# The call simply evaluates to the invocant.
sub delete {
    my ($self) = @_;
    return $self;
}
116    
117    
118     =head2 close
119    
120     Close index file and finish indexing.
121    
122     $i->close;
123    
124     This is the most time-consuming operation. When it's called, it will re-index
125     all entries which haven't changed in all slices.
126    
127     =cut
128    
# close - placeholder for closing the index and finishing indexing.
#
# Not implemented yet: no re-indexing happens here. The call simply
# evaluates to the invocant.
sub close {
    my ($self) = @_;
    return $self;
}
132    
133    
134    
135     =head1 Reporting methods
136    
137     These methods return statistics about your index.
138    
139     =head2 swishpaths
140    
141     Return array of C<swishpath>s in index.
142    
143     my @p = $i->swishpaths;
144    
145     =cut
146    
# swishpaths - placeholder for listing the swishpaths in the index.
#
# Not implemented yet: no list of paths is produced. The call simply
# evaluates to the invocant.
sub swishpaths {
    my ($self) = @_;
    return $self;
}
150    
151     =head2 swishpaths_updated
152    
153     Return array with updated C<swishpath>s.
154    
155     my @d = $i->swishpaths_updated;
156    
157     =cut
158    
# swishpaths_updated - placeholder for listing the updated swishpaths.
#
# Not implemented yet: no list of paths is produced. The call simply
# evaluates to the invocant.
sub swishpaths_updated {
    my ($self) = @_;
    return $self;
}
162    
163    
164     =head2 swishpaths_deleted
165    
166     Return array with deleted C<swishpath>s.
167    
168     my $n = $i->swishpaths_deleted;
169    
170     =cut
171    
# swishpaths_deleted - placeholder for listing the deleted swishpaths.
#
# Not implemented yet: no list of paths is produced. The call simply
# evaluates to the invocant.
sub swishpaths_deleted {
    my ($self) = @_;
    return $self;
}
175    
176    
177     =head2 slices
178    
179     Return array with all slice names.
180    
181     my @s = $i->slices;
182    
183     =cut
184    
# slices - placeholder for listing all slice names.
#
# Not implemented yet: no slice names are produced. The call simply
# evaluates to the invocant.
sub slices {
    my ($self) = @_;
    return $self;
}
188    
189     =head1 Helper methods
190    
191     These methods are used internally, but they might be useful.
192    
193     =head2 in_slice
194    
195     Takes a path and returns the slice to which this path belongs.
196    
197     my $s = $i->in_slice('path/to/document/in/index');
198    
199     If the C<slices> parameter was given to L<"open">, it will use an
200     MD5 hash to spread documents across slices. That will produce a random
201     distribution of your documents in slices, which might or might not be best
202     for your data. If you have to re-index a large number of slices on each
203     run, think about creating your own C<slice> function and distributing
204     documents manually across slices.
205    
206     This function is C<Memoize>ed for performance reasons.
207    
208     =cut
209    
# in_slice - work out which slice a path belongs to.
#
# Arguments:
#   $path - path of the document; must be defined and non-empty
#
# Returns:
#   * when the object has a true 'slices' value: an integer in the range
#     0 .. slices-1, derived from the first 8 hex digits of the MD5 of
#     whatever the 'slice_name' function returns for $path;
#   * otherwise: the result of the 'split' function if one was supplied,
#     else the result of the 'slice_name' function, returned as is.
#
# Confesses when $path is missing or 'slice_name' is not a reference.
sub in_slice {
    my $self = shift;
    my $path = shift;

    # Check definedness and length, not truth, so that a path of "0" is
    # accepted.
    confess "need path" unless defined $path && length $path;
    confess "need slice_name function" unless ref $self->{'slice_name'};

    if ($self->{'slices'}) {
        # Name the slice first, then hash the name so that documents are
        # spread over a fixed number of slices.
        my $name   = $self->{'slice_name'}->($path);
        my $digest = md5_hex($name);

        # FIXME how random is this? Only the first 32 bits of the digest
        # are used to build the number.
        my $number = hex(substr($digest, 0, 8));

        return $number % $self->{'slices'};
    }

    # No slice count: use the caller's own distribution function. 'split'
    # is never validated by open(), so fall back to 'slice_name' (which was
    # checked above) instead of dying on an undefined code reference.
    my $chooser = ref($self->{'split'}) eq 'CODE'
        ? $self->{'split'}
        : $self->{'slice_name'};

    return $chooser->($path);
}
233    
234    
235    
236     1;
237     __END__
238    
239    
240     =head2 Searching
241    
242     Searching is still conducted using L<SWISH::API>, but you have to glob
243     index names.
244    
245     use SWISH::API;
246    
247     my $swish = SWISH::API->new( glob('index.swish-e/*') );
248    
249     You can also alternatively create a merged index (using the C<merge> option) and
250     not change your source code at all.
251    
252     That would also benefit performance, but it increases indexing time
253     because merged indexes must be re-created on each indexing run.
254    
255     =head2 EXPORT
256    
257     None by default.
258    
259    
260    
261     =head1 SEE ALSO
262    
263     L<SWISH::API>,
264     L<http://www.swish-e.org/>
265    
266     =head1 AUTHOR
267    
268     Dobrica Pavlinusic, E<lt>dpavlin@rot13.orgE<gt>
269    
270     =head1 COPYRIGHT AND LICENSE
271    
272     Copyright (C) 2004 by Dobrica Pavlinusic
273    
274     This library is free software; you can redistribute it and/or modify
275     it under the same terms as Perl itself, either Perl version 5.8.4 or,
276     at your option, any later version of Perl 5 you may have available.
277    
278    
279     =cut

  ViewVC Help
Powered by ViewVC 1.1.26