SWISH-Split/trunk/Split.pm

package SWISH::Split;

use 5.008;
use strict;
use warnings;

our $VERSION = '0.00';

use SWISH::API;
use Text::Iconv;
use File::Temp qw/ :mktemp /;
use Carp;
use Digest::MD5 qw(md5_hex);
use Memoize;

use Data::Dumper;

=head1 NAME

SWISH::Split - Perl interface to split index variant of Swish-e

=head1 SYNOPSIS

  use SWISH::Split;


=head1 DESCRIPTION

This is an alternative interface for indexing data with swish-e. It's designed
to split indexes over multiple files, so that records in the index can be
updated by re-indexing only the parts which have changed.

Data is stored in index using an interface which is somewhat similar to
L<Plucene::Simple>. This could make your migration (or supporting two index
engines) easier.

In the background, it will fork swish-e binaries (one for each index slice)
and produce UTF-8 encoded XML files. So, if your input charset isn't
C<ISO-8859-1> you will have to specify it.

=head1 Methods used for indexing

=head2 open

Create new object for index.

  my $i = SWISH::Split->open(
        index => '/path/to/index',
        slice_name => \&slice_on_path,
        slices => 30,
        merge => 1,
        codepage => 'ISO-8859-2'
  );

  # split index on first component of path
  sub slice_on_path {
        return (split(/\//, $_[0]))[0];
  }


C<slices> is the maximum number of index slices. See L<"in_slice"> for
more explanation.

=cut

# File-scoped character set converter targeting UTF-8. It starts out
# converting from ISO-8859-1 and is replaced by open() when a 'codepage'
# option is supplied. Because it is a single lexical for the whole file, it
# is shared by every object this module creates.
my $iso2utf = Text::Iconv->new('ISO-8859-1','UTF-8');

# Constructor: create a new SWISH::Split object for an existing index
# directory.
#
# Options may be given either as a flat key/value list or as a single hash
# reference:
#
#   index      - path to an existing directory holding the index (required)
#   slice_name - reference to a function mapping a path to a slice name
#                (required)
#   slices     - maximum number of slices (required, must be true)
#   codepage   - input character set; when given, the file-wide converter
#                to UTF-8 is replaced with one for this character set
#
# Any other options (e.g. merge) are stored in the object untouched.
#
# Returns the blessed object. Croaks when a required option is missing or
# when the index path does not exist or is not a directory.
{
        # Ensures in_slice() is wrapped by Memoize only once per process, no
        # matter how many objects are opened. Wrapping it again on every
        # call would stack one cache on top of another.
        my $in_slice_memoized;

        sub open {
                my $class = shift;

                # A lone hash reference is copied so that the caller's hash
                # is not blessed or modified; otherwise treat the arguments
                # as a key/value list.
                my $self = (@_ == 1 && ref($_[0]) eq 'HASH')
                        ? { %{ $_[0] } }
                        : { @_ };
                bless($self, $class);

                croak "need slice_name coderef" unless ref $self->{'slice_name'};
                croak "need slices" unless $self->{'slices'};

                croak "need index" unless $self->{'index'};
                croak "index '",$self->{'index'},"' doesn't exist" unless -e $self->{'index'};
                croak "index '",$self->{'index'},"' is not directory" unless -d $self->{'index'};

                # The converter is shared file-wide, so the most recent
                # open() with a codepage decides the input character set.
                $iso2utf = Text::Iconv->new($self->{'codepage'},'UTF-8') if ($self->{'codepage'});

                memoize('in_slice') unless $in_slice_memoized++;

                return $self;
        }
}

=head2 add

Add document to index.

  $i->add($swishpath, {
        headline => 'foobar result',
        property => 'data',
  })

=cut

# Add a document to the index (placeholder implementation).
#
#   $swishpath - path identifying the document
#   $data      - document properties
#
# Returns 1 when both arguments are true values; otherwise returns
# nothing (undef in scalar context, empty list in list context).
sub add {
        my ($self, $swishpath, $data) = @_;

        return unless $swishpath;
        return unless $data;

        return 1;
}

=head2 delete

Delete documents from index.

  $i->delete(@swishpath);

=cut

# Delete documents from the index (placeholder implementation).
#
# Takes a list of swishpaths. Returns nothing (undef / empty list) when no
# paths are given, otherwise the placeholder value 42.
sub delete {
        my $self = shift;

        # Copy the arguments in list context first. Writing this as
        # "my @paths = @_ || return" would evaluate @_ in scalar context and
        # store the number of arguments instead of the paths themselves.
        my @paths = @_;
        return unless @paths;

        return 42;
}


=head2 close

Close index file and finish indexing.

  $i->close;

This is the most time-consuming operation. When it's called, it will re-index
all entries which haven't changed in all slices.

=cut

# Close the index and finish indexing (placeholder implementation).
#
# Takes no arguments besides the object and always returns 1.
sub close {
        my ($self) = @_;

        return 1;
}


=head1 Reporting methods

These methods return statistics about your index.

=head2 swishpaths

Return array of C<swishpath>s in index.

  my @p = $i->swishpaths;

=cut

# Return the swishpaths stored in the index.
#
# TODO: not implemented yet. Returns an explicit empty result (empty list
# in list context, undef in scalar context) so that the object itself is
# not handed back as if it were a path, which is what falling off the end
# after "my $self = shift" would do.
sub swishpaths {
        my $self = shift;

        return;
}

=head2 swishpaths_updated

Return array with updated C<swishpath>s.

  my @d = $i->swishpaths_updated;

=cut

# Return the swishpaths which were updated.
#
# TODO: not implemented yet. Returns an explicit empty result (empty list
# in list context, undef in scalar context) so that the object itself is
# not handed back as if it were a path, which is what falling off the end
# after "my $self = shift" would do.
sub swishpaths_updated {
        my $self = shift;

        return;
}


=head2 swishpaths_deleted

Return array with deleted C<swishpath>s.

  my $n = $i->swishpaths_deleted;

=cut

# Return the swishpaths which were deleted.
#
# TODO: not implemented yet. Returns an explicit empty result (empty list
# in list context, undef in scalar context) so that the object itself is
# not handed back as if it were a path, which is what falling off the end
# after "my $self = shift" would do.
sub swishpaths_deleted {
        my $self = shift;

        return;
}


=head2 slices

Return array with all slice names.

  my @s = $i->slices;

=cut

# Return the names of all slices.
#
# TODO: not implemented yet. Returns an explicit empty result (empty list
# in list context, undef in scalar context) so that the object itself is
# not handed back as if it were a slice name, which is what falling off the
# end after "my $self = shift" would do.
sub slices {
        my $self = shift;

        return;
}

=head1 Helper methods

These methods are used internally, but they might be useful.

=head2 in_slice

Takes a path and returns the slice to which that path belongs.

  my $s = $i->in_slice('path/to/document/in/index');

If the C<slices> parameter was given to L<"open">, an MD5 hash of the slice
name is used to spread documents across slices. That will produce a random
distribution of your documents in slices, which might or might not be best
for your data. If you have to re-index a large number of slices on each
run, think about creating your own C<slice_name> function and distributing
documents manually across slices.

This function is C<Memoize>ed for performance reasons.

=cut

# Map a document path to the slice it belongs to.
#
#   $path - path of the document; must be defined and non-empty
#
# The path is first passed through the object's slice_name function. When
# the object has a true 'slices' value, the MD5 hex digest of that name is
# taken, its first 8 hex digits are read as a number and the result modulo
# 'slices' is returned, i.e. a number from 0 to slices-1. Without 'slices'
# the name produced by the callback is returned unchanged.
#
# Confesses (dies with a stack trace) when the path is missing or when the
# object has no slice_name reference.
sub in_slice {
        my $self = shift;
        my $path = shift;

        # Check for defined/non-empty rather than plain truth, so that a
        # path of "0" is accepted.
        confess "need path" unless defined $path && length $path;

        confess "need slice_name function" unless ref ($self->{'slice_name'});

        if ($self->{'slices'}) {
                # first, pass path through slice_name function
                my $name = $self->{'slice_name'}->($path);

                # then take the first 8 hex digits of the MD5 digest as a
                # number
                # FIXME how random is this?
                my $bucket = hex(substr(md5_hex($name), 0, 8));

                return ($bucket % $self->{'slices'});
        }

        # No slice limit: use the callback result directly. A 'split'
        # callback is still honoured when present; otherwise fall back to
        # the slice_name callback validated above instead of dying on an
        # undefined code reference.
        my $namer = $self->{'split'} || $self->{'slice_name'};
        return $namer->($path);
}

=head2 find_paths

Return array of C<swishpath>s for given C<swish-e> query.

  my @p = $i->find_paths("headline=test*");

Useful for combining with L<"delete"> to delete documents
which haven't changed in a while (so, expired).

=cut

# Return the swishpaths matching a swish-e query.
#
#   $query - swish-e query string, e.g. "headline=test*"
#
# TODO: searching is not implemented yet. Returns an explicit empty result
# (empty list in list context, undef in scalar context) in every case, so
# that the query string itself is not handed back as if it were a matching
# path, which is what falling off the end after the assignment would do.
sub find_paths {
        my $self = shift;

        my $query = shift;
        return unless $query;

        return;
}


1;
__END__


=head2 Searching

Searching is still conducted using L<SWISH::API>, but you have to glob
index names.

    use SWISH::API;

    my $swish = SWISH::API->new( glob('index.swish-e/*') );

You can also alternatively create a merged index (using the C<merge> option)
and not change your source code at all.

That would also benefit performance, but it increases indexing time
because merged indexes must be re-created on each indexing run.

=head2 EXPORT

None by default.


=head1 SEE ALSO

L<SWISH::API>,
L<http://www.swish-e.org/>

=head1 AUTHOR

Dobrica Pavlinusic, E<lt>dpavlin@rot13.orgE<gt>

=head1 COPYRIGHT AND LICENSE

Copyright (C) 2004 by Dobrica Pavlinusic

This library is free software; you can redistribute it and/or modify
it under the same terms as Perl itself, either Perl version 5.8.4 or,
at your option, any later version of Perl 5 you may have available.


=cut
1	package SWISH::Split;
2
3	use 5.008;
4	use strict;
5	use warnings;
6
7	our $VERSION = '0.00';
8
9	use SWISH::API;
10	use Text::Iconv;
11	use File::Temp qw/ :mktemp /;
12	use Carp;
13	use Digest::MD5 qw(md5_hex);
14	use Memoize;
15
16	use Data::Dumper;
17
18	=head1 NAME
19
20	SWISH::Split - Perl interface to split index variant of Swish-e
21
22	=head1 SYNOPSIS
23
24	use SWISH::Split;
25
26
27	=head1 DESCRIPTION
28
29	This is alternative interface for indexing data with swish-e. It's designed
30	to split indexes over multiple files to allow updates of records in index
31	by reindexing just changed parts.
32
33	Data is stored in index using intrface which is somewhat similar to
34	L<Plucene::Simple>. This could make your migration (or supporting two index
35	engines) easier.
36
37	In the background, it will fork swish-e binaries (one for each index slice)
38	and produce UTF-8 encoded XML files. So, if your imput charset isn't
39	C<ISO-8859-1> you will have to specify it.
40
41	=head1 Methods used for indexing
42
43	=head2 open
44
45	Create new object for index.
46
47	my $i = SWISH::Split->open({
48	index => '/path/to/index',
49	slice_name => \&slice_on_path,
50	slices => 30,
51	merge => 1,
52	codepage => 'ISO-8859-2'
53	);
54
55	# split index on first component of path
56	sub slice_on_path {
57	return shift split(/\//,$_[0]);
58	}
59
60
61	C<slices> is maximum number of index slices. See L<"in_slice"> for
62	more explanation.
63
64	=cut
65
66	my $iso2utf = Text::Iconv->new('ISO-8859-1','UTF-8');
67
68	sub open {
69	my $class = shift;
70	my $self = {@_};
71	bless($self, $class);
72
73	print Dumper($self->{'slice_name'});
74
75	croak "need slice_name coderef" unless ref $self->{'slice_name'};
76	croak "need slices" unless $self->{'slices'};
77
78	croak "need index" unless $self->{'index'};
79	croak "index '",$self->{'index'},"' doesn't exist" unless -e $self->{'index'};
80	croak "index '",$self->{'index'},"' is not directory" unless -d $self->{'index'};
81
82	$iso2utf = Text::Iconv->new($self->{'codepage'},'UTF-8') if ($self->{'codepage'});
83
84	memoize('in_slice');
85
86	$self ? return $self : return undef;
87
88	}
89
90	=head2 add
91
92	Add document to index.
93
94	$i->add($swishpath, {
95	headline => 'foobar result',
96	property => 'data',
97	})
98
99	=cut
100
101	sub add {
102	my $self = shift;
103
104	my $swishpath = shift \|\| return;
105	my $data = shift \|\| return;
106
107	return 1;
108	}
109
110	=head2 delete
111
112	Delete documents from index.
113
114	$i->delete(@swishpath);
115
116	=cut
117
118	sub delete {
119	my $self = shift;
120
121	my @paths = @_ \|\| return;
122
123	return 42;
124	}
125
126
127	=head2 close
128
129	Close index file and finish indexing.
130
131	$i->close;
132
133	This is most time-consuming operation. When it's called, it will re-index
134	all entries which haven't changed in all slices.
135
136	=cut
137
138	sub close {
139	my $self = shift;
140
141	return 1;
142	}
143
144
145
146	=head1 Reporting methods
147
148	This methods return statistics about your index.
149
150	=head2 swishpaths
151
152	Return array of C<swishpath>s in index.
153
154	my @p = $i->swishpaths;
155
156	=cut
157
158	sub swishpaths {
159	my $self = shift;
160	}
161
162	=head2 swishpaths_updated
163
164	Return array with updated C<swishpath>s.
165
166	my @d = $i->swishpaths_updated;
167
168	=cut
169
170	sub swishpaths_updated {
171	my $self = shift;
172	}
173
174
175	=head2 swishpaths_deleted
176
177	Return array with deleted C<swishpath>s.
178
179	my $n = $i->swishpaths_deleted;
180
181	=cut
182
183	sub swishpaths_deleted {
184	my $self = shift;
185	}
186
187
188	=head2 slices
189
190	Return array with all slice names.
191
192	my @s = $i->slices;
193
194	=cut
195
196	sub slices {
197	my $self = shift;
198	}
199
200	=head1 Helper methods
201
202	This methods are used internally, but they might be useful.
203
204	=head2 in_slice
205
206	Takes path and return slice in which this path belongs.
207
208	my $s = $i->in_slice('path/to/document/in/index');
209
210	If there are C<slices> parametar to L<"open"> it will use
211	MD5 hash to spread documents across slices. That will produce random
212	distribution of your documents in slices, which might or might not be best
213	for your data. If you have to re-index large number of slices on each
214	run, think about creating your own C<slice> function and distributing
215	documents manually across slices.
216
217	This function is C<Memoize>ed for performance reasons.
218
219	=cut
220
221	sub in_slice {
222	my $self = shift;
223
224	my $path = shift \|\| confess "need path";
225
226	print Dumper($self->{'slice_name'});
227	confess "need slice_name function" unless ref ($self->{'slice_name'});
228
229	if ($self->{'slices'}) {
230	# first, pass path through slice_name function
231	my $slice = &{$self->{'slice_name'}}($path);
232	# then calculate MD5 hash
233	$slice = md5_hex($slice);
234	# take first 8 chars to produce number
235	# FIXME how random is this?
236	$slice = hex(substr($slice,0,8));
237
238	print "slice_nr: $slice slices: ",$self->{'slices'},"\n";
239	return ($slice % $self->{'slices'});
240	} else {
241	return &{$self->{'split'}}($path);
242	}
243	}
244
245	=head2 find_paths
246
247	Return array of C<swishpath>s for given C<swish-e> query.
248
249	my @p = $i->find_paths("headline=test*");
250
251	Useful for combining with L<"delete_documents"> to delete documents
252	which hasn't changed a while (so, expired).
253
254	=cut
255
256	sub find_paths {
257	my $self = shift;
258
259	my $s = shift \|\| return;
260	}
261
262
263
264	1;
265	__END__
266
267
268	=head2 Searching
269
270	Searching is still conducted using L<SWISH::API>, but you have to glob
271	index names.
272
273	use SWISH::API;
274
275	my $swish = SWISH::API->new( glob('index.swish-e/*') );
276
277	You can also alternativly create merged index (using C<merge> option) and
278	not change your source code at all.
279
280	That would also benefit performance, but it increases indexing time
281	because merged indexes must be re-created on each indexing run.
282
283	=head2 EXPORT
284
285	None by default.
286
287
288
289	=head1 SEE ALSO
290
291	L<SWISH::API>,
292	L<http://www.swish-e.org/>
293
294	=head1 AUTHOR
295
296	Dobrica Pavlinusic, E<lt>dpavlin@rot13.orgE<gt>
297
298	=head1 COPYRIGHT AND LICENSE
299
300	Copyright (C) 2004 by Dobrica Pavlinusic
301
302	This library is free software; you can redistribute it and/or modify
303	it under the same terms as Perl itself, either Perl version 5.8.4 or,
304	at your option, any later version of Perl 5 you may have available.
305
306
307	=cut