SWISH-Split/trunk/Split.pm

package SWISH::Split;

use 5.008;
use strict;
use warnings;

our $VERSION = '0.00';

use SWISH::API;
use Text::Iconv;
use File::Temp qw/ :mktemp /;
use Carp;
use Digest::MD5 qw(md5_hex);
use Memoize;

use Data::Dumper;

=head1 NAME

SWISH::Split - Perl interface to split index variant of Swish-e

=head1 SYNOPSIS

  use SWISH::Split;


=head1 DESCRIPTION

This is alternative interface for indexing data with swish-e. It's designed
to split indexes over multiple files to allow updates of records in index
by reindexing just changed parts.

Data is stored in index using interface which is somewhat similar to
L<Plucene::Simple>. This could make your migration (or supporting two index
engines) easier.

In the background, it will fork swish-e binaries (one for each index slice)
and produce UTF-8 encoded XML files. So, if your input charset isn't
C<ISO-8859-1> you will have to specify it.

=head1 Methods used for indexing

=head2 open

Create new object for index.

  my $i = SWISH::Split->open(
        index => '/path/to/index',
        slice_name => \&slice_on_path,
        slices => 30,
        merge => 1,
        codepage => 'ISO-8859-2'
  );

  # split index on first component of path
  sub slice_on_path {
        return (split(/\//, $_[0]))[0];
  }


C<slices> is maximum number of index slices. See L<"in_slice"> for
more explanation.

=cut

# Module-wide converter from the input charset to UTF-8. It starts out as
# ISO-8859-1 and is replaced in open() when a 'codepage' option is given,
# so the converter is shared by every object created from this module.
my $iso2utf = Text::Iconv->new('ISO-8859-1','UTF-8');

# Constructor: validate the options and return a blessed index object.
#
# Options may be given either as a single hash reference or as a plain
# key/value list:
#   index      - path to an existing directory holding the index slices
#   slice_name - reference to a function mapping a path to a slice name
#   slices     - maximum number of slices (must be true)
#   codepage   - optional input charset; replaces the module-wide
#                ISO-8859-1 to UTF-8 converter
#
# Croaks when a required option is missing or when 'index' is not an
# existing directory.
sub open {
        my $class = shift;

        # {@_} on a lone hash reference would create a hash with one
        # stringified key, so unpack the reference explicitly.
        my %args = (@_ == 1 && ref $_[0] eq 'HASH') ? %{ $_[0] } : @_;
        my $self = bless { %args }, $class;

        croak "need slice_name coderef" unless ref $self->{'slice_name'};
        croak "need slices" unless $self->{'slices'};

        croak "need index" unless $self->{'index'};
        croak "index '",$self->{'index'},"' doesn't exist" unless -e $self->{'index'};
        croak "index '",$self->{'index'},"' is not directory" unless -d $self->{'index'};

        # The converter is shared module-wide, so the most recent open()
        # with a 'codepage' option decides the charset for every object.
        $iso2utf = Text::Iconv->new($self->{'codepage'},'UTF-8') if ($self->{'codepage'});

        # Wrap in_slice only once per process; calling memoize() again on
        # every open() would stack one cache layer on top of another.
        our $in_slice_memoized;
        memoize('in_slice') unless $in_slice_memoized++;

        return $self;
}

=head2 add

Add document to index.

  $i->add($swishpath, {
        headline => 'foobar result',
        property => 'data',
  })

=cut

sub add {
        # Not implemented yet: only the invocant is unpacked; the path and
        # the property hash passed by the caller are ignored and nothing is
        # written to any slice.
        my $self = shift;
}

=head2 delete

Delete document from index.

  $i->delete($swishpath);

=cut

sub delete {
        # Not implemented yet: only the invocant is unpacked; the path
        # argument is ignored and nothing is removed from the index.
        my $self = shift;
}


=head2 close

Close index file and finish indexing.

  $i->close;

This is most time-consuming operation. When it's called, it will re-index
all entries which haven't changed in all slices.

=cut

sub close {
        # Not implemented yet: only the invocant is unpacked; no
        # re-indexing or merging is performed here.
        my $self = shift;
}


=head1 Reporting methods

These methods return statistics about your index.

=head2 swishpaths

Return array of C<swishpath>s in index.

  my @p = $i->swishpaths;

=cut

sub swishpaths {
        # Not implemented yet: only the invocant is unpacked. As written,
        # the sub's implicit return value is the invocant, not the list of
        # paths described in the POD.
        my $self = shift;
}

=head2 swishpaths_updated

Return array with updated C<swishpath>s.

  my @d = $i->swishpaths_updated;

=cut

sub swishpaths_updated {
        # Not implemented yet: only the invocant is unpacked. As written,
        # the sub's implicit return value is the invocant, not the list of
        # updated paths described in the POD.
        my $self = shift;
}


=head2 swishpaths_deleted

Return array with deleted C<swishpath>s.

  my $n = $i->swishpaths_deleted;

=cut

sub swishpaths_deleted {
        # Not implemented yet: only the invocant is unpacked. As written,
        # the sub's implicit return value is the invocant, not the list of
        # deleted paths described in the POD.
        my $self = shift;
}


=head2 slices

Return array with all slice names.

  my @s = $i->slices;

=cut

sub slices {
        # Not implemented yet: only the invocant is unpacked. As written,
        # the sub's implicit return value is the invocant, not the list of
        # slice names described in the POD.
        my $self = shift;
}

=head1 Helper methods

These methods are used internally, but they might be useful.

=head2 in_slice

Takes path and return slice in which this path belongs.

  my $s = $i->in_slice('path/to/document/in/index');

If there is a C<slices> parameter to L<"open"> it will use
MD5 hash to spread documents across slices. That will produce random
distribution of your documents in slices, which might or might not be best
for your data. If you have to re-index large number of slices on each
run, think about creating your own C<slice> function and distributing
documents manually across slices.

This function is C<Memoize>ed for performance reasons.

=cut

# Map a document path to the slice it belongs to.
#
# Arguments:
#   $path - document path; must be defined and non-empty
#
# Returns:
#   With a true 'slices' option: an integer in 0 .. slices-1, computed from
#   the first 8 hex digits of the MD5 of slice_name($path).
#   Otherwise: whatever the 'split' function (or, when that is not set,
#   the 'slice_name' function) returns for $path.
#
# Confesses when the path is missing or 'slice_name' is not a reference.
sub in_slice {
        my $self = shift;
        my $path = shift;

        # Test definedness and length rather than truth, so that the
        # perfectly valid path '0' is not rejected.
        confess "need path" unless defined $path && length $path;

        confess "need slice_name function" unless ref ($self->{'slice_name'});

        if ($self->{'slices'}) {
                # first, pass path through slice_name function
                my $slice_name = $self->{'slice_name'}->($path);
                # then calculate MD5 hash
                my $digest = md5_hex($slice_name);
                # first 8 hex digits fit in 32 bits, so hex() is safe on
                # every platform
                # FIXME how random is this?
                my $slice_nr = hex(substr($digest, 0, 8));

                return ($slice_nr % $self->{'slices'});
        }

        # No slice count: hand the path to the user-supplied function.
        # 'split' is never validated anywhere, so fall back to the
        # already-checked 'slice_name' instead of dying on undef.
        my $splitter = $self->{'split'} || $self->{'slice_name'};
        return $splitter->($path);
}


1;
__END__


=head2 Searching

Searching is still conducted using L<SWISH::API>, but you have to glob
index names.

    use SWISH::API;

    my $swish = SWISH::API->new( glob('index.swish-e/*') );

You can also alternatively create merged index (using C<merge> option) and
not change your source code at all.

That would also benefit performance, but it increases indexing time
because merged indexes must be re-created on each indexing run.

=head2 EXPORT

None by default.


=head1 SEE ALSO

L<SWISH::API>,
L<http://www.swish-e.org/>

=head1 AUTHOR

Dobrica Pavlinusic, E<lt>dpavlin@rot13.orgE<gt>

=head1 COPYRIGHT AND LICENSE

Copyright (C) 2004 by Dobrica Pavlinusic

This library is free software; you can redistribute it and/or modify
it under the same terms as Perl itself, either Perl version 5.8.4 or,
at your option, any later version of Perl 5 you may have available.


=cut
1	package SWISH::Split;
2
3	use 5.008;
4	use strict;
5	use warnings;
6
7	our $VERSION = '0.00';
8
9	use SWISH::API;
10	use Text::Iconv;
11	use File::Temp qw/ :mktemp /;
12	use Carp;
13	use Digest::MD5 qw(md5_hex);
14	use Memoize;
15
16	use Data::Dumper;
17
18	=head1 NAME
19
20	SWISH::Split - Perl interface to split index variant of Swish-e
21
22	=head1 SYNOPSIS
23
24	use SWISH::Split;
25
26
27	=head1 DESCRIPTION
28
29	This is alternative interface for indexing data with swish-e. It's designed
30	to split indexes over multiple files to allow updates of records in index
31	by reindexing just changed parts.
32
33	Data is stored in index using intrface which is somewhat similar to
34	L<Plucene::Simple>. This could make your migration (or supporting two index
35	engines) easier.
36
37	In the background, it will fork swish-e binaries (one for each index slice)
38	and produce UTF-8 encoded XML files. So, if your imput charset isn't
39	C<ISO-8859-1> you will have to specify it.
40
41	=head1 Methods used for indexing
42
43	=head2 open
44
45	Create new object for index.
46
47	my $i = SWISH::Split->open({
48	index => '/path/to/index',
49	slice_name => \&slice_on_path,
50	slices => 30,
51	merge => 1,
52	codepage => 'ISO-8859-2'
53	);
54
55	# split index on first component of path
56	sub slice_on_path {
57	return shift split(/\//,$_[0]);
58	}
59
60
61	C<slices> is maximum number of index slices. See L<"in_slice"> for
62	more explanation.
63
64	=cut
65
66	my $iso2utf = Text::Iconv->new('ISO-8859-1','UTF-8');
67
68	sub open {
69	my $class = shift;
70	my $self = {@_};
71	bless($self, $class);
72
73	print Dumper($self->{'slice_name'});
74
75	croak "need slice_name coderef" unless ref $self->{'slice_name'};
76	croak "need slices" unless $self->{'slices'};
77
78	croak "need index" unless $self->{'index'};
79	croak "index '",$self->{'index'},"' doesn't exist" unless -e $self->{'index'};
80	croak "index '",$self->{'index'},"' is not directory" unless -d $self->{'index'};
81
82	$iso2utf = Text::Iconv->new($self->{'codepage'},'UTF-8') if ($self->{'codepage'});
83
84	memoize('in_slice');
85
86	$self ? return $self : return undef;
87
88	}
89
90	=head2 add
91
92	Add document to index.
93
94	$i->add($swishpath, {
95	headline => 'foobar result',
96	property => 'data',
97	})
98
99	=cut
100
101	sub add {
102	my $self = shift;
103	}
104
105	=head2 delete
106
107	Delete document from index.
108
109	$i->delete($swishpath);
110
111	=cut
112
113	sub delete {
114	my $self = shift;
115	}
116
117
118	=head2 close
119
120	Close index file and finish indexing.
121
122	$i->close;
123
124	This is most time-consuming operation. When it's called, it will re-index
125	all entries which haven't changed in all slices.
126
127	=cut
128
129	sub close {
130	my $self = shift;
131	}
132
133
134
135	=head1 Reporting methods
136
137	This methods return statistics about your index.
138
139	=head2 swishpaths
140
141	Return array of C<swishpath>s in index.
142
143	my @p = $i->swishpaths;
144
145	=cut
146
147	sub swishpaths {
148	my $self = shift;
149	}
150
151	=head2 swishpaths_updated
152
153	Return array with updated C<swishpath>s.
154
155	my @d = $i->swishpaths_updated;
156
157	=cut
158
159	sub swishpaths_updated {
160	my $self = shift;
161	}
162
163
164	=head2 swishpaths_deleted
165
166	Return array with deleted C<swishpath>s.
167
168	my $n = $i->swishpaths_deleted;
169
170	=cut
171
172	sub swishpaths_deleted {
173	my $self = shift;
174	}
175
176
177	=head2 slices
178
179	Return array with all slice names.
180
181	my @s = $i->slices;
182
183	=cut
184
185	sub slices {
186	my $self = shift;
187	}
188
189	=head1 Helper methods
190
191	This methods are used internally, but they might be useful.
192
193	=head2 in_slice
194
195	Takes path and return slice in which this path belongs.
196
197	my $s = $i->in_slice('path/to/document/in/index');
198
199	If there are C<slices> parametar to L<"open"> it will use
200	MD5 hash to spread documents across slices. That will produce random
201	distribution of your documents in slices, which might or might not be best
202	for your data. If you have to re-index large number of slices on each
203	run, think about creating your own C<slice> function and distributing
204	documents manually across slices.
205
206	This function is C<Memoize>ed for performance reasons.
207
208	=cut
209
210	sub in_slice {
211	my $self = shift;
212
213	my $path = shift \|\| confess "need path";
214
215	print Dumper($self->{'slice_name'});
216	confess "need slice_name function" unless ref ($self->{'slice_name'});
217
218	if ($self->{'slices'}) {
219	# first, pass path through slice_name function
220	my $slice = &{$self->{'slice_name'}}($path);
221	# then calculate MD5 hash
222	$slice = md5_hex($slice);
223	# take first 8 chars to produce number
224	# FIXME how random is this?
225	$slice = hex(substr($slice,0,8));
226
227	print "slice_nr: $slice slices: ",$self->{'slices'},"\n";
228	return ($slice % $self->{'slices'});
229	} else {
230	return &{$self->{'split'}}($path);
231	}
232	}
233
234
235
236	1;
237	__END__
238
239
240	=head2 Searching
241
242	Searching is still conducted using L<SWISH::API>, but you have to glob
243	index names.
244
245	use SWISH::API;
246
247	my $swish = SWISH::API->new( glob('index.swish-e/*') );
248
249	You can also alternativly create merged index (using C<merge> option) and
250	not change your source code at all.
251
252	That would also benefit performance, but it increases indexing time
253	because merged indexes must be re-created on each indexing run.
254
255	=head2 EXPORT
256
257	None by default.
258
259
260
261	=head1 SEE ALSO
262
263	L<SWISH::API>,
264	L<http://www.swish-e.org/>
265
266	=head1 AUTHOR
267
268	Dobrica Pavlinusic, E<lt>dpavlin@rot13.orgE<gt>
269
270	=head1 COPYRIGHT AND LICENSE
271
272	Copyright (C) 2004 by Dobrica Pavlinusic
273
274	This library is free software; you can redistribute it and/or modify
275	it under the same terms as Perl itself, either Perl version 5.8.4 or,
276	at your option, any later version of Perl 5 you may have available.
277
278
279	=cut