/[SWISH-Split]/trunk/Split.pm
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Contents of /trunk/Split.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 11 - (show annotations)
Fri Apr 29 22:50:16 2005 UTC (16 years, 7 months ago) by dpavlin
File size: 12940 byte(s)
some cleanups

1 package SWISH::Split;
2
3 use 5.008;
4 use strict;
5 use warnings;
6
7 our $VERSION = '0.03';
8
9 use SWISH::API;
10 use Text::Iconv;
11 use File::Temp qw/ :mktemp /;
12 use Carp;
13 use Digest::MD5 qw(md5_hex);
14 use Memoize;
15 use File::Which;
16
17 use Data::Dumper;
18
19 use constant {
20 ADDED => 1,
21 DELETED => 2,
22 };
23
24 =head1 NAME
25
26 SWISH::Split - Perl interface to split index variant of Swish-e
27
28 =head1 SYNOPSIS
29
30 use SWISH::Split;
31
32
33 =head1 DESCRIPTION
34
35 This is alternative interface for indexing data with swish-e. It's designed
36 to split indexes over multiple files (slices) to allow updates of records in index
37 by reindexing just changed parts (slice).
38
39 Data is stored in index using intrface which is somewhat similar to
40 L<Plucene::Simple>. This could make your migration (or supporting two index
41 engines) easier.
42
43 In the background, it will fork swish-e binaries (one for each index slice)
44 and produce UTF-8 encoded XML files for it. So, if your input charset isn't
45 C<ISO-8859-1> you will have to specify it.
46
47 =head1 Methods used for indexing
48
49 =head2 open_index
50
51 Create new object for index.
52
53 my $i = SWISH::Split->open_index({
54 index => '/path/to/index',
55 slice_name => \&slice_on_path,
56 slices => 30,
57 merge => 0,
58 codepage => 'ISO-8859-2',
59 swish_config => qq{
60 PropertyNames from date
61 PropertyNamesDate date
62 },
63 memoize_to_xml => 0,
64 );
65
66 # split index on first component of path
67 sub slice_on_path {
68 return shift split(/\//,$_[0]);
69 }
70
71 Options to C<open_index> are following:
72
73 =over 5
74
75 =item C<index>
76
77 path to (existing) directory in which index slices will be created.
78
79 =item C<slice_name>
80
81 coderef to function which provide slicing from path.
82
83 =item C<slices>
84
85 maximum number of index slices. See L<"in_slice"> for
86 more explanation.
87
88 =item C<merge>
89
90 (planned) option to merge indexes into one at end.
91
92 =item C<codepage>
93
94 data codepage (needed for conversion to UTF-8).
95 By default, it's C<ISO-8859-1>.
96
97 =item C<swish_config>
98
99 additional parametars which will be inserted into
100 C<swish-e> configuration file. See C<swish-config>.
101
102 =item C<memoize_to_xml>
103
104 speed up repeatable data, see L<"to_xml">.
105
106 =back
107
108 =cut
109
110 my $iso2utf = Text::Iconv->new('ISO-8859-1','UTF-8');
111
112 sub open_index {
113 my $class = shift;
114 my $self = {@_};
115 bless($self, $class);
116
117 croak "need slice_name coderef" unless ref $self->{'slice_name'};
118 croak "need slices" unless $self->{'slices'};
119
120 croak "need index" unless $self->{'index'};
121 croak "index '",$self->{'index'},"' doesn't exist" unless -e $self->{'index'};
122 croak "index '",$self->{'index'},"' is not directory" unless -d $self->{'index'};
123
124 $iso2utf = Text::Iconv->new($self->{'codepage'},'UTF-8') if ($self->{'codepage'});
125
126 # speedup
127 memoize('in_slice');
128 memoize('to_xml') if ($self->{'memoize_to_xml'});
129
130 $self ? return $self : return undef;
131
132 }
133
134 =head2 add
135
136 Add document to index.
137
138 $i->add($swishpath, {
139 headline => 'foobar result',
140 property => 'data',
141 })
142
143 =cut
144
145 sub add {
146 my $self = shift;
147
148 my $swishpath = shift || return;
149 my $data = shift || return;
150
151 my $slice = $self->put_slice($swishpath, $self->to_xml($data));
152
153 return $slice;
154 }
155
156 =head2 delete
157
158 Delete documents from index.
159
160 $i->delete(@swishpath);
161
162 =cut
163
164 sub delete {
165 my $self = shift;
166
167 my @paths = @_ || return;
168
169 foreach my $path (@paths) {
170 $self->{'paths'}->{$path} = DELETED;
171 }
172
173 return 42;
174 }
175
176
177 =head2 done
178
179 Finish indexing and close index file(s).
180
181 $i->done;
182
183 This is most time-consuming operation. When it's called, it will re-index
184 all entries which haven't changed in all slices.
185
186 Returns number of slices updated.
187
188 This method should really be called close or finish, but both of those are
189 allready used.
190
191 =cut
192
193 sub done {
194 my $self = shift;
195
196 my $ret = 0;
197
198 foreach my $s (keys %{$self->{'slice'}}) {
199 $self->_debug("closing slice $s");
200 $ret += $self->close_slice($s);
201 }
202
203 return $ret;
204 }
205
206
207
208 =head1 Reporting methods
209
210 This methods return statistics about your index.
211
212 =head2 swishpaths
213
214 Return array of C<swishpath>s in index.
215
216 my @p = $i->swishpaths;
217
218 =cut
219
220 sub swishpaths {
221 my $self = shift;
222
223 my $s = shift || return;
224 return if (! exists($self->{'slice'}->{'s'}));
225
226 return keys %{$self->{'slice'}->{'s'}};
227 }
228
229 =head2 swishpaths_updated
230
231 Return array with updated C<swishpath>s.
232
233 my @d = $i->swishpaths_updated;
234
235 =cut
236
237 sub swishpaths_updated {
238 my $self = shift;
239 }
240
241
242 =head2 swishpaths_deleted
243
244 Return array with deleted C<swishpath>s.
245
246 my $n = $i->swishpaths_deleted;
247
248 =cut
249
250 sub swishpaths_deleted {
251 my $self = shift;
252 }
253
254
255 =head2 slices
256
257 Return array with all slice names.
258
259 my @s = $i->slices;
260
261 =cut
262
263 sub slices {
264 my $self = shift;
265 }
266
267 =head1 Helper methods
268
269 This methods are used internally, but they might be useful.
270
271 =head2 in_slice
272
273 Takes path and return slice in which this path belongs.
274
275 my $s = $i->in_slice('path/to/document/in/index');
276
277 If there are C<slices> parametar to L<"open_index"> it will use
278 MD5 hash to spread documents across slices. That will produce random
279 distribution of your documents in slices, which might or might not be best
280 for your data. If you have to re-index large number of slices on each
281 run, think about creating your own C<slice> function and distributing
282 documents manually across slices.
283
284 Slice number must always be true value or various sanity checks will fail.
285
286 This function is C<Memoize>ed for performance reasons.
287
288 =cut
289
290 sub in_slice {
291 my $self = shift;
292
293 my $path = shift || confess "need path";
294
295 confess "need slice_name function" unless ref ($self->{'slice_name'});
296
297 if ($self->{'slices'}) {
298 # first, pass path through slice_name function
299 my $slice = &{$self->{'slice_name'}}($path);
300 # then calculate MD5 hash
301 my $hash = md5_hex($slice);
302 # take first 8 chars to produce number
303 # FIXME how random is this?
304 $hash = hex(substr($hash,0,8));
305
306 $slice = ($hash % $self->{'slices'}) + 1;
307 $self->_debug("hash: $hash / ",$self->{'slices'}," => $slice");
308 return $slice;
309 } else {
310 return &{$self->{'split'}}($path);
311 }
312 }
313
314 =head2 find_paths
315
316 Return array of C<swishpath>s for given C<swish-e> query.
317
318 my @p = $i->find_paths("headline=test*");
319
320 Useful for combining with L<"delete_documents"> to delete documents
321 which hasn't changed a while (so, expired).
322
323 =cut
324
325 sub find_paths {
326 my $self = shift;
327
328 }
329
330
331 =head2 make_config
332
333 Create C<swish-e> configuration file for given slice.
334
335 my $config_filename = $i->make_config('slice name');
336
337 It returns configuration filename. If no C<swish_config> was defined in
338 L<"open_index">, default swish-e configuration will be used. It will index all data for
339 searching, but none for properties.
340
341 If you want to see what is allready defined for swish-e in configuration
342 take a look at source code for C<DEFAULT_SWISH_CONF>.
343
344 It uses C<stdin> as C<IndexDir> to comunicate with C<swish-e>.
345
346 =cut
347
348 sub make_config {
349 my $self = shift;
350
351
352 my $index_file = $self->{'index'}."/";
353 $index_file .= shift || confess "need slice name";
354
355 my ($tmp_fh, $swish_config_filename) = mkstemp("/tmp/swishXXXXX");
356
357 # find cat on filesystem
358 my $cat = which('cat');
359
360 print $tmp_fh <<"DEFAULT_SWISH_CONF";
361 # swish-e config file
362
363 IndexDir stdin
364
365 # input file definition
366 DefaultContents XML*
367
368 # indexed metatags
369 MetaNames xml swishdocpath
370
371
372 #XMLClassAttributes type
373 UndefinedMetaTags auto
374 UndefinedXMLAttributes auto
375
376 IndexFile $index_file
377
378 # Croatian ISO-8859-2 characters to unaccented equivalents
379 TranslateCharacters ¹©ðÐèÈæƾ® ssddcccczz
380
381
382 # disable output
383 ParserWarnLevel 0
384 IndexReport 1
385
386 DEFAULT_SWISH_CONF
387
388 # add user parametars (like stored properties)
389 print $tmp_fh $self->{'swish_config'} if ($self->{'swish_config'});
390
391 close($tmp_fh);
392
393 return $swish_config_filename;
394 }
395
396 =head2 create_slice
397
398 On first run, starts C<swish-e>. On subsequent calls just return
399 it's handles using C<Memoize>.
400
401 my $s = create_slice('/path/to/document');
402
403 You shouldn't need to call C<create_slice> directly because it will be called
404 from L<"put_slice"> when needed.
405
406 =cut
407
408 sub create_slice {
409 my $self = shift;
410
411 my $path = shift || confess "create_slice need path!";
412
413 my $s = $self->in_slice($path) || confess "in_slice returned null";
414
415 return $s if (exists($self->{'slice'}->{$s}));
416
417 my $swish_config = $self->make_config($s);
418
419 my $swish = qq{| swish-e };
420 if (-f $self->{'index'}.'/'.$s) {
421 $swish .= qq{ -u };
422 $self->{'slice'}->{$s}->{'update_mode'}++;
423 }
424 $swish .= qq{ -S prog -c } . $swish_config;
425
426 $self->_debug("creating slice $s using $swish");
427
428 ## Build the harness, open all pipes, and launch the subprocesses
429 open(my $fh, $swish) || croak "can't open $swish: $!";
430
431 $self->{'slice'}->{$s}->{'h'} = $fh;
432
433 $self->slice_output($s);
434
435 return $s;
436 }
437
438 =head2 put_slice
439
440 Pass XML data to swish.
441
442 my $slice = $i->put_slice('/swish/path', '<xml>data</xml>');
443
444 Returns slice in which XML ended up.
445
446 =cut
447
448 sub put_slice {
449 my $self = shift;
450
451 my $path = shift || confess "need path";
452 my $xml = shift || confess "need xml";
453
454 $xml = $iso2utf->convert($xml) || carp "XML conversion error in $xml";
455
456 my $s = $self->create_slice($path) || confess "create_slice returned null";
457
458 confess "no slice $s" unless(exists($self->{'slice'}) && exists($self->{'slice'}->{$s}));
459 confess "no 'h' in slice $s: ".Dumper($s) unless (exists($self->{'slice'}->{$s}->{'h'}));
460
461 $self->slice_output($s);
462
463 use bytes; # as opposed to chars
464 my $fh = $self->{'slice'}->{$s}->{'h'} || confess "handle for slice $s undefined";
465
466 my $update_header = "Update-Mode: Index\n";
467 $update_header = '' unless ($self->{'slice'}->{$s}->{'update_mode'});
468
469 print { $fh } "Path-Name: $path\n".
470 "Content-Length: ".(length($xml)+1)."\n" . $update_header .
471 "Document-Type: XML\n\n$xml\n";
472
473 $self->slice_output($s);
474
475 $self->_debug("dumping in slice $s: $path");
476
477 $self->{'paths'}->{$path} = ADDED;
478
479 return $s;
480 }
481
482 =head2 slice_output
483
484 Prints to STDERR output and errors from C<swish-e>.
485
486 my $slice = $i->slice_output($s);
487
488 Normally, you don't need to call it.
489
490 B<This is dummy placeholder function for very old code that assumes this
491 module is using C<IPC::Run> which it isn't any more.>
492
493 =cut
494
495 sub slice_output {
496 my $self = shift;
497
498 my $s = shift || confess "slice_output needs slice";
499
500 confess "no slice $s" unless(exists($self->{'slice'}) && exists($self->{'slice'}->{$s}));
501
502 # FIXME
503
504 return $s;
505 }
506
507 =head2 close_slice
508
509 Close slice (terminates swish-e process for that slice).
510
511 my $i->close_slice($s);
512
513 Returns true if slice is closed, false otherwise.
514
515 =cut
516
517 sub close_slice {
518 my $self = shift;
519
520 my $s = shift || confess "close_slice needs slice";
521
522 confess "no slice $s" unless(exists($self->{'slice'}) && exists($self->{'slice'}->{$s}));
523 confess "no 'h' in slice $s: ".Dumper($s) unless (exists($self->{'slice'}->{$s}->{'h'}));
524
525 # pump rest of content (if any)
526 close $self->{'slice'}->{$s}->{'h'} || carp "can't close slice $s: $!";
527
528 $self->slice_output($s);
529
530 undef $self->{'slice'}->{$s}->{'h'};
531
532 delete($self->{'slice'}->{$s}) && return 1;
533 return 0;
534 }
535
536 =head2 to_xml
537
538 Convert (binary safe, I hope) your data into XML for C<swish-e>.
539 Data will not yet be recoded to UTF-8. L<"put_slice"> will do that.
540
541 my $xml = $i->to_xml({ foo => 'bar' });
542
543 This function is extracted from L<"add"> method so that you can C<Memoize> it.
544 If your data set has a lot of repeatable data, and memory is not a problem, you
545 can add C<memoize_to_xml> option to L<"open_index">.
546
547 =cut
548
549 my %escape = ('<'=>'&lt;', '>'=>'&gt;', '&'=>'&amp;', '"'=>'&quot;');
550 my $escape_re = join '|' => keys %escape;
551
552 sub to_xml {
553 my $self = shift;
554
555 my $data = shift || return;
556
557 my $xml = qq{<xml>};
558 foreach my $tag (keys %$data) {
559 my $content = $data->{$tag};
560 next if (! $content || $content eq '');
561 # save [cr/]lf before conversion to XML
562 # $content =~ s/\n\r/##lf##/gs;
563 # $content =~ s/\n/##lf##/gs;
564 $content =~ s/($escape_re)/$escape{$1}/gs;
565 $xml .= "<$tag><![CDATA[".$content."]]></$tag>";
566 }
567 $xml .= qq{</xml>};
568 }
569
570 sub _debug {
571 my $self = shift;
572 print STDERR "## ",@_,"\n" if ($self->{'debug'});
573 return;
574 }
575
576 1;
577 __END__
578
579
580 =head1 Searching
581
582 Searching is still conducted using L<SWISH::API>, but you have to glob
583 index names.
584
585 use SWISH::API;
586
587 my $swish = SWISH::API->new( glob('index.swish-e/*') );
588
589 You can also alternativly create merged index (using C<merge> option) and
590 not change your source code at all.
591
592 That would also benefit performance, but it increases indexing time
593 because merged indexes must be re-created on each indexing run.
594
595 =head1 EXPORT
596
597 Nothing by default.
598
599 =head1 EXAMPLES
600
601 Test script for this module uses all parts of API. It's also nice example
602 how to use C<SWISH::Split>.
603
604 =head1 SEE ALSO
605
606 L<SWISH::API>,
607 L<http://www.swish-e.org/>
608
609 =head1 AUTHOR
610
611 Dobrica Pavlinusic, E<lt>dpavlin@rot13.orgE<gt>
612
613 =head1 COPYRIGHT AND LICENSE
614
615 Copyright (C) 2004 by Dobrica Pavlinusic
616
617 This library is free software; you can redistribute it and/or modify
618 it under the same terms as Perl itself, either Perl version 5.8.4 or,
619 at your option, any later version of Perl 5 you may have available.
620
621
622 =cut

  ViewVC Help
Powered by ViewVC 1.1.26