1 |
dpavlin |
1 |
package SWISH::Split; |
2 |
|
|
|
3 |
|
|
use 5.008; |
4 |
|
|
use strict; |
5 |
|
|
use warnings; |
6 |
|
|
|
7 |
dpavlin |
9 |
our $VERSION = '0.03'; |
8 |
dpavlin |
1 |
|
9 |
|
|
use SWISH::API; |
10 |
|
|
use Text::Iconv; |
11 |
|
|
use File::Temp qw/ :mktemp /; |
12 |
|
|
use Carp; |
13 |
|
|
use Digest::MD5 qw(md5_hex); |
14 |
|
|
use Memoize; |
15 |
dpavlin |
4 |
use File::Which; |
16 |
dpavlin |
1 |
|
17 |
|
|
use Data::Dumper; |
18 |
|
|
|
19 |
dpavlin |
5 |
use constant { |
20 |
|
|
ADDED => 1, |
21 |
|
|
DELETED => 2, |
22 |
|
|
}; |
23 |
|
|
|
24 |
dpavlin |
1 |
=head1 NAME |
25 |
|
|
|
26 |
|
|
SWISH::Split - Perl interface to split index variant of Swish-e |
27 |
|
|
|
28 |
|
|
=head1 SYNOPSIS |
29 |
|
|
|
30 |
|
|
use SWISH::Split; |
31 |
|
|
|
32 |
|
|
|
33 |
|
|
=head1 DESCRIPTION |
34 |
|
|
|
35 |
|
|
This is alternative interface for indexing data with swish-e. It's designed |
36 |
dpavlin |
4 |
to split indexes over multiple files (slices) to allow updates of records in index |
37 |
|
|
by reindexing just changed parts (slice). |
38 |
dpavlin |
1 |
|
39 |
|
|
Data is stored in index using intrface which is somewhat similar to |
40 |
|
|
L<Plucene::Simple>. This could make your migration (or supporting two index |
41 |
|
|
engines) easier. |
42 |
|
|
|
43 |
|
|
In the background, it will fork swish-e binaries (one for each index slice) |
44 |
dpavlin |
6 |
and produce UTF-8 encoded XML files for it. So, if your input charset isn't |
45 |
dpavlin |
1 |
C<ISO-8859-1> you will have to specify it. |
46 |
|
|
|
47 |
|
|
=head1 Methods used for indexing |
48 |
|
|
|
49 |
dpavlin |
8 |
=head2 open_index |
50 |
dpavlin |
1 |
|
51 |
|
|
Create new object for index. |
52 |
|
|
|
53 |
dpavlin |
8 |
my $i = SWISH::Split->open_index({ |
54 |
dpavlin |
1 |
index => '/path/to/index', |
55 |
|
|
slice_name => \&slice_on_path, |
56 |
|
|
slices => 30, |
57 |
dpavlin |
4 |
merge => 0, |
58 |
|
|
codepage => 'ISO-8859-2', |
59 |
|
|
swish_config => qq{ |
60 |
|
|
PropertyNames from date |
61 |
|
|
PropertyNamesDate date |
62 |
|
|
}, |
63 |
|
|
memoize_to_xml => 0, |
64 |
dpavlin |
1 |
); |
65 |
|
|
|
66 |
|
|
# split index on first component of path |
67 |
|
|
sub slice_on_path { |
68 |
|
|
return shift split(/\//,$_[0]); |
69 |
|
|
} |
70 |
|
|
|
71 |
dpavlin |
8 |
Options to C<open_index> are following: |
72 |
dpavlin |
1 |
|
73 |
dpavlin |
4 |
=over 5 |
74 |
|
|
|
75 |
|
|
=item C<index> |
76 |
|
|
|
77 |
|
|
path to (existing) directory in which index slices will be created. |
78 |
|
|
|
79 |
|
|
=item C<slice_name> |
80 |
|
|
|
81 |
|
|
coderef to function which provide slicing from path. |
82 |
|
|
|
83 |
|
|
=item C<slices> |
84 |
|
|
|
85 |
|
|
maximum number of index slices. See L<"in_slice"> for |
86 |
dpavlin |
1 |
more explanation. |
87 |
|
|
|
88 |
dpavlin |
4 |
=item C<merge> |
89 |
|
|
|
90 |
|
|
(planned) option to merge indexes into one at end. |
91 |
|
|
|
92 |
|
|
=item C<codepage> |
93 |
|
|
|
94 |
|
|
data codepage (needed for conversion to UTF-8). |
95 |
|
|
By default, it's C<ISO-8859-1>. |
96 |
|
|
|
97 |
|
|
=item C<swish_config> |
98 |
|
|
|
99 |
|
|
additional parametars which will be inserted into |
100 |
dpavlin |
11 |
C<swish-e> configuration file. See C<swish-config>. |
101 |
dpavlin |
4 |
|
102 |
|
|
=item C<memoize_to_xml> |
103 |
|
|
|
104 |
|
|
speed up repeatable data, see L<"to_xml">. |
105 |
|
|
|
106 |
|
|
=back |
107 |
|
|
|
108 |
dpavlin |
1 |
=cut |
109 |
|
|
|
110 |
|
|
my $iso2utf = Text::Iconv->new('ISO-8859-1','UTF-8'); |
111 |
|
|
|
112 |
dpavlin |
8 |
sub open_index { |
113 |
dpavlin |
1 |
my $class = shift; |
114 |
|
|
my $self = {@_}; |
115 |
|
|
bless($self, $class); |
116 |
|
|
|
117 |
|
|
croak "need slice_name coderef" unless ref $self->{'slice_name'}; |
118 |
|
|
croak "need slices" unless $self->{'slices'}; |
119 |
|
|
|
120 |
|
|
croak "need index" unless $self->{'index'}; |
121 |
|
|
croak "index '",$self->{'index'},"' doesn't exist" unless -e $self->{'index'}; |
122 |
|
|
croak "index '",$self->{'index'},"' is not directory" unless -d $self->{'index'}; |
123 |
|
|
|
124 |
|
|
$iso2utf = Text::Iconv->new($self->{'codepage'},'UTF-8') if ($self->{'codepage'}); |
125 |
|
|
|
126 |
dpavlin |
4 |
# speedup |
127 |
dpavlin |
1 |
memoize('in_slice'); |
128 |
dpavlin |
4 |
memoize('to_xml') if ($self->{'memoize_to_xml'}); |
129 |
dpavlin |
1 |
|
130 |
|
|
$self ? return $self : return undef; |
131 |
|
|
|
132 |
|
|
} |
133 |
|
|
|
134 |
|
|
=head2 add |
135 |
|
|
|
136 |
|
|
Add document to index. |
137 |
|
|
|
138 |
|
|
$i->add($swishpath, { |
139 |
|
|
headline => 'foobar result', |
140 |
|
|
property => 'data', |
141 |
|
|
}) |
142 |
|
|
|
143 |
|
|
=cut |
144 |
|
|
|
145 |
|
|
sub add { |
146 |
|
|
my $self = shift; |
147 |
dpavlin |
3 |
|
148 |
|
|
my $swishpath = shift || return; |
149 |
|
|
my $data = shift || return; |
150 |
|
|
|
151 |
dpavlin |
7 |
my $slice = $self->put_slice($swishpath, $self->to_xml($data)); |
152 |
dpavlin |
4 |
|
153 |
dpavlin |
7 |
return $slice; |
154 |
dpavlin |
1 |
} |
155 |
|
|
|
156 |
|
|
=head2 delete |
157 |
|
|
|
158 |
dpavlin |
3 |
Delete documents from index. |
159 |
dpavlin |
1 |
|
160 |
dpavlin |
3 |
$i->delete(@swishpath); |
161 |
dpavlin |
1 |
|
162 |
dpavlin |
13 |
B<This function is not implemented.> |
163 |
|
|
|
164 |
dpavlin |
1 |
=cut |
165 |
|
|
|
166 |
|
|
sub delete { |
167 |
|
|
my $self = shift; |
168 |
dpavlin |
3 |
|
169 |
|
|
my @paths = @_ || return; |
170 |
|
|
|
171 |
dpavlin |
5 |
foreach my $path (@paths) { |
172 |
|
|
$self->{'paths'}->{$path} = DELETED; |
173 |
|
|
} |
174 |
|
|
|
175 |
dpavlin |
13 |
die "delete is not yet implemented"; |
176 |
|
|
|
177 |
dpavlin |
3 |
return 42; |
178 |
dpavlin |
1 |
} |
179 |
|
|
|
180 |
|
|
|
181 |
dpavlin |
5 |
=head2 done |
182 |
dpavlin |
1 |
|
183 |
dpavlin |
4 |
Finish indexing and close index file(s). |
184 |
dpavlin |
1 |
|
185 |
dpavlin |
5 |
$i->done; |
186 |
dpavlin |
1 |
|
187 |
|
|
This is most time-consuming operation. When it's called, it will re-index |
188 |
|
|
all entries which haven't changed in all slices. |
189 |
|
|
|
190 |
dpavlin |
4 |
Returns number of slices updated. |
191 |
|
|
|
192 |
dpavlin |
5 |
This method should really be called close or finish, but both of those are |
193 |
|
|
allready used. |
194 |
|
|
|
195 |
dpavlin |
1 |
=cut |
196 |
|
|
|
197 |
dpavlin |
5 |
sub done { |
198 |
dpavlin |
1 |
my $self = shift; |
199 |
dpavlin |
3 |
|
200 |
dpavlin |
4 |
my $ret = 0; |
201 |
|
|
|
202 |
|
|
foreach my $s (keys %{$self->{'slice'}}) { |
203 |
dpavlin |
8 |
$self->_debug("closing slice $s"); |
204 |
dpavlin |
4 |
$ret += $self->close_slice($s); |
205 |
|
|
} |
206 |
|
|
|
207 |
|
|
return $ret; |
208 |
dpavlin |
1 |
} |
209 |
|
|
|
210 |
|
|
|
211 |
|
|
|
212 |
|
|
=head1 Reporting methods |
213 |
|
|
|
214 |
|
|
This methods return statistics about your index. |
215 |
|
|
|
216 |
|
|
=head2 swishpaths |
217 |
|
|
|
218 |
|
|
Return array of C<swishpath>s in index. |
219 |
|
|
|
220 |
|
|
my @p = $i->swishpaths; |
221 |
|
|
|
222 |
|
|
=cut |
223 |
|
|
|
224 |
|
|
sub swishpaths { |
225 |
|
|
my $self = shift; |
226 |
dpavlin |
5 |
|
227 |
|
|
my $s = shift || return; |
228 |
|
|
return if (! exists($self->{'slice'}->{'s'})); |
229 |
|
|
|
230 |
|
|
return keys %{$self->{'slice'}->{'s'}}; |
231 |
dpavlin |
1 |
} |
232 |
|
|
|
233 |
|
|
=head2 swishpaths_updated |
234 |
|
|
|
235 |
|
|
Return array with updated C<swishpath>s. |
236 |
|
|
|
237 |
|
|
my @d = $i->swishpaths_updated; |
238 |
|
|
|
239 |
|
|
=cut |
240 |
|
|
|
241 |
|
|
sub swishpaths_updated { |
242 |
|
|
my $self = shift; |
243 |
|
|
} |
244 |
|
|
|
245 |
|
|
|
246 |
|
|
=head2 swishpaths_deleted |
247 |
|
|
|
248 |
|
|
Return array with deleted C<swishpath>s. |
249 |
|
|
|
250 |
|
|
my $n = $i->swishpaths_deleted; |
251 |
|
|
|
252 |
|
|
=cut |
253 |
|
|
|
254 |
|
|
sub swishpaths_deleted { |
255 |
|
|
my $self = shift; |
256 |
|
|
} |
257 |
|
|
|
258 |
|
|
|
259 |
|
|
=head2 slices |
260 |
|
|
|
261 |
|
|
Return array with all slice names. |
262 |
|
|
|
263 |
|
|
my @s = $i->slices; |
264 |
|
|
|
265 |
|
|
=cut |
266 |
|
|
|
267 |
|
|
sub slices { |
268 |
|
|
my $self = shift; |
269 |
|
|
} |
270 |
|
|
|
271 |
|
|
=head1 Helper methods |
272 |
|
|
|
273 |
|
|
This methods are used internally, but they might be useful. |
274 |
|
|
|
275 |
|
|
=head2 in_slice |
276 |
|
|
|
277 |
|
|
Takes path and return slice in which this path belongs. |
278 |
|
|
|
279 |
|
|
my $s = $i->in_slice('path/to/document/in/index'); |
280 |
|
|
|
281 |
dpavlin |
8 |
If there are C<slices> parametar to L<"open_index"> it will use |
282 |
dpavlin |
1 |
MD5 hash to spread documents across slices. That will produce random |
283 |
|
|
distribution of your documents in slices, which might or might not be best |
284 |
|
|
for your data. If you have to re-index large number of slices on each |
285 |
|
|
run, think about creating your own C<slice> function and distributing |
286 |
|
|
documents manually across slices. |
287 |
|
|
|
288 |
dpavlin |
4 |
Slice number must always be true value or various sanity checks will fail. |
289 |
|
|
|
290 |
dpavlin |
1 |
This function is C<Memoize>ed for performance reasons. |
291 |
|
|
|
292 |
|
|
=cut |
293 |
|
|
|
294 |
|
|
sub in_slice { |
295 |
|
|
my $self = shift; |
296 |
|
|
|
297 |
|
|
my $path = shift || confess "need path"; |
298 |
|
|
|
299 |
|
|
confess "need slice_name function" unless ref ($self->{'slice_name'}); |
300 |
|
|
|
301 |
|
|
if ($self->{'slices'}) { |
302 |
|
|
# first, pass path through slice_name function |
303 |
|
|
my $slice = &{$self->{'slice_name'}}($path); |
304 |
|
|
# then calculate MD5 hash |
305 |
dpavlin |
8 |
my $hash = md5_hex($slice); |
306 |
dpavlin |
1 |
# take first 8 chars to produce number |
307 |
|
|
# FIXME how random is this? |
308 |
dpavlin |
8 |
$hash = hex(substr($hash,0,8)); |
309 |
dpavlin |
1 |
|
310 |
dpavlin |
8 |
$slice = ($hash % $self->{'slices'}) + 1; |
311 |
|
|
$self->_debug("hash: $hash / ",$self->{'slices'}," => $slice"); |
312 |
dpavlin |
4 |
return $slice; |
313 |
dpavlin |
1 |
} else { |
314 |
|
|
return &{$self->{'split'}}($path); |
315 |
|
|
} |
316 |
|
|
} |
317 |
|
|
|
318 |
dpavlin |
3 |
=head2 find_paths |
319 |
dpavlin |
1 |
|
320 |
dpavlin |
3 |
Return array of C<swishpath>s for given C<swish-e> query. |
321 |
dpavlin |
1 |
|
322 |
dpavlin |
3 |
my @p = $i->find_paths("headline=test*"); |
323 |
|
|
|
324 |
|
|
Useful for combining with L<"delete_documents"> to delete documents |
325 |
|
|
which hasn't changed a while (so, expired). |
326 |
|
|
|
327 |
|
|
=cut |
328 |
|
|
|
329 |
|
|
sub find_paths { |
330 |
|
|
my $self = shift; |
331 |
|
|
|
332 |
|
|
} |
333 |
|
|
|
334 |
|
|
|
335 |
dpavlin |
4 |
=head2 make_config |
336 |
dpavlin |
3 |
|
337 |
dpavlin |
4 |
Create C<swish-e> configuration file for given slice. |
338 |
|
|
|
339 |
|
|
my $config_filename = $i->make_config('slice name'); |
340 |
|
|
|
341 |
|
|
It returns configuration filename. If no C<swish_config> was defined in |
342 |
dpavlin |
8 |
L<"open_index">, default swish-e configuration will be used. It will index all data for |
343 |
dpavlin |
4 |
searching, but none for properties. |
344 |
|
|
|
345 |
|
|
If you want to see what is allready defined for swish-e in configuration |
346 |
|
|
take a look at source code for C<DEFAULT_SWISH_CONF>. |
347 |
|
|
|
348 |
dpavlin |
7 |
It uses C<stdin> as C<IndexDir> to comunicate with C<swish-e>. |
349 |
dpavlin |
4 |
|
350 |
|
|
=cut |
351 |
|
|
|
352 |
|
|
sub make_config { |
353 |
|
|
my $self = shift; |
354 |
|
|
|
355 |
|
|
|
356 |
|
|
my $index_file = $self->{'index'}."/"; |
357 |
|
|
$index_file .= shift || confess "need slice name"; |
358 |
|
|
|
359 |
|
|
my ($tmp_fh, $swish_config_filename) = mkstemp("/tmp/swishXXXXX"); |
360 |
|
|
|
361 |
|
|
# find cat on filesystem |
362 |
|
|
my $cat = which('cat'); |
363 |
|
|
|
364 |
|
|
print $tmp_fh <<"DEFAULT_SWISH_CONF"; |
365 |
|
|
# swish-e config file |
366 |
|
|
|
367 |
dpavlin |
7 |
IndexDir stdin |
368 |
dpavlin |
4 |
|
369 |
|
|
# input file definition |
370 |
|
|
DefaultContents XML* |
371 |
|
|
|
372 |
|
|
# indexed metatags |
373 |
|
|
MetaNames xml swishdocpath |
374 |
|
|
|
375 |
|
|
|
376 |
|
|
#XMLClassAttributes type |
377 |
|
|
UndefinedMetaTags auto |
378 |
|
|
UndefinedXMLAttributes auto |
379 |
|
|
|
380 |
|
|
IndexFile $index_file |
381 |
|
|
|
382 |
|
|
# Croatian ISO-8859-2 characters to unaccented equivalents |
383 |
|
|
TranslateCharacters ¹©ðÐèÈæƾ® ssddcccczz |
384 |
|
|
|
385 |
|
|
|
386 |
|
|
# disable output |
387 |
|
|
ParserWarnLevel 0 |
388 |
|
|
IndexReport 1 |
389 |
|
|
|
390 |
|
|
DEFAULT_SWISH_CONF |
391 |
|
|
|
392 |
|
|
# add user parametars (like stored properties) |
393 |
|
|
print $tmp_fh $self->{'swish_config'} if ($self->{'swish_config'}); |
394 |
|
|
|
395 |
|
|
close($tmp_fh); |
396 |
|
|
|
397 |
|
|
return $swish_config_filename; |
398 |
|
|
} |
399 |
|
|
|
400 |
|
|
=head2 create_slice |
401 |
|
|
|
402 |
dpavlin |
8 |
On first run, starts C<swish-e>. On subsequent calls just return |
403 |
dpavlin |
11 |
it's handles using C<Memoize>. |
404 |
dpavlin |
4 |
|
405 |
|
|
my $s = create_slice('/path/to/document'); |
406 |
|
|
|
407 |
|
|
You shouldn't need to call C<create_slice> directly because it will be called |
408 |
|
|
from L<"put_slice"> when needed. |
409 |
|
|
|
410 |
|
|
=cut |
411 |
|
|
|
412 |
|
|
sub create_slice { |
413 |
|
|
my $self = shift; |
414 |
|
|
|
415 |
|
|
my $path = shift || confess "create_slice need path!"; |
416 |
|
|
|
417 |
|
|
my $s = $self->in_slice($path) || confess "in_slice returned null"; |
418 |
|
|
|
419 |
|
|
return $s if (exists($self->{'slice'}->{$s})); |
420 |
|
|
|
421 |
|
|
my $swish_config = $self->make_config($s); |
422 |
|
|
|
423 |
dpavlin |
8 |
my $swish = qq{| swish-e }; |
424 |
dpavlin |
9 |
if (-f $self->{'index'}.'/'.$s) { |
425 |
|
|
$swish .= qq{ -u }; |
426 |
|
|
$self->{'slice'}->{$s}->{'update_mode'}++; |
427 |
|
|
} |
428 |
dpavlin |
8 |
$swish .= qq{ -S prog -c } . $swish_config; |
429 |
dpavlin |
4 |
|
430 |
dpavlin |
8 |
$self->_debug("creating slice $s using $swish"); |
431 |
dpavlin |
4 |
|
432 |
|
|
## Build the harness, open all pipes, and launch the subprocesses |
433 |
dpavlin |
8 |
open(my $fh, $swish) || croak "can't open $swish: $!"; |
434 |
dpavlin |
4 |
|
435 |
dpavlin |
8 |
$self->{'slice'}->{$s}->{'h'} = $fh; |
436 |
dpavlin |
4 |
|
437 |
|
|
$self->slice_output($s); |
438 |
|
|
|
439 |
|
|
return $s; |
440 |
|
|
} |
441 |
|
|
|
442 |
|
|
=head2 put_slice |
443 |
|
|
|
444 |
dpavlin |
7 |
Pass XML data to swish. |
445 |
dpavlin |
4 |
|
446 |
dpavlin |
7 |
my $slice = $i->put_slice('/swish/path', '<xml>data</xml>'); |
447 |
dpavlin |
4 |
|
448 |
dpavlin |
7 |
Returns slice in which XML ended up. |
449 |
|
|
|
450 |
dpavlin |
4 |
=cut |
451 |
|
|
|
452 |
|
|
sub put_slice { |
453 |
|
|
my $self = shift; |
454 |
|
|
|
455 |
|
|
my $path = shift || confess "need path"; |
456 |
|
|
my $xml = shift || confess "need xml"; |
457 |
|
|
|
458 |
|
|
$xml = $iso2utf->convert($xml) || carp "XML conversion error in $xml"; |
459 |
|
|
|
460 |
|
|
my $s = $self->create_slice($path) || confess "create_slice returned null"; |
461 |
|
|
|
462 |
|
|
confess "no slice $s" unless(exists($self->{'slice'}) && exists($self->{'slice'}->{$s})); |
463 |
|
|
confess "no 'h' in slice $s: ".Dumper($s) unless (exists($self->{'slice'}->{$s}->{'h'})); |
464 |
|
|
|
465 |
|
|
$self->slice_output($s); |
466 |
|
|
|
467 |
|
|
use bytes; # as opposed to chars |
468 |
dpavlin |
8 |
my $fh = $self->{'slice'}->{$s}->{'h'} || confess "handle for slice $s undefined"; |
469 |
dpavlin |
9 |
|
470 |
|
|
my $update_header = "Update-Mode: Index\n"; |
471 |
|
|
$update_header = '' unless ($self->{'slice'}->{$s}->{'update_mode'}); |
472 |
|
|
|
473 |
dpavlin |
8 |
print { $fh } "Path-Name: $path\n". |
474 |
dpavlin |
9 |
"Content-Length: ".(length($xml)+1)."\n" . $update_header . |
475 |
dpavlin |
4 |
"Document-Type: XML\n\n$xml\n"; |
476 |
|
|
|
477 |
|
|
$self->slice_output($s); |
478 |
|
|
|
479 |
dpavlin |
8 |
$self->_debug("dumping in slice $s: $path"); |
480 |
|
|
|
481 |
dpavlin |
5 |
$self->{'paths'}->{$path} = ADDED; |
482 |
|
|
|
483 |
dpavlin |
4 |
return $s; |
484 |
|
|
} |
485 |
|
|
|
486 |
|
|
=head2 slice_output |
487 |
|
|
|
488 |
|
|
Prints to STDERR output and errors from C<swish-e>. |
489 |
|
|
|
490 |
dpavlin |
7 |
my $slice = $i->slice_output($s); |
491 |
dpavlin |
4 |
|
492 |
|
|
Normally, you don't need to call it. |
493 |
|
|
|
494 |
dpavlin |
8 |
B<This is dummy placeholder function for very old code that assumes this |
495 |
|
|
module is using C<IPC::Run> which it isn't any more.> |
496 |
|
|
|
497 |
dpavlin |
4 |
=cut |
498 |
|
|
|
499 |
|
|
sub slice_output { |
500 |
|
|
my $self = shift; |
501 |
|
|
|
502 |
|
|
my $s = shift || confess "slice_output needs slice"; |
503 |
|
|
|
504 |
|
|
confess "no slice $s" unless(exists($self->{'slice'}) && exists($self->{'slice'}->{$s})); |
505 |
|
|
|
506 |
dpavlin |
8 |
# FIXME |
507 |
dpavlin |
4 |
|
508 |
dpavlin |
7 |
return $s; |
509 |
dpavlin |
4 |
} |
510 |
|
|
|
511 |
dpavlin |
5 |
=head2 close_slice |
512 |
dpavlin |
4 |
|
513 |
|
|
Close slice (terminates swish-e process for that slice). |
514 |
|
|
|
515 |
|
|
my $i->close_slice($s); |
516 |
|
|
|
517 |
|
|
Returns true if slice is closed, false otherwise. |
518 |
|
|
|
519 |
|
|
=cut |
520 |
|
|
|
521 |
|
|
sub close_slice { |
522 |
|
|
my $self = shift; |
523 |
|
|
|
524 |
|
|
my $s = shift || confess "close_slice needs slice"; |
525 |
|
|
|
526 |
|
|
confess "no slice $s" unless(exists($self->{'slice'}) && exists($self->{'slice'}->{$s})); |
527 |
|
|
confess "no 'h' in slice $s: ".Dumper($s) unless (exists($self->{'slice'}->{$s}->{'h'})); |
528 |
|
|
|
529 |
|
|
# pump rest of content (if any) |
530 |
dpavlin |
8 |
close $self->{'slice'}->{$s}->{'h'} || carp "can't close slice $s: $!"; |
531 |
dpavlin |
4 |
|
532 |
|
|
$self->slice_output($s); |
533 |
|
|
|
534 |
dpavlin |
8 |
undef $self->{'slice'}->{$s}->{'h'}; |
535 |
dpavlin |
4 |
|
536 |
|
|
delete($self->{'slice'}->{$s}) && return 1; |
537 |
|
|
return 0; |
538 |
|
|
} |
539 |
|
|
|
540 |
|
|
=head2 to_xml |
541 |
|
|
|
542 |
|
|
Convert (binary safe, I hope) your data into XML for C<swish-e>. |
543 |
|
|
Data will not yet be recoded to UTF-8. L<"put_slice"> will do that. |
544 |
|
|
|
545 |
|
|
my $xml = $i->to_xml({ foo => 'bar' }); |
546 |
|
|
|
547 |
dpavlin |
11 |
This function is extracted from L<"add"> method so that you can C<Memoize> it. |
548 |
dpavlin |
4 |
If your data set has a lot of repeatable data, and memory is not a problem, you |
549 |
dpavlin |
8 |
can add C<memoize_to_xml> option to L<"open_index">. |
550 |
dpavlin |
4 |
|
551 |
|
|
=cut |
552 |
|
|
|
553 |
|
|
my %escape = ('<'=>'<', '>'=>'>', '&'=>'&', '"'=>'"'); |
554 |
|
|
my $escape_re = join '|' => keys %escape; |
555 |
|
|
|
556 |
|
|
sub to_xml { |
557 |
|
|
my $self = shift; |
558 |
|
|
|
559 |
|
|
my $data = shift || return; |
560 |
|
|
|
561 |
|
|
my $xml = qq{<xml>}; |
562 |
|
|
foreach my $tag (keys %$data) { |
563 |
|
|
my $content = $data->{$tag}; |
564 |
|
|
next if (! $content || $content eq ''); |
565 |
|
|
# save [cr/]lf before conversion to XML |
566 |
|
|
# $content =~ s/\n\r/##lf##/gs; |
567 |
|
|
# $content =~ s/\n/##lf##/gs; |
568 |
|
|
$content =~ s/($escape_re)/$escape{$1}/gs; |
569 |
|
|
$xml .= "<$tag><![CDATA[".$content."]]></$tag>"; |
570 |
|
|
} |
571 |
|
|
$xml .= qq{</xml>}; |
572 |
|
|
} |
573 |
|
|
|
574 |
dpavlin |
8 |
sub _debug { |
575 |
|
|
my $self = shift; |
576 |
|
|
print STDERR "## ",@_,"\n" if ($self->{'debug'}); |
577 |
|
|
return; |
578 |
|
|
} |
579 |
|
|
|
580 |
dpavlin |
1 |
1; |
581 |
|
|
__END__ |
582 |
|
|
|
583 |
|
|
|
584 |
dpavlin |
5 |
=head1 Searching |
585 |
dpavlin |
1 |
|
586 |
|
|
Searching is still conducted using L<SWISH::API>, but you have to glob |
587 |
|
|
index names. |
588 |
|
|
|
589 |
|
|
use SWISH::API; |
590 |
|
|
|
591 |
|
|
my $swish = SWISH::API->new( glob('index.swish-e/*') ); |
592 |
|
|
|
593 |
|
|
You can also alternativly create merged index (using C<merge> option) and |
594 |
|
|
not change your source code at all. |
595 |
|
|
|
596 |
|
|
That would also benefit performance, but it increases indexing time |
597 |
|
|
because merged indexes must be re-created on each indexing run. |
598 |
|
|
|
599 |
dpavlin |
5 |
=head1 EXPORT |
600 |
dpavlin |
1 |
|
601 |
dpavlin |
4 |
Nothing by default. |
602 |
dpavlin |
1 |
|
603 |
dpavlin |
5 |
=head1 EXAMPLES |
604 |
dpavlin |
1 |
|
605 |
dpavlin |
4 |
Test script for this module uses all parts of API. It's also nice example |
606 |
|
|
how to use C<SWISH::Split>. |
607 |
dpavlin |
1 |
|
608 |
|
|
=head1 SEE ALSO |
609 |
|
|
|
610 |
|
|
L<SWISH::API>, |
611 |
|
|
L<http://www.swish-e.org/> |
612 |
|
|
|
613 |
|
|
=head1 AUTHOR |
614 |
|
|
|
615 |
|
|
Dobrica Pavlinusic, E<lt>dpavlin@rot13.orgE<gt> |
616 |
|
|
|
617 |
|
|
=head1 COPYRIGHT AND LICENSE |
618 |
|
|
|
619 |
|
|
Copyright (C) 2004 by Dobrica Pavlinusic |
620 |
|
|
|
621 |
|
|
This library is free software; you can redistribute it and/or modify |
622 |
|
|
it under the same terms as Perl itself, either Perl version 5.8.4 or, |
623 |
|
|
at your option, any later version of Perl 5 you may have available. |
624 |
|
|
|
625 |
|
|
|
626 |
|
|
=cut |