1 |
#!/usr/bin/perl -w |
2 |
|
3 |
package MWS::Clucene; |
4 |
use strict; |
5 |
use warnings; |
6 |
|
7 |
use MWS::Indexer; |
8 |
our @ISA=qw(MWS::Indexer); |
9 |
|
10 |
our $VERSION = '0.01'; |
11 |
|
12 |
BEGIN {
    # This is so utterly wrong that I won't even start to talk about it.
    # But FulltextSearch::CLucene still lacks a real Makefile.PL, so the
    # module is picked up from a build directory under the user's home.
    #
    # NOTE(review): changing LD_LIBRARY_PATH from inside an already running
    # process may not influence how this process resolves shared libraries;
    # confirm that it has the intended effect on the target platform.
    my $home = defined $ENV{'HOME'} ? $ENV{'HOME'} : '';
    my $dir  = $home . '/clucene.output';
    my $lib  = "$dir/perl";

    # Prepend to an existing LD_LIBRARY_PATH instead of clobbering it, so
    # libraries the environment already relies on remain reachable.
    my $current = $ENV{'LD_LIBRARY_PATH'};
    $ENV{'LD_LIBRARY_PATH'} =
        ( defined $current && length $current ) ? "$lib:$current" : $lib;

    # Make the Perl side of the wrapper loadable by the "use" that follows.
    unshift @INC, $lib;
}
20 |
|
21 |
use FulltextSearch::CLucene; |
22 |
use Carp; |
23 |
use Data::Dumper; |
24 |
|
25 |
=head1 NAME |
26 |
|
27 |
MWS::Clucene - index your data using CLucene |
28 |
|
29 |
=head1 DESCRIPTION |
30 |
|
31 |
Simple implementation to use Lucene port to C++ |
32 |
|
33 |
=head1 METHODS |
34 |
|
35 |
=head2 open_index |
36 |
|
37 |
$self->open_index; |
38 |
|
39 |
=cut |
40 |
|
41 |
# Open the CLucene index and remember the handle in $self->{index}.
#
# The index location is taken from $self->{index_dir}; when that is not
# set, the historical default "./index" is used.
#
# Returns the index handle. Croaks when the handle cannot be created.
sub open_index {
    my $self = shift;

    # Previously the path was hard-coded to "./index" although the error
    # message (and add_index) refer to $self->{index_dir}.
    my $path = ( defined $self->{index_dir} && length $self->{index_dir} )
        ? $self->{index_dir}
        : './index';

    $self->{index} = FulltextSearch::CLucene->new( path => $path )
        || croak "can't open index '$path': $!";

    return $self->{index};
}
49 |
|
50 |
=head2 search_index |
51 |
|
52 |
my @results = $self->search_index('message:funny'); |
53 |
|
54 |
Date limits are, well, kludged and sort isn't supported!
55 |
|
56 |
=cut |
57 |
|
58 |
# Run a query against the index, opening the index first if needed.
#
# Arguments:
#   $s - query string (required; croaks when missing or false)
#
# A "sort:field direction" clause is stripped from the query because
# sorting is not supported. A date:YYYY, date:YYYY-MM or date:YYYY-MM-DD
# clause (optionally double-quoted) is removed from the query and turned
# into a from/to pair handed to search_during(); otherwise the query goes
# to search() unchanged.
#
# Returns whatever the underlying search()/search_during() call returns.
sub search_index {
    my $self = shift;

    my $s = shift || croak "search_index needs query";

    $self->open_index if (! $self->{index});

    # kill sort:something [asc|desc]
    $s =~ s/sort:\w+\s+\w+//;

    # Month and day are each optional and may appear at most once; the
    # former "*" quantifiers let a repeated group overwrite the month.
    if ($s =~ s/date:"?(\d{4})(?:-(\d{2}))?(?:-(\d{2}))?"?//) {
        my ($y, $m, $d) = ($1, $2, $3);

        my ($df, $dt);    # date from, to

        if ($y && $m && $d) {
            # exact day: the range starts and ends on the same date
            # (this used to pass the year where the day belongs)
            $df = $self->fmtdate($y, $m, $d);
            $dt = $df;
        } elsif ($y && $m) {
            # whole month; "-31" is a kludge applied to every month
            $df = $self->fmtdate($y, $m) . "-01";
            $dt = $self->fmtdate($y, $m) . "-31";
        } elsif ($y) {
            # whole year
            $df = $self->fmtdate($y) . "-01-01";
            $dt = $self->fmtdate($y) . "-12-31";
        }
        print STDERR "Plucene query $s [from $df to $dt]\n" if ($self->{debug});
        return $self->{index}->search_during($s, $df => $dt);
    } else {
        print STDERR "Plucene query $s\n" if ($self->{debug});
        return $self->{index}->search($s);
    }
}
92 |
|
93 |
=head2 add_index |
94 |
|
95 |
$self->add_index($mbox_id, $document); |
96 |
|
97 |
This method will try to fake "message" field which is used to search whole
message (including headers) with body and subject. Duplicating content is
wrong, but using Plucene::Simple has its limitations.
100 |
|
101 |
It will also remove time from date to make Plucene happy. |
102 |
|
103 |
=cut |
104 |
|
105 |
# Add one document to the index, opening the index first if needed.
#
# Arguments:
#   $mbox_id  - mailbox identifier (required; only used in debug output)
#   $document - hash reference of field name => value (required)
#
# The document hash is modified in place: a synthetic "message" field is
# set to body followed by subject, and "date" is reduced to its
# YYYY-MM-DD part. Every key of the hash is then stored: "date" through
# add_date(), all other keys through add_field().
#
# Returns early, without adding anything, when the index already holds a
# document with the same id.
sub add_index {
    my $self = shift;

    my $mbox_id  = shift || croak "add_index needs mbox_id";
    my $document = shift || croak "add_index needs document";

    $self->open_index if (! $self->{index});

    # Does the index already contain this document? Only ask once an
    # index exists on disk. The file name used to be misspelled
    # "segmets", so this check never ran.
    # NOTE(review): correcting the name activates the search() call below
    # for the first time -- confirm its arguments and return value.
    if (defined $self->{index_dir} && -f $self->{index_dir} . "/segments") {
        return if ($self->{index}->search( field => "id:", query => $document->{id} ));
    }

    print STDERR "add_index($mbox_id)\n" if ($self->{debug});

    $document->{message} = ($document->{body} || '') . ($document->{subject} || '');

    # Keep the date and drop whatever follows it (the time). The previous
    # substitution removed the date itself as well.
    $document->{date} =~ s/(\d{4}-\d+-\d+).*/$1/ if defined $document->{date};

    $self->{index}->new_document;
    foreach my $f (keys %{$document}) {
        if (lc($f) eq 'date') {
            $self->{index}->add_date( field => $f, value => $document->{$f} );
        } else {
            $self->{index}->add_field( field => $f, value => $document->{$f} );
        }
    }

    $self->{index}->insert_document;
}
135 |
|
136 |
=head2 close_index |
137 |
|
138 |
$self->close_index; |
139 |
|
140 |
This will also optimize Plucene index file. |
141 |
|
142 |
=cut |
143 |
|
144 |
# Close the index handle held in $self->{index}.
#
# Does nothing when no index is open. After closing, the handle is
# removed from the object so that a later search_index/add_index call
# reopens the index instead of reusing a closed handle.
#
# Returns the result of the underlying close() call, or nothing when
# there was no open index.
sub close_index {
    my $self = shift;

    # Never opened, or already closed: calling ->close on undef would die.
    return unless $self->{index};

    my $result = $self->{index}->close;

    # Forget the closed handle; the other methods test $self->{index} to
    # decide whether open_index has to be called.
    delete $self->{index};

    return $result;
}
149 |
|
150 |
1; |