1 |
#!/usr/bin/perl -w |
2 |
|
3 |
package MWS::Plucene; |
4 |
use strict; |
5 |
use warnings; |
6 |
|
7 |
use MWS::Indexer; |
8 |
our @ISA=qw(MWS::Indexer); |
9 |
|
10 |
our $VERSION = '0.01'; |
11 |
|
12 |
use Plucene::Simple; |
13 |
use Carp; |
14 |
use Data::Dumper; |
15 |
|
16 |
=head1 NAME |
17 |
|
18 |
MWS::Plucene - index your data using Plucene |
19 |
|
20 |
=head1 DESCRIPTION |
21 |
|
22 |
Simple indexer implementation built on Plucene, the Perl port of Lucene. |
23 |
|
24 |
=head1 METHODS |
25 |
|
26 |
=head2 open_index |
27 |
|
28 |
$self->open_index; |
29 |
|
30 |
=cut |
31 |
|
32 |
# Open the Plucene index located at $self->{index_dir} and cache the handle
# in $self->{index}.
#
# Returns the index handle. Croaks when Plucene::Simple->open yields a
# false value.
sub open_index {
    my ($self) = @_;

    my $dir    = $self->{index_dir};
    my $handle = Plucene::Simple->open($dir);

    # Bail out before touching $self->{index} so a failed open leaves the
    # object unchanged.
    croak "can't open index '", $dir, "': $!" unless $handle;

    $self->{index} = $handle;

    return $self->{index};
}
40 |
|
41 |
=head2 search_index |
42 |
|
43 |
my @results = $self->search_index('message:funny'); |
44 |
|
45 |
Date limits are, well, kludged and sorting isn't supported! |
46 |
|
47 |
=cut |
48 |
|
49 |
# Run a query against the Plucene index, opening the index on first use.
#
# Parameters:
#   $s - query string (required). Two pseudo-terms are stripped from it
#        before it is handed to Plucene:
#          sort:FIELD WORD      - removed; sorting is not supported
#          date:YYYY[-MM[-DD]]  - removed (optionally double-quoted) and
#                                 turned into a from/to date range
#
# Returns whatever the index's search() (no date term) or search_during()
# (date term present) returns. Croaks when no query is given.
#
# NOTE(review): fmtdate() is not defined in this file; presumably it is
# inherited from MWS::Indexer and formats its arguments as a '-' separated
# date prefix -- confirm.
sub search_index {
    my $self = shift;

    my $s = shift || croak "search_index needs query";

    $self->open_index if (! $self->{index});

    # kill sort:something [asc|desc]
    $s =~ s/sort:\w+\s+\w+//;

    # Month and day are nested optional groups matched at most once each.
    # A repeated group (the '*' quantifier) keeps only its LAST iteration,
    # which would put the day of 'YYYY-MM-DD' into the month capture and
    # leave the day capture undefined.
    if ($s =~ s/date:"*(\d{4})(?:-(\d{2})(?:-(\d{2}))?)?"*//) {
        my ($y, $m, $d) = ($1, $2, $3);

        my ($df, $dt);    # date from, to

        if (defined $d) {
            # exact day: range collapses to a single date
            $df = $self->fmtdate($y, $m, $d);
            $dt = $df;
        }
        elsif (defined $m) {
            # whole month; "-31" is a deliberate over-approximation of the
            # last day of the month
            $df = $self->fmtdate($y, $m) . "-01";
            $dt = $self->fmtdate($y, $m) . "-31";
        }
        else {
            # whole year
            $df = $self->fmtdate($y) . "-01-01";
            $dt = $self->fmtdate($y) . "-12-31";
        }

        print STDERR "Plucene query $s [from $df to $dt]\n" if ($self->{debug});
        return $self->{index}->search_during($s, $df => $dt);
    }
    else {
        print STDERR "Plucene query $s\n" if ($self->{debug});
        return $self->{index}->search($s);
    }
}
83 |
|
84 |
=head2 add_index |
85 |
|
86 |
$self->add_index($mbox_id, $document); |
87 |
|
88 |
This method will try to fake the "message" field, which is used to search the whole |
89 |
message (including headers) with body and subject. Duplicating content is |
90 |
wrong, but using Plucene::Simple has its limitations. |
91 |
|
92 |
It will also remove time from date to make Plucene happy. |
93 |
|
94 |
=cut |
95 |
|
96 |
# Add one document to the Plucene index, opening the index on first use.
#
# Parameters:
#   $mbox_id  - key under which the document is added (required)
#   $document - hash reference with the document fields (required). It is
#               modified in place: a 'message' field is set to body
#               followed by subject, and 'date' is cut down to its
#               leading YYYY-MM-DD part.
#
# Returns nothing when the index already has a hit for "id:<document id>",
# otherwise the return value of the index's add(). Croaks when either
# argument is missing.
sub add_index {
    my $self = shift;

    my $mbox_id  = shift || croak "add_index needs mbox_id";
    my $document = shift || croak "add_index needs document";

    $self->open_index if (! $self->{index});

    # Does the index already contain this document? Only ask once the index
    # has been written to disk (an on-disk index has a "segments" file);
    # the existence test guards the search against a not-yet-created index.
    if (defined $document->{id} && -f $self->{index_dir} . "/segments") {
        return if ($self->{index}->search("id:" . $document->{id}));
    }

    print STDERR "add_index($mbox_id)\n" if ($self->{debug});

    # Fake a catch-all "message" field out of body and subject.
    $document->{message} = ($document->{body} || '') . ($document->{subject} || '');

    # Keep the YYYY-MM-DD part and drop everything after it (the time).
    $document->{date} =~ s/(\d{4}-\d+-\d+).*/$1/s
        if defined $document->{date};

    return $self->{index}->add($mbox_id => $document);
}
117 |
|
118 |
=head2 close_index |
119 |
|
120 |
$self->close_index; |
121 |
|
122 |
This will also optimize the Plucene index. |
123 |
|
124 |
=cut |
125 |
|
126 |
# Finish work on the index by asking Plucene to optimize it.
#
# Does nothing (and returns nothing) when no index handle has been opened,
# so closing an index that was never used is not an error. Otherwise
# returns whatever the handle's optimize() returns. The handle stays cached
# in $self->{index}.
sub close_index {
    my $self = shift;

    my $index = $self->{index};
    return unless $index;

    return $index->optimize;
}
131 |
|
132 |
1; |