#!/usr/bin/perl -w

package MWS::Plucene;
use strict;
use warnings;

# Simple implementation to use the Lucene port to Perl (Plucene).

# This class is a subclass of MWS::Indexer.
use MWS::Indexer;
our @ISA = qw(MWS::Indexer);

our $VERSION = '0.01';

use Plucene::Simple;
use Carp;
use Data::Dumper;
15 |
|
|
16 |
|
=head1 NAME |
17 |
|
|
18 |
|
MWS::Plucene - index your data using Plucene |
19 |
|
|
20 |
|
=head1 DESCRIPTION |
21 |
|
|
22 |
|
Simple implementation to use Lucene port to perl |
23 |
|
|
24 |
|
=head1 METHODS |
25 |
|
|
26 |
|
=head2 open_index |
27 |
|
|
28 |
|
$self->open_index; |
29 |
|
|
30 |
|
=cut |
31 |
|
|
32 |
# Open the Plucene index and cache the handle in $self->{index}.
#
# The index location is read from $self->{index_dir}. A path given as the
# first argument (the calling convention of the earlier revision) takes
# precedence and is recorded in $self->{index_dir}, because add_index reads
# that key as well.
#
# Croaks if no location is known, or if Plucene::Simple->open returns a
# false value. Returns the opened index object.
sub open_index {
    my $self = shift;

    my $index_dir = shift || $self->{index_dir}
        || croak "open_index needs index_dir";

    # keep the stored location in sync with the index actually opened
    $self->{index_dir} = $index_dir;

    $self->{index} = Plucene::Simple->open($index_dir)
        || croak "can't open index '$index_dir': $!";

    return $self->{index};
}
40 |
|
|
41 |
|
=head2 search_index |
42 |
|
|
43 |
|
my @results = $self->search_index('message:funny'); |
44 |
|
|
45 |
|
Date limits are, well, kludged and sort isn't supported! |
46 |
|
|
47 |
|
=cut |
48 |
|
|
49 |
# Run a query against the Plucene index, opening the index first if it is
# not open yet.
#
# Two pseudo-terms are stripped from the query before it is handed to Plucene:
#
#   sort:FIELD [asc|desc]  - removed; sorting is not supported here
#   date:YYYY[-MM[-DD]]    - removed (optionally double-quoted) and turned
#                            into a from/to range passed to search_during
#
# A year-only date spans the whole year, a year-month date spans the whole
# month, and a full date is a single day. Dates are formatted with
# $self->fmtdate, which is not defined in this block -- it is assumed to
# return "YYYY", "YYYY-MM" or "YYYY-MM-DD" depending on how many parts it is
# given (TODO confirm against its definition).
#
# Croaks if no query is given. Returns whatever the index's search or
# search_during method returns.
sub search_index {
    my $self = shift;

    my $s = shift || croak "search_index needs query";

    $self->open_index if (! $self->{index});

    # kill sort:something [asc|desc]; the direction is optional, so a query
    # term that follows a bare sort:field must not be swallowed
    $s =~ s/sort:\w+(?:\s+(?:asc|desc)\b)?//i;

    # Each optional part is matched at most once ("?"). With "*" the first
    # group would consume both -MM and -DD and leave the day in the month slot.
    if ($s =~ s/date:"?(\d{4})(?:-(\d{2}))?(?:-(\d{2}))?"?//) {
        my ($y, $m, $d) = ($1, $2, $3);

        my ($df, $dt);    # date from, to

        if (defined $m && defined $d) {
            $df = $self->fmtdate($y, $m, $d);
            $dt = $df;
        } elsif (defined $m) {
            # real last day of the month instead of a blanket "-31"
            my @days_in_month = (31, 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31);
            my $last_day = $days_in_month[$m - 1] || 31;
            if ($m == 2 && $y % 4 == 0 && ($y % 100 != 0 || $y % 400 == 0)) {
                $last_day = 29;
            }
            $df = $self->fmtdate($y, $m) . "-01";
            $dt = $self->fmtdate($y, $m) . sprintf("-%02d", $last_day);
        } else {
            $df = $self->fmtdate($y) . "-01-01";
            $dt = $self->fmtdate($y) . "-12-31";
        }

        print STDERR "Plucene query $s [from $df to $dt]\n" if ($self->{debug});
        return $self->{index}->search_during($s, $df => $dt);
    }

    print STDERR "Plucene query $s\n" if ($self->{debug});
    return $self->{index}->search($s);
}
83 |
|
|
84 |
|
=head2 add_index |
85 |
|
|
86 |
|
$self->add_index($mbox_id, $document); |
87 |
|
|
88 |
|
This method will try to fake "message" field which is used to search whole |
89 |
|
message (including headers) with body and subject. Duplicating content is |
90 |
|
wrong, but using Plucene::Simple has its limitations. |
91 |
|
|
92 |
|
It will also remove time from date to make Plucene happy. |
93 |
|
|
94 |
|
=cut |
95 |
|
|
96 |
# Add one document to the Plucene index under the key $mbox_id, opening the
# index first if it is not open yet.
#
# Arguments:
#   $mbox_id  - key the document is stored under (required)
#   $document - hash reference with the fields to index (required); it is
#               modified in place as described below
#
# Before the document is added:
#   * If the index already exists on disk and a search for "id:<document id>"
#     returns anything, nothing is added and the sub returns early.
#   * A "message" field is faked from body and subject, so the pair can be
#     searched as one field.
#   * The time part is stripped from the "date" field, leaving YYYY-MM-DD.
#
# Croaks if either argument is missing. Returns the result of the index's
# add method, or nothing when the document is already indexed.
sub add_index {
    my $self = shift;

    my $mbox_id = shift || croak "add_index needs mbox_id";
    my $document = shift || croak "add_index needs document";

    $self->open_index if (! $self->{index});

    # does an index with this document already exist?
    # (searching is only attempted once the "segments" file is on disk)
    if (defined $document->{id}
        && defined $self->{index_dir}
        && -f $self->{index_dir} . "/segments") {
        my @hits = $self->{index}->search("id:" . $document->{id});
        return if (@hits);
    }

    print STDERR "add_index($mbox_id)\n" if ($self->{debug});

    # join with a separator so the last word of the body and the first word
    # of the subject are not fused into a single token
    $document->{message} = join("\n",
        grep { defined $_ && length $_ } $document->{body}, $document->{subject});

    # keep the date, drop everything after it (the time)
    $document->{date} =~ s/(\d{4}-\d+-\d+).*/$1/s if (defined $document->{date});

    return $self->{index}->add($mbox_id => $document);
}
117 |
|
|
118 |
|
=head2 close_index |
119 |
|
|
120 |
|
$self->close_index; |
121 |
|
|
122 |
|
This will also optimize the Plucene index file. |
123 |
|
|
124 |
|
=cut |
125 |
|
|
126 |
sub close_index { |
sub close_index { |
127 |
my $self = shift; |
my $self = shift; |
128 |
|
|