/[mws]/trunk/MWS/Plucene.pm
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Contents of /trunk/MWS/Plucene.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 41 - (show annotations)
Mon May 10 20:26:17 2004 UTC (20 years ago) by dpavlin
File size: 2625 byte(s)
major code re-structuring: separation of indexer code into target-independent
and target-dependent parts, documentation improvements

1 #!/usr/bin/perl -w
2
3 package MWS::Plucene;
4 use strict;
5 use warnings;
6
7 use MWS::Indexer;
8 our @ISA=qw(MWS::Indexer);
9
10 our $VERSION = '0.01';
11
12 use Plucene::Simple;
13 use Carp;
14 use Data::Dumper;
15
16 =head1 NAME
17
18 MWS::Plucene - index your data using Plucene
19
20 =head1 DESCRIPTION
21
22 Simple indexer implementation which uses Plucene, the Perl port of Lucene.
23
24 =head1 METHODS
25
=head2 open_index

 my $index = $self->open_index;

Opens the Plucene index found in C<< $self->{index_dir} >>, stores the
resulting object in C<< $self->{index} >> and returns it.

Croaks if C<< Plucene::Simple->open >> returns a false value.

=cut

sub open_index {
	my ($self) = @_;

	my $index = Plucene::Simple->open( $self->{index_dir} );

	# complain from the caller's point of view when there is nothing to use
	croak "can't open index '",$self->{index_dir},"': $!" unless $index;

	$self->{index} = $index;

	return $index;
}
40
=head2 search_index

 my @results = $self->search_index('message:funny');
 my @results = $self->search_index('message:funny date:2004-05');

Runs a query against the Plucene index (opening it first if that hasn't
been done yet) and returns whatever the underlying index search returns.

A C<sort:field direction> clause is removed from the query, because sort
isn't supported.

A C<date:YYYY>, C<date:YYYY-MM> or C<date:YYYY-MM-DD> clause (optionally
in double quotes) is removed from the query and converted into a date
range which is passed to C<search_during>:

=over 4

=item * year only - from January 1st to December 31st of that year

=item * year and month - from the first to the last day of that month

=item * full date - just that single day

=back

Date strings are built using C<< $self->fmtdate >>, which has to be provided
by the parent class.

Croaks if query isn't specified.

=cut

sub search_index {
	my $self = shift;

	my $s = shift || croak "search_index needs query";

	$self->open_index if (! $self->{index});

	# kill sort:something [asc|desc]
	$s =~ s/sort:\w+\s+\w+//;

	# Month and day are optional and nested, so that each of them is captured
	# at most once (day can be present only if month is).
	if ($s =~ s/date:"*(\d{4})(?:-(\d{2})(?:-(\d{2}))?)?"*//) {
		my ($y,$m,$d) = ($1,$2,$3);

		my ($df,$dt);	# date from, to

		if (defined $d) {
			# single day
			$df = $self->fmtdate($y,$m,$d);
			$dt = $df;
		} elsif (defined $m) {
			# whole month, ending on its real last day
			$df = $self->fmtdate($y,$m)."-01";
			$dt = $self->fmtdate($y,$m).sprintf("-%02d", _days_in_month($y,$m));
		} else {
			# whole year
			$df = $self->fmtdate($y)."-01-01";
			$dt = $self->fmtdate($y)."-12-31";
		}
		print STDERR "Plucene query $s [from $df to $dt]\n" if ($self->{debug});
		return $self->{index}->search_during($s, $df => $dt);
	} else {
		print STDERR "Plucene query $s\n" if ($self->{debug});
		return $self->{index}->search($s);
	}
}

# Return number of days in month (1-12) of given year, taking Gregorian
# leap years into account. Months out of range give 31, which is what
# was always used as the last day before.
sub _days_in_month {
	my ($year, $month) = @_;

	return 31 if ($month < 1 || $month > 12);

	my $leap = ($year % 4 == 0 && ($year % 100 != 0 || $year % 400 == 0));
	return 29 if ($month == 2 && $leap);

	my @days = (31, 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31);
	return $days[$month - 1];
}
83
=head2 add_index

 $self->add_index($mbox_id, $document);

Adds C<$document> (hash reference with fields) to the index under
C<$mbox_id>, opening the index first if that hasn't been done yet.

If the index already exists on disk (it has a C<segments> file) and a search
for C<id:$document-E<gt>{id}> finds something, the document is considered
already indexed and nothing is added.

This method will try to fake "message" field which is used to search whole
message (including headers) with body and subject. Duplicating content is
wrong, but using Plucene::Simple has its limitations. Body and subject are
separated by newline so that last word of body and first word of subject
stay separate words.

It will also remove time from date (leaving just C<YYYY-MM-DD>) to make
Plucene happy. Date which doesn't contain something that looks like
C<YYYY-MM-DD> is left unchanged.

B<Note:> C<$document> is modified in place (C<message> and C<date> fields).

Croaks if C<$mbox_id> or C<$document> are missing.

=cut

sub add_index {
	my $self = shift;

	my $mbox_id = shift || croak "add_index needs mbox_id";
	my $document = shift || croak "add_index needs document";

	$self->open_index if (! $self->{index});

	# Does index with this document already exist? Search only if there is
	# an index on disk and the document has an id to look for.
	if (defined $document->{id} && -f $self->{index_dir}."/segments") {
		# list context, so that number of hits is what gets tested
		my @hits = $self->{index}->search("id:".$document->{id});
		return if (@hits);
	}

	print STDERR "add_index($mbox_id)\n" if ($self->{debug});

	# fake "message" field from non-empty body and subject
	$document->{message} = join("\n",
		grep { defined $_ && length $_ } $document->{body}, $document->{subject}
	);

	# keep only date part, dropping time (and anything else around it)
	if (defined $document->{date} && $document->{date} =~ m/(\d{4}-\d+-\d+)/) {
		$document->{date} = $1;
	}

	return $self->{index}->add($mbox_id => $document );
}
117
=head2 close_index

 $self->close_index;

This will optimize Plucene index file by calling C<optimize> on the index
object in C<< $self->{index} >> and return its result.

If index was never opened there is nothing to optimize, so this method
just returns.

=cut

sub close_index {
	my $self = shift;

	# index was never opened (or open failed): nothing to do
	return unless ($self->{index});

	return $self->{index}->optimize;
}
131
132 1;

  ViewVC Help
Powered by ViewVC 1.1.26