/[mws]/trunk/mbox2index.pl
This is a repository of my old source code, which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Contents of /trunk/mbox2index.pl

Parent Directory Parent Directory | Revision Log Revision Log


Revision 56 - (show annotations)
Wed Dec 1 22:33:19 2004 UTC (19 years, 5 months ago) by dpavlin
File MIME type: text/plain
File size: 2876 byte(s)
If I have class inheritance, I might as well use it :-)

#!/usr/bin/perl -w

use strict;
use warnings;

# Make the bundled MWS modules loadable from the lib/ directory that sits
# next to the real location of this script. FindBin's $RealBin resolves
# symlinks (including relative ones) and also works when $0 carries no
# directory component; the hand-rolled readlink()/regex combination used
# previously produced a wrong @INC entry in both of those cases.
use FindBin qw($RealBin);
use lib "$RealBin/lib";

=head1 NAME

mbox2index.pl - indexing script for Mail::Box Web Search

=head1 SYNOPSIS

 mbox2index.pl [--recursive] [local.conf]

=head1 DESCRIPTION

This script will index mailboxes defined in C<global.conf> or in a local
configuration file supplied on the command line.

In normal operation, using MWS::SWISH it will exec swish-e which will in
turn again call this script, but this time with C<--recursive> option.
When C<--recursive> is given, the index is not (re)created before the
messages are added.

=head1 SEE ALSO

C<MWS> perl modules which are part of this package

=cut

#use MWS::SWISH;
use MWS::Plucene;
use Data::Dumper;
use Date::Parse;
use POSIX qw(strftime);
use Getopt::Long;

# Print the usage text to STDERR (it is an error path, so it must not
# pollute STDOUT) and terminate with exit status 1.
sub usage {
    print STDERR qq{Usage: $0 [/path/to/local.conf]

If local.conf is not specified, global.conf in current directory will
be used.
};
    exit 1;
}

# are we called from this script?
my $recursive = 0;

# Stop on unknown options instead of silently ignoring the result of
# GetOptions() (which has already printed a warning for each of them).
GetOptions("recursive" => \$recursive) or usage();

# First remaining argument is the configuration file; fall back to
# global.conf in the current directory.
my $config_file = shift @ARGV || 'global.conf';

usage() if (! -f $config_file);

#my $mws = MWS::SWISH->new(config_file => $config_file);
# NOTE(review): MWS::Indexer is not loaded explicitly in this script;
# presumably "use MWS::Plucene" makes it available -- confirm.
my $mws = MWS::Indexer->new(config_file => $config_file);

# Only the top-level invocation creates the index.
$mws->create_index if (! $recursive);

print STDERR "starting indexing...\n";

my $debug = 1;

# Walk over every folder listed in the [folders] section of the config.
foreach my $mbox ($mws->{config}->Parameters('folders')) {
    my $mbox_path = $mws->{config}->val('folders', $mbox);

    print STDERR "working on $mbox [$mbox_path]\n" if ($debug);

    my $folder = $mws->open_folder($mbox);

    # messageIds is evaluated in scalar context to obtain a message count
    my $total = scalar $folder->messageIds;

    print STDERR "$total messages\n" if ($debug);

    my $count = 0;

    foreach my $message ($folder->messages) {

        my $id = $message->messageId;

        # Document handed to the indexer; further fields are added below.
        my $document = {
            id     => $id,
            folder => $mbox,
        };

        # Store every address header as <direction>_<part> fields
        # (e.g. from_address, to_phrase); multiple values are joined
        # with "##".
        foreach my $direction (qw(to from cc bcc)) {
            foreach my $part (qw(phrase address comment)) {
                my @data = $mws->unroll($message, $direction, $part);
                if (@data) {
                    my $field = $direction . '_' . $part;
                    $document->{$field} = join("##", @data);
                    # strip "(E-mail)" decorations from the values
                    $document->{$field} =~ s/\s*\(e\s*-\s*mail\)\s*//gi;
                }
            }
        }

        $document->{'subject'} = $mws->decode_qp($message->subject) || 'no subject';

        $document->{'body'} = $mws->plain_text_body($message);

        my $utime = $message->timestamp;

        $document->{'date_utime'} = $utime;
        # human readable date (local time) is only set for a true timestamp
        $document->{'date'} = strftime("%Y-%m-%d %H:%M:%S", localtime($utime)) if ($utime);

        # print Dumper($document);
        $mws->add_index("$mbox $id" => $document);

        # clear internal MWS cache to keep memory usage down
        # (this should be replaced by garbage collector in MWS,
        # but without it this is the best solution to keep machine
        # alive while indexing)
        $mws->{cache} = {};

        # this is not complete solution. see mailbox-destruct.diff
        $message->destruct();

        $count++;

        # Progress report every 100 messages.
        if ($count % 100 == 0) {
            # never divide by zero, even if the id count came back as 0
            my $percent = $total ? $count * 100 / $total : 0;
            # the folder name is passed as an argument (not interpolated
            # into the format) so a '%' in it cannot be taken as a
            # printf conversion
            printf STDERR "%d messages in %s done [%d %%]\n", $count, $mbox, $percent;
        }

    }

    $mws->close_folder($mbox);

}

$mws->close_index;

Properties

Name Value
svn:executable *

  ViewVC Help
Powered by ViewVC 1.1.26