/[mws]/trunk/mbox2index.pl
This is a repository of my old source code, which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Annotation of /trunk/mbox2index.pl

Parent Directory | Revision Log


Revision 56 - (hide annotations)
Wed Dec 1 22:33:19 2004 UTC (19 years, 5 months ago) by dpavlin
File MIME type: text/plain
File size: 2876 byte(s)
If I have class inheritance, I might as well use it :-)

#!/usr/bin/perl -w

use strict;

BEGIN {
    # Add the lib/ directory that sits next to this script to @INC.
    # If the script was started through a symlink, use the link target.
    my $basedir = readlink($0) || $0;

    # Replace the trailing "/scriptname" with "/lib". When $0 carries no
    # directory part (e.g. "perl mbox2index.pl") the substitution matches
    # nothing, so fall back to ./lib instead of pushing the script name
    # itself onto @INC.
    $basedir = './lib' unless $basedir =~ s#/[^/]+$#/lib#;
    unshift(@INC, $basedir);
}

=head1 NAME

mbox2index.pl - indexing script for Mail::Box Web Search

=head1 SYNOPSIS

mbox2index.pl [local.conf]

=head1 DESCRIPTION

This script will index mailboxes defined in C<global.conf> or local
configuration file supplied at command line.

In normal operation, using MWS::SWISH it will exec swish-e which will in
turn again call this script, but this time with C<--recursive> option.

=head1 SEE ALSO

C<MWS> perl modules which are part of this package

=cut

#use MWS::SWISH;
use MWS::Plucene;
use Data::Dumper;
use Date::Parse;
use POSIX qw(strftime);
use Getopt::Long;

# are we called from this script?
my $recursive = 0;

# GetOptions returns false when it meets an option it does not know;
# treat that as a usage error instead of silently carrying on.
my $options_ok = GetOptions("recursive" => \$recursive);

# first remaining argument is the configuration file
my $config_file = shift(@ARGV) || 'global.conf';

if (! $options_ok || ! -f $config_file) {
    # this is an error exit, so the usage text goes to STDERR
    print STDERR qq{Usage: $0 [/path/to/local.conf]

If local.conf is not specified, global.conf in current directory will
be used.
};
    exit 1;
}

#my $mws = MWS::SWISH->new(config_file => $config_file);
# NOTE(review): MWS::Indexer is not loaded explicitly in this file;
# presumably "use MWS::Plucene" makes it available -- confirm.
my $mws = MWS::Indexer->new(config_file => $config_file);

# the index is created only by the top-level (non-recursive) invocation
$mws->create_index if (! $recursive);

print STDERR "starting indexing...\n";

my $debug = 1;

# walk every folder listed in the [folders] section of the configuration
foreach my $mbox ($mws->{config}->Parameters('folders')) {
    my $mbox_path = $mws->{config}->val('folders', $mbox);

    print STDERR "working on $mbox [$mbox_path]\n" if ($debug);

    my $folder = $mws->open_folder($mbox);

    # number of messages, used only for the progress percentage below
    my $total = scalar $folder->messageIds;

    print STDERR "$total messages\n" if ($debug);

    my $count = 0;

    foreach my $message ($folder->messages) {

        my $id = $message->messageId;

        # one flat hash of fields per message is handed to the indexer
        my $document = {
            id => $id,
            folder => $mbox,
        };

        # address headers become fields like to_phrase, from_address, ...
        # with multiple values joined by "##"
        foreach my $direction (qw(to from cc bcc)) {
            foreach my $part (qw(phrase address comment)) {
                my @data = $mws->unroll($message,$direction,$part);
                if (@data) {
                    $document->{$direction.'_'.$part} = join("##", @data);
                    # strip "(E-mail)" markers from the joined value
                    $document->{$direction.'_'.$part} =~ s/\s*\(e\s*-\s*mail\)\s*//gi;
                }
            }
        }

        $document->{'subject'} = $mws->decode_qp($message->subject) || 'no subject';

        $document->{'body'} = $mws->plain_text_body($message);

        my $utime = $message->timestamp;

        $document->{'date_utime'} = $utime;
        # formatted local-time date is stored only when a timestamp exists
        $document->{'date'} = strftime("%Y-%m-%d %H:%M:%S", localtime($utime)) if ($utime);

        # print Dumper($document);
        $mws->add_index("$mbox $id" => $document);

        # clear internal MWS cache to keep memory usage down
        # (this should be replaced by garbage collector in MWS,
        # but without it this is the best solution to keep machine
        # alive while indexing)
        $mws->{cache} = {};

        # this is not complete solution. see mailbox-destruct.diff
        $message->destruct();

        $count++;
        # progress line every 100 messages; the folder name is passed as
        # a %s argument so a "%" in the name cannot corrupt the format
        printf STDERR "%d messages in %s done [%d %%]\n",$count,$mbox,($count * 100/$total) if ($count % 100 == 0);

    }

    $mws->close_folder($mbox);

}

$mws->close_index;

Properties

Name Value
svn:executable *

  ViewVC Help
Powered by ViewVC 1.1.26