--- trunk/MWS.pm 2004/05/03 21:28:46 2 +++ trunk/MWS.pm 2004/05/07 20:52:34 19 @@ -6,6 +6,16 @@ use warnings; use Carp; +use Mail::Box::Manager; +use Config::IniFiles; +use POSIX qw(strftime); +use Text::Autoformat; +use Text::Iconv; +use Text::Unaccent; + +#use MWS_plucene; +use MWS_swish; + require Exporter; our @ISA = qw(Exporter); @@ -16,28 +26,118 @@ our $VERSION = '1.00'; - my $folder; # placeholder for folders -my $debug = 1; +my $debug = 2; sub new { my $class = shift; my $self = {}; bless($self, $class); - my $index_file = shift || die "need index file"; + my $config_file = shift || die "need index file"; + + $self->{config} = new Config::IniFiles( -file => $config_file ); - $self->{index} = Plucene::Simple->open($index_file) || die "can't open index '$index_file': $!"; + my $index_file = $self->{config}->val('global', 'index') || croak "can't find [index] section in config file with path of index"; - $self->{mgr} = Mail::Box::Manager->new; + $self->{mgr} = Mail::Box::Manager->new(access => 'r'); + $self->{index_file} = $index_file; # placeholder for opened folders $self->{folder} = {}; + $self->{wrap_margin} = $self->{config}->val('global', 'wrap_margin'); + $self->{max_results} = $self->{config}->val('global', 'max_results') || 100; + $self->reset_counters; + return $self; } +sub normalize_string { + my $self = shift; + + my $v = shift || return; + + $v = unac_string('ISO-8859-2', $v); + $v = join('',sort split(/\s+/,$v)); + $v =~ s/\W+//g; + + return $v; +} + +# reset tables for search results +sub reset_counters { + my $self = shift; + + $self->{counter} = {}; + +# foreach my $c (qw(thread from to cc bcc lists links att)) { +# $self->{counter}->{$c} = {}; +# } + +} + +sub add_counter($$) { + my $self = shift; + + my ($c,$v) = @_; + my $k = $self->normalize_string($v); + + $self->{counter}->{$c}->{$k}->{name} = $v; + return $self->{counter}->{$c}->{$k}->{usage}++; +} + +sub counter { + my $self = shift; + + my $c = shift || return; + + return if (! $self->{counter}->{$c}); + + return $self->{counter}->{$c}; +} + +sub mbox_name2path { + my $self = shift; + + my $mbox = shift || croak "folder_name2path needs mbox name"; + + return $self->{config}->val('folders', $mbox) || croak "comeone removed folder $mbox from config?"; +} + +sub open_folder { + my $self = shift; + + my $mbox = shift || croak "open_folder needs mbox name"; + + if (! $self->{folder}->{$mbox}) { + my $mbox_path = $self->mbox_name2path($mbox); + + print STDERR "about to open_folder($mbox)\n" if ($debug == 2); + + $self->{folder}->{$mbox} = $self->{mgr}->open($mbox_path) || croak "can't open folder $mbox at '$mbox_path': $!"; + + print STDERR "open_folder($mbox) ok\n" if ($debug); + } + + $self->{fetch_count} = 0; + + return $self->{folder}->{$mbox}; + +} + +sub close_folder { + my $self = shift; + + my $mbox = shift || croak "open_folder needs mbox name"; + + $self->{folder}->{$mbox}->close(write => 'NEVER') || croak "can't close folder $mbox"; + + # XXX this is rather agressive!!! + $self->{folder} = {}; + return +} sub fetch_message { my $self = shift; @@ -45,15 +145,21 @@ my $mbox_id = shift || die "need mbox_id!"; my ($mbox,$id) = split(/ /,$mbox_id); - if (! $self->{folder}->{$mbox}) { - $self->{folder}->{$mbox} = $self->{mgr}->open($mbox); - print STDERR "## open($mbox)\n" if ($debug); - } + # return message with ID + print STDERR "fetch $id from $mbox\n" if ($debug); - my $message = $self->{folder}->{$mbox}->find($id) || - print STDERR "can't find message $id in $mbox. Time to re-index?\n"; + if ($self->{fetch_count}++ > 100) { + $self->close_folder($mbox); + print STDERR "close_folder($mbox) forced on ",$self->{fetch_count},"iteration\n"; + } - return $message; + my $msg = $self->open_folder($mbox)->find($id); + if ($msg) { + return $msg; + } else { + print STDERR "can't find message $id in $mbox. Time to re-index?\n"; + return; + } } @@ -62,46 +168,157 @@ my $s = shift || carp "search called without argument!"; - my @index_ids = $self->{index}->search($s); + $self->reset_counters; + + print STDERR "search_index($s)\n" if ($debug == 2); + my @index_ids = $self->search_index($s); $self->{'index_ids'} = \@index_ids; - my $results = $#index_ids + 1; - $self->{'results'} = $results; + #my $results = $#index_ids + 1; + #$self->{'results'} = $results; + + my $results = $self->{'total_hits'} || ($#index_ids + 1); $self->{'curr_result'} = 0; + print STDERR "$results results\n" if ($debug == 2); + return $results || 'error'; } +sub decode_qp($) { + my $self = shift; + + my $tmp = shift || return; + + sub decode($$) { + my ($cp,$qp) = @_; + my $iconv = Text::Iconv->new($cp,'ISO-8859-2'); + print STDERR "decode($cp,$qp) -> " if ($debug == 2); + $qp =~ s/=([a-f0-9][a-f0-9])/chr(hex($1))/ieg; + $qp =~ s/_/ /g; + print STDERR "$qp\n" if ($debug == 2); + return $iconv->convert($qp) || $qp; + } + + $tmp =~ s/=\?([^\?]+)\?Q\?(.+?)\?=/decode($1,$2)/ex; + $tmp =~ s/^\s*["']+(.*?)["']+\s*$/$1/g; + return $tmp; +} + sub unroll($$$) { + my $self = shift; + my ($message,$part,$sub) = @_; my @arr; + return if (! $message->$part); + foreach my $from ($message->$part) { - push @arr, $from->$sub; + my $tmp = $from->$sub || next; + + $tmp = $self->decode_qp($tmp); + push @arr, $tmp; } - return \@arr; + + return @arr; } - + +sub fetch_all_results { + my $self = shift; + + croak "results called before search!" if (! $self->{'index_ids'}); + + print STDERR "fetch_all_results [",scalar @{$self->{'index_ids'}},"]\n" if ($debug == 2); + + my @arr; + + foreach my $id (@{$self->{'index_ids'}}) { + push @arr, $self->fetch_result_by_id($id); + } + + + return @arr; +} + sub fetch_result { my $self = shift; + my $args = {@_}; + croak "results called before search!" if (! $self->{'index_ids'}); my $curr = $self->{'curr_result'}++; my $id = $self->{'index_ids'}->[$curr]; + + print STDERR "fetch_result: $curr = $id\n" if ($debug == 2); + + return $self->fetch_result_by_id($id); +} - return if (! $id); +sub plain_text_body { + my $self = shift; + my $message = shift || croak "plain_text_body needs message!"; - my $message = $self->fetch_message($id); + my $body; - my $row; + if (! $message->isMultipart) { + $body = $message->decoded->string; + } else { + foreach my $part ($message->parts) { + if ($part->body->mimeType eq 'text/plain') { + $body = $part->decoded->string; + last; + } + } + } + + # reformat with Text::Autoformat + my $wrap = $self->{wrap_margin}; + if ($wrap && $body && $body =~ m/^.{$wrap}..*$/m) { + $body =~ s/[\r\n]/\n/gs; + $body = autoformat($body, {right=>$wrap}); + $body .="\n[reformated using autoformat, margin at $wrap]" if ($debug == 2); + } - $row->{'from'} = unroll($message,'from','phrase'); - $row->{'subject'} = $message->get('Subject'); + return $body; +} + + +sub fetch_result_by_id { + my $self = shift; + + my $id = shift || return; + + my $row = $self->{cache}->{$id}; + + if (! $row) { + + print STDERR "fetch_result_by_id($id) not in cache, hitting disk\n" if ($debug == 2); + + my $message = $self->fetch_message($id) || return; + + $row->{'id'} = $id; + + foreach my $p (qw(from to cc bcc)) { + foreach my $v ($self->unroll($message,'from','phrase')) { + push @{$row->{$p}},$v; + $self->add_counter($p,$v); + } + } + $row->{'subject'} = $self->decode_qp($message->subject); + $row->{'body'} = $self->plain_text_body($message); + $row->{'date'} = $message->date; + + # XXX store in cache? + $self->{cache}->{$id} = $row; + print STDERR "$id stored in cache\n" if ($debug == 2); + } else { + print STDERR "fetch_result_by_id($id) in cache\n" if ($debug == 2); + } return $row;