--- bin/reindex.pl	2007/02/21 03:04:48	47
+++ bin/reindex.pl	2008/05/23 22:31:37	193
@@ -7,24 +7,64 @@
 use lib 'lib';
 
 use Jifty;
-use Lucene;
 use Grep::Search;
 use Data::Dump qw/dump/;
+use Text::DeDuper;
+use Encode;
+use Getopt::Long;
+
+my $keep_duplicates = 1;	# default keeps duplicates; --no-duplicates enables the removal branch below
+
+GetOptions(
+	'duplicates!' => \$keep_duplicates,	# destination must be a reference, otherwise the flag is never stored in this variable
+);
+
+$|=1;	# autoflush the selected output handle so per-item progress appears as it is printed
 
 BEGIN { Jifty->new; };
 
-my $coll = Grep::Model::ItemCollection->new( results_are_readable => 1 );
+my $system_user = Grep::CurrentUser->superuser;
+my $coll = Grep::Model::ItemCollection->new( results_are_readable => 1, current_user => $system_user );
 $coll->unlimit;
 
-print "indexing ", $coll->count, " items ";
+Jifty->log->info( "indexing ", $coll->count, " items ", $keep_duplicates ? "" : "removing duplicates " );
+
+my $search = Grep::Search->new({ create => 1 });
+my $deduper = Text::DeDuper->new();
+
+my ( $total, $duplicates ) = ( 0, 0 );	# items indexed / similar items deleted
 
 while ( my $i = $coll->next ) {
-	Grep::Search->add( $i );
+	print $i->id;
+
+	if ( ! $keep_duplicates ) {
+
+		my $c = encode('utf-8', $i->content);	# UTF-8 encoded content is what gets compared and registered
 
-	print $i->id, ' ';
+		my @s = sort $deduper->find_similar( $c );
+		if ( @s ) {
+			#warn " similar = ",dump( @s );
+
+			foreach my $id ( @s ) {
+				next if $id == $i->id;	# keep current
+				my $si = Grep::Model::Item->new();
+				$si->load( $id ) or die "can't find similar item $id";
+				print " -$id-";
+				$si->delete;
+				$duplicates++;
+				$search->invindexer->delete_by_term( 'id', $id );	# also drop the deleted item's entry (matched on its 'id' term)
+			}
+		}
+
+		$deduper->add_doc( $i->id, $c );	# registered only after the lookup above
+	}
+
+	$search->add( $i, $i->in_feed->owner->id );
+	print ' ';
+	$total++;
 }
 
-print "\n";
+Jifty->log->info( "$total records indexed", $duplicates ? " ($duplicates duplicates)" : "" );
 
-Grep::Search->finish;
+$search->finish;
 