bin/reindex.pl: rebuild the search index, optionally removing similar items.

What this patch does (128 -> 193, per the headers below):

  * Adds a negatable command-line switch via Getopt::Long:
      --duplicates      keep every item (default, $keep_duplicates = 1)
      --no-duplicates   delete items that Text::DeDuper reports as similar
    The flag is bound by reference (\$keep_duplicates) so GetOptions can
    write to it, and a failed option parse aborts the run before any
    record is touched.
  * Start and end summaries go through Jifty->log->info instead of print.
  * Grep::Search is constructed with { create => 1 }.
  * With --no-duplicates, for each item every id returned by find_similar
    other than the current one is loaded, deleted from the model and
    removed from the index with delete_by_term; the current item is then
    registered with the deduper.
  * Every item yielded by the collection is added to the index.

NOTE(review): line breaks, indentation and blank context lines were
reconstructed to match the hunk line counts; confirm them against the
target file before applying.
NOTE(review): the loop does not skip an item that was deleted as a
duplicate if $coll->next yields it later - confirm the collection's
behaviour.

--- bin/reindex.pl 2007/04/29 00:48:04 128
+++ bin/reindex.pl 2008/05/23 22:31:37 193
@@ -11,8 +11,13 @@
 use Data::Dump qw/dump/;
 use Text::DeDuper;
 use Encode;
+use Getopt::Long;
 
-my $remove_duplicate = 1;
+my $keep_duplicates = 1;
+
+GetOptions(
+	'duplicates!' => \$keep_duplicates,
+) or die "usage: $0 [--[no-]duplicates]\n";
 
 $|=1;
 
@@ -22,30 +27,44 @@
 my $coll = Grep::Model::ItemCollection->new( results_are_readable => 1, current_user => $system_user );
 $coll->unlimit;
 
-print "indexing ", $coll->count, " items ";
+Jifty->log->info( "indexing ", $coll->count, " items ", $keep_duplicates ? "" : "removing duplicates " );
 
-my $search = Grep::Search->new();
+my $search = Grep::Search->new({ create => 1 });
 my $deduper = Text::DeDuper->new();
 
 my ( $total, $duplicates ) = ( 0, 0 );
 
 while ( my $i = $coll->next ) {
 
-	my $c = encode('utf-8', $i->content);
+	print $i->id;
+
+	if ( ! $keep_duplicates ) {
+
+		my $c = encode('utf-8', $i->content);
+
+		my @s = sort $deduper->find_similar( $c );
+		if ( @s ) {
+			#warn " similar = ",dump( @s );
+
+			foreach my $id ( @s ) {
+				next if $id == $i->id; # keep current
+				my $si = Grep::Model::Item->new();
+				$si->load( $id ) or die "can't find similar item $id";
+				print " -$id-";
+				$si->delete;
+				$duplicates++;
+				$search->invindexer->delete_by_term( 'id', $id );
+			}
+		}
 
-	if ( $remove_duplicate && $deduper->find_similar( $c ) ) {
-		$i->delete;
-		print "-",$i->id,"- ";
-		$duplicates++;
-	} else {
-		$search->add( $i, $i->in_feed->owner->id );
-		print $i->id;
 		$deduper->add_doc( $i->id, $c );
-		print ' ';
 	}
+
+	$search->add( $i, $i->in_feed->owner->id );
+	print ' ';
 	$total++;
 }
 
-print "$total records indexed", $remove_duplicate ? " ($duplicates duplicates)" : "", "\n";
+Jifty->log->info( "$total records indexed", $duplicates ? " ($duplicates duplicates)" : "" );
 
 $search->finish;