--- bin/reindex.pl	2007/04/29 00:48:04	128
+++ bin/reindex.pl	2007/04/29 11:37:28	129
@@ -12,7 +12,7 @@
 use Text::DeDuper;
 use Encode;
 
-my $remove_duplicate = 1;
+my $remove_duplicates = 1;
 
 $|=1;
 
@@ -31,21 +31,34 @@
 
 while ( my $i = $coll->next ) {
 
-	my $c = encode('utf-8', $i->content);
+	print $i->id;
+
+	if ( $remove_duplicates ) {
+
+		my $c = encode('utf-8', $i->content);
+
+		my @s = sort $deduper->find_similar( $c );
+		if ( @s ) {
+			#warn " similar = ",dump( @s );
+
+			foreach my $id ( @s ) {
+				next if $id == $i->id;	# keep current
+				my $si = Grep::Model::Item->new();
+				$si->load( $id ) or die "can't find similar item $id";
+				print " -$id-";
+				$si->delete;
+				$duplicates++;
+			}
+		}
 
-	if ( $remove_duplicate && $deduper->find_similar( $c ) ) {
-		$i->delete;
-		print "-",$i->id,"- ";
-		$duplicates++;
-	} else {
-		$search->add( $i, $i->in_feed->owner->id );
-		print $i->id;
 		$deduper->add_doc( $i->id, $c );
-		print ' ';
 	}
+
+	$search->add( $i, $i->in_feed->owner->id );
+	print ' ';
 	$total++;
 }
 
-print "$total records indexed", $remove_duplicate ? " ($duplicates duplicates)" : "", "\n";
+print "$total records indexed", $remove_duplicates ? " ($duplicates duplicates)" : "", "\n";
 
 $search->finish;