/[Grep]/bin/reindex.pl

This is a repository of my old source code, which isn't updated any more. Go to git.rot13.org for current projects!

Diff of /bin/reindex.pl

Parent Directory | Revision Log | View Patch

-revision 47 by dpavlin,
Wed Feb 21 03:04:48 2007 UTC
+revision 129 by dpavlin,
Sun Apr 29 11:37:28 2007 UTC
 Line 7 
 use strict;
  use lib 'lib';
  use Jifty;
- use Lucene;
  use Grep::Search;
  use Data::Dump qw/dump/;
+ use Text::DeDuper;
+ use Encode;
+ my $remove_duplicates = 1;
+ $|=1;
  BEGIN { Jifty->new; };
- my $coll = Grep::Model::ItemCollection->new( results_are_readable => 1 );
+ my $system_user = Grep::CurrentUser->superuser;
+ my $coll = Grep::Model::ItemCollection->new( results_are_readable => 1, current_user => $system_user );
  $coll->unlimit;
  print "indexing ", $coll->count, " items ";
+ my $search = Grep::Search->new();
+ my $deduper = Text::DeDuper->new();
+ my ( $total, $duplicates ) = ( 0, 0 );
  while ( my $i = $coll->next ) {
-         Grep::Search->add( $i );
+         print $i->id;
+         if ( $remove_duplicates ) {
+                 my $c = encode('utf-8', $i->content);
-         print $i->id, ' ';
+                 my @s = sort $deduper->find_similar( $c );
+                 if ( @s ) {
+                         #warn " similar = ",dump( @s );
+                         foreach my $id ( @s ) {
+                                 next if $id == $i->id;  # keep current
+                                 my $si = Grep::Model::Item->new();
+                                 $si->load( $id ) or die "can't find similar item $id";
+                                 print " -$id-";
+                                 $si->delete;
+                                 $duplicates++;
+                         }
+                 }
+                 $deduper->add_doc( $i->id, $c );
+         }
+         $search->add( $i, $i->in_feed->owner->id );
+         print ' ';
+         $total++;
  }
- print "\n";
+ print "$total records indexed", $remove_duplicates ? " ($duplicates duplicates)" : "", "\n";
- Grep::Search->finish;
+ $search->finish;

 Legend:



Removed from v.47
 


changed lines


 
Added in v.129
 Legend:



Removed from v.47
 


changed lines


 
Added in v.129
-Removed from v.47
+Added in v.129

	ViewVC Help
Powered by ViewVC 1.1.26