/[Grep]/bin/reindex.pl
This is a repository of my old source code, which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Diff of /bin/reindex.pl

Parent Directory | Revision Log | View Patch

revision 127 by dpavlin, Sun Apr 29 00:16:05 2007 UTC | revision 144 by dpavlin, Tue May 8 14:11:38 2007 UTC
# Line 11  use Grep::Search; Line 11  use Grep::Search;
11  use Data::Dump qw/dump/;  use Data::Dump qw/dump/;
12  use Text::DeDuper;  use Text::DeDuper;
13  use Encode;  use Encode;
14    use Getopt::Long;
15    
16  my $remove_duplicate = 1;  my $keep_duplicates = 0;
17    
18    GetOptions(
19            'duplicates!' => $keep_duplicates,
20    );
21    
22  $|=1;  $|=1;
23    
24  BEGIN { Jifty->new; };  BEGIN { Jifty->new; };
25    
26  my $coll = Grep::Model::ItemCollection->new( results_are_readable => 1 );  my $system_user = Grep::CurrentUser->superuser;
27    my $coll = Grep::Model::ItemCollection->new( results_are_readable => 1, current_user => $system_user );
28  $coll->unlimit;  $coll->unlimit;
29    
30  print "indexing ", $coll->count, " items ";  print "indexing ", $coll->count, " items ", $keep_duplicates ? "" : "removing duplicates ";
31    
32  my $search = Grep::Search->new();  my $search = Grep::Search->new({ create => 1 });
33  my $deduper = Text::DeDuper->new();  my $deduper = Text::DeDuper->new();
34    
35  my ( $total, $duplicates ) = ( 0, 0 );  my ( $total, $duplicates ) = ( 0, 0 );
36    
37  while ( my $i = $coll->next ) {  while ( my $i = $coll->next ) {
38    
39          my $c = encode('utf-8', $i->content);          print $i->id;
40    
41            if ( ! $keep_duplicates ) {
42    
43                    my $c = encode('utf-8', $i->content);
44    
45                    my @s = sort $deduper->find_similar( $c );
46                    if ( @s ) {
47                            #warn " similar = ",dump( @s );
48    
49                            foreach my $id ( @s ) {
50                                    next if $id == $i->id;  # keep current
51                                    my $si = Grep::Model::Item->new();
52                                    $si->load( $id ) or die "can't find similar item $id";
53                                    print " -$id-";
54                                    $si->delete;
55                                    $duplicates++;
56                            }
57                    }
58    
         if ( $remove_duplicate && $deduper->find_similar( $c ) ) {  
                 $i->delete;  
                 print "-",$i->id,"- ";  
                 $duplicates++;  
         } else {  
                 $search->add( $i, $i->in_feed->owner->id );  
                 print $i->id;  
59                  $deduper->add_doc( $i->id, $c );                  $deduper->add_doc( $i->id, $c );
                 print ' ';  
60          }          }
61    
62            $search->add( $i, $i->in_feed->owner->id );
63            print ' ';
64          $total++;          $total++;
65  }  }
66    
67  print "$total records indexed", $remove_duplicate ? " ($duplicates duplicates)" : "", "\n";  print "$total records indexed", $duplicates ? "($duplicates duplicates)" : "", "\n";
68    
69  $search->finish;  $search->finish;

Legend:
Removed from v.127  
changed lines
  Added in v.144

  ViewVC Help
Powered by ViewVC 1.1.26