trunk/deduper/reblog-dupe.pl

#!/usr/bin/perl -w

# reblog-dupe.pl - mark unread posts that duplicate already-read posts as read
#
# currently works without any care about users, so use is limited
# to single-user installations
#
# 07/08/2006 06:26:47 PM CEST Dobrica Pavlinusic <dpavlin@rot13.org>

use strict;
use DBI;
use Text::DeDuper;
use Data::Dump qw/dump/;

# Turn on autoflush for STDOUT so the progress dots show up as they are
# printed instead of when the buffer fills.
$|++;

# Connect to the "reblog" MySQL database with an empty user and password.
my $connect = "DBI:mysql:database=reblog";
my $dbh = DBI->connect($connect,"","") || die $DBI::errstr;

# Select id and content of every item whose 'read' flag equals the bound
# value. The statement is executed with 1 (read) right below and executed
# again with 0 (unread) for the comparison pass later in the script.
my $sql = qq{
        select
                id, content
        from items
        join items_userdata on id=item_id
        where label = 'read' and value_numeric = ?
};

my $sth = $dbh->prepare($sql) || die $dbh->errstr();
$sth->execute( 1 ) || die $sth->errstr();

# NOTE(review): DBI only promises a meaningful rows() for SELECT once all
# rows have been fetched; this count relies on the driver buffering the
# result set up front -- confirm for the DBD in use.
print "found ",$sth->rows," items to process...";

# Collection of already-read documents that unread items are compared
# against. Direct method call instead of the indirect "new Class" form,
# which perl can mis-parse.
my $deduper = Text::DeDuper->new();

# strip( $content ) - reduce marked-up content to plain text for comparison
#
# Removes everything that looks like a tag (<...>) and collapses every run
# of whitespace into a single space.
#
# Returns the cleaned text. Returns nothing (bare return: undef in scalar
# context, empty list in list context) when the argument is false, or when
# no text other than a single space is left after cleaning.
sub strip {
        my $t = shift || return;
        $t =~ s/<[^>]*>//gs;
        $t =~ s/\s+/ /gs;
        # Say "no text" explicitly rather than falling off the end of the
        # sub, which would hand back the result of the last comparison.
        return if ($t eq '' || $t eq ' ');
        return $t;
}

# First pass: feed every read item into the de-duper, keyed by its item id.
# Items for which strip() yields no text are skipped. One dot is printed
# per indexed item as a progress indicator.
while (my $row = $sth->fetchrow_hashref() ) {

        my $t = strip( $row->{content} ) || next;

        $deduper->add_doc( $row->{id}, $t );

        print ".";

}

# Finish the progress line on STDOUT, the same stream the "found ..."
# message and the dots were written to, so redirected output stays intact.
print "\n";

# Second pass: re-run the same statement for unread items (flag value 0)
# and look each one up among the read items indexed in the first pass.
$sth->execute( 0 ) || die $sth->errstr();

print "comparing with ", $sth->rows," unread items...\n";

# ids of unread items for which at least one similar read item was found
my @duplicates;

while (my $row = $sth->fetchrow_hashref() ) {

        my $id = $row->{id} || die "no id in row";

        # nothing to compare when no text is left after stripping markup
        my $t = strip( $row->{content} ) || next;

        # presumably the ids given to add_doc() for the matching documents;
        # verify against the Text::DeDuper documentation
        my @s = $deduper->find_similar($t);
        next if (! @s);

        print $id, " has ", scalar(@s), " copies: ", join(",", @s), "\n";
        push @duplicates, $id;

}

# Summarise the result; the id list is empty when nothing was found.
my $ids = join(",", @duplicates);
print "found ", scalar(@duplicates), " duplicate items: $ids\n";

# Only build the update when there is something to update: with an empty
# list the statement would read "item_id in ()", which is not valid SQL.
if (@duplicates) {

        # The ids are values this script read from the id column above, so
        # they are interpolated directly. The closing brace stays in the
        # first column so the printed statement text is unchanged.
        $sql = qq{
        update items_userdata
        set value_numeric = 1
        where label = 'read' and item_id in ($ids)
};

        # Dry run: the statement is only printed. Uncomment the next line
        # to actually mark the duplicates as read.
        #$dbh->do( $sql );
        print "de-dupe sql:\n$sql\n";

}
1	#!/usr/bin/perl -w
2
3	# reblog-dupe.pl - mark unread posts that duplicate already-read posts as read
4	#
5	# currently works without any care about users, so use is limited
6	# to single-user installations
7	#
8	# 07/08/2006 06:26:47 PM CEST Dobrica Pavlinusic <dpavlin@rot13.org>
9
10	use strict;
11	use DBI;
12	use Text::DeDuper;
13	use Data::Dump qw/dump/;
14
15	$|++;
16
17	my $connect = "DBI:mysql:database=reblog";
18	my $dbh = DBI->connect($connect,"","") || die $DBI::errstr;
19
20	# select all posts which have been read or unread
21	my $sql = qq{
22	select
23	id, content
24	from items
25	join items_userdata on id=item_id
26	where label = 'read' and value_numeric = ?
27	};
28
29	my $sth = $dbh->prepare($sql) || die $dbh->errstr();
30	$sth->execute( 1 ) || die $sth->errstr();
31
32	print "found ",$sth->rows," items to process...";
33
34	my $deduper = new Text::DeDuper();
35
36	sub strip {
37	my $t = shift || return;
38	$t =~ s/<[^>]*>//gs;
39	$t =~ s/\s+/ /gs;
40	return $t if ($t ne ' ');
41	}
42
43	while (my $row = $sth->fetchrow_hashref() ) {
44
45	my $t = strip( $row->{content} ) || next;
46
47	$deduper->add_doc( $row->{id}, $t );
48
49	print ".";
50
51	}
52
53	print STDERR "\n";
54
55	# now, take unread posts to find duplicates
56	$sth->execute( 0 ) || die $sth->errstr();
57
58	print "comparing with ", $sth->rows," unread items...\n";
59
60	my @duplicates;
61
62	while (my $row = $sth->fetchrow_hashref() ) {
63
64	my $id = $row->{id} || die "no id in now";
65
66	my $t = strip( $row->{content} ) || next;
67
68	my @s = $deduper->find_similar($t);
69	next if (! @s);
70
71	print $id, " has ", $#s + 1, " copies: ", join(",", @s), "\n";
72	push @duplicates, $id;
73
74	}
75
76	my $ids = join(",", @duplicates);
77	print "found ", $#duplicates + 1, " duplicate items: $ids\n";
78
79	$sql = qq{
80	update items_userdata
81	set value_numeric = 1
82	where label = 'read' and item_id in ($ids)
83	};
84
85	#$dbh->do( $sql );
86	print "de-dupe sql:\n$sql\n" if ($ids);