/[refeed]/trunk/deduper/reblog-dupe.pl
This is a repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!

Contents of /trunk/deduper/reblog-dupe.pl



Revision 22
Fri Apr 6 08:06:35 2007 UTC (17 years, 1 month ago) by dpavlin
File MIME type: text/plain
File size: 4050 byte(s)
re-scan just last 7 days, not all items
#!/usr/bin/perl -w

# reblog-dupe.pl - remove unread duplicate posts which have already been read
#
# currently works without any care about users, so its use is limited
# to single-user installations
#
# 07/08/2006 06:26:47 PM CEST Dobrica Pavlinusic <dpavlin@rot13.org>

use strict;
use DBI;
use Text::DeDuper;
use POSIX qw/strftime/;
use Data::Dump qw/dump/;

$|++;

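# when set to 1, duplicate items which are already marked as read are
# physically deleted from the items table (see the optional block below);
# by default the script only marks unread duplicates as read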
my $delete_read_duplicates = 0;
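# re-scan just the last 7 days, not all items (see the revision 22 log message)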
my $limit_items_timestamp = strftime('%Y-%m-%d', localtime( time() - 7 * 24 * 60 * 60 ) );

my $connect = "DBI:mysql:database=reblog";
my $dbh = DBI->connect($connect,"","") || die $DBI::errstr;

my $user_id = 1;

# select all posts which are either read or unread, depending on the
# value bound to the placeholder (value_numeric: 1 = read, 0 = unread)
my $sql = qq{
	select
		id, content, feed_id
	from items
	join items_userdata on id = item_id
	where label = 'read'
	and value_numeric = ?
	and user_id = $user_id
	and items.timestamp > '$limit_items_timestamp'
	order by id
};

my $sth = $dbh->prepare($sql) || die $dbh->errstr();
$sth->execute( 1 ) || die $sth->errstr();

print "found ", $sth->rows, " items back to $limit_items_timestamp to process...";

my $deduper = Text::DeDuper->new();

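# strip HTML tags and collapse whitespace so near-identical posts compare
# equal; returns nothing for empty or whitespace-only content so callers
# can skip such items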
sub strip {
	my $t = shift || return;
	$t =~ s/<[^>]*>//gs;
	$t =~ s/\s+/ /gs;
	return $t if ($t ne ' ');
}

my $i = 0;

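# first pass: index all items which are already read so that unread items
# can be compared against them later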
while (my $row = $sth->fetchrow_hashref() ) {

	my $t = strip( $row->{content} ) || next;

	$deduper->add_doc( $row->{id}, $t );

	$i++;
	print "$i " if ($i % 100 == 0);

}

print "\n";

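# optional pass: among items which are already read, keep only the copy with
# the lowest id, delete the other rows and clean up orphaned items_userdata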
if ( $delete_read_duplicates ) {

	print "find duplicates...";

	$sth->execute( 1 ) || die $sth->errstr();
	while (my $row = $sth->fetchrow_hashref() ) {

		my $id = $row->{id} || die "no id in row";
		my $t = strip( $row->{content} ) || next;

		my @s = sort { $a <=> $b } $deduper->find_similar($t);
		next if (! @s || $#s == 0);

		my $first = shift @s;

		next if ($first != $id);

		print " $id [", join(",", @s), "]";

		$dbh->do( "delete from items where id in (" . join(",", @s) . ")" );

	}
	print "\n";

	print "about to delete associated items_userdata\n";
	$dbh->do( "delete from items_userdata where item_id not in (select id from items)" );

}


my @duplicates;
my $feeds;

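# separate index holding only unread items, used by the third pass to find
# duplicates among the unread items themselves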
my $deduper_unread = Text::DeDuper->new();

# second pass: take unread posts and find duplicates of already read items
$sth->execute( 0 ) || die $sth->errstr();

print "comparing with ", $sth->rows, " unread items...\n";

while (my $row = $sth->fetchrow_hashref() ) {

	my $id = $row->{id} || die "no id in row";

	my $t = strip( $row->{content} ) || next;

	$deduper_unread->add_doc( $row->{id}, $t );

	my @s = $deduper->find_similar($t);
	next if (! @s);

	print $id, " has ", $#s + 1, " copies: ", join(",", @s), "\n";
	push @duplicates, $id;

	$feeds->{ $row->{feed_id} }++;
}

# third pass: compare unread items against each other; for every group of
# similar unread items, keep the one with the lowest id and mark the rest

$sth->execute( 0 ) || die $sth->errstr();
print "finding duplicates in ", $sth->rows, " unread items...\n";

while (my $row = $sth->fetchrow_hashref() ) {

	my $id = $row->{id} || die "no id in row";

	my $t = strip( $row->{content} ) || next;

	my @s;
	foreach my $d_id ( $deduper_unread->find_similar($t) ) {
		push @s, $d_id if ($d_id > $id);
	}
	next if (! @s);

	print $id, " has ", $#s + 1, " duplicates: ", join(",", @s), "\n";
	push @duplicates, ( @s );

	$feeds->{ $row->{feed_id} }++;
}

# mark duplicates as read

my $ids = join(",", @duplicates);
if (! $ids) {
	print "no duplicates found\n";
	exit;
}

print "found ", $#duplicates + 1, " duplicate items: $ids\n";

$sql = qq{
	update items_userdata
	set value_numeric = 1
	where label = 'read' and item_id in ($ids) and user_id = $user_id
};

$dbh->do( $sql );

# update usage_unread on modified feeds

$sql = qq{
	update feeds_userdata
	set value_numeric = value_numeric - ?
	where label = 'usage_unread' and feed_id = ? and user_id = $user_id
};

$sth = $dbh->prepare($sql) || die $dbh->errstr();
foreach my $feed_id (keys %$feeds) {
	my $nr_read = $feeds->{$feed_id} || die "no messages marked as read";
	$sth->execute( $nr_read, $feed_id ) || die $sth->errstr();
	print "removed $nr_read messages from feed $feed_id\n";
}


Properties

Name             Value
svn:executable   *
