/[refeed]/trunk/deduper/reblog-dupe.pl
This is a repository of my old source code, which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Contents of /trunk/deduper/reblog-dupe.pl

Parent Directory | Revision Log


Revision 21 - (show annotations)
Thu Feb 15 13:57:42 2007 UTC (17 years, 3 months ago) by dpavlin
File MIME type: text/plain
File size: 3855 byte(s)
ability to delete items which are read and duplicate
#!/usr/bin/perl -w

# reblog-dupe.pl - mark unread duplicate posts as read
#
# An unread post is treated as a duplicate when its text is similar to
# a post which has already been read, or to another unread post with a
# lower id.
#
# currently works without any care about users, so use is limited
# to single-user installations
#
# 07/08/2006 06:26:47 PM CEST Dobrica Pavlinusic <dpavlin@rot13.org>

use strict;
use warnings;
use DBI;
use Text::DeDuper;
use Data::Dump qw/dump/;

# unbuffered output, so progress counters show up while the script runs
$|++;

# set to true to also delete duplicates among items which are already read
my $delete_read_duplicates = 0;

my $connect = "DBI:mysql:database=reblog";
my $dbh = DBI->connect($connect,"","") || die $DBI::errstr;

# every query below is restricted to this user
my $user_id = 1;

# select posts by their 'read' flag; the single placeholder is bound
# at execute time to 1 (read) or 0 (unread)
my $sql = qq{
    select
        id, content, feed_id
    from items
    join items_userdata on id=item_id
    where label = 'read'
        and value_numeric = ?
        and user_id = $user_id
    order by id
};

my $sth = $dbh->prepare($sql) || die $dbh->errstr();

# start with the items which have been read
$sth->execute( 1 ) || die $sth->errstr();

# NOTE: DBI only promises a meaningful rows() for non-SELECT statements,
# so this count depends on the driver buffering the whole result
print "found ",$sth->rows," items to process...";

# similarity index of the read items
my $deduper = Text::DeDuper->new();

# strip( $html )
#
# Reduce a piece of markup to plain text for comparison: everything
# between < and > is removed and every run of whitespace is collapsed
# into a single space.
#
# Returns the resulting text, or nothing (undef in scalar context, an
# empty list in list context) when the argument is false or when no
# text other than whitespace is left.
sub strip {
    my $t = shift || return;
    $t =~ s/<[^>]*>//gs;
    $t =~ s/\s+/ /gs;
    # explicit "nothing left" result instead of relying on the value of
    # the last evaluated expression when falling off the end of the sub
    return if ($t eq '' || $t eq ' ');
    return $t;
}

my $i = 0;

# index the text of every read item, so other items can be compared
# against them
while (my $row = $sth->fetchrow_hashref() ) {

    my $t = strip( $row->{content} ) || next;

    $deduper->add_doc( $row->{id}, $t );

    $i++;
    print "$i " if ($i % 100 == 0);

}

print "\n";

if ( $delete_read_duplicates ) {

    print "find duplicates...";

    # second pass over the read items
    $sth->execute( 1 ) || die $sth->errstr();
    while (my $row = $sth->fetchrow_hashref() ) {

        my $id = $row->{id} || die "no id in row";
        my $t = strip( $row->{content} ) || next;

        my @s = sort { $a <=> $b } $deduper->find_similar($t);
        # nothing to delete unless at least two items are similar
        # (presumably the item itself is among the results - verify
        # against Text::DeDuper)
        next if (@s < 2);

        # the lowest id of the group is kept...
        my $first = shift @s;

        # ...and the group is only handled when we are at that item
        next if ($first != $id);

        print " $id [", join(",", @s), "]";

        # ids come from the deduper index, which was filled from items.id
        $dbh->do( "delete from items where id in (" . join(",", @s) . ")" )
            || die $dbh->errstr();

    }
    print "\n";

    print "about to delete associated items_userdata\n";
    $dbh->do( "delete from items_userdata where item_id not in (select id from items)" )
        || die $dbh->errstr();

}

my %is_duplicate;   # unread item id => 1 when it has to be marked as read
my %feed_of;        # unread item id => feed_id, for the unread counters

my $deduper_unread = Text::DeDuper->new();

# now, take unread posts to find duplicates of the read ones
$sth->execute( 0 ) || die $sth->errstr();

print "comparing with ", $sth->rows," unread items...\n";

while (my $row = $sth->fetchrow_hashref() ) {

    my $id = $row->{id} || die "no id in row";

    my $t = strip( $row->{content} ) || next;

    # every unread item goes into the unread index, duplicate or not
    $deduper_unread->add_doc( $id, $t );
    $feed_of{$id} = $row->{feed_id};

    my @s = $deduper->find_similar($t);
    next if (! @s);

    print $id, " has ", scalar(@s), " copies: ", join(",", @s), "\n";
    $is_duplicate{$id} = 1;
}

# and again, but compare just unread items

$sth->execute( 0 ) || die $sth->errstr();
print "finding dulicates in ", $sth->rows," unread items...\n";

while (my $row = $sth->fetchrow_hashref() ) {

    my $id = $row->{id} || die "no id in row";

    my $t = strip( $row->{content} ) || next;

    # only later items count as duplicates, so the first one stays unread
    my @s = grep { $_ > $id } $deduper_unread->find_similar($t);
    next if (! @s);

    print $id, " has ", scalar(@s), " duplicates: ", join(",", @s), "\n";
    $is_duplicate{$_} = 1 foreach @s;
}

# mark duplicates as read

# an item can be found by both passes (or several times by the second
# one), so work on the unique ids
my @duplicates = sort { $a <=> $b } keys %is_duplicate;
if (! @duplicates) {
    print "no duplicates found\n";
    exit;
}

my $ids = join(",", @duplicates);

print "found ", scalar(@duplicates), " duplicate items: $ids\n";

$sql = qq{
    update items_userdata
    set value_numeric = 1
    where label = 'read' and item_id in ($ids) and user_id = $user_id
};

$dbh->do( $sql ) || die $dbh->errstr();

# update usage_unread on modified feeds

# charge every item which was marked as read exactly once, and to the
# feed it belongs to (not to the feed of the item it duplicates)
my $feeds;
$feeds->{ $feed_of{$_} }++ foreach @duplicates;

$sql = qq{
    update feeds_userdata
    set value_numeric = value_numeric - ?
    where label = 'usage_unread' and feed_id = ? and user_id = $user_id
};

$sth = $dbh->prepare($sql) || die $dbh->errstr();
foreach my $feed_id (keys %$feeds) {
    my $nr_read = $feeds->{$feed_id} || die "no messages marked as read";
    $sth->execute( $nr_read, $feed_id ) || die $sth->errstr();
    print "removed $nr_read messages from feed $feed_id\n";
}


Properties

Name Value
svn:executable *

  ViewVC Help
Powered by ViewVC 1.1.26