/[refeed]/trunk/deduper/reblog-dupe.pl
This is a repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!

Contents of /trunk/deduper/reblog-dupe.pl



Revision 22
Fri Apr 6 08:06:35 2007 UTC (17 years, 1 month ago) by dpavlin
File MIME type: text/plain
File size: 4050 byte(s)
re-scan just last 7 days, not all items
#!/usr/bin/perl -w

# reblog-dupe.pl - remove unread duplicate posts which have already been read
#
# currently works without any care about users, so its use is limited
# to single-user installations
#
# 07/08/2006 06:26:47 PM CEST Dobrica Pavlinusic <dpavlin@rot13.org>

use strict;
use DBI;
use Text::DeDuper;
use POSIX qw/strftime/;
use Data::Dump qw/dump/;

$|++;

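# when set to 1, duplicate items which are already marked as read are
# physically deleted from the items table (see the optional block below);
# by default the script only marks unread duplicates as read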
my $delete_read_duplicates = 0;
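# re-scan just the last 7 days, not all items (see the revision 22 log message)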
my $limit_items_timestamp = strftime('%Y-%m-%d', localtime( time() - 7 * 24 * 60 * 60 ) );

my $connect = "DBI:mysql:database=reblog";
my $dbh = DBI->connect($connect,"","") || die $DBI::errstr;

my $user_id = 1;

# select all posts which are either read or unread, depending on the
# value bound to the placeholder (value_numeric: 1 = read, 0 = unread)
my $sql = qq{
	select
		id, content, feed_id
	from items
	join items_userdata on id = item_id
	where label = 'read'
	and value_numeric = ?
	and user_id = $user_id
	and items.timestamp > '$limit_items_timestamp'
	order by id
};

my $sth = $dbh->prepare($sql) || die $dbh->errstr();
$sth->execute( 1 ) || die $sth->errstr();

print "found ", $sth->rows, " items back to $limit_items_timestamp to process...";

my $deduper = Text::DeDuper->new();

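# strip HTML tags and collapse whitespace so near-identical posts compare
# equal; returns nothing for empty or whitespace-only content so callers
# can skip such items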
sub strip {
	my $t = shift || return;
	$t =~ s/<[^>]*>//gs;
	$t =~ s/\s+/ /gs;
	return $t if ($t ne ' ');
}

my $i = 0;

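# first pass: index all items which are already read so that unread items
# can be compared against them later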
while (my $row = $sth->fetchrow_hashref() ) {

	my $t = strip( $row->{content} ) || next;

	$deduper->add_doc( $row->{id}, $t );

	$i++;
	print "$i " if ($i % 100 == 0);

}

print "\n";

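# optional pass: among items which are already read, keep only the copy with
# the lowest id, delete the other rows and clean up orphaned items_userdata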
if ( $delete_read_duplicates ) {

	print "find duplicates...";

	$sth->execute( 1 ) || die $sth->errstr();
	while (my $row = $sth->fetchrow_hashref() ) {

		my $id = $row->{id} || die "no id in row";
		my $t = strip( $row->{content} ) || next;

		my @s = sort { $a <=> $b } $deduper->find_similar($t);
		next if (! @s || $#s == 0);

		my $first = shift @s;

		next if ($first != $id);

		print " $id [", join(",", @s), "]";

		$dbh->do( "delete from items where id in (" . join(",", @s) . ")" );

	}
	print "\n";

	print "about to delete associated items_userdata\n";
	$dbh->do( "delete from items_userdata where item_id not in (select id from items)" );

}


my @duplicates;
my $feeds;

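# separate index holding only unread items, used by the third pass to find
# duplicates among the unread items themselves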
my $deduper_unread = Text::DeDuper->new();

# second pass: take unread posts and find duplicates of already read items
$sth->execute( 0 ) || die $sth->errstr();

print "comparing with ", $sth->rows, " unread items...\n";

while (my $row = $sth->fetchrow_hashref() ) {

	my $id = $row->{id} || die "no id in row";

	my $t = strip( $row->{content} ) || next;

	$deduper_unread->add_doc( $row->{id}, $t );

	my @s = $deduper->find_similar($t);
	next if (! @s);

	print $id, " has ", $#s + 1, " copies: ", join(",", @s), "\n";
	push @duplicates, $id;

	$feeds->{ $row->{feed_id} }++;
}

# third pass: compare unread items against each other; for every group of
# similar unread items, keep the one with the lowest id and mark the rest

$sth->execute( 0 ) || die $sth->errstr();
print "finding duplicates in ", $sth->rows, " unread items...\n";

while (my $row = $sth->fetchrow_hashref() ) {

	my $id = $row->{id} || die "no id in row";

	my $t = strip( $row->{content} ) || next;

	my @s;
	foreach my $d_id ( $deduper_unread->find_similar($t) ) {
		push @s, $d_id if ($d_id > $id);
	}
	next if (! @s);

	print $id, " has ", $#s + 1, " duplicates: ", join(",", @s), "\n";
	push @duplicates, ( @s );

	$feeds->{ $row->{feed_id} }++;
}

# mark duplicates as read

my $ids = join(",", @duplicates);
if (! $ids) {
	print "no duplicates found\n";
	exit;
}

print "found ", $#duplicates + 1, " duplicate items: $ids\n";

$sql = qq{
	update items_userdata
	set value_numeric = 1
	where label = 'read' and item_id in ($ids) and user_id = $user_id
};

$dbh->do( $sql );

# update usage_unread on modified feeds

$sql = qq{
	update feeds_userdata
	set value_numeric = value_numeric - ?
	where label = 'usage_unread' and feed_id = ? and user_id = $user_id
};

$sth = $dbh->prepare($sql) || die $dbh->errstr();
foreach my $feed_id (keys %$feeds) {
	my $nr_read = $feeds->{$feed_id} || die "no messages marked as read";
	$sth->execute( $nr_read, $feed_id ) || die $sth->errstr();
	print "removed $nr_read messages from feed $feed_id\n";
}


Properties

Name             Value
svn:executable   *
