/[refeed]/trunk/deduper/reblog-dupe.pl
This is a repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Contents of /trunk/deduper/reblog-dupe.pl

Parent Directory Parent Directory | Revision Log Revision Log


Revision 20 - (show annotations)
Sat Jan 20 13:41:52 2007 UTC (17 years, 3 months ago) by dpavlin
File MIME type: text/plain
File size: 3087 byte(s)
added user for which de-duping is done, all output is now to STDOUT, so you
can redirect it to /dev/null if running from cron
1 #!/usr/bin/perl -w
2
3 # reblog-dupe.pl - remove unread duplicate posts which have been read
4 #
5 # currently de-dupes for one hard-coded user only ($user_id below), so use
6 # is effectively limited to single-user installations
7 #
8 # 07/08/2006 06:26:47 PM CEST Dobrica Pavlinusic <dpavlin@rot13.org>
9
10 use strict;
11 use DBI;
12 use Text::DeDuper;
13 use Data::Dump qw/dump/;
14
# Turn on autoflush for the selected output handle so the progress
# counters printed below show up immediately.
$| = 1;

# DSN of the reblog database; user name and password are passed empty.
my $connect = "DBI:mysql:database=reblog";
my $dbh = DBI->connect( $connect, "", "" )
	or die $DBI::errstr;

# The user whose items are de-duplicated.
my $user_id = 1;
21
# Select id, content and feed of every item of $user_id whose 'read' flag
# equals the bound value: execute( 1 ) fetches read items, execute( 0 )
# (used further down) fetches unread ones.
my $sql = qq{
select
id, content, feed_id
from items
join items_userdata on id=item_id
where label = 'read'
and value_numeric = ?
and user_id = $user_id
order by id
};

my $sth = $dbh->prepare($sql) || die $dbh->errstr();
$sth->execute( 1 ) || die $sth->errstr();

print "found ",$sth->rows," items to process...";

# Index of the text of all read items. Direct method call instead of the
# indirect-object form "new Text::DeDuper()", which perl can mis-parse.
my $deduper = Text::DeDuper->new();
40
# strip( $html )
#
# Reduce item content to plain text for comparison: remove everything that
# looks like a tag and collapse each run of whitespace into one blank.
#
# Returns the cleaned text, or nothing (undef in scalar context, empty list
# in list context) when the input is false or nothing but a blank is left,
# so callers can write: my $t = strip( $content ) || next;
sub strip {
	my $t = shift || return;

	$t =~ s/<[^>]*>//g;
	$t =~ s/\s+/ /g;

	# explicit "nothing left" result instead of falling off the end of the
	# sub and returning whatever the failed comparison evaluated to
	return if ($t eq '' || $t eq ' ');

	return $t;
}
47
# Add the stripped text of every fetched (read) item to $deduper, printing
# a running count after each 100 items added.
my $added = 0;

while ( my $item = $sth->fetchrow_hashref() ) {

	my $text = strip( $item->{content} );
	next unless $text;	# nothing left to compare

	$deduper->add_doc( $item->{id}, $text );

	$added++;
	unless ( $added % 100 ) {
		print "$added ";
	}
}

print "\n";
62
# ids of unread items which will be marked as read, each listed once
my @duplicates;
# feed_id => number of its unread items queued in @duplicates
my $feeds;

my $deduper_unread = Text::DeDuper->new();

# unread item id => feed_id, recorded in the first pass so that the second
# pass can charge every duplicate to the feed it really belongs to
my %feed_of;
# ids already queued, so no item is counted (or decremented) twice
my %queued;

# Queue one unread item for marking as read and count it against its feed.
my $queue_duplicate = sub {
	my ($dup_id) = @_;

	return if $queued{$dup_id}++;

	push @duplicates, $dup_id;

	my $feed_id = $feed_of{$dup_id};
	$feeds->{$feed_id}++ if defined $feed_id;

	return;
};

# now, take unread posts to find duplicates
$sth->execute( 0 ) || die $sth->errstr();

print "comparing with ", $sth->rows," unread items...\n";

while (my $row = $sth->fetchrow_hashref() ) {

	my $id = $row->{id} || die "no id in now";

	my $t = strip( $row->{content} ) || next;

	$feed_of{$id} = $row->{feed_id};
	$deduper_unread->add_doc( $id, $t );

	# unread item which is similar to something already read
	my @s = $deduper->find_similar($t);
	next if (! @s);

	print $id, " has ", scalar(@s), " copies: ", join(",", @s), "\n";
	$queue_duplicate->($id);
}

# and again, but compare just unread items

$sth->execute( 0 ) || die $sth->errstr();
print "finding dulicates in ", $sth->rows," unread items...\n";

while (my $row = $sth->fetchrow_hashref() ) {

	my $id = $row->{id} || die "no id in now";

	my $t = strip( $row->{content} ) || next;

	# keep the item with the lowest id; only similar items with a higher
	# id are treated as its duplicates
	my @s = grep { $_ > $id } $deduper_unread->find_similar($t);
	next if (! @s);

	print $id, " has ", scalar(@s), " duplicates: ", join(",", @s), "\n";

	# the items marked as read are those in @s, not the current row, so
	# each is counted against its own feed
	$queue_duplicate->($_) foreach @s;
}
112
# mark duplicates as read

if (! @duplicates) {
	print "no duplicates found\n";
	exit;
}

# ids come from the items table, so they are interpolated as plain numbers
my $ids = join(",", @duplicates);

print "found ", $#duplicates + 1, " duplicate items: $ids\n";

$sql = qq{
update items_userdata
set value_numeric = 1
where label = 'read' and item_id in ($ids) and user_id = $user_id
};

# stop on failure: otherwise the unread counters below would be decremented
# for items which were never marked as read
$dbh->do( $sql ) || die $dbh->errstr();

# update usage_unread on modified feeds

$sql = qq{
update feeds_userdata
set value_numeric = value_numeric - ?
where label = 'usage_unread' and feed_id = ? and user_id = $user_id
};

$sth = $dbh->prepare($sql) || die $dbh->errstr();
foreach my $feed_id (keys %$feeds) {
	# number of items of this feed which have just been marked as read
	my $nr_read = $feeds->{$feed_id} || die "no messages marked as read";
	$sth->execute( $nr_read, $feed_id ) || die $sth->errstr();
	print "removed $nr_read messages from feed $feed_id\n";
}
145

Properties

Name Value
svn:executable *

  ViewVC Help
Powered by ViewVC 1.1.26