/[refeed]/trunk/deduper/reblog-dupe.pl
This is a repository of my old source code, which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Contents of /trunk/deduper/reblog-dupe.pl

Parent Directory Parent Directory | Revision Log Revision Log


Revision 15 - (show annotations)
Mon Jul 17 10:35:44 2006 UTC (17 years, 11 months ago) by dpavlin
File MIME type: text/plain
File size: 2898 byte(s)
compare unread items with itself to remove all duplicates of unread items
1 #!/usr/bin/perl -w
2
3 # reblog-dupe.pl - remove unread posts which duplicate posts that have already been read
4 #
5 # currently works without any care about users, so use is limited
6 # to single-user installations
7 #
8 # 07/08/2006 06:26:47 PM CEST Dobrica Pavlinusic <dpavlin@rot13.org>
9
10 use strict;
11 use DBI;
12 use Text::DeDuper;
13 use Data::Dump qw/dump/;
14
# Turn on autoflush for STDOUT so progress output shows up immediately.
$| = 1;

# Open the "reblog" MySQL database using an empty user name and password;
# stop with the DBI error text if the connection cannot be made.
my $connect = "DBI:mysql:database=reblog";
my $dbh = DBI->connect( $connect, "", "" )
    or die $DBI::errstr;
19
# Select id, content and feed of every item whose 'read' flag equals the
# bound value: 1 selects items which have been read, 0 selects unread ones.
# The statement handle is executed here with 1 and re-executed further down
# in the script with 0.
my $sql = qq{
    select
        id, content, feed_id
    from items
    join items_userdata on id=item_id
    where label = 'read' and value_numeric = ?
};

my $sth = $dbh->prepare($sql) || die $dbh->errstr();
$sth->execute( 1 ) || die $sth->errstr();

print "found ",$sth->rows," items to process...";

# Index of already-read items which unread items are compared against.
# Constructed with a direct method call rather than the ambiguous
# indirect-object form "new Text::DeDuper()".
my $deduper = Text::DeDuper->new();
35
# strip( $html )
#
# Reduce a post body to plain text for comparison: remove everything that
# looks like a markup tag and collapse every run of whitespace into a
# single space.
#
# Returns the cleaned text.  Returns nothing (undef in scalar context, the
# empty list in list context) when the argument is undefined or false.
# Also returns nothing when no non-whitespace text is left after cleaning,
# so callers can write "strip(...) || next".
sub strip {
    my $t = shift;
    return unless $t;

    $t =~ s/<[^>]*>//gs;    # drop tags
    $t =~ s/\s+/ /gs;       # squeeze whitespace, newlines included

    # Explicit bare return for a blank result.  Previously this case fell
    # off the end of the sub and leaked the value of the failed string
    # comparison, i.e. a one-element list in list context.
    return unless $t =~ /\S/;

    return $t;
}
42
# Feed every already-read item which has usable text into the deduper,
# printing one progress dot per indexed item.
while ( my $row = $sth->fetchrow_hashref() ) {

    # skip items with no text left after cleanup
    my $t = strip( $row->{content} ) || next;

    $deduper->add_doc( $row->{id}, $t );

    print ".";

}

# Finish the progress line on STDOUT, the stream the dots were written to.
# It used to go to STDERR, which glued the next message to the dots
# whenever STDOUT was redirected.
print "\n";
54
my @duplicates;     # ids of unread items which will be marked as read
my $feeds;          # hashref: feed_id => number of duplicates found in it

# Second index which collects the unread items themselves.
my $deduper_unread = Text::DeDuper->new();

# now, take unread posts to find duplicates
$sth->execute( 0 ) || die $sth->errstr();

print "comparing with ", $sth->rows," unread items...\n";

while ( my $row = $sth->fetchrow_hashref() ) {

    my $id = $row->{id} || die "no id in row";

    # items without any text can't be compared
    my $t = strip( $row->{content} ) || next;

    # remember every unread item, whether it is a duplicate or not
    $deduper_unread->add_doc( $id, $t );

    # read items similar to this unread one (presumably returned as the
    # ids given to add_doc -- confirm against Text::DeDuper)
    my @s = $deduper->find_similar($t);
    next if (! @s);

    print $id, " has ", scalar(@s), " copies: ", join(",", @s), "\n";
    push @duplicates, $id;

    # count the duplicate against its feed for the unread-counter update
    $feeds->{ $row->{feed_id} }++;
}
81
# and again, but compare just unread items with each other

$sth->execute( 0 ) || die $sth->errstr();
print "finding duplicates in ", $sth->rows," unread items...\n";

# Ids already queued for marking as read.  They are not counted a second
# time and are not accepted as the surviving copy of another item.
my %is_duplicate = map { $_ => 1 } @duplicates;

while ( my $row = $sth->fetchrow_hashref() ) {

    my $id = $row->{id} || die "no id in row";

    # already found to be a duplicate, don't count it twice
    next if $is_duplicate{$id};

    my $t = strip( $row->{content} ) || next;

    # Query the index of *unread* items.  The index of read items was
    # queried here before, which merely repeated the previous pass and
    # doubled every duplicate and every per-feed count.  Drop the item
    # itself and every item which is going to be marked as read anyway,
    # so that one copy of each group of duplicates stays unread.
    my @s = grep { $_ ne $id && ! $is_duplicate{$_} }
        $deduper_unread->find_similar($t);
    next if (! @s);

    print $id, " has ", scalar(@s), " duplicates: ", join(",", @s), "\n";
    push @duplicates, $id;
    $is_duplicate{$id} = 1;

    $feeds->{ $row->{feed_id} }++;
}
101
# mark duplicates as read

# An id may have been queued more than once; report and update each id
# only once.
my %seen;
my @unique_ids = grep { ! $seen{$_}++ } @duplicates;

# Test the list itself rather than the truthiness of the joined string.
if (! @unique_ids) {
    print "no duplicates found\n";
    exit;
}

my $ids = join(",", @unique_ids);
print "found ", scalar(@unique_ids), " duplicate items: $ids\n";

# One placeholder per id, so the values are bound by the driver instead of
# being interpolated into the SQL text.
my $placeholders = join(",", ("?") x @unique_ids);

$sql = qq{
    update items_userdata
    set value_numeric = 1
    where label = 'read' and item_id in ($placeholders)
};

# do() returns undef on error (and the true value "0E0" for zero rows)
$dbh->do( $sql, undef, @unique_ids ) || die $dbh->errstr();
119
# Lower the 'usage_unread' counter of every feed which had duplicates
# marked as read, by the number of duplicates counted for that feed.

$sql = qq{
    update feeds_userdata
    set value_numeric = value_numeric - ?
    where label = 'usage_unread' and feed_id = ?
};

$sth = $dbh->prepare($sql) || die $dbh->errstr();

for my $feed_id ( keys %{$feeds} ) {
    my $nr_read = $feeds->{$feed_id};

    # every feed in the hash was incremented at least once
    die "no messages marked as read" unless $nr_read;

    $sth->execute( $nr_read, $feed_id )
        or die $sth->errstr();

    print "removed $nr_read messages from feed $feed_id\n";
}
134

Properties

Name Value
svn:executable *

  ViewVC Help
Powered by ViewVC 1.1.26