/[refeed]/trunk/deduper/reblog-dupe.pl
This is a repository of my old source code, which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Annotation of /trunk/deduper/reblog-dupe.pl

Parent Directory Parent Directory | Revision Log Revision Log


Revision 15 - (hide annotations)
Mon Jul 17 10:35:44 2006 UTC (17 years, 10 months ago) by dpavlin
File MIME type: text/plain
File size: 2898 byte(s)
compare unread items with itself to remove all duplicates of unread items
1 dpavlin 12 #!/usr/bin/perl -w
2    
3     # reblog-dupe.pl - remove unread duplicates of posts which have been read
4     #
5     # currently works without any care about users, so use is limited
6     # to single-user installations
7     #
8     # 07/08/2006 06:26:47 PM CEST Dobrica Pavlinusic <dpavlin@rot13.org>
9    
10     use strict;
11     use DBI;
12     use Text::DeDuper;
13     use Data::Dump qw/dump/;
14    
# Unbuffer STDOUT so the progress dots printed while indexing show up
# immediately instead of when the buffer fills.
$|++;

# Connect to the "reblog" MySQL database with empty user name and password.
my $connect = "DBI:mysql:database=reblog";
my $dbh = DBI->connect($connect,"","") || die $DBI::errstr;

# Select posts by their read state. The single placeholder is bound at
# execute time: 1 selects posts marked as read, 0 selects unread posts.
my $sql = qq{
    select
        id, content, feed_id
    from items
    join items_userdata on id=item_id
    where label = 'read' and value_numeric = ?
};

my $sth = $dbh->prepare($sql) || die $dbh->errstr();
$sth->execute( 1 ) || die $sth->errstr();

# NOTE(review): DBI only promises a reliable rows() for a SELECT once all
# rows have been fetched; whether the count is available here depends on
# the driver - confirm for the DBD in use.
print "found ",$sth->rows," items to process...";

# Index of posts which have already been read; unread posts are later
# looked up against it.  Direct method call instead of the ambiguous
# indirect-object form "new Text::DeDuper()".
my $deduper = Text::DeDuper->new();
35    
# strip( $html ) - reduce a post body to plain text for comparison
#
# Removes everything that looks like a tag (<...>) and collapses every run
# of whitespace into a single space.
#
# Returns the cleaned text.  Returns nothing (undef in scalar context, an
# empty list in list context) when the argument is undefined or empty, or
# when no non-whitespace text is left after cleaning, so callers can write
#     my $t = strip( $content ) || next;
sub strip {
    my $t = shift;

    # test defined/length rather than truth so that the text "0" is not
    # mistaken for missing input
    return unless defined $t && length $t;

    $t =~ s/<[^>]*>//gs;
    $t =~ s/\s+/ /gs;

    # explicit empty return: do not rely on the value of a failed
    # statement modifier leaking out as the sub's return value
    return unless $t =~ /\S/;

    return $t;
}
42    
# Add every read post which still has text after stripping markup to the
# index of read posts, printing one dot per indexed post as progress.
while (my $row = $sth->fetchrow_hashref() ) {

    # posts without usable text cannot be compared, so skip them
    my $t = strip( $row->{content} ) || next;

    $deduper->add_doc( $row->{id}, $t );

    print ".";

}

# Terminate the progress line on STDOUT, the stream the dots were written
# to; sending the newline to STDERR left the line unterminated whenever the
# two streams were redirected separately.
print "\n";
54    
# ids of unread items which will be marked as read
my @duplicates;
# hash reference: feed_id => number of duplicates found in that feed
my $feeds;

# Index of unread posts, filled below so that unread posts can afterwards
# be compared with each other.
my $deduper_unread = Text::DeDuper->new();

# now, take unread posts to find duplicates of posts which have been read
$sth->execute( 0 ) || die $sth->errstr();

print "comparing with ", $sth->rows," unread items...\n";

while (my $row = $sth->fetchrow_hashref() ) {

    my $id = $row->{id} || die "no id in row";

    # posts without usable text cannot be compared, so skip them
    my $t = strip( $row->{content} ) || next;

    $deduper_unread->add_doc( $id, $t );

    # labels (item ids) of read posts similar to this unread one
    my @s = $deduper->find_similar($t);
    next if (! @s);

    print $id, " has ", scalar(@s), " copies: ", join(",", @s), "\n";
    push @duplicates, $id;

    # remember how many unread items each feed is about to lose
    $feeds->{ $row->{feed_id} }++;
}
81    
# and again, but compare just unread items with each other

# Items already recorded as duplicates must not be recorded (and counted
# against their feed) a second time.
my %already_duplicate = map { $_ => 1 } @duplicates;

$sth->execute( 0 ) || die $sth->errstr();
print "finding duplicates in ", $sth->rows," unread items...\n";

while (my $row = $sth->fetchrow_hashref() ) {

    my $id = $row->{id} || die "no id in row";

    next if $already_duplicate{$id};

    # posts without usable text cannot be compared, so skip them
    my $t = strip( $row->{content} ) || next;

    # Query the index of UNREAD items; querying the read-items index here
    # only repeated the previous pass and counted every duplicate twice.
    # An item is treated as a duplicate only when a similar unread item
    # with a lower id exists, so the lowest-numbered copy of each group
    # stays unread instead of the whole group being marked as read.
    # Ids are compared numerically; they are also used unquoted in the
    # SQL "in (...)" list built from @duplicates.
    my @s = grep { $_ < $id } $deduper_unread->find_similar($t);
    next if (! @s);

    print $id, " has ", scalar(@s), " duplicates: ", join(",", @s), "\n";
    push @duplicates, $id;
    $already_duplicate{$id} = 1;

    # remember how many unread items each feed is about to lose
    $feeds->{ $row->{feed_id} }++;
}
101    
# mark duplicates as read

if (! @duplicates) {
    print "no duplicates found\n";
    exit;
}

# The ids come from the items table itself (not from user input) and are
# interpolated as a comma separated list into the "in (...)" clause.
my $ids = join(",", @duplicates);

print "found ", scalar(@duplicates), " duplicate items: $ids\n";

$sql = qq{
    update items_userdata
    set value_numeric = 1
    where label = 'read' and item_id in ($ids)
};

# Stop here if the update fails: decrementing the per-feed counters below
# without having marked the items as read would leave them inconsistent.
# (do() returns a true value, "0E0", even when no rows were changed.)
$dbh->do( $sql ) || die $dbh->errstr();

# update usage_unread on modified feeds

$sql = qq{
    update feeds_userdata
    set value_numeric = value_numeric - ?
    where label = 'usage_unread' and feed_id = ?
};

$sth = $dbh->prepare($sql) || die $dbh->errstr();
foreach my $feed_id (keys %$feeds) {
    # every feed in the hash was counted at least once while collecting
    my $nr_read = $feeds->{$feed_id} || die "no messages marked as read";
    $sth->execute( $nr_read, $feed_id ) || die $sth->errstr();
    print "removed $nr_read messages from feed $feed_id\n";
}
134    

Properties

Name Value
svn:executable *

  ViewVC Help
Powered by ViewVC 1.1.26