/[refeed]/trunk/deduper/reblog-dupe.pl
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Contents of /trunk/deduper/reblog-dupe.pl

Parent Directory Parent Directory | Revision Log Revision Log


Revision 12 - (show annotations)
Sat Jul 8 23:16:43 2006 UTC (17 years, 11 months ago) by dpavlin
File MIME type: text/plain
File size: 1784 byte(s)
perl script using Text::DeDuper to archive posts which have been allready
archived. It works for single-user installations currently.
1 #!/usr/bin/perl -w
2
3 # reblog-dupe.pl - remove unread duplicate posts which have need read
4 #
5 # currently works without any care about users, so use is limited
6 # to single-user installations
7 #
8 # 07/08/2006 06:26:47 PM CEST Dobrica Pavlinusic <dpavlin@rot13.org>
9
10 use strict;
11 use DBI;
12 use Text::DeDuper;
13 use Data::Dump qw/dump/;
14
15 $|++;
16
17 my $connect = "DBI:mysql:database=reblog";
18 my $dbh = DBI->connect($connect,"","") || die $DBI::errstr;
19
20 # select all posts which have been read or unread
21 my $sql = qq{
22 select
23 id, content
24 from items
25 join items_userdata on id=item_id
26 where label = 'read' and value_numeric = ?
27 };
28
29 my $sth = $dbh->prepare($sql) || die $dbh->errstr();
30 $sth->execute( 1 ) || die $sth->errstr();
31
32 print "found ",$sth->rows," items to process...";
33
34 my $deduper = new Text::DeDuper();
35
36 sub strip {
37 my $t = shift || return;
38 $t =~ s/<[^>]*>//gs;
39 $t =~ s/\s+/ /gs;
40 return $t if ($t ne ' ');
41 }
42
43 while (my $row = $sth->fetchrow_hashref() ) {
44
45 my $t = strip( $row->{content} ) || next;
46
47 $deduper->add_doc( $row->{id}, $t );
48
49 print ".";
50
51 }
52
53 print STDERR "\n";
54
55 # now, take unread posts to find duplicates
56 $sth->execute( 0 ) || die $sth->errstr();
57
58 print "comparing with ", $sth->rows," unread items...\n";
59
60 my @duplicates;
61
62 while (my $row = $sth->fetchrow_hashref() ) {
63
64 my $id = $row->{id} || die "no id in now";
65
66 my $t = strip( $row->{content} ) || next;
67
68 my @s = $deduper->find_similar($t);
69 next if (! @s);
70
71 print $id, " has ", $#s + 1, " copies: ", join(",", @s), "\n";
72 push @duplicates, $id;
73
74 }
75
76 my $ids = join(",", @duplicates);
77 print "found ", $#duplicates + 1, " duplicate items: $ids\n";
78
79 $sql = qq{
80 update items_userdata
81 set value_numeric = 1
82 where label = 'read' and item_id in ($ids)
83 };
84
85 #$dbh->do( $sql );
86 print "de-dupe sql:\n$sql\n" if ($ids);

Properties

Name Value
svn:executable *

  ViewVC Help
Powered by ViewVC 1.1.26