/[scripts]/trunk/google-groups2mbox.pl
This is a repository of my old source code, which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Annotation of /trunk/google-groups2mbox.pl

Parent Directory Parent Directory | Revision Log Revision Log


Revision 141 - (hide annotations)
Fri Nov 26 14:54:52 2010 UTC (13 years, 4 months ago) by dpavlin
File MIME type: text/plain
File size: 1174 byte(s)
crawls whole Google Groups archive

#!/usr/bin/perl
use warnings;
use strict;

# Crawls a Google Groups archive and writes it to STDOUT as an mbox file.
# Dobrica Pavlinusic <dpavlin@rot13.org> 2010-11-26
#
# usage:
#
#   ./google-groups2mbox.pl angular > angular
#
# You can also continue import from selected offset:
#
#   START=30 ./google-groups2mbox.pl angular >> angular
#
# Progress (thread counter, thread title, one dot per fetched message) is
# printed to STDERR so that it does not end up in the mbox on STDOUT.

my $group = $ARGV[0] || die "usage: $0 google-groups-name\n";

use WWW::Mechanize;
use Data::Dump qw(dump);

my $mech = WWW::Mechanize->new();

$mech->get( "http://groups.google.com/group/$group/topics?gvc=2" );

# Number of threads handled so far. It is also used as the offset that
# selects the "start=N" link of the next topics page.
my $start = $ENV{START} || 0;

# START is interpolated into a regex below, so insist on a plain integer.
die "START must be a non-negative integer, got '$start'\n"
    unless $start =~ /\A\d+\z/;

PAGE: while (1) {
    if ($start) {
        # (?!\d) keeps e.g. start=30 from also matching start=300.
        my $next_page = qr/start=$start(?!\d)/;

        # No link to the next offset means the whole archive has been read;
        # stop cleanly instead of letting follow_link fail on a missing link.
        last PAGE unless $mech->find_link( url_regex => $next_page );
        $mech->follow_link( url_regex => $next_page );
    }

    my @threads = $mech->find_all_links( url_regex => qr/browse_thread/ );

    # A page without thread links would never advance $start, so the loop
    # would spin forever; treat it as the end of the archive.
    last PAGE unless @threads;

    foreach my $link (@threads) {
        $start++;

        # Link text can be undefined (e.g. for links without text).
        my $title = $link->text;
        $title = '' unless defined $title;
        print STDERR "# $start ", $title;

        $mech->follow_link( url => $link->url );

        foreach my $m_link ( $mech->find_all_links( url_regex => qr/dmode=source/ ) ) {
            $mech->get( $m_link->url . '&output=gplain' );
            print STDERR ".";

            my $msg = $mech->content;
            $msg =~ s/\r//g;      # drop carriage returns
            $msg =~ s/\A\s+//;    # message must start right after the From_ line

            # mboxrd quoting: a line starting with "From " (optionally already
            # quoted with ">") would be taken for a message separator.
            $msg =~ s/^(>*From )/>$1/mg;

            print "From $group\@googlegroups.com " . localtime() . "\n$msg\n";

            $mech->back;          # return to the thread page
            sleep 1;              # pause between message requests
        }

        print STDERR "\n";
        $mech->back;              # return to the topics page
    }
}

Properties

Name Value
svn:executable *

  ViewVC Help
Powered by ViewVC 1.1.26