1 |
#!/usr/bin/perl |
2 |
use warnings; |
3 |
use strict; |
4 |
|
# Crawls a Google Group and creates an mbox archive
# Dobrica Pavlinusic <dpavlin@rot13.org> 2010-11-26
#
# usage:
#
# ./google-groups2mbox.pl angular > angular
#
# You can also continue the import from a selected offset:
#
# START=30 ./google-groups2mbox.pl angular >> angular

# Name of the Google Group to archive; required first command-line argument.
my $group = $ARGV[0] || die "usage: $0 google-groups-name\n";

use WWW::Mechanize;
# NOTE(review): dump() is not called anywhere in the visible script.
use Data::Dump qw(dump);

# Number of threads already imported, taken from the START environment
# variable (0 when unset or empty).  The crawl loop increments this value and
# interpolates it into a URL pattern, so anything other than a plain
# non-negative integer is rejected here, before any network traffic happens.
my $start = $ENV{START} || 0;
die "START must be a non-negative integer, got '$start'\n"
    unless $start =~ m/\A\d+\z/;

my $mech = WWW::Mechanize->new();

# Load the first page of the group's topic list; the crawl starts from here.
$mech->get( "http://groups.google.com/group/$group/topics?gvc=2" );

# Walk the topic list page by page.  $start counts the thread links handled so
# far and doubles as the offset used to locate the link to the next page.
while (1) {
    if ($start) {
        # \Q..\E treats the offset literally; (?!\d) keeps offset 3 from also
        # matching start=30, start=300, ...
        my $next_page
            = $mech->find_link( url_regex => qr/start=\Q$start\E(?!\d)/ );
        if ( !$next_page ) {
            # No link to a page at this offset: the archive is complete.
            print STDERR "# no page at offset $start, done\n";
            last;
        }
        $mech->get( $next_page->url );
    }

    my @threads = $mech->find_all_links( url_regex => qr/browse_thread/ );

    # A page without thread links would never advance $start, so stop here
    # instead of requesting the same page forever.
    last unless @threads;

    foreach my $link (@threads) {
        $start++;

        # The link text may be undefined for some links; avoid a warning.
        my $title = $link->text;
        $title = '' unless defined $title;
        print STDERR "# $start $title";

        $mech->follow_link( url => $link->url );
        foreach my $m_link ( $mech->find_all_links( url_regex => qr/dmode=source/ ) ) {
            $mech->get( $m_link->url . '&output=gplain' );
            print STDERR ".";
            print mbox_entry( $group, $mech->content );
            $mech->back;    # return to the thread page
            sleep 1;        # pause between message requests
        }
        print STDERR "\n";
        $mech->back;        # return to the topic list page
    }
}

# Build one mbox entry from the raw source of a single message.
#
#   $group - group name, used as the envelope sender on the "From " line
#   $msg   - raw message text as fetched
#
# Returns the entry as a string: the "From " separator line, the message with
# body "From " lines quoted, and a terminating blank line.
sub mbox_entry {
    my ( $group, $msg ) = @_;

    $msg =~ s/\r//g;      # drop carriage returns
    $msg =~ s/\A\s+//;    # message must start right after the separator line

    # mboxrd quoting: a line starting with "From " (or an already quoted
    # ">From ") would otherwise be read as the start of the next message.
    $msg =~ s/^(>*From )/>$1/mg;

    # Make sure the message ends with a newline so that the extra "\n" below
    # always yields the blank line that separates mbox entries.
    $msg .= "\n" unless $msg =~ m/\n\z/;

    return "From $group\@googlegroups.com " . localtime() . "\n$msg\n";
}