/[scripts]/trunk/google-groups2mbox.pl
This is a repository of my old source code, which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Contents of /trunk/google-groups2mbox.pl

Parent Directory | Revision Log


Revision 141 - (show annotations)
Fri Nov 26 14:54:52 2010 UTC (12 years, 2 months ago) by dpavlin
File MIME type: text/plain
File size: 1174 byte(s)
crawls whole Google Groups archive

#!/usr/bin/perl
use warnings;
use strict;

# Crawls a Google Groups archive and writes it to STDOUT as an mbox file.
# Dobrica Pavlinusic <dpavlin@rot13.org> 2010-11-26
#
# usage:
#
#   ./google-groups2mbox.pl angular > angular
#
# You can also continue import from selected offset:
#
#   START=30 ./google-groups2mbox.pl angular >> angular
#
# Progress goes to STDERR: one "# <offset> <title>" line per thread and
# one dot per downloaded message.

my $group = $ARGV[0] || die "usage: $0 google-groups-name\n";

use WWW::Mechanize;
use Data::Dump qw(dump);

# START is interpolated into a URL regex below, so accept digits only.
my $start = $ENV{START} || 0;
die "START must be a non-negative integer, got '$start'\n"
    unless $start =~ /\A\d+\z/;

my $mech = WWW::Mechanize->new();

$mech->get( "http://groups.google.com/group/$group/topics?gvc=2" );

# Turns one raw message source into an mbox entry.
#
#   $list - group name, used to build the envelope sender
#   $raw  - message source as returned by the server
#
# Returns the "From " separator line followed by the cleaned-up message
# and a terminating blank line.
sub mbox_entry {
    my ( $list, $raw ) = @_;

    $raw =~ s/\r//g;       # mbox uses bare LF line endings
    $raw =~ s/\A\s+//;     # the message must start right after the separator

    # mboxrd quoting: a line starting with "From " (optionally already
    # quoted) would otherwise be read as the start of the next message.
    # NOTE(review): assumes the source never begins with its own "From "
    # envelope line -- confirm against real output.
    $raw =~ s/^(>*From )/>$1/mg;

    return "From $list\@googlegroups.com " . localtime() . "\n$raw\n";
}

PAGE: while (1) {

    # Move to the listing page for the current offset. The offset is matched
    # as a whole number so that e.g. 30 does not also match start=300.
    if ($start) {
        my $page = $mech->find_link( url_regex => qr/\bstart=$start(?!\d)/ );
        if ( !$page ) {
            print STDERR "# no page link for offset $start, finished\n";
            last PAGE;
        }
        $mech->get( $page->url );
    }

    my @threads = $mech->find_all_links( url_regex => qr/browse_thread/ );

    # Without this check an empty listing would spin forever on the same page.
    if ( !@threads ) {
        print STDERR "# no threads found at offset $start, finished\n";
        last PAGE;
    }

    foreach my $thread (@threads) {
        # $start doubles as the offset of the next listing page.
        $start++;

        my $title = $thread->text;
        $title = '' unless defined $title;    # links without text
        print STDERR "# $start $title";

        $mech->get( $thread->url );

        foreach my $m_link ( $mech->find_all_links( url_regex => qr/dmode=source/ ) ) {
            $mech->get( $m_link->url . '&output=gplain' );
            print STDERR ".";
            print mbox_entry( $group, $mech->content );
            $mech->back;    # return to the thread page
            sleep 1;        # be polite to the server
        }

        print STDERR "\n";
        $mech->back;        # return to the listing page
    }
}

Properties

Name Value
svn:executable *

  ViewVC Help
Powered by ViewVC 1.1.26