/[scripts]/trunk/google-groups2mbox.pl
This is a repository of my old source code, which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Diff of /trunk/google-groups2mbox.pl

Parent Directory | Revision Log | View Patch

revision 140 by dpavlin, Fri Nov 26 14:37:09 2010 UTC revision 141 by dpavlin, Fri Nov 26 14:54:52 2010 UTC
# Line 2  Line 2 
2  use warnings;  use warnings;
3  use strict;  use strict;
4    
5  my $group = 'angular';  # Craws google ground and create mbox archive
6    # Dobrica Pavlinusic <dpavlin@rot13.org> 2010-11-26
7    #
8    # usage:
9    #
10    # ./google-groups2mbox.pl angular > angular
11    #
12    # You can also continue import from selected offset:
13    #
14    # START=30 ./google-groups2mbox.pl angular >> angular
15    
16    my $group = $ARGV[0] || die "usage: $0 google-groups-name\n";
17    
18  use WWW::Mechanize;  use WWW::Mechanize;
19  use Data::Dump qw(dump);  use Data::Dump qw(dump);
# Line 11  my $mech = WWW::Mechanize->new(); Line 22  my $mech = WWW::Mechanize->new();
22    
23  $mech->get( "http://groups.google.com/group/$group/topics?gvc=2" );  $mech->get( "http://groups.google.com/group/$group/topics?gvc=2" );
24    
25  foreach my $link ( $mech->find_all_links( url_regex => qr/browse_thread/ ) ) {  my $start = $ENV{START} || 0;
26          print STDERR "# ",$link->text;  
27          $mech->follow_link( url => $link->url );  while (1) {
28          foreach my $m_link ( $mech->find_all_links( url_regex => qr/dmode=source/ ) ) {          $mech->follow_link( url_regex => qr/start=$start/ ) if $start;
29                  $mech->get( $m_link->url . '&output=gplain' );  
30                  print STDERR ".";          foreach my $link ( $mech->find_all_links( url_regex => qr/browse_thread/ ) ) {
31                  my $msg = $mech->content;                  $start++;
32                  $msg =~ s/\r//gs;                  print STDERR "# $start ",$link->text;
33                  $msg =~ s/^\s+//s;                  $mech->follow_link( url => $link->url );
34                  print "From $group\@googlegroups.com " . localtime() . "\n$msg\n";                  foreach my $m_link ( $mech->find_all_links( url_regex => qr/dmode=source/ ) ) {
35                            $mech->get( $m_link->url . '&output=gplain' );
36                            print STDERR ".";
37                            my $msg = $mech->content;
38                            $msg =~ s/\r//gs;
39                            $msg =~ s/^\s+//s;
40                            print "From $group\@googlegroups.com " . localtime() . "\n$msg\n";
41                            $mech->back;
42                            sleep 1;
43                    }
44                    print STDERR "\n";
45                  $mech->back;                  $mech->back;
46          }          }
         print STDERR "\n";  
         $mech->back;  
47  }  }

Legend:
Removed from v.140  
changed lines
  Added in v.141

  ViewVC Help
Powered by ViewVC 1.1.26