/[webpac2]/trunk/bin/isi-download-results.pl
This is a repository of my old source code, which is no longer updated. Go to git.rot13.org for current projects!
ViewVC logotype

Diff of /trunk/bin/isi-download-results.pl

Parent Directory | Revision Log | View Patch

revision 1289 by dpavlin, Sat Sep 19 10:23:35 2009 UTC | revision 1291 by dpavlin, Sat Sep 19 15:05:12 2009 UTC
# Line 3  Line 3 
3  use warnings;  use warnings;
4  use strict;  use strict;
5    
6  my $q = 'AD=Croatia';  our $q = 'AD=Croatia';
7  my $range_size = 500;  my $range_size = 500;
8    
9  my $dump = @ARGV ? 1 : 0;  my $dump = @ARGV ? 1 : 0;
# Line 11  my $dump = @ARGV ? 1 : 0; Line 11  my $dump = @ARGV ? 1 : 0;
11  $q = 'TS=psychology AND AD=Croatia';  $q = 'TS=psychology AND AD=Croatia';
12    
13  use WWW::Mechanize;  use WWW::Mechanize;
14  use Data::Dump qw/dump/;  use Data::Dump qw(dump);
15    use File::Path;
16    
17  our $mech = WWW::Mechanize->new(  our $mech = WWW::Mechanize->new(
18          autocheck => 1,          autocheck => 1,
# Line 20  our $mech = WWW::Mechanize->new( Line 21  our $mech = WWW::Mechanize->new(
21    
22  our $step = 0;  our $step = 0;
23    
24    my $dir = '/tmp/isi/';
25    rmtree $dir if -e $dir;
26    mkdir $dir;
27    
28  sub save_mech {  sub save_mech {
29          my ( $mech, $path ) = @_;          my ( $mech, $path ) = @_;
30          $step++;          $step++;
31          $path ||= sprintf('/tmp/isi.%02d.%s', $step, $mech->{ct} =~ m{html}i ? 'html' : 'txt' );          my $base_path = sprintf('%s/%04d', $dir,$step);
32            $path ||= $base_path . ( $mech->{ct} =~ m{html}i ? '.html' : '.txt' );
33          $mech->save_content( $path );          $mech->save_content( $path );
34          warn "# [$step] $path ", -s $path, " ", $mech->ct;          warn "# [$step] $path ", -s $path, " ", $mech->ct, "\n";
35          $mech->dump_all if $dump;          open(my $dump, '>', "$base_path.dump.txt");
36            $mech->dump_all($dump);
37  }  }
38    
39  warn "# get session";  warn "# get session";
40  $mech->get( 'http://isiknowledge.com/?DestApp=WOS' );  $mech->get( 'http://isiknowledge.com/?DestApp=WOS' );
41  save_mech $mech;  save_mech $mech;
42    
43  warn "# advanced serach";  sub search {
44  $mech->follow_link( url_regex => qr/AdvancedSearch/ );          warn "# advanced serach";
45  save_mech $mech;          $mech->follow_link( url_regex => qr/AdvancedSearch/ );
46            save_mech $mech;
47  warn "# cookie_jar ", dump $mech->cookie_jar;  
48            warn "# cookie_jar ", dump $mech->cookie_jar;
49  $mech->submit_form(  
50          fields => {          $mech->submit_form(
51                  'value(input1)' => $q,                  fields => {
52          }                          'value(input1)' => $q,
53  );                  }
54  save_mech $mech;          );
55            save_mech $mech;
56    
57  warn "# summary";          warn "# summary";
58  $mech->follow_link( url_regex => qr/summary/ );          $mech->follow_link( url_regex => qr/summary/ );
59  save_mech $mech;          save_mech $mech;
60    }
61    
62  sub get_results {  sub get_results {
         my $q = shift;  
63          my $from = 1;          my $from = 1;
64    
65          while ( 1 ) {          while ( 1 ) {
# Line 80  sub get_results { Line 88  sub get_results {
88    
89                  if ( $mech->content =~ m{invalid API call} ) {                  if ( $mech->content =~ m{invalid API call} ) {
90                          $mech->back;                          $mech->back;
91                          return;                          last;
92                  }                  }
93    
94                  warn "# save_file $from - $to [$q]";                  warn "range $from - $to [$q]\n";
95                  $mech->follow_link( url_regex => qr/save_file/ );                  $mech->follow_link( url_regex => qr/save_file/ );
96                  save_mech $mech => "/tmp/isi.$q.$from-$to.txt";                  save_mech $mech => "/tmp/isi.$q.$from-$to.txt";
97    
# Line 93  sub get_results { Line 101  sub get_results {
101                  $mech->back;                  $mech->back;
102                  #save_mech $mech;                  #save_mech $mech;
103    
104          } # while          }
105    
106  }  }
107    
 get_results $q;  
108    
109  save_mech $mech;  sub citations {
110  warn "# citations";          save_mech $mech;
111  $mech->follow_link( url_regex => qr/search_mode=CitationReport/ );          warn "# citation report";
112  save_mech $mech;          $mech->follow_link( url_regex => qr/search_mode=CitationReport/ );
113            save_mech $mech;
114    
115            warn "view citing articles";
116            $mech->follow_link( url_regex => qr/search_mode=TotalCitingArticles/ );
117            save_mech $mech;
118    }
119    
120    sub years {
121            my $years_url = $mech->find_link( text_regex => qr/more options/ );
122            if ( ! $years_url ) {
123                    warn "W: can't find years\n";
124                    return;
125            }
126            $years_url = $years_url->url_abs;
127            warn "## $years_url";
128            if ( $years_url !~ s{ra_name=\w+}{ra_name=PublicationYear} ) {
129                    warn "W: no ra_name\n";
130                    return;
131            }
132            warn "# refine years (hidden by javascript)";
133            warn "http://apps.isiknowledge.com/RAMore.do?product=WOS&search_mode=TotalCitingArticles&SID=T1o6bChdN9PGP1LN1Nh&qid=3&ra_mode=more&ra_name=PublicationYear&db_id=WOS&viewType=raMore\n$years_url\n";
134            $mech->get( $years_url );
135            save_mech $mech;
136    
137            my $html = $mech->content;
138            my $years;
139            while ( $html =~ s{>(\d\d\d\d)\s\((\d+)\)</label.+?value="PublicationYear_}{} ) {
140                    $years->{$1} = $2;
141            }
142            warn "# years ",dump $years;
143            $mech->back;
144            return $years;
145    }
146    
147    search;
148    years;
149    get_results;
150    
 $mech->follow_link( url_regex => qr/search_mode=TotalCitingArticles/ );  
 save_mech $mech;  
151    
152  get_results $q . '.citing';  citations;
153    years;
154    $q .= '.citing';
155    get_results;
156    

Legend:
Removed from v.1289  
changed lines
  Added in v.1291

  ViewVC Help
Powered by ViewVC 1.1.26