/[webpac2]/trunk/bin/isi-download-results.pl
This is a repository of my old source code, which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Diff of /trunk/bin/isi-download-results.pl

Parent Directory | Revision Log | View Patch

revision 1290 by dpavlin, Sat Sep 19 12:43:03 2009 UTC revision 1291 by dpavlin, Sat Sep 19 15:05:12 2009 UTC
# Line 3  Line 3 
3  use warnings;  use warnings;
4  use strict;  use strict;
5    
6  my $q = 'AD=Croatia';  our $q = 'AD=Croatia';
7  my $range_size = 500;  my $range_size = 500;
8    
9  my $dump = @ARGV ? 1 : 0;  my $dump = @ARGV ? 1 : 0;
# Line 11  my $dump = @ARGV ? 1 : 0; Line 11  my $dump = @ARGV ? 1 : 0;
11  $q = 'TS=psychology AND AD=Croatia';  $q = 'TS=psychology AND AD=Croatia';
12    
13  use WWW::Mechanize;  use WWW::Mechanize;
14  use Data::Dump qw/dump/;  use Data::Dump qw(dump);
15    use File::Path;
16    
17  our $mech = WWW::Mechanize->new(  our $mech = WWW::Mechanize->new(
18          autocheck => 1,          autocheck => 1,
# Line 20  our $mech = WWW::Mechanize->new( Line 21  our $mech = WWW::Mechanize->new(
21    
22  our $step = 0;  our $step = 0;
23    
24    my $dir = '/tmp/isi/';
25    rmtree $dir if -e $dir;
26    mkdir $dir;
27    
28  sub save_mech {  sub save_mech {
29          my ( $mech, $path ) = @_;          my ( $mech, $path ) = @_;
30          $step++;          $step++;
31          mkdir '/tmp/isi/' unless -e '/tmp/isi';          my $base_path = sprintf('%s/%04d', $dir,$step);
         my $base_path = sprintf('/tmp/isi/%04d', $step);  
32          $path ||= $base_path . ( $mech->{ct} =~ m{html}i ? '.html' : '.txt' );          $path ||= $base_path . ( $mech->{ct} =~ m{html}i ? '.html' : '.txt' );
33          $mech->save_content( $path );          $mech->save_content( $path );
34          warn "# [$step] $path ", -s $path, " ", $mech->ct, "\n";          warn "# [$step] $path ", -s $path, " ", $mech->ct, "\n";
# Line 37  $mech->get( 'http://isiknowledge.com/?De Line 41  $mech->get( 'http://isiknowledge.com/?De
41  save_mech $mech;  save_mech $mech;
42    
43  sub search {  sub search {
         my $q = shift;  
   
44          warn "# advanced serach";          warn "# advanced serach";
45          $mech->follow_link( url_regex => qr/AdvancedSearch/ );          $mech->follow_link( url_regex => qr/AdvancedSearch/ );
46          save_mech $mech;          save_mech $mech;
# Line 58  sub search { Line 60  sub search {
60  }  }
61    
62  sub get_results {  sub get_results {
         my $q = shift;  
63          my $from = 1;          my $from = 1;
64    
65          while ( 1 ) {          while ( 1 ) {
# Line 117  sub citations { Line 118  sub citations {
118  }  }
119    
120  sub years {  sub years {
121          my $years_url = $mech->find_link( text_regex => qr/more options/ )->url_abs;          my $years_url = $mech->find_link( text_regex => qr/more options/ );
122            if ( ! $years_url ) {
123                    warn "W: can't find years\n";
124                    return;
125            }
126            $years_url = $years_url->url_abs;
127          warn "## $years_url";          warn "## $years_url";
128          $years_url =~ s{ra_name=\w+}{ra_name=PublicationYear} || die "ra_name";          if ( $years_url !~ s{ra_name=\w+}{ra_name=PublicationYear} ) {
129                    warn "W: no ra_name\n";
130                    return;
131            }
132          warn "# refine years (hidden by javascript)";          warn "# refine years (hidden by javascript)";
133          warn "http://apps.isiknowledge.com/RAMore.do?product=WOS&search_mode=TotalCitingArticles&SID=T1o6bChdN9PGP1LN1Nh&qid=3&ra_mode=more&ra_name=PublicationYear&db_id=WOS&viewType=raMore\n$years_url\n";          warn "http://apps.isiknowledge.com/RAMore.do?product=WOS&search_mode=TotalCitingArticles&SID=T1o6bChdN9PGP1LN1Nh&qid=3&ra_mode=more&ra_name=PublicationYear&db_id=WOS&viewType=raMore\n$years_url\n";
134          $mech->get( $years_url );          $mech->get( $years_url );
135          save_mech $mech;          save_mech $mech;
136    
137          my $html = $mech->content;          my $html = $mech->content;
138          my @years;          my $years;
139          while ( $html =~ s{>(\d\d\d\d)\s\((\d+)\)</label.+?value="PublicationYear_}{} ) {          while ( $html =~ s{>(\d\d\d\d)\s\((\d+)\)</label.+?value="PublicationYear_}{} ) {
140                  push @years, [ $1 => $2 ];                  $years->{$1} = $2;
141          }          }
142          warn "# years ",dump @years;          warn "# years ",dump $years;
143          $mech->back;          $mech->back;
144          return @years;          return $years;
145  }  }
146    
147  search $q;  search;
148  years;  years;
149  get_results $q;  get_results;
150    
151    
152  citations;  citations;
153  years;  years;
154  get_results $q . '.citing';  $q .= '.citing';
155    get_results;
156    

Legend:
Removed from v.1290  
changed lines
  Added in v.1291

  ViewVC Help
Powered by ViewVC 1.1.26