/[webpac2]/trunk/bin/isi-download-results.pl
This is a repository of my old source code, which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Annotation of /trunk/bin/isi-download-results.pl

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1290 - (hide annotations)
Sat Sep 19 12:43:03 2009 UTC (14 years, 7 months ago) by dpavlin
File MIME type: text/plain
File size: 2940 byte(s)
refactor into small mini-DSL at bottom of code
and added report about year breakdown of results

1 dpavlin 1280 #!/usr/bin/perl
2    
3     use warnings;
4     use strict;
5    
# Query string passed to search() and used by get_results() as a label in
# the names of the downloaded files.  The second assignment replaces the
# first, so the value in effect is the 'TS=psychology AND AD=Croatia' one.
my $q = 'AD=Croatia';
$q = 'TS=psychology AND AD=Croatia';

# Amount by which get_results() advances its record range on every pass.
my $range_size = 500;

# 1 when at least one command-line argument was given, otherwise 0.
# NOTE(review): nothing in this file reads this variable (save_mech declares
# its own lexical $dump) -- confirm whether it is still needed.
my $dump = 0;
$dump = 1 if @ARGV;
12 dpavlin 1282
13 dpavlin 1280 use WWW::Mechanize;
14     use Data::Dump qw/dump/;
15    
# Constructor options for the shared user agent.
#   autocheck  => 1      per the WWW::Mechanize docs, failed requests raise
#                        an error instead of returning silently
#   cookie_jar => undef  passed explicitly; presumably this turns cookie
#                        storage off -- TODO confirm against the library docs
my %mech_options = (
    autocheck  => 1,
    cookie_jar => undef,
);

# Single user agent shared by every sub in this script.
our $mech = WWW::Mechanize->new( %mech_options );

# Number of pages saved so far; save_mech() increments it and uses it to
# number the files it writes.
our $step = 0;
22 dpavlin 1280
# Save the current response held by $mech to disk and log one line about it.
#
# Arguments:
#   $mech - object providing ct(), save_content() and dump_all()
#           (the WWW::Mechanize instance in this script)
#   $path - optional destination file; when false it defaults to
#           /tmp/isi/NNNN.html or /tmp/isi/NNNN.txt, chosen by whether the
#           content type mentions "html", where NNNN is the zero-padded
#           value of the global $step counter
#
# Side effects: increments $step, creates /tmp/isi/ when missing, writes the
# content file plus a companion /tmp/isi/NNNN.dump.txt filled by dump_all(),
# and emits a log line with step, path, file size and content type.
#
# Dies when the directory or the dump file cannot be created or written.
sub save_mech {
    my ( $mech, $path ) = @_;
    $step++;

    my $dir = '/tmp/isi/';
    unless ( -d $dir ) {
        mkdir $dir or die "mkdir $dir: $!";
    }

    my $base_path = sprintf( '/tmp/isi/%04d', $step );

    # Use the public ct() accessor (as the log line below already does)
    # instead of reaching into the object's hash.
    $path ||= $base_path . ( $mech->ct =~ m{html}i ? '.html' : '.txt' );
    $mech->save_content( $path );
    warn "# [$step] $path ", -s $path, " ", $mech->ct, "\n";

    # An unchecked failed open would hand dump_all() an undefined handle,
    # so fail loudly here instead.
    my $dump_path = "$base_path.dump.txt";
    open( my $dump, '>', $dump_path ) or die "open $dump_path: $!";
    $mech->dump_all( $dump );

    # Buffered write errors only surface at close time.
    close( $dump ) or die "close $dump_path: $!";

    return;
}
34    
# First request of the run: fetch the entry URL (the step the log line calls
# "get session") and keep a copy of the response on disk.
{
    my $entry_url = 'http://isiknowledge.com/?DestApp=WOS';

    warn "# get session";
    $mech->get( $entry_url );
    save_mech( $mech );
}
38    
# Run one query through the advanced-search form and move on to the page
# reached through the first link whose URL matches /summary/.
#
# Arguments:
#   $q - query text placed into the form field named 'value(input1)'
#
# Every page visited is stored with save_mech().  Uses the global $mech.
sub search {
    my ( $query ) = @_;

    # follow the link whose URL mentions AdvancedSearch
    warn "# advanced serach";
    $mech->follow_link( url_regex => qr/AdvancedSearch/ );
    save_mech( $mech );

    # log whatever cookie jar the agent currently holds
    my $jar_text = dump( $mech->cookie_jar );
    warn "# cookie_jar ", $jar_text;

    # fill in the query and submit the form
    my %form_values = ( 'value(input1)' => $query );
    $mech->submit_form( fields => \%form_values );
    save_mech( $mech );

    warn "# summary";
    $mech->follow_link( url_regex => qr/summary/ );
    save_mech( $mech );
}
59    
# Export the records of the current result page in chunks and store every
# chunk as /tmp/isi.<label>.<from>-<to>.txt.
#
# Arguments:
#   $q - label used in the log line and in the output file names
#
# Uses the globals $mech and $range_size.  Each pass submits the form named
# 'summary_output_form' for one record range, follows the link matching
# /save_file/, saves the result and then goes back two pages.  The loop ends
# when the response to the form submission contains "invalid API call".
#
# NOTE(review): the range sent is FROM .. FROM + $range_size and the next
# pass starts again at FROM + $range_size, so the boundary number (501,
# 1001, ...) is sent both as mark_to and as the following mark_from.  If
# the service treats both bounds as inclusive that record is exported
# twice -- TODO confirm before changing.
sub get_results {
    my ( $label ) = @_;

    for ( my $from = 1 ; ; $from += $range_size ) {

        my $to = $from + $range_size;

        # values for the export form
        my %export_fields = (
            record_select_type => 'range',
            mark_from          => $from,
            mark_to            => $to,
            mark_id            => 'WOS',

            qo_fields => 'fullrecord',
            citedref  => 'citedref',

            save_options => 'plain_text',

            fields => 'Full',
            format => 'save',
        );

        $mech->submit_form(
            form_name => 'summary_output_form',
            fields    => \%export_fields,
            button    => 'save',
        );
        save_mech( $mech );

        # this response text ends the loop; step back to the previous page
        if ( $mech->content =~ m{invalid API call} ) {
            $mech->back;
            last;
        }

        warn "range $from - $to [$label]\n";
        $mech->follow_link( url_regex => qr/save_file/ );
        save_mech( $mech, "/tmp/isi.$label.$from-$to.txt" );

        # undo the two navigations made in this pass (form submit + link)
        $mech->back for 1 .. 2;
        #save_mech $mech;
    }

}
106 dpavlin 1289
107    
# Navigate from the current page to the citation report and from there to
# the page reached through the TotalCitingArticles link.
#
# Takes no arguments; works on the global $mech.  The page is stored with
# save_mech() before the first navigation and after each of the two links.
sub citations {
    my $report_link = qr/search_mode=CitationReport/;
    my $citing_link = qr/search_mode=TotalCitingArticles/;

    save_mech( $mech );

    warn "# citation report";
    $mech->follow_link( url_regex => $report_link );
    save_mech( $mech );

    warn "view citing articles";
    $mech->follow_link( url_regex => $citing_link );
    save_mech( $mech );
}
118 dpavlin 1289
# Fetch the publication-year breakdown for the current result page.
#
# Takes no arguments; works on the global $mech.  The URL of the link whose
# text matches /more options/ is rewritten so that its ra_name parameter
# becomes PublicationYear, that URL is fetched and saved with save_mech(),
# and the page is scanned for "YYYY (N)" labels.  Afterwards the agent goes
# back one page.
#
# Returns a list of [ year, count ] array references in the order found.
#
# Dies when the page has no "more options" link or when that link's URL
# carries no ra_name parameter.
sub years {
    # find_link() yields a false value when nothing matches; check it here
    # so the failure is reported clearly rather than as a method call on
    # an undefined value.
    my $more_link = $mech->find_link( text_regex => qr/more options/ )
        or die "no 'more options' link found on current page";

    my $years_url = $more_link->url_abs;
    warn "## $years_url";
    $years_url =~ s{ra_name=\w+}{ra_name=PublicationYear} || die "ra_name";
    warn "# refine years (hidden by javascript)";

    # hard-coded sample URL printed above the computed one -- presumably so
    # the two can be compared by eye
    warn "http://apps.isiknowledge.com/RAMore.do?product=WOS&search_mode=TotalCitingArticles&SID=T1o6bChdN9PGP1LN1Nh&qid=3&ra_mode=more&ra_name=PublicationYear&db_id=WOS&viewType=raMore\n$years_url\n";
    $mech->get( $years_url );
    save_mech $mech;

    my $html = $mech->content;
    my @years;

    # Each match is ">YYYY (N)</label" followed on the same line (no /s, so
    # "." stops at newlines) by value="PublicationYear_.  The match is cut
    # out of $html, which is what lets the loop advance to the next one.
    while ( $html =~ s{>(\d\d\d\d)\s\((\d+)\)</label.+?value="PublicationYear_}{} ) {
        push @years, [ $1 => $2 ];
    }
    warn "# years ",dump @years;
    $mech->back;
    return @years;
}
137    
# Pass 1: run the query, report its year breakdown, download its records.
search( $q );
years();
get_results( $q );

# Pass 2: switch to the citing articles and repeat; the '.citing' suffix on
# the label keeps these download file names apart from those of pass 1.
citations();
years();
get_results( $q . '.citing' );
145    

Properties

Name Value
svn:executable *

  ViewVC Help
Powered by ViewVC 1.1.26