Patch for trunk/bin/isi-download-results.pl (r1291 to r1292).

What this patch changes, as far as the hunks below show:

  * save_mech no longer receives the mechanize object; it takes one
    optional path argument and uses the $mech visible in the enclosing
    scope.  The .html/.txt suffix is now appended to caller-supplied
    paths as well as to the default one.
  * search builds a per-call query: when @ranges is non-empty it shifts
    the next list of years off @ranges and appends
    " AND (PY=... OR PY=...)" to $q.
  * get_results takes an optional description that is appended to the
    output path, and advances by $range_size - $overlap so that
    consecutive result ranges overlap.
  * years accepts counts written with thousands separators and groups
    the sorted years into @ranges so that the summed count of a group
    stays within $max_cites (a single year larger than the limit still
    gets a group of its own).
  * The main flow calls get_results inside a do/while loop over @ranges.

Amended relative to the committed revision: the "just take all" shortcut
at the end of years tested $#ranges == 1, which is true only when there
are two ranges.  It now tests @ranges == 1 (exactly one range), which is
the case where a per-year re-search is unnecessary.

NOTE(review): the last hunk removes the only call to years that is
visible in this diff, and nothing else shown here fills @ranges.  Confirm
whether years is still called elsewhere; otherwise the loop runs once.

NOTE(review): tabs and blank lines in this copy were restored from the
hunk line counts, and the removed "while ( $html =~ s{...}" line was
rebuilt from a damaged copy.  Verify against the repository before
applying.

--- trunk/bin/isi-download-results.pl	2009/09/19 15:05:12	1291
+++ trunk/bin/isi-download-results.pl	2009/09/19 20:35:43	1292
@@ -3,12 +3,17 @@
 use warnings;
 use strict;
 
+# Advanced search syntax:
+# http://images.isiknowledge.com/WOK46/help/WOS/h_advanced_examples.html
+
 our $q = 'AD=Croatia';
 my $range_size = 500;
+my $overlap = 10; # between previous and this range
 
-my $dump = @ARGV ? 1 : 0;
+$q = 'TS=psychology AND AD=Croatia' if @ARGV; # FIXME debug
 
-$q = 'TS=psychology AND AD=Croatia';
+my $max_cites = 5000; # ISI limit to get cites
+our @ranges;
 
 use WWW::Mechanize;
 use Data::Dump qw(dump);
@@ -26,10 +31,11 @@
 mkdir $dir;
 
 sub save_mech {
-	my ( $mech, $path ) = @_;
+	my $path = shift;
 	$step++;
 	my $base_path = sprintf('%s/%04d', $dir,$step);
-	$path ||= $base_path . ( $mech->{ct} =~ m{html}i ? '.html' : '.txt' );
+	$path ||= $base_path;
+	$path .= $mech->{ct} =~ m{html}i ? '.html' : '.txt';
 	$mech->save_content( $path );
 	warn "# [$step] $path ", -s $path, " ", $mech->ct, "\n";
 	open(my $dump, '>', "$base_path.dump.txt");
@@ -38,34 +44,44 @@
 
 warn "# get session";
 $mech->get( 'http://isiknowledge.com/?DestApp=WOS' );
-save_mech $mech;
+save_mech;
 
 sub search {
 	warn "# advanced serach";
 	$mech->follow_link( url_regex => qr/AdvancedSearch/ );
-	save_mech $mech;
+	save_mech;
 
 	warn "# cookie_jar ", dump $mech->cookie_jar;
 
+	my $q_this = $q;
+
+	if ( @ranges ) {
+		$q_this .= ' AND (' . join(' OR ', map { "PY=$_" } @{ shift @ranges } ) . ')';
+	}
+
+	warn "# submit_form search: $q_this\n";
 	$mech->submit_form(
 		fields => {
-			'value(input1)' => $q,
-		}
+			'value(input1)' => $q_this,
+		},
 	);
-	save_mech $mech;
+	save_mech;
 
 	warn "# summary";
 	$mech->follow_link( url_regex => qr/summary/ );
-	save_mech $mech;
+	save_mech;
 }
 
 sub get_results {
+	my $desc = shift;
 	my $from = 1;
 
 	while ( 1 ) {
 
 		my $to = $from + $range_size;
 
+		warn "# submit_form results $from - $to\n";
+
 		$mech->submit_form(
 			form_name => 'summary_output_form',
 			fields => {
@@ -84,63 +100,97 @@
 			},
 			button => 'save',
 		);
-		save_mech $mech;
+		save_mech;
+
 		if ( $mech->content =~ m{invalid API call} ) {
 			$mech->back;
 			last;
 		}
-		warn "range $from - $to [$q]\n";
+
+		my $path = "/tmp/isi.$q.$from-$to";
+		$path .= '.' . $desc if $desc;
+
+		warn "save $from - $to into $path\n";
 		$mech->follow_link( url_regex => qr/save_file/ );
-		save_mech $mech => "/tmp/isi.$q.$from-$to.txt";
+		save_mech $path;
 
-		$from += $range_size;
+		$from += $range_size - $overlap;
 		$mech->back;
 		$mech->back;
-		#save_mech $mech;
-
+		#save_mech;
 	}
-
 }
 
 sub citations {
-	save_mech $mech;
 
 	warn "# citation report";
 	$mech->follow_link( url_regex => qr/search_mode=CitationReport/ );
-	save_mech $mech;
+	save_mech;
 
 	warn "view citing articles";
 	$mech->follow_link( url_regex => qr/search_mode=TotalCitingArticles/ );
-	save_mech $mech;
+	save_mech;
 }
 
 sub years {
-	my $years_url = $mech->find_link( text_regex => qr/more options/ );
+	my $years_url = $mech->find_link( url_regex => qr/ra_name=/ );
 	if ( ! $years_url ) {
-		warn "W: can't find years\n";
+		warn "W: can't find ra_name link\n";
 		return;
 	}
 	$years_url = $years_url->url_abs;
 	warn "## $years_url";
 	if ( $years_url !~ s{ra_name=\w+}{ra_name=PublicationYear} ) {
-		warn "W: no ra_name\n";
+		warn "W: no ra_name in $years_url\n";
 		return;
 	}
 
 	warn "# refine years (hidden by javascript)";
-	warn "http://apps.isiknowledge.com/RAMore.do?product=WOS&search_mode=TotalCitingArticles&SID=T1o6bChdN9PGP1LN1Nh&qid=3&ra_mode=more&ra_name=PublicationYear&db_id=WOS&viewType=raMore\n$years_url\n";
+#	warn "http://apps.isiknowledge.com/RAMore.do?product=WOS&search_mode=TotalCitingArticles&SID=T1o6bChdN9PGP1LN1Nh&qid=3&ra_mode=more&ra_name=PublicationYear&db_id=WOS&viewType=raMore\n$years_url\n";
 	$mech->get( $years_url );
-	save_mech $mech;
+	save_mech;
 
 	my $html = $mech->content;
 	my $years;
-	while ( $html =~ s{>(\d\d\d\d)\s\((\d+)\)<}{} ) {
-		$years->{$1} = $2;
+	while ( $html =~ s{(\d{4})\s\(([\d,]+)\)}{} ) {
+		my ( $year, $count ) = ( $1, $2 );
+		$count =~ s{,}{}g;
+		$years->{$year} = $count;
 	}
 	warn "# years ",dump $years;
 
 	$mech->back;
+
+	my @y = sort keys %$years;
+
+	my $y = shift @y;
+	my $size = $years->{$y};
+
+	@ranges = ();
+	my $cites_range;
+	$cites_range = [$y] if $y;
+
+	foreach my $y ( @y ) {
+		if ( $size + $years->{$y} > $max_cites ) {
+			push @ranges, $cites_range;
+			warn "# cites_range $size years ",dump( $cites_range ),$/;
+
+			$cites_range = [];
+			$size = 0;
+		}
+		$size += $years->{$y};
+		push @$cites_range, $y;
+	}
+
+	if ( $cites_range ) {
+		push @ranges, $cites_range;
+		warn "# cites_range $size years ",dump( $cites_range ), " FINAL\n"
+	}
+
+	warn '# ranges ', dump @ranges;
+	@ranges = () if @ranges == 1; # exactly one range covers everything: just take all
 
+	return $years;
 }
 
 
@@ -150,7 +200,14 @@
 
 citations;
 
-years;
-$q .= '.citing';
-get_results;
+
+do {
+	my $part;
+	if ( @ranges ) {
+		$part .= $ranges[0]->[0] . '.';
+		search;
+	}
+	$part .= 'citing';
+	get_results $part;
+} while ( @ranges );
 