--- trunk/bin/isi-download-results.pl 2009/09/19 10:23:35 1289 +++ trunk/bin/isi-download-results.pl 2009/09/19 12:43:03 1290 @@ -23,32 +23,39 @@ sub save_mech { my ( $mech, $path ) = @_; $step++; - $path ||= sprintf('/tmp/isi.%02d.%s', $step, $mech->{ct} =~ m{html}i ? 'html' : 'txt' ); + mkdir '/tmp/isi/' unless -e '/tmp/isi'; + my $base_path = sprintf('/tmp/isi/%04d', $step); + $path ||= $base_path . ( $mech->{ct} =~ m{html}i ? '.html' : '.txt' ); $mech->save_content( $path ); - warn "# [$step] $path ", -s $path, " ", $mech->ct; - $mech->dump_all if $dump; + warn "# [$step] $path ", -s $path, " ", $mech->ct, "\n"; + open(my $dump, '>', "$base_path.dump.txt"); + $mech->dump_all($dump); } warn "# get session"; $mech->get( 'http://isiknowledge.com/?DestApp=WOS' ); save_mech $mech; -warn "# advanced serach"; -$mech->follow_link( url_regex => qr/AdvancedSearch/ ); -save_mech $mech; - -warn "# cookie_jar ", dump $mech->cookie_jar; +sub search { + my $q = shift; -$mech->submit_form( - fields => { - 'value(input1)' => $q, - } -); -save_mech $mech; + warn "# advanced serach"; + $mech->follow_link( url_regex => qr/AdvancedSearch/ ); + save_mech $mech; + + warn "# cookie_jar ", dump $mech->cookie_jar; + + $mech->submit_form( + fields => { + 'value(input1)' => $q, + } + ); + save_mech $mech; -warn "# summary"; -$mech->follow_link( url_regex => qr/summary/ ); -save_mech $mech; + warn "# summary"; + $mech->follow_link( url_regex => qr/summary/ ); + save_mech $mech; +} sub get_results { my $q = shift; @@ -80,10 +87,10 @@ if ( $mech->content =~ m{invalid API call} ) { $mech->back; - return; + last; } - warn "# save_file $from - $to [$q]"; + warn "range $from - $to [$q]\n"; $mech->follow_link( url_regex => qr/save_file/ ); save_mech $mech => "/tmp/isi.$q.$from-$to.txt"; @@ -93,19 +100,46 @@ $mech->back; #save_mech $mech; - } # while + } } -get_results $q; -save_mech $mech; -warn "# citations"; -$mech->follow_link( url_regex => qr/search_mode=CitationReport/ ); -save_mech $mech; 
+sub citations { + save_mech $mech; + warn "# citation report"; + $mech->follow_link( url_regex => qr/search_mode=CitationReport/ ); + save_mech $mech; + + warn "view citing articles"; + $mech->follow_link( url_regex => qr/search_mode=TotalCitingArticles/ ); + save_mech $mech; +} -$mech->follow_link( url_regex => qr/search_mode=TotalCitingArticles/ ); -save_mech $mech; +sub years { + my $years_url = $mech->find_link( text_regex => qr/more options/ )->url_abs; + warn "## $years_url"; + $years_url =~ s{ra_name=\w+}{ra_name=PublicationYear} || die "ra_name"; + warn "# refine years (hidden by javascript)"; + warn "http://apps.isiknowledge.com/RAMore.do?product=WOS&search_mode=TotalCitingArticles&SID=T1o6bChdN9PGP1LN1Nh&qid=3&ra_mode=more&ra_name=PublicationYear&db_id=WOS&viewType=raMore\n$years_url\n"; + $mech->get( $years_url ); + save_mech $mech; + + my $html = $mech->content; + my @years; + while ( $html =~ s{>(\d\d\d\d)\s\((\d+)\)<}{} ) { + push @years, [ $1 => $2 ]; + } + warn "# years ",dump @years; + $mech->back; + return @years; +} + +search $q; +years; +get_results $q; +citations; +years; get_results $q . '.citing';