1 |
dpavlin |
1.1 |
#!/usr/bin/perl -w |
2 |
|
|
|
3 |
|
|
use LWP::UserAgent; |
4 |
|
|
use HTML::TreeBuilder; |
5 |
|
|
use strict; |
6 |
|
|
require "../common.pm"; |
7 |
|
|
|
8 |
|
|
my $debug=1; |
9 |
|
|
|
10 |
|
|
my $dir = open_data_files("sciencedirect"); |
11 |
|
|
my $last_tell=0; |
12 |
|
|
|
13 |
|
|
print MPS "M working...\n"; |
14 |
|
|
|
15 |
dpavlin |
1.2 |
my $base_url = 'http://www.sciencedirect.com'; |
16 |
dpavlin |
1.1 |
my $url = $base_url . '/science?_ob=JournalListURL&_type=subscribed&_stype=title&subjColl=all&_auth=y&_update=y&_frameSeg=M&_title=all&_acct=C000050661&_version=1&_urlVersion=0&_userid=1034703&md5=6d4b6e263318a1d7d2a3b523d861f920'; |
17 |
|
|
|
18 |
|
|
$debug++ if (lc($ARGV[0]) eq "-d"); |
19 |
|
|
|
20 |
|
|
sub print_debug { |
21 |
|
|
return if (! $debug); |
22 |
|
|
open(DEBUG,">> debug") || warn "can't open debug file!"; |
23 |
|
|
print DEBUG "###",@_; |
24 |
|
|
print @_; |
25 |
|
|
close(DEBUG); |
26 |
|
|
} |
27 |
|
|
|
28 |
|
|
print_debug("debug level $debug"); |
29 |
|
|
|
30 |
|
|
my $ua = new LWP::UserAgent; |
31 |
|
|
$ua->agent("Mjesec educational harvester -- contact mglavica\@ffzg.hr 0.0"); |
32 |
|
|
$ua->timeout(60); |
33 |
|
|
#$ua->env_proxy(); |
34 |
|
|
#$ua->proxy(['http', 'ftp'], 'http://proxy.carnet.hr:8001/'); |
35 |
|
|
|
36 |
dpavlin |
1.6 |
print "getting '$url'...\n"; |
37 |
dpavlin |
1.4 |
my $req = HTTP::Request->new(GET => $url); |
38 |
dpavlin |
1.1 |
|
39 |
dpavlin |
1.4 |
my $res = $ua->request($req); |
40 |
|
|
if ($res->is_success) { |
41 |
dpavlin |
1.6 |
print "parsing html...\n"; |
42 |
dpavlin |
1.1 |
my $tree = HTML::TreeBuilder->new; |
43 |
dpavlin |
1.4 |
# $tree->parse_file("list.html"); # ! |
44 |
|
|
$tree->parse($res->content); |
45 |
dpavlin |
1.1 |
|
46 |
|
|
foreach my $tr ($tree->look_down('_tag', 'tr')) { |
47 |
|
|
my $link; |
48 |
|
|
if ($link = $tr->look_down('_tag','a')) { |
49 |
|
|
if ($link->attr('href') =~ m{/science\?_ob=JournalURL}) { |
50 |
|
|
my $bib = ""; |
51 |
|
|
my $mps = ""; |
52 |
|
|
|
53 |
|
|
$bib .= "%tip Èasopis\n"; |
54 |
dpavlin |
1.2 |
$bib .= "%tip on-line\n"; |
55 |
dpavlin |
1.1 |
# url |
56 |
|
|
$bib .= "%856 $base_url".$link->attr('href')."\n"; |
57 |
|
|
# naslov |
58 |
dpavlin |
1.2 |
$bib .= "%200+ ".$link->as_text."\n"; |
59 |
dpavlin |
1.1 |
$mps .= mps_expand(2,$link->as_text); |
60 |
|
|
|
61 |
dpavlin |
1.3 |
# tip |
62 |
dpavlin |
1.5 |
$mps .= mps_expand(17,"on-line casopis"); |
63 |
dpavlin |
1.3 |
|
64 |
dpavlin |
1.4 |
$mps .= "H ".$link->as_text." <i>(on-line, ScrienceDirect)</i>\n"; |
65 |
dpavlin |
1.1 |
|
66 |
dpavlin |
1.2 |
print R $bib."\n"; |
67 |
dpavlin |
1.1 |
$mps .= "T document text/plain ".(tell(R) - $last_tell)." $dir/bib $last_tell ".tell(R)."\n"; |
68 |
|
|
$last_tell=tell(R); |
69 |
|
|
|
70 |
dpavlin |
1.2 |
print R "\n"; |
71 |
dpavlin |
1.1 |
|
72 |
|
|
$mps .= "E\n"; |
73 |
|
|
|
74 |
|
|
print S $mps; |
75 |
|
|
print MPS $mps; |
76 |
|
|
} |
77 |
|
|
} |
78 |
|
|
} |
79 |
|
|
|
80 |
|
|
$tree->delete; # clear memory! |
81 |
|
|
|
82 |
|
|
} else { |
83 |
|
|
warn "can't fetch web page from '$url'"; |
84 |
|
|
} |
85 |
|
|
|
86 |
|
|
print S "M over and out\nX\n"; |
87 |
|
|
print MPS "M over and out\nX\n"; |
88 |
|
|
close(MPS); |