1 |
#!/usr/bin/perl -w |
2 |
|
3 |
# This script will fetch the list of articles to which you have access
4 |
# (using IP authorisation) from ScienceDirect |
5 |
# |
6 |
# This version requires CSV dumps from ScienceDirect for Holdings data |
7 |
# and categories, but can output much more data about each record |
8 |
|
9 |
use LWP::UserAgent; |
10 |
use HTML::TreeBuilder; |
11 |
require Text::CSV; |
12 |
use Text::Unaccent; |
13 |
use strict; |
14 |
|
15 |
my $debug = 1;

my $file;

# uncomment following line if you want to use file instead of http connection
#$file="list.html";

# configure ScienceDirect CSV files location
my $csv_dir    = "/data/isis_data/sciencedirect";
my $j_holdings = "sd_JournalHoldingsRpt.txt";
my $j_category = "sd_Journal_Category.txt";

# URL to list of subscribed journals
my $url = 'http://www.sciencedirect.com/science?_ob=JournalListURL&_type=subscribed&_stype=title&subjColl=all&_auth=y&_update=y&_frameSeg=M&_title=all&_acct=C000050661&_version=1&_urlVersion=0&_userid=1034703&md5=6d4b6e263318a1d7d2a3b523d861f920';
my $html_codepage = "iso-8859-1";

my $csv     = Text::CSV->new();
my $journal;        # normalized journal title => array ref of record fields
my $c_wo_h  = 0;    # category without holding record
my $c_nr    = 0;    # number of categories assigned

my $j_basic    = 0; # journals output with basic data only
my $j_detailed = 0; # journals output with full holdings detail

print STDERR "unrolling $j_holdings\n";
40 |
|
41 |
# nuc -- normalize a journal title into an upper-case hash key.
#
# Strips accents (via Text::Unaccent, using the configured HTML codepage),
# replaces every non-word character with a space, squeezes runs of spaces,
# and upper-cases the result. Returns "" for undef or empty input.
sub nuc {
	my $s = shift @_;
	# was: shift @_ || return "" -- that also returned "" for the
	# legitimate title string "0"; test definedness/length instead
	return "" unless defined $s && length $s;
	$s = unac_string($html_codepage, $s);
	$s =~ s/[^\w]/ /g;
	$s =~ s/ +/ /g;
	return uc($s);
}
49 |
|
50 |
# Load the holdings report: one CSV record per journal, stored in %$journal
# keyed by the normalized title. An empty field is appended to each record
# as a placeholder for the categories merged in later.
# (was: 2-arg open on bareword filehandle H -- now 3-arg with lexical handle)
open(my $holdings_fh, '<', "$csv_dir/$j_holdings")
	|| die "can't open $csv_dir/$j_holdings: $!";
my $line = <$holdings_fh>; # skip header line
while (<$holdings_fh>) {
	chomp;
	$csv->parse($_) || warn "can't parse '$_': " . $csv->error_input;
	my @data = $csv->fields;
	my $key  = nuc($data[0]);
	push @data, ""; # for categories later...
	$journal->{$key} = \@data;
}
close($holdings_fh);
61 |
|
62 |
print STDERR "unrolling $j_category\n"; |
63 |
|
64 |
open(C,"$csv_dir/$j_category") || die "can't open $csv_dir/$j_category: $!"; |
65 |
$line = <C>; # skip header line |
66 |
while(<C>) { |
67 |
chomp; |
68 |
$csv->parse($_) || warn "can't parse '$_': ".$csv->error_input; |
69 |
my @data = $csv->fields; |
70 |
my $key = nuc($data[1]); |
71 |
if (! $journal->{$key}) { |
72 |
$c_wo_h++; |
73 |
next; |
74 |
} |
75 |
|
76 |
foreach my $i (4, 6, 8, 10) { |
77 |
push @{$journal->{$key}},$data[$i] || ""; |
78 |
if ($data[$i]) { |
79 |
$c_nr++; |
80 |
} |
81 |
} |
82 |
} |
83 |
close(C); |
84 |
|
85 |
print STDERR "$c_nr categories assigned, $c_wo_h categories with holdings\n"; |
86 |
|
87 |
# raise debug level when invoked with -d; guard against an empty @ARGV,
# since lc(undef) triggers an uninitialized-value warning under -w
$debug++ if (@ARGV && lc($ARGV[0]) eq "-d");
88 |
|
89 |
|
90 |
my $res;
if (!$file) {
	# Fetch the subscribed-journals list over HTTP.
	# (was: indirect object syntax "new LWP::UserAgent")
	my $ua = LWP::UserAgent->new;
	$ua->agent("Mjesec educational harvester -- contact mglavica\@ffzg.hr 0.0");
	$ua->timeout(60);
	#$ua->env_proxy();
	#$ua->proxy(['http', 'ftp'], 'http://proxy.carnet.hr:8001/');

	print STDERR "getting '$url'...\n" if ($debug);
	my $req = HTTP::Request->new(GET => $url);

	$res = $ua->request($req);
} elsif (!-e $file) {
	die "can't find feed file '$file'";
}
105 |
|
106 |
if ($file || $res->is_success) {
	print STDERR "parsing html...\n" if ($debug);
	my $tree = HTML::TreeBuilder->new;
	if ($file) {
		# was hard-coded "list.html"; honour whatever $file was configured
		$tree->parse_file($file);
	} else {
		$tree->parse($res->content);
	}

	# Walk every table row; each journal link points at a JournalURL.
	foreach my $tr ($tree->look_down('_tag', 'tr')) {
		# (removed an unused "my $link" that was immediately shadowed
		# by the loop variable below)
		foreach my $link ($tr->look_down('_tag', 'a')) {
			if ($link->attr('href') =~ m{/science\?_ob=JournalURL}) {
				my $j = nuc($link->as_text);
				if ($journal->{$j}) {
					# holdings record found: dump every stored field, numbered
					my $i = 0;
					foreach my $line (@{ $journal->{$j} }) {
						print $i++, ": $line\n";
					}
					$j_detailed++;
				} else {
					# no holdings record: emit just the title and URL
					print "0: ", $link->as_text . "\n";
					print "7: http://www.sciencedirect.com", $link->attr('href') . "\n";
					$j_basic++;
					print STDERR "can't find details for $j\n" if ($debug);
				}

				print "\n";
			}
		}
	}

	$tree->delete; # clear memory!

} else {
	warn "can't fetch web page from '$url'";
}
143 |
|
144 |
# Final run summary on STDERR: total journal count plus the basic/detailed split.
my $j_total = $j_basic + $j_detailed;
print STDERR "Processed $j_total journals, $j_basic with basic data and $j_detailed detailed\n";
145 |
|