/[safari]/get_book.sh
This is a repository of my old source code, which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Contents of /get_book.sh

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.11 - (show annotations)
Thu Jun 17 19:43:24 2004 UTC (19 years, 10 months ago) by dpavlin
Branch: MAIN
Changes since 1.10: +15 -9 lines
File MIME type: application/x-sh
lot of improvements

#!/bin/sh

# get_book.sh ISBN - mirror a book from safari.oreilly.com for offline use.
# Requires: fping, wget, perl, hindent, and a cookie.txt with a valid session.

# proxy settings (same as in firebird)
# NOTE: the original chained `a && b || c && d`; since && and || are
# left-associative, a successful first branch still executed the final
# `&& export ...proxy.lan...`, clobbering the proxy URL.  if/elif is correct.
if fping -q proxy ; then
	export http_proxy=http://proxy:8080
elif fping -q proxy.lan ; then
	export http_proxy=http://proxy.lan:8080
fi

# user agent (same as in firebird)
ua="Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.6) Gecko/20040506 Firefox/0.8"
cookie_file="cookie.txt"

# wait between pages (seconds; upper bound of the random sleep in mirror)
export wait=120

if [ -n "$http_proxy" ] ; then
	echo "Using proxy $http_proxy"
fi

if [ -z "$1" ] ; then
	echo "Usage: $0 ISBN"
	exit 1
fi

# resume support: a finished (or aborted) run leaves pages in orig/
if [ -e orig ] ; then
	echo "orig directory found. Resume download? [Y/n]"
	read ans
	if [ "$ans" = "n" ] ; then
		exit 1
	fi
	mv orig/* .
	rm -Rf orig

	# delete pages that are really promo/preview screens so they get re-fetched
	grep -l 'promo.asp' * | xargs -i rm {}
	grep -l 'This is only a preview of the full book' * | xargs -i rm {}
fi

# normalize the ISBN argument: strip dashes, uppercase
isbn=$(echo "$1" | sed 's/-//g' | tr '[a-z]' '[A-Z]')
36
# mirror URL - fetch one safari page with wget unless already cached locally.
# Index and toc pages are fetched anonymously; everything else sends the
# session cookie read from $cookie_file.  Sleeps a random interval (bounded
# by $wait) after each real download to avoid hammering the server.
# Globals read: cookie_file, ua, wait (via perl's %ENV).
# Exits the script if the cookie file is missing or empty.
mirror() {

	url="$1"

	# derive the local filename the same way "wget -nH" will name it:
	# scheme/host stripped, fragment dropped, slashes percent-encoded
	file=$(echo "$url" | sed -e s,http://[^?]*?,index.html?, -e s,#.*$,, -e s,/,%2F,g)
	if [ -e "$file" ] ; then
		# already downloaded -- just print a progress dot
		echo -n "."
		return
	fi

	cookie=""
	if echo "$url" | grep '/index' >/dev/null ; then
		echo -n "no login (index) "
	elif echo "$url" | grep 'mode=toc' >/dev/null ; then
		echo -n "no login (toc) "
	else
		if [ ! -e "$cookie_file" ] ; then
			echo "cookies file $cookie_file doesn't exist! Please create it."
			echo "It should be in format:"
			echo "Cookie: Site=UICode=&Portal=oreilly&GUID=..."
			exit 1
		fi
		read cookie < "$cookie_file"
		if [ -z "$cookie" ] ; then
			echo "Empty cookie file $cookie_file !"
			exit 1
		fi
	fi

	if [ -z "$cookie" ] ; then
		echo "$url [no cookie]"
	else
		echo "$url [with cookie]"
	fi

	wget -q -p -nH -nc -k -t 1 -U "$ua" --cookies=off --header="$cookie" "$url"
	# random polite delay, 0..$wait seconds (default 120)
	perl -e '$t=rand($ENV{wait} || 120);print "sleep for $t sec.\n"; sleep($t);'

}
77
# geturl FILE [ISBN] - scrape toc/sec navigation links out of a saved HTML
# page and append them, rewritten as absolute safari URLs, to the "in" list.
# Only the first argument is used; callers also pass the ISBN (ignored here).
# Requires the external "hindent" HTML re-indenter.
geturl() {
	# pattern is quoted now: unquoted mode=[st][eo]c was a glob that the
	# shell could expand against files in the current directory
	hindent -s "$1" | grep -i href | grep 'mode=[st][eo]c' | \
	sed -e 's/^.*<a.*href="//i' \
	    -e 's/".*//' -e 's/amp;//g' \
	    -e 's,^[^\?]*\?,http://safari.oreilly.com/,' \
	    -e 's/#.*$//' \
	    -e 's/\&srchText=//' \
	    -e 's/open=false/open=true/' | \
	grep '&s=1&b=1&f=1&t=1&c=1&u=1&r=&o=1' | \
	grep -i "xmlid=[0-9A-Za-z\-][0-9A-Za-z\-]*" | \
	grep -vi "xmlid=[0-9A-Za-z\-]*/[0-9][0-9][0-9][0-9][0-9][0-9][0-9]" | \
	sort -u >> in
}
91
# uniqurl - deduplicate the "in" work list in place: non-view URLs first,
# then view-mode index pages, each group sorted and uniqued.  Prints the
# number of unique links.  (The original left in.tmp behind; clean it up.)
uniqurl() {
	mv in in.tmp
	grep -v 'view=[A-Z]' in.tmp | sort -u > in
	grep 'view=[A-Z].*/index' in.tmp | sort -u >> in
	rm -f in.tmp
	links=$(wc -l < in)
	# some wc implementations pad the count with spaces; normalize it
	links=$((links + 0))
	echo "found $links unique links"
}
99
# mirror_in - download every URL listed in "in", aborting the whole run if
# safari logs us out, locks the account, disables the session, or serves
# only a book preview.
#
# BUGFIX: the original used `cat in | while ...`, which runs the loop in a
# pipeline subshell, so every `exit 1` below only terminated the subshell
# and the script happily kept going.  Redirecting `< in` keeps the loop in
# the current shell and makes the aborts real.
mirror_in() {
	while read url ; do
		mirror "$url"

		# inspect the three most recently written pages for error markers
		if grep 'promo.asp' $(ls -t index.html* | head -3) >/dev/null ; then
			echo "WARNING: safari seems to logout you as user. Aborting."
			exit 1
		fi

		if grep -i '>Account locked<' $(ls -t index.html* | head -3) >/dev/null ; then
			echo "WARNING: your safari account is locked. Aborting."
			exit 1
		fi

		if grep -i 'session disabled' $(ls -t index.html* | head -3) >/dev/null ; then
			echo "WARNING: your safari session is disabled. Aborting."
			exit 1
		fi

		if grep -i 'This is only a preview of the full book' $(ls -t index.html* | head -3) >/dev/null ; then
			echo "WARNING: you didn't add this book to your bookshelf!"
			exit 1
		fi
	done < in
	echo
}
127
# --- main driver: seed, then alternate link-harvesting and mirroring ---

# seed the work list with the book's top page
echo -n > in
mirror "http://safari.oreilly.com/$isbn"
echo

# the first fetched page's filename carries the canonical XmlId of the book
file=$(ls *index.html?XmlId=*)
isbn=$(echo "$file" | cut -d= -f2)
echo "extract URLs from first page $file... [$isbn]"
geturl "$file" "$isbn"
uniqurl

mirror_in

# pass 1: harvest links from everything downloaded so far
echo -n "extracting URLs [1]"
ls index.html* | while read file ; do
	echo -n "."
	geturl "$file" "$isbn"
done
echo

uniqurl

mirror_in

# pass 2: rebuild the work list from scratch and mirror anything still new
echo -n > in
echo -n "extracting URLs [2]"
ls index.html* | while read file ; do
	echo -n "."
	geturl "$file" "$isbn"
done

uniqurl

mirror_in

# convert links in the downloaded html; filter.pl lives next to this script.
# BUGFIX: the original computed dir by stripping basename from $0, which
# yields "" (so it ran /filter.pl) when invoked as plain "get_book.sh";
# dirname correctly yields "." in that case.
dir=$(dirname "$0")
ls index.html* | xargs -i "$dir"/filter.pl {}
mkdir orig
mv index.html* orig/
168

  ViewVC Help
Powered by ViewVC 1.1.26