/[safari]/get_book.sh
This is a repository of my old source code, which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Annotation of /get_book.sh

Parent Directory | Revision Log


Revision 1.10 - (hide annotations)
Wed Jun 2 19:18:21 2004 UTC (19 years, 10 months ago) by dpavlin
Branch: MAIN
Changes since 1.9: +1 -1 lines
File MIME type: application/x-sh
fix wget without cookie

#!/bin/bash
#
# get_book.sh - mirror the pages of one book from safari.oreilly.com.
#
# Usage: get_book.sh ISBN
#
# Downloaded pages are stored as index.html* files in the current directory
# and moved into ./orig when the run completes.

# proxy settings (same as in firebird)
fping proxy && export http_proxy=http://proxy:8080
# user agent (same as in firebird)
ua="Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.6) Gecko/20040506 Firefox/0.8"
# file holding the "Cookie: ..." request header used for logged-in pages
cookie_file="cookie.txt"

# wait between pages: upper bound (in seconds) of the random pause that
# mirror() takes after each download; exported because perl reads it from
# the environment
export wait=120

if [ -z "$1" ] ; then
  echo "Usage: $0 ISBN" >&2
  exit 1
fi

if [ -e orig ] ; then
  echo "orig directory found. Resume download? [Y/n]"
  read -r ans
  case "$ans" in
    [nN]*) exit 1 ;;
  esac
  mv -- orig/* .
  rm -Rf orig

  # Drop pages that were saved while logged out or in preview mode so they
  # are fetched again.  Only index.html* is inspected: scanning every file
  # in the directory would also match (and delete) this script itself,
  # because it contains both marker strings.
  for page in index.html*; do
    [ -f "$page" ] || continue
    if grep -q -e 'promo.asp' \
        -e 'This is only a preview of the full book' -- "$page"; then
      rm -f -- "$page"
    fi
  done
fi

# ISBN with all dashes removed
isbn=$(printf '%s\n' "$1" | tr -d '-')

#######################################
# Download one page (plus its requisites) with wget unless it is already
# present in the current directory, then pause for a random time.
# Globals:
#   ua           (read)    User-Agent string passed to wget
#   cookie_file  (read)    file whose first line is a "Cookie: ..." header
#   wait         (read, via environment) upper bound of the pause in seconds
#   url, file, cookie (written) scratch values, left global as before
# Arguments:
#   $1 - URL to fetch
# Outputs:
#   progress messages on stdout, cookie errors on stderr
# Returns:
#   0 normally; terminates the shell with status 1 when a login cookie is
#   required but the cookie file is missing or empty
#######################################
mirror() {
  url="$1"

  # Local file name expected for this URL: everything up to the first '?'
  # becomes "index.html?", the fragment is dropped and '/' is written as %2F.
  file=$(printf '%s\n' "$url" \
    | sed -e 's,http://[^?]*?,index.html?,' -e 's,#.*$,,' -e 's,/,%2F,g')
  if [ -e "$file" ] ; then
    # already downloaded: just show progress
    printf '.'
    return
  fi

  # Empty means "send no Cookie header".  (A single blank used to be stored
  # here to keep wget happy, which made the "[no cookie]" branch below
  # unreachable.)
  cookie=""
  case "$url" in
    */index*)
      printf 'no login (index) '
      ;;
    *mode=toc*)
      printf 'no login (toc) '
      ;;
    *)
      if [ ! -e "$cookie_file" ] ; then
        {
          echo "cookies file $cookie_file doesn't exits! Please create it."
          echo "It should be in format:"
          echo "Cookie: Site=UICode=&Portal=oreilly&GUID=..."
        } >&2
        exit 1
      fi
      # only the first line of the file is used
      read -r cookie < "$cookie_file"
      if [ -z "$cookie" ] ; then
        echo "Empty cookie file $cookie_file !" >&2
        exit 1
      fi
      ;;
  esac

  if [ -z "$cookie" ] ; then
    printf '%s [no cookie]\n' "$url"
  else
    printf '%s [with cookie]\n' "$url"
  fi

  # Build the wget command line in the positional parameters so the extra
  # header is added only when there is a cookie to send.
  set -- -q -p -nH -nc -k -t 1 -U "$ua" --no-cookies
  if [ -n "$cookie" ] ; then
    set -- "$@" --header="$cookie"
  fi
  wget "$@" "$url"

  # random pause of up to $wait seconds (30 when wait is unset)
  perl -e '$t=rand($ENV{wait} || 30);print "sleep for $t sec.\n"; sleep($t);'
}
73    
#######################################
# Extract the book's section/toc links from a downloaded page and append
# them, as absolute URLs, to the file "in".
# Arguments:
#   $1 - HTML file to scan (passed to hindent)
#   $2 - ISBN; only links mentioning it are kept
# Outputs:
#   appends sorted, de-duplicated URLs to ./in
# Returns:
#   2 when an argument is missing, otherwise the status of the pipeline
#######################################
geturl() {
  if [ "$#" -lt 2 ] || [ -z "$1" ] || [ -z "$2" ] ; then
    echo "geturl: usage: geturl FILE ISBN" >&2
    return 2
  fi

  # 1. keep lines with an href whose query has mode=sec.../toc...
  # 2. reduce each line to the href value, undo &amp;, make it absolute,
  #    drop the fragment and the empty srchText parameter, force open=true
  # 3. keep only fully parameterised links for this ISBN, skipping those
  #    where the ISBN is followed by "/" and seven digits
  hindent -s "$1" | grep -i href | grep 'mode=[st][eo]c' | \
    sed -e 's/^.*<a.*href="//i' \
      -e 's/".*//' -e 's/amp;//g' \
      -e 's,^[^\?]*\?,http://safari.oreilly.com/,' \
      -e 's/#.*$//' \
      -e 's/\&srchText=//' \
      -e 's/open=false/open=true/' | \
    grep '&s=1&b=1&f=1&t=1&c=1&u=1&r=&o=1' | \
    grep -e "$2" | \
    grep -v -e "$2/[0-9][0-9][0-9][0-9][0-9][0-9][0-9]" | \
    sort -u >> in
}
87    
#######################################
# Rewrite the URL list "in": keep every URL without a view=<capital> query
# parameter, plus those view=<capital> URLs that point at an /index page,
# each group sorted and de-duplicated.  The previous list stays in in.tmp.
# Globals:
#   links (written) number of URLs left in "in"
# Outputs:
#   "found N unique links" on stdout
# Returns:
#   non-zero when "in" cannot be renamed, 0 otherwise
#######################################
uniqurl() {
  # stop here rather than filter a stale in.tmp from an earlier call
  mv in in.tmp || return
  grep -v 'view=[A-Z]' in.tmp | sort -u > in
  grep 'view=[A-Z].*/index' in.tmp | sort -u >> in
  # read the count from stdin so no file name is printed; the arithmetic
  # expansion strips the padding some wc implementations add
  links=$(( $(wc -l < in) ))
  echo "found $links unique links"
}
95    
#######################################
# Succeed when any of the three most recently modified index.html* files in
# the current directory matches.
# Arguments:
#   grep options and pattern, e.g.:  -i -e 'some text'
# Returns:
#   0 on a match, 1 otherwise (also when no page exists yet)
#######################################
_recent_pages_match() {
  # The explicit ( ) keeps "exit" confined to a subshell in every shell;
  # grep reads /dev/null on stdin so it can never consume the file list.
  ls -t index.html* 2>/dev/null | head -n 3 | (
    while IFS= read -r _rpm_page; do
      grep -q "$@" -- "$_rpm_page" </dev/null && exit 0
    done
    exit 1
  )
}

#######################################
# Fetch every URL listed in the file "in" with mirror, checking after each
# download whether the newest pages show that the session is unusable.
# Globals:
#   url (written) URL currently being processed
# Outputs:
#   whatever mirror prints, then a newline; warnings go to stderr
# Returns:
#   0 when the list is exhausted; terminates the shell with status 1 as
#   soon as one of the warning conditions is detected
#######################################
mirror_in() {
  # The list is read through descriptor 3 rather than a pipe: the loop then
  # runs in the current shell, so "exit 1" really aborts the script, and
  # commands inside the loop cannot swallow pending URLs from stdin.
  while IFS= read -r url <&3; do
    [ -n "$url" ] || continue
    mirror "$url"
    #sleep $wait

    if _recent_pages_match -e 'promo.asp' ; then
      echo "WARNING: safari seems to logout you as user. Aborting." >&2
      exit 1
    fi

    if _recent_pages_match -i -e '>Account locked<' ; then
      echo "WARNING: your safari account is locked. Aborting." >&2
      exit 1
    fi

    if _recent_pages_match -i -e 'session disabled' ; then
      echo "WARNING: your safari session is disabled. Aborting." >&2
      exit 1
    fi

    if _recent_pages_match -i -e 'This is only a preview of the full book' ; then
      echo "WARNING: you didn't add this book to your bookshelf!" >&2
      exit 1
    fi
  done 3< in
  echo
}
123    
# ---- main -----------------------------------------------------------------
# Fetch the book's start page, then crawl in three rounds: each round
# extracts links from the pages on disk, filters the list and downloads it.

# start with an empty URL list (": >" truncates without relying on echo -n)
: > in
mirror "http://safari.oreilly.com/?XmlId=$isbn"
echo

echo "extract URLs from first page..."
geturl "index.html?XmlId=$isbn" "$isbn"
uniqurl

mirror_in

# Round 1: the list is not truncated here, so links from the first page are
# offered again; mirror skips pages that already exist.
printf 'extracting URLs [1]'
for file in index.html*; do
  [ -e "$file" ] || continue
  printf '.'
  geturl "$file" "$isbn"
done
echo

uniqurl

mirror_in

# Round 2: start again from an empty list.
: > in
printf 'extracting URLs [2]'
for file in index.html*; do
  [ -e "$file" ] || continue
  printf '.'
  geturl "$file" "$isbn"
done
echo

uniqurl

mirror_in

# convert links in html
# filter.pl is expected next to this script; dirname yields "." for a bare
# script name instead of an empty string (which would run /filter.pl)
dir=$(dirname -- "$0")
for file in index.html*; do
  [ -e "$file" ] || continue
  "$dir/filter.pl" "$file"
done
mkdir -p orig
mv -- index.html* orig/


  ViewVC Help
Powered by ViewVC 1.1.26