--- get_book.sh 2003/12/15 09:33:29 1.4 +++ get_book.sh 2004/06/02 16:14:53 1.9 @@ -1,27 +1,74 @@ #!/bin/sh -export http_proxy=http://proxy:8080 +# proxy settings (same as in firebird) +fping proxy && export http_proxy=http://proxy:8080 +# user agent (same as in firebird) +ua="Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.6) Gecko/20040506 Firefox/0.8" +cookie_file="cookie.txt" + +# wait between pages +export wait=120 + if [ -z "$1" ] ; then echo "Usage: $0 ISBN" exit 1 fi -isbn=$1; +if [ -e orig ] ; then + echo "orig directory found. Resume download? [Y/n]" + read ans + if [ "$ans" = "n" ] ; then + exit 1; + fi + mv orig/* . + rm -Rf orig -wait=10 + grep -l 'promo.asp' * | xargs -i rm {} + grep -l 'This is only a preview of the full book' * | xargs -i rm {} +fi -isbn2=`echo $isbn | sed 's/-//g'` +isbn=`echo $1 | sed 's/-//g'` function mirror() { - wget -p -nH -nc -k \ - --random-wait --wait=$wait -t 0 \ - --load-cookies=/home/dpavlin/.phoenix/default/g6b45nt6.slt/cookies.txt \ - -U "Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.5) Gecko/20031206 Firebird/0.7" \ - $1 -# -D safari.oreilly.com \ -# -A 0-201-41975-0 \ + url="$1" + + file=`echo $url | sed -e s,http://[^?]*?,index.html?, -e s,#.*$,, -e s,/,%2F,g` + if [ -e "$file" ] ; then +# echo "skip $url" + echo -n "." + return + fi + + cookie="" + if echo $url | grep '/index' >/dev/null ; then + echo -n "no login (index) " + elif echo $url | grep 'mode=toc' >/dev/null ; then + echo -n "no login (toc) " + else + if [ ! -e $cookie_file ] ; then + echo "cookies file $cookie_file doesn't exits! Please create it." + echo "It should be in format:" + echo "Cookie: Site=UICode=&Portal=oreilly&GUID=..." + exit 1 + fi + read cookie < $cookie_file + if [ -z "$cookie" ] ; then + echo "Empty cookie file $cookie_file !" + exit 1 + fi + fi + + if [ -z "$cookie" ] ; then + echo "$url [no cookie]" + else + echo "$url [with cookie]" + fi + + wget -q -p -nH -nc -k -t 1 -U "$ua" --cookies=off --header="$cookie" $url + + perl -e '$t=rand($ENV{wait} || 30);print "sleep for $t sec.\n"; sleep($t);' } function geturl() { @@ -29,11 +76,12 @@ sed -e 's/^.*> in } @@ -41,21 +89,41 @@ mv in in.tmp grep -v 'view=[A-Z]' in.tmp | sort -u > in grep 'view=[A-Z].*/index' in.tmp | sort -u >> in + links=`wc -l in | cut -d" " -f1` + echo "found $links unique links" } function mirror_in() { cat in | while read url ; do mirror "$url" + #sleep $wait if grep 'promo.asp' `ls -t index.html* | head -3` >/dev/null ; then - echo "WARNING: safari seems to logunt you as user. Aborting." + echo "WARNING: safari seems to logout you as user. Aborting." + exit 1 + fi + + if grep -i '>Account locked<' `ls -t index.html* | head -3` >/dev/null ; then + echo "WARNING: your safari account is locked. Aborting." + exit 1 + fi + + if grep -i 'session disabled' `ls -t index.html* | head -3` >/dev/null ; then + echo "WARNING: your safari session is disabled. Aborting." + exit 1 + fi + + if grep -i 'This is only a preview of the full book' `ls -t index.html* | head -3` >/dev/null ; then + echo "WARNING: you didn't add this book to your bookshelf!" exit 1 fi done + echo } echo -n > in mirror "http://safari.oreilly.com/?XmlId=$isbn" +echo echo "extract URLs from first page..." geturl "index.html?XmlId=$isbn" $isbn