--- get_book.sh 2003/12/14 22:27:18 1.2 +++ get_book.sh 2004/02/15 11:40:43 1.7 @@ -1,27 +1,66 @@ #!/bin/sh -#export http_proxy=http://proxy:8080 +# proxy settings (same as in firebird) +fping proxy && export http_proxy=http://proxy:8080 +# user agent (same as in firebird) +ua="Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.5) Gecko/20031225 Firebird/0.7" + +# wait between pages +export wait=120 + if [ -z "$1" ] ; then echo "Usage: $0 ISBN" exit 1 fi -isbn=$1; +if [ -e orig ] ; then + echo "orig directory found. Resume download? [Y/n]" + read ans + if [ "$ans" = "n" ] ; then + exit 1; + fi + mv orig/* . + rm -Rf orig -wait=10 + grep -l 'promo.asp' * | xargs -i rm {} + grep -l 'This is only a preview of the full book' * | xargs -i rm {} +fi + +isbn=$1; isbn2=`echo $isbn | sed 's/-//g'` function mirror() { + + url="$1" + + file=`echo $url | sed -e s,http://[^?]*?,index.html?, -e s,#.*$,, -e s,/,%2F,g` + if [ -e "$file" ] ; then +# echo "skip $url" + return + fi + + if echo $url | grep '/index' >/dev/null ; then + cookies="" + echo -n "no login (index) " + elif echo $url | grep 'mode=toc' >/dev/null ; then + cookies="" + echo -n "no login (toc) " + else + cookies="--load-cookies=/home/dpavlin/.phoenix/default/g6b45nt6.slt/cookies.txt" + echo -n "with login " + fi + echo $url + wget -p -nH -nc -k \ - --random-wait --wait=$wait -t 0 \ - --load-cookies=/home/dpavlin/.phoenix/default/g6b45nt6.slt/cookies.txt \ - -U "Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.5) Gecko/20031206 Firebird/0.7" \ - $1 + -t 1 -U "$ua" \ + $cookies $url # -D safari.oreilly.com \ # -A 0-201-41975-0 \ + + perl -e '$t=rand($ENV{wait} || 30);print "sleep for $t sec.\n"; sleep($t);' } function geturl() { @@ -34,6 +73,7 @@ -e 's/open=false/open=true/' | \ grep '&s=1&b=1&f=1&t=1&c=1&u=1&r=&o=1' | \ grep $2 | \ + grep -v "$2/[0-9][0-9][0-9][0-9][0-9][0-9][0-9]" | \ sort -u >> in } @@ -43,14 +83,41 @@ grep 'view=[A-Z].*/index' in.tmp | sort -u >> in } -echo > in +function mirror_in() { + cat in | while read url ; do + mirror "$url" + #sleep $wait + + if grep 'promo.asp' `ls -t index.html* | head -3` >/dev/null ; then + echo "WARNING: safari seems to logout you as user. Aborting." + exit 1 + fi + + if grep -i '>Account locked<' `ls -t index.html* | head -3` >/dev/null ; then + echo "WARNING: your safari account is locked. Aborting." + exit 1 + fi + + if grep -i 'session disabled' `ls -t index.html* | head -3` >/dev/null ; then + echo "WARNING: your safari session is disabled. Aborting." + exit 1 + fi + + if grep -i 'This is only a preview of the full book' `ls -t index.html* | head -3` >/dev/null ; then + echo "WARNING: you didn't add this book to your bookshelf!" + exit 1 + fi + done +} + +echo -n > in mirror "http://safari.oreilly.com/?XmlId=$isbn" echo "extract URLs from first page..." geturl "index.html?XmlId=$isbn" $isbn uniqurl -mirror "-i in" +mirror_in echo -n "extracting URLs [1]" ls index.html* | while read file ; do @@ -61,9 +128,9 @@ uniqurl -mirror "-i in" +mirror_in -echo > in +echo -n > in echo -n "extracting URLs [2]" ls index.html* | while read file ; do echo -n "." @@ -72,7 +139,7 @@ uniqurl -mirror "-i in" +mirror_in # convert links in html bn=`basename $0`