--- get_book.sh 2003/12/14 19:11:30 1.1.1.1 +++ get_book.sh 2004/07/21 13:58:35 1.12 @@ -1,46 +1,143 @@ #!/bin/sh -#export http_proxy=http://proxy.pliva.hr:8080 +# proxy settings (same as in firebird) +fping -q proxy && export http_proxy=http://proxy:8080 +if [ -z "$http_proxy" ] ; then + fping -q proxy.lan && export http_proxy=http://proxy.lan:8080 +fi +# user agent (same as in firebird) +ua="Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.6) Gecko/20040506 Firefox/0.8" +cookie_file="cookie.txt" + +# wait between pages +export wait=120 + +if [ ! -z "$http_proxy" ] ; then + echo "Using proxy $http_proxy" +fi + +if [ -z "$1" ] ; then + echo "Usage: $0 ISBN" + exit 1 +fi + +if [ -e orig ] ; then + echo "orig directory found. Resume download? [Y/n]" + read ans + if [ "$ans" = "n" ] ; then + exit 1; + fi + mv orig/* . + rm -Rf orig + + grep -l 'promo.asp' * | xargs -i rm {} + grep -l 'This is only a preview of the full book' * | xargs -i rm {} +fi -#isbn="0-201-41975-0" -isbn="0-672-32240-4" +isbn=`echo $1 | sed 's/-//g' | tr '[a-z]' '[A-Z]'` -wait=10 +function mirror() { -isbn2=`echo $isbn | sed 's/-//g'` + url="$1" -function mirror() { - wget -p -nH -nc -k \ - --random-wait --wait=$wait -t 0 \ - --load-cookies=/home/dpavlin/.phoenix/default/g6b45nt6.slt/cookies.txt \ - -U "Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.5) Gecko/20031206 Firebird/0.7" \ - $1 + file=`echo $url | sed -e s,http://[^?]*?,index.html?, -e s,#.*$,, -e s,/,%2F,g` + if [ -e "$file" ] ; then +# echo "skip $url" + echo -n "." + return + fi + + cookie="" + if echo $url | grep '/index' >/dev/null ; then + echo -n "no login (index) " + elif echo $url | grep 'mode=toc' >/dev/null ; then + echo -n "no login (toc) " + else + if [ ! -e $cookie_file ] ; then + echo "cookies file $cookie_file doesn't exits! Please create it." + echo "It should be in format:" + echo "Cookie: Site=UICode=&Portal=oreilly&GUID=..." + exit 1 + fi + read cookie < $cookie_file + if [ -z "$cookie" ] ; then + echo "Empty cookie file $cookie_file !" + exit 1 + fi + fi + + if [ -z "$cookie" ] ; then + echo "$url [no cookie]" + else + echo "$url [with cookie]" + fi + + wget -q -p -nH -nc -k -t 1 -U "$ua" --cookies=off --header="$cookie" $url + perl -e '$t=rand($ENV{wait} || 120);print "sleep for $t sec.\n"; sleep($t);' -# -D safari.oreilly.com \ -# -A 0-201-41975-0 \ } function geturl() { - hindent -s $1 | grep $2 | grep -i href | grep mode=[st][eo]c | \ + hindent -s $1 | grep -i href | grep mode=[st][eo]c | \ sed -e 's/^.*> in } -echo > in -#mirror "http://safari.oreilly.com/?XmlId=$isbn" +function uniqurl() { + mv in in.tmp + grep -v 'view=[A-Z]' in.tmp | grep 'a=0$' | sort -u > in + grep 'view=[A-Z].*/index' in.tmp | grep 'a=0$' | sort -u >> in + links=`wc -l in | cut -d" " -f1` + echo "found $links unique links" +} + +function mirror_in() { + cat in | while read url ; do + mirror "$url" + #sleep $wait + + if grep 'promo.asp' `ls -t index.html* | head -3` >/dev/null ; then + echo "WARNING: safari seems to logout you as user. Aborting." + exit 1 + fi + + if grep -i '>Account locked<' `ls -t index.html* | head -3` >/dev/null ; then + echo "WARNING: your safari account is locked. Aborting." + exit 1 + fi + + if grep -i 'session disabled' `ls -t index.html* | head -3` >/dev/null ; then + echo "WARNING: your safari session is disabled. Aborting." + exit 1 + fi + + if grep -i 'This is only a preview of the full book' `ls -t index.html* | head -3` >/dev/null ; then + echo "WARNING: you didn't add this book to your bookshelf!" + exit 1 + fi + done + echo +} + +echo -n > in +mirror "http://safari.oreilly.com/$isbn" +echo -echo "extract URLs from first page..." -geturl "index.html?XmlId=$isbn" $isbn +file=`ls *index.html?XmlId=*` +isbn=`echo $file | cut -d= -f2` +echo "extract URLs from first page $file... [$isbn]" +geturl $file $isbn +uniqurl -mirror "-i in" +mirror_in echo -n "extracting URLs [1]" ls index.html* | while read file ; do @@ -49,17 +146,25 @@ done echo -sort -u in > in2 +uniqurl -mirror "-i in2" +mirror_in -echo > in +echo -n > in echo -n "extracting URLs [2]" ls index.html* | while read file ; do echo -n "." geturl $file $isbn done -sort -u in > in2 +uniqurl + +mirror_in + +# convert links in html +bn=`basename $0` +dir=`echo $0 | sed "s/$bn$//"` +ls index.html* | xargs -i $dir/filter.pl {} +mkdir orig +mv index.html* orig/ -mirror "-i in2"