--- get_book.sh 2004/02/15 11:40:43 1.7 +++ get_book.sh 2004/07/21 13:58:35 1.12 @@ -1,13 +1,20 @@ #!/bin/sh # proxy settings (same as in firebird) -fping proxy && export http_proxy=http://proxy:8080 +fping -q proxy && export http_proxy=http://proxy:8080 +if [ -z "$http_proxy" ] ; then + fping -q proxy.lan && export http_proxy=http://proxy.lan:8080 +fi # user agent (same as in firebird) -ua="Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.5) Gecko/20031225 Firebird/0.7" +ua="Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.6) Gecko/20040506 Firefox/0.8" +cookie_file="cookie.txt" # wait between pages export wait=120 +if [ ! -z "$http_proxy" ] ; then + echo "Using proxy $http_proxy" +fi if [ -z "$1" ] ; then echo "Usage: $0 ISBN" @@ -27,9 +34,7 @@ grep -l 'This is only a preview of the full book' * | xargs -i rm {} fi -isbn=$1; - -isbn2=`echo $isbn | sed 's/-//g'` +isbn=`echo $1 | sed 's/-//g' | tr '[a-z]' '[A-Z]'` function mirror() { @@ -38,29 +43,38 @@ file=`echo $url | sed -e s,http://[^?]*?,index.html?, -e s,#.*$,, -e s,/,%2F,g` if [ -e "$file" ] ; then # echo "skip $url" + echo -n "." return fi + cookie="" if echo $url | grep '/index' >/dev/null ; then - cookies="" echo -n "no login (index) " elif echo $url | grep 'mode=toc' >/dev/null ; then - cookies="" echo -n "no login (toc) " else - cookies="--load-cookies=/home/dpavlin/.phoenix/default/g6b45nt6.slt/cookies.txt" - echo -n "with login " + if [ ! -e $cookie_file ] ; then + echo "cookies file $cookie_file doesn't exits! Please create it." + echo "It should be in format:" + echo "Cookie: Site=UICode=&Portal=oreilly&GUID=..." + exit 1 + fi + read cookie < $cookie_file + if [ -z "$cookie" ] ; then + echo "Empty cookie file $cookie_file !" + exit 1 + fi fi - echo $url - wget -p -nH -nc -k \ - -t 1 -U "$ua" \ - $cookies $url + if [ -z "$cookie" ] ; then + echo "$url [no cookie]" + else + echo "$url [with cookie]" + fi -# -D safari.oreilly.com \ -# -A 0-201-41975-0 \ + wget -q -p -nH -nc -k -t 1 -U "$ua" --cookies=off --header="$cookie" $url + perl -e '$t=rand($ENV{wait} || 120);print "sleep for $t sec.\n"; sleep($t);' - perl -e '$t=rand($ENV{wait} || 30);print "sleep for $t sec.\n"; sleep($t);' } function geturl() { @@ -68,19 +82,21 @@ sed -e 's/^.*> in } function uniqurl() { mv in in.tmp - grep -v 'view=[A-Z]' in.tmp | sort -u > in - grep 'view=[A-Z].*/index' in.tmp | sort -u >> in + grep -v 'view=[A-Z]' in.tmp | grep 'a=0$' | sort -u > in + grep 'view=[A-Z].*/index' in.tmp | grep 'a=0$' | sort -u >> in + links=`wc -l in | cut -d" " -f1` + echo "found $links unique links" } function mirror_in() { @@ -108,13 +124,17 @@ exit 1 fi done + echo } echo -n > in -mirror "http://safari.oreilly.com/?XmlId=$isbn" +mirror "http://safari.oreilly.com/$isbn" +echo -echo "extract URLs from first page..." -geturl "index.html?XmlId=$isbn" $isbn +file=`ls *index.html?XmlId=*` +isbn=`echo $file | cut -d= -f2` +echo "extract URLs from first page $file... [$isbn]" +geturl $file $isbn uniqurl mirror_in