# NOTE(review): this copy of the patch had been flattened onto a single line.
# Line breaks are restored below. Indentation and blank context lines were
# lost in the flattening and are re-inserted on a best-effort basis (hunks 1
# and 3 match their header counts again); apply with `patch -l`.
# NOTE(review): hunk 2 is still incomplete -- the `sed` context line inside
# geturl() is truncated and the hunk appears to hold fewer lines than its
# header declares. Regenerate it from the repository (rev 1.1 -> 1.4) before
# applying.
# NOTE(review): in mirror_in, `exit 1` runs inside the `cat in | while ...`
# pipeline; in shells that run the loop in a subshell it ends only the loop,
# not the script, so "Aborting." may not abort.
--- get_book.sh	2003/12/14 19:11:30	1.1
+++ get_book.sh	2003/12/15 09:33:29	1.4
@@ -1,9 +1,13 @@
 #!/bin/sh
 
-#export http_proxy=http://proxy.pliva.hr:8080
+export http_proxy=http://proxy:8080
 
-#isbn="0-201-41975-0"
-isbn="0-672-32240-4"
+if [ -z "$1" ] ; then
+	echo "Usage: $0 ISBN"
+	exit 1
+fi
+
+isbn=$1;
 
 wait=10
 
@@ -21,26 +25,43 @@
 }
 
 function geturl() {
-	hindent -s $1 | grep $2 | grep -i href | grep mode=[st][eo]c | \
+	hindent -s $1 | grep -i href | grep mode=[st][eo]c | \
 	sed -e 's/^.*> in
 }
 
-echo > in
-#mirror "http://safari.oreilly.com/?XmlId=$isbn"
+function uniqurl() {
+	mv in in.tmp
+	grep -v 'view=[A-Z]' in.tmp | sort -u > in
+	grep 'view=[A-Z].*/index' in.tmp | sort -u >> in
+}
+
+function mirror_in() {
+	cat in | while read url ; do
+		mirror "$url"
+
+		if grep 'promo.asp' `ls -t index.html* | head -3` >/dev/null ; then
+			echo "WARNING: safari seems to logunt you as user. Aborting."
+			exit 1
+		fi
+	done
+}
+
+echo -n > in
+mirror "http://safari.oreilly.com/?XmlId=$isbn"
 
 echo "extract URLs from first page..."
 geturl "index.html?XmlId=$isbn" $isbn
+uniqurl
 
-mirror "-i in"
+mirror_in
 
 echo -n "extracting URLs [1]"
 ls index.html* | while read file ; do
@@ -49,17 +70,25 @@
 done
 echo
 
-sort -u in > in2
+uniqurl
 
-mirror "-i in2"
+mirror_in
 
-echo > in
+echo -n > in
 
 echo -n "extracting URLs [2]"
 ls index.html* | while read file ; do
 	echo -n "."
 	geturl $file $isbn
 done
-sort -u in > in2
+uniqurl
+
+mirror_in
+
+# convert links in html
+bn=`basename $0`
+dir=`echo $0 | sed "s/$bn$//"`
+ls index.html* | xargs -i $dir/filter.pl {}
+mkdir orig
+mv index.html* orig/
 
-mirror "-i in2"