--- get_book.sh 2003/12/14 19:11:30 1.1 +++ get_book.sh 2003/12/14 22:27:18 1.2 @@ -1,9 +1,13 @@ #!/bin/sh -#export http_proxy=http://proxy.pliva.hr:8080 +#export http_proxy=http://proxy:8080 -#isbn="0-201-41975-0" -isbn="0-672-32240-4" +if [ -z "$1" ] ; then + echo "Usage: $0 ISBN" + exit 1 +fi + +isbn=$1; wait=10 @@ -21,24 +25,30 @@ } function geturl() { - hindent -s $1 | grep $2 | grep -i href | grep mode=[st][eo]c | \ + hindent -s $1 | grep -i href | grep mode=[st][eo]c | \ sed -e 's/^.*> in } +function uniqurl() { + mv in in.tmp + grep -v 'view=[A-Z]' in.tmp | sort -u > in + grep 'view=[A-Z].*/index' in.tmp | sort -u >> in +} + echo > in -#mirror "http://safari.oreilly.com/?XmlId=$isbn" +mirror "http://safari.oreilly.com/?XmlId=$isbn" echo "extract URLs from first page..." geturl "index.html?XmlId=$isbn" $isbn +uniqurl mirror "-i in" @@ -49,9 +59,9 @@ done echo -sort -u in > in2 +uniqurl -mirror "-i in2" +mirror "-i in" echo > in echo -n "extracting URLs [2]" @@ -60,6 +70,14 @@ geturl $file $isbn done -sort -u in > in2 +uniqurl + +mirror "-i in" + +# convert links in html +bn=`basename $0` +dir=`echo $0 | sed "s/$bn$//"` +ls index.html* | xargs -i $dir/filter.pl {} +mkdir orig +mv index.html* orig/ -mirror "-i in2"