# Extract links from a BBC responsive site
# Crawl configuration.
# Host to crawl; wget is also restricted to this domain via --domains.
DOMAIN="m.bbc.co.uk"
# Directory below the host to crawl; also used as the output file's base name.
SERVICE="hindi"
# User-Agent header sent with every request (identifies as a mobile browser).
# NOTE(review): presumably chosen so the site serves its mobile/responsive
# variant -- confirm.
HTTP_USER_AGENT="Mozilla/5.0 (iPhone; Mobile; AppleWebKit; Safari)"
# grep basic regular expression; URLs matching it are dropped from the result.
EXCLUDE_EXTENSIONS="\.\(txt\|css\|js\|png\|gif\|jpg\)$"
# Maximum recursion depth handed to wget --level.
MAX_DEPTH="3"

# Spider the service recursively (nothing is saved to disk by --spider) and
# turn wget's log into a sorted, de-duplicated URL list in "<SERVICE>.txt".
#   - wget writes its log to stderr, hence the 2>&1 before the pipe.
#   - awk keeps only the log lines that start with "--" and prints their third
#     whitespace-separated field, which in wget's log format is the URL.
#   - grep -v removes URLs matching EXCLUDE_EXTENSIONS.
# All expansions are quoted so the values are passed to the tools verbatim,
# without word splitting or pathname expansion.
wget --spider --no-directories --no-parent --force-html --recursive \
  --level="$MAX_DEPTH" --no-clobber \
  --domains="$DOMAIN" --include-directories="$SERVICE" \
  --user-agent="$HTTP_USER_AGENT" "http://${DOMAIN}/${SERVICE}" 2>&1 \
  | awk '/^--/ { print $3 }' \
  | grep -v -e "$EXCLUDE_EXTENSIONS" \
  | sort -u > "${SERVICE}.txt"