#!/usr/bin/env bash
#
# Crawl one section of a website with wget in spider mode (pages are
# requested but not saved) and write the sorted, de-duplicated list of
# requested URLs to "<SERVICE>.txt" in the current directory.

# Host to crawl; wget is restricted to this domain.
DOMAIN="m.bbc.co.uk"
# Top-level directory of the site to crawl; also names the output file.
SERVICE="hindi"
# User-Agent header sent with every request.
HTTP_USER_AGENT="Mozilla/5.0 (iPhone; Mobile; AppleWebKit; Safari)"
# Basic regular expression matching URLs to drop from the result.
# NOTE: the "\|" alternation is a GNU grep extension to basic regexes.
EXCLUDE_EXTENSIONS='\.\(txt\|css\|js\|png\|gif\|jpg\)$'
# Maximum recursion depth passed to wget --level.
MAX_DEPTH="3"

# wget writes its log to stderr, hence 2>&1. Each request line of that log
# starts with "--<date> <time>--  <URL>", so the URL is the third
# whitespace-separated field. All expansions are quoted so that values
# containing spaces, glob characters or backslashes reach wget and grep
# unchanged.
wget --spider --no-directories --no-parent --force-html --recursive \
  --level="$MAX_DEPTH" --no-clobber \
  --domains="$DOMAIN" --include-directories="$SERVICE" \
  --user-agent="$HTTP_USER_AGENT" "http://${DOMAIN}/${SERVICE}" 2>&1 \
  | awk '/^--/ { print $3 }' \
  | grep -v -- "$EXCLUDE_EXTENSIONS" \
  | sort -u > "${SERVICE}.txt"
# --- GitHub gist page residue (not part of the script) ---
# Last active May 4, 2017 11:44
# Save fedecarg/7544506 to your computer and use it in GitHub Desktop.
# Sign up for free to join this conversation on GitHub.
# Already have an account? Sign in to comment