fedecarg/wget-spider.md

## wget-spider.md

      
    Raw
  

              wget-spider.md
            
          
    Extract links from a BBC responsive site

# Crawl one service section of a site with wget in spider mode and write the
# unique URLs that wget requested to "$SERVICE.txt" in the current directory.
DOMAIN="m.bbc.co.uk"
SERVICE="hindi"
HTTP_USER_AGENT="Mozilla/5.0 (iPhone; Mobile; AppleWebKit; Safari)"
# Basic regular expression (GNU grep `\|` alternation) matching URLs that end
# in one of the listed extensions; matching URLs are dropped from the output.
EXCLUDE_EXTENSIONS="\.\(txt\|css\|js\|png\|gif\|jpg\)$"
MAX_DEPTH="3"

# The log is merged into stdout (2>&1) so the pipeline can parse it.
# Only log lines starting with "--" are kept, and their third
# whitespace-separated field is taken as the URL.
# All expansions are quoted so values are never word-split or glob-expanded;
# "--" stops grep from reading a pattern that begins with "-" as an option.
wget --spider --no-directories --no-parent --force-html --recursive \
 --level="$MAX_DEPTH" --no-clobber \
 --domains="$DOMAIN" --include-directories="$SERVICE" \
 --user-agent="$HTTP_USER_AGENT" "http://$DOMAIN/$SERVICE" 2>&1 \
 | awk '/^--/ { print $3 }' \
 | grep -v -- "$EXCLUDE_EXTENSIONS" \
 | sort -u > "$SERVICE.txt"