Grabs a list of popular domains and aggregates disallowed-path data from the top N domains' robots.txt files.
#!/bin/bash

TOP_N="5000"                                   # default number of top domains to poll
SKIP_DOWNLOAD=0                                # set via -s to skip fetching a fresh list
BASE="/usr/local/tools/web/robots"             # working directory for downloads and output
URL_BASE="http://s3-us-west-1.amazonaws.com/umbrella-static"

print_help() {
    echo "Usage: $(basename "$0") -n <integer> [-s]"
    echo -e "\t-n: Number of top sites to poll"
    echo -e "\t-s: Skip the initial download"
    echo -e "\t-h: Display help"
}
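
# Example invocations (assuming the script is saved as get_robots.sh):
#   ./get_robots.sh -n 100      # poll the top 100 domains
#   ./get_robots.sh -n 100 -s   # reuse a previously downloaded top-1m.csv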
while getopts ":n:sh" OPT; do
    case ${OPT} in
        n) TOP_N=${OPTARG};;
        s) SKIP_DOWNLOAD=1;;
        h) print_help; exit 1;;
        \?) print_help; exit 1;;
        :) print_help; exit 1;;
    esac
done

# -n <int> plus -s advances OPTIND to at most 4; anything more means stray arguments
if [[ ${OPTIND} -gt 4 ]]; then
    print_help
    exit 1
fi
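
# Probe TARGET for an open web port with nmap (443 first, then 80) and fetch
# its robots.txt over the matching scheme. nmap lists open ports as lines
# like "443/tcp open https", which the greps below count.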
getRobots() {
    local TARGET=$1
    local RETVAL="99"

    UP_443=$(nmap -Pn -sT --open -p 443 --host-timeout 3s ${TARGET} | grep -c "^443/")
    if [[ ${UP_443} -ge 1 ]]; then
        wget --tries=2 --timeout=5 -q -O ${BASE}/data/${TARGET}_robots.txt "https://${TARGET}/robots.txt"
        RETVAL=$?
        dos2unix ${BASE}/data/${TARGET}_robots.txt >/dev/null 2>&1
    else
        # fall back to plain HTTP if HTTPS isn't listening
        UP_80=$(nmap -Pn -sT --open -p 80 --host-timeout 3s ${TARGET} | grep -c "^80/")
        if [[ ${UP_80} -ge 1 ]]; then
            wget --tries=2 --timeout=5 -q -O ${BASE}/data/${TARGET}_robots.txt "http://${TARGET}/robots.txt"
            RETVAL=$?
            dos2unix ${BASE}/data/${TARGET}_robots.txt >/dev/null 2>&1
        fi
    fi

    # RETVAL stays 99 when neither port answered; otherwise it is wget's exit code
    if [[ ${RETVAL} -eq 99 ]]; then
        echo "Error: No web services responding on ${TARGET}"
    elif [[ ${RETVAL} -gt 0 ]]; then
        echo "Error: Problem with download on ${TARGET}"
    else
        echo "Success: robots.txt downloaded from ${TARGET}"
    fi
}
shift $((OPTIND - 1))

if [[ ${SKIP_DOWNLOAD} -eq 0 ]]; then
    # the Umbrella list is published daily; look for the newest file going back 20 days
    for i in {0..20}; do
        DATE=$(date -d "${i} days ago" +%Y-%m-%d)
        URL=${URL_BASE}/top-1m-${DATE}.csv.zip
        STATUS=$(curl -s -k -w "%{http_code}" --head ${URL} | tail -1)

        # a 200 status means the file exists; download it and stop looking
        if [[ ${STATUS} -eq 200 ]]; then
            wget -q -O ${BASE}/top-1m-${DATE}.csv.zip ${URL}
            OUTPUT_FILE="${BASE}/robots-dirs-${DATE}.txt"
            break
        fi

        # if we made it to 20, nothing was found to download
        if [[ ${i} -eq 20 ]]; then
            echo "Error: No file was downloaded"
            exit 1
        fi
    done
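    # extract into ${BASE} so the top-1m.csv existence check below finds the file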
    unzip -o ${BASE}/top-1m-${DATE}.csv.zip -d ${BASE}
else
    DATE=$(date +%Y-%m-%d)
    OUTPUT_FILE="${BASE}/robots-dirs-${DATE}.txt"
fi

if [ -e "${BASE}/top-1m.csv" ]; then
    mkdir ${BASE}/data >/dev/null 2>&1
    rm -f ${BASE}/data/*_robots.txt >/dev/null 2>&1
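
    # each CSV line is "rank,domain" (e.g. "1,google.com"); walk the top N
    # domains, trying the bare domain first and falling back to www.<domain>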
    head -n ${TOP_N} ${BASE}/top-1m.csv | cut -d ',' -f2 | dos2unix | while read -r DOMAIN; do
        TARGET=""
        ADDRESSES=$(host ${DOMAIN} | grep -c 'has address')
        if [[ ${ADDRESSES} -gt 0 ]]; then
            TARGET=${DOMAIN}
        else
            ADDRESSES=$(host "www.${DOMAIN}" | grep -c 'has address')
            if [[ ${ADDRESSES} -gt 0 ]]; then
                TARGET="www.${DOMAIN}"
            fi
        fi

        if [ -n "${TARGET}" ]; then
            getRobots ${TARGET}
        else
            echo "Error: ${DOMAIN} and www.${DOMAIN} not resolving to an IP"
        fi
    done
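
    # aggregate the Disallow entries: strip the "Disallow:" prefix, then drop
    # query strings and wildcards, e.g. (hypothetical input lines):
    #   "Disallow: /admin/*"    -> "/admin/"
    #   "disallow: /search?q=x" -> "/search"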
    grep -hi '^disallow' ${BASE}/data/*_robots.txt | \
        sed -e 's/^[Dd]isallow[^/]*\//\//' -e 's/?.*//' -e 's/\*.*//' | \
        sort -u > ${OUTPUT_FILE}
fi