Grabs a list of popular domains and aggregates disallowed-path data from the top N domains' robots.txt files.
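
A typical run might look like this (assuming the script is saved as
get_robots.sh and marked executable; the filename is illustrative, and
nmap, wget, curl, dos2unix, host, and unzip must be installed):

    ./get_robots.sh -n 1000       # poll the top 1,000 domains
    ./get_robots.sh -n 1000 -s    # reuse an already-downloaded top-1m.csv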
#!/bin/bash

TOP_N="5000"
SKIP_DOWNLOAD=0
BASE="/usr/local/tools/web/robots"
URL_BASE="http://s3-us-west-1.amazonaws.com/umbrella-static"
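
# URL_BASE above points at the Cisco Umbrella "top 1 million" popularity
# list, published daily as top-1m-YYYY-MM-DD.csv.zip. BASE is the local
# working directory; per-domain robots.txt files are saved under
# ${BASE}/data.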

print_help() {
    echo "Usage: $(basename "$0") -n <integer> [-s]"
    echo -e "\t-n: Number of top sites to poll"
    echo -e "\t-s: Skip the initial download"
    echo -e "\t-h: Display help"
}

while getopts ":n:sh" OPT; do
    case ${OPT} in
        n) TOP_N=${OPTARG};;
        s) SKIP_DOWNLOAD=1;;
        h) print_help; exit 1;;
        \?) print_help; exit 1;;
        :) print_help; exit 1;;
    esac
done
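
# Sanity check: -n consumes an argument and -s/-h do not, so a legitimate
# command line never advances OPTIND past 4 (e.g. "-n 5000 -s"); anything
# higher means repeated or stray options.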
if [[ ${OPTIND} -gt 4 ]]; then
    print_help
    exit 1
fi
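
# getRobots: probe the target with nmap for an open web port (443 first,
# then 80), fetch /robots.txt over the matching scheme with wget, and
# normalize line endings with dos2unix. RETVAL stays at the sentinel 99
# when neither port answers.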
getRobots() {
    TARGET=$1
    RETVAL="99"

    # Try HTTPS first, then fall back to HTTP
    UP_443=$(nmap -Pn -sT --open -p 443 --host-timeout 3s ${TARGET} | grep -c "^443/")
    if [[ $UP_443 -ge 1 ]]; then
        wget --tries=2 --timeout=5 -q -O ${BASE}/data/${TARGET}_robots.txt "https://${TARGET}/robots.txt"
        RETVAL=$?
        dos2unix ${BASE}/data/${TARGET}_robots.txt >/dev/null 2>&1
    else
        UP_80=$(nmap -Pn -sT --open -p 80 --host-timeout 3s ${TARGET} | grep -c "^80/")
        if [[ $UP_80 -ge 1 ]]; then
            wget --tries=2 --timeout=5 -q -O ${BASE}/data/${TARGET}_robots.txt "http://${TARGET}/robots.txt"
            RETVAL=$?
            dos2unix ${BASE}/data/${TARGET}_robots.txt >/dev/null 2>&1
        fi
    fi

    if [[ $RETVAL -eq 99 ]]; then
        echo "Error: No web services responding on $TARGET"
    elif [[ $RETVAL -gt 0 ]]; then
        echo "Error: Problem with download on $TARGET"
    else
        echo "Success: robots.txt downloaded from $TARGET"
    fi
}

# Discard the options getopts has already consumed
shift $((OPTIND - 1))

if [[ $SKIP_DOWNLOAD -eq 0 ]]; then
    # look for the newest file going back 20 days
    for i in {0..20}; do
        DATE=$(date -d "${i} days ago" +%Y-%m-%d)
        URL=${URL_BASE}/top-1m-${DATE}.csv.zip

        # look for a 200 status and download
        STATUS=$(curl -s -k -o /dev/null -w "%{http_code}" --head ${URL})
        if [[ $STATUS -eq 200 ]]; then
            wget -q -O ${BASE}/top-1m-${DATE}.csv.zip ${URL}
            OUTPUT_FILE="${BASE}/robots-dirs-${DATE}.txt"
            break
        fi

        # if we made it to 20 we didn't download anything
        if [[ ${i} -eq 20 ]]; then
            echo "Error: No file was downloaded"
            exit 1
        fi
    done

    # extract into BASE so the top-1m.csv check below finds it
    unzip -o ${BASE}/top-1m-${DATE}.csv.zip -d ${BASE}
else
    DATE=$(date +%Y-%m-%d)
    OUTPUT_FILE="${BASE}/robots-dirs-${DATE}.txt"
fi
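
# With the CSV in place, walk the top N domains: resolve each name (bare,
# then www-prefixed), grab robots.txt from whichever resolves, and finally
# aggregate the Disallow paths into one sorted, de-duplicated list.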
if [ -e "${BASE}/top-1m.csv" ]; then
    mkdir -p ${BASE}/data
    rm -f ${BASE}/data/*_robots.txt >/dev/null 2>&1

    head -n ${TOP_N} ${BASE}/top-1m.csv | cut -d ',' -f2 | dos2unix | while read DOMAIN; do
        # prefer the bare domain; fall back to the www host
        TARGET=""
        ADDRESSES=$(host $DOMAIN | grep -c 'has address')
        if [[ ${ADDRESSES} -gt 0 ]]; then
            TARGET=${DOMAIN}
        else
            ADDRESSES=$(host "www.${DOMAIN}" | grep -c 'has address')
            if [[ ${ADDRESSES} -gt 0 ]]; then
                TARGET="www.${DOMAIN}"
            fi
        fi

        if [ -n "${TARGET}" ]; then
            getRobots $TARGET
        else
            echo "Error: ${DOMAIN} and www.${DOMAIN} not resolving to an IP"
        fi
    done
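
    # Aggregate: keep only Disallow lines, strip everything up to the first
    # "/" in each, drop query strings (?) and wildcard tails (*), then sort
    # and de-duplicate into the dated output file.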
    grep -hi '^disallow' ${BASE}/data/*_robots.txt | \
        sed -e 's/^[Dd]isallow[^/]*\//\//' -e 's/\?.*//' -e 's/\*.*//' | \
        sort -u > $OUTPUT_FILE
fi
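
The end result is a sorted, de-duplicated list of disallowed paths in
${BASE}/robots-dirs-YYYY-MM-DD.txt, one entry per line (for example /admin/
or /cgi-bin/, illustrative values only), usable as a seed wordlist for web
content discovery.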