eelfroth/404scraper.sh

## 404scraper.sh
#!/bin/bash

### 404 SCRAPER ###

# This script reads hostnames from stdin (one per line).
# It sends a request for a nonsense file to each host.
# On a 404 response the message body is saved to ./scraped404s/

# Request path that is expected to be missing on every host, so that the
# server answers with its 404 page.
path="this-path-probably-doesnt-exist"

# Output directory for the saved pages. Abort early when it cannot be
# created: without it no download can be written and every host would be
# reported as a failure.
mkdir -p -- "scraped404s" || {
    printf 'cannot create output directory %s\n' "scraped404s" >&2
    exit 1
}

#######################################
# Request a path that should not exist on a host and keep its 404 page.
# Globals:
#   path (read) - request path appended to the host
# Arguments:
#   $1 - host name (or URL prefix) to probe
# Outputs:
#   Progress on stdout, failures on stderr. On a 404 the response body is
#   written to scraped404s/404.<host>.html (any '/' in the host is stored
#   as '_' in the file name).
# Returns:
#   0
#######################################
scrape() {
    local host="${1:-}"
    local body status

    # Blank input is not a host; there is nothing to request.
    if [[ -z "$host" ]]; then
        return 0
    fi

    # A '/' in the host (e.g. a "http://" prefix or "../") must not turn the
    # output file name into a path below or outside of scraped404s/.
    body="scraped404s/404.${host//\//_}.html"

    printf 'requesting "%s/%s"\n' "$host" "$path"

    # -w '%{http_code}' makes curl print just the status code of the final
    # response (after redirects) on stdout, so no header parsing is needed.
    # curl reports 000 when it never received an HTTP response.
    status=$(curl -sSL "$host/$path" -H "Accept: text/html" --max-time 10 \
        -o "$body" -w '%{http_code}')

    if [[ "$status" == "404" ]]; then
        printf '\033[36m  ->  saved %s\033[0m\n' "${body##*/}"
    elif [[ -n "$status" && "$status" != "000" ]]; then
        # Not a 404 page: discard whatever was downloaded.
        rm -f -- "$body"
        printf '\033[31m  !!  %s responded with status %s\033[0m\n' \
            "$host" "$status" >&2
    else
        # Drop a possibly partial download as well.
        rm -f -- "$body"
        printf '\033[31m  !!  no valid response from %s\033[0m\n' "$host" >&2
    fi
}

# Fan out: start one background scrape per host read from stdin, then block
# until every job has finished.
# - `read -r` keeps backslashes literal; the `|| [[ -n ... ]]` part also
#   processes a final line that lacks a trailing newline.
# - Only the first whitespace-separated field of a line is used as the host.
while read -r host _ || [[ -n "$host" ]]; do
    host="${host%$'\r'}"          # tolerate CRLF line endings
    [[ -n "$host" ]] || continue  # skip blank lines
    scrape "$host" &
done
wait
	#!/bin/bash

	### 404 SCRAPER ###

	# This script reads hostnames from stdin (one per line).
	# It sends a request for a nonsense file to each host.
	# On a 404 response the message body is saved to ./scraped404s/

	# Request path that is expected to be missing on every host, so that the
	# server answers with its 404 page.
	path="this-path-probably-doesnt-exist"

	# Output directory for the saved pages. Abort early when it cannot be
	# created: without it no download can be written and every host would be
	# reported as a failure.
	mkdir -p -- "scraped404s" || {
	    printf 'cannot create output directory %s\n' "scraped404s" >&2
	    exit 1
	}

	#######################################
	# Request a path that should not exist on a host and keep its 404 page.
	# Globals:
	#   path (read) - request path appended to the host
	# Arguments:
	#   $1 - host name (or URL prefix) to probe
	# Outputs:
	#   Progress on stdout, failures on stderr. On a 404 the response body is
	#   written to scraped404s/404.<host>.html (any '/' in the host is stored
	#   as '_' in the file name).
	# Returns:
	#   0
	#######################################
	scrape() {
	    local host="${1:-}"
	    local body status

	    # Blank input is not a host; there is nothing to request.
	    if [[ -z "$host" ]]; then
	        return 0
	    fi

	    # A '/' in the host (e.g. a "http://" prefix or "../") must not turn
	    # the output file name into a path below or outside of scraped404s/.
	    body="scraped404s/404.${host//\//_}.html"

	    printf 'requesting "%s/%s"\n' "$host" "$path"

	    # -w '%{http_code}' makes curl print just the status code of the final
	    # response (after redirects) on stdout, so the header dump no longer
	    # has to be piped through a parser. curl reports 000 when it never
	    # received an HTTP response.
	    status=$(curl -sSL "$host/$path" -H "Accept: text/html" --max-time 10 \
	        -o "$body" -w '%{http_code}')

	    if [[ "$status" == "404" ]]; then
	        printf '\033[36m -> saved %s\033[0m\n' "${body##*/}"
	    elif [[ -n "$status" && "$status" != "000" ]]; then
	        # Not a 404 page: discard whatever was downloaded.
	        rm -f -- "$body"
	        printf '\033[31m !! %s responded with status %s\033[0m\n' \
	            "$host" "$status" >&2
	    else
	        # Drop a possibly partial download as well.
	        rm -f -- "$body"
	        printf '\033[31m !! no valid response from %s\033[0m\n' "$host" >&2
	    fi
	}

	# Fan out: start one background scrape per host read from stdin, then
	# block until every job has finished.
	# - `read -r` keeps backslashes literal; the `|| [[ -n ... ]]` part also
	#   processes a final line that lacks a trailing newline.
	# - Only the first whitespace-separated field of a line is used as host.
	while read -r host _ || [[ -n "$host" ]]; do
	    host="${host%$'\r'}"          # tolerate CRLF line endings
	    [[ -n "$host" ]] || continue  # skip blank lines
	    scrape "$host" &
	done
	wait