@robmiller
Last active August 27, 2016 11:08
Use wget to spider a site and output which URLs (of pages or resources within those pages, such as stylesheets or images) returned a 404 status.
#!/bin/zsh
#
# 404s
#
# Usage:
#
#   1. Download this script, rename it to 404s, and put it somewhere in
#      your $PATH (~/bin is a good place)
#   2. Run the script:
#
#      $ 404s http://example.com
#
#   To get just the URLs and not progress updates, silence stderr:
#
#      $ 404s http://example.com 2>/dev/null
SITE="$1"
HOSTNAME=$(echo "$SITE" | ruby -ruri -ne 'puts URI.parse($_.chomp).hostname rescue nil')
if [ -z "$HOSTNAME" ]; then
	echo "Invalid URL specified" 1>&2
	exit 1
fi
LOG_FILE="$HOSTNAME.log"
echo "Spidering site..." 1>&2
wget --spider -o "$LOG_FILE" -e robots=off -w 0.1 -r -p -nd --delete-after "$SITE"
echo "URLs that returned 404s:" 1>&2
grep -B2 '404 Not Found' "$LOG_FILE" | grep -E 'https?:' | ruby -pe 'gsub(/^--.+-- /, "")'
rm "$LOG_FILE"
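The script shells out to Ruby only to pull the hostname out of the URL. If you'd rather avoid the Ruby dependency, a rough equivalent using plain shell parameter expansion might look like this (a sketch that assumes a well-formed http(s) URL, with no validation of odd inputs):

```shell
# url_hostname: extract the hostname from an http(s) URL using only
# parameter expansion -- a hedged stand-in for the ruby one-liner above.
url_hostname() {
	local url="${1#*://}"        # strip the scheme, e.g. "http://"
	url="${url%%/*}"             # drop the path, if any
	printf '%s\n' "${url%%:*}"   # drop a ":port" suffix, if any
}

url_hostname "http://example.com/some/page"
url_hostname "https://example.com:8080/other"
```

The other Ruby call, which strips wget's `--timestamp-- ` prefix from matched log lines, could similarly be replaced with `sed 's/^--.*-- //'` if you want a Ruby-free pipeline.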