@MikeNGarrett, created December 13, 2017 21:11
Crawl a site to find 404, 301, 302, 500, etc. responses
# Crawl a site's public URLs to produce a CSV of URLs and response codes.
# This could be reduced to a single command, but it is helpful to have a list of all URLs.
# Overview: crawl the site and write one URL per line to a text file.
# NOTE: this must run to completion before the next step.
# wget mirrors the site (including static files); its log goes to stderr, hence 2>&1.
# grep keeps only the log lines that show the requested URL, e.g.
#   --2017-12-13 21:11:00--  https://domain.com/page
# awk grabs the 3rd space-separated field (the URL) and writes it to urls.txt.
wget --mirror -p https://domain.com/ 2>&1 | grep '^--' | awk '{ print $3 }' > urls.txt
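# Optional extra step (not part of the original gist): the wget log can report the same
# URL more than once, so a small sketch like this deduplicates the list before testing.
# Assumes a sort that supports -u (unique) and -o (write back to the same file).
sort -u urls.txt -o urls.txt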
# Overview: given a file with one URL per line, output a CSV of URLs and response codes.
# cat reads the file.
# xargs executes curl once per URL (-n 1) across up to 10 parallel processes (-P 10).
# --user-agent matches Googlebot's user agent string.
# --head sends a HEAD request only; -o /dev/null discards any body.
# --write-out replaces curl's default output with our own format. Available variables:
#   https://ec.haxx.se/usingcurl-verbose.html#available---write-out-variables
# tee writes the piped output to tested-urls.csv while still printing it to the terminal.
cat urls.txt | xargs -n 1 -P 10 curl --user-agent "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)" -o /dev/null --silent --head --write-out '%{url_effective};%{http_code};\n' | tee tested-urls.csv
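# Optional follow-up (an assumption, not in the original gist): keep only the URLs that
# did not return 200 so redirects and errors stand out. awk splits on the ';' delimiter
# used by --write-out above; problem-urls.csv is a hypothetical output file name.
awk -F';' '$2 != 200' tested-urls.csv > problem-urls.csv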