joshkoenig/spiderme.sh

## spiderme.sh
#!/bin/sh

# Spiderme - quick and clean benchmarking for website performance on Pantheon.
#
# This script uses wget to "spider" your website to provide a good set of data
# on site performance. It will automatically bypass Pantheon's edge cache, and
# skip images, javascript, css, etc. It will also only spider links that are
# under the multidev environment you are spidering.
#
#
# USAGE
#
# ./spiderme.sh https://envname-sitename.pantheonsites.io
#
#
# INSTALLATION
#
# Just download and make the script executable:
#
# chmod +x spiderme.sh
#
# This script requires the common Linux utilities 'wget' and 'timeout'.
# You can install these for macOS via Homebrew.
#
# $> brew install wget
# $> brew install coreutils
#
#
# HOW TO BENCHMARK
# For best results you'll want to work on a fresh environment to avoid any
# confusion about what you are measuring. Step by step instructions:
#
# 1) Set up a fresh multidev environment to test your performance changes.
#
# 2) Ensure New Relic is enabled and capturing data for the environment
#    you intend to test.
#
# 3) Clear the cache for the environment.
#
# 4) Run the script to establish baseline performance:
#    ./spiderme.sh https://envname-sitename.pantheonsites.io
#
# 5) Push your performance change (e.g. enable PHP 7)
#
# 6) Clear the cache for the environment.
#
# 7) Run the script again.
#
# This should give you a solid "side by side" comparison of before and
# after performance. You'll have two sets of data, each consisting of a
# cold and warm spider pass on the website. This should give you a sense
# of how much benefit your performance improvements deliver in under 30 mins.

# Assumes that the script takes a url to a multidev env as an arg.
#
# e.g. https://lcache-outlandish-josh.pantheonsites.io
#
# Thanks to http://stackoverflow.com/questions/6174220/parse-url-in-shell-script

# Split the target URL into the pieces the rest of the script needs.
#
# Arguments:
#   $1 - URL of the environment to spider, e.g.
#        https://envname-sitename.pantheonsites.io
# Sets (globals):
#   PROTO - scheme including the separator ("https://"), empty if none given
#   URL   - the argument with PROTO stripped from the front
#   HOST  - bare host name: no credentials, path, query string or fragment
#
# Only POSIX parameter expansion is used, so this runs under any /bin/sh,
# never word-splits or glob-expands the argument, and does not overwrite
# the login-name environment variable USER.
spiderme_parse_url() {
  PROTO=''
  case "$1" in
    *://*)
      # Shortest leading piece before the FIRST "://".
      PROTO="${1%%://*}"
      case "$PROTO" in
        # Not a valid scheme: the "://" belongs to something later in the
        # URL (for example a redirect target inside the query string).
        '' | *[!A-Za-z0-9+.-]*) PROTO='' ;;
        *) PROTO="$PROTO://" ;;
      esac
      ;;
  esac

  # Remove the protocol (quoted so it is matched literally, not as a glob).
  URL="${1#"$PROTO"}"

  # The authority part ends at the first "/", "?" or "#".
  HOST="${URL%%[/?#]*}"
  # Drop any "user[:password]@" prefix; an "@" further along the path is
  # no longer inside HOST at this point, so it cannot be mistaken for one.
  HOST="${HOST##*@}"
}

spiderme_parse_url "${1:-}"

# Set proper timeout command.
# On macOS (where uname prints "Darwin") Homebrew's coreutils installs GNU
# timeout under the name 'gtimeout'; everywhere else it is plain 'timeout'.
# Written with a POSIX `case` so it also works when /bin/sh is not bash.
unamestr="$(uname)"
case "$unamestr" in
  Darwin) TIMEOUT_COMMAND='gtimeout' ;;
  *)      TIMEOUT_COMMAND='timeout' ;;
esac

# Wget is more efficient when it doesn't waste time downloading static files.
# Add file extensions here if you have other file types in play.
REJECT_FILES='jpg,png,pdf,css,js,eot,svg,gif,ico,xml,ttf,mp3,mov,mpg,mp4'

# Request header sent with every fetch (the NO_CACHE cookie).
COOKIE_HEADER='Cookie: NO_CACHE=1'

# Refuse to run without a target: an empty HOST would make wget crawl
# nothing and leave the clean-up step without a directory to remove.
if [ -z "$HOST" ]; then
  echo "Usage: $0 https://envname-sitename.pantheonsites.io" >&2
  exit 1
fi

# Fail early, with a clear message, when a required tool is not installed.
for tool in wget "$TIMEOUT_COMMAND"; do
  if ! command -v "$tool" >/dev/null 2>&1; then
    echo "$0: required command '$tool' not found" >&2
    exit 1
  fi
done

# Run one recursive crawl of $PROTO$HOST, limited to 300 seconds and to
# links on $HOST. Every argument is passed to wget as its own quoted word
# instead of going through `eval`, so characters such as "&", ";" or "$("
# in the host part of the URL are never interpreted by the shell.
spider_me() {
  "$TIMEOUT_COMMAND" 300s wget \
    --reject "$REJECT_FILES" \
    -e robots=off \
    --header "$COOKIE_HEADER" \
    -r -l inf \
    "-D$HOST" \
    "$PROTO$HOST"
}

# Main script. You shouldn't need to edit below this line.
echo "Starting first pass on $URL"
echo "Output is suppressed, just sit tight."
echo "This will go for a maximum of five minutes..."
echo
# Having trouble? The line below prints the wget command being run so you
# can copy it and start picking it apart.
echo "$TIMEOUT_COMMAND 300s wget --reject $REJECT_FILES -e robots=off --header \"$COOKIE_HEADER\" -r -l inf -D$HOST $PROTO$HOST"
spider_me
echo "First pass complete!"
# wget -r saves what it fetched under a directory named after the host;
# remove it. "--" and ${HOST:?} keep rm from ever seeing an option or an
# empty operand.
rm -rf -- "${HOST:?}"
echo "Pausing for two minutes to create neat break in New Relic..."
echo
sleep 120
echo "Starting second pass on $URL..."
echo
spider_me
echo "Complete!"
# Clean up
rm -rf -- "${HOST:?}"
	#!/bin/sh

	# Spiderme - quick and clean benchmarking for website performance on Pantheon.
	#
	# This script uses wget to "spider" your website to provide a good set of data
	# on site performance. It will automatically bypass Pantheon's edge cache, and
	# skip images, javascript, css, etc. It will also only spider links that are
	# under the multidev environment you are spidering.
	#
	#
	# USAGE
	#
	# ./spiderme.sh https://envname-sitename.pantheonsites.io
	#
	#
	# INSTALLATION
	#
	# Just download and make the script executable:
	#
	# chmod +x spiderme.sh
	#
	# This script requires the common Linux utilities 'wget' and 'timeout'.
	# You can install these for macOS via Homebrew.
	#
	# $> brew install wget
	# $> brew install coreutils
	#
	#
	# HOW TO BENCHMARK
	# For best results you'll want to work on a fresh environment to avoid any
	# confusion about what you are measuring. Step by step instructions:
	#
	# 1) Set up a fresh multidev environment to test your performance changes.
	#
	# 2) Ensure New Relic is enabled and capturing data for the environment
	# you intend to test.
	#
	# 3) Clear the cache for the environment.
	#
	# 4) Run the script to establish baseline performance:
	# ./spiderme.sh https://envname-sitename.pantheonsites.io
	#
	# 5) Push your performance change (e.g. enable PHP 7)
	#
	# 6) Clear the cache for the environment.
	#
	# 7) Run the script again.
	#
	# This should give you a solid "side by side" comparison of before and
	# after performance. You'll have two sets of data, each consisting of a
	# cold and warm spider pass on the website. This should give you a sense
	# of how much benefit your performance improvements deliver in under 30 mins.

	# Assumes that the script takes a url to a multidev env as an arg.
	#
	# e.g. https://lcache-outlandish-josh.pantheonsites.io
	#
	# Thanks to http://stackoverflow.com/questions/6174220/parse-url-in-shell-script

	# Split the target URL into the pieces the rest of the script needs.
	#
	# Arguments:
	#   $1 - URL of the environment to spider, e.g.
	#        https://envname-sitename.pantheonsites.io
	# Sets (globals):
	#   PROTO - scheme including the separator ("https://"), empty if none given
	#   URL   - the argument with PROTO stripped from the front
	#   HOST  - bare host name: no credentials, path, query string or fragment
	#
	# Only POSIX parameter expansion is used (no pipelines), so this runs under
	# any /bin/sh, never word-splits or glob-expands the argument, and does not
	# overwrite the login-name environment variable USER.
	spiderme_parse_url() {
		PROTO=''
		case "$1" in
			*://*)
				# Shortest leading piece before the FIRST "://".
				PROTO="${1%%://*}"
				case "$PROTO" in
					# Not a valid scheme: the "://" belongs to something later in
					# the URL (for example a redirect target in the query string).
					'' | *[!A-Za-z0-9+.-]*) PROTO='' ;;
					*) PROTO="$PROTO://" ;;
				esac
				;;
		esac

		# Remove the protocol (quoted so it is matched literally, not as a glob).
		URL="${1#"$PROTO"}"

		# The authority part ends at the first "/", "?" or "#".
		HOST="${URL%%[/?#]*}"
		# Drop any "user[:password]@" prefix; an "@" further along the path is
		# no longer inside HOST at this point, so it cannot be mistaken for one.
		HOST="${HOST##*@}"
	}

	spiderme_parse_url "${1:-}"

	# Set proper timeout command.
	# On macOS (where uname prints "Darwin") Homebrew's coreutils installs GNU
	# timeout under the name 'gtimeout'; everywhere else it is plain 'timeout'.
	# Written with a POSIX `case` so it also works when /bin/sh is not bash.
	unamestr="$(uname)"
	case "$unamestr" in
		Darwin) TIMEOUT_COMMAND='gtimeout' ;;
		*)      TIMEOUT_COMMAND='timeout' ;;
	esac

	# Wget is more efficient when it doesn't waste time downloading static files.
	# Add file extensions here if you have other file types in play.
	REJECT_FILES='jpg,png,pdf,css,js,eot,svg,gif,ico,xml,ttf,mp3,mov,mpg,mp4'

	# Request header sent with every fetch (the NO_CACHE cookie).
	COOKIE_HEADER='Cookie: NO_CACHE=1'

	# Refuse to run without a target: an empty HOST would make wget crawl
	# nothing and leave the clean-up step without a directory to remove.
	if [ -z "$HOST" ]; then
		echo "Usage: $0 https://envname-sitename.pantheonsites.io" >&2
		exit 1
	fi

	# Fail early, with a clear message, when a required tool is not installed.
	for tool in wget "$TIMEOUT_COMMAND"; do
		if ! command -v "$tool" >/dev/null 2>&1; then
			echo "$0: required command '$tool' not found" >&2
			exit 1
		fi
	done

	# Run one recursive crawl of $PROTO$HOST, limited to 300 seconds and to
	# links on $HOST. Every argument is passed to wget as its own quoted word
	# instead of going through `eval`, so characters such as "&", ";" or "$("
	# in the host part of the URL are never interpreted by the shell.
	spider_me() {
		"$TIMEOUT_COMMAND" 300s wget \
			--reject "$REJECT_FILES" \
			-e robots=off \
			--header "$COOKIE_HEADER" \
			-r -l inf \
			"-D$HOST" \
			"$PROTO$HOST"
	}

	# Main script. You shouldn't need to edit below this line.
	echo "Starting first pass on $URL"
	echo "Output is suppressed, just sit tight."
	echo "This will go for a maximum of five minutes..."
	echo
	# Having trouble? The line below prints the wget command being run so you
	# can copy it and start picking it apart.
	echo "$TIMEOUT_COMMAND 300s wget --reject $REJECT_FILES -e robots=off --header \"$COOKIE_HEADER\" -r -l inf -D$HOST $PROTO$HOST"
	spider_me
	echo "First pass complete!"
	# wget -r saves what it fetched under a directory named after the host;
	# remove it. "--" and ${HOST:?} keep rm from ever seeing an option or an
	# empty operand.
	rm -rf -- "${HOST:?}"
	echo "Pausing for two minutes to create neat break in New Relic..."
	echo
	sleep 120
	echo "Starting second pass on $URL..."
	echo
	spider_me
	echo "Complete!"
	# Clean up
	rm -rf -- "${HOST:?}"