Crawl a site's links and report on the responses in a log file.
# --spider check that pages exist without saving them (HEAD where possible; HTML is still fetched to find links when recursing)
# --no-verbose gives minimal output (~1/4 the lines)
# -o send the log output to a file
# -e robots=off ignores robots.txt (but you should play nice -- so don't use this)
# -w 1 waits one second between requests
# --random-wait uses -w to vary the pause between 0.5 and 1.5 * wait seconds
# -r recursive (this is the crawler part)
# -nd don't create a hierarchy of directories when retrieving recursively (prevents inode issues)
# -p also request all page requisites (images, CSS, scripts)
wget --spider --no-verbose -o ~/example-crawl.log -w 1 --random-wait -r -nd -p http://www.example.com
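
Once the crawl finishes, the log is just text, so the report can be pulled out with grep. A sketch, assuming the status strings GNU wget writes in --no-verbose/spider mode (the exact wording varies a little between versions):

# show every logged response that wasn't a 200 OK
grep -v '200 OK' ~/example-crawl.log

# in spider mode wget also calls out dead URLs explicitly;
# -B1 includes the line above, which names the offending URL
grep -B1 'broken link' ~/example-crawl.log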
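To crawl pages that sit behind a login, first POST the credentials and keep the session cookies around for later requests. Here user, password, and the login URL are stand-ins for whatever the target site's form actually uses (cookies are on by default, so --cookies=on only makes that explicit):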
wget --cookies=on --save-cookies cookies.txt --keep-session-cookies --post-data 'user=labnol&password=123' http://example.com/login
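
With cookies.txt saved, the same spider crawl from above can reach the protected pages by loading the cookie jar first; the member-area URL below is hypothetical:

# reuse the saved session for the authenticated crawl
wget --spider --no-verbose -o ~/example-crawl.log -w 1 --random-wait -r -nd -p \
     --load-cookies cookies.txt http://example.com/members/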