wget crawler
#!/bin/bash
#
# Crawls a domain
# Retrieves all visible URLs and their page titles
# Saves to CSV
# $1 = URL
# $2 = output filename (".csv" will be appended)
#
# USAGE:
# Save this script as, say, "crawler.sh".
# Then "chmod +x crawler.sh".
# Then run the script, passing the URL of the site you want to crawl and an output name, for example:
#   ./crawler.sh http://www.someSite.com output
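#
# The resulting CSV (here, output.csv) holds one quoted "URL","title" pair per line;
# the rows below are purely illustrative:
#   "http://www.someSite.com/","Home"
#   "http://www.someSite.com/about","About Us"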
# Text color variables
txtund=$(tput sgr 0 1)            # Underline
txtbld=$(tput bold)               # Bold
bldred=${txtbld}$(tput setaf 1)   # red
bldblu=${txtbld}$(tput setaf 4)   # blue
bldgreen=${txtbld}$(tput setaf 2) # green
bldwht=${txtbld}$(tput setaf 7)   # white
txtrst=$(tput sgr0)               # Reset
info=${bldwht}*${txtrst}          # Feedback
pass=${bldblu}*${txtrst}
warn=${bldred}*${txtrst}
ques=${bldblu}?${txtrst}
printf "%s=== Crawling $1 === %s" "$bldgreen" "$txtrst"
# wget in Spider mode, outputs to wglog file
# man wget is your friend, but...
# --reject switch to ignore specific file types (images, JavaScript, etc.)
# --reject-regex switch to ignore URL parts (e.g. URLs with question marks, via --reject-regex "(.*)\?(.*)")
# --no-check-certificate switch to (be careful!) ignore security certificates
wget --reject-regex "(.*)\?(.*)" --no-check-certificate --spider --recursive --no-clobber --no-directories --reject bmp,css,gif,ico,jpg,jpeg,js,mp3,mp4,pdf,png,swf,txt,xml,xls,zip,eot,svg,ttf,woff,woff2,rdf "$1" 2>&1 | tee wglog
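# wglog now holds wget's spider output; request lines typically look like
#   --2017-10-13 12:00:00--  http://www.someSite.com/some/page
# (exact timestamp format may vary by wget version), so the "grep '^--'" below
# picks out the request lines and awk's third field is the URL.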
printf " %s========================================== \n" "$bldgreen"
printf "%s=== Crawl Finished... ===%s \n" "$bldgreen" "$txtrst"
printf "%s=== Begin retreiving page titles... ===%s \n" "$bldgreen" "$txtrst"
printf "%s========================================== \n" "$dgreen"
printf "%s** Run tail -f $1.csv for progress%s \n" "$bldred" "$txtrst"
# From wglog, grab the crawled URLs,
# then curl each URL and extract its <title> into the CSV
grep '^--' wglog | awk '{print $3}' | sort -u | while read -r url; do
    printf "%s* Retrieving title for: %s%s%s \n" "$bldgreen" "$txtrst$txtbld" "$url" "$txtrst"
    # Fetch the page, pull out the first <title>...</title> (case-insensitive), and append a CSV row
    printf "\"%s\",\"%s\"\n" "${url}" "$(curl -s "${url}" | sed -n 's/.*<title>\(.*\)<\/title>.*/\1/ip;T;q')" >>"$2".csv
done
# clean up log file
rm wglog
exit