wget crawler
#!/bin/bash
#
# Crawls a domain
# Retrieves all visible URLs and their page titles
# Saves to CSV
# $1 = URL
# $2 = output filename (".csv" will be appended)
#
# USAGE:
# Save this script as, say, "crawler.sh".
# Then "chmod +x crawler.sh".
# Then run the script, passing the URL of the site you want to crawl and an output name, for example:
#   ./crawler.sh http://www.someSite.com output
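#
# The resulting CSV (here, output.csv) holds one quoted "URL","title" pair per line;
# the rows below are purely illustrative:
#   "http://www.someSite.com/","Home"
#   "http://www.someSite.com/about","About Us"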
# Text color variables
txtund=$(tput sgr 0 1)            # Underline
txtbld=$(tput bold)               # Bold
bldred=${txtbld}$(tput setaf 1)   # red
bldblu=${txtbld}$(tput setaf 4)   # blue
bldgreen=${txtbld}$(tput setaf 2) # green
bldwht=${txtbld}$(tput setaf 7)   # white
txtrst=$(tput sgr0)               # Reset
info=${bldwht}*${txtrst}          # Feedback
pass=${bldblu}*${txtrst}
warn=${bldred}*${txtrst}
ques=${bldblu}?${txtrst}
printf "%s=== Crawling $1 === %s" "$bldgreen" "$txtrst"
# wget in Spider mode, outputs to wglog file
# man wget is your friend, but...
# --reject switch to ignore specific file types (images, JavaScript, etc.)
# --reject-regex switch to ignore URL parts (e.g. URLs with question marks, via --reject-regex "(.*)\?(.*)")
# --no-check-certificate switch to (be careful!) ignore security certificates
wget --reject-regex "(.*)\?(.*)" --no-check-certificate --spider --recursive --no-clobber --no-directories --reject bmp,css,gif,ico,jpg,jpeg,js,mp3,mp4,pdf,png,swf,txt,xml,xls,zip,eot,svg,ttf,woff,woff2,rdf "$1" 2>&1 | tee wglog
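# wglog now holds wget's spider output; request lines typically look like
#   --2017-10-13 12:00:00--  http://www.someSite.com/some/page
# (exact timestamp format may vary by wget version), so the "grep '^--'" below
# picks out the request lines and awk's third field is the URL.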
printf " %s========================================== \n" "$bldgreen"
printf "%s=== Crawl Finished... ===%s \n" "$bldgreen" "$txtrst"
printf "%s=== Begin retreiving page titles... ===%s \n" "$bldgreen" "$txtrst"
printf "%s========================================== \n" "$dgreen"
printf "%s** Run tail -f $1.csv for progress%s \n" "$bldred" "$txtrst"
# From wglog, grab the crawled URLs,
# then curl each URL and extract its <title> into the CSV
grep '^--' wglog | awk '{print $3}' | sort -u | while read -r url; do
    printf "%s* Retrieving title for: %s%s%s \n" "$bldgreen" "$txtrst$txtbld" "$url" "$txtrst"
    # Fetch the page, pull out the first <title>...</title> (case-insensitive), and append a CSV row
    printf "\"%s\",\"%s\"\n" "${url}" "$(curl -s "${url}" | sed -n 's/.*<title>\(.*\)<\/title>.*/\1/ip;T;q')" >>"$2".csv
done
# clean up log file
rm wglog
exit