wget crawler
#!/bin/bash
#
# Crawls a domain
# Retrieves all visible URLs and their page titles
# Saves to CSV
# $1 = URL
# $2 = CSV filename (without extension)
#
# USAGE:
# Save this script as, say, "crawler.sh".
# Then run "chmod +x crawler.sh".
# Then run the script, passing the URL of the site you want to crawl and an
# output filename, for example: ./crawler.sh http://www.someSite.com output
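
# A minimal argument check (an addition to the original gist): bail out with a
# usage hint if either positional parameter is missing.
if [ $# -lt 2 ]; then
    echo "Usage: $0 <url> <output-csv-name-without-extension>"
    exit 1
fi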
# Text color variables
txtund=$(tput sgr 0 1)            # Underline
txtbld=$(tput bold)               # Bold
bldred=${txtbld}$(tput setaf 1)   # red
bldblu=${txtbld}$(tput setaf 4)   # blue
bldgreen=${txtbld}$(tput setaf 2) # green
bldwht=${txtbld}$(tput setaf 7)   # white
txtrst=$(tput sgr0)               # Reset
info=${bldwht}*${txtrst}          # Feedback
pass=${bldblu}*${txtrst}
warn=${bldred}*${txtrst}
ques=${bldblu}?${txtrst}
printf "%s=== Crawling $1 === %s" "$bldgreen" "$txtrst" | |
# wget in spider mode, output tee'd to the wglog file
# man wget is your friend, but...
# --reject ignores specific file types (images, JavaScript, etc.)
# --reject-regex ignores URL parts (e.g. skip URLs with query strings via --reject-regex "(.*)\?(.*)")
# --no-check-certificate (be careful!) skips SSL certificate validation
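# Two more stock wget options worth considering for politeness (not enabled
# here): --wait=1 pauses between requests, --level=N caps recursion depth.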
wget --reject-regex "(.*)\?(.*)" --no-check-certificate --spider --recursive --no-clobber --no-directories "$1" --reject bmp,css,gif,ico,jpg,jpeg,js,mp3,mp4,pdf,png,swf,txt,xml,xls,zip,eot,svg,ttf,woff,woff2,rdf 2>&1 | tee wglog
printf " %s========================================== \n" "$bldgreen" | |
printf "%s=== Crawl Finished... ===%s \n" "$bldgreen" "$txtrst" | |
printf "%s=== Begin retreiving page titles... ===%s \n" "$bldgreen" "$txtrst" | |
printf "%s========================================== \n" "$dgreen" | |
printf "%s** Run tail -f $1.csv for progress%s \n" "$bldred" "$txtrst" | |
# from wglog, grab URLs
# curl each URL and grep the title
grep '^--' wglog | awk '{print $3}' | sort -u | while read -r url; do
printf "%s* Retreiving title for: %s$url%s \n" "$bldgreen" "$txtrst$txtbld" "$txtrst" | |
printf "\"${url}\",\"$(curl -s ${url} | sed -n 's/.*<title>\(.*\)<\/title>.*/\1/ip;T;q')\"\n" >>$2.csv | |
}; done | |
# clean up log file
rm wglog
exit 0