Created
September 22, 2016 16:24
-
-
Save steveosoule/b3835d29a030eb2a2a3a87d287bb73cf to your computer and use it in GitHub Desktop.
WGET Crawl Script
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/sh | |
# wget --mirror --adjust-extension --page-requisites --execute robots=off --wait=30 --random-wait --convert-links --user-agent=Mozilla http://www.example.com
### V1 | |
# wget \ | |
# --recursive \ | |
# --no-clobber \ | |
# --page-requisites \ | |
# --html-extension \ | |
# --convert-links \ | |
# --restrict-file-names=windows \ | |
# --domains www.example.com \ | |
# --no-parent \ | |
# www.example.com | |
### V2 | |
# wget \ | |
# --recursive \ | |
# --no-clobber \ | |
# --page-requisites \ | |
# --html-extension \ | |
# --convert-links \ | |
# --execute robots=off \ | |
# --restrict-file-names=windows \ | |
# --domains www.example.com \ | |
# --no-parent \ | |
# www.example.com | |
# wget \ | |
# --user-agent='Mozilla/5.0 (iPhone; CPU iPhone OS 9_1 like Mac OS X) AppleWebKit/601.1 (KHTML, like Gecko) CriOS/52.0.2725.0 Mobile/13B143 Safari/601.1.46' \ | |
# --execute robots=off \ | |
# --recursive \ | |
# --mirror \ | |
# --wait=10 \ | |
# --random-wait \ | |
# www.example.com \ | |
# 2>&1 | grep '^--' | awk '{ print $3 }' | grep -v '\.\(css\|js\|png\|gif\|jpg\|JPG\)$' > www.example.com.txt | |
# wget \ | |
# --mirror \ | |
# --recursive \ | |
# --execute robots=off \ | |
# --user-agent='Mozilla/5.0 (iPhone; CPU iPhone OS 9_1 like Mac OS X) AppleWebKit/601.1 (KHTML, like Gecko) CriOS/52.0.2725.0 Mobile/13B143 Safari/601.1.46' \ | |
# --timestamping \ | |
# --page-requisites \ | |
# --html-extension \ | |
# --restrict-file-names=windows \ | |
# --wait=1 \ | |
# --random-wait \ | |
# --domains www.example.com \ | |
# --debug \ | |
# --output-file=sample.log \ | |
# --progress=dot \ | |
# --directory-prefix=sample \ | |
# www.example.com | |
# wget \ | |
# --mirror \ | |
# --recursive \ | |
# --execute robots=off \ | |
# --user-agent='Mozilla/5.0 (iPhone; CPU iPhone OS 9_1 like Mac OS X) AppleWebKit/601.1 (KHTML, like Gecko) CriOS/52.0.2725.0 Mobile/13B143 Safari/601.1.46' \ | |
# --timestamping \ | |
# --page-requisites \ | |
# --html-extension \ | |
# --restrict-file-names=windows \ | |
# --wait=1 \ | |
# --random-wait \ | |
# --domains www.example.com \ | |
# --progress=bar \ | |
# www.example.com | |
# Mirror www.example.com for offline browsing.
#
# Flag notes (comments cannot be interleaved with a backslash-continued
# command, so they are collected here):
#   --mirror                      recursive download with timestamping and
#                                 unlimited depth; it already implies
#                                 --recursive and --timestamping, so those
#                                 are not repeated.
#   --execute robots=off          do not honour robots.txt exclusions.
#   --user-agent=...              present as a desktop Chrome browser.
#   --page-requisites             also fetch the images/CSS/JS each page needs.
#   --adjust-extension            current name of the deprecated
#                                 --html-extension option.
#   --restrict-file-names=windows escape characters that are not valid in
#                                 Windows file names.
#   --wait=1 --random-wait        --random-wait only varies the delay set by
#                                 --wait; without --wait it has no effect.
#   --convert-links               rewrite links so the local copy is browsable.
#   --domains www.example.com     never follow links to other hosts.
wget \
  --mirror \
  --execute robots=off \
  --user-agent='Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2725.0 Safari/537.36' \
  --page-requisites \
  --adjust-extension \
  --restrict-file-names=windows \
  --wait=1 \
  --random-wait \
  --convert-links \
  --domains www.example.com \
  www.example.com
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment