Download From Websites With wget And curl
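A menu-driven Bash script: it validates a list of URLs with curl, bulk-downloads files from the good ones with wget, and can scrape a single HTML element from a page with hxselect.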
#!/bin/bash
# Allow more advanced pattern matching (for case..esac below)
shopt -s extglob

# Variables
dir="$HOME/Downloads/web/"   # What Directory Do We Save Files To?
ok="ok.txt"
notok="bad_urls.txt"         # We Won't Be Needing This... But It Might Be Handy...
sitelist="download.txt"      # Can Also Be Named urls.txt Or url-list.txt
siteFile="site.html"         # This Is The File We Save The Scraped Text To
curl="/usr/bin/curl"         # curl Binary Location
ACCEPT='jpg,png,jpeg,gif,bmp,img,ico,svg'   # File Accept/Download List
REJECT='tmp,htm,php,js,html'                # File Reject/Decline List

# Allow Direct Access To Downloading Files If A URL Is Passed As An Argument
if [ $# -eq 1 ]; then
    wget -r -k -K -p -e robots=off --no-parent -nd -A "$ACCEPT" -R "$REJECT" --continue -P "$dir" "$1"
    echo "Done!"
    exit 0
fi
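
# For Reference, The wget Flags Used Above And In downloadFiles Below:
#   -r recurse, -k convert links for local viewing, -K keep .orig backups,
#   -p fetch page requisites (images/CSS), -e robots=off ignore robots.txt,
#   --no-parent never ascend above the start URL, -nd don't recreate the
#   site's directory tree, -A/-R comma-separated accept/reject suffix lists,
#   --continue resume partial downloads, -P save under this directory.
# Usage Sketch (example URL, assuming the script is saved as download.sh):
#   ./download.sh https://example.com/gallery/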
function downloadFiles()
{
    if [ ! -f "$ok" ] || [ ! -s "$ok" ]; then
        echo "$ok File Missing Or Empty!"
        echo "Will Now Exit!"
        exit 1
    fi
    # Download Files From URL List
    while read -r url; do
        wget -r -k -K -p -e robots=off --no-parent -nd -A "$ACCEPT" -R "$REJECT" --continue -P "$dir" "$url"
    done < "$ok"
    echo "Done!"
}

function webCrawler()
{
    uri=$1
    element=$2
    wget -qO- "$uri" |
        hxnormalize -x |
        hxselect "$element" > "$siteFile"
        # | lynx -stdin -dump -nolist # You Can Use This Instead; Someone Might Like This, But I Just Want To Save The File
    echo "Saved To $siteFile"
}
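
# hxnormalize And hxselect Come From The W3C html-xml-utils Package
# (e.g. `apt install html-xml-utils` on Debian/Ubuntu).
# Example (hypothetical URL and CSS selector):
#   webCrawler "https://example.com" "title"
# saves that page's <title> element to site.html.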
# Some errors, for good measure...
if [[ ! -f "$sitelist" ]]; then
    echo "ERROR: $sitelist is missing. Checking For url-list.txt"
    #echo "Creating Empty url-list.txt" >&2
    #touch "$sitelist"
    sitelist="url-list.txt"
fi
if [[ ! -f "$sitelist" ]]; then
    echo "$sitelist Missing, Checking For urls.txt..."
    #touch "$sitelist"
    sitelist="urls.txt"
fi
if [[ ! -f "$sitelist" ]]; then
    echo "$sitelist Missing, Creating Blank Download List..."
    touch "$sitelist"
fi
if [[ ! -s "$sitelist" ]]; then
    echo "ERROR: $sitelist is empty." >&2
    exit 1
elif [[ ! -x "$curl" ]]; then
    echo "ERROR: $curl Is Missing Or Not Executable. I can't work under these conditions." >&2
    exit 1
fi

# Other Globbing Modes You Might Want:
#shopt -s globstar
#shopt -s nocaseglob

function checkURLS
{
    echo "Loading..."
    echo "Removing Old Outdated Files..."
    rm -f "$ok"
    rm -f "$notok"
    echo "Checking URLS..."
    while read -r url; do
        # remove comments
        url=${url%%#*}
        # skip empty lines
        if [[ -z "$url" ]]; then
            continue
        fi
        # Handle just ftp, http and https.
        # We could do full URL pattern matching, but meh.
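        # With extglob On, @(f|ht)tp?(s)://* Matches: ftp://, ftps://,
        # http://, And https://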
case "$url" in | |
@(f|ht)tp?(s)://*) | |
# Get just the numeric HTTP response code | |
http_code=$($curl -sL -w '%{http_code}' "$url" -o /dev/null) | |
case "$http_code" in | |
200|226|2*) | |
# You'll get a 226 in ${http_code} from a valid FTP URL. | |
# If all you really care about is that the response is in the 200's, | |
# you could match against "2??" instead. | |
echo "$url" >> $ok | |
;; | |
*) | |
# You might want different handling for redirects (301/302). | |
echo "$url | $http_code" >> $notok | |
;; | |
esac | |
;; | |
*) | |
# If we're here, we didn't get a URL we could read. | |
echo "WARNING: invalid url: $url" # >&2 | |
;; | |
esac | |
done < "$sitelist" | |
echo "$ok Created Form Valid URLS In $sitelist" | |
} | |
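
# Example download.txt (URLs Are Placeholders; Anything After # Is A Comment):
#   https://example.com/images/   # a page to pull files from
#   ftp://ftp.example.org/pub/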
checkURLS

# Infinite While Loop To Keep Menu Open On Bad Input
while :
do
    ########Menu#########
    printf "Choose from the following operations:\n"
    printf "[1] Download Files From download.txt With wget\n"
    printf "[2] Download/Scrape Webpage With wget + hxselect\n"
    printf "\n"
    read -p "Your choice: " op
    case $op in
        1)
            # Download Files From ok.txt
            downloadFiles
            break
            ;;
        2)
            # Spider/View
            read -p 'URL: ' url
            #read -sp 'Password: ' passvar
            read -p "Enter Element: " element
            webCrawler "$url" "$element"
            break
            ;;
        *)
            echo "Invalid Input, Please Try Again."
            ;;
    esac
done

echo -e "\nThank You For Using My Simple Script, Please Feel Free To Modify/Share It!\n"
sleep 5
exit 0