@T31337 (created July 4, 2017)
Download From Websites With wget And curl
#!/bin/bash
# Allow more advanced pattern matching (for case..esac below)
shopt -s extglob
#Variables
dir="$HOME/Downloads/web/" #What Directory Do We Save Files To?
ok="ok.txt"
notok="bad_urls.txt" # We Won't Be Needing This... But It Might Be Handy...
sitelist="download.txt" #Can Also Be Named urls.txt Or url-list.txt
siteFile="site.html" #This Is The File We Save The Scraped Text To
curl="/usr/bin/curl" #CURL Binary Location
ACCEPT='jpg,png,jpeg,gif,bmp,img,ico,svg' #File Accept/Download List
REJECT='tmp,htm,php,js,html' #File Reject/Decline List (Used With wget -R Below)
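# The two lists above are meant for wget's -A/--accept and -R/--reject options,
# which take comma-separated file-name suffixes or patterns, so they can be
# passed straight through. Illustrative sketch (the URL is just a placeholder):
#   wget -r -nd -A "$ACCEPT" -R "$REJECT" "https://example.com/gallery/"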
#Allow Direct Access To Downloading Files If A Url Is Passed As Argument
if [ $# -eq 1 ]; then
wget -r -k -K -p -e robots=off --no-parent -nd -A "$ACCEPT" -R "$REJECT" --continue -P "$dir" "$1"
echo "Done!"
exit 0
fi
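# Illustrative usage of the direct mode above (the script name is a placeholder
# for whatever you saved this file as):
#   ./download.sh "https://example.com/gallery/"
# This skips the menu below and mirrors matching files into $dir.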
function downloadFiles()
{
if [ ! -f "$ok" ] || [ ! -s "$ok" ]; then
echo "$ok File Missing Or Empty!"
echo "Will Now Exit!"
exit 1
fi
#Download Files From Url List
while read -r url; do
wget -r -k -K -p -e robots=off --no-parent -nd -A "$ACCEPT" -R "$REJECT" --continue -P "$dir" "$url"
done < "$ok"
echo "Done!"
}
function webCrawler()
{
uri="$1"
element="$2"
wget -qO- "$uri" |
hxnormalize -x |
hxselect "$element" > "$siteFile"
# | lynx -stdin -dump -nolist # You Can Use This Instead, Someone Might Like This, But I Just Want To Save The File
echo "Saved To $siteFile"
}
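# Illustrative call, assuming html-xml-utils (hxnormalize/hxselect) is installed;
# hxselect takes a CSS selector, and the matched fragment is written to $siteFile:
#   webCrawler "https://example.com" "div.content"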
# Some errors, for good measure...
if [[ ! -f "$sitelist" ]]; then
echo "ERROR: $sitelist is missing. Checking For url-list.txt"
#echo "Creating Empty url-list.txt" >&2
#touch $sitelist
sitelist="url-list.txt"
fi
if [[ ! -f "$sitelist" ]]; then
echo "$sitelist Missing, Checking For download.txt..."
#touch $sitelist
sitelist="download.txt"
fi
if [[ ! -f "$sitelist" ]]; then
echo "$sitelist Missing, Creating Blank Donwload List..."
touch $sitelist
fi
if [[ ! -s "$sitelist" ]]; then
echo "ERROR: $sitelist is empty." >&2
exit 1
elif [[ ! -x "$curl" ]]; then
echo "ERROR: I can't work under these conditions."
exit 1
fi
# Allow more advanced pattern matching (for case..esac below)
shopt -s extglob
#shopt -s globstar
#shopt -s nocaseglob
function checkURLS
{
echo "Loading..."
echo "Removing Old Outdated Files..."
rm -f "$ok"
rm -f "$notok"
echo "Checking URLS..."
while read -r url; do
# remove comments
url=${url%%#*}
# skip empty lines
if [[ -z "$url" ]]; then
continue
fi
# Handle just ftp, http and https.
# We could do full URL pattern matching, but meh.
case "$url" in
@(f|ht)tp?(s)://*)
# Get just the numeric HTTP response code
http_code=$($curl -sL -w '%{http_code}' "$url" -o /dev/null)
case "$http_code" in
200|226|2*)
# You'll get a 226 in ${http_code} from a valid FTP URL.
# If all you really care about is that the response is in the 200's,
# you could match against "2??" instead.
echo "$url" >> $ok
;;
*)
# You might want different handling for redirects (301/302).
echo "$url | $http_code" >> $notok
;;
esac
;;
*)
# If we're here, we didn't get a URL we could read.
echo "WARNING: invalid url: $url" # >&2
;;
esac
done < "$sitelist"
echo "$ok Created Form Valid URLS In $sitelist"
}
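# Illustrative contents for $sitelist (one URL per line; anything after a '#'
# is stripped by checkURLS above):
#   https://example.com/images/   # a page to mirror with wget
#   ftp://ftp.example.org/pub/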
checkURLS
#Infinite While Loop To Keep Menu Open On Bad Input
while :
do
########Menu#########
printf "Choose from the following operations:\n"
printf "[1] Downlaod Files From download.txt With wget\n"
printf "[2] Donwlaod/Scrape Webpage With curl\n"
printf "\n"
read -p "Your choice: " op
case $op in
1)
#Download Files From ok.txt
downloadFiles
break
;;
2)
#Spider/View
read -p 'URL: ' url
#read -sp 'Password: ' passvar
read -p "Enter Element: " element
webCrawler "$url" "$element"
break
;;
*)
echo "invalid Input, Please Try Again."
esac
done
echo -e "\nThank You For Using My Simple Script, Please Feel Free To Modify/Share It!\n"
sleep 5
exit 0