Spider Using xidel, xargs, and curl
#!/bin/bash
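#
# Spider a website with xidel, xargs, and curl (plus wget and httrack).
# With no arguments, URLs are read from spider.txt and checked first;
# with one argument, that site is spidered directly.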
spiderFile="spider.txt"
spiderInfo="spiderInfo.txt"
errorFile="html_errors.txt"
ok="spiderList.txt"
prefix="http://"
curl=$(which curl)
spiderBin=$(which xidel)
xargsBin=$(which xargs)
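# Bail out early if any of the required tools is missing from PATH.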
if [[ -z $curl ]]; then
    echo "You Need To Install curl To Use This Script"
    exit 1
fi
if [[ -z $spiderBin ]]; then
    echo "You Need To Install xidel To Use This Script"
    exit 1
fi
if [[ -z $xargsBin ]]; then
    echo "You Need To Install xargs To Use This Script"
    exit 1
fi
# Allow more advanced pattern matching (for case..esac below)
shopt -s extglob
function checkURLS
{
    echo "Removing Old Outdated Files..."
    rm -f "$ok"
    rm -f "$errorFile"
    echo "Checking URLS..."
    while read -r url; do
        # remove comments
        url=${url%%#*}
        # skip empty lines
        if [[ -z "$url" ]]; then
            continue
        fi
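        # Prepend the default scheme when an entry has none, so curl gets a full URL.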
if [[ "$url" != "http"* ]];then
url=$prefix$url
fi
# Handle just ftp, http and https.
# We could do full URL pattern matching, but meh.
case "$url" in
@(f|ht)tp?(s)://*)
# Get just the numeric HTTP response code
http_code=$(curl -sL -w '%{http_code}' "$url" -o /dev/null)
case "$http_code" in
200|226|2*)
# You'll get a 226 in ${http_code} from a valid FTP URL.
# If all you really care about is that the response is in the 200's,
# you could match against "2??" instead.
echo -e "$url" >> $ok
;;
*)
# You might want different handling for redirects (301/302).
echo -e "$url | $http_code" >> $errorFile
;;
esac
;;
*)
# If we're here, we didn't get a URL we could read.
echo "WARNING: invalid url: $url"
echo -e "$url | $http_code | INVALID URL " >> $errorFile
;;
esac
done < "$spiderFile"
echo "$ok Created Form Valid URLS In $spiderFlle"
echo -e "============================================\n"
cat $ok
echo -e "============================================\n"
}
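# Spider a single site with wget, appending everything it reports to the shared wget log.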
function wSpider1
{
    # wget logs to stderr, so use its native append flag rather than redirecting stdout.
    wget "$1" --spider --no-check-certificate --auth-no-challenge --no-parent --recursive --no-verbose --append-output="$wgetLog"
}
function spider
{
    if [[ -f $spiderInfo ]]; then
        echo -e "Found Old SpiderData...\n"
        echo -e "Backing Up Old SpiderData File: $spiderInfo -> ${spiderInfo}.old"
        echo -e "==================================================================\n"
        echo -e "Adding Last SpiderInfo To Old SpiderInfo File...\n"
        echo -e "==================================================\n" >> "${spiderInfo}.old"
        # Append the previous run's contents (not just the file name) to the backup.
        cat "$spiderInfo" >> "${spiderInfo}.old"
        echo -e "Done!\nRemoving SpiderInfo File So We Can Start Fresh!\n"
        rm -f "$spiderInfo"
    fi
    clear
    echo -e "=====================================\n"
    echo -e "Spidering WebSite...\n\nPlease Wait...\n"
    echo -e "======================================\n"
    #xSpider=$(xidel $1 -e '//a/@href' | grep -v "http" | sort -u | xargs -L1 -I {} xidel $1{} -e '//a/@href' | grep -v "http" | sort -u)
    #xSpider="$(xidel $1 -e '//a/@href' | grep -v 'http' | sort -u | xargs -L1 -I {} xidel $1{} -e '//a/@href' | grep -v 'http' | sort -u)"
    #echo -e "==================================================\n"
    echo -e "Spidering WebSite: $1...\n"
    hSpider=$(httrack --continue "$1" --spider --list "${spiderInfo}_httrack")
    wgetLog="${spiderInfo}_wget"
    # wget -nv reports on stderr: -o sends the first pass to the log file, 2>> appends the second.
    wgetOut1=$(wget "$1" --spider --no-check-certificate --auth-no-challenge --no-parent --recursive --no-verbose -o "$wgetLog")
    wgetOut2=$(wget --spider --no-check-certificate --auth-no-challenge --no-parent --recursive --no-verbose "$1" 2>> "$wgetLog")
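    # Grab every href from the front page, keep only relative (non-http) links,
    # then fetch each of those pages for their hrefs too: a two-level same-site crawl.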
xSpider="$(xidel $1 -e '//a/@href' | grep -v 'http' | sort -u | xargs -L1 -I {} xidel $1{} -e '//a/@href' | grep -v 'http' | sort -u)"
echo -e "=================\n" >> $spiderInfo
echo -e "===xSpiderInfo:===\n" >> $spiderInfo
echo -e $xSpider >> $spiderInfo
echo -e "\n=================================\n"
echo -e " Starting wSpider1..."
echo -e "=========wSpiderInfo:==============\n"
wSpider1 $1
if [[ -f $wgetLog ]]; then
sed -e 's/\s\+/\n/g' $wgetLog >> $spiderInfo
fi
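    # Split on runs of whitespace so every discovered URL lands on its own line.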
    sed -e 's/\s\+/\n/g' "$spiderInfo" >> "new_${spiderInfo}"
    mv -f "new_${spiderInfo}" "$spiderInfo"
    echo -e "==========END OF CRAWL========\n" >> "$spiderInfo"
    cat "$spiderInfo"
    echo -e "Removing Some Files That Are No-Longer Needed...\n"
    rm -f "${spiderInfo}_wget"
    rm -f "${spiderInfo}_httrack"
    # Return instead of exiting so the caller's loop can move on to the next URL.
    return 0
}
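# Seed a default spider file when it is missing or empty.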
if [ ! -f "$spiderFile" ] || [ ! -s "$spiderFile" ]; then
    echo "SpiderFile $spiderFile Is Missing Or Empty..."
    echo "Creating Default SpiderFile..."
    touch "$spiderFile"
    echo "#http://www.google.com" >> "$spiderFile"
    echo "#http://www.facebook.com" >> "$spiderFile"
    echo "https://plus.google.com/collection" >> "$spiderFile"
    echo "Done!"
fi
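# No arguments: check the listed URLs, then spider every valid one in turn.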
if [ $# -eq 0 ]; then
    clear
    checkURLS
    echo "Checking For Spider File: $ok"
    if [ ! -f "$ok" ] || [ ! -s "$ok" ]; then
        echo -e "$ok Not Found Or Empty\n"
        read -p "URL: " site
        if [[ $site != "http"* ]]; then
            site=$prefix$site
        fi
        spider "$site"
    else
        while read -r data; do
            # remove comments
            url=${data%%#*}
            # skip empty lines
            if [[ -z "$url" ]]; then
                continue
            fi
            if [[ $url != "http"* ]]; then
                url=$prefix$url
            fi
            spider "$url"
        done < "$ok"
        echo -e "Done!\n"
        echo -e "============\n"
        echo -e "SpiderData:\n"
        echo -e "============\n"
        cat "$spiderInfo"
    fi
fi
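# One argument: spider the given site, prepending the default scheme if needed.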
if [[ $# -eq 1 ]]; then
    if [[ $1 != "http"* ]]; then
        site=$prefix$1
        echo -e "Spidering WebSite: ${site}...\n"
        spider "${site}"
    else
        echo -e "Spidering WebSite: $1...\n"
        spider "$1"
    fi
fi
echo "Thanks For Spidering..."
exit 0