Spider Using xidel, xargs, and curl
#!/bin/bash
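#
# Spider a website with xidel, xargs, and curl (plus wget and httrack).
# With no arguments, URLs are read from spider.txt and checked first;
# with one argument, that site is spidered directly.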
spiderFile="spider.txt"
spiderInfo="spiderInfo.txt"
errorFile="html_errors.txt"
ok="spiderList.txt"
prefix="http://"
curl=$(which curl)
spiderBin=$(which xidel)
xargsBin=$(which xargs)
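# Bail out early if any of the required tools is missing from PATH.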
if [[ -z $curl ]]; then
    echo "You Need To Install curl To Use This Script"
    exit 1
fi
if [[ -z $spiderBin ]]; then
    echo "You Need To Install xidel To Use This Script"
    exit 1
fi
if [[ -z $xargsBin ]]; then
    echo "You Need To Install xargs To Use This Script"
    exit 1
fi
# Allow more advanced pattern matching (for case..esac below)
shopt -s extglob
function checkURLS
{
    echo "Removing Old Outdated Files..."
    rm -f "$ok"
    rm -f "$errorFile"
    echo "Checking URLS..."
    while read -r url; do
        # remove comments
        url=${url%%#*}
        # skip empty lines
        if [[ -z "$url" ]]; then
            continue
        fi
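        # Prepend the default scheme when an entry has none, so curl gets a full URL.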
if [[ "$url" != "http"* ]];then
url=$prefix$url
fi
# Handle just ftp, http and https.
# We could do full URL pattern matching, but meh.
case "$url" in
@(f|ht)tp?(s)://*)
# Get just the numeric HTTP response code
http_code=$(curl -sL -w '%{http_code}' "$url" -o /dev/null)
case "$http_code" in
200|226|2*)
# You'll get a 226 in ${http_code} from a valid FTP URL.
# If all you really care about is that the response is in the 200's,
# you could match against "2??" instead.
echo -e "$url" >> $ok
;;
*)
# You might want different handling for redirects (301/302).
echo -e "$url | $http_code" >> $errorFile
;;
esac
;;
*)
# If we're here, we didn't get a URL we could read.
echo "WARNING: invalid url: $url"
echo -e "$url | $http_code | INVALID URL " >> $errorFile
;;
esac
done < "$spiderFile"
echo "$ok Created Form Valid URLS In $spiderFlle"
echo -e "============================================\n"
cat $ok
echo -e "============================================\n"
}
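# Spider a single site with wget, appending everything it reports to the shared wget log.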
function wSpider1
{
    # wget logs to stderr, so use its native append flag rather than redirecting stdout.
    wget "$1" --spider --no-check-certificate --auth-no-challenge --no-parent --recursive --no-verbose --append-output="$wgetLog"
}
function spider
{
    if [[ -f $spiderInfo ]]; then
        echo -e "Found Old SpiderData...\n"
        echo -e "Backing Up Old SpiderData File: $spiderInfo -> ${spiderInfo}.old"
        echo -e "==================================================================\n"
        echo -e "Adding Last SpiderInfo To Old SpiderInfo File...\n"
        echo -e "==================================================\n" >> "${spiderInfo}.old"
        # Append the previous run's contents (not just the file name) to the backup.
        cat "$spiderInfo" >> "${spiderInfo}.old"
        echo -e "Done!\nRemoving SpiderInfo File So We Can Start Fresh!\n"
        rm -f "$spiderInfo"
    fi
    clear
    echo -e "=====================================\n"
    echo -e "Spidering WebSite...\n\nPlease Wait...\n"
    echo -e "======================================\n"
    #xSpider=$(xidel $1 -e '//a/@href' | grep -v "http" | sort -u | xargs -L1 -I {} xidel $1{} -e '//a/@href' | grep -v "http" | sort -u)
    #xSpider="$(xidel $1 -e '//a/@href' | grep -v 'http' | sort -u | xargs -L1 -I {} xidel $1{} -e '//a/@href' | grep -v 'http' | sort -u)"
    #echo -e "==================================================\n"
    echo -e "Spidering WebSite: $1...\n"
    hSpider=$(httrack --continue "$1" --spider --list "${spiderInfo}_httrack")
    wgetLog="${spiderInfo}_wget"
    # wget -nv reports on stderr: -o sends the first pass to the log file, 2>> appends the second.
    wgetOut1=$(wget "$1" --spider --no-check-certificate --auth-no-challenge --no-parent --recursive --no-verbose -o "$wgetLog")
    wgetOut2=$(wget --spider --no-check-certificate --auth-no-challenge --no-parent --recursive --no-verbose "$1" 2>> "$wgetLog")
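    # Grab every href from the front page, keep only relative (non-http) links,
    # then fetch each of those pages for their hrefs too: a two-level same-site crawl.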
xSpider="$(xidel $1 -e '//a/@href' | grep -v 'http' | sort -u | xargs -L1 -I {} xidel $1{} -e '//a/@href' | grep -v 'http' | sort -u)"
echo -e "=================\n" >> $spiderInfo
echo -e "===xSpiderInfo:===\n" >> $spiderInfo
echo -e $xSpider >> $spiderInfo
echo -e "\n=================================\n"
echo -e " Starting wSpider1..."
echo -e "=========wSpiderInfo:==============\n"
wSpider1 $1
if [[ -f $wgetLog ]]; then
sed -e 's/\s\+/\n/g' $wgetLog >> $spiderInfo
fi
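    # Split on runs of whitespace so every discovered URL lands on its own line.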
    sed -e 's/\s\+/\n/g' "$spiderInfo" >> "new_${spiderInfo}"
    mv -f "new_${spiderInfo}" "$spiderInfo"
    echo -e "==========END OF CRAWL========\n" >> "$spiderInfo"
    cat "$spiderInfo"
    echo -e "Removing Some Files That Are No-Longer Needed...\n"
    rm -f "${spiderInfo}_wget"
    rm -f "${spiderInfo}_httrack"
    # Return instead of exiting so the caller's loop can move on to the next URL.
    return 0
}
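# Seed a default spider file when it is missing or empty.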
if [ ! -f "$spiderFile" ] || [ ! -s "$spiderFile" ]; then
    echo "SpiderFile $spiderFile Is Missing Or Empty..."
    echo "Creating Default SpiderFile..."
    touch "$spiderFile"
    echo "#http://www.google.com" >> "$spiderFile"
    echo "#http://www.facebook.com" >> "$spiderFile"
    echo "https://plus.google.com/collection" >> "$spiderFile"
    echo "Done!"
fi
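# No arguments: check the listed URLs, then spider every valid one in turn.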
if [ $# -eq 0 ]; then
    clear
    checkURLS
    echo "Checking For Spider File: $ok"
    if [ ! -f "$ok" ] || [ ! -s "$ok" ]; then
        echo -e "$ok Not Found Or Empty\n"
        read -p "URL: " site
        if [[ $site != "http"* ]]; then
            site=$prefix$site
        fi
        spider "$site"
    else
        while read -r data; do
            # remove comments
            url=${data%%#*}
            # skip empty lines
            if [[ -z "$url" ]]; then
                continue
            fi
            if [[ $url != "http"* ]]; then
                url=$prefix$url
            fi
            spider "$url"
        done < "$ok"
        echo -e "Done!\n"
        echo -e "============\n"
        echo -e "SpiderData:\n"
        echo -e "============\n"
        cat "$spiderInfo"
    fi
fi
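# One argument: spider the given site, prepending the default scheme if needed.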
if [[ $# -eq 1 ]]; then
    if [[ $1 != "http"* ]]; then
        site=$prefix$1
        echo -e "Spidering WebSite: ${site}...\n"
        spider "${site}"
    else
        echo -e "Spidering WebSite: $1...\n"
        spider "$1"
    fi
fi
echo "Thanks For Spidering..."
exit 0