#!/usr/bin/env bash
#Author   :LiteSpeedtech & Matan Schatzman
#Date     :20191023
#Version  :1.7.3
#Requires :A prepared sitemap XML and the LSCache crawler allowed on the server
#=======================================================
AGENTDESKTOP='User-Agent: lscache_runner'
AGENTMOBILE='User-Agent: lscache_runner iPhone'
SVALUE="0.1"
WITH_MOBILE='OFF'
WITH_COOKIE='OFF'
COOKIE=''
XML_LIST=()
CURL_OPTS=''
PROTECTOR='ON'
VERBOSE='OFF'
DEBUGURL='OFF'
BLACKLIST='OFF'
CRAWLQS='OFF'
REPORT='OFF'
CRAWLLOG='/tmp/crawler.log'
BLACKLSPATH='/tmp/blk_crawler.txt'
CT_URLS=0
CT_NOCACHE=0
CT_CACHEHIT=0
CT_CACHEMISS=0
CT_BLACKLIST=0
CT_FAILCACHE=0
DETECT_NUM=0
DETECT_LIMIT=10
ERR_LIST="'400'|'401'|'403'|'404'|'407'|'500'|'502'"
function help_message() {
case ${1} in
"1")
cat <<EOF
The server crawler engine is not enabled. Please check
https://www.litespeedtech.com/support/wiki/doku.php/litespeed_wiki:cache:lscwp:configuration:enabling_the_crawler
Stopping crawl...
EOF
;;
"2")
cat <<EOF
Important:
## A valid sitemap XML file and an allowed LSCache crawler are required
0. bash cachecrawler.sh -h ## help
Example:
1. bash cachecrawler.sh SITE-MAP-URL ## When desktop and mobile share same theme
2. bash cachecrawler.sh -m SITE-MAP-URL ## When desktop & mobile have different theme
3. bash cachecrawler.sh -c SITE-MAP-URL ## For sites that require cookies
4. bash cachecrawler.sh -b -c SITE-MAP-URL ## For sites that require cookies, with blacklist check
Debug example:
1. bash cachecrawler.sh -v SITE-MAP-URL ## To output details to crawler log
2. bash cachecrawler.sh -d SITE-URL ## Debug one URL directly
3. bash cachecrawler.sh -b -c -d SITE-URL ## Debug one URL with cookies and blacklist check
## Example of a SITE-MAP-URL: http://prestashop-123/456_sitemap.xml
## For more information, please check the LiteSpeed wiki here:
https://www.litespeedtech.com/support/wiki/doku.php/litespeed_wiki:cache:lscps:crawler
Optional Arguments:
-h, --help Show this message and exit
-m, --with-mobile Crawl mobile view in addition to default view
-c, --with-cookie Crawl with site's cookies
-b, --black-list Add a page to the blacklist if it returns an HTML error status and is not cached. The next run will bypass it
-g, --general-ua Use general user-agent instead of lscache_runner for desktop view
-i, --interval Change request interval. "-i 0.2" changes from default 0.1s to 0.2s
-v, --verbose Show complete response header under /tmp/crawler.log
-d, --debug-url Test one URL directly. "bash cachecrawler.sh -v -d http://example.com/test.html"
-qs,--crawl-qs Crawl the sitemap, including URLs with query strings
-r, --report Display total count of crawl result
EOF
;;
esac
exit 1
}
function echoY() {
echo -e "\033[38;5;148m${1}\033[39m"
}
function echoG() {
echo -e "\033[38;5;71m${1}\033[39m"
}
function echoR() {
echo -e "\033[38;5;203m${1}\033[39m"
}
function echoB() {
echo -e "\033[1;3;94m${1}\033[0m"
}
function echoCYAN() {
FLAG=$1
shift
echo -e "\033[1;36m$FLAG\033[0m$@"
}
function checkcurlver(){
curl --help | grep 'Use HTTP 2' > /dev/null
if [ ${?} = 0 ]; then
CURL_OPTS='--http1.1'
fi
}
### Force curl to use HTTP/1.1 when HTTP/2 support is detected
checkcurlver
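### Strip Cloudflare's __cfduid Set-Cookie header from the response so it does not skew the cache-status checks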
function excludecookie(){
### Check if cloudflare
if [[ $(echo "${1}" | grep -i 'Server: cloudflare') ]]; then
CURLRESULT=$(echo "${1}" | grep -Ev 'Set-Cookie.*__cfduid')
fi
}
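### Return 0 (success) if the first argument already appears as a whole word in the given file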
function duplicateck(){
grep -w "${1}" ${2} >/dev/null 2>&1
}
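### Update the crawl counters; a numeric argument adds to the total URL count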
function cachecount(){
if [ ${1} = 'miss' ]; then
CT_CACHEMISS=$((CT_CACHEMISS+1))
elif [ ${1} = 'hit' ]; then
CT_CACHEHIT=$((CT_CACHEHIT+1))
elif [ ${1} = 'no' ]; then
CT_NOCACHE=$((CT_NOCACHE+1))
elif [ ${1} = 'black' ]; then
CT_BLACKLIST=$((CT_BLACKLIST+1))
elif [ ${1} = 'fail' ]; then
CT_FAILCACHE=$((CT_FAILCACHE+1))
elif [[ ${1} =~ ^[0-9]+$ ]]; then
CT_URLS=$((CT_URLS+${1}))
else
echoR "${1} no define to cachecount!"
fi
}
prttwostr(){
printf "\033[38;5;71m%s\033[39m \t%s\t \033[1;30m%s\033[0m \n" "${1}" "${2}" "${3}"
}
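### Print a summary of the crawl counters (enabled with -r/--report)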
function cachereport(){
echoY '=====================Crawl result:======================='
prttwostr "Total URLs :" "${CT_URLS}" ''
prttwostr "Added :" "${CT_CACHEMISS}" ''
prttwostr "Existing :" "${CT_CACHEHIT}" ''
if [ "${CT_NOCACHE}" -gt 0 ]; then
TMPMESG="(Page with 'no cache', please check cache debug log for the reason)"
else
TMPMESG=''
fi
prttwostr "Skipped :" "${CT_NOCACHE}" "${TMPMESG}"
if [ "${BLACKLIST}" != 'OFF' ]; then
if [ "${CT_BLACKLIST}" -gt 0 ]; then
TMPMESG="(Pages with status code ${ERR_LIST} may add into blacklist)"
else
TMPMESG=''
fi
prttwostr "Blacklisted:" "${CT_BLACKLIST}" "${TMPMESG}"
fi
if [ "${CT_FAILCACHE}" -gt 0 ]; then
TMPMESG="(Pages with status code ${ERR_LIST} may add into blacklist)"
else
TMPMESG=''
fi
prttwostr "Failed :" "${CT_FAILCACHE}" "${TMPMESG}"
}
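### When PROTECTOR is ON, count consecutive error/no-cache results and abort once DETECT_LIMIT is reached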
function protect_count(){
if [ "${PROTECTOR}" = 'ON' ]; then
if [ ${1} -eq 1 ]; then
DETECT_NUM=$((DETECT_NUM+1))
if [ ${DETECT_NUM} -ge ${DETECT_LIMIT} ]; then
echoR "Hit ${DETECT_LIMIT} times 'page error' or 'no cache' in a row, abort !!"
echoR "To run script with no abort, please set PROTECTOR from 'ON' to 'OFF'."
exit 1
fi
elif [ ${1} -eq 0 ]; then
DETECT_NUM=0
fi
fi
}
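### Append a URL to the blacklist file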
function addtoblacklist(){
echoB "Add ${1} to BlackList"
echo "${1}" >> ${BLACKLSPATH}
}
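### Print the full curl response to the terminal when debugging a single URL (-d/--debug-url)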
function debugurl_display(){
echo ''
echoY "-------Debug curl start-------"
echoY "URL: ${2}"
echoY "AGENTDESKTOP: ${1}"
echoY "COOKIE: ${3}"
echo "${4}"
echoY "-------Debug curl end-------"
echoY "Header Match: ${5}"
}
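### Append the full curl response to the crawler log (enabled with -v/--verbose)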
function crawl_verbose(){
echo "URL: ${2}" >> ${CRAWLLOG}
echo "AGENTDESKTOP: ${1}" >> ${CRAWLLOG}
echo "COOKIE: ${3}" >> ${CRAWLLOG}
echo "${4}" >> ${CRAWLLOG}
echo "Header Match: ${5}" >> ${CRAWLLOG}
echo "----------------------------------------------------------" >> ${CRAWLLOG}
}
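### Fetch one URL with the given User-Agent and cookie, then classify the cache result from the response headers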
function crawlreq() {
if [ "${DEBUGURL}" != "OFF" ] && [ "${BLACKLIST}" = 'ON' ]; then
duplicateck ${2} ${BLACKLSPATH}
if [ ${?} = 0 ]; then
echoY "${2} is in blacklist"
exit 0
fi
fi
echo "${2} -> " | tr -d '\n'
CURLRESULT=$(curl ${CURL_OPTS} -siLk -b name="${3}" -X GET -H "${1}" ${2} | tac | tac | sed '/Server: /q')
excludecookie "${CURLRESULT}"
STATUS_CODE=$(echo "${CURLRESULT}" | grep HTTP | awk '{print $2}')
CHECKMATCH=$(echo ${STATUS_CODE} | grep -Eio "$(echo ${ERR_LIST} | tr -d "'")")
if [ "${CHECKMATCH}" == '' ]; then
CHECKMATCH=$(grep -Eio '(x-lsadc-cache: hit,litemage|x-lsadc-cache: hit|x-lsadc-cache: miss|x-qc-cache: hit|x-qc-cache: miss)'\
<<< ${CURLRESULT} | tr -d '\n')
fi
if [ "${CHECKMATCH}" == '' ]; then
CHECKMATCH=$(grep -Eio '(X-LiteSpeed-Cache: miss|X-Litespeed-Cache: hit|X-Litespeed-Cache-Control: no-cache)'\
<<< ${CURLRESULT} | tr -d '\n')
fi
if [ "${CHECKMATCH}" == '' ]; then
CHECKMATCH=$(grep -Eio '(lsc_private|HTTP/1.1 201 Created)'\
<<< ${CURLRESULT} | tr -d '\n')
fi
if [ ${VERBOSE} = 'ON' ]; then
crawl_verbose "${1}" "${2}" "${3}" "${CURLRESULT}" "${CHECKMATCH}"
fi
if [[ ${DEBUGURL} != "OFF" ]]; then
debugurl_display "${1}" "${2}" "${3}" "${CURLRESULT}" "${CHECKMATCH}"
fi
case ${CHECKMATCH} in
'CreatedSet-CookieSet-CookieSet-Cookie'|[Xx]-[Ll]ite[Ss]peed-[Cc]ache:\ miss|'X-LSADC-Cache: miss'|[Xx]-[Qq][Cc]-[Cc]ache:\ miss)
echoY 'Caching'
cachecount 'miss'
protect_count 0
;;
'HTTP/1.1 201 Created')
if [ $(echo ${CURLRESULT} | grep -i 'Cookie' | wc -l ) != 0 ]; then
if [[ ${DEBUGURL} != "OFF" ]]; then
echoY "Set-Cookie found"
fi
echoY 'Caching'
cachecount 'miss'
else
echoY 'Already cached'
cachecount 'hit'
fi
protect_count 0
;;
[Xx]-[Ll]ite[Ss]peed-Cache:\ hit|'x-lsadc-cache: hit'|'x-lsadc-cache: hit,litemage'|'x-qc-cache: hit')
echoY 'Already cached'
cachecount 'hit'
protect_count 0
;;
'HTTP/1.1 201 Createdlsc_private')
echoY 'Caching'
cachecount 'miss'
protect_count 0
;;
'400'|'401'|'403'|'404'|'407'|'500'|'502')
echoY "STATUS: ${CHECKMATCH}, can not cache"
cachecount 'fail'
protect_count 1
if [ "${BLACKLIST}" = 'ON' ]; then
addtoblacklist ${2}
fi
;;
[Xx]-[Ll]ite[sS]peed-Cache-Control:\ no-cache)
echoY 'No Cache page'
cachecount 'no'
protect_count 1
### To add 'no cache' pages to the blacklist, uncomment the following lines
#if [ "${BLACKLIST}" = 'ON' ]; then
# addtoblacklist ${2}
#fi
;;
*)
echoY 'No Need To Cache'
cachecount 'no'
;;
esac
}
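### Crawl every URL in URLLIST, skipping blacklisted entries and sleeping SVALUE seconds between requests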
function runLoop() {
for URL in ${URLLIST}; do
local ONLIST='NO'
if [ "${BLACKLIST}" = 'ON' ]; then
duplicateck ${URL} ${BLACKLSPATH}
if [ ${?} -eq 0 ]; then
ONLIST='YES'
cachecount 'black'
fi
fi
if [ "${ONLIST}" = 'NO' ]; then
crawlreq "${1}" "${URL}" "${2}"
sleep ${SVALUE}
fi
done
}
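### Verify that the sitemap URL is reachable and returns HTTP 200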
function validmap(){
CURL_CMD="curl -I -w httpcode=%{http_code}"
CURL_MAX_CONNECTION_TIMEOUT="-m 100"
CURL_RETURN_CODE=0
CURL_OUTPUT=$(${CURL_CMD} ${CURL_MAX_CONNECTION_TIMEOUT} ${SITEMAP} 2> /dev/null) || CURL_RETURN_CODE=$?
if [ ${CURL_RETURN_CODE} -ne 0 ]; then
echoR "Curl connection failed with return code - ${CURL_RETURN_CODE}, exit"
exit 1
else
HTTPCODE=$(echo "${CURL_OUTPUT}" | grep 'HTTP'| awk '{print $2}')
if [ "${HTTPCODE}" != '200' ]; then
echoR "Curl operation/command failed due to server return code - ${HTTPCODE}, exit"
exit 1
fi
echoG "SiteMap connection success \n"
fi
}
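### Abort with a hint if the server replies 'Precondition Required', meaning the server-level crawler is not enabled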
function checkcrawler() {
TRYURL=$(echo ${URLLIST} | cut -d " " -f1)
CRAWLRESULT=$(curl ${CURL_OPTS} -sI -X GET -H "${AGENTDESKTOP}" $TRYURL| grep -o "Precondition Required")
if [ "${CRAWLRESULT}" = 'Precondition Required' ]; then
help_message 1
fi
}
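### Generate a 16-character random string, used as a query string to bypass the cache when fetching cookies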
function genrandom(){
RANDOMSTR=$(head /dev/urandom | tr -dc A-Za-z0-9 | head -c 16 ; echo '')
}
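### Collect the site's Set-Cookie values from a sitemap URL; retry with a random query string if none are returned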
function getcookie() {
for URL in ${URLLIST}; do
local ONLIST='NO'
if [ "${BLACKLIST}" = 'ON' ]; then
duplicateck ${URL} ${BLACKLSPATH}
if [ ${?} -eq 1 ]; then
break
fi
fi
done
COOKIESTRING=$(curl ${CURL_OPTS} -sILk -X GET ${URL} | grep 'Set-Cookie' | awk '{print $2}' | tr '\n' ' ')
if [ "${COOKIESTRING}" = '' ]; then
genrandom
COOKIESTRING=$(curl ${CURL_OPTS} -sILk -X GET ${URL}?${RANDOMSTR} | grep 'Set-Cookie' | awk '{print $2}' | tr '\n' ' ')
fi
COOKIE="${COOKIESTRING}"
}
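### Crawl a single URL (from -d/--debug-url), using the mobile agent when -m is set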
function debugurl() {
if [ "${WITH_COOKIE}" = 'ON' ]; then
getcookie
fi
if [ "${WITH_MOBILE}" = 'ON' ]; then
crawlreq "${AGENTMOBILE}" "${1}" "${COOKIE}"
else
crawlreq "${AGENTDESKTOP}" "${1}" "${COOKIE}"
fi
}
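### Validate the sitemap; if it is a sitemap index, queue each child XML file, otherwise queue the sitemap itself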
function storexml() {
validmap
if [ $(echo ${1} | grep '\.xml$'|wc -l) != 0 ]; then
XML_URL=$(curl ${CURL_OPTS} -sk ${1}| grep '<loc>' | grep '\.xml' | sed -e 's/.*<loc>\(.*\)<\/loc>.*/\1/')
XML_NUM=$(echo ${XML_URL} | grep '\.xml' | wc -l)
if [ ${XML_NUM} -gt 0 ]; then
for URL in $XML_URL; do
XML_LIST=(${URL} "${XML_LIST[@]}")
done
else
XML_LIST=(${1} "${XML_LIST[@]}")
fi
else
echo "SITEMAP: $SITEMAP is not a valid xml"
help_message 2
fi
}
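### Crawl all URLs in the current sitemap with the desktop agent, then again with the mobile agent when -m is set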
function maincrawl() {
checkcrawler
echoY "There are ${URLCOUNT} urls in this sitemap"
if [ ${URLCOUNT} -gt 0 ]; then
START_TIME="$(date -u +%s)"
echoY 'Starting to view with desktop agent...'
if [ "${WITH_COOKIE}" = 'ON' ]; then
getcookie
fi
cachecount ${URLCOUNT}
runLoop "${AGENTDESKTOP}" "${COOKIE}"
if [ "${WITH_MOBILE}" = 'ON' ]; then
echoY 'Starting to view with mobile agent...'
cachecount ${URLCOUNT}
runLoop "${AGENTMOBILE}" "${COOKIE}"
fi
END_TIME="$(date -u +%s)"
ELAPSED="$((${END_TIME}-${START_TIME}))"
echoY "***Total of ${ELAPSED} seconds to finish process***"
fi
}
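### Entry point: debug a single URL, or extract the URL list from each queued sitemap and crawl it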
function main(){
if [ "${DEBUGURL}" != 'OFF' ]; then
debugurl ${DEBUGURL}
else
for XMLURL in "${XML_LIST[@]}"; do
echoCYAN "Prepare to crawl ${XMLURL} XML file"
if [ "${CRAWLQS}" = 'ON' ]; then
URLLIST=$(curl ${CURL_OPTS} --silent ${XMLURL} | sed -e 's/\/url/\n/g'| grep '<loc>' | \
sed -e 's/.*<loc>\(.*\)<\/loc>.*/\1/' | sed 's/<!\[CDATA\[//;s/]]>//' | \
grep -iPo '^((?!png|jpg|webp).)*$' | sort -u)
else
URLLIST=$(curl ${CURL_OPTS} --silent ${XMLURL} | sed -e 's/\/url/\n/g'| grep '<loc>' | \
sed -e 's/.*<loc>\(.*\)<\/loc>.*/\1/' | sed 's/<!\[CDATA\[//;s/]]>//;s/.*?.*//' | \
grep -iPo '^((?!png|jpg|webp).)*$' | sort -u)
fi
URLCOUNT=$(echo "${URLLIST}" | grep -c '[^[:space:]]')
maincrawl
done
if [ "${REPORT}" != 'OFF' ]; then
cachereport
fi
fi
}
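### Parse command-line arguments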
while [ ! -z "${1}" ]; do
case ${1} in
-h | --help)
help_message 2
;;
-m | --with-mobile | --mobile)
WITH_MOBILE='ON'
;;
-c | --with-cookie | --cookie)
WITH_COOKIE='ON'
;;
-i | --interval) shift
SVALUE=${1}
;;
-g| --general-ua)
AGENTDESKTOP='User-Agent: general_purpose'
;;
-v | --verbose)
VERBOSE='ON'
;;
-b | --black-list)
BLACKLIST='ON'
if [ ! -e ${BLACKLSPATH} ]; then
touch ${BLACKLSPATH}
fi
;;
-d | --debug-url) shift
if [ "${1}" = '' ]; then
help_message 2
else
DEBUGURL="${1}"
URLLIST="${1}"
if [ ! -e ${CRAWLLOG} ]; then
touch ${CRAWLLOG}
fi
fi
;;
-qs | --crawl-qs)
CRAWLQS='ON'
;;
-r | --report)
REPORT='ON'
;;
*)
SITEMAP=${1}
storexml ${SITEMAP}
;;
esac
shift
done
main