Create a gist now

Instantly share code, notes, and snippets.

What would you like to do?
Anti-anti-crawler scripts
import re
import time
def mkcookie(coo):
    """Serialize a cookie mapping into a ``Cookie`` request-header value.

    Args:
        coo: Mapping of cookie name to cookie value.

    Returns:
        The ``name=value`` pairs joined with ``;`` in the mapping's
        iteration order; an empty string for an empty mapping.
    """
    # ``items()`` exists on both Python 2 and Python 3 dicts, whereas
    # ``iteritems()`` raises AttributeError on Python 3.
    return ";".join("%s=%s" % (name, value) for name, value in coo.items())
def autodetect(resp, ct, ht, cookie, headers, url):
    """Detect a known anti-crawler challenge in a response and try to pass it.

    Args:
        resp: Response headers of the first request; only the
            ``set-cookie`` entry is inspected.
        ct: Response body of the first request, as text.
        ht: HTTP client object, forwarded untouched to the handler.
        cookie: Cookie dict, forwarded to the handler.
        headers: Base request headers, forwarded to the handler.
        url: The URL that was requested, forwarded to the handler.

    Returns:
        The selected handler's return value, or None when the response
        sets no cookie, matches no known challenge, or looks like a
        "Just a moment" page but carries no ``__cfduid`` cookie to
        continue with.
    """
    # Both detections need a session token taken from Set-Cookie.
    if 'set-cookie' not in resp:
        return None
    coostr = resp['set-cookie']

    yunsuo_session = re.findall(r'yunsuo_session_verify=(\w+)', coostr)
    if yunsuo_session:
        return check_yunsuo(ht, cookie, yunsuo_session[0], ct, headers, url)

    if 'Just a moment' in ct:
        cf_session = re.findall(r'__cfduid=([^;]+);', coostr)
        # Without the cookie there is no session to hand to the handler;
        # report "nothing solved" instead of raising IndexError.
        if not cf_session:
            return None
        return check_cf(ht, cookie, cf_session[0], ct, headers, url)

    return None
def check_yunsuo(ht, cookie, session, content, headers, url):
    """Answer the ``yunsuo_session_verify`` challenge for ``url``.

    Stores the session token and the hex-encoded source URL in ``cookie``,
    requests ``url`` with a ``security_verify_data`` query parameter, and
    records the ``security_session_mid_verify`` cookie from the response.

    Args:
        ht: HTTP client exposing ``request(url, headers=...)`` that returns
            a ``(response_headers, body)`` pair.
        cookie: Cookie dict; updated in place.
        session: Value of the ``yunsuo_session_verify`` cookie.
        content: Body of the challenge page (unused by this handler).
        headers: Base request headers; copied, never mutated.
        url: The URL that was challenged.

    Returns:
        True when the verification cookie was obtained and stored, False
        when the response did not provide one.
    """
    import binascii

    def to_hex(text):
        # hexlify() requires bytes on Python 3 and returns bytes there;
        # normalise both directions so the result is always a native str
        # that formats cleanly into cookies and URLs.
        raw = text if isinstance(text, bytes) else text.encode('utf-8')
        hexed = binascii.hexlify(raw)
        return hexed if isinstance(hexed, str) else hexed.decode('ascii')

    cookie['yunsuo_session_verify'] = session
    cookie['srcurl'] = to_hex(url)

    hd = dict(headers)
    hd['Cookie'] = mkcookie(cookie)

    time.sleep(0.05)
    # "1366,768" is presumably a fake screen resolution -- TODO confirm.
    verify_url = '%s?security_verify_data=%s' % (url, to_hex("1366,768"))
    resp, ct = ht.request(verify_url, headers=hd)

    # The reply may carry no Set-Cookie header, or one without the expected
    # cookie; report failure instead of raising KeyError / IndexError.
    set_cookie = resp['set-cookie'] if 'set-cookie' in resp else ''
    tokens = re.findall(r'security_session_mid_verify=(\w+)', set_cookie)
    if not tokens:
        return False
    cookie['security_session_mid_verify'] = tokens[0]
    return True
def check_cf(ht, cookie, session, content, headers, url):
    """Solve the "Just a moment" JavaScript challenge and fetch ``cf_clearance``.

    Parses the challenge page in ``content``, computes the ``jschl_answer``
    value, waits for the delay stated in the page, submits the answer to
    the form's action URL and stores the resulting ``cf_clearance`` cookie.

    Args:
        ht: HTTP client exposing ``request(url, headers=...)`` that returns
            a ``(response_headers, body)`` pair.
        cookie: Cookie dict; updated in place.
        session: Value of the ``__cfduid`` cookie.
        content: Body of the challenge page, as text.
        headers: Base request headers; copied, never mutated.
        url: The URL that was challenged.

    Returns:
        True when ``cf_clearance`` was obtained and stored, False when the
        answer response did not provide it.

    Raises:
        IndexError: If the page does not contain the expected form,
            variable declaration or timeout.
        KeyError: If an expression contains a term this solver does not
            know.
        ValueError: If the page uses an unsupported assignment operator.
    """
    import operator
    try:
        from urllib.parse import urljoin  # Python 3
    except ImportError:
        from urlparse import urljoin  # Python 2

    # The operator character comes from the remote page, so it is looked up
    # in a fixed table rather than being passed to eval().
    operations = {'+': operator.add, '-': operator.sub, '*': operator.mul}

    def calc_symbol(s):
        """Evaluate one obfuscated numeric expression from the page."""
        pair = re.findall(r'\+?\(\(([^\)]+)\)\+\(([^\)]+)\)\)', s)
        if pair:
            # Type 1: +((...)+(...)) -- a two-digit number whose digits are
            # concatenated as strings.
            v1, v2 = map(calc_symbol, pair[0])
            return int(str(v1) + str(v2))
        # Type 2: plain sum of terms.
        vmap = {'!': 1, '[]': 0, '!![]': 1, '': 0}
        return sum(vmap[term] for term in s.split('+'))

    def get_cv(ct, host_name):
        """Return (hidden query string, answer, form action, wait in ms)."""
        # Every hidden form field has to be echoed back with the answer.
        hidden = re.findall(
            r'<input type="hidden" name="([^"]+)" value="([^"]+)"', ct)
        hidden = '&'.join('='.join(field) for field in hidden)
        action = re.findall(
            r'<form id="[^"]+" action="([^"]+)" method="get">', ct)[0]
        # Locate the object and key holding the running value, e.g.
        # var t,r,a,f, kMuTlpA={"t":+((!+[]+!![]+!![]+[])+(!+[]+!![]+!![]+!![]+!![]+!![]))};
        n, m, v = re.findall(
            r'var (?:[^,]+,){4} ([^=]+)={"([^"]+)":([^}]+)};', ct, re.DOTALL)[0]
        v = calc_symbol(v)
        # Replay each compound assignment (name.key OP= expr;) in page order.
        step_re = r'%s\.%s(.)=([^;]+);' % (re.escape(n), re.escape(m))
        for op, arg in re.findall(step_re, ct):
            if op not in operations:
                raise ValueError('unsupported challenge operator: %r' % op)
            v = operations[op](v, calc_symbol(arg))
        v += len(host_name)
        wait = re.findall(r'}, (\d+)\);', ct, re.DOTALL)[0]
        return hidden, v, action, wait

    hd = dict(headers)
    cookie['__cfduid'] = session
    hd['Cookie'] = mkcookie(cookie)

    _host = re.findall(r"https*://([^/]+)", url)[0]
    c, v, u, w = get_cv(content, _host)

    # Float division keeps the sub-second part of the delay on Python 2;
    # the extra two seconds are a safety margin.
    time.sleep(int(w) / 1000.0 + 2)

    # Resolve the form action against the page URL the way a browser would,
    # so an absolute-path action lands on the site root rather than being
    # appended to the page path.
    answer_url = '%s?%s&jschl_answer=%s' % (urljoin(url, u), c, v)
    resp, ct = ht.request(answer_url, headers=hd)

    # A rejected answer yields no clearance cookie; report failure instead
    # of raising KeyError / IndexError.
    set_cookie = resp['set-cookie'] if 'set-cookie' in resp else ''
    clearance = re.findall(r'cf_clearance=([^;]+);', set_cookie)
    if not clearance:
        return False
    cookie['cf_clearance'] = clearance[0]
    return True
import httplib2
import aacs
# Example client: fetch a URL and, when autodetect() reports that a challenge
# was handled, fetch it once more with the cookies it collected.
cookie = {}
url = "http://someurl.com/rss"
hd = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.0 (KHTML, like Gecko) Chrome/24.6.5128.7 Safari/536.0',
    'Accept-Language': 'zh-CN,zh;q=0.8',
    'Accept-Charset': 'utf-8;q=0.7,*;q=0.7',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Connection': 'keep-alive',
    'Accept-Encoding': 'gzip,deflate',
}
ht = httplib2.Http()

resp, ct = ht.request(url, headers=hd)
if aacs.autodetect(resp, ct, ht, cookie, hd, url):
    # Retry the original request, now sending the collected cookies.
    hd["Cookie"] = aacs.mkcookie(cookie)
    resp, ct = ht.request(url, headers=hd)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment