Create a gist now

Instantly share code, notes, and snippets.

What would you like to do?
12306 新版验证码识别脚本 (已失效)
#!/usr/bin/env python
# coding=utf8
# author=evi1m0
# website=linux.im
'''
12306 Captcha Picture:
author: Evi1m0@20150316
1. Download Captcha
2. Convert picture to text
3. Return result
'''
import re
import time
import json
import urllib
import urllib2
import requests
from PIL import Image
def downloadImg():
    """Download one 12306 login captcha image into ``./12306_pic/``.

    The image is written to ``./12306_pic/<timestamp>.jpg`` where
    ``<timestamp>`` is the current Unix time truncated to an integer.
    The target directory is created when it does not exist yet.

    Returns:
        int: the timestamp used as the base name of the saved file.

    Raises:
        requests.RequestException: if the retry request fails as well.
    """
    # Local import: ``os`` is not among the file's top-level imports.
    import os

    pic_file = int(time.time())
    pic_url = "https://kyfw.12306.cn/otn/passcodeNew/getPassCodeNew?module=login&rand=sjrand"
    print('[+] Download Picture: {}'.format(pic_url))
    # NOTE: verify=False disables TLS certificate verification; it is kept
    # because the request is otherwise reported to fail on the site's
    # certificate.
    try:
        resp = requests.get(pic_url, verify=False, timeout=5)
    except requests.RequestException:
        # Retry exactly once, with a shorter timeout; a second failure
        # propagates to the caller.
        resp = requests.get(pic_url, verify=False, timeout=3)
    pic_dir = "./12306_pic"
    if not os.path.isdir(pic_dir):
        # open() does not create missing parent directories.
        os.makedirs(pic_dir)
    with open("./12306_pic/%s.jpg" % pic_file, 'wb') as fp:
        fp.write(resp.content)
    return pic_file
def imgCut():
    """Fetch a captcha and save the cropped strip that holds its text prompt.

    Calls downloadImg() to obtain a fresh image, crops the rectangle
    (120, 0, 290, 25) out of it and saves that crop next to the original
    with a ``_text`` suffix.

    Returns:
        tuple: ``(path of the full captcha image, path of the cropped image)``.
    """
    stamp = downloadImg()
    source_path = "./12306_pic/%s.jpg" % stamp
    text_path = './12306_pic/%s_text.jpg' % stamp
    # Crop box is (left, upper, right, lower) in pixels.
    text_strip = Image.open(source_path).crop((120, 0, 290, 25))
    text_strip.save(text_path)
    print('[*] Picture Text Picture: {}'.format(text_path))
    return source_path, text_path
def ocrApi(filename):
    """Convert the captcha-text picture to text via the docs88 upload endpoint.

    The image is POSTed to the upload URL; after a two second pause the
    result URL is built from the upload response body and fetched.

    Args:
        filename: path of the image file to upload.

    Returns:
        The raw body of the result request (``text_result.content``). It is
        returned regardless of the HTTP status code; a non-200 status only
        changes what is printed.
    """
    upload_pic_url = "http://cn.docs88.com/pdftowordupload2.php"
    headers_fake = {
        'Accept': '*/*',
        'Accept-Encoding': 'gzip, deflate',
        'Accept-Language': 'zh-CN,zh;q=0.8,en;q=0.6',
        'Connection': 'keep-alive',
        'Host': 'cn.docs88.com',
        'Origin': 'http://cn.docs88.com',
        'User-Agent': 'Mozilla/5.0 (KHTML, like Gecko) Chrome/41.0.2272.89',
        'X-Requested-With': 'ShockwaveFlash/17.0.0.134',
    }
    filename_tmp = filename.split('/')[-1]
    para = {
        'Filename': filename_tmp,
        'sourcename': filename_tmp,
        'sourcelanguage': 'cn',
        'desttype': 'txt',
        'Upload': 'Submit Query',
    }
    # Keep the file open only for the duration of the upload.
    with open(filename, 'rb') as pic_fp:
        upload_pic = requests.post(upload_pic_url, data=para,
                                   files={"Filedata": pic_fp},
                                   headers=headers_fake)
    # Pause before fetching the result - presumably to give the service
    # time to finish the conversion.
    time.sleep(2)
    # The first three bytes of the upload response are skipped - presumably
    # a UTF-8 byte order mark; TODO confirm against the service.
    text_result_url = 'http://cn.docs88.com/' + upload_pic.content[3:]
    text_result = requests.get(text_result_url)
    if text_result.status_code == 200:
        print('[*] Text: {}'.format(text_result.content))
    else:
        print('[-] False')
    return text_result.content
'''
baidu stu
author: andelf
'''
def baidu_stu_html_extract(html):
    """Extract the guessed keywords from a Baidu image-search result page.

    The page is searched for the first ``keywords:'...'`` fragment. Its
    payload is unescaped (``\\x22`` becomes a double quote, a doubled
    backslash becomes a single one) and parsed as a JSON list of objects
    that each carry a ``keyword`` entry.

    Args:
        html: text of the result page.

    Returns:
        The keywords joined with ``|``, or ``'[UNKOWN]'`` when the page has
        no keywords fragment, the list is empty, or the payload is not the
        expected JSON structure.
    """
    match = re.search(r"keywords:'(.*?)'", html)
    if match is None:
        return '[UNKOWN]'
    json_str = match.group(1).replace('\\x22', '"').replace('\\\\', '\\')
    try:
        result = [item['keyword'] for item in json.loads(json_str)]
    except (ValueError, KeyError, TypeError):
        # Malformed JSON, or entries without a 'keyword' field: report
        # "unknown" for this image instead of aborting the whole run.
        return '[UNKOWN]'
    return '|'.join(result) if result else '[UNKOWN]'
def baidu_stu_lookup(im):
    """Upload an image to Baidu image search and return the guessed keywords.

    The image is saved as ``./query_temp_img.png`` and its bytes are POSTed
    to the upload endpoint, which answers with a plain URL. That URL is then
    passed to the search page, whose HTML is handed to
    baidu_stu_html_extract().

    Relies on the module-level name ``UA`` (the User-Agent string), which
    this file only defines inside its ``__main__`` section.

    Args:
        im: image object providing ``save(path)``.

    Returns:
        Whatever baidu_stu_html_extract() returns for the result page.
    """
    url = ("http://stu.baidu.com/n/image?fr=html5&needRawImageUrl=true&id="
           "WU_FILE_0&name=233.png&type=image%2Fpng&lastModifiedDate=Mon+Mar"
           "+16+2015+20%3A49%3A11+GMT%2B0800+(CST)&size=")
    im.save("./query_temp_img.png")
    with open("./query_temp_img.png", 'rb') as fp:
        raw = fp.read()
    # The upload URL ends with the payload size in bytes.
    url = url + str(len(raw))
    req = urllib2.Request(url, raw, {'Content-Type':'image/png', 'User-Agent':UA})
    resp = urllib2.urlopen(req)
    try:
        resp_url = resp.read() # return a pure url
    finally:
        resp.close()
    url = "http://stu.baidu.com/n/searchpc?queryImageUrl=" + urllib.quote(resp_url)
    req = urllib2.Request(url, headers={'User-Agent':UA})
    resp = urllib2.urlopen(req)
    try:
        html = resp.read()
    finally:
        resp.close()
    return baidu_stu_html_extract(html)
def get_sub_img(pic_text_path, x, y):
    """Crop one tile out of the captcha's picture grid.

    Tiles are 67 pixels square and laid out on a 72 pixel pitch, starting
    5 pixels from the left edge and 41 pixels from the top edge.

    Args:
        pic_text_path: path of the image file to open (despite the name, the
            caller in this file passes the path of the full captcha image).
        x: column index, 0 to 3 inclusive.
        y: row index, 0 to 2 inclusive.

    Returns:
        The cropped tile as returned by ``Image.crop``.

    Raises:
        AssertionError: if ``x`` or ``y`` is outside the accepted range.
    """
    # Validate before touching the file system.
    assert 0 <= x <= 3
    assert 0 <= y <= 2
    tile = 67
    gap = 5
    left = 5 + (tile + gap) * x
    top = 41 + (tile + gap) * y
    im = Image.open(pic_text_path)
    return im.crop((left, top, left + tile, top + tile))
if __name__ == '__main__':
    # Script entry point: download a captcha, OCR its text prompt, look up
    # every tile on Baidu image search and list the tiles whose keywords
    # share a character with the prompt.

    # Module-level global: baidu_stu_lookup() reads UA for its User-Agent.
    UA = "Mozilla/5.0 AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2272.89 Safari/537.36"
    pic_path, pic_text_path = imgCut()
    captcha_text = ocrApi(pic_text_path)

    # Tile number (1-based, row by row) -> keyword string for that tile.
    dict_list = {}
    count = 0
    for y in range(2):
        for x in range(4):
            count += 1
            im2 = get_sub_img(pic_path, x, y)
            result = baidu_stu_lookup(im2)
            dict_list[count] = result
            print('%s %s' % ((y, x), result))

    captcha_stripped = captcha_text.strip()
    # Compare the LENGTH with 2: the former str-vs-int comparison was always
    # true on Python 2, which made the failure branch unreachable.
    if len(captcha_stripped) > 2:
        print('\n[*] Maybe the result of the:')
        # Decode the stripped text once, so the characters tested are exactly
        # the ones that were counted.
        captcha_chars = unicode(captcha_stripped, 'utf8')
        # One line per matching tile, in tile order; a tile matches when any
        # prompt character occurs in its keyword string.
        for v in sorted(dict_list):
            if any(ch in dict_list[v] for ch in captcha_chars):
                print('%s --- %s' % (v, dict_list[v]))
    else:
        print('[-] False')

harite commented Mar 17, 2015

有趣 :-)

nice

坑爹的12306证书会报错啊。。

Owner

evi1m0 commented Mar 17, 2015

关于证书问题,我在代码中添加了:

requests.get(pic_url, verify=False, timeout=5)

楼主,我把count加入到了baidu_stu_lookup函数里边,这样就把所有子图片保存了下来。。
count += 1
result = baidu_stu_lookup(im2, count)

dpp2009 commented Mar 19, 2015

试了下 文字没识别出来

The text cannot be recognized, and neither can the result, but the idea is clear.

12306 新版验证码识别脚本:好使不?

chemzqm commented Feb 2, 2016

试了下这里给出的 ocr api,成功率很低,现在 12306 文字上会有一些干扰,这个 ocr 应付不了,大概用商用的会好很多

好的开头等于成功的一半

guohuifengby commented Apr 25, 2016

IOError: [Errno 2] No such file or directory: './12306_pic/1461559784.jpg'
mac,先解决了几小时解决库问题,发现运行失败。无法创建文件或者读文件吗?我看你用的也是mac呀
---解决,python新手,我以为跟其他语言一样,创建文件的时候,没有那个路径会创建那个路径,结果自己新建了一个12306文件夹就好了。文字识别率太低,不到1%感觉

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment