Create a gist now

Instantly share code, notes, and snippets.

What would you like to do?
12306 新版验证码识别脚本 (已失效)
#!/usr/bin/env python
# coding=utf8
# author=evi1m0
# website=linux.im
'''
12306 Captcha Picture:
author: Evi1m0@20150316
1. Download Captcha
2. Convert picture to text
3. Return result
'''
import json
import os
import re
import time
import urllib
import urllib2

import requests
from PIL import Image
def downloadImg():
    """Download one 12306 login captcha and save it under ./12306_pic/.

    The image is written to ``./12306_pic/<unix-timestamp>.jpg``.  The
    output directory is created first when it does not exist yet.

    Returns:
        int: the Unix timestamp used as the base name of the saved file.

    Raises:
        requests.exceptions.RequestException: if the retry also fails.
    """
    pic_dir = "./12306_pic"
    pic_file = int(time.time())
    pic_url = "https://kyfw.12306.cn/otn/passcodeNew/getPassCodeNew?module=login&rand=sjrand"
    print('[+] Download Picture: {}'.format(pic_url))
    # open() does not create missing parent directories, so make sure the
    # target directory exists before anything is written into it.
    if not os.path.isdir(pic_dir):
        os.makedirs(pic_dir)
    try:
        # verify=False: certificate verification is switched off for this
        # host on purpose.
        resp = requests.get(pic_url, verify=False, timeout=5)
    except requests.exceptions.RequestException:
        # Retry exactly once (with a shorter timeout); a second failure
        # propagates to the caller.
        resp = requests.get(pic_url, verify=False, timeout=3)
    with open("%s/%s.jpg" % (pic_dir, pic_file), 'wb') as fp:
        fp.write(resp.content)
    return pic_file
def imgCut():
    """Download a fresh captcha and save a cropped strip of it.

    The strip is the (120, 0, 290, 25) region of the downloaded image
    (presumably where the prompt text is rendered -- confirm against a
    real captcha).

    Returns:
        tuple: ``(full_image_path, strip_image_path)``.
    """
    stamp = downloadImg()
    full_path = "./12306_pic/%s.jpg" % stamp
    strip_path = './12306_pic/%s_text.jpg' % stamp
    # Box is (left, upper, right, lower) in pixels.
    strip = Image.open(full_path).crop((120, 0, 290, 25))
    strip.save(strip_path)
    print('[*] Picture Text Picture: {}'.format(strip_path))
    return full_path, strip_path
def ocrApi(filename):
    """Upload an image to the cn.docs88.com converter and fetch its text.

    Args:
        filename: path of the image file to upload.

    Returns:
        The body (``content``) of the result request.  It is returned
        whether or not that request answered with status 200.
    """
    upload_pic_url = "http://cn.docs88.com/pdftowordupload2.php"
    headers_fake = {
        'Accept': '*/*',
        'Accept-Encoding': 'gzip, deflate',
        'Accept-Language': 'zh-CN,zh;q=0.8,en;q=0.6',
        'Connection': 'keep-alive',
        'Host': 'cn.docs88.com',
        'Origin': 'http://cn.docs88.com',
        'User-Agent': 'Mozilla/5.0 (KHTML, like Gecko) Chrome/41.0.2272.89',
        'X-Requested-With': 'ShockwaveFlash/17.0.0.134',
    }
    filename_tmp = filename.split('/')[-1]
    para = {
        'Filename': filename_tmp,
        'sourcename': filename_tmp,
        'sourcelanguage': 'cn',
        'desttype': 'txt',
        'Upload': 'Submit Query',
    }
    # Open the upload inside a context manager so the handle is closed as
    # soon as the request has been sent.
    with open(filename, 'rb') as pic_fp:
        upload_pic = requests.post(upload_pic_url, data=para,
                                   files={"Filedata": pic_fp},
                                   headers=headers_fake)
    # Wait before asking for the result.
    time.sleep(2)
    # The upload response, minus its first three bytes, is used as the
    # relative path of the result.
    # NOTE(review): the skipped prefix is presumably a UTF-8 BOM -- confirm.
    text_result_url = 'http://cn.docs88.com/' + upload_pic.content[3:]
    text_result = requests.get(text_result_url)
    if text_result.status_code == 200:
        print('[*] Text: {}'.format(text_result.content))
    else:
        print('[-] False')
    return text_result.content
'''
baidu stu
author: andelf
'''
def baidu_stu_html_extract(html):
    """Extract the keyword guesses embedded in a result page.

    Looks for the first ``keywords:'...'`` fragment in *html*, decodes it
    as a JSON list of objects and joins their ``keyword`` values with
    ``|``.

    Args:
        html: page source to scan.

    Returns:
        The ``|``-joined keywords, or ``'[UNKOWN]'`` when the fragment is
        missing or the list is empty.
    """
    found = re.search(r"keywords:'(.*?)'", html)
    if found is None:
        return '[UNKOWN]'
    # The JSON sits inside a JS string literal: turn the \x22 escapes back
    # into double quotes and collapse doubled backslashes.
    payload = found.group(1).replace('\\x22', '"').replace('\\\\', '\\')
    keywords = []
    for entry in json.loads(payload):
        keywords.append(entry['keyword'])
    if not keywords:
        return '[UNKOWN]'
    return '|'.join(keywords)
def baidu_stu_lookup(im):
    """Look an image up on stu.baidu.com and return its keyword guesses.

    The image is saved to ``./query_temp_img.png``, its bytes are POSTed
    to the upload endpoint, and the URL that comes back is passed to the
    search page, whose HTML is handed to ``baidu_stu_html_extract``.

    Reads the module-level ``UA`` string for the User-Agent header.

    Args:
        im: object with a ``save(path)`` method (a PIL image in this file).

    Returns:
        Whatever ``baidu_stu_html_extract`` returns for the result page.
    """
    url = ("http://stu.baidu.com/n/image?fr=html5&needRawImageUrl=true&id="
           "WU_FILE_0&name=233.png&type=image%2Fpng&lastModifiedDate=Mon+Mar"
           "+16+2015+20%3A49%3A11+GMT%2B0800+(CST)&size=")
    im.save("./query_temp_img.png")
    with open("./query_temp_img.png", 'rb') as fp:
        raw = fp.read()
    # The upload URL ends with the payload size in bytes.
    url = url + str(len(raw))
    req = urllib2.Request(url, raw, {'Content-Type': 'image/png', 'User-Agent': UA})
    resp = urllib2.urlopen(req)
    try:
        resp_url = resp.read()  # return a pure url
    finally:
        resp.close()
    url = "http://stu.baidu.com/n/searchpc?queryImageUrl=" + urllib.quote(resp_url)
    req = urllib2.Request(url, headers={'User-Agent': UA})
    resp = urllib2.urlopen(req)
    try:
        html = resp.read()
    finally:
        resp.close()
    return baidu_stu_html_extract(html)
def get_sub_img(pic_text_path, x, y):
    """Crop one 67x67 tile out of the image grid.

    Args:
        pic_text_path: path of the image to open.
        x: column index, 0 to 3 inclusive.
        y: row index, 0 to 2 inclusive.

    Returns:
        The region returned by ``Image.crop`` for the selected tile.

    Raises:
        AssertionError: if *x* or *y* is outside the accepted range.
    """
    im = Image.open(pic_text_path)
    assert 0 <= x <= 3
    assert 0 <= y <= 2
    tile = 67         # side length of one tile, in pixels
    gap = 5           # added to the tile size to get the grid stride
    left_margin = 5   # x offset of the first column
    top_margin = 41   # y offset of the first row
    left = left_margin + (tile + gap) * x
    top = top_margin + (tile + gap) * y
    return im.crop((left, top, left + tile, top + tile))
# Defined at module level because baidu_stu_lookup() reads it as a global;
# keeping it under the __main__ guard made that function fail on import use.
UA = "Mozilla/5.0 AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2272.89 Safari/537.36"


def match_tiles(captcha_text, tile_labels):
    """List the tiles whose labels share a character with the captcha text.

    Args:
        captcha_text: recognised prompt text, as UTF-8 bytes or as
            already-decoded text.  Surrounding whitespace is ignored.
        tile_labels: dict mapping tile number to its label string.

    Returns:
        list: one ``'<tile> --- <labels>'`` string per matching tile, in
        ascending tile order, without duplicates.
    """
    text = captcha_text.strip()
    if isinstance(text, bytes):
        text = text.decode('utf8')
    return ['%s --- %s' % (tile, tile_labels[tile])
            for tile in sorted(tile_labels)
            if any(ch in tile_labels[tile] for ch in text)]


def main():
    """Download a captcha, OCR its prompt and print the tiles that match."""
    pic_path, pic_text_path = imgCut()
    captcha_text = ocrApi(pic_text_path)
    tile_labels = {}
    count = 0
    # Tiles are numbered 1..8, row by row, over a 2-row by 4-column grid.
    for y in range(2):
        for x in range(4):
            count += 1
            tile_labels[count] = baidu_stu_lookup(get_sub_img(pic_path, x, y))
            print('%s %s' % ((y, x), tile_labels[count]))
    # Compare the *length* of the OCR output (in bytes); the bare
    # string-vs-int comparison used before was always true on Python 2.
    # NOTE(review): threshold of 2 bytes presumably means "at least one
    # multi-byte character" -- confirm the intended minimum.
    if len(captcha_text.strip()) > 2:
        print('\n[*] Maybe the result of the:')
        for line in match_tiles(captcha_text, tile_labels):
            print(line)
    else:
        print('[-] False')


if __name__ == '__main__':
    main()
@harite

This comment has been minimized.

Show comment Hide comment
@harite

harite Mar 17, 2015

有趣 :-)

harite commented Mar 17, 2015

有趣 :-)

@JiounDai

This comment has been minimized.

Show comment Hide comment
@JiounDai

JiounDai Mar 17, 2015

nice

nice

@zhangzhishan

This comment has been minimized.

Show comment Hide comment
@zhangzhishan

zhangzhishan Mar 17, 2015

坑爹的12306证书会报错啊。。

坑爹的12306证书会报错啊。。

@evi1m0

This comment has been minimized.

Show comment Hide comment
@evi1m0

evi1m0 Mar 17, 2015

关于证书问题,我在代码中添加了:

requests.get(pic_url, verify=False, timeout=5)
Owner

evi1m0 commented Mar 17, 2015

关于证书问题,我在代码中添加了:

requests.get(pic_url, verify=False, timeout=5)
@summer20100514

This comment has been minimized.

Show comment Hide comment
@summer20100514

summer20100514 Mar 18, 2015

楼主,我把count加入到了baidu_stu_lookup函数里边,这样就把所有子图片保存了下来。。
count += 1
result = baidu_stu_lookup(im2, count)

楼主,我把count加入到了baidu_stu_lookup函数里边,这样就把所有子图片保存了下来。。
count += 1
result = baidu_stu_lookup(im2, count)

@dpp2009

This comment has been minimized.

Show comment Hide comment
@dpp2009

dpp2009 Mar 19, 2015

试了下 文字没识别出来

dpp2009 commented Mar 19, 2015

试了下 文字没识别出来

@chaojunhou

This comment has been minimized.

Show comment Hide comment
@chaojunhou

chaojunhou Mar 19, 2015

The text can not be recognized, so does the result! but the idea is clear

The text can not be recognized, so does the result! but the idea is clear

@xgqfrms2015

This comment has been minimized.

Show comment Hide comment
@xgqfrms2015

xgqfrms2015 Dec 31, 2015

12306 新版验证码识别脚本:好使不?

12306 新版验证码识别脚本:好使不?

@chemzqm

This comment has been minimized.

Show comment Hide comment
@chemzqm

chemzqm Feb 2, 2016

试了下这里给出的 ocr api,成功率很低,现在 12306 文字上会有一些干扰,这个 ocr 应付不了,大概用商用的会好很多

chemzqm commented Feb 2, 2016

试了下这里给出的 ocr api,成功率很低,现在 12306 文字上会有一些干扰,这个 ocr 应付不了,大概用商用的会好很多

@understar

This comment has been minimized.

Show comment Hide comment
@understar

understar Apr 11, 2016

好的开头等于成功的一半

好的开头等于成功的一半

@guohuifengby

This comment has been minimized.

Show comment Hide comment
@guohuifengby

guohuifengby Apr 25, 2016

IOError: [Errno 2] No such file or directory: './12306_pic/1461559784.jpg'
mac,先解决了几小时解决库问题,发现运行失败。无法创建文件或者读文件吗?我看你用的也是mac呀
---解决,python新手,我以为跟其他语言一样,创建文件的时候,没有那个路径会创建那个路径,结果自己新建了一个12306文件夹就好了。文字识别率太低,不到1%感觉

guohuifengby commented Apr 25, 2016

IOError: [Errno 2] No such file or directory: './12306_pic/1461559784.jpg'
mac,先解决了几小时解决库问题,发现运行失败。无法创建文件或者读文件吗?我看你用的也是mac呀
---解决,python新手,我以为跟其他语言一样,创建文件的时候,没有那个路径会创建那个路径,结果自己新建了一个12306文件夹就好了。文字识别率太低,不到1%感觉

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment