Last active
August 29, 2015 14:26
-
-
Save ficapy/da009402855f962eba3e to your computer and use it in GitHub Desktop.
多线程请求
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Author: Ficapy
# Create: '15/8/6'
import time | |
import math | |
import requests | |
from concurrent.futures import ThreadPoolExecutor, as_completed | |
from xlrd import open_workbook | |
# Account credentials used for the login request (placeholders here).
USERNAME = 'XXXXXXXXX'
PASSWORD = 'XXXXXXXXX'

# Default headers sent with every request made through the shared session.
headers = {
    'User-Agent': (
        'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; WOW64; Trident/4.0; SLCC2; .NET CLR 2.0.'
        '50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E)'
    ),
    'Host': 'www.szceb.com',
    'Accept-Encoding': 'gzip, deflate, sdch',
    'Accept-Language': 'zh-CN,zh;q=0.8,en;q=0.6',
    'Cache-Control': 'no-cache',
    'Connection': 'keep-alive',
}

# One session object is shared by the login step and by all query requests.
session = requests.Session()
session.headers.update(headers)
# Collect the values of the second column (index 1) of the first sheet,
# skipping row 0 (presumably a header row -- confirm against the workbook).
with open_workbook('1.xls') as book:
    sheet = book.sheet_by_index(0)
    row_number = sheet.nrows
    raw_data = [sheet.cell(row, 1).value for row in range(1, row_number)]
# Simulated login. Per the original author's note, the site has a flaw: it does
# not check that the captcha value is valid, so a fixed value is submitted.
# The login page is fetched first (presumably to obtain session cookies --
# confirm); timeouts match the one used for the query requests so a stalled
# server cannot hang the script forever.
session.get('http://www.szceb.com/szceb/login.jsp', timeout=60)
loging_url = 'http://www.szceb.com/szceb/login.do?method=login'
login = session.post(
    loging_url,
    data={
        'loginSignal': 1,
        'username': USERNAME,
        'password': PASSWORD,
        'yanzheng': 1234,
    },
    timeout=60,
)
# Explicit check instead of `assert`: asserts are removed under `python -O`,
# which would let a failed login go unnoticed and every later query misbehave.
if '登陆后显示的主页面' not in login.text:
    raise RuntimeError('login failed: expected marker text not found in the login response')
def work(input_data, retry=3):
    """Query the filing status of one item number, retrying failed requests.

    Posts ``input_data`` as the ``goodsRegCiqList.itemNo`` form field through
    the shared module-level ``session``.

    Args:
        input_data: The item number to look up.
        retry: Maximum number of retries after the first failed attempt.

    Returns:
        ``input_data`` when the response contains the "no data found" marker
        text, otherwise ``True``.

    Raises:
        ValueError: With ``input_data`` as its argument, when the request
            still fails after ``retry`` retries.
    """
    url = 'http://www.szceb.com/szceb/goodsFilingStatus.do?method=filingStatus'
    data = {
        'goodsRegCiqList.itemNo': input_data,
        'goodsRegCiqList.GName': '',
        'goodsRegCiq.ebcCode': '',
        'goodsRegCiq.ebpCode': '',
        'goodsRegCiqList.operType': '',
        'goodsRegCiqList.ciqStatus': '',
        'strStatDate': '',
        'strEndDate': '',
        'page': 1,
    }
    attempt = 0
    while True:
        try:
            result = session.post(url, data=data, timeout=60)
            result.raise_for_status()
            no_data = '没有找到任何数据' in result.text
        except requests.RequestException as exc:
            # Give up *before* sleeping or announcing another retry, so no
            # time is wasted and no retry is reported that never happens.
            if attempt >= retry:
                raise ValueError(input_data) from exc
            attempt += 1
            # Exponential backoff that grows with each retry: 1s, 2s, 4s, ...
            time.sleep(math.pow(2, attempt - 1))
            print('{}重试第{}次'.format(input_data, attempt))
        else:
            return input_data if no_data else True
def main():
    """Run ``work`` for every entry of ``raw_data`` on a thread pool and report.

    Items for which ``work`` returns anything other than ``True`` are
    collected and printed, sorted, at the end. Items whose query raised are
    reported on their own line as they complete and are not added to that
    list.
    """
    non_data = []
    print('开始咯~~~')
    with ThreadPoolExecutor(max_workers=10) as executor:
        # Map each future back to the item it was submitted for.
        future_to_item = {executor.submit(work, item): item for item in raw_data}
        for future in as_completed(future_to_item):
            input_data = future_to_item[future]
            try:
                result = future.result()
            except Exception as exc:
                print('%r 请求出错: %s' % (input_data, exc))
            else:
                if result is not True:
                    non_data.append(input_data)
    print('共查询{}条数据, 没有任何数据的有{}条'.format(len(raw_data), len(non_data)))
    print('以下条目没有查询到数据:')
    # Spreadsheet cells are not guaranteed to be strings (numeric cells are
    # read as numbers); convert first so sorting and joining cannot raise
    # TypeError and discard the whole report.
    print('\n'.join(sorted(str(item) for item in non_data)))


if __name__ == '__main__':
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment