Last active
June 15, 2022 15:23
-
-
Save kiterza/8cb9f4bbac304cbfa7eec14f50aee9db to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import csv  # writes the collected rows to the CSV file
import os
import time
import urllib.request  # fetches the pages

from lxml import etree
# etree.parse() parses a local file
# etree.HTML() parses markup returned by a server
# 数据采集:http://www.instu.org/da101/zhanshi.php | |
# 对程序函数进行批量导入 | |
def n_abcd(str_a=' ') -> str:
    """Convert a numeric answer code into its option letter.

    The strings '1'..'5' map to 'A'..'E'.  Any other value (including the
    default) returns the placeholder "暂无答案" ("no answer available"),
    so the function always returns a string.
    """
    for code, letter in zip("12345", "ABCDE"):
        if str_a == code:
            return letter
    return "暂无答案"
def _rec1_30_options(texts, answer, topic_no):
    """Return the texts of options A-D for one single-choice question.

    Args:
        texts: the ``p/text()`` nodes of the question.
        answer: the answer code read from the ``<font>`` node, or "暂无答案"
            when the question has none.
        topic_no: 1-based position of the question on its page; printed when
            the question does not match a known layout.

    Returns:
        A list of four strings.  A question whose node count / answer
        combination is not listed below is reported on stdout and yields
        four empty strings, so no value from an earlier question can leak
        into its row.

    Raises:
        IndexError: a required text node or '#'-field is missing.
    """
    # One (text-node index, '#'-field index, may-be-missing) triple per
    # option A..D.  The option that carries the answer is read from field 0,
    # the others from field 2.
    # Answer A: every option sits one node further down, so D is node 5.
    answer_a = ((2, 0, False), (3, 2, False), (4, 2, False), (5, 2, True))
    answer_b = ((1, 2, False), (2, 0, False), (3, 2, False), (4, 2, True))
    answer_c = ((1, 2, False), (2, 2, False), (3, 0, False), (4, 2, True))
    answer_d = ((1, 2, False), (2, 2, False), (3, 2, False), (4, 0, False))
    unanswered = ((1, 2, False), (2, 2, False), (3, 2, False), (4, 2, False))
    # Eight-node questions answered D use shifted nodes; kept exactly as the
    # script was tuned against the site.
    answer_d_shifted = ((2, 1, False), (3, 2, False), (4, 2, False), (5, 0, False))
    # node count -> (layout per answer code, message printed when unsupported)
    layouts = {
        7: ({'1': answer_a, '2': answer_b, '3': answer_c, '4': answer_d,
             "暂无答案": unanswered}, "异常3"),
        8: ({'1': answer_a, '2': answer_b, '3': answer_c,
             '4': answer_d_shifted}, "异常"),
        9: ({'2': answer_b, '3': answer_c, '4': answer_d}, "异常2"),
        10: ({'1': answer_a}, "异常1-10"),
    }
    by_answer, anomaly = layouts.get(len(texts), ({}, "异常1"))
    layout = by_answer.get(answer)
    if layout is None:
        print(anomaly)
        print(topic_no)
        return ["", "", "", ""]
    options = []
    for node, field, may_be_missing in layout:
        try:
            options.append(texts[node].split('#')[field])
        except IndexError:
            if not may_be_missing:
                raise
            options.append("")
    return options


def rec1_30() -> None:
    """Append the single-choice questions of pages 1-30 to the CSV file.

    Fetches each page, reads topics 1-10 from it and appends one row per
    topic to ``./test/数据采集.csv``: question, options A-D, a blank option
    E, the answer letter and the last text node of the topic.  Progress is
    printed after every page.
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) "
                      "Chrome/100.0.4896.75 Safari/537.36 "
    }
    with open('./test/数据采集.csv', 'a', newline="", encoding='utf-8') as a:
        writer = csv.writer(a)
        for i in range(1, 31):
            url = "http://www.instu.org/da101/zhanshi.php?page=" + str(i)
            request = urllib.request.Request(url=url, headers=headers)
            # Close the HTTP response as soon as the body has been read.
            with urllib.request.urlopen(request) as url_response:
                read = url_response.read().decode("utf-8")
            tree = etree.HTML(read)
            for j in range(1, 11):
                topic = r"//div[@class='div_topic'][" + str(j) + "]"
                test_h = tree.xpath(topic + "/h4/text()")
                test_p = tree.xpath(topic + "/p/text()")
                test_fo = tree.xpath(topic + "/p/font/text()")
                # Split on the first '、' only, so a '、' inside the question
                # text does not truncate it.
                an = test_h[0].split('、', 1)[1]
                try:
                    nn = test_fo[0].split('#')[1]
                except IndexError:
                    # No <font> node (or no '#' in it): question has no answer.
                    nn = "暂无答案"
                a1, a2, a3, a4 = _rec1_30_options(test_p, nn, j)
                ank = test_p[-1]
                writer.writerow(
                    [an, "A" + a1, "B" + a2, "C" + a3, "D" + a4, " ", n_abcd(nn), ank]
                )
            print("\r", end="")
            print("开始单选数据获取:进度: {}/30,".format(i), end=" ")
    print("获取完成")
def _rec31_single_options(texts, answer, topic_no):
    """Return the texts of options A-D for one single-choice question.

    Args:
        texts: the ``p/text()`` nodes of the question.
        answer: the answer code read from the ``<font>`` node, or "暂无答案"
            when the question has none.
        topic_no: 1-based position of the question on the page; printed when
            the question does not match a known layout.

    Returns:
        A list of four strings.  A question whose node count / answer
        combination is not listed below is reported on stdout and yields
        four empty strings, so no value from an earlier question can leak
        into its row.

    Raises:
        IndexError: a required text node or '#'-field is missing.
    """
    # One (text-node index, '#'-field index, may-be-missing) triple per
    # option A..D.  The option that carries the answer is read from field 0,
    # the others from field 2.
    # Answer A: every option sits one node further down, so D is node 5.
    answer_a = ((2, 0, False), (3, 2, False), (4, 2, False), (5, 2, True))
    answer_b = ((1, 2, False), (2, 0, False), (3, 2, False), (4, 2, True))
    answer_c = ((1, 2, False), (2, 2, False), (3, 0, False), (4, 2, True))
    answer_d = ((1, 2, False), (2, 2, False), (3, 2, False), (4, 0, False))
    unanswered = ((1, 2, False), (2, 2, False), (3, 2, False), (4, 2, False))
    # Eight-node questions answered D use shifted nodes; kept exactly as the
    # script was tuned against the site.
    answer_d_shifted = ((2, 1, False), (3, 2, False), (4, 2, False), (5, 0, False))
    # node count -> (layout per answer code, message printed when unsupported)
    layouts = {
        7: ({'1': answer_a, '2': answer_b, '3': answer_c, '4': answer_d,
             "暂无答案": unanswered}, "异常3"),
        8: ({'1': answer_a, '2': answer_b, '3': answer_c,
             '4': answer_d_shifted}, "异常"),
        9: ({'2': answer_b, '3': answer_c, '4': answer_d}, "异常2"),
        10: ({'1': answer_a}, "异常1-10"),
    }
    by_answer, anomaly = layouts.get(len(texts), ({}, "异常1"))
    layout = by_answer.get(answer)
    if layout is None:
        print(anomaly)
        print(topic_no)
        return ["", "", "", ""]
    options = []
    for node, field, may_be_missing in layout:
        try:
            options.append(texts[node].split('#')[field])
        except IndexError:
            if not may_be_missing:
                raise
            options.append("")
    return options


def _rec31_multi_options(texts, answers):
    """Return the option texts of one multiple-choice question.

    Args:
        texts: the ``p/text()`` nodes of the question.
        answers: the answer codes ('1'..'5') read from its ``<font>`` nodes.

    Returns:
        Four strings (A-D), or five (A-E) when '5' is among the answers.

    Raises:
        IndexError: a required text node or '#'-field is missing.
    """
    # When option A is an answer, every option sits one node further down.
    offset = 1 if '1' in answers else 0
    five_options = '5' in answers
    options = []
    for number in range(1, 6 if five_options else 5):
        chosen = str(number) in answers
        try:
            # Answer options are read from field 0, the others from field 2.
            options.append(texts[number + offset].split('#')[0 if chosen else 2])
        except IndexError:
            # Only a non-answer option C or D of a four-option question may
            # be absent; it is then recorded as a single space.
            if five_options or chosen or number < 3:
                raise
            options.append(" ")
    return options


def rec31() -> None:
    """Append the questions of page 31 to the CSV file.

    Topics 1-7 are handled as single-choice questions and topics 8-10 as
    multiple-choice questions.  One row per topic is appended to
    ``./test/数据采集.csv`` and progress is printed after every topic.
    """
    with open('./test/数据采集.csv', 'a', newline="", encoding='utf-8') as a:
        writer = csv.writer(a)
        url = "http://www.instu.org/da101/zhanshi.php?page=31"
        headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.75 Safari/537.36"
        }
        request = urllib.request.Request(url=url, headers=headers)
        # Close the HTTP response as soon as the body has been read.
        with urllib.request.urlopen(request) as url_response:
            read = url_response.read().decode("utf-8")
        tree = etree.HTML(read)

        # Single-choice part.
        for j in range(1, 8):
            topic = r"//div[@class='div_topic'][" + str(j) + "]"
            test_h = tree.xpath(topic + "/h4/text()")
            test_p = tree.xpath(topic + "/p/text()")
            test_fo = tree.xpath(topic + "/p/font/text()")
            # Split on the first '、' only, so a '、' inside the question
            # text does not truncate it.
            an = test_h[0].split('、', 1)[1]
            try:
                nn = test_fo[0].split('#')[1]
            except IndexError:
                # No <font> node (or no '#' in it): question has no answer.
                nn = "暂无答案"
            a1, a2, a3, a4 = _rec31_single_options(test_p, nn, j)
            ank = test_p[-1]
            writer.writerow(
                [an, "A" + a1, "B" + a2, "C" + a3, "D" + a4, " ", n_abcd(nn), ank]
            )
            print("\r", end="")
            print("开始混合数据获取:进度: {}/7,".format(j), end=" ")
        print("获取完成")

        # Multiple-choice part.
        for j in range(8, 11):
            topic = r"//div[@class='div_topic'][" + str(j) + "]"
            test_h = tree.xpath(topic + "/h4/text()")
            test_p = tree.xpath(topic + "/p/text()")
            test_fo = tree.xpath(topic + "/p/font/text()")
            an = test_h[0].split('、', 1)[1]
            list_fo = [node.split('#')[1] for node in test_fo]
            options = _rec31_multi_options(test_p, list_fo)
            # Numeric answer codes converted to letters.
            # NOTE(review): the list itself is written, so the CSV cell holds
            # its Python repr — confirm that consumers expect this.
            list_a = [n_abcd(code) for code in list_fo]
            an5 = ("E" + options[4]) if len(options) == 5 else " "
            ank = test_p[-1]
            writer.writerow(
                [an, "A" + options[0], "B" + options[1], "C" + options[2],
                 "D" + options[3], an5, list_a, ank]
            )
            print("\r", end="")
            print("开始混合数据获取:进度: {}/10,".format(j), end=" ")
        print("获取完成")
def _rec32_47_options(texts, answers):
    """Return the option texts of one multiple-choice question.

    Args:
        texts: the ``p/text()`` nodes of the question.
        answers: the answer codes ('1'..'5') read from its ``<font>`` nodes.

    Returns:
        Four strings (A-D), or five (A-E) when '5' is among the answers.

    Raises:
        IndexError: a required text node or '#'-field is missing.
    """
    # When option A is an answer, every option sits one node further down.
    offset = 1 if '1' in answers else 0
    five_options = '5' in answers
    options = []
    for number in range(1, 6 if five_options else 5):
        chosen = str(number) in answers
        try:
            # Answer options are read from field 0, the others from field 2.
            options.append(texts[number + offset].split('#')[0 if chosen else 2])
        except IndexError:
            # Only a non-answer option C or D of a four-option question may
            # be absent; it is then recorded as a single space.
            if five_options or chosen or number < 3:
                raise
            options.append(" ")
    return options


def rec32_47() -> None:
    """Append the multiple-choice questions of pages 32-47 to the CSV file.

    Fetches each page, reads topics 1-10 from it and appends one row per
    topic to ``./test/数据采集.csv``: question, options A-D, option E (a
    single space when the question has none), the answer letters and the
    last text node of the topic.  Progress is printed after every page.
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.75 Safari/537.36"
    }
    with open('./test/数据采集.csv', 'a', newline="", encoding='utf-8') as a:
        writer = csv.writer(a)
        for page in range(32, 48):
            url = "http://www.instu.org/da101/zhanshi.php?page=" + str(page)
            request = urllib.request.Request(url=url, headers=headers)
            # Close the HTTP response as soon as the body has been read.
            with urllib.request.urlopen(request) as url_response:
                read = url_response.read().decode("utf-8")
            tree = etree.HTML(read)
            for j in range(1, 11):
                topic = r"//div[@class='div_topic'][" + str(j) + "]"
                test_h = tree.xpath(topic + "/h4/text()")
                test_p = tree.xpath(topic + "/p/text()")
                test_fo = tree.xpath(topic + "/p/font/text()")
                # Split on the first '、' only, so a '、' inside the question
                # text does not truncate it.
                an = test_h[0].split('、', 1)[1]
                list_fo = [node.split('#')[1] for node in test_fo]
                options = _rec32_47_options(test_p, list_fo)
                # Numeric answer codes converted to letters.
                # NOTE(review): the list itself is written, so the CSV cell
                # holds its Python repr — confirm that consumers expect this.
                list_a = [n_abcd(code) for code in list_fo]
                an5 = ("E" + options[4]) if len(options) == 5 else " "
                ank = test_p[-1]
                writer.writerow(
                    [an, "A" + options[0], "B" + options[1], "C" + options[2],
                     "D" + options[3], an5, list_a, ank]
                )
            print("\r", end="")
            print("开始多选数据获取:进度: {}/32-47,".format(page), end=" ")
    print("获取完成")
def _rec48_options(texts, answers):
    """Return the option texts of one multiple-choice question.

    Args:
        texts: the ``p/text()`` nodes of the question.
        answers: the answer codes ('1'..'5') read from its ``<font>`` nodes.

    Returns:
        Four strings (A-D), or five (A-E) when '5' is among the answers.

    Raises:
        IndexError: a required text node or '#'-field is missing.
    """
    # When option A is an answer, every option sits one node further down.
    offset = 1 if '1' in answers else 0
    five_options = '5' in answers
    options = []
    for number in range(1, 6 if five_options else 5):
        chosen = str(number) in answers
        try:
            # Answer options are read from field 0, the others from field 2.
            options.append(texts[number + offset].split('#')[0 if chosen else 2])
        except IndexError:
            # Only a non-answer option C or D of a four-option question may
            # be absent; it is then recorded as a single space.
            if five_options or chosen or number < 3:
                raise
            options.append(" ")
    return options


def rec48() -> None:
    """Append the questions of page 48 to the CSV file.

    Topics 1-4 are handled as multiple-choice questions and topics 5-10 as
    true/false questions, whose option columns are filled with single
    spaces.  One row per topic is appended to ``./test/数据采集.csv``.
    """
    with open('./test/数据采集.csv', 'a', newline="", encoding='utf-8') as a:
        writer = csv.writer(a)
        url = "http://www.instu.org/da101/zhanshi.php?page=48"
        headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) "
                          "Chrome/100.0.4896.75 Safari/537.36 "
        }
        request = urllib.request.Request(url=url, headers=headers)
        # Close the HTTP response as soon as the body has been read.
        with urllib.request.urlopen(request) as url_response:
            read = url_response.read().decode("utf-8")
        tree = etree.HTML(read)

        # Multiple-choice part.
        for j in range(1, 5):
            topic = r"//div[@class='div_topic'][" + str(j) + "]"
            test_h = tree.xpath(topic + "/h4/text()")
            test_p = tree.xpath(topic + "/p/text()")
            test_fo = tree.xpath(topic + "/p/font/text()")
            # Split on the first '、' only, so a '、' inside the question
            # text does not truncate it.
            an = test_h[0].split('、', 1)[1]
            list_fo = [node.split('#')[1] for node in test_fo]
            options = _rec48_options(test_p, list_fo)
            # Numeric answer codes converted to letters.
            # NOTE(review): the list itself is written, so the CSV cell holds
            # its Python repr — confirm that consumers expect this.
            list_a = [n_abcd(code) for code in list_fo]
            an5 = ("E" + options[4]) if len(options) == 5 else " "
            ank = test_p[-1]
            writer.writerow(
                [an, "A" + options[0], "B" + options[1], "C" + options[2],
                 "D" + options[3], an5, list_a, ank]
            )

        # True/false part: the first two text nodes hold the answer and the
        # last-column text, each following a '】 ' marker.
        for j in range(5, 11):
            topic = r"//div[@class='div_topic'][" + str(j) + "]"
            test_h = tree.xpath(topic + "/h4/text()")
            test_p = tree.xpath(topic + "/p/text()")
            an = test_h[0].split('、', 1)[1]
            ann = test_p[0].split('】 ', 1)[1]
            ank = test_p[1].split('】 ', 1)[1]
            writer.writerow([an, " ", " ", " ", " ", " ", ann, ank])
    print("多选判断混合数据获取:进度10/10,", end=" ")
    time.sleep(0.5)
    print("获取完成")
def rec49_75() -> None:
    """Append the true/false questions of pages 49-75 to the CSV file.

    Pages 49-74 are read for topics 1-10 and page 75 for topics 1-6.  One
    row per topic is appended to ``./test/数据采集.csv``: question, five
    option columns filled with single spaces, the answer and the last-column
    text.  Progress is printed after every page.
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) "
                      "Chrome/100.0.4896.75 Safari/537.36 "
    }
    with open('./test/数据采集.csv', 'a', newline="", encoding='utf-8') as a:
        writer = csv.writer(a)
        for page in range(49, 76):
            url = "http://www.instu.org/da101/zhanshi.php?page=" + str(page)
            request = urllib.request.Request(url=url, headers=headers)
            # Close the HTTP response as soon as the body has been read.
            with urllib.request.urlopen(request) as url_response:
                read = url_response.read().decode("utf-8")
            tree = etree.HTML(read)
            # Page 75 is read for six topics only; all earlier pages for ten.
            topic_count = 10 if page < 75 else 6
            for j in range(1, topic_count + 1):
                topic = r"//div[@class='div_topic'][" + str(j) + "]"
                test_h = tree.xpath(topic + "/h4/text()")
                test_p = tree.xpath(topic + "/p/text()")
                # Split on the first '、' only, so a '、' inside the question
                # text does not truncate it.
                an = test_h[0].split('、', 1)[1]
                # The first two text nodes hold the answer and the
                # last-column text, each following a '】 ' marker.
                ann = test_p[0].split('】 ', 1)[1]
                ank = test_p[1].split('】 ', 1)[1]
                writer.writerow([an, " ", " ", " ", " ", " ", ann, ank])
            print("\r", end="")
            print("开始判断对错数据获取:进度: {}/49-75,".format(page), end=" ")
    print("获取完成")
# 导入完成 | |
if __name__ == '__main__':
    # Script entry point: prepare the CSV file, run every collector in page
    # order and report the elapsed time.
    header = ['题干', '选项A', '选项B', '选项C', '选项D', "选项E", '正确答案', '考点']
    csv_path = './test/数据采集.csv'
    # The collectors open this path in append mode, so its directory has to
    # exist before any of them runs.
    os.makedirs(os.path.dirname(csv_path), exist_ok=True)
    # Write the header only into a new or empty file; a re-run would
    # otherwise insert another header row between data rows.
    if not os.path.exists(csv_path) or os.path.getsize(csv_path) == 0:
        with open(csv_path, 'a', newline="", encoding='utf-8') as a:
            writer = csv.writer(a)
            writer.writerow(header)
    # Pages 1-30 are single choice, page 31 mixes single and multiple choice,
    # pages 32-47 are multiple choice, page 48 mixes multiple choice and
    # true/false, and the remaining pages are true/false.
    rec_open = time.time()
    print("---------------开始获取---------------")
    rec1_30()  # single-choice pages
    rec31()  # mixed single/multiple-choice page
    rec32_47()  # multiple-choice pages
    rec48()  # mixed multiple-choice / true-false page
    rec49_75()  # true/false pages
    rec_end = time.time()
    time.sleep(0.5)
    print("---------------获取完毕---------------\n")
    time.sleep(1)
    atime = rec_end - rec_open
    print('全部获取完毕,耗时{:.2f}秒'.format(atime))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
如果你要在本地运行,请务必先在此 py 文件所在目录下建立 `test` 目录,用于存放生成的 csv 文件。