Skip to content

Instantly share code, notes, and snippets.

@kiterza
Last active June 15, 2022 15:23
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save kiterza/8cb9f4bbac304cbfa7eec14f50aee9db to your computer and use it in GitHub Desktop.
Save kiterza/8cb9f4bbac304cbfa7eec14f50aee9db to your computer and use it in GitHub Desktop.
from lxml import etree
# 解析本地文件 etree.parse()
# 解析服务器返回信息 etree.HTML()
import urllib.request # 调用urllib库
import csv # 调用csv库,用于写入
import time
# 数据采集:http://www.instu.org/da101/zhanshi.php
# 对程序函数进行批量导入
# Maps the site's 1-based option number (as a string) to its answer letter.
_ANSWER_LETTERS = {'1': 'A', '2': 'B', '3': 'C', '4': 'D', '5': 'E'}


def n_abcd(str_a=' ') -> str:
    """Convert a numeric answer marker into its option letter.

    Args:
        str_a: The option number as a string, ``'1'`` to ``'5'``.

    Returns:
        ``'A'``..``'E'`` for ``'1'``..``'5'``.  Any other value (including the
        default ``' '``) yields the placeholder ``"暂无答案"``; the function
        never raises and never returns ``None``.
    """
    try:
        return _ANSWER_LETTERS.get(str_a, "暂无答案")
    except TypeError:
        # Unhashable input (e.g. a list) can never be a valid option number.
        return "暂无答案"
# --- shared scraping helpers ------------------------------------------------

_CSV_PATH = './test/数据采集.csv'
_PAGE_URL = "http://www.instu.org/da101/zhanshi.php?page="
_REQUEST_HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) "
                  "Chrome/100.0.4896.75 Safari/537.36"
}
# Without a timeout a stalled connection would hang the whole scrape forever.
_REQUEST_TIMEOUT = 30

_NO_ANSWER = "暂无答案"

# Single-choice layouts: for options A-D, (text-node index, '#'-field index).
# Field 0 is used for the option that carries the marked answer, field 2 for
# the others.  NOTE(review): the node indices were tuned by hand against the
# live pages (options appear to shift by one node when A is the marked
# answer) -- confirm against the site before changing them.
_ANSWER_A = ((2, 0), (3, 2), (4, 2), (5, 2))
_ANSWER_B = ((1, 2), (2, 0), (3, 2), (4, 2))
_ANSWER_C = ((1, 2), (2, 2), (3, 0), (4, 2))
_ANSWER_D = ((1, 2), (2, 2), (3, 2), (4, 0))
_UNANSWERED = ((1, 2), (2, 2), (3, 2), (4, 2))
_ANSWER_D_SHIFTED = ((2, 1), (3, 2), (4, 2), (5, 0))

# Supported (number of text nodes -> marked answer -> layout) combinations.
_SINGLE_LAYOUTS = {
    7: {'1': _ANSWER_A, '2': _ANSWER_B, '3': _ANSWER_C, '4': _ANSWER_D,
        _NO_ANSWER: _UNANSWERED},
    8: {'1': _ANSWER_A, '2': _ANSWER_B, '3': _ANSWER_C, '4': _ANSWER_D_SHIFTED},
    9: {'2': _ANSWER_B, '3': _ANSWER_C, '4': _ANSWER_D},
    10: {'1': _ANSWER_A},
}
# Diagnostic printed when a node count is known but the answer is not handled.
_ANOMALY_LABELS = {7: "异常3", 8: "异常", 9: "异常2", 10: "异常1-10"}
# Diagnostic printed when the node count itself is not handled.
_ANOMALY_UNKNOWN_LENGTH = "异常1"
# Answers for which a missing option D is tolerated (written as "").
_OPTIONAL_D_ANSWERS = ('1', '2', '3')


def _fetch_tree(page):
    """Download one listing page and return it parsed as an lxml HTML tree."""
    request = urllib.request.Request(url=_PAGE_URL + str(page), headers=_REQUEST_HEADERS)
    # The context manager closes the HTTP response even if decoding fails.
    with urllib.request.urlopen(request, timeout=_REQUEST_TIMEOUT) as response:
        html = response.read().decode("utf-8")
    return etree.HTML(html)


def _topic_texts(tree, position, sub_path):
    """Return the text nodes under ``sub_path`` of the ``position``-th topic div."""
    return tree.xpath("//div[@class='div_topic'][" + str(position) + "]" + sub_path)


def _stem(heading_nodes):
    """Return the question text, dropping the leading 'N、' numbering."""
    # Split only on the first '、' so that a '、' inside the question survives.
    return heading_nodes[0].split('、', 1)[1]


def _field(text_nodes, node_index, field_index):
    """Return one '#'-separated field of one text node (raises IndexError if absent)."""
    return text_nodes[node_index].split('#')[field_index]


def _field_or(text_nodes, node_index, field_index, fallback):
    """Like ``_field`` but return ``fallback`` when the node or field is missing."""
    try:
        return _field(text_nodes, node_index, field_index)
    except IndexError:
        return fallback


def _single_choice_options(text_nodes, answer, position):
    """Return the texts of options A-D for a single-choice question.

    When the (node count, answer) combination is not a supported layout, the
    diagnostic label and the question position are printed and four empty
    strings are returned, so that the row never inherits the options of the
    previously processed question.
    """
    node_count = len(text_nodes)
    layout = _SINGLE_LAYOUTS.get(node_count, {}).get(answer)
    if layout is None:
        print(_ANOMALY_LABELS.get(node_count, _ANOMALY_UNKNOWN_LENGTH))
        print(position)
        return ["", "", "", ""]

    options = [_field(text_nodes, node, field) for node, field in layout[:3]]
    last_node, last_field = layout[3]
    if answer in _OPTIONAL_D_ANSWERS:
        # Option D may be absent for these layouts; fall back to an empty
        # text.  For an A-marked question D lives in node 5 -- the previous
        # code re-read node 4 here and therefore duplicated option C.
        options.append(_field_or(text_nodes, last_node, last_field, ""))
    else:
        options.append(_field(text_nodes, last_node, last_field))
    return options


def _single_choice_row(tree, position):
    """Build the CSV row for the single-choice question at ``position``."""
    stem = _stem(_topic_texts(tree, position, "/h4/text()"))
    text_nodes = _topic_texts(tree, position, "/p/text()")
    marked = _topic_texts(tree, position, "/p/font/text()")
    try:
        answer = marked[0].split('#')[1]
    except IndexError:
        # No <font> marker (or no '#' in it): the question has no answer yet.
        answer = _NO_ANSWER
    opt_a, opt_b, opt_c, opt_d = _single_choice_options(text_nodes, answer, position)
    return [stem, "A" + str(opt_a), "B" + str(opt_b), "C" + str(opt_c), "D" + str(opt_d),
            " ", n_abcd(answer), text_nodes[-1]]


def _multi_choice_row(tree, position):
    """Build the CSV row for the multiple-choice question at ``position``.

    NOTE(review): option E is only extracted when E is one of the marked
    answers, exactly as before; a five-option question whose E is not marked
    loses that option.
    """
    stem = _stem(_topic_texts(tree, position, "/h4/text()"))
    text_nodes = _topic_texts(tree, position, "/p/text()")
    marked = _topic_texts(tree, position, "/p/font/text()")
    selected = [marker.split('#')[1] for marker in marked]

    # Every option sits one node later when option 1 is among the answers.
    first_node = 2 if '1' in selected else 1
    has_e = '5' in selected
    option_count = 5 if has_e else 4

    options = []
    for number in range(1, option_count + 1):
        node = first_node + number - 1
        if str(number) in selected:
            options.append(_field(text_nodes, node, 0))
        elif number in (3, 4) and not has_e:
            # Four-option questions may lack C or D; write a blank instead.
            options.append(_field_or(text_nodes, node, 2, " "))
        else:
            options.append(_field(text_nodes, node, 2))

    option_e = "E" + str(options[4]) if has_e else " "
    letters = [n_abcd(number) for number in selected]
    # `letters` is deliberately passed as a list: csv.writer stringifies it
    # (e.g. "['A', 'C']"), which is the format existing output files use.
    return [stem, "A" + str(options[0]), "B" + str(options[1]), "C" + str(options[2]),
            "D" + str(options[3]), option_e, letters, text_nodes[-1]]


def _true_false_row(tree, position):
    """Build the CSV row for the true/false question at ``position``."""
    stem = _stem(_topic_texts(tree, position, "/h4/text()"))
    text_nodes = _topic_texts(tree, position, "/p/text()")
    # The first two text nodes are split on '】 ' and the part after it kept:
    # node 0 fills the answer column and node 1 the knowledge-point column.
    answer = text_nodes[0].split('】 ', 1)[1]
    knowledge_point = text_nodes[1].split('】 ', 1)[1]
    return [stem, " ", " ", " ", " ", " ", answer, knowledge_point]


# --- entry points -----------------------------------------------------------

def rec1_30() -> None:
    """Scrape pages 1-30 (single choice, 10 questions each) into the CSV file."""
    with open(_CSV_PATH, 'a', newline="", encoding='utf-8') as csv_file:
        writer = csv.writer(csv_file)
        for page in range(1, 31):
            tree = _fetch_tree(page)
            for position in range(1, 11):
                writer.writerow(_single_choice_row(tree, position))
            print("\r", end="")
            print("开始单选数据获取:进度: {}/30,".format(page), end=" ")
    print("获取完成")


def rec31() -> None:
    """Scrape page 31: questions 1-7 are single choice, 8-10 multiple choice."""
    with open(_CSV_PATH, 'a', newline="", encoding='utf-8') as csv_file:
        writer = csv.writer(csv_file)
        tree = _fetch_tree(31)
        for position in range(1, 8):
            writer.writerow(_single_choice_row(tree, position))
            print("\r", end="")
            print("开始混合数据获取:进度: {}/7,".format(position), end=" ")
        print("获取完成")
        for position in range(8, 11):
            writer.writerow(_multi_choice_row(tree, position))
            print("\r", end="")
            print("开始混合数据获取:进度: {}/10,".format(position), end=" ")
    print("获取完成")


def rec32_47() -> None:
    """Scrape pages 32-47 (multiple choice, 10 questions each) into the CSV file."""
    with open(_CSV_PATH, 'a', newline="", encoding='utf-8') as csv_file:
        writer = csv.writer(csv_file)
        for page in range(32, 48):
            tree = _fetch_tree(page)
            for position in range(1, 11):
                writer.writerow(_multi_choice_row(tree, position))
            print("\r", end="")
            print("开始多选数据获取:进度: {}/32-47,".format(page), end=" ")
    print("获取完成")


def rec48() -> None:
    """Scrape page 48: questions 1-4 are multiple choice, 5-10 true/false."""
    with open(_CSV_PATH, 'a', newline="", encoding='utf-8') as csv_file:
        writer = csv.writer(csv_file)
        tree = _fetch_tree(48)
        for position in range(1, 5):
            writer.writerow(_multi_choice_row(tree, position))
        for position in range(5, 11):
            writer.writerow(_true_false_row(tree, position))
    print("多选判断混合数据获取:进度10/10,", end=" ")
    time.sleep(0.5)
    print("获取完成")


def rec49_75() -> None:
    """Scrape pages 49-75 (true/false); page 75 has 6 questions, the rest 10."""
    with open(_CSV_PATH, 'a', newline="", encoding='utf-8') as csv_file:
        writer = csv.writer(csv_file)
        for page in range(49, 76):
            tree = _fetch_tree(page)
            question_count = 10 if page < 75 else 6
            for position in range(1, question_count + 1):
                writer.writerow(_true_false_row(tree, position))
            print("\r", end="")
            print("开始判断对错数据获取:进度: {}/49-75,".format(page), end=" ")
    print("获取完成")
# 导入完成
if __name__ == '__main__':
    # Local import: only the script entry point needs pathlib.
    from pathlib import Path

    header = ['题干', '选项A', '选项B', '选项C', '选项D', "选项E", '正确答案', '考点']
    csv_path = Path('./test/数据采集.csv')
    # Create the output directory up front; every rec*() call opens the file
    # in append mode and would otherwise fail with FileNotFoundError.
    csv_path.parent.mkdir(parents=True, exist_ok=True)
    # Write the header only into a new/empty file so that re-running the
    # script does not insert a second header row in the middle of the data.
    if not csv_path.exists() or csv_path.stat().st_size == 0:
        with open(csv_path, 'a', newline="", encoding='utf-8') as csv_file:
            csv.writer(csv_file).writerow(header)

    # Pages 1-30 are single choice, 31 mixes single and multiple choice,
    # 32-47 are multiple choice, 48 mixes multiple choice and true/false,
    # and the remaining pages are true/false.
    rec_open = time.time()
    print("---------------开始获取---------------")
    rec1_30()    # single-choice pages
    rec31()      # mixed single/multiple-choice page
    rec32_47()   # multiple-choice pages
    rec48()      # mixed multiple-choice/true-false page
    rec49_75()   # true/false pages
    rec_end = time.time()
    time.sleep(0.5)
    print("---------------获取完毕---------------\n")
    time.sleep(1)
    elapsed = rec_end - rec_open
    print('全部获取完毕,耗时{:.2f}秒'.format(elapsed))
@kiterza
Copy link
Author

kiterza commented Jun 15, 2022

如果你要进行本地运行,请务必在此py文件所在目录下建立test目录用于存放csv文件。

@kiterza
Copy link
Author

kiterza commented Jun 15, 2022

下方的库可能需要你来安装:

lxml

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment