Created
September 29, 2019 15:21
-
-
Save fakeyanss/7b1e137bce376dd101b3d866b84f8e34 to your computer and use it in GitHub Desktop.
统计知乎问题:有哪些可称为「神作」的网络小说?所有答案推荐的书名出现次数,https://www.zhihu.com/question/37985771
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import csv
import os
import re
from functools import reduce
from itertools import count

import requests

# Zhihu has anti-crawler checks, so send HTTP headers that mimic a browser.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36",
    "Connection": "keep-alive",
    "Accept": "text/html,application/json,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
    "Accept-Language": "zh-CN,zh;q=0.8",
}

# Id of the Zhihu question whose answers are analysed.
question_id = 37985771


def open_statistic_csv(filename, encoding='utf-8'):
    """Create the statistics CSV file, write its header row, and return a writer.

    Args:
        filename: Path of the CSV file to create; an existing file is truncated.
        encoding: Text encoding used for the file. Defaults to UTF-8 so that
            titles containing characters outside the platform's locale code
            page can still be written.

    Returns:
        A ``csv.writer`` bound to the newly opened file, positioned just after
        the header row.

    Note:
        The underlying file object is not exposed to the caller, so it cannot
        be closed explicitly; it stays open for as long as the returned writer
        is alive. Rows written after the header may remain buffered until then.
    """
    # newline='' lets the csv module control line endings itself; without it,
    # platforms that translate '\n' on write emit an extra '\r' per row.
    csv_file = open(filename, 'w', newline='', encoding=encoding)
    csv_writer = csv.writer(csv_file)
    csv_writer.writerow(['排名', '小说名', '提名次数'])
    # Make sure at least the header reaches the disk right away.
    csv_file.flush()
    return csv_writer


if __name__ == '__main__':
    # Script entry point: download every answer of the configured question,
    # collect the titles quoted in 《》, count how many answers mention each
    # title, save the ranking as CSV and print the top entries.

    # Output directory named after the question id; created if missing.
    question_path = f'./{question_id}'
    os.makedirs(question_path, exist_ok=True)
    novels_file = f'{question_id}.csv'

    interval = 20          # number of answers requested per page
    rank = 100             # number of top entries printed at the end
    request_timeout = 10   # seconds; without a timeout a stalled request blocks forever

    all_answers = []
    # A dict is used as an insertion-ordered set: O(1) duplicate checks and a
    # first-seen order that is identical on every run, so ties in the final
    # ranking are ordered deterministically.
    all_novels = {}

    # First pass: record every answer body and every title wrapped in 《》.
    title_pattern = re.compile(r'《(.+?)》')
    for offset in count(0, interval):
        print(f'答案数 {offset} 到 {offset + interval}')
        # Zhihu paginated answers API.
        url = f'https://www.zhihu.com/api/v4/questions/{question_id}/answers?include=content,voteup_count&limit={interval}&offset={offset}&sort_by=default'
        r = requests.get(url, headers=headers, timeout=request_timeout)
        # Fail loudly on an HTTP error instead of trying to parse an error body.
        r.raise_for_status()
        answers = r.json()['data']
        if not answers:
            # An empty page means all answers have been fetched.
            break
        for answer in answers:
            content = answer['content']
            all_answers.append(content)
            # Existing keys keep their position, so repeated mentions (within
            # this answer or across answers) are recorded only once.
            all_novels.update(dict.fromkeys(title_pattern.findall(content)))

    # Second pass: for each title, count the answers whose text contains it.
    # This is a plain substring test, so mentions without 《》 count as well.
    def make_statistics(novel_name):
        """Return ``(novel_name, number_of_answers_containing_it)``."""
        novel_sum = sum(1 for answer in all_answers if novel_name in answer)
        return novel_name, novel_sum

    # Sort by nomination count, highest first; the sort is stable, so titles
    # with equal counts stay in first-seen order.
    novels_rank_list = sorted(
        ((f'《{name}》', total) for name, total in map(make_statistics, all_novels)),
        key=lambda item: item[1],
        reverse=True,
    )
    print(f'提名小说总数:{len(novels_rank_list)}')

    # Save the full ranking to the local CSV file.
    writer = open_statistic_csv(os.path.join(question_path, novels_file))
    writer.writerows(
        [f'{position}', name, total]
        for position, (name, total) in enumerate(novels_rank_list, start=1)
    )

    # Print the top `rank` entries.
    for position, (name, num) in enumerate(novels_rank_list[:rank], start=1):
        print(f'{position}.{name}提名{num}次')
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment