Skip to content

Instantly share code, notes, and snippets.

@fakeyanss
Created September 29, 2019 15:21
Show Gist options
  • Save fakeyanss/7b1e137bce376dd101b3d866b84f8e34 to your computer and use it in GitHub Desktop.
Save fakeyanss/7b1e137bce376dd101b3d866b84f8e34 to your computer and use it in GitHub Desktop.
统计知乎问题:有哪些可称为「神作」的网络小说?所有答案推荐的书名出现次数,https://www.zhihu.com/question/37985771
import csv
import os
import re
from functools import reduce
import requests
# Zhihu has anti-crawler measures, so send browser-like HTTP headers.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36",
    "Connection": "keep-alive",
    "Accept": "text/html,application/json,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
    "Accept-Language": "zh-CN,zh;q=0.8",
}

# Id of the Zhihu question whose answers are analysed.
question_id = 37985771
def open_statistic_csv(filename):
    """Create the statistics CSV file and return a writer positioned after the header.

    The file at *filename* is created (or truncated) and the header row
    ``排名, 小说名, 提名次数`` is written and flushed before returning.

    Args:
        filename: Path of the CSV file to create.

    Returns:
        A ``csv.writer`` bound to the newly opened file. The file object is
        not closed here and is not exposed to the caller; it stays open for
        as long as the returned writer keeps it alive.

    Raises:
        OSError: If the file cannot be opened for writing.
    """
    # newline='' lets the csv module control line endings itself (otherwise
    # rows end in '\r\r\n' on Windows). The explicit encoding keeps the
    # Chinese header and titles writable regardless of the locale codec.
    csv_file = open(filename, 'w', newline='', encoding='utf-8')
    csv_writer = csv.writer(csv_file)
    csv_writer.writerow(['排名', '小说名', '提名次数'])
    # Push the header to disk now; callers only hold the writer and have no
    # handle on which to call flush() themselves.
    csv_file.flush()
    return csv_writer
def extract_novel_names(content, rex=r'《(.+?)》'):
    """Return the distinct book titles quoted with 《》 in one answer.

    Args:
        content: Text of a single answer.
        rex: Regular expression with a single capturing group that captures
            a title. Defaults to a non-greedy match between 《 and 》.

    Returns:
        A list of the captured titles with duplicates removed, in order of
        first appearance in *content*.
    """
    # dict.fromkeys de-duplicates while keeping first-seen order, so the
    # result (and the order of ties in the final ranking) does not depend on
    # set iteration order, which varies between runs for strings.
    return list(dict.fromkeys(name for name in re.findall(rex, content) if name))


def rank_novels(all_novels, all_answers):
    """Count, for every title, how many answers mention it, and rank the titles.

    A title counts as mentioned by an answer when it occurs anywhere in the
    answer text as a substring (with or without the surrounding 《》), and
    each answer contributes at most 1 per title.

    Args:
        all_novels: Iterable of bare titles (without 《》).
        all_answers: List of answer texts.

    Returns:
        A list of ``('《title》', count)`` tuples sorted by count, highest
        first. Titles with equal counts keep the order of *all_novels*.
    """
    counted = [
        (f'《{name}》', sum(1 for answer in all_answers if name in answer))
        for name in all_novels
    ]
    # sorted() is stable, so ties keep their first-seen order.
    return sorted(counted, key=lambda item: item[1], reverse=True)


if __name__ == '__main__':
    question_path = f'./{question_id}'
    # exist_ok avoids the separate isdir() check and its check-then-create race.
    os.makedirs(question_path, exist_ok=True)
    novels_file = f'{question_id}.csv'
    interval = 20
    offset = 0
    rank = 100
    # Seconds to wait for the server before giving up; without a timeout a
    # stalled connection would block the script indefinitely.
    request_timeout = 30
    # Dict used as an insertion-ordered set: O(1) membership test while
    # remembering the order in which titles were first seen.
    all_novels = {}
    all_answers = []

    # First pass: page through every answer of the question, remembering the
    # answer text and every title quoted with 《》.
    while True:
        print(f'答案数 {offset} 到 {offset + interval}')
        # Zhihu paginated answers API.
        url = f'https://www.zhihu.com/api/v4/questions/{question_id}/answers?include=content,voteup_count&limit={interval}&offset={offset}&sort_by=default'
        r = requests.get(url, headers=headers, timeout=request_timeout)
        # Fail with the HTTP error itself instead of a KeyError on 'data'
        # when the request is rejected.
        r.raise_for_status()
        answers = r.json()['data']
        if not answers:
            # An empty page is treated as the end of the answer list.
            break
        for answer in answers:
            content = answer['content']
            all_answers.append(content)
            # Titles are de-duplicated per answer so repeated mentions inside
            # one answer are recorded once.
            for novel_name in extract_novel_names(content):
                all_novels.setdefault(novel_name, None)
        offset += interval

    # Second pass: count the answers mentioning each title and sort the
    # result from most to least nominated.
    novels_rank_list = rank_novels(all_novels, all_answers)
    print(f'提名小说总数:{len(novels_rank_list)}')

    # Save the full ranking to a local CSV file.
    writer = open_statistic_csv(os.path.join(question_path, novels_file))
    for i, (name, num) in enumerate(novels_rank_list, start=1):
        writer.writerow([f'{i}', name, num])

    # Print the top `rank` entries.
    for i, (name, num) in enumerate(novels_rank_list[:rank], start=1):
        print(f'{i}.{name}提名{num}次')
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment