# fakeyanss/zhihu_novels_spider.py

## zhihu_novels_spider.py
import csv
import os
import re
from functools import reduce

import requests

# Zhihu has anti-crawler checks, so send HTTP headers that look like a browser.
headers = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/61.0.3163.100 Safari/537.36"
    ),
    "Connection": "keep-alive",
    "Accept": (
        "text/html,application/json,application/xhtml+xml,"
        "application/xml;q=0.9,*/*;q=0.8"
    ),
    "Accept-Language": "zh-CN,zh;q=0.8",
}

# ID of the Zhihu question whose answers are crawled.
question_id = 37985771


def open_statistic_csv(filename):
    """Create the statistics CSV file and return a writer for its data rows.

    The file at ``filename`` is created (or truncated) and the header row
    ``排名, 小说名, 提名次数`` (rank, novel title, nomination count) is
    written before the writer is handed back.

    Args:
        filename: Path of the CSV file to create.

    Returns:
        A ``csv.writer`` bound to the newly opened file.

    Note:
        Only the writer is returned, so the caller gets no handle with which
        to close the file; buffered rows reach disk when the file object is
        finalized (in CPython, once the writer is no longer referenced or
        the interpreter exits).
    """
    # newline='' lets the csv module control line endings itself (otherwise
    # an extra blank line appears after every row on Windows). The explicit
    # UTF-8 encoding keeps the Chinese header and titles writable regardless
    # of the platform's default locale encoding.
    csv_file = open(filename, 'w', newline='', encoding='utf-8')
    csv_writer = csv.writer(csv_file)
    csv_writer.writerow(['排名', '小说名', '提名次数'])
    return csv_writer


if __name__ == '__main__':
    # Script entry point: download every answer of the configured Zhihu
    # question, collect the titles quoted in 《》, count in how many answers
    # each title occurs, save the ranking to a CSV file and print the top
    # entries.
    question_path = f'./{question_id}'
    # exist_ok=True replaces the separate isdir() check and its
    # check-then-create race.
    os.makedirs(question_path, exist_ok=True)
    novels_file = f'{question_id}.csv'
    interval = 20  # number of answers requested per page
    offset = 0     # index of the first answer of the current page
    rank = 100     # number of top entries printed at the end
    # A dict is used as an insertion-ordered set: first-seen order is kept
    # and the membership test is O(1) instead of a scan over a list.
    all_novels = dict()
    all_answers = list()

    # Pass 1: record every answer body and every title wrapped in 《》.
    # Non-greedy match of the text between 《 and 》; compiled once, outside
    # the paging loop.
    rex = re.compile(r'《(.+?)》')
    while True:
        print(f'答案数 {offset} 到 {offset + interval}')
        # Zhihu paginated answers API.
        url = f'https://www.zhihu.com/api/v4/questions/{question_id}/answers?include=content,voteup_count&limit={interval}&offset={offset}&sort_by=default'
        # requests applies no timeout by default; bound the wait so a stalled
        # connection cannot hang the crawl forever.
        r = requests.get(url, headers=headers, timeout=30)
        # Surface HTTP errors here rather than as an opaque KeyError or JSON
        # decoding error on the next line.
        r.raise_for_status()
        answers = r.json()['data']
        # An empty page ends the crawl.
        if not answers:
            break
        for answer in answers:
            content = answer['content']
            all_answers.append(content)

            # De-duplicate per answer so repeated mentions inside a single
            # answer yield only one candidate title.
            for novel_name in set(rex.findall(content)):
                if novel_name:
                    all_novels.setdefault(novel_name, None)

        offset += interval

    # Pass 2: walk all answers again and count, for each title, the number of
    # answers whose text contains it (plain substring match, with or without
    # the surrounding 《》).
    def make_statistics(novel_name):
        """Return ``(novel_name, number of answers containing novel_name)``."""
        novel_sum = sum(1 for answer in all_answers if novel_name in answer)
        return novel_name, novel_sum

    novels_rank_list = [
        (f'《{name}》', count)
        for name, count in map(make_statistics, all_novels)
    ]

    # Sort by nomination count, highest first (stable, so ties keep
    # first-seen order).
    novels_rank_list = sorted(novels_rank_list, key=lambda x: x[1], reverse=True)
    print(f'提名小说总数：{len(novels_rank_list)}')

    # Save the complete ranking to a local CSV file.
    writer = open_statistic_csv(os.path.join(question_path, novels_file))
    writer.writerows(
        [f'{position}', name, num]
        for position, (name, num) in enumerate(novels_rank_list, start=1)
    )

    # Print the top `rank` entries.
    for position, (name, num) in enumerate(novels_rank_list[:rank], start=1):
        print(f'{position}.{name}提名{num}次')
	import csv
	import os
	import re
	from functools import reduce

	import requests

	# Zhihu has anti-crawler checks, so send HTTP headers that look like a browser.
	headers = {
	    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36",
	    "Connection": "keep-alive",
	    # The catch-all media range is "*/*"; a bare "/" is not a valid media
	    # range, so the wildcard is restored here.
	    "Accept": "text/html,application/json,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
	    "Accept-Language": "zh-CN,zh;q=0.8"}

	# ID of the Zhihu question whose answers are crawled.
	question_id = 37985771


	def open_statistic_csv(filename):
	    """Create the statistics CSV file and return a writer for its data rows.

	    The file at ``filename`` is created (or truncated) and the header row
	    ``排名, 小说名, 提名次数`` (rank, novel title, nomination count) is
	    written before the writer is handed back.

	    Args:
	        filename: Path of the CSV file to create.

	    Returns:
	        A ``csv.writer`` bound to the newly opened file.

	    Note:
	        Only the writer is returned, so the caller gets no handle with
	        which to close the file; buffered rows reach disk when the file
	        object is finalized (in CPython, once the writer is no longer
	        referenced or the interpreter exits).
	    """
	    # newline='' lets the csv module control line endings itself (otherwise
	    # an extra blank line appears after every row on Windows). The explicit
	    # UTF-8 encoding keeps the Chinese header and titles writable regardless
	    # of the platform's default locale encoding.
	    csv_file = open(filename, 'w', newline='', encoding='utf-8')
	    csv_writer = csv.writer(csv_file)
	    csv_writer.writerow(['排名', '小说名', '提名次数'])
	    return csv_writer


	if __name__ == '__main__':
	    # Script entry point: download every answer of the configured Zhihu
	    # question, collect the titles quoted in 《》, count in how many answers
	    # each title occurs, save the ranking to a CSV file and print the top
	    # entries.
	    question_path = f'./{question_id}'
	    # exist_ok=True replaces the separate isdir() check and its
	    # check-then-create race.
	    os.makedirs(question_path, exist_ok=True)
	    novels_file = f'{question_id}.csv'
	    interval = 20  # number of answers requested per page
	    offset = 0     # index of the first answer of the current page
	    rank = 100     # number of top entries printed at the end
	    # A dict is used as an insertion-ordered set: first-seen order is kept
	    # and the membership test is O(1) instead of a scan over a list.
	    all_novels = dict()
	    all_answers = list()

	    # Pass 1: record every answer body and every title wrapped in 《》.
	    # Non-greedy match of the text between 《 and 》; compiled once, outside
	    # the paging loop.
	    rex = re.compile(r'《(.+?)》')
	    while True:
	        print(f'答案数 {offset} 到 {offset + interval}')
	        # Zhihu paginated answers API.
	        url = f'https://www.zhihu.com/api/v4/questions/{question_id}/answers?include=content,voteup_count&limit={interval}&offset={offset}&sort_by=default'
	        # requests applies no timeout by default; bound the wait so a
	        # stalled connection cannot hang the crawl forever.
	        r = requests.get(url, headers=headers, timeout=30)
	        # Surface HTTP errors here rather than as an opaque KeyError or
	        # JSON decoding error on the next line.
	        r.raise_for_status()
	        answers = r.json()['data']
	        # An empty page ends the crawl.
	        if not answers:
	            break
	        for answer in answers:
	            content = answer['content']
	            all_answers.append(content)

	            # De-duplicate per answer so repeated mentions inside a single
	            # answer yield only one candidate title.
	            for novel_name in set(rex.findall(content)):
	                if novel_name:
	                    all_novels.setdefault(novel_name, None)

	        offset += interval

	    # Pass 2: walk all answers again and count, for each title, the number
	    # of answers whose text contains it (plain substring match, with or
	    # without the surrounding 《》).
	    def make_statistics(novel_name):
	        """Return ``(novel_name, number of answers containing novel_name)``."""
	        novel_sum = sum(1 for answer in all_answers if novel_name in answer)
	        return novel_name, novel_sum

	    novels_rank_list = [
	        (f'《{name}》', count)
	        for name, count in map(make_statistics, all_novels)
	    ]

	    # Sort by nomination count, highest first (stable, so ties keep
	    # first-seen order).
	    novels_rank_list = sorted(novels_rank_list, key=lambda x: x[1], reverse=True)
	    print(f'提名小说总数：{len(novels_rank_list)}')

	    # Save the complete ranking to a local CSV file.
	    writer = open_statistic_csv(os.path.join(question_path, novels_file))
	    writer.writerows(
	        [f'{position}', name, num]
	        for position, (name, num) in enumerate(novels_rank_list, start=1)
	    )

	    # Print the top `rank` entries.
	    for position, (name, num) in enumerate(novels_rank_list[:rank], start=1):
	        print(f'{position}.{name}提名{num}次')