Created
February 3, 2020 13:28
-
-
Save YusukeOba/e849ab797941b655fc1192e5ec7aed47 to your computer and use it in GitHub Desktop.
カクヨムのランキングから★の平均値を抽出するpythonスクリプト
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import requests | |
from bs4 import BeautifulSoup | |
import csv | |
import time | |
# For now, only go up to rank 300
page_fetch_limit = 3 | |
def fetch_book_identifiers(page_num: int):
    """Fetch one page of the overall Kakuyomu ranking and return its work IDs.

    Args:
        page_num: Page number sent as the ``page`` query parameter.

    Returns:
        A list of work IDs as strings, in document order. Each ID is the
        ``href`` of a matching title link with ``/works/`` removed.

    Raises:
        requests.RequestException: If the request fails, times out, or the
            server answers with an HTTP error status.
    """
    # Without a timeout, requests waits forever on a stalled connection.
    r = requests.get(
        "https://kakuyomu.jp/rankings/all/entire?page=" + str(page_num),
        timeout=30,
    )
    # Fail loudly on 4xx/5xx instead of parsing an error page into no IDs.
    r.raise_for_status()
    soup = BeautifulSoup(r.content, "html.parser")

    # Extract the IDs from the title links of the ranking cards.
    links = soup.find_all("a", "widget-workCard-titleLabel bookWalker-work-title")
    ids = []
    for raw_link in links:
        href = raw_link.get("href")
        if href is None:
            # An anchor without href carries no work ID; skip it.
            continue
        ids.append(href.replace("/works/", ""))
    return ids
def fetch_book_review_information(id: int):
    """Fetch a single work's page and extract its review information.

    Args:
        id: Work identifier. It is converted with ``str()`` to build the URL,
            so string identifiers are accepted as well.

    Returns:
        A list ``[title, points, reviewer_count]``: the title text, the total
        number of stars as an int, and the number of reviewers as an int.

    Raises:
        requests.RequestException: If the request fails, times out, or the
            server answers with an HTTP error status.
        AttributeError: If one of the expected elements is not in the page.
        ValueError: If a counter's text is not an integer after removing
            thousands separators.
    """
    # Without a timeout, requests waits forever on a stalled connection.
    r = requests.get(
        "https://kakuyomu.jp/works/" + str(id) + "#reviews",
        timeout=30,
    )
    # Fail loudly on 4xx/5xx instead of failing later on a missing element.
    r.raise_for_status()
    soup = BeautifulSoup(r.content, "html.parser")

    # Title
    title = soup.find("h1", id="workTitle").a.text

    # Number of stars; the page renders it with thousands separators.
    raw_points = soup.find("span", "js-total-review-point-element").text
    points = int(raw_points.replace(",", ""))

    # Number of people who reviewed the work
    raw_reviewer_count = soup.find("span", "js-review-count-element").text
    reviewer_count = int(raw_reviewer_count.replace(",", ""))

    return [
        title,
        points,
        reviewer_count,
    ]
# Write one CSV row (title, star total, reviewer count) per ranked work.
# No average is computed here; the CSV holds the raw totals.
# - `with` closes the file even when a fetch raises midway.
# - encoding="utf-8" keeps the titles from depending on the platform's
#   default encoding.
# - newline="" lets the csv writer control line endings itself.
with open('book_reviews.csv', 'w', encoding='utf-8', newline='') as f:
    writer = csv.writer(f, lineterminator='\n')
    # Ranking pages are numbered from 1.
    for page_num in range(1, page_fetch_limit + 1):
        # Get the work IDs listed on this ranking page.
        for work_id in fetch_book_identifiers(page_num):
            review_info = fetch_book_review_information(work_id)
            writer.writerow(review_info)
            print("proceeded: " + review_info[0])
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment