# Originally published as a GitHub gist (created Dec 8, 2021).
import requests
from bs4 import BeautifulSoup
from tqdm import tqdm
import json
import re
import time
import functools
from ja_sentence_segmenter.common.pipeline import make_pipeline
from ja_sentence_segmenter.concatenate.simple_concatenator import concatenate_matching
from ja_sentence_segmenter.normalize.neologd_normalizer import normalize
from ja_sentence_segmenter.split.simple_splitter import split_newline, split_punctuation
# Sentence splitter that breaks on the Japanese full stop as well as "!" and "?".
split_punc2 = functools.partial(
    split_punctuation,
    punctuations=r"。!?",
)
# Concatenation rule keyed on lines that end with "の"; the matched tail is
# kept (remove_former_matched=False).
# NOTE(review): presumably this joins such a line with the one that follows so
# a sentence is not cut at a mid-sentence line break — confirm against the
# ja_sentence_segmenter documentation.
concat_tail_no = functools.partial(
    concatenate_matching,
    former_matching_rule=r"^(?P<result>.+)(の)$",
    remove_former_matched=False,
)
# Full pipeline, applied in order: normalize -> split on newlines ->
# concatenate "の"-terminated lines -> split on punctuation.
segmenter = make_pipeline(
    normalize,
    split_newline,
    concat_tail_no,
    split_punc2,
)
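# Access token for the Annict API.
# NOTE(review): the URL where the token can be obtained and the
# ANNICT_SECRET_TOKEN assignment used by main() appear to be missing here.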
# Fetch the synopsis ("あらすじ") of the anime with the given id.
def get_anime_arasuji(id):
    """Fetch one anime's work page and return its synopsis as sentences.

    Args:
        id: Work identifier interpolated into the work-page URL.

    Returns:
        The synopsis text split into a list of sentences by the module-level
        ``segmenter`` pipeline, or ``None`` when the page has no "あらすじ"
        (synopsis) heading followed by a body element.
    """
    # Sleep for 1 second so that the crawl does not burden the site.
    time.sleep(1)
    # NOTE(review): the host part of this URL was missing from the source; it
    # is reconstructed from the "/works/<id>" links parsed in
    # get_series_arasuji_pairs — confirm.
    url = f"https://annict.com/works/{id}"
    r = requests.get(url)
    soup = BeautifulSoup(r.content, 'html.parser')
    divs = soup.select('div.container')
    # Walk (heading, following container) pairs so that a heading that happens
    # to be the last container can never index past the end of the list.
    for heading, following in zip(divs, divs[1:]):
        if heading.text.strip() != "あらすじ":
            continue
        content = following.select_one('div.c-body__content')
        if content is None:
            # Heading present but no synopsis body: treat as "no synopsis".
            return None
        return list(segmenter(content.text))
    return None
# Build pairs of synopses from the series that contains the anime with the given id.
def get_series_arasuji_pairs(id):
    """Return every pair of synopses among the works related to an anime.

    Args:
        id: Work identifier interpolated into the related-works URL.

    Returns:
        A list of ``(synopsis_a, synopsis_b)`` tuples, one per unordered pair
        of related works that have a synopsis (each synopsis is the sentence
        list returned by ``get_anime_arasuji``). ``None`` when the page lists
        no related works or fewer than two of them have a synopsis.
    """
    # Sleep for 1 second so that the crawl does not burden the site.
    time.sleep(1)
    # NOTE(review): the host part of this URL was missing from the source; it
    # is reconstructed from the "/works/<id>" links parsed below — confirm.
    url = f"https://annict.com/works/{id}/related_works"
    r = requests.get(url)
    soup = BeautifulSoup(r.content, 'html.parser')
    div = soup.select_one('div.u-container-flat')
    # A missing container is handled like the explicit "nothing registered"
    # message instead of failing on ``None.text``.
    if div is None or div.text.strip() == "登録されていません":
        return None

    work_ids = set()
    for a in div.select('a'):
        href = a.get("href")
        if href is None:
            continue
        # Keep only links that actually point at a work page; substituting on
        # an unrelated href would otherwise leave it in the list as a bogus id.
        matched = re.search(r"/works/(\d+)", href)
        if matched is not None:
            work_ids.add(int(matched.group(1)))

    # Fetch each synopsis exactly once (every call costs an HTTP request and a
    # one-second pause), visiting works in ascending id order.
    arasujis = []
    for work_id in sorted(work_ids):
        arasuji = get_anime_arasuji(work_id)
        if arasuji is not None:
            arasujis.append(arasuji)

    if len(arasujis) <= 1:
        return None
    # All unordered pairs, earlier work first.
    return [
        (first, second)
        for i, first in enumerate(arasujis)
        for second in arasujis[i + 1:]
    ]
def main():
    """Crawl the Annict works list and append synopsis pairs to a JSONL file.

    Pages through the works API until a page comes back with no works. For
    every TV work that carries a ``season_name_text`` field, the synopsis
    pairs of its series are appended to ``annict_pair_data.jsonl``, one JSON
    object per line with keys ``arasuji1`` and ``arasuji2``.

    Relies on the module-level ``ANNICT_SECRET_TOKEN`` for API access.
    """
    # NOTE(review): paging starts at 0 as in the original — confirm whether the
    # API's page numbers are 1-based.
    page_num = 0
    while True:
        print(f"page_num: {page_num}")
        # NOTE(review): the endpoint part of this URL was missing from the
        # source; it is reconstructed as the Annict v1 works endpoint — confirm.
        url = f"https://api.annict.com/v1/works?access_token={ANNICT_SECRET_TOKEN}&per_page=50&sort_id=desc&page={page_num}"
        r = requests.get(url)
        annict_dict = r.json()
        animes = annict_dict["works"]
        if not animes:
            # An empty page means every work has been visited.
            break
        for anime in tqdm(animes):
            # Skip everything that is not a seasonal TV anime.
            if anime["media"] != "tv" or "season_name_text" not in anime:
                continue
            current_pairs = get_series_arasuji_pairs(anime["id"])
            if current_pairs is None:
                continue
            # Open the output once per work rather than once per pair, and pin
            # UTF-8 because the JSON is written with ensure_ascii=False.
            with open("annict_pair_data.jsonl", "a", encoding="utf-8") as f:
                for pair in current_pairs:
                    res = {
                        "arasuji1": pair[0],
                        "arasuji2": pair[1],
                    }
                    f.write(f"{json.dumps(res, ensure_ascii=False)}\n")
        # Works of the same series each emit the same pairs, so duplicate
        # lines must be removed afterwards (e.g. with `uniq`).
        page_num += 1
# Run the crawl only when this file is executed as a script.
if __name__ == '__main__':
    main()