import requests
from bs4 import BeautifulSoup
from tqdm import tqdm
import json
import re
import time
import functools
from ja_sentence_segmenter.common.pipeline import make_pipeline
from ja_sentence_segmenter.concatenate.simple_concatenator import concatenate_matching
from ja_sentence_segmenter.normalize.neologd_normalizer import normalize
from ja_sentence_segmenter.split.simple_splitter import split_newline, split_punctuation
# Sentence splitter restricted to the punctuation marks "。", "!" and "?".
split_punc2 = functools.partial(split_punctuation, punctuations=r"。!?")
# Concatenation rule keyed on a line that ends in "の"; the matched text is kept
# (remove_former_matched=False). Presumably this re-joins a sentence that was
# broken by a newline right after "の" -- confirm against the library docs.
concat_tail_no = functools.partial(concatenate_matching, former_matching_rule=r"^(?P<result>.+)(の)$", remove_former_matched=False)
# Full pipeline, applied in order: normalize -> split on newlines ->
# re-join lines ending in "の" -> split on sentence punctuation.
segmenter = make_pipeline(normalize, split_newline, concat_tail_no, split_punc2)
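# Access token (referenced by main() as ANNICT_SECRET_TOKEN); the URL it can be obtained from is missing from this comment.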
def get_anime_arasuji(id):
    """Fetch the synopsis of one anime work page and split it into sentences.

    Args:
        id: Identifier of the work; it is interpolated into the page URL.

    Returns:
        A list of sentences produced by ``segmenter`` from the content block
        that follows the "あらすじ" (synopsis) heading, or ``None`` when the
        page has no such heading or no content block after it.

    Raises:
        requests.RequestException: If the HTTP request fails or times out.
    """
    # Sleep for 1 second before every request so the crawl does not burden
    # the site.
    time.sleep(1)
    # NOTE(review): the scheme/host was missing from the original URL; it is
    # assumed to be the Annict site, whose work pages live under /works/<id>
    # -- confirm.
    url = f"https://annict.com/works/{id}"
    # A timeout keeps a stalled connection from hanging the whole crawl.
    r = requests.get(url, timeout=30)
    soup = BeautifulSoup(r.content, 'html.parser')
    divs = soup.select('div.container')
    # The synopsis body is the container right after the heading container,
    # so walk the containers pairwise; a heading in last position simply has
    # no partner and is skipped instead of raising IndexError.
    for heading, following in zip(divs, divs[1:]):
        if heading.text.strip() != "あらすじ":
            continue
        content = following.select_one('div.c-body__content')
        if content is None:
            return None
        return list(segmenter(content.text))
    return None
def get_series_arasuji_pairs(id):
    """Collect synopsis pairs for every work in the same series as ``id``.

    The related-works page of the given work is scraped for links of the form
    ``/works/<number>``; the synopsis of each linked work is fetched with
    ``get_anime_arasuji`` and all unordered pairs of the synopses found are
    returned.

    Args:
        id: Identifier of the work whose related works are looked up.

    Returns:
        A list of ``(synopsis_a, synopsis_b)`` tuples, or ``None`` when no
        related works are registered or fewer than two synopses were found.

    Raises:
        requests.RequestException: If an HTTP request fails or times out.
    """
    from itertools import combinations

    # Sleep for 1 second before every request so the crawl does not burden
    # the site.
    time.sleep(1)
    # NOTE(review): the scheme/host was missing from the original URL; it is
    # assumed to be the Annict site -- confirm.
    url = f"https://annict.com/works/{id}/related_works"
    r = requests.get(url, timeout=30)
    soup = BeautifulSoup(r.content, 'html.parser')
    div = soup.select_one('div.u-container-flat')
    # "登録されていません" is the page's "nothing registered" message.
    if div is None or div.text.strip() == "登録されていません":
        return None

    # Keep only hrefs that are exactly a work link; other links on the page
    # would otherwise be fetched (and slept on) for nothing.
    work_ids = set()
    for a in div.select('a'):
        matched = re.fullmatch(r"/works/(\d+)", a.get("href", ""))
        if matched:
            work_ids.add(matched.group(1))

    # Fetch each synopsis exactly once; every call costs a request and a sleep.
    # Ids are strings and sorted as such, which keeps the output order stable.
    arasujis = []
    for work_id in sorted(work_ids):
        arasuji = get_anime_arasuji(work_id)
        if arasuji is not None:
            arasujis.append(arasuji)

    if len(arasujis) <= 1:
        return None
    # All (i, j) pairs with i < j, in the same order as a nested index loop.
    return list(combinations(arasujis, 2))
def main():
    """Page through the works listing and append synopsis pairs to a JSONL file.

    For every page of the API response, each work that is a TV anime with a
    season is passed to ``get_series_arasuji_pairs``; every resulting pair is
    appended as one JSON line (keys ``arasuji1`` / ``arasuji2``) to
    ``annict_pair_data.jsonl``. The loop stops at the first page whose
    ``works`` list is empty.

    Requires a module-level ``ANNICT_SECRET_TOKEN`` to be defined.

    Raises:
        requests.RequestException: If an API request fails, times out, or
            returns an error status.
    """
    # NOTE(review): the scheme/host/path prefix was missing from the original
    # URL; it is assumed to be the Annict REST API works endpoint -- confirm.
    api_prefix = "https://api.annict.com/v1/works?access_token="
    page_num = 0
    while True:
        print(f"page_num: {page_num}")
        url = f"{api_prefix}{ANNICT_SECRET_TOKEN}&per_page=50&sort_id=desc&page={page_num}"
        r = requests.get(url, timeout=30)
        # Fail with a clear HTTP error rather than a confusing JSON/KeyError.
        r.raise_for_status()
        animes = r.json()["works"]
        if not animes:
            # An empty page means the listing is exhausted.
            break
        for anime in tqdm(animes):
            # Skip everything except seasonal TV anime.
            if anime["media"] != "tv" or "season_name_text" not in anime:
                continue
            current_pairs = get_series_arasuji_pairs(anime["id"])
            if current_pairs is None:
                continue
            # Explicit UTF-8: the JSON is written with ensure_ascii=False, so
            # the platform default encoding must not be relied upon.
            with open("annict_pair_data.jsonl", "a", encoding="utf-8") as f:
                for pair in current_pairs:
                    res = {
                        "arasuji1": pair[0],
                        "arasuji2": pair[1],
                    }
                    f.write(f"{json.dumps(res, ensure_ascii=False)}\n")
            # Works of one series each yield the same pairs, so duplicate
            # lines must be removed afterwards (e.g. with `uniq`).
        page_num += 1
if __name__ == '__main__':
    # Start the crawl only when run as a script, not when imported.
    main()