Skip to content

Instantly share code, notes, and snippets.

@8X7K
Created Dec 8, 2021
Embed
What would you like to do?
シリーズ関係にある作品のペアを抽出するコード
'''
シリーズもののあらすじのペアを抽出する
'''
import requests
from bs4 import BeautifulSoup
from tqdm import tqdm
import json
import re
import time
import functools
from ja_sentence_segmenter.common.pipeline import make_pipeline
from ja_sentence_segmenter.concatenate.simple_concatenator import concatenate_matching
from ja_sentence_segmenter.normalize.neologd_normalizer import normalize
from ja_sentence_segmenter.split.simple_splitter import split_newline, split_punctuation
# Sentence-segmentation pipeline applied to each raw synopsis text.
# Punctuation splitter configured with the characters "。", "!" and "?".
split_punc2 = functools.partial(split_punctuation, punctuations=r"。!?")
# Concatenation step whose "former" rule matches a line that ends in "の".
# NOTE(review): presumably this re-joins a line ending in "の" with the line
# that follows it, and remove_former_matched=False keeps the matched text --
# confirm against the ja_sentence_segmenter documentation.
concat_tail_no = functools.partial(concatenate_matching, former_matching_rule=r"^(?P<result>.+)(の)$", remove_former_matched=False)
# Stages are listed as: normalize, split on newlines, the "の" concatenation
# rule, then the punctuation split (presumably applied in this order).
segmenter = make_pipeline(normalize, split_newline, concat_tail_no, split_punc2)
# Access token that can be obtained from https://annict.com/settings/apps
# NOTE(review): this is a placeholder value; fill it in locally and avoid
# committing a real token to version control.
ANNICT_SECRET_TOKEN = "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX"
def get_anime_arasuji(id):
    """Fetch the synopsis of one Annict work and segment it.

    The page ``https://annict.com/works/{id}`` is downloaded and scanned for
    a ``div.container`` whose stripped text is exactly "あらすじ" (the
    synopsis heading). The synopsis text is then read from the
    ``div.c-body__content`` element inside the ``div.container`` that
    immediately follows that heading in the selection.

    Args:
        id: Annict work id interpolated into the page URL. The name shadows
            the builtin ``id`` but is kept so existing callers still work.

    Returns:
        A list of the items yielded by the module-level ``segmenter``
        pipeline for the synopsis text, or ``None`` when the page has no
        synopsis heading, the heading is the last container, or the
        following container has no ``div.c-body__content`` element.

    Raises:
        requests.exceptions.RequestException: If the HTTP request fails or
            exceeds the timeout.
    """
    # Sleep for one second before each request so the site is not hammered.
    time.sleep(1)
    url = f"https://annict.com/works/{id}"
    # Without a timeout a single stalled connection would hang the crawl.
    r = requests.get(url, timeout=30)
    soup = BeautifulSoup(r.content, 'html.parser')
    divs = soup.select('div.container')
    # Pair every container with its successor; a heading that is the last
    # container therefore has no body and falls through to ``return None``
    # instead of raising IndexError.
    for heading, body in zip(divs, divs[1:]):
        if heading.text.strip() != "あらすじ":
            continue
        content = body.select_one('div.c-body__content')
        if content is None:
            # The page layout differs from what is expected: treat it as
            # "no synopsis" rather than crashing with AttributeError.
            return None
        return list(segmenter(content.text))
    return None
def get_series_arasuji_pairs(id):
    """Return every pair of synopses among the works related to one work.

    The page ``https://annict.com/works/{id}/related_works`` is downloaded,
    the work ids linked inside its ``div.u-container-flat`` element are
    collected, the synopsis of each linked work is fetched with
    ``get_anime_arasuji``, and all unordered pairs of the synopses that
    could be fetched are built.

    Args:
        id: Annict work id interpolated into the page URL. The name shadows
            the builtin ``id`` but is kept so existing callers still work.

    Returns:
        A list of ``(synopsis_a, synopsis_b)`` tuples, ordered by ascending
        work id of the first and then the second element, or ``None`` when
        the container is missing, the page states "登録されていません"
        (nothing registered), or fewer than two synopses are available.

    Raises:
        requests.exceptions.RequestException: If an HTTP request fails or
            exceeds the timeout.
    """
    # Sleep for one second before each request so the site is not hammered.
    time.sleep(1)
    url = f"https://annict.com/works/{id}/related_works"
    # Without a timeout a single stalled connection would hang the crawl.
    r = requests.get(url, timeout=30)
    soup = BeautifulSoup(r.content, 'html.parser')
    div = soup.select_one('div.u-container-flat')
    # A page without the expected container is treated like an empty one
    # instead of failing with AttributeError on ``None``.
    if div is None or div.text.strip() == "登録されていません":
        return None

    work_ids = set()
    for a in div.select('a'):
        # Only links of the exact form "/works/<digits>" identify a work;
        # anchors without an href or pointing elsewhere are skipped rather
        # than fed to int(), which would raise ValueError.
        match = re.fullmatch(r"/works/(\d+)", a.attrs.get("href", ""))
        if match:
            work_ids.add(int(match.group(1)))

    # Fetch each synopsis exactly once: every call costs an HTTP request
    # plus a one-second sleep.
    arasujis = []
    for work_id in sorted(work_ids):
        arasuji = get_anime_arasuji(work_id)
        if arasuji is not None:
            arasujis.append(arasuji)

    if len(arasujis) <= 1:
        return None
    # All (i, j) combinations with i < j, in the same order as nested loops.
    return [
        (first, second)
        for i, first in enumerate(arasujis)
        for second in arasujis[i + 1:]
    ]
def main():
    """Crawl the Annict works API and append synopsis pairs to a JSONL file.

    Works are requested 50 per page in descending id order until a page
    comes back with an empty ``"works"`` list. For every work whose
    ``media`` is ``"tv"`` and that has a ``season_name_text`` key, the
    synopsis pairs returned by ``get_series_arasuji_pairs`` are appended to
    ``annict_pair_data.jsonl`` as one JSON object per line with the keys
    ``arasuji1`` and ``arasuji2``.

    The output can contain duplicate lines (the same pairs are produced
    again for other works that are crawled), so it has to be de-duplicated
    afterwards, e.g. with ``uniq``.

    Raises:
        requests.exceptions.RequestException: If an API request fails,
            exceeds the timeout, or returns an HTTP error status.
    """
    # Annict numbers result pages starting from 1; asking for page 0 would
    # presumably just re-crawl the first page.
    page_num = 1
    while True:
        print(f"page_num: {page_num}")
        # Let requests build and escape the query string.
        r = requests.get(
            "https://api.annict.com/v1/works",
            params={
                "access_token": ANNICT_SECRET_TOKEN,
                "per_page": 50,
                "sort_id": "desc",
                "page": page_num,
            },
            timeout=30,
        )
        # Surface authentication or server errors as an HTTP error instead
        # of an opaque KeyError on "works" below.
        r.raise_for_status()
        animes = r.json()["works"]
        if not animes:
            break
        for anime in tqdm(animes):
            # Skip everything except TV anime that have a season assigned.
            if anime["media"] != "tv" or "season_name_text" not in anime:
                continue
            current_pairs = get_series_arasuji_pairs(anime["id"])
            if current_pairs is None:
                continue
            lines = [
                json.dumps(
                    {"arasuji1": first, "arasuji2": second},
                    ensure_ascii=False,
                ) + "\n"
                for first, second in current_pairs
            ]
            # Open once per work (not once per pair) and force UTF-8: the
            # records hold raw Japanese text because of ensure_ascii=False,
            # so the platform default encoding must not be relied on.
            with open("annict_pair_data.jsonl", "a", encoding="utf-8") as f:
                f.writelines(lines)
        page_num += 1
# Start the crawl only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment