Skip to content

Instantly share code, notes, and snippets.

@8X7K

8X7K/extract.py Secret

Created Dec 8, 2021
Embed
What would you like to do?
各作品のあらすじを抽出するコード
import requests
from bs4 import BeautifulSoup
import json
import sys
import time
import functools
from ja_sentence_segmenter.common.pipeline import make_pipeline
from ja_sentence_segmenter.concatenate.simple_concatenator import concatenate_matching
from ja_sentence_segmenter.normalize.neologd_normalizer import normalize
from ja_sentence_segmenter.split.simple_splitter import split_newline, split_punctuation
# Sentence-segmentation pipeline assembled from ja_sentence_segmenter parts.

# Punctuation splitter configured with the characters 。 ! ? …
split_punc2 = functools.partial(
    split_punctuation,
    punctuations=r"。!?…",
)

# Concatenation step keyed on a line that ends with "の"; the matched text is
# kept rather than removed (remove_former_matched=False).
concat_tail_no = functools.partial(
    concatenate_matching,
    former_matching_rule=r"^(?P<result>.+)(の)$",
    remove_former_matched=False,
)

# NOTE(review): stages are presumably applied in the order listed — confirm
# against the ja_sentence_segmenter documentation.
segmenter = make_pipeline(
    normalize,
    split_newline,
    concat_tail_no,
    split_punc2,
)

# Access token that can be obtained from https://annict.com/settings/apps
ANNICT_SECRET_TOKEN = "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX"
# Fetch the synopsis ("あらすじ") of the anime with the given id.
def get_anime_arasuji(id):
    """Scrape the synopsis of one work from its Annict web page.

    The page at ``https://annict.com/works/{id}`` is downloaded and searched
    for a ``div.container`` whose stripped text is exactly "あらすじ". The
    text of the ``div.c-body__content`` element inside the container that
    follows it is then passed through ``segmenter``.

    Args:
        id: Work id interpolated into the page URL.

    Returns:
        A list built from ``segmenter``'s output for the synopsis text, or
        ``None`` when no synopsis heading is found, when the heading has no
        following container, or when that container has no
        ``div.c-body__content`` element.

    Raises:
        requests.RequestException: If the HTTP request fails or times out.
    """
    # Sleep for 1 second before every request so as not to burden the site.
    time.sleep(1)
    url = f"https://annict.com/works/{id}"
    # A timeout keeps one stalled connection from hanging the whole crawl.
    r = requests.get(url, timeout=30)
    soup = BeautifulSoup(r.content, 'html.parser')
    divs = soup.select('div.container')
    # Pair each container with its successor; a heading in the last container
    # has no successor and is therefore skipped instead of raising IndexError.
    for heading, following in zip(divs, divs[1:]):
        if heading.text.strip() != "あらすじ":
            continue
        content = following.select_one('div.c-body__content')
        if content is None:
            # Heading present but the expected body element is missing.
            return None
        return list(segmenter(content.text))
    return None
# Walk the Annict works API page by page (50 works per page, sorted by id in
# descending order) and append one JSON line per work that has a synopsis to
# annict_data.jsonl. The loop ends at the first page with an empty "works" list.
page_num = 1
while True:
    # Progress output; flushed immediately so it shows up when stdout is piped.
    print(page_num, flush=True)
    r = requests.get(
        "https://api.annict.com/v1/works",
        # Let requests build and URL-encode the query string.
        params={
            "access_token": ANNICT_SECRET_TOKEN,
            "per_page": 50,
            "sort_id": "desc",
            "page": page_num,
        },
        timeout=30,
    )
    # Fail with a clear HTTP error instead of a KeyError on an error payload.
    r.raise_for_status()
    animes = r.json()["works"]
    if not animes:
        break
    for anime in animes:
        # Skip everything except TV anime that carry a season name.
        if anime["media"] != "tv" or "season_name_text" not in anime:
            continue
        arasuji = get_anime_arasuji(anime["id"])
        if arasuji is None:
            print(f'arasuji not found in {anime["title"]}', flush=True)
            continue
        print(f'arasuji found in {anime["title"]}', flush=True)
        res = {
            "タイトル": anime["title"],
            "あらすじ": arasuji,
            "時期": anime["season_name_text"],
        }
        # The record is written with ensure_ascii=False, so the file encoding
        # must be pinned to UTF-8 rather than left to the platform default.
        # Re-opening per record keeps finished records on disk if the crawl
        # is interrupted.
        with open("annict_data.jsonl", "a", encoding="utf-8") as f:
            f.write(f"{json.dumps(res, ensure_ascii=False)}\n")
    page_num += 1
@koke2c95
Copy link

koke2c95 commented Dec 30, 2021

@8X7K
hello!
thanks interesting idea

I just discovered tmdb can be a good source
some well-known example:
Story synopsis - API
episode synopsis - API

If you are interested in multilingual use: sbert multilingual model

I didn't find anyone who has built a parallel text dataset or dump with translations dev reference
You can use the discover API to get anime TV/movie IDs

and here is a related anime Recommendation Systems project

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment