Last active
December 29, 2023 07:14
-
-
Save Ray-Eldath/2e3a8052dcf558df17aa2b0215b22cb1 to your computer and use it in GitHub Desktop.
HoYoLAB Observatory Center Spider (moved to https://github.com/Ray-Eldath/obcSpider)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# This gist has been moved to https://github.com/Ray-Eldath/obcSpider
from typing import Any, Optional

import requests
from lxml import etree
# Per-game scraping settings. Every entry provides:
#   home_url      - endpoint whose JSON payload carries the home listing under data.list
#   detail_url    - endpoint template; '{}' is filled with a content_id
#   language_tabs - labels searched for in the voice-tab <li> text; a caller's
#                   ``lang_id`` is an index into this list
#   character_fn  - picks the character entries out of the home listing
#   root_html_fn  - picks the HTML text holding the voice table out of a detail page
Configurations = {
    'genshin_impact': {
        'home_url': 'https://api-static.mihoyo.com/common/blackboard/ys_obc/v1/home/content/list?app_sn=ys_obc&channel_id=189',
        'detail_url': 'https://api-static.mihoyo.com/common/blackboard/ys_obc/v1/content/info?app_sn=ys_obc&content_id={}',
        'language_tabs': ['汉语', '日语', '韩语', '英语'],
        'character_fn': lambda home: select(select(home, "图鉴", sub="children"), '角色', sub='list'),
        'root_html_fn': lambda contents: select(contents, '角色展示', 'text'),
    },
    'honkai:_star_rail': {
        'home_url': 'https://api-static.mihoyo.com/common/blackboard/sr_wiki/v1/home/content/list?app_sn=sr_wiki&channel_id=17',
        'detail_url': 'https://api-static.mihoyo.com/common/blackboard/sr_wiki/v1/content/info?app_sn=sr_wiki&content_id={}',
        'language_tabs': ['中', '日', '英', '韩'],
        'character_fn': lambda home: select(select(home, "游戏图鉴", sub="children"), '角色', sub='list'),
        'root_html_fn': lambda contents: select(contents, '角色百科', 'text'),
    },
}

# The game every helper below works against; switch the key to scrape the other wiki.
Configuration = Configurations['honkai:_star_rail']

# One entry per voice line: (title, spoken text, audio URL).
VoiceLines = list[tuple[str, str, str]]
def initial(x):
    """Return the first element of the sequence ``x``.

    Any further elements are ignored. An empty sequence raises ``IndexError``,
    which ``ObcSpider.next`` catches to detect a page without the expected
    section.
    """
    head = x[0]
    return head
def select(doc, val: str, sub: str = 'list') -> Any:
    """Return the ``sub`` field of the first entry in ``doc`` named ``val``.

    Args:
        doc: iterable of mappings, each with a ``'name'`` key and a ``sub`` key.
        val: value the entry's ``'name'`` must equal.
        sub: key whose value is returned from the matching entry.

    Returns:
        ``entry[sub]`` of the first matching entry; later matches are ignored.

    Raises:
        IndexError: if no entry is named ``val``.
    """
    matches = [entry[sub] for entry in doc if entry['name'] == val]
    return initial(matches)
def lift(x, f=lambda value: value):
    """Apply ``f`` to the sole element of ``x``, or return ``None`` if ``x`` is empty.

    Args:
        x: sized, indexable collection expected to hold zero or one element.
        f: transformation applied to the element; defaults to the identity.

    Returns:
        ``f(x[0])`` when ``x`` has exactly one element, ``None`` when it is empty.

    Raises:
        ValueError: if ``x`` holds more than one element.
    """
    count = len(x)
    # An explicit raise instead of ``assert``: asserts are stripped under ``-O``,
    # which would turn an ambiguous multi-element input into a silent ``None``.
    if count > 1:
        raise ValueError(f'expected at most one element, got {count}')
    if count == 0:
        return None
    return f(x[0])
def extract_lang_id(doc, lang_id: int) -> str:
    """Look up the ``data-index`` of a language tab in a parsed character page.

    Args:
        doc: parsed document exposing an ``xpath`` method (an lxml tree in this file).
        lang_id: index into ``Configuration['language_tabs']`` selecting the language.

    Returns:
        The ``data-index`` attribute of the first ``<li>`` under the voice-tab
        ``<ul>`` whose text contains the language label. It is an attribute
        value and therefore a string, not an ``int``.

    Raises:
        IndexError: if ``lang_id`` is out of range for the configured tabs, or
            if no tab's text contains the label.
    """
    label = Configuration['language_tabs'][lang_id]
    tab_indexes = doc.xpath(
        f'//ul[@data-target="voiceTab.attr"][1]/li[text()[contains(., "{label}")]]/@data-index')
    return initial(tab_indexes)
def extract_voice_lines(doc, lang_idx: int) -> VoiceLines:
    """Collect the voice lines of one language tab from a parsed character page.

    Args:
        doc: parsed document exposing an ``xpath`` method.
        lang_idx: ``data-index`` value of the language tab to read.

    Returns:
        A list of ``(title, text, audio_url)`` tuples in document order. ``text``
        or ``audio_url`` is ``None`` when a row has no ``<span>`` text or no
        audio source, because ``lift`` returns ``None`` for an empty match.
    """
    tbody_xpath = f'//li[@data-index="{lang_idx}"]/table[@class="obc-tmpl-character__voice-pc"]/tbody'

    # Row headings live in the "h3" cells; strip the surrounding whitespace.
    titles = []
    for raw_title in doc.xpath(f'{tbody_xpath}/tr/td[@class="h3"]/text()'):
        titles.append(raw_title.strip())

    # Each content <div> holds the spoken text and, optionally, an audio player.
    lines = []
    audios = []
    for cell in doc.xpath(f'{tbody_xpath}/tr/td/div'):
        lines.append(lift(cell.xpath('./span/text()'), lambda text: text.strip()))
        audios.append(lift(cell.xpath('./div/audio/source/@src')))

    # Headings and content cells must pair up one-to-one before zipping.
    assert len(titles) == len(lines) == len(audios)
    return list(zip(titles, lines, audios))
class ObcSpider:
    """Iterator over the character pages of the configured wiki.

    Construction fetches the home listing and resolves the content ids to
    visit. Iterating then fetches one detail page per id and yields
    ``(title, summary, content_id, voice_lines)`` tuples; pages whose payload
    has a negative ``retcode`` are skipped.
    """

    def __init__(self, include: Optional[list[str]] = None,
                 exclude: Optional[list[str]] = None,
                 lang_id: int = 0):
        """Fetch the character listing and select which pages to visit.

        Args:
            include: if given, only characters with these titles are visited.
            exclude: if given, characters with these titles are skipped.
            lang_id: index into ``Configuration['language_tabs']`` choosing
                which language's voice lines to extract.
        """
        home_url = Configuration['home_url']
        home = requests.get(home_url).json()['data']['list']
        character = Configuration['character_fn'](home)
        # Dump every available (title, content_id) pair to stdout.
        print([(e['title'], e['content_id']) for e in character])
        content_ids = [
            e['content_id'] for e in character
            if (include is None or e['title'] in include)
            and (exclude is None or e['title'] not in exclude)
        ]
        # Every requested title must have matched exactly one listing entry.
        assert include is None or len(content_ids) == len(include), \
            'not every title in `include` matched exactly one character'
        self.content_ids = content_ids
        self.lang_id = lang_id
        self.idx = 0

    def next(self):
        """Fetch and parse the page at the current index without advancing it.

        Returns:
            ``None`` when the payload's ``retcode`` is negative; otherwise a
            ``(title, summary, content_id, voice_lines)`` tuple, where
            ``voice_lines`` is empty if the page lacks the configured section.
        """
        cid = self.content_ids[self.idx]
        detail_url = Configuration['detail_url'].format(cid)
        detail_payload = requests.get(detail_url).json()
        if detail_payload['retcode'] < 0:
            return None
        detail = detail_payload['data']['content']
        try:
            root = Configuration['root_html_fn'](detail['contents'])
        except IndexError:
            # The section holding the voice table is absent on this page.
            return detail['title'], detail['summary'], cid, []
        root_html = etree.HTML(root)
        # Use the language chosen at construction time. Reading a module-level
        # ``lang_id`` here would fail with NameError when this module is
        # imported, and would ignore the constructor argument.
        lang_idx = extract_lang_id(root_html, self.lang_id)
        return detail['title'], detail['summary'], cid, extract_voice_lines(root_html, lang_idx)

    def __iter__(self):
        return self

    def __next__(self):
        # Keep advancing until a page yields a result or the ids run out.
        while True:
            if self.idx >= len(self.content_ids):
                raise StopIteration
            val = self.next()
            self.idx += 1
            if val is not None:
                return val
if __name__ == '__main__':
    # Index into Configuration['language_tabs'] of the language to print.
    lang_id = 3
    spider = ObcSpider(lang_id=lang_id, include=['彦卿'])
    for name, summary, cid, lines in spider:
        print(f"{name} - {summary}")
        # One indented row per voice line: title, spoken text, audio URL.
        for title, line, audio_url in lines:
            print(f"\t{title} - {line}: {audio_url}")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment