Skip to content

Instantly share code, notes, and snippets.

Last active December 29, 2023 07:14
Show Gist options
  • Save Ray-Eldath/2e3a8052dcf558df17aa2b0215b22cb1 to your computer and use it in GitHub Desktop.
Save Ray-Eldath/2e3a8052dcf558df17aa2b0215b22cb1 to your computer and use it in GitHub Desktop.
HoYoLAB Observatory Center Spider (moved to
# NOTE: this script has been moved elsewhere (the destination link is missing from this copy).
from typing import Any, Optional

import requests
from lxml import etree
# Per-game scraper settings, keyed by game id. Each entry provides:
#   home_url      - URL fetched by ObcSpider.__init__; its JSON is read at ['data']['list'].
#   detail_url    - str.format template that receives a single content id.
#   language_tabs - tab labels; ``lang_id`` is an index into this list.
#   character_fn  - picks the character list out of the home payload.
#   root_html_fn  - picks the HTML text holding the voice tables out of a detail payload.
# NOTE(review): the URL values are blank in this copy (presumably stripped during
# extraction) — they must be filled in before the spider can make requests.
Configurations = {
    'genshin_impact': {
        'home_url': '',
        'detail_url': '{}',
        'language_tabs': ['汉语', '日语', '韩语', '英语'],
        'character_fn': lambda home: select(select(home, "图鉴", "children"), '角色'),
        'root_html_fn': lambda contents: select(contents, '角色展示', sub='text'),
    },
    'honkai:_star_rail': {
        'home_url': '',
        'detail_url': '{}',
        'language_tabs': ['中', '日', '英', '韩'],
        'character_fn': lambda home: select(select(home, "游戏图鉴", "children"), '角色'),
        'root_html_fn': lambda contents: select(contents, '角色百科', sub='text'),
    },
}

# The configuration every function below reads; switch the key to target another game.
Configuration = Configurations['honkai:_star_rail']

# One entry per voice row: (title, line text, audio URL).
VoiceLines = list[tuple[str, str, str]]
def initial(x):
    """Return the first element of ``x``.

    Args:
        x: Any indexable sequence.

    Returns:
        ``x[0]``.

    Raises:
        IndexError: If ``x`` is empty. This is deliberate: ``ObcSpider.next``
            catches ``IndexError`` to detect pages without the expected section.
    """
    return x[0]
def select(doc, val: str, sub: str = 'list') -> Any:
    """Return field ``sub`` of the first entry in ``doc`` whose ``'name'`` equals ``val``.

    Args:
        doc: Iterable of mappings, each with a ``'name'`` key.
        val: Name to look for.
        sub: Key to read from the matching entry (defaults to ``'list'``).

    Returns:
        The value stored under ``sub`` in the first matching entry.

    Raises:
        IndexError: If no entry is named ``val``. The exception type is kept
            as ``IndexError`` because ``ObcSpider.next`` catches it.
        KeyError: If an inspected entry lacks ``'name'`` or the match lacks ``sub``.
    """
    # Stop at the first match instead of collecting every match and taking [0].
    for entry in doc:
        if entry['name'] == val:
            return entry[sub]
    raise IndexError(f"no entry named {val!r}")
def lift(x, f=lambda value: value):
    """Apply ``f`` to the only element of ``x``, or return ``None`` if ``x`` is empty.

    Args:
        x: Sized, indexable collection holding zero or one element.
        f: Function applied to the single element (identity by default).

    Returns:
        ``f(x[0])`` when ``x`` has exactly one element, otherwise ``None``.

    Raises:
        ValueError: If ``x`` holds more than one element. This replaces a bare
            ``assert`` (AssertionError), which is stripped under ``python -O``
            and would then silently return ``None``.
    """
    count = len(x)
    if count > 1:
        raise ValueError(f"expected at most one element, got {count}")
    return f(x[0]) if count == 1 else None
def extract_lang_id(doc, lang_id: int) -> str:
    """Look up the ``data-index`` of the voice tab for the chosen language.

    Args:
        doc: Parsed HTML tree exposing ``.xpath()`` (built with ``etree.HTML``
            by the caller).
        lang_id: Index into ``Configuration['language_tabs']``.

    Returns:
        The ``data-index`` attribute of the first ``<li>`` whose text contains
        the tab label. It is an attribute value, i.e. a string — not an int as
        the previous annotation claimed.

    Raises:
        IndexError: If ``lang_id`` is out of range or no tab matches the label.
    """
    label = Configuration['language_tabs'][lang_id]
    tab_xpath = (
        '//ul[@data-target="voiceTab.attr"][1]'
        f'/li[text()[contains(., "{label}")]]/@data-index'
    )
    return initial(doc.xpath(tab_xpath))
def extract_voice_lines(doc, lang_idx: int) -> VoiceLines:
    """Collect the voice rows of one language tab.

    Args:
        doc: Parsed HTML tree exposing ``.xpath()``.
        lang_idx: ``data-index`` value of the language tab; it is interpolated
            into the XPath as-is.

    Returns:
        A list of ``(title, line, audio_url)`` tuples, one per table row.
        ``line`` and ``audio_url`` are ``None`` for rows that lack a
        ``<span>`` text or an ``<audio><source>`` respectively.

    Raises:
        ValueError: If the number of title cells differs from the number of
            voice cells (previously a bare ``assert``, which ``python -O``
            removes), or if a cell holds more than one text/source node.
    """
    tbody_xpath = f'//li[@data-index="{lang_idx}"]/table[@class="obc-tmpl-character__voice-pc"]/tbody'
    voice_xpath = f'{tbody_xpath}/tr/td/div'

    titles = [s.strip() for s in doc.xpath(f'{tbody_xpath}/tr/td[@class="h3"]/text()')]
    voices = doc.xpath(voice_xpath)
    lines = [lift(e.xpath('./span/text()'), lambda text: text.strip()) for e in voices]
    audios = [lift(e.xpath('./div/audio/source/@src')) for e in voices]

    # zip() would silently truncate on a mismatch, so fail loudly instead.
    if not (len(titles) == len(lines) == len(audios)):
        raise ValueError(
            f"voice table mismatch: {len(titles)} titles, "
            f"{len(lines)} lines, {len(audios)} audios"
        )
    return list(zip(titles, lines, audios))
class ObcSpider:
    """Iterator yielding ``(title, summary, content_id, voice_lines)`` per character.

    The character list is fetched once in ``__init__``; each step of the
    iteration then fetches one character's detail page. Pages whose payload
    reports a negative ``retcode`` are skipped.
    """

    def __init__(self, include: Optional[list[str]] = None,
                 exclude: Optional[list[str]] = None,
                 lang_id: int = 0):
        """Fetch the character index and select which characters to visit.

        Args:
            include: If given, only characters with these titles are visited.
            exclude: If given, characters with these titles are skipped.
            lang_id: Index into ``Configuration['language_tabs']``.

        Raises:
            ValueError: If ``include`` is given and the number of selected
                characters differs from ``len(include)`` (previously a bare
                ``assert``, which ``python -O`` removes).
        """
        home_url = Configuration['home_url']
        home = requests.get(home_url).json()['data']['list']
        character = Configuration['character_fn'](home)
        # Prints every available (title, content_id) pair to stdout.
        print([(e['title'], e['content_id']) for e in character])

        content_ids = [
            e['content_id'] for e in character
            if (include is None or e['title'] in include)
            and (exclude is None or e['title'] not in exclude)
        ]
        if include is not None and len(content_ids) != len(include):
            raise ValueError(
                f"selected {len(content_ids)} characters but include lists {len(include)}"
            )

        self.content_ids = content_ids
        self.lang_id = lang_id
        self.idx = 0

    def next(self):
        """Fetch and parse the detail page at the current position.

        Does not advance ``self.idx``; ``__next__`` does that.

        Returns:
            ``None`` when the payload's ``retcode`` is negative; otherwise
            ``(title, summary, content_id, voice_lines)``, where
            ``voice_lines`` is ``[]`` if the page has no voice section.
        """
        cid = self.content_ids[self.idx]
        detail_url = Configuration['detail_url'].format(cid)
        detail_payload = requests.get(detail_url).json()
        if detail_payload['retcode'] < 0:
            return None

        detail = detail_payload['data']['content']
        try:
            # select() raises IndexError when the expected section is absent.
            root = Configuration['root_html_fn'](detail['contents'])
        except IndexError:
            return detail['title'], detail['summary'], cid, []

        root_html = etree.HTML(root)
        # Use the instance's language, not a module-level ``lang_id`` that only
        # exists when this file is run as a script.
        lang_idx = extract_lang_id(root_html, self.lang_id)
        return detail['title'], detail['summary'], cid, extract_voice_lines(root_html, lang_idx)

    def __iter__(self):
        """Return ``self``; the spider is its own iterator."""
        return self

    def __next__(self):
        """Return the next non-skipped result, or raise ``StopIteration`` at the end."""
        while True:
            if self.idx >= len(self.content_ids):
                raise StopIteration
            val = self.next()
            self.idx += 1
            if val is not None:
                return val
if __name__ == '__main__':
    # Demo run: print every voice line of one character.
    # `lang_id` is an index into Configuration['language_tabs']. It is kept as a
    # module-level name on purpose, in case other code in this file reads it as
    # a global.
    lang_id = 3
    spider = ObcSpider(lang_id=lang_id, include=['彦卿'])
    for name, summary, cid, lines in spider:
        print(f"{name} - {summary}")
        for title, line, audio_url in lines:
            print(f"\t{title} - {line}: {audio_url}")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment