Created
January 5, 2019 04:10
Star
You must be signed in to star a gist
E绅士翻译注射器💉Wiki 解析实现
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
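'''
E-Hentai Tag Translation Injector 💉 wiki parser.

Using EhTagBuilder directly already gives you the database in JSON format.
This script is a simplified parser implementation intended for easy integration.
It is modelled on how EhTagTranslator itself parses the wiki.
'''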
import json
import re

import requests
# One shared HTTP session, reused for every request the functions below make.
session = requests.Session()

# Base URL under which the wiki's database pages are served as raw Markdown.
db_raw_base = 'https://raw.githubusercontent.com/wiki/Mapaler/EhTagTranslator/database'

# The index page; each of its table rows names one category page to fetch.
db_index = f'{db_raw_base}/rows.md'
def db_parse(md_text):
    """Parse the rows of a Markdown table into a tag dictionary.

    Every ``|a|b|c|d|`` run found in *md_text* is treated as one table row.
    The first three cells are used as the entry name, its Chinese name and
    its description; the fourth cell is matched but discarded.

    Args:
        md_text: Markdown source text of a wiki page.

    Returns:
        A dict mapping the stripped first cell of each row to
        ``{'zh_name': <second cell>, 'desc': <third cell>}`` (both stripped).
        A later row with the same name overwrites an earlier one. The dict
        is empty when no table row is found.
    """
    # Raw string: '\|' in a normal string literal is an invalid escape.
    row_pattern = r'\|(.*?)\|(.*?)\|(.*?)\|(.*?)\|'
    db_parsed = {}
    for name_cell, zh_cell, desc_cell, _ in re.findall(row_pattern, md_text):
        item_name = name_cell.strip()
        # Skip the table delimiter row ("---", ":---", ":---:") and the header
        # row (whose first cell contains '英').  Only cells made up entirely of
        # dashes are treated as delimiters, so real names that merely contain
        # a hyphen (e.g. "x-ray") are kept.
        if re.fullmatch(r':?-+:?', item_name) or '英' in item_name:
            continue
        db_parsed[item_name] = {
            'zh_name': zh_cell.strip(),
            'desc': desc_cell.strip(),
        }
    return db_parsed
def get_index():
    """Download the database index page and parse it.

    Returns:
        The dict produced by ``db_parse`` for the page at ``db_index``.

    Raises:
        requests.HTTPError: if the server answers with an error status.
            Without this check an error page would silently parse to an
            empty index and the script would write an empty database.
    """
    print('Get page index...')
    # A timeout keeps the script from hanging forever on a stalled connection.
    r = session.get(db_index, timeout=30)
    r.raise_for_status()
    return db_parse(r.text)
def get_db(index_parsed):
    """Download every page named in the index and merge the parsed entries.

    Args:
        index_parsed: dict whose keys are page names; each key ``k`` is
            fetched from ``{db_raw_base}/{k}.md``. The values are not used.

    Returns:
        A single dict holding the ``db_parse`` result of every page that was
        fetched successfully. When the same entry name appears on several
        pages, the page fetched last wins.
    """
    db_total = {}
    for k in index_parsed:
        print(f'Fetch & parse: {k}')
        cat_url = f'{db_raw_base}/{k}.md'
        # A timeout keeps one stalled download from blocking the whole run.
        r = session.get(cat_url, timeout=30)
        if not r.ok:
            # Report the failure instead of silently parsing an error page,
            # but keep going so the remaining pages are still collected.
            print(f'Skip {k}: HTTP {r.status_code}')
            continue
        db_total.update(db_parse(r.text))
    return db_total
# Download and serialise everything BEFORE opening the output file: opening
# in 'wb' mode truncates it immediately, so a failed download would otherwise
# leave an empty EhTagWiki.json behind.
db_json = json.dumps(get_db(get_index()), ensure_ascii=False, indent=4)
# Written as UTF-8 bytes so the output is identical on every platform
# (no newline translation, no locale-dependent encoding).
with open('EhTagWiki.json', 'wb') as wp:
    wp.write(db_json.encode('UTF-8'))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment