Skip to content

Instantly share code, notes, and snippets.

@wareya
Created June 23, 2018 00:46
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save wareya/cac3d61c6001ff88d2a327b9ef6eb3a1 to your computer and use it in GitHub Desktop.
Save wareya/cac3d61c6001ff88d2a327b9ef6eb3a1 to your computer and use it in GitHub Desktop.
#!python
# -*- coding: utf-8 -*-
from collections import OrderedDict
import json
import sys
from io import open
import re
# Load the source dictionary dump (presumably zero-epwing output, judging by
# the file name -- verify). OrderedDict preserves the key order of every JSON
# object as it appears in the file. The context manager makes sure the input
# handle is closed as soon as parsing is done.
with open("zero-shinjirin.json", encoding="utf-8") as infile:
    entries = json.load(infile, object_pairs_hook=OrderedDict)

# Accumulates one record per accepted dictionary entry.
jj = []
def iskana(s):
    """Return True if every character of ``s`` is kana or a space.

    Accepted code points:
      * U+0020 (space) and U+3000 (ideographic space)
      * U+3040..U+30FF   hiragana and katakana
      * U+FF65..U+FF9F   halfwidth katakana forms
      * U+31F0..U+31FF   katakana phonetic extensions (Ainu)
      * U+1B000..U+1B0FF hentaigana A (Kana Supplement)
      * U+1B100..U+1B12F hentaigana B (Kana Extended-A)

    An empty string yields True because there is no offending character.
    """
    for ch in s:
        c = ord(ch)
        # Compare code points, not strings: ``c`` is an int at this point,
        # so testing it against a one-character string can never match and
        # spaces would always be rejected.
        # NOTE(review): the original listed two space literals that look
        # identical; the second is presumably the ideographic space U+3000.
        if c == 0x20 or c == 0x3000:
            continue
        # hiragana and katakana
        elif 0x3040 <= c < 0x3100:
            continue
        # halfwidth etc
        elif 0xFF65 <= c < 0xFFA0:
            continue
        # ainu extensions
        elif 0x31F0 <= c < 0x3200:
            continue
        # hentaigana A
        elif 0x1B000 <= c < 0x1B100:
            continue
        # hentaigana B
        elif 0x1B100 <= c < 0x1B130:
            continue
        else:
            return False
    return True
# Convert every usable entry of the first subbook into a compact record
# {"r": reading, "s": [spellings], "l": [definition lines]} appended to `jj`,
# then write the whole list out as JSON.

# Running entry counter; only needed by the debugging cut-off below.
i = 0
for entry in entries["subbooks"][0]["entries"]:
    # Debug aid: uncomment to stop after the first 10000 entries.
    #if i > 10000: break
    i += 1
    try:
        # some entries are broken (zero-epwing gives no text, ebwin4 shows trash)
        heading = entry["heading"]
        text = entry["text"]
    except (KeyError, TypeError):
        # Missing field (or an entry that is not a mapping): skip it, but do
        # not swallow unrelated errors such as KeyboardInterrupt.
        continue

    # A heading looks like "reading【spelling】" or "reading[spelling]";
    # without brackets the whole heading is the reading.
    if "【" in heading:
        pieces = heading.split("【")
        reading = pieces[0]
        spelling = pieces[1].split("】")[0]
    elif "[" in heading:
        pieces = heading.split("[")
        reading = pieces[0]
        spelling = pieces[1].split("]")[0]
    else:
        reading = heading
        spelling = ""

    # Strip separator marks from the reading.
    reading = reading.replace("-", "").replace("・", "")
    # Alternative spellings are separated by a middle dot.
    spellings = spelling.split("・")

    # Drop the first and the last line of the entry text.
    lines = text.split("\n")[1:-1]
    # Drop a leading line that is a single parenthesised group "(...)".
    # `lines` is empty when the text has fewer than three lines, so guard
    # before indexing to avoid an IndexError.
    if (lines
            and lines[0].startswith("(")
            and lines[0].endswith(")")
            and ")" not in lines[0][1:-1]):
        lines = lines[1:]

    # Skip long entries whose reading is a single character with no spelling.
    if len(reading) == 1 and spellings[0] == "" and len(lines) > 10:
        continue
    # Skip entries with an empty reading (iskana("") would accept them).
    if not reading:
        continue
    # Skip entries whose reading is not purely kana.
    if not iskana(reading):
        continue

    jj.append(OrderedDict([("r", reading), ("s", spellings), ("l", lines)]))

# Write the result as UTF-8 JSON with "\n" line endings on every platform;
# the context manager guarantees the file is flushed and closed.
with open("shinjirin-jj.json", "w", newline="\n", encoding="utf-8") as f:
    f.write(json.dumps(jj, ensure_ascii=False, indent=4, separators=(',',':')))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment