Skip to content

Instantly share code, notes, and snippets.

@ikegami-yukino
Last active November 22, 2018 04:58
Show Gist options
  • Save ikegami-yukino/68e322082c680d84fd886043718a173c to your computer and use it in GitHub Desktop.
Save ikegami-yukino/68e322082c680d84fd886043718a173c to your computer and use it in GitHub Desktop.
Convert CSJ's xml to plain text
import glob
import html
import re
import sys
import jaconv
# Patterns are raw strings so that "\;", "\(" and "\)" reach the regex engine
# unchanged without relying on Python keeping unknown string escapes (which
# emits a SyntaxWarning on recent interpreters).

# Captures the value of the OrthographicTranscription="..." XML attribute.
re_ogt = re.compile(r' OrthographicTranscription="([^"]+)"')
# Captures the text after a ";" up to (not including) an optional ")".
re_a = re.compile(r'\;([^\)]+)\)?')
# Equivalent to re_a ("\;" and ";" match the same character); kept because
# the name is part of this module's namespace.
re_semicolon = re.compile(r';([^\)]+)\)?')
# Matches an inline "(D ...)" tag together with its content.
re_d = re.compile(r'\(D [^\)]+\)')
def main(pattern, encoding=None):
    """Print the cleaned orthographic transcription of matching XML files.

    Every file matched by *pattern* is read line by line. For each line that
    contains a ``<SUW`` element, the value of its
    ``OrthographicTranscription`` attribute is stripped of annotation tags,
    full-width ASCII letters and digits are converted to half-width (kana is
    left as is), and the result is written to standard output with no
    separator. A newline is written after a printed token whose line carries
    ``ClauseBoundaryLabel="[文末]"``.

    Args:
        pattern: Glob pattern, expanded with ``glob.glob``.
        encoding: Text encoding handed to ``open``. ``None`` (the default)
            keeps the interpreter's default encoding.

    Returns:
        None. All output goes to standard output.
    """
    for path in glob.glob(pattern):
        # True while inside an "(A " span whose ";" part has not been seen
        # yet; reset for every file.
        in_a_span = False
        with open(path, encoding=encoding) as f:
            for line in f:
                if "<SUW" not in line:
                    continue
                found = re_ogt.search(line)
                if found is None:
                    # A <SUW> element without the attribute has nothing to
                    # print; skip it instead of failing on None.group().
                    continue
                ogt = found.group(1)

                # Tokens opening with these tags are dropped entirely
                # (presumably fillers, fragments, meta and uncertain
                # material in the CSJ tag set -- confirm against the
                # corpus manual).
                if ogt.startswith(("(F ", "(D", "(M ", "(?")):
                    continue

                after_semicolon = re_a.search(ogt)
                if after_semicolon is not None:
                    # Keep only the text following ";" and close any open
                    # "(A " span.
                    ogt = after_semicolon.group(1)
                    in_a_span = False
                elif ogt.startswith("(A ") and ";" not in ogt:
                    # Span opens here; skip tokens until the ";" part.
                    in_a_span = True
                    continue
                elif "(R " in ogt:
                    # Drop three leading characters and the final one
                    # (assumes the token has the form "(R ...)").
                    ogt = ogt[3:-1]
                elif ogt.startswith("(O "):
                    ogt = ogt[3:]
                elif in_a_span:
                    continue
                # re_semicolon is not consulted: it matches exactly what
                # re_a matches, and re_a is tested first.

                ogt = re_d.sub('', ogt)
                ogt = html.unescape(ogt).replace('<FV>', '').replace('FV>', '')
                if ogt.endswith(')'):
                    # Remove a closing parenthesis left over from a tag.
                    ogt = ogt[:-1]
                ogt = jaconv.z2h(ogt, kana=False, ascii=True, digit=True)
                print(ogt, end='')
                # Only lines that reach this point can end the output line;
                # skipped tokens never do.
                if 'ClauseBoundaryLabel="[文末]"' in line:
                    print()
if __name__ == '__main__':
    # At least one glob pattern (or file path) is required.
    if len(sys.argv) < 2:
        sys.exit('usage: {} PATTERN [PATTERN ...]'.format(sys.argv[0]))
    # Process every argument: when the shell has already expanded an
    # unquoted glob, each matched file arrives as its own argument.
    for arg in sys.argv[1:]:
        main(arg)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment