Skip to content

Instantly share code, notes, and snippets.

@mkyt
Created January 16, 2020 16:08
Show Gist options
  • Save mkyt/746ce66c418fb82951cf79c6ba871b43 to your computer and use it in GitHub Desktop.
Save mkyt/746ce66c418fb82951cf79c6ba871b43 to your computer and use it in GitHub Desktop.
convert EIJIRO text data into JSON format
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""eijiro.py - convert EIJIRO text data into JSON format
Input: Raw EIJIRO text data (can be purchased from https://booth.pm/ja/items/777563 )
Output: Dictionary data in JSON format
< Data Schema >
top level: Word[]
Word: {
title: string
meanings: Definition[]
level: int? // SVL (standard vocabulary level)
phonetic: string? // phonetic transcription
kana: string? // pronunciation represented in kana
syllables: string? // syllabification
forms: string? // different forms (sg vs pl nouns, verb tenses, etc)
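same_sound: string? // homophones (from the 同音 label)
region_sound: string? // regional pronunciation differences (from the 発音の地域差 label)
spell_warning: string? // spelling caution note (from the スペリングに注意 label)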
}
Definition: {
wc: string?
body: string
}
"""
import sys
import re
import json
# Path of the raw EIJIRO text file that main() passes to load_file().
INPUT_FILE = 'EIJIRO-1448.TXT'
# Path of the JSON file that main() writes the converted dictionary to.
OUTPUT_FILE = 'eijiro-1448.json'
def load_file(fname):
    """Return the list of raw entries contained in the file *fname*.

    The file is decoded as cp932 and split on the entry marker, a '■'
    at the beginning of a line.  The marker itself is removed from every
    returned entry; any text before the first marker is discarded.  The
    last entry keeps whatever trailing newline the file ends with.
    """
    # Use a context manager so the handle is closed even if decoding fails.
    with open(fname, 'r', encoding='cp932') as f:
        d = f.read()
    # Prepend a newline so the very first entry is also preceded by '\n■'.
    d = '\n' + d
    return d.split('\n■')[1:]
def collect_words(entries):
    """Group raw entries by headword.

    Each entry is a string of the form ``title : definition`` where the
    title may end with a `` {word class}`` suffix.

    Returns a dict mapping each title (with the suffix removed) to a list
    of ``(wc, definition)`` tuples in input order; ``wc`` is the text
    inside the braces, or None when the title has no such suffix.

    Raises ValueError if an entry does not contain the ' : ' separator.
    """
    expr = re.compile(r' \{(.+)\}$')
    d = {}
    for item in entries:
        # Split on the first separator only: the definition text may itself
        # contain ' : ', which must not break the title/definition unpacking.
        title, sep, defin = item.partition(' : ')
        if not sep:
            raise ValueError('entry has no " : " separator: {!r}'.format(item))
        title = title.strip()
        defin = defin.strip()
        wc = None
        m = expr.search(title)
        if m is not None:
            wc = m.group(1)
            # Drop the trailing ' {wc}' suffix from the title.
            title = title[:m.start()].strip()
        d.setdefault(title, []).append((wc, defin))
    return d
# Maps the bracketed label found in an info line (e.g. 【レベル】) to the
# key used for that value in the output Word object.
kls2key = {
    'レベル': 'level',
    '発音': 'phonetic',
    '発音!': 'phonetic',
    '@': 'kana',
    '分節': 'syllables',
    '変化': 'forms',
    '同音': 'same_sound',
    '発音の地域差': 'region_sound',
    'スペリングに注意': 'spell_warning',
}


def _parse_info_line(defin):
    """Parse an info line like '【label】value、【label】value' into a dict.

    Keys are the output names looked up in kls2key; the 'level' value is
    converted to int, every other value is kept as a string.
    """
    fields = {}
    # Strip the leading '【' and split between fields, which leaves
    # segments of the form 'label】value'.
    for segment in defin[1:].split('、【'):
        try:
            kls, content = segment.split('】')
        except ValueError:
            # workaround for bug in `piranha`: its data has a doubled '】】'
            kls, content = segment.replace('】】', '】').split('】')
        key = kls2key[kls]
        fields[key] = int(content) if key == 'level' else content
    return fields


def objectify(dct):
    """Convert the mapping built by collect_words into a list of Word dicts.

    *dct* maps a title to a list of ``(wc, definition)`` tuples.  A tuple
    whose ``wc`` is None and whose definition starts with '【' is treated
    as an info line and merged into the entry as attributes; every other
    tuple becomes an item of the entry's 'meanings' list, shaped as
    ``{'body': definition}`` plus ``'wc'`` when a word class is present.

    Raises KeyError when an info line uses a label missing from kls2key,
    and ValueError when an info segment cannot be split into label/value.
    """
    res = []
    for title, items in dct.items():
        entry = {'title': title}
        meanings = []
        for wc, defin in items:
            if wc is None and defin.startswith('【'):  # info line
                entry.update(_parse_info_line(defin))
            else:  # definition line
                dfn = {'body': defin}
                if wc is not None:
                    dfn['wc'] = wc
                meanings.append(dfn)
        entry['meanings'] = meanings
        res.append(entry)
    return res
def main():
    """Convert INPUT_FILE into OUTPUT_FILE and return the exit status.

    Runs the pipeline load_file -> collect_words -> objectify, printing a
    progress message before each stage, then writes the result as JSON.
    Returns 0 on completion.
    """
    print('loading dictionary file')
    entries = load_file(INPUT_FILE)
    print('collecting entry for each word')
    dct = collect_words(entries)
    print('generating word object')
    res = objectify(dct)
    print('writing json')
    # ensure_ascii=False emits raw non-ASCII characters, so the encoding is
    # fixed to UTF-8 instead of depending on the locale; the context manager
    # guarantees the output is flushed and closed.
    with open(OUTPUT_FILE, 'w', encoding='utf-8') as out:
        json.dump(res, out, ensure_ascii=False)
    return 0
# Script entry point: run the conversion and use main()'s return value as
# the process exit status.
if __name__ == '__main__':
    raise SystemExit(main())
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment