Skip to content

Instantly share code, notes, and snippets.

@jmkim
Created September 15, 2023 19:30
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save jmkim/eea59a5128590c2359762054e1154a64 to your computer and use it in GitHub Desktop.
Save jmkim/eea59a5128590c2359762054e1154a64 to your computer and use it in GitHub Desktop.
import orjson
from typing import NamedTuple
# Path of the dictionary JSON dump that the script at the bottom of this
# file opens and parses.
IN_FILE="전체 내려받기_한국어기초사전_json_20230901/3_5000_20230901.json"
class WordEquivalent(NamedTuple):
    """Immutable record with ``language``, ``lemma`` and ``definition`` fields.

    Every field is optional and defaults to ``None``.
    """

    language: str = None
    lemma: str = None
    definition: str = None
class WordExample(NamedTuple):
    """Immutable record pairing a ``type`` string with a list of examples."""

    type: str = None
    # NOTE: the default ``[]`` is one list object shared by every instance
    # created without an explicit ``example``; do not mutate it in place.
    example: list[str] = []
class WordForm(NamedTuple):
    """Immutable record with ``type``, ``pronunciation`` and ``sound`` fields.

    Every field is optional and defaults to ``None``.
    """

    type: str = None
    pronunciation: str = None
    sound: str = None
class WordType(NamedTuple):
    """Immutable record of entry-level classification attributes.

    Every field is an optional string defaulting to ``None``.
    """

    homonym_number: str = None
    lexicalUnit: str = None
    partOfSpeech: str = None
    vocabularyLevel: str = None
    semanticCategory: str = None
    # NOTE(review): looks like a misspelling of "subjectCategory"; left as is
    # because the field name is part of this class's interface.
    subjectCategiory: str = None
    annotation: str = None
    origin: str = None
class WordMeaning(NamedTuple):
    """Immutable record describing one meaning (sense) of a word.

    Scalar fields default to ``None``; list fields default to ``[]``.
    """

    id: int = None
    definition: str = None
    syntacticPattern: str = None
    annotation: str = None
    # NOTE: each ``[]`` default below is a single list object shared by all
    # instances that rely on the default; do not mutate it in place.
    # NOTE(review): json_to_word in this file fills these fields with plain
    # dicts produced by flatten_feats, not with the annotated record classes.
    equivalent: list[WordEquivalent] = []
    form: list[WordForm] = []
    type: list[WordType] = []
    examples: list[WordExample] = []
class Word(NamedTuple):
    """Immutable record for one dictionary word and its meanings."""

    writtenForm: str = None
    variant: str = None
    # NOTE: the default ``[]`` is one list object shared by every instance
    # created without an explicit ``meanings``; do not mutate it in place.
    meanings: list[WordMeaning] = []
# Wrap the value in a list when it is not already a list.
def flatten_arrays(arr_or_obj):
    """Return ``arr_or_obj`` unchanged if it is a list, else ``[arr_or_obj]``.

    A list argument is returned as the same object (no copy is made).
    """
    return arr_or_obj if isinstance(arr_or_obj, list) else [arr_or_obj]
# Raw feat as stored in the dictionary: { "feat": [ { "att": KEY, "val": VALUE } ] }
# Flattened result:                     { KEY: VALUE }
def flatten_feats(feats):
    """Collapse a ``feat`` structure into a plain ``{att: val}`` dict.

    Args:
        feats: Either a dict holding a ``"feat"`` key (whose value is one
            ``{"att": ..., "val": ...}`` dict or a list of them), or a list
            of such dicts.

    Returns:
        ``None`` when ``feats`` is ``None`` or empty; otherwise a dict that
        maps every ``att`` to its ``val``. When the same ``att`` occurs more
        than once, the last value wins. A dict without a usable ``"feat"``
        value yields an empty dict.

    Raises:
        KeyError: If a feat entry lacks ``"att"`` or ``"val"``.
    """
    if not feats:
        return None

    # Irregular shape: several {"feat": ...} wrappers bundled in a list,
    # e.g. [ { "feat": {} }, { "feat": {} } ]. Unpack and merge one by one.
    if isinstance(feats, list):
        merged = {}
        for wrapper in feats:
            # An empty wrapper flattens to None; treat it as "no features"
            # instead of failing the merge.
            merged.update(flatten_feats(wrapper) or {})
        return merged

    # Regular shape: { "feat": [ {}, {} ] } or { "feat": {} }
    entries = feats.get("feat")
    if entries is None:
        return {}
    if not isinstance(entries, list):
        # A single feat is stored as a bare object rather than a list.
        entries = [entries]
    return {entry["att"]: entry["val"] for entry in entries}
# Convert the parsed dictionary JSON into Word objects.
def _sense_to_meaning(sense, entry_feats):
    """Build one WordMeaning from a single ``Sense`` object."""
    # Child elements may be stored as one object or as a list of objects,
    # so each is normalised to a list before iterating.
    equivalent = [
        flatten_feats(item)
        for item in flatten_arrays(sense.get("Equivalent") or [])
    ]
    examples = [
        flatten_feats(item)
        for item in flatten_arrays(sense.get("SenseExample") or [])
    ]
    # NOTE(review): WordForm is looked up on the sense, as the original code
    # did; confirm against the data whether it lives on the entry instead.
    form = [
        flatten_feats(item)
        for item in flatten_arrays(sense.get("WordForm") or [])
    ]
    # flatten_feats may return None; fall back to an empty mapping so the
    # lookups below cannot fail on it.
    feats = flatten_feats(sense) or {}
    return WordMeaning(
        id=int(sense["val"]),
        definition=feats.get("definition", None),
        syntacticPattern=feats.get("syntacticPattern", None),
        annotation=feats.get("annotation", None),
        equivalent=equivalent,
        form=form,
        type=entry_feats,
        examples=examples,
    )


def _entry_to_word(entry):
    """Build one Word from a single ``LexicalEntry`` object."""
    lemma = flatten_feats(entry["Lemma"]) or {}
    # Entry-level feats are the same for every sense, so flatten them once.
    raw_entry_feat = entry.get("feat")
    entry_feats = flatten_feats({"feat": raw_entry_feat}) if raw_entry_feat else {}
    meanings = [
        _sense_to_meaning(sense, entry_feats)
        for sense in flatten_arrays(entry.get("Sense") or [])
    ]
    return Word(
        writtenForm=lemma.get("writtenForm", None),
        variant=lemma.get("variant", None),
        meanings=meanings,
    )


def json_to_word(json):
    """Convert a parsed dictionary document into ``Word`` records.

    Prints the flattened ``GlobalInformation`` feats, the ``Lexicon`` and
    ``LexicalEntry`` key names as they are reached, and every ``Word`` built.

    Args:
        json: The decoded JSON document (a dict) containing a
            ``"LexicalResource"`` object.

    Returns:
        The list of ``Word`` records in document order; empty when the
        document has no ``"LexicalResource"`` content.
    """
    words: list[Word] = []
    for lexres_k, lexres_v in (json.get("LexicalResource") or {}).items():
        if lexres_k == "GlobalInformation":
            print(flatten_feats(lexres_v))
        if lexres_k == "Lexicon":
            print(lexres_k)
            for lex_k, lex_v in lexres_v.items():
                if lex_k == "LexicalEntry":
                    print(lex_k)
                    # A lone entry is stored as an object, not a list.
                    for entry in flatten_arrays(lex_v):
                        w = _entry_to_word(entry)
                        print(w)
                        words.append(w)
    return words
# Script entry point: parse IN_FILE only when run directly, so importing this
# module for its helpers does not trigger file I/O.
if __name__ == "__main__":
    # orjson.loads accepts bytes, so read the file in binary mode and skip
    # the intermediate str decode.
    with open(IN_FILE, "rb") as f:
        json_obj = orjson.loads(f.read())
    json_to_word(json_obj)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment