Analyze conditional substructures of XML files exported from the Standard Korean Language Dictionary (標準國語大辭典)

This script inductively analyzes the relationships between conditional values and their dependent substructures in XML files exported from the official website of the Standard Korean Language Dictionary (標準國語大辭典).¹

Tested with Python 3.9, but it would probably work with Python 3.8, and maybe even 3.7.

Usage is simple: just pass all of the exported dictionary XML files as arguments:

./analyze.py ./*.xml

Here's an example result (as of July 2022):

<pos>
  (Common): comm_pattern_info, pos_code
  동사:
  구:
  명사:
  품사 없음:
  부사:
  형용사:
  어미:
  접사:
  의존 명사:
  대명사:
  관형사:
  보조 동사:
  감탄사:
  조사:
  수사:

<unit>
  (Common): link, link_target_code, type, word
  의미:
  어휘:

<word_type>
  (Common): lexical_info, origin, original_language_info, pos_info, relation_info, word, word_unit
  고유어: conju_info, pronunciation_info
  한자어: conju_info, pronunciation_info
  외래어: allomorph
  혼종어: conju_info, pronunciation_info

<word_unit>
  (Common): pos_info, word
  단어: allomorph, conju_info, lexical_info, origin, original_language_info, pronunciation_info, relation_info, word_type
  속담:
  구: lexical_info, original_language_info, word_type
  관용구:
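
In each block above, the (Common) row lists the tags shared by every case, and each case row lists only its extras. To see how these mappings arise, here's a minimal sketch that feeds a synthetic fragment to the script's analyze() function (it assumes the script below is saved as analyze.py; the nesting mimics the real export format, but the fragment itself is made up):

import io

from analyze import analyze  # the script below, saved as analyze.py

fragment = '''\
<channel>
  <item>
    <word_info>
      <word>foo</word>
      <word_unit>구</word_unit>
      <lexical_info/>
      <original_language_info/>
      <word_type>고유어</word_type>
    </word_info>
  </item>
</channel>
'''
result = analyze(io.BytesIO(fragment.encode('utf-8')))
# All sibling tags observed next to <word_unit>구</word_unit>,
# minus the discriminator itself:
print(sorted(result['word_unit']['구']))
# ['lexical_info', 'original_language_info', 'word', 'word_type']

The same pass also fills the <word_type> table: result['word_type']['고유어'] ends up with word, word_unit, lexical_info, and original_language_info.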

I know it's trivial, but it's distributed under the GPLv3 or later.

Footnotes

  1. For your information, you need an account on stdict.korean.go.kr to download exported XML files.

#!/usr/bin/env python3
# Analyze conditional substructures of Korean Standard Language Dictionary XMLs
# Copyright (C) 2022 Hong Minhee <https://hongminhee.org/>
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>.
import dataclasses
import functools
import multiprocessing
import os
import pathlib
import sys
from typing import BinaryIO, Dict, Iterable, List, Set

try:
    from lxml import etree  # prefer lxml when available
except ImportError:
    from xml.etree import ElementTree as etree

# Elements whose text value conditions which sibling elements may appear:
DISCRIMINATORS = {'word_unit', 'word_type', 'pos', 'unit'}


@dataclasses.dataclass(init=True, repr=True)
class Element:
    tag: str
    # Discriminator values seen among this element's children so far:
    discriminators: Dict[str, str] = dataclasses.field(default_factory=dict)
    # Tags of every child element seen so far:
    collected_tags: Set[str] = dataclasses.field(default_factory=set)


def analyze(input_: BinaryIO) -> Dict[str, Dict[str, Set[str]]]:
    result: Dict[str, Dict[str, Set[str]]] = {tag: {} for tag in DISCRIMINATORS}
    stack: List[Element] = []
    for event, element in etree.iterparse(input_, events=('start', 'end')):
        if event == 'start':
            stack.append(Element(element.tag))
            continue
        popped = stack.pop()
        assert popped.tag == element.tag, f'{stack} != {element.tag}'
        if stack:
            el = stack[-1]
            if element.tag in DISCRIMINATORS:
                el.discriminators[element.tag] = element.text
            el.collected_tags.add(element.tag)
            # Attribute the sibling tags collected so far to every
            # discriminator value observed under the same parent:
            for discriminator, case in el.discriminators.items():
                result[discriminator] \
                    .setdefault(case, set()) \
                    .update(el.collected_tags - {discriminator})
    return result
def merge_analytics(
    analytics: Iterable[Dict[str, Dict[str, Set[str]]]]
) -> Dict[str, Dict[str, Set[str]]]:
    merged: Dict[str, Dict[str, Set[str]]] = {tag: {} for tag in DISCRIMINATORS}
    for analytic in analytics:
        for discriminator, cases in analytic.items():
            for case, tags in cases.items():
                merged[discriminator].setdefault(case, set()).update(tags)
    return merged
def print_analytics(
    analytics: Dict[str, Dict[str, Set[str]]],
    *, file=sys.stdout
) -> None:
    started = False
    for discriminator, cases in sorted(analytics.items(), key=lambda p: p[0]):
        if started:
            print(file=file)
        print(f'<{discriminator}>', file=file)
        # Tags that appear under every case are factored out as "(Common)";
        # each case then lists only its extra tags.
        common_tags: Set[str] = \
            functools.reduce(set.intersection, cases.values()) \
            if cases \
            else set()
        print(f'  (Common): {", ".join(sorted(common_tags))}', file=file)
        for case, tags in cases.items():
            print(
                f'  {case}: {", ".join(sorted(tags - common_tags))}',
                file=file
            )
        started = True


def analyze_file(filename: os.PathLike) -> Dict[str, Dict[str, Set[str]]]:
    with open(filename, 'rb') as f:
        return analyze(f)


def main():
    if len(sys.argv) < 2:
        print('error: too few arguments', file=sys.stderr)
        print('usage:', sys.argv[0], 'FILE...', file=sys.stderr)
        raise SystemExit(1)
    files = [pathlib.Path(p) for p in sys.argv[1:]]
    if len(files) == 1:
        result = analyze_file(files[0])
    else:
        # Analyze files in parallel across CPU cores, then merge:
        with multiprocessing.Pool() as pool:
            result = merge_analytics(pool.imap_unordered(analyze_file, files))
    print_analytics(result)


if __name__ == '__main__':
    main()
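
For what it's worth, merging is just a per-(discriminator, case) set union, so the result is independent of the order in which files are analyzed. Here's a toy check of merge_analytics() with made-up data (again assuming the script is importable as analyze):

from analyze import DISCRIMINATORS, merge_analytics

# Two partial results shaped like analyze() output (made-up data):
a = {tag: {} for tag in DISCRIMINATORS}
b = {tag: {} for tag in DISCRIMINATORS}
a['pos']['명사'] = {'word'}
b['pos']['명사'] = {'origin'}
b['pos']['동사'] = set()

merged = merge_analytics([a, b])
assert merged['pos'] == {'명사': {'origin', 'word'}, '동사': set()}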