Last active
October 10, 2019 13:12
-
-
Save hhsprings/014339eed49db47373a25800bbc66884 to your computer and use it in GitHub Desktop.
MeCab のユーザ辞書作成支援的ななにか (2)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# -*- coding: utf-8-unix -*- | |
# | |
# mecab-ipadic-neologd の辞書修正・追加の考え方に共感できないところがある | |
# ため取捨選択しての利用としたい。 | |
# | |
import io | |
import os | |
import re | |
import csv | |
import logging | |
import sys | |
import unicodedata | |
from glob import glob | |
from collections import defaultdict | |
# Column layout of one IPA-dictionary CSV row (13 fields):
# surface form, left/right context IDs, cost, POS + three sub-POS levels,
# conjugation type and form, base form, reading, pronunciation.
fieldnames_all = [
    "表層形",
    "左文脈ID", "右文脈ID",
    "コスト",
    "品詞", "品詞細分類1", "品詞細分類2", "品詞細分類3",
    "活用形1", "活用形2",
    "原形", "読み", "発音"
]
# Matches one added ("+") or removed ("-") body line of a unified diff;
# the negative lookahead skips the "+++" / "---" file-header lines.
# Bytes pattern because the diff files are read as raw bytes (some contain
# byte sequences that are not valid UTF-8 as a whole — see _Splitter._collect).
_search_rgx = re.compile(rb"^[-+](?![-+]).*$", flags=re.M)
class _Splitter(object): | |
def __init__(self, args): | |
self._args = args | |
orig = self._collect(args.diffdir) | |
result = self._categorize_difftypes(orig) | |
self._write(result) | |
def _collect(self, diffdir): | |
diffdir = diffdir or "." | |
result = {} | |
for fn in sorted(glob(os.path.join(diffdir, "*.diff"))): | |
type_key = re.sub(r"\.csv\.\d+\.diff$", "", os.path.basename(fn)) | |
if type_key not in result: | |
result[type_key] = defaultdict(list) | |
raw = io.open(fn, "rb").read() # バイトの欠落してるのがいる | |
targets = [line.decode("utf-8") | |
for line in _search_rgx.findall(raw)] | |
for line in csv.reader(targets): | |
sgn, key = line[0][0], line[0][1:] | |
if result[type_key][key]: | |
if sgn == "-": | |
trash = False | |
for i in range( | |
len(result[type_key][key]) - 1, -1, -1): | |
prev = result[type_key][key][i] | |
if prev[0][0] == "+" and all([prev[j] == line[j] | |
for j in range(1, len(prev))]): | |
result[type_key][key].pop(i) | |
trash = True | |
break | |
if trash: | |
continue | |
result[type_key][key].append(line) | |
return result | |
def _categorize_difftypes(self, orig): | |
def _difftype(diffpair): | |
if len(diffpair) == 1: | |
t = diffpair[0] | |
return t[0][0], [], [t[0][1:]] + t[1:] | |
else: | |
lhs, rhs = diffpair | |
changes = { | |
fieldnames_all[i]: (lhs[i], rhs[i]) | |
for i in range(1, len(fieldnames_all)) | |
if lhs[i] != rhs[i]} | |
return "M", changes, [rhs[0][1:]] + rhs[1:] | |
# | |
result = {} | |
for type_key, diffs in orig.items(): | |
per_diffkinds = defaultdict(list) | |
for k, v in sorted(diffs.items()): | |
# 今のところ「ひとつ追加 or 削除」「ひとつ変更」「ふたつ変更」 | |
# の3パターンしかないみたい。 | |
if len(v) < 2: | |
dty = _difftype(v) | |
if dty[0] == "-": | |
per_diffkinds[dty[0]].append(dty[2]) | |
else: | |
# 全角を半角に変えただけのものが多い。これは分別したい。 | |
# (ただし、あくまでも「差分」だけをみている。これを | |
# している場合は必ず全角バージョンのほうの「原形」を | |
# 変えているようなので、見つかるはず。) | |
nk = unicodedata.normalize("NFKC", k) | |
for k_in, v_in in diffs.items(): | |
if k_in != k: | |
nk_in = unicodedata.normalize("NFKC", k_in) | |
if nk_in == nk: | |
cmptrg = [line for line in v_in if line[0][0] == "+"][-1] | |
ctk_all = [ | |
"左文脈ID", "右文脈ID", | |
"品詞", "品詞細分類1", "品詞細分類2", "品詞細分類3", | |
"活用形1", "活用形2", | |
] | |
idxes = [fieldnames_all.index(ctk) for ctk in ctk_all] | |
if all([cmptrg[ix] == dty[2][ix] for ix in idxes]): | |
per_diffkinds["+h"].append(dty[2]) | |
break | |
else: | |
per_diffkinds[dty[0]].append(dty[2]) | |
else: | |
for i in range(len(v) // 2): | |
dty = _difftype((v[i], v[len(v) // 2 + i])) | |
mf = dty[1].keys() | |
if '左文脈ID' in mf: | |
# なぜ変更で対応しようとするのか理解不能。 | |
# たとえば「高良」は性でも名でもありえる。 | |
# なぜ名前だけだと思い込む。 | |
# ユーザ辞書として切り出す目的の場合は | |
# これは「追加」でいい。 | |
per_diffkinds["+v"].append(dty[2]) | |
elif '読み' in mf or '発音' in mf: | |
# どうもこれに該当するものは全てオリジナル | |
# IPA辞書の凡ミスらしい。 | |
# これの細分類として、コストも調整してるかどうか。 | |
if self._args.ign_cost_change and "コスト" in dty[1]: | |
cidx = fieldnames_all.index("コスト") | |
dty[2][cidx] = dty[1]["コスト"][0] # 元のに戻す | |
per_diffkinds["r"].append(dty[2]) | |
else: # コストのみ or 原形のみ or 原形+コスト | |
ign_cc = self._args.ign_cost_change | |
ign_oc = self._args.ign_originalform_change | |
if ign_cc and ign_oc: | |
continue | |
if ign_cc and "コスト" in dty[1]: | |
cidx = fieldnames_all.index("コスト") | |
dty[2][cidx] = dty[1]["コスト"][0] # 元のに戻す | |
elif ign_oc and "原形" in dty[1]: | |
cidx = fieldnames_all.index("原形") | |
dty[2][cidx] = dty[1]["原形"][0] # 元のに戻す | |
per_diffkinds["c"].append(dty[2]) | |
result[type_key] = per_diffkinds | |
return result | |
def _write(self, result): | |
dnm = { | |
"+": "add", | |
"+v": "add_var", | |
"+h": "add_hankaku", | |
"-": "del", | |
"c": "cost", # and original-form | |
"r": "reading", | |
} | |
# type_key: per_diffkinds | |
for type_key, per_diffkinds in result.items(): | |
for dty, contents in per_diffkinds.items(): | |
#print(type_key, dty, contents[0]) | |
dd = self._args.distdir | |
if not self._args.flat: | |
dd = os.path.join(self._args.distdir, dnm[dty]) | |
if not os.path.exists(dd): | |
os.makedirs(dd) | |
if not self._args.flat: | |
ofn = os.path.join(dd, type_key + ".csv") | |
else: | |
ofn = os.path.join(dd, type_key + "._" + dnm[dty] + ".csv") | |
with io.open(ofn, "wb") as fo: | |
fo.write(("\n".join( | |
[",".join(line) | |
for line in contents | |
])).encode("utf-8")) | |
# | |
if __name__ == '__main__':
    import argparse

    # Log to stderr so the generated CSVs on stdout/disk stay clean.
    logging.basicConfig(stream=sys.stderr, level=logging.INFO)
    ap = argparse.ArgumentParser()
    ap.add_argument("diffdir", nargs="?")
    ap.add_argument("--distdir", default=".")
    # All remaining options are simple on/off switches.
    for flag in ("--ign-cost-change", "--ign-originalform-change", "--flat"):
        ap.add_argument(flag, action="store_true")
    _Splitter(ap.parse_args())
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment