Skip to content

Instantly share code, notes, and snippets.

@hhsprings
Last active October 10, 2019 13:12
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save hhsprings/014339eed49db47373a25800bbc66884 to your computer and use it in GitHub Desktop.
Save hhsprings/014339eed49db47373a25800bbc66884 to your computer and use it in GitHub Desktop.
MeCab のユーザ辞書作成支援的ななにか (2)
# -*- coding: utf-8-unix -*-
#
# mecab-ipadic-neologd の辞書修正・追加の考え方に共感できないところがある
# ため取捨選択しての利用としたい。
#
import io
import os
import re
import csv
import logging
import sys
import unicodedata
from glob import glob
from collections import defaultdict
# Column names of one dictionary CSV row, in column order.  The rest of the
# script looks columns up by these (Japanese) names, so they must stay as-is.
fieldnames_all = [
    "表層形",        # surface form
    "左文脈ID",      # left context ID
    "右文脈ID",      # right context ID
    "コスト",        # cost
    "品詞",          # part of speech
    "品詞細分類1",   # part-of-speech subcategory 1
    "品詞細分類2",   # part-of-speech subcategory 2
    "品詞細分類3",   # part-of-speech subcategory 3
    "活用形1",       # conjugation field 1
    "活用形2",       # conjugation field 2
    "原形",          # original (base) form
    "読み",          # reading
    "発音",          # pronunciation
]

# Matches every whole line of a diff (as bytes) that starts with "+" or "-",
# except lines whose second character is also a sign (such as the "---" and
# "+++" file header lines).
_search_rgx = re.compile(rb"^[-+](?![-+]).*$", re.MULTILINE)
class _Splitter(object):
    """Split dictionary patch diffs into per-category CSV files.

    Instantiating the class runs the whole pipeline: collect the ``+``/``-``
    lines of every ``*.diff`` file, classify each entry by the kind of change
    it represents, and write one CSV file per dictionary file and category.
    """

    def __init__(self, args):
        """Run the pipeline.

        Args:
            args: Namespace providing ``diffdir``, ``distdir``,
                ``ign_cost_change``, ``ign_originalform_change`` and ``flat``.
        """
        self._args = args
        orig = self._collect(args.diffdir)
        result = self._categorize_difftypes(orig)
        self._write(result)

    def _collect(self, diffdir):
        """Gather the added/removed rows of every ``*.diff`` file.

        Args:
            diffdir: Directory to scan; a falsy value means the current
                directory.

        Returns:
            ``{type_key: {surface: [row, ...]}}`` where ``type_key`` is the
            diff file name without its ``.csv.<number>.diff`` suffix and each
            ``row`` is a list of CSV fields whose first element is the diff
            sign (``+`` or ``-``) followed by the surface form.

        Raises:
            UnicodeDecodeError: If a ``+``/``-`` line is not valid UTF-8.
        """
        diffdir = diffdir or "."
        result = {}
        for fn in sorted(glob(os.path.join(diffdir, "*.diff"))):
            type_key = re.sub(r"\.csv\.\d+\.diff$", "", os.path.basename(fn))
            if type_key not in result:
                result[type_key] = defaultdict(list)
            per_key = result[type_key]
            # Read as bytes and decode only the +/- lines: per the original
            # author, some diff files contain truncated byte sequences.
            with io.open(fn, "rb") as fi:
                raw = fi.read()
            for matched in _search_rgx.findall(raw):
                text = matched.decode("utf-8")
                sgn = text[0]
                # Parse the CSV payload without the sign, one line at a time:
                # a leading sign would hide the opening quote of a quoted
                # first field, and per-line parsing keeps an unbalanced quote
                # from swallowing the following lines.
                fields = next(csv.reader([text[1:]]), None)
                if not fields:
                    continue  # a bare "+" / "-" line carries no entry
                key = fields[0]
                line = [sgn + key] + fields[1:]
                entries = per_key[key]
                if sgn == "-" and self._cancel_addition(entries, line):
                    continue
                entries.append(line)
        return result

    @staticmethod
    def _cancel_addition(entries, removal):
        """Drop the latest ``+`` row in *entries* that *removal* undoes.

        Returns:
            True if an identical earlier addition was found and removed (so
            the removal itself must not be recorded), False otherwise.
        """
        for i in range(len(entries) - 1, -1, -1):
            prev = entries[i]
            if prev[0][0] == "+" and prev[1:] == removal[1:]:
                del entries[i]
                return True
        return False

    @staticmethod
    def _strip_sign(line):
        """Return a copy of *line* with the diff sign removed from field 0."""
        return [line[0][1:]] + line[1:]

    @staticmethod
    def _is_hankaku_variant(row, siblings, ctx_idxes):
        """Tell whether *row* matches the newest addition of any sibling.

        Args:
            row: Sign-less added row.
            siblings: Row lists of the other surface forms that share the
                same NFKC-normalized form as *row*.
            ctx_idxes: Column indexes that must agree.
        """
        for lines in siblings:
            plus = [line for line in lines if line[0][0] == "+"]
            # A sibling that was only removed has nothing to compare with.
            if plus and all(plus[-1][ix] == row[ix] for ix in ctx_idxes):
                return True
        return False

    def _categorize_change(self, lhs, rhs, per_diffkinds):
        """File one removed/added pair (a modified entry) under its category.

        Args:
            lhs: The removed row (sign still attached to field 0).
            rhs: The added row (sign still attached to field 0).
            per_diffkinds: ``{category: [row, ...]}`` mapping to append to.
        """
        changes = {
            name: (old, new)
            for name, old, new in zip(fieldnames_all[1:], lhs[1:], rhs[1:])
            if old != new}
        row = self._strip_sign(rhs)
        if '左文脈ID' in changes:
            # Original author's note: it is unclear why upstream expresses
            # this as a modification (e.g. 「高良」 can be a family name as
            # well as a given name).  For the purpose of carving out a user
            # dictionary, treating it as an addition is fine.
            per_diffkinds["+v"].append(row)
        elif '読み' in changes or '発音' in changes:
            # Original author's note: everything that lands here seems to be
            # a plain mistake in the original IPA dictionary.  The only
            # sub-case is whether the cost was adjusted as well.
            if self._args.ign_cost_change and "コスト" in changes:
                cidx = fieldnames_all.index("コスト")
                row[cidx] = changes["コスト"][0]  # restore the old value
            per_diffkinds["r"].append(row)
        else:  # cost only, original form only, or original form + cost
            ign_cc = self._args.ign_cost_change
            ign_oc = self._args.ign_originalform_change
            if ign_cc and ign_oc:
                return
            if ign_cc and "コスト" in changes:
                cidx = fieldnames_all.index("コスト")
                row[cidx] = changes["コスト"][0]  # restore the old value
            elif ign_oc and "原形" in changes:
                cidx = fieldnames_all.index("原形")
                row[cidx] = changes["原形"][0]  # restore the old value
            per_diffkinds["c"].append(row)

    def _categorize_difftypes(self, orig):
        """Classify the collected rows by kind of change.

        Args:
            orig: The mapping returned by ``_collect``.

        Returns:
            ``{type_key: {category: [row, ...]}}`` with sign-less rows.
            Categories are ``"+"`` (addition), ``"+h"`` (addition that only
            differs from another changed entry by NFKC normalization, e.g.
            full-width vs. half-width), ``"+v"`` (modification that changes
            the left context ID), ``"-"`` (deletion), ``"r"`` (reading or
            pronunciation changed) and ``"c"`` (any other modification).
        """
        # Columns that must agree for an addition to count as a mere
        # full-width/half-width variant of another entry.
        ctx_idxes = [fieldnames_all.index(name) for name in (
            "左文脈ID", "右文脈ID",
            "品詞", "品詞細分類1", "品詞細分類2", "品詞細分類3",
            "活用形1", "活用形2",
        )]
        result = {}
        for type_key, diffs in orig.items():
            per_diffkinds = defaultdict(list)
            # NFKC-normalized surface form -> all surface forms mapping to it,
            # built once so variants are found without rescanning every key.
            by_folded = defaultdict(list)
            for k in diffs:
                by_folded[unicodedata.normalize("NFKC", k)].append(k)
            for k, v in sorted(diffs.items()):
                # Pair removals with additions by sign, in order of
                # appearance; a pair is a modification.  Whatever is left
                # over is a plain deletion or addition.  (The list may be
                # empty when an addition was cancelled by a later removal.)
                removed = [line for line in v if line[0][0] == "-"]
                added = [line for line in v if line[0][0] == "+"]
                npairs = min(len(removed), len(added))
                for lhs, rhs in zip(removed, added):
                    self._categorize_change(lhs, rhs, per_diffkinds)
                for line in removed[npairs:]:
                    per_diffkinds["-"].append(self._strip_sign(line))
                if not added[npairs:]:
                    continue
                # Original author's note: many additions merely turn a
                # full-width form into a half-width one, and those should be
                # kept apart.  Only the diff is inspected; upstream seems to
                # always touch the full-width entry's original form in that
                # case, so the counterpart should be present.
                folded = unicodedata.normalize("NFKC", k)
                siblings = [diffs[k_in]
                            for k_in in by_folded[folded] if k_in != k]
                for line in added[npairs:]:
                    row = self._strip_sign(line)
                    if self._is_hankaku_variant(row, siblings, ctx_idxes):
                        per_diffkinds["+h"].append(row)
                    else:
                        per_diffkinds["+"].append(row)
            result[type_key] = per_diffkinds
        return result

    def _write(self, result):
        """Write every category of every dictionary file as UTF-8 CSV.

        Without ``flat`` the files go to ``<distdir>/<category>/<type>.csv``;
        with ``flat`` they go to ``<distdir>/<type>._<category>.csv``.  Rows
        are separated by ``"\\n"`` and the file has no trailing newline.

        Args:
            result: The mapping returned by ``_categorize_difftypes``.
        """
        dnm = {
            "+": "add",
            "+v": "add_var",
            "+h": "add_hankaku",
            "-": "del",
            "c": "cost",  # and original-form
            "r": "reading",
        }
        # type_key: per_diffkinds
        for type_key, per_diffkinds in result.items():
            for dty, contents in per_diffkinds.items():
                if self._args.flat:
                    dd = self._args.distdir
                    ofn = os.path.join(
                        dd, type_key + "._" + dnm[dty] + ".csv")
                else:
                    dd = os.path.join(self._args.distdir, dnm[dty])
                    ofn = os.path.join(dd, type_key + ".csv")
                os.makedirs(dd, exist_ok=True)
                # Use the csv module so fields containing commas or quotes
                # are quoted instead of corrupting the row.
                buf = io.StringIO()
                csv.writer(buf, lineterminator="\n").writerows(contents)
                text = buf.getvalue()[:-1]  # no newline after the last row
                with io.open(ofn, "wb") as fo:
                    fo.write(text.encode("utf-8"))
#
if __name__ == '__main__':
    # Command-line entry point: parse the options and run the splitter once.
    import argparse

    logging.basicConfig(stream=sys.stderr, level=logging.INFO)

    cli = argparse.ArgumentParser()
    # Directory holding the *.diff files; optional positional argument.
    cli.add_argument("diffdir", nargs="?")
    # Directory the generated CSV files are written to.
    cli.add_argument("--distdir", default=".")
    # Boolean switches, all off by default.
    for switch in ("--ign-cost-change", "--ign-originalform-change", "--flat"):
        cli.add_argument(switch, action="store_true")

    _Splitter(cli.parse_args())
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment