Skip to content

Instantly share code, notes, and snippets.

@kiwanami
Created January 12, 2012 15:34
Show Gist options
  • Star 2 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save kiwanami/1601147 to your computer and use it in GitHub Desktop.
Save kiwanami/1601147 to your computer and use it in GitHub Desktop.
Emacsで日本語逆変換
;;; decompjp.el --- decompose Japanese text.
;; Copyright (C) 2011 SAKURAI Masashi
;; Author: SAKURAI Masashi <m.sakurai at kiwanami.net>
;; Keywords: languages, tools
;; This program is free software; you can redistribute it and/or modify
;; it under the terms of the GNU General Public License as published by
;; the Free Software Foundation, either version 3 of the License, or
;; (at your option) any later version.
;; This program is distributed in the hope that it will be useful,
;; but WITHOUT ANY WARRANTY; without even the implied warranty of
;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
;; GNU General Public License for more details.
;; You should have received a copy of the GNU General Public License
;; along with this program. If not, see <http://www.gnu.org/licenses/>.
;;; Commentary:
;; MeCabなどを利用して、逆変換を試みるプログラム。
;;
;; dcj:reverse-translate コマンドを実行すると、現在の位置から適当な位置
;; まで形態素解析して逆変換を試みる。リージョンが指定されてあれば、その
;; 範囲を逆変換する。
;;
;; ibus-mozc, anthy など、一般的な変換プログラムで利用できるはず。
;;; Code:
(eval-when-compile (require 'cl))
;;==================================================
;; Customize
(defvar dcj:key-commit ?\ "変換開始キー")
(defvar dcj:romaji-kana-table
'(( "a" . "あ") ( "i" . "い") ( "u" . "う") ( "e" . "え") ( "o" . "お")
("ka" . "か") ("ki" . "き") ("ku" . "く") ("ke" . "け") ("ko" . "こ")
("sa" . "さ") ("si" . "し") ("su" . "す") ("se" . "せ") ("so" . "そ")
("ta" . "た") ("ti" . "ち") ("tu" . "つ") ("te" . "て") ("to" . "と")
("na" . "な") ("ni" . "に") ("nu" . "ぬ") ("ne" . "ね") ("no" . "の")
("ha" . "は") ("hi" . "ひ") ("hu" . "ふ") ("he" . "へ") ("ho" . "ほ")
("ma" . "ま") ("mi" . "み") ("mu" . "む") ("me" . "め") ("mo" . "も")
("ya" . "や") ("yu" . "ゆ") ("yo" . "よ")
("ra" . "ら") ("ri" . "り") ("ru" . "る") ("re" . "れ") ("ro" . "ろ")
("wa" . "わ") ("wi" . "ゐ") ("wu" . "う") ("we" . "ゑ") ("wo" . "を")
("n'" . "ん") ( "n" . "ん") ("m'" . "ん") ( "m" . "ん")
("ga" . "が") ("gi" . "ぎ") ("gu" . "ぐ") ("ge" . "げ") ("go" . "ご")
("za" . "ざ") ("zi" . "じ") ("zu" . "ず") ("ze" . "ぜ") ("zo" . "ぞ")
("da" . "だ") ("di" . "ぢ") ("du" . "づ") ("de" . "で") ("do" . "ど")
("ba" . "ば") ("bi" . "び") ("bu" . "ぶ") ("be" . "べ") ("bo" . "ぼ")
("pa" . "ぱ") ("pi" . "ぴ") ("pu" . "ぷ") ("pe" . "ぺ") ("po" . "ぽ")
("kya" . "きゃ") ("kyu" . "きゅ") ("kye" . "きぇ") ("kyo" . "きょ")
("sya" . "しゃ") ("syu" . "しゅ") ("sye" . "しぇ") ("syo" . "しょ")
("sha" . "しゃ") ("shu" . "しゅ") ("she" . "しぇ") ("sho" . "しょ")
("cha" . "ちゃ") ("chu" . "ちゅ") ("che" . "ちぇ") ("cho" . "ちょ")
("cya" . "ちゃ") ("cyu" . "ちゅ") ("cye" . "ちぇ") ("cyo" . "ちょ")
("tya" . "ちゃ") ("tyu" . "ちゅ") ("tye" . "ちぇ") ("tyo" . "ちょ")
("nya" . "にゃ") ("nyu" . "にゅ") ("nye" . "にぇ") ("nyo" . "にょ")
("hya" . "ひゃ") ("hyu" . "ひゅ") ("hye" . "ひぇ") ("hyo" . "ひょ")
("mya" . "みゃ") ("myu" . "みゅ") ("mye" . "みぇ") ("myo" . "みょ")
("rya" . "りゃ") ("ryu" . "りゅ") ("rye" . "りぇ") ("ryo" . "りょ")
("lya" . "りゃ") ("lyu" . "りゅ") ("lye" . "りぇ") ("lyo" . "りょ")
("gya" . "ぎゃ") ("gyu" . "ぎゅ") ("gye" . "ぎぇ") ("gyo" . "ぎょ")
("zya" . "じゃ") ("zyu" . "じゅ") ("zye" . "じぇ") ("zyo" . "じょ")
("jya" . "じゃ") ("jyu" . "じゅ") ("jye" . "じぇ") ("jyo" . "じょ")
( "ja" . "じゃ") ( "ju" . "じゅ") ( "je" . "じぇ") ( "jo" . "じょ")
("bya" . "びゃ") ("byu" . "びゅ") ("bye" . "びぇ") ("byo" . "びょ")
("pya" . "ぴゃ") ("pyu" . "ぴゅ") ("pye" . "ぴぇ") ("pyo" . "ぴょ")
("kwa" . "くゎ") ("kwi" . "くぃ") ("kwe" . "くぇ") ("kwo" . "くぉ")
("tsa" . "つぁ") ("tsi" . "つぃ") ("tse" . "つぇ") ("tso" . "つぉ")
( "fa" . "ふぁ") ( "fi" . "ふぃ") ( "fe" . "ふぇ") ( "fo" . "ふぉ")
("gwa" . "ぐゎ") ("gwi" . "ぐぃ") ("gwe" . "ぐぇ") ("gwo" . "ぐぉ")
("dyi" . "でぃ") ("dyu" . "どぅ") ("dye" . "でぇ") ("dyo" . "どぉ")
("dhi" . "でぃ") ("dhu" . "どぅ") ("dhe" . "でぇ") ("dho" . "どぉ")
("shi" . "し") ("tyi" . "てぃ") ("thi" . "てぃ")
("chi" . "ち") ("tsu" . "つ") ("ji" . "じ")
( "fu" . "ふ") ( "ye" . "いぇ")
("va" . "ヴぁ") ("vi" . "ヴぃ") ("vu" . "ヴ") ("ve" . "ヴぇ") ("vo" . "ヴぉ")
( "xa" . "ぁ") ( "xi" . "ぃ") ( "xu" . "ぅ") ( "xe" . "ぇ") ( "xo" . "ぉ")
( "la" . "ぁ") ( "li" . "ぃ") ( "lu" . "ぅ") ( "le" . "ぇ") ("lo" . "ぉ")
("xtu" . "っ") ("xya" . "ゃ") ("xyu" . "ゅ") ("xyo" . "ょ") ("xwa" . "ゎ")
("ltu" . "っ") ("lya" . "ゃ") ("lyu" . "ゅ") ("lyo" . "ょ") ("lwa" . "ゎ")
("xka" . "ヵ") ("xke" . "ヶ")
("1" . "1") ("2" . "2") ("3" . "3") ("4" . "4") ("5" . "5")
("6" . "6") ("7" . "7") ("8" . "8") ("9" . "9") ("0" . "0")
("!" . "!") ("@" . "@") ("#" . "#") ("$" . "$") ("%" . "%")
("^" . "^") ("&" . "&") ("*" . "*") ("(" . "(") (")" . ")")
("-" . "ー") ("=" . "=") ("`" . "`") ("\\" . "¥") ("|" . "|")
("_" . "_") ("+" . "+") ("~" . " ̄") ("[" . "「") ("]" . "」")
("{" . "{") ("}" . "}") (":" . ":") (";" . ";") ("\"" . "”")
("'" . "’") ("." . "。") ("," . "、") ("<" . "<") (">" . ">")
("?" . "?") ("/" . "/")
))
(defvar dcj:kana-regexp
(regexp-opt
(mapcar 'cdr (sort (copy-sequence dcj:romaji-kana-table)
(lambda (i j) (> (length (cdr i))
(length (cdr j)))))))
"かなを拾う正規表現。 dcj:romaji-kana-table を更新したらこの変数も更新すること。")
;;==================================================
;; 逆変換
(defvar dcj:reverse-delimiter-regexp
(concat "[][\n\r\t "
",.?!()-=+|/&$#@~:;\"{}<>"
"、。 ※・?!()=+|&$#@:;”’"
"{}<>「」【】○●◎■□◇◆△▲▽▼]")
"逆変換の先頭を探す目印の正規表現")
(defvar dcj:reverse-through-delimiters
"、。※・?!()=+|&$#@:;”’{}<>「」【】○●◎■□◇◆△▲▽▼"
"直前ならやっぱり逆変換に含めたい文字列")
(defun dcj:reverse-search-begin ()
"現在のポイント位置から前に走査して、逆変換を開始する位置を返す。
見つからなかったら現在のポイント位置を返す。"
(save-excursion
(when (and ; 直前の文字が through-delimiters に含まれていれば無視する
(char-before)
(loop with prechar = (char-before)
for i across dcj:reverse-through-delimiters
if (eql i prechar)
return t finally return nil))
(goto-char (1- (point))))
(or
(and (re-search-backward dcj:reverse-delimiter-regexp nil t)
(1+ (point)))
(line-beginning-position))))
(defvar dcj:reverse-translate-driver 'dcj:reverse-translate-driver-mecab "形態素解析に何を使うか。
今のところ 'dcj:reverse-translate-driver-mecab / 'dcj:reverse-translate-driver-chasen が使える。
ドライバーは変換済み文字列を受け取って、元の単語と読みのひらがなの alist を返す。")
(defun dcj:reverse-translate-driver-mecab (kanji)
"MeCab driver"
(let (result)
(with-temp-buffer
(call-process-shell-command
(format "echo '%s' | mecab " kanji) nil t)
(goto-char (point-min))
(while (re-search-forward "^\\([^\t]+\\)\t\\(.*\\)$" nil t)
(let* ((org (match-string 1))
(cols (match-string 2))
(yomi (nth 7 (split-string cols ","))))
(push (cons org
(japanese-hiragana
(if yomi yomi org)))
result))))
(nreverse result)))
(defun dcj:reverse-translate-driver-chasen (kanji)
"ChaSen driver"
(let (result)
(with-temp-buffer
(call-process-shell-command
(format "echo '%s' | chasen " kanji) nil t)
(goto-char (point-min))
(while (re-search-forward "^\\([^\t]+\\)\t\\([^\t]*\\)\t.*$" nil t)
(let* ((org (match-string 1))
(yomi (match-string 2)))
(push (cons org
(japanese-hiragana
(if (or (string-match "[0-9a-zA-Z]" org)
(or (null yomi) (= 0 (length yomi))))
org yomi)))
result))))
(nreverse result)))
(defun dcj:reverse-translate-driver (kanji)
"漢字からひらがな・単語区切りにする。実際にはドライバーに丸投げする。"
(funcall dcj:reverse-translate-driver kanji))
(defun dcj:reverse-kanji-to-kana (kanji)
"漢字からひらがなにする。"
(let ((words (dcj:reverse-translate-driver kanji)))
(when words
(loop for (org . kana) in words
concat kana))))
(defun dcj:reverse-kana-to-romaji (source)
"アルファベットローマ字にする。"
(with-temp-buffer
(insert source)
(goto-char (point-min))
(while (re-search-forward dcj:kana-regexp nil t)
(let ((match (match-string 0)))
(replace-match
(car (rassoc match dcj:romaji-kana-table)))))
(buffer-string)))
(defun dcj:reverse-translate ()
"現在のポイント位置から逆変換を試みる"
(interactive)
(let ((poss (if (region-active-p)
(min (mark) (point))
(dcj:reverse-search-begin)))
(pose (if (region-active-p)
(max (mark) (point)) (point)))
org romaji)
(when (< poss pose)
(setq org (buffer-substring poss pose)
romaji (dcj:reverse-kana-to-romaji
(dcj:reverse-kanji-to-kana org)))
(delete-region poss pose)
(setq unread-command-events
(append unread-command-events
(listify-key-sequence romaji) (list dcj:key-commit))))))
(defun dcj:reverse-test ()
"逆変換のテスト"
(interactive)
(pop-to-buffer "*dcj-kanji-to-romaji-test*")
(erase-buffer)
(loop for q in '(dcj:reverse-translate-driver-mecab) ; '(dcj:reverse-translate-driver-mecab dcj:reverse-translate-driver-chasen)
do
(let ((dcj:reverse-translate-driver q))
(insert "==(" (symbol-name q) ")==========\n")
(loop for (in out) in
'(("太郎は花子が好きだ" "tarouhahanakogasukida") ; 普通
("今日は良い天気ですね" "kyouhayoiten'kidesune") ; ん
("逆変換を試みる" "gyakuhen'kan'wokokoromiru") ; ん
("東京特許許可局" "toukyoutoxtukyokyokakyoku") ; 小さいつ
("1太郎は花子が好きだ" "1tarouhahanakogasukida") ; 数字込み
("デフォルトのカーソールの色" "deforutonoka-so-runoiro") ; 未知語
("AはBを含む。" "AhaBwohukumu.") ; アルファベット込み
("MySQLにJSONを登録する。" "MySQLniJSONwotourokusuru.")) ; 英単語込み
for kana = (dcj:reverse-kanji-to-kana in)
for romaji = (dcj:reverse-kana-to-romaji kana)
do
(insert (format "%s : %s / %s / %s <= %s\n"
(if (equal romaji out) "OK" "NG")
out romaji kana in))))))
;; (progn (eval-current-buffer) (dcj:reverse-test))
(provide 'decompjp)
;;; decompjp.el ends here
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment