Create a gist now

Instantly share code, notes, and snippets.

What would you like to do?
Emacsで日本語逆変換
;;; decompjp.el --- decompose Japanese text.
;; Copyright (C) 2011 SAKURAI Masashi
;; Author: SAKURAI Masashi <m.sakurai at kiwanami.net>
;; Keywords: languages, tools
;; This program is free software; you can redistribute it and/or modify
;; it under the terms of the GNU General Public License as published by
;; the Free Software Foundation, either version 3 of the License, or
;; (at your option) any later version.
;; This program is distributed in the hope that it will be useful,
;; but WITHOUT ANY WARRANTY; without even the implied warranty of
;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
;; GNU General Public License for more details.
;; You should have received a copy of the GNU General Public License
;; along with this program. If not, see <http://www.gnu.org/licenses/>.
;;; Commentary:
;; MeCabなどを利用して、逆変換を試みるプログラム。
;;
;; dcj:reverse-translate コマンドを実行すると、現在の位置から適当な位置
;; まで形態素解析して逆変換を試みる。リージョンが指定されてあれば、その
;; 範囲を逆変換する。
;;
;; ibus-mozc, anthy など、一般的な変換プログラムで利用できるはず。
;;; Code:
(eval-when-compile (require 'cl))
;;==================================================
;; Customize
(defvar dcj:key-commit ?\ "変換開始キー")
(defvar dcj:romaji-kana-table
'(( "a" . "") ( "i" . "") ( "u" . "") ( "e" . "") ( "o" . "")
("ka" . "") ("ki" . "") ("ku" . "") ("ke" . "") ("ko" . "")
("sa" . "") ("si" . "") ("su" . "") ("se" . "") ("so" . "")
("ta" . "") ("ti" . "") ("tu" . "") ("te" . "") ("to" . "")
("na" . "") ("ni" . "") ("nu" . "") ("ne" . "") ("no" . "")
("ha" . "") ("hi" . "") ("hu" . "") ("he" . "") ("ho" . "")
("ma" . "") ("mi" . "") ("mu" . "") ("me" . "") ("mo" . "")
("ya" . "") ("yu" . "") ("yo" . "")
("ra" . "") ("ri" . "") ("ru" . "") ("re" . "") ("ro" . "")
("wa" . "") ("wi" . "") ("wu" . "") ("we" . "") ("wo" . "")
("n'" . "") ( "n" . "") ("m'" . "") ( "m" . "")
("ga" . "") ("gi" . "") ("gu" . "") ("ge" . "") ("go" . "")
("za" . "") ("zi" . "") ("zu" . "") ("ze" . "") ("zo" . "")
("da" . "") ("di" . "") ("du" . "") ("de" . "") ("do" . "")
("ba" . "") ("bi" . "") ("bu" . "") ("be" . "") ("bo" . "")
("pa" . "") ("pi" . "") ("pu" . "") ("pe" . "") ("po" . "")
("kya" . "きゃ") ("kyu" . "きゅ") ("kye" . "きぇ") ("kyo" . "きょ")
("sya" . "しゃ") ("syu" . "しゅ") ("sye" . "しぇ") ("syo" . "しょ")
("sha" . "しゃ") ("shu" . "しゅ") ("she" . "しぇ") ("sho" . "しょ")
("cha" . "ちゃ") ("chu" . "ちゅ") ("che" . "ちぇ") ("cho" . "ちょ")
("cya" . "ちゃ") ("cyu" . "ちゅ") ("cye" . "ちぇ") ("cyo" . "ちょ")
("tya" . "ちゃ") ("tyu" . "ちゅ") ("tye" . "ちぇ") ("tyo" . "ちょ")
("nya" . "にゃ") ("nyu" . "にゅ") ("nye" . "にぇ") ("nyo" . "にょ")
("hya" . "ひゃ") ("hyu" . "ひゅ") ("hye" . "ひぇ") ("hyo" . "ひょ")
("mya" . "みゃ") ("myu" . "みゅ") ("mye" . "みぇ") ("myo" . "みょ")
("rya" . "りゃ") ("ryu" . "りゅ") ("rye" . "りぇ") ("ryo" . "りょ")
("lya" . "りゃ") ("lyu" . "りゅ") ("lye" . "りぇ") ("lyo" . "りょ")
("gya" . "ぎゃ") ("gyu" . "ぎゅ") ("gye" . "ぎぇ") ("gyo" . "ぎょ")
("zya" . "じゃ") ("zyu" . "じゅ") ("zye" . "じぇ") ("zyo" . "じょ")
("jya" . "じゃ") ("jyu" . "じゅ") ("jye" . "じぇ") ("jyo" . "じょ")
( "ja" . "じゃ") ( "ju" . "じゅ") ( "je" . "じぇ") ( "jo" . "じょ")
("bya" . "びゃ") ("byu" . "びゅ") ("bye" . "びぇ") ("byo" . "びょ")
("pya" . "ぴゃ") ("pyu" . "ぴゅ") ("pye" . "ぴぇ") ("pyo" . "ぴょ")
("kwa" . "くゎ") ("kwi" . "くぃ") ("kwe" . "くぇ") ("kwo" . "くぉ")
("tsa" . "つぁ") ("tsi" . "つぃ") ("tse" . "つぇ") ("tso" . "つぉ")
( "fa" . "ふぁ") ( "fi" . "ふぃ") ( "fe" . "ふぇ") ( "fo" . "ふぉ")
("gwa" . "ぐゎ") ("gwi" . "ぐぃ") ("gwe" . "ぐぇ") ("gwo" . "ぐぉ")
("dyi" . "でぃ") ("dyu" . "どぅ") ("dye" . "でぇ") ("dyo" . "どぉ")
("dhi" . "でぃ") ("dhu" . "どぅ") ("dhe" . "でぇ") ("dho" . "どぉ")
("shi" . "") ("tyi" . "てぃ") ("thi" . "てぃ")
("chi" . "") ("tsu" . "") ("ji" . "")
( "fu" . "") ( "ye" . "いぇ")
("va" . "ヴぁ") ("vi" . "ヴぃ") ("vu" . "") ("ve" . "ヴぇ") ("vo" . "ヴぉ")
( "xa" . "") ( "xi" . "") ( "xu" . "") ( "xe" . "") ( "xo" . "")
( "la" . "") ( "li" . "") ( "lu" . "") ( "le" . "") ("lo" . "")
("xtu" . "") ("xya" . "") ("xyu" . "") ("xyo" . "") ("xwa" . "")
("ltu" . "") ("lya" . "") ("lyu" . "") ("lyo" . "") ("lwa" . "")
("xka" . "") ("xke" . "")
("1" . "") ("2" . "") ("3" . "") ("4" . "") ("5" . "")
("6" . "") ("7" . "") ("8" . "") ("9" . "") ("0" . "")
("!" . "") ("@" . "") ("#" . "") ("$" . "") ("%" . "")
("^" . "") ("&" . "") ("*" . "") ("(" . "") (")" . "")
("-" . "") ("=" . "") ("`" . "") ("\\" . "") ("|" . "")
("_" . "_") ("+" . "") ("~" . "") ("[" . "") ("]" . "")
("{" . "") ("}" . "") (":" . "") (";" . "") ("\"" . "")
("'" . "") ("." . "") ("," . "") ("<" . "") (">" . "")
("?" . "") ("/" . "")
))
(defvar dcj:kana-regexp
(regexp-opt
(mapcar 'cdr (sort (copy-sequence dcj:romaji-kana-table)
(lambda (i j) (> (length (cdr i))
(length (cdr j)))))))
"かなを拾う正規表現。 dcj:romaji-kana-table を更新したらこの変数も更新すること。")
;;==================================================
;; 逆変換
(defvar dcj:reverse-delimiter-regexp
(concat "[][\n\r\t "
",.?!()-=+|/&$#@~:;\"{}<>"
"、。 ※・?!()=+|&$#@:;”’"
"{}<>「」【】○●◎■□◇◆△▲▽▼]")
"逆変換の先頭を探す目印の正規表現")
(defvar dcj:reverse-through-delimiters
"、。※・?!()=+|&$#@:;”’{}<>「」【】○●◎■□◇◆△▲▽▼"
"直前ならやっぱり逆変換に含めたい文字列")
(defun dcj:reverse-search-begin ()
"現在のポイント位置から前に走査して、逆変換を開始する位置を返す。
見つからなかったら現在のポイント位置を返す。"
(save-excursion
(when (and ; 直前の文字が through-delimiters に含まれていれば無視する
(char-before)
(loop with prechar = (char-before)
for i across dcj:reverse-through-delimiters
if (eql i prechar)
return t finally return nil))
(goto-char (1- (point))))
(or
(and (re-search-backward dcj:reverse-delimiter-regexp nil t)
(1+ (point)))
(line-beginning-position))))
(defvar dcj:reverse-translate-driver 'dcj:reverse-translate-driver-mecab "形態素解析に何を使うか。
今のところ 'dcj:reverse-translate-driver-mecab / 'dcj:reverse-translate-driver-chasen が使える。
ドライバーは変換済み文字列を受け取って、元の単語と読みのひらがなの alist を返す。")
(defun dcj:reverse-translate-driver-mecab (kanji)
"MeCab driver"
(let (result)
(with-temp-buffer
(call-process-shell-command
(format "echo '%s' | mecab " kanji) nil t)
(goto-char (point-min))
(while (re-search-forward "^\\([^\t]+\\)\t\\(.*\\)$" nil t)
(let* ((org (match-string 1))
(cols (match-string 2))
(yomi (nth 7 (split-string cols ","))))
(push (cons org
(japanese-hiragana
(if yomi yomi org)))
result))))
(nreverse result)))
(defun dcj:reverse-translate-driver-chasen (kanji)
"ChaSen driver"
(let (result)
(with-temp-buffer
(call-process-shell-command
(format "echo '%s' | chasen " kanji) nil t)
(goto-char (point-min))
(while (re-search-forward "^\\([^\t]+\\)\t\\([^\t]*\\)\t.*$" nil t)
(let* ((org (match-string 1))
(yomi (match-string 2)))
(push (cons org
(japanese-hiragana
(if (or (string-match "[0-9a-zA-Z]" org)
(or (null yomi) (= 0 (length yomi))))
org yomi)))
result))))
(nreverse result)))
(defun dcj:reverse-translate-driver (kanji)
"漢字からひらがな・単語区切りにする。実際にはドライバーに丸投げする。"
(funcall dcj:reverse-translate-driver kanji))
(defun dcj:reverse-kanji-to-kana (kanji)
"漢字からひらがなにする。"
(let ((words (dcj:reverse-translate-driver kanji)))
(when words
(loop for (org . kana) in words
concat kana))))
(defun dcj:reverse-kana-to-romaji (source)
"アルファベットローマ字にする。"
(with-temp-buffer
(insert source)
(goto-char (point-min))
(while (re-search-forward dcj:kana-regexp nil t)
(let ((match (match-string 0)))
(replace-match
(car (rassoc match dcj:romaji-kana-table)))))
(buffer-string)))
(defun dcj:reverse-translate ()
"現在のポイント位置から逆変換を試みる"
(interactive)
(let ((poss (if (region-active-p)
(min (mark) (point))
(dcj:reverse-search-begin)))
(pose (if (region-active-p)
(max (mark) (point)) (point)))
org romaji)
(when (< poss pose)
(setq org (buffer-substring poss pose)
romaji (dcj:reverse-kana-to-romaji
(dcj:reverse-kanji-to-kana org)))
(delete-region poss pose)
(setq unread-command-events
(append unread-command-events
(listify-key-sequence romaji) (list dcj:key-commit))))))
(defun dcj:reverse-test ()
"逆変換のテスト"
(interactive)
(pop-to-buffer "*dcj-kanji-to-romaji-test*")
(erase-buffer)
(loop for q in '(dcj:reverse-translate-driver-mecab) ; '(dcj:reverse-translate-driver-mecab dcj:reverse-translate-driver-chasen)
do
(let ((dcj:reverse-translate-driver q))
(insert "==(" (symbol-name q) ")==========\n")
(loop for (in out) in
'(("太郎は花子が好きだ" "tarouhahanakogasukida") ; 普通
("今日は良い天気ですね" "kyouhayoiten'kidesune") ;
("逆変換を試みる" "gyakuhen'kan'wokokoromiru") ;
("東京特許許可局" "toukyoutoxtukyokyokakyoku") ; 小さいつ
("1太郎は花子が好きだ" "1tarouhahanakogasukida") ; 数字込み
("デフォルトのカーソールの色" "deforutonoka-so-runoiro") ; 未知語
("AはBを含む。" "AhaBwohukumu.") ; アルファベット込み
("MySQLにJSONを登録する。" "MySQLniJSONwotourokusuru.")) ; 英単語込み
for kana = (dcj:reverse-kanji-to-kana in)
for romaji = (dcj:reverse-kana-to-romaji kana)
do
(insert (format "%s : %s / %s / %s <= %s\n"
(if (equal romaji out) "OK" "NG")
out romaji kana in))))))
;; (progn (eval-current-buffer) (dcj:reverse-test))
(provide 'decompjp)
;;; decompjp.el ends here
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment