Skip to content

Instantly share code, notes, and snippets.

@zarigani
Created May 28, 2009 22:50
Show Gist options
  • Star 4 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save zarigani/119641 to your computer and use it in GitHub Desktop.
Save zarigani/119641 to your computer and use it in GitHub Desktop.
saykanji
# Speak the Japanese text given as $1.
# The text is converted by kanji2voice, the result is shown through
# echokanji, and saykana is started in the background with that result.
# Globals: voice (written)
saykanji()
{
  voice=$(kanji2voice "$1")
  echokanji "$voice"
  /usr/local/bin/saykana "$voice" &
}
#FORMAT_OPTION='--node-format=%pS%f[8] --unk-format=%M --eos-format="\n"'
# Speak $1 after only the init_voice_text normalisation (no MeCab pass).
# saykana is started in the background and the normalised text is printed.
# Globals: res (written)
sayeb()
{
  # res=`init_voice_text "$1" | /usr/local/bin/mecab -Owakati | /usr/local/bin/mecab $FORMAT_OPTION`
  res=$(init_voice_text "$1")
  /usr/local/bin/saykana "$res" &
  echo "$res"
}
# Print a debug view: the phonetic string passed as $1, then the value of
# $mecab_text and the contents of /tmp/saykanji_mecab.txt.
echokanji()
{
  printf '%s\n' "=> voice..."
  echo "$1"
  printf '\n%s\n' "=> mecab..."
  echo "$mecab_text"
  cat /tmp/saykanji_mecab.txt
}
# Convert the text in $1 into the phonetic-symbol string and print it.
#
# The text is normalised by init_voice_text, analysed by MeCab into
# /tmp/saykanji_mecab.txt, and that file is then processed line by line.
#
# Environment:
#   SAYKANJI_MECAB  optional path of the mecab executable
#                   (default: /usr/local/bin/mecab)
# Globals (shared with parse_line / inject_with_kugiri):
#   kanji, yomi, voice, kind, hinsi and their *_stock copies,
#   val (pending number), phrase (pending particles), kugiri (pending break),
#   accent, counter, kana, match, line
# Outputs: the phonetic string on stdout
# Returns: 0
#
# The "inject and go to the next token" steps are written directly in the
# loop body. Calling a helper that executes `continue` does not work on
# bash 4.4 and later, where `continue` inside a function no longer affects
# a loop in the caller.
kanji2voice()
{
  local res
  local hit
  # init_voice_text "$1"
  # Chaining the stages with a pipe makes it clearer what is going on.
  init_voice_text "$1" | "${SAYKANJI_MECAB:-/usr/local/bin/mecab}" > /tmp/saykanji_mecab.txt
  # IFS= and -r keep each MeCab line intact (no trimming, no backslash loss).
  while IFS= read -r line; do
    parse_line "$line"
    # Emit a <NUMK> tag once a pending number is followed by a non-numeric token.
    if [ -n "$val" ] && \
      echo "$kanji"|grep -iq '[^0-9\.,]'
    then
      counter=$(josusi_dic_find "$kanji")
      if [ -n "$counter" ]; then
        val=$(echo "$val"|sed 's/,//g')
        res+="/<NUMK VAL=${val} COUNTER=${counter}>"
        kugiri="/"
        val=""
        continue
      else
        if [ "$kind" = '名詞,接尾,助数詞,*' ]; then
          val=$(echo "$val"|sed 's/,//g')
          res+="/<NUMK VAL=${val}>"
          kugiri="/"
          val=""
        else
          res+="$val"
          val=""
        fi
      fi
    fi
    # When a run of particles / auxiliary verbs ends, convert the collected
    # phrase using the pronunciation dictionary.
    # if [ -n "$phrase" ] && \
    # [ "$hinsi_stock" = '助詞' -o "$hinsi_stock" = '助動詞' -o "$kind_stock" = '動詞,非自立,*,*' ] && \
    # [ "$hinsi" != '助詞' -a "$hinsi" != '助動詞' ]
    if [ -n "$phrase" ] && \
      [ "$hinsi_stock" = '助詞' -o "$hinsi_stock" = '助動詞' ] && \
      [ "$hinsi" != '助詞' -a "$hinsi" != '助動詞' -a "$kanji" != 'の' ]
    then
      accent=$(accent_dic_find "$phrase|$hinsi_stock")
      if [ -z "$accent" ]; then
        accent=$(accent_dic_find "$phrase")
      fi
      if [ -n "$accent" ]; then
        if [ "$hinsi_stock" != '助動詞' ]; then
          res+="$kugiri"
          kugiri="/"
        fi
        res+="($accent)"
        phrase=""
      else
        res+="($phrase)"
        phrase=""
      fi
    fi
    # On the EOS terminator, clear the per-token properties.
    if [ "$kind" = 'EOS' ]; then
      kanji=''
      voice=''
      kind=''
      hinsi=''
      kugiri=''
    fi
    # When punctuation appears, cancel the pending break.
    # (Original note: also cancel the pending break when nouns are consecutive.)
    # Note: grep '[、。]' cannot be used when called from AppleScript
    # (non-ASCII text inside a [] literal does not work there).
    # $kanji is quoted so that tokens such as `*` are not glob-expanded.
    match=$(match_REG_with_STR "[、。?!「」()]" "$kanji")
    if [ -n "$match" ]; then
      kugiri=''
    fi
    # If a break is pending and the part of speech qualifies, emit it.
    # (noun, verb, adjective, adjectival noun, adverb, adnominal, conjunction,
    #  interjection, particle, auxiliary verb, etc.)
    if [ -n "$kugiri" ]; then
      if [ "$hinsi" != '助詞' ] && [ "$kind" = '動詞,非自立,*,*' -o "$kind_stock" = '助詞,連体化,*,*' ] || [ "$kanji" = 'し' ]; then
        res+="+"
        kugiri=''
      fi
      if [ "$hinsi_stock" = '助詞' -o "$hinsi_stock" = '助動詞' -o "$hinsi_stock" = '連体詞' -o "$hinsi_stock" = '感動詞' -o "$hinsi_stock" = '接続詞' ] && \
        [ "$hinsi" != '助詞' ] && [ "$kind" != '動詞,非自立,*,*' -a "$kind_stock" != '助詞,連体化,*,*' ]
      then
        res+="$kugiri"
        kugiri=''
      fi
    fi
    # <<<<<<<<<< Up to here: pre-processing that depends on the previous iteration
    # >>>>>>>>>> From here: processing of the current token
    # Particles and auxiliary verbs are only accumulated in phrase.
    # if [ "$hinsi" = '助詞' -o "$hinsi" = '助動詞' -o "$kanji" = 'の' -o "$kind" = '動詞,非自立,*,*' ]; then
    if [ "$hinsi" = '助詞' -o "$hinsi" = '助動詞' -o "$kanji" = 'の' ]; then
      # if [ "$hinsi_stock" = '動詞' -o "$hinsi_stock" = '助動詞' ]; then
      # kugiri=""
      # fi
      phrase+="$voice"
      continue
    fi
    # Digits and the separators . and , are only accumulated in val.
    if echo "$kanji"|grep -iq '^[0-9]\{1,\}$' || \
      echo "$kanji"|grep -iq '^[\.\,]$'
    then
      val+="$kanji"
      continue
    fi
    # Adnominals and interjections reserve a break.
    if [ "$hinsi" = '連体詞' -o "$hinsi" = '感動詞' ]; then
      kugiri="/"
    fi
    # Convert tokens that are registered in the pronunciation dictionary.
    # The dictionary is a CSV that maps a word to its reading including the
    # accent (e.g. "今度","こ'んど"). Words are stored inside "" to avoid
    # accidental hits on short tokens.
    # Lookup order: "surface|part of speech", "surface|reading", "surface".
    hit=$(accent_dic_find "$kanji|$hinsi")
    if [ -z "$hit" ]; then
      hit=$(accent_dic_find "$kanji|$yomi")
    fi
    if [ -z "$hit" ]; then
      hit=$(accent_dic_find "$kanji")
    fi
    if [ -n "$hit" ]; then
      inject_with_kugiri "$hit" /
      continue
    fi
    # English words are converted with the (NAD) katakana-English dictionary,
    # and an accent is added to the word that was found.
    if echo "$kanji"|grep -iq '^[a-z]'; then
      kana=$(nad_dic_find "$kanji")
      accent=$(add_foreign_accent_to "$kana")
      if [ -n "$accent" ]; then
        inject_with_kugiri "$accent" /
        continue
      fi
    fi
    # Katakana words get an accent added.
    hit=$(add_foreign_accent_to_if "$kanji")
    if [ -n "$hit" ]; then
      inject_with_kugiri "$hit" /
      continue
    fi
    # Anything that hit nowhere uses MeCab's pronunciation as is
    # (falling back to the surface form when there is none).
    inject_with_kugiri "${voice:-$kanji}"
  done < /tmp/saykanji_mecab.txt
  printf '%s\n' "$res"
  # cat /tmp/saykanji_mecab.txt
  # /usr/local/bin/saykana "$res"&
  return 0
}
# Normalise the text in $1 before it is analysed and print the result.
# The text (prefixed with one space) goes through han2zen and then a set of
# sed substitutions that rewrite commas, signs and runs of dots.
# Globals: voice_text (written)
init_voice_text()
{
  voice_text=$(han2zen " $1")
  # Substitutions, applied in this order:
  #   1. ","  between non-digits        -> 、
  #   2. "+"  before a digit            -> 、ぷらす
  #   3-5. "-", "▲", "△" before a digit -> 、まいなす
  #   6. two or more dots               -> 、"..."
  #   7. % -> %
  voice_text=$(
    echo "$voice_text" | sed \
      -e 's/\([^0-9]\),\([^0-9]\)/\1、\2/g' \
      -e 's/\([^0-9]\)+ *\([0-9]\)/\1、ぷらす\2/g' \
      -e 's/\([^0-9]\)- *\([0-9]\)/\1、まいなす\2/g' \
      -e 's/\([^0-9]\)▲ *\([0-9]\)/\1、まいなす\2/g' \
      -e 's/\([^0-9]\)△ *\([0-9]\)/\1、まいなす\2/g' \
      -e 's/\(\.\{2,\}\)/、\"\1\"/g' \
      -e 's/%/%/g'
  )
  echo "$voice_text"
  ##mecab=`echo "$voice_text"|/usr/local/bin/mecab`
  ##echo "$mecab"|while read line
  # Feeding the while loop through a pipe would make the value of res
  # unreachable afterwards, so the MeCab output is stored temporarily and
  # redirected instead (while ... done </tmp/saykanji_mecab.txt).
  # http://his.luky.org/ML/linux-users.9/msg06099.html
  # http://his.luky.org/ML/linux-users.9/msg06102.html
  ##echo "$voice_text"|/usr/local/bin/mecab >/tmp/saykanji_mecab.txt
}
# Parse one line of MeCab output ($1) into the global token properties.
#
# Line format:
#   surface\tPOS,POS-sub1,POS-sub2,POS-sub3,conj-form,conj-type,base,reading,pronunciation
# Globals written:
#   kanji_stock, voice_stock, kind_stock, hinsi_stock  previous token's values
#   kanji  surface form
#   yomi   reading (field 8), first `*` removed, passed through kata_to_hira
#   voice  pronunciation (field 9), first `*` removed, passed through kata_to_hira
#   kind   POS fields 1-4
#   hinsi  POS field 1
parse_line()
{
  # Keep the state of the previous token.
  kanji_stock="$kanji"
  voice_stock="$voice"
  kind_stock="$kind"
  hinsi_stock="$hinsi"
  # Extract the feature column once instead of once per property.
  local features
  features=$(printf '%s\n' "$1"|cut -f2)
  kanji=$(printf '%s\n' "$1"|cut -f1) # surface form
  yomi=$(printf '%s\n' "$features"|cut -d , -f8|sed 's/\*//') # reading
  # Quoted so the value is passed whole and never glob-expanded.
  yomi=$(kata_to_hira "$yomi")
  voice=$(printf '%s\n' "$features"|cut -d , -f9|sed 's/\*//') # pronunciation
  voice=$(kata_to_hira "$voice")
  kind=$(printf '%s\n' "$features"|cut -d , -f1-4) # POS, sub1, sub2, sub3
  hinsi=$(printf '%s\n' "$features"|cut -d , -f1) # POS
}
# Look up $1 in the counter-word dictionary (~/Downloads/助数詞辞書.csv).
# Prints the second CSV field of the first line that contains "$1"
# (including the double quotes), with double quotes removed.
# The key is matched as a fixed string, so tokens such as `.` or `*` are not
# interpreted as regular expressions. An empty key prints nothing.
josusi_dic_find()
{
  [ -n "$1" ] || return 0
  grep -iF -- "\"$1\"" ~/Downloads/助数詞辞書.csv|head -1|cut -d , -f2|sed 's/"//g'
}
# Look up $1 in the pronunciation dictionary (~/Downloads/発音辞書.csv).
# Prints the second CSV field of the first line that contains "$1"
# (including the double quotes), with double quotes removed.
# The key is matched as a fixed string, so tokens such as `.` or `*` are not
# interpreted as regular expressions. An empty key prints nothing.
accent_dic_find()
{
  [ -n "$1" ] || return 0
  grep -iF -- "\"$1\"" ~/Downloads/発音辞書.csv|head -1|cut -d , -f2|sed 's/"//g'
}
# Look up $1 in the NAD katakana-English dictionary
# (~/Downloads/カタカナ英語辞書10.6/カタカナ英語辞書10.6.txt).
# Prints the first comma-separated field of the first line that contains
# "$1" (including the double quotes), with double quotes removed.
# The key is matched as a fixed string, so tokens such as `.` or `*` are not
# interpreted as regular expressions. An empty key prints nothing.
nad_dic_find()
{
  [ -n "$1" ] || return 0
  grep -iF -- "\"$1\"" ~/Downloads/カタカナ英語辞書10.6/カタカナ英語辞書10.6.txt|head -1|cut -d , -f1|sed 's/"//g'
}
# If $1 is non-empty, append it to res and move on to the next loop iteration.
# A break is reserved (kugiri="/") so the text can be split at a suitable
# position; when a break is already reserved at the time of the hit, the
# split is performed (see inject_with_kugiri).
# NOTE: the `continue` here is meant to act on the loop of the caller.
# bash 4.4 and later no longer let `continue` inside a function affect a loop
# outside of it, so on those versions the caller keeps running after the call.
inject_and_continue_if()
{
  if [ -z "$1" ]; then
    return 0
  fi
  inject_with_kugiri "$1" /
  continue
}
# Append a phonetic symbol ($1) and, where needed, a separator to res.
#
# Arguments:
#   $1  phonetic text to append
#   $2  optional separator to reserve in kugiri for the next token
# Globals read:    kanji_stock, kanji, kind_stock, kind, hinsi_stock, hinsi
# Globals written: res, kugiri
#
# "Joined" case (no separator is emitted): the previous and the current
# surface form both start with a non-letter, the previous token is a
# general / suru noun and the current one is a general / suru / suffix noun;
# or the previous token is a verb and the current one an auxiliary verb.
# There the accent is moved by reset_top_accent and a break is reserved.
# Otherwise, when $2 is given, the pending kugiri is emitted and $2 becomes
# the new pending break; $1 is then appended.
inject_with_kugiri()
{
  # # Condition for not splitting (consecutive nouns, except '名詞,副詞可能,*,*')
  # if [ "$kind_stock" != '名詞,副詞可能,*,*' -a "$kind_stock" != '名詞,数,*,*' -a "$hinsi_stock" = '名詞' -a "$hinsi" = '名詞' ]
  # # Condition for splitting (nouns not consecutive, but '名詞,副詞可能,*,*' is OK)
  # if [ "$kind_stock" != '名詞,一般,*,*' -o "$kind" != '名詞,一般,*,*' ] || \
  # [ "$kind_stock" != '名詞,サ変接続,*,*' -o "$kind" != '名詞,サ変接続,*,*' ] && \
  # [ "$hinsi_stock" != '動詞' -o "$hinsi" != '助動詞' ]
  # then
  # The surface forms are quoted: unquoted, a token such as `*` would be
  # glob-expanded and the test would look at a file name instead.
  if printf '%s\n' "$kanji_stock"|grep -iq '^[^a-z]' && printf '%s\n' "$kanji"|grep -iq '^[^a-z]' && \
    [ "$kind_stock" = '名詞,一般,*,*' -o "$kind_stock" = '名詞,サ変接続,*,*' ] && [ "$kind" = '名詞,一般,*,*' -o "$kind" = '名詞,サ変接続,*,*' -o "$kind" = '名詞,接尾,一般,*' ] || \
    [ "$hinsi_stock" = '動詞' -a "$hinsi" = '助動詞' ]
  then
    # res+="+"
    top_accent=$(reset_top_accent "$1")
    if [ -n "$top_accent" ]; then
      res+="$top_accent"
      kugiri="/"
    else
      res+="$1"
    fi
  else
    if [ -n "$2" ]; then
      res+="$kugiri"
      kugiri="$2"
    fi
    res+="$1"
  fi
}
# Move the accent mark to just after the first mora of $1.
# All apostrophes are removed; then, if the text starts with a kana
# (optionally followed by a small kana) and at least one more kana or ー,
# an apostrophe is inserted after that first mora. Text that does not match
# is printed with its apostrophes removed.
#
# The text and the pattern are passed to ruby as arguments instead of being
# pasted into the ruby source, so quotes in $1 cannot break the script.
# jcode/$KCODE are only used on Ruby 1.8; they no longer exist in Ruby 1.9+.
REG_TOP_ACCENT="^([ぁ-んァ-ン])([ぁぃぅぇぉゃゅょァィゥェォャュョ]?)([ぁ-んァ-ンー]{1,})"
reset_top_accent()
{
  ruby -e '
    if RUBY_VERSION < "1.9"
      require "jcode"
      $KCODE = "u"
    end
    text, pattern = ARGV.map { |arg| arg.respond_to?(:force_encoding) ? arg.dup.force_encoding("UTF-8") : arg }
    print text.gsub("\x27", "").sub(Regexp.new(pattern), "\\1\\2\x27\\3")
  ' -- "$1" "$REG_TOP_ACCENT"
}
# Multibyte-aware regular-expression test.
# Prints the string $2 when the regular expression $1 matches it, and
# nothing otherwise.
#
# Both values are passed to ruby as arguments instead of being pasted into
# the ruby source, so quotes or slashes in them cannot break the script.
# jcode/$KCODE are only used on Ruby 1.8; they no longer exist in Ruby 1.9+.
match_REG_with_STR()
{
  ruby -e '
    if RUBY_VERSION < "1.9"
      require "jcode"
      $KCODE = "u"
    end
    pattern, text = ARGV.map { |arg| arg.respond_to?(:force_encoding) ? arg.dup.force_encoding("UTF-8") : arg }
    print text if Regexp.new(pattern) =~ text
  ' -- "$1" "$2"
}
# Convert the katakana ァ-ン in $1 to the corresponding hiragana ぁ-ん and
# print the result without a trailing newline. Other characters (including
# ー and ヴ) are left unchanged.
#
# Implemented with bash substitutions only: the previous ruby/jcode version
# prints nothing on Ruby 1.9+ (jcode was removed) and broke on quotes in $1.
# The per-character width is probed so the loop also works in a byte-oriented
# (C) locale, where each of these characters counts as 3 bytes.
kata_to_hira()
{
  local kata='ァアィイゥウェエォオカガキギクグケゲコゴサザシジスズセゼソゾタダチヂッツヅテデトドナニヌネノハバパヒビピフブプヘベペホボポマミムメモャヤュユョヨラリルレロヮワヰヱヲン'
  local hira='ぁあぃいぅうぇえぉおかがきぎくぐけげこごさざしじすずせぜそぞただちぢっつづてでとどなにぬねのはばぱひびぴふぶぷへべぺほぼぽまみむめもゃやゅゆょよらりるれろゎわゐゑをん'
  local probe='ア'
  local step=${#probe}
  local text=$1
  local i from to
  [ -n "$text" ] || return 0
  for (( i = 0; i < ${#kata}; i += step )); do
    from=${kata:i:step}
    to=${hira:i:step}
    text=${text//"$from"/$to}
  done
  printf '%s' "$text"
}
# Add an accent mark to the hiragana reading in $1 and print it.
# Where REG_HIRA matches, an apostrophe is inserted after its first group,
# i.e. before an optional っ/ー and the last two morae. Text that does not
# match is printed unchanged.
#
# The text and the pattern are passed to ruby as arguments instead of being
# pasted into the ruby source, so quotes in $1 cannot break the script.
# jcode/$KCODE are only used on Ruby 1.8; they no longer exist in Ruby 1.9+.
REG_HIRA="(.*[^っー])([っー]?)([^ぁぃぅぇぉゃゅょ][ぁぃぅぇぉゃゅょ]?[^ぁぃぅぇぉゃゅょ][ぁぃぅぇぉゃゅょ]?)$"
add_foreign_accent_to()
{
  ruby -e '
    if RUBY_VERSION < "1.9"
      require "jcode"
      $KCODE = "u"
    end
    text, pattern = ARGV.map { |arg| arg.respond_to?(:force_encoding) ? arg.dup.force_encoding("UTF-8") : arg }
    print text.sub(Regexp.new(pattern), "\\1\x27\\2\\3")
  ' -- "$1" "$REG_HIRA"
}
# If $1 contains a run of three or more katakana (ァ-ン or ー), print it with
# an accent mark added; otherwise print nothing.
# Where REG_KATA matches, an apostrophe is inserted after its first group,
# i.e. before an optional ッ/ー and the last two morae.
#
# The text and both patterns are passed to ruby as arguments instead of being
# pasted into the ruby source, so quotes in $1 cannot break the script.
# jcode/$KCODE are only used on Ruby 1.8; they no longer exist in Ruby 1.9+.
REG_KATA="(.*[^ッー])([ッー]?)([^ァィゥェォャュョ][ァィゥェォャュョ]?[^ァィゥェォャュョ][ァィゥェォャュョ]?)$"
add_foreign_accent_to_if()
{
  ruby -e '
    if RUBY_VERSION < "1.9"
      require "jcode"
      $KCODE = "u"
    end
    text, pattern, guard = ARGV.map { |arg| arg.respond_to?(:force_encoding) ? arg.dup.force_encoding("UTF-8") : arg }
    print text.sub(Regexp.new(pattern), "\\1\x27\\2\\3") if Regexp.new(guard) =~ text
  ' -- "$1" "$REG_KATA" '[ァ-ンー]{3,}'
}
# Half-width (HAN) and full-width (ZEN) katakana / punctuation, position by
# position. HAN must contain the half-width forms: with identical lists the
# conversion below does nothing.
HAN='｡｢｣､ｧｨｩｪｫｬｭｮｯｰｱｲｳｴｵｶｷｸｹｺｻｼｽｾｿﾀﾁﾂﾃﾄﾅﾆﾇﾈﾉﾊﾋﾌﾍﾎﾏﾐﾑﾒﾓﾔﾕﾖﾗﾘﾙﾚﾛﾜｦﾝ'
ZEN='。「」、ァィゥェォャュョッーアイウエオカキクケコサシスセソタチツテトナニヌネノハヒフヘホマミムメモヤユヨラリルレロワヲン'
# Voiced / semi-voiced sounds as "half-width/full-width" pairs. In half-width
# form these are two characters (kana + ﾞ or ﾟ).
DAKUON='ｶﾞ/ガ ｷﾞ/ギ ｸﾞ/グ ｹﾞ/ゲ ｺﾞ/ゴ ｻﾞ/ザ ｼﾞ/ジ ｽﾞ/ズ ｾﾞ/ゼ ｿﾞ/ゾ ﾀﾞ/ダ ﾁﾞ/ヂ ﾂﾞ/ヅ ﾃﾞ/デ ﾄﾞ/ド ﾊﾞ/バ ﾋﾞ/ビ ﾌﾞ/ブ ﾍﾞ/ベ ﾎﾞ/ボ ｳﾞ/ヴ ﾊﾟ/パ ﾋﾟ/ピ ﾌﾟ/プ ﾍﾟ/ペ ﾎﾟ/ポ'
# Convert half-width katakana and punctuation in $1 to full-width and print
# the result without a trailing newline.
#
# Implemented with bash substitutions only: the previous ruby/jcode version
# prints nothing on Ruby 1.9+ (jcode was removed) and broke on quotes in $1.
# The per-character width is probed so the HAN/ZEN loop also works in a
# byte-oriented (C) locale, where each of these characters counts as 3 bytes.
han2zen()
{
  local zenkaku=$1
  local probe='ア'
  local step=${#probe}
  local trans from to i
  # Voiced sounds have to be converted first (the order matters), otherwise
  # the base kana would be converted on its own and the mark left behind.
  for trans in $DAKUON; do
    from=${trans%%/*}
    to=${trans#*/}
    zenkaku=${zenkaku//"$from"/$to}
  done
  # echo "$zenkaku"|tr "$HAN" "$ZEN"
  # (Original note: Japanese text was not handled correctly when this was
  # called from AppleScript, which is why tr was not used here.)
  for (( i = 0; i < ${#HAN}; i += step )); do
    from=${HAN:i:step}
    to=${ZEN:i:step}
    zenkaku=${zenkaku//"$from"/$to}
  done
  printf '%s' "$zenkaku"
}
ねん
がつ
にち
ふん
びょー
えん
かい
か'い
ヵ月 か'げつ
カロリー か'ろりー
きゅー
ぎょー
きょく
キロ きろ
けん
にん
さい
時間 じ'かん
だい
丁目 ちょーめ
つき
ばん
ほん
ひき
ぱーせ'んと
We can make this file beautiful and searchable if this error is corrected: It looks like row 241 should actually have 2 columns, instead of 3. in line 240.
"*","-こめまーく-"
"中|なか","な'か"
"Rails","れ'いるず"
"彼氏","か'れし"
"準備","じゅ'んび"
"今度","こ'んど"
"もう少し","もうすこ'し"
"数","す'ー"
"ウェア","うぇ'あ"
"何と","な'んと"
"なく","な'く"
"みる","み'る"
"ダウンロードフォルダ","だうんろーどふぉ'るだ"
"特に","と'くに"
"英単語","えーた'んご"
"ちょっと","ちょ'っと"
"タイトル","た'いとる"
"本文","ほ'んぶん"
"受信","じゅし'ん"
"もはや","も'はや"
"しっかり","しっか'り"
"読め","よ'め"
"日本","にほ'ん"
"ある","あ'る"
"溢れ","あふ'れ"
"話せる","はなせ'る"
"おっしゃる","おっしゃ'る"
"そもそも","そ'もそも"
"ライナー","ら'いなー"
"スクリプト","すくり'ぷと"
"理解","り'かい"
"シンプル","し'んぷる"
"十分","じゅうぶ'ん"
"通り","と'おり"
"イラストレーター","いらすとれ'ーたー"
"思っ","おも'っ"
"始め","は'じめ"
"次","つぎ'"
"ような","ような'"
"話そ","はなそ'"
"たい","た'い"
"しかし","しか'し"
"等","と'ー"
"される","される"
"saykanji","せ'い/かん'じ"
"満足","ま'んぞく"
"ひとまず","ひと/まず"
"られる","られ'る"
"なっ","な'っ"
"一つ","ひと'つ"
"料金","りょ'ーきん"
"光","ひか'り"
"定義","て'ーぎ"
"仮名","かな"
"アルファベット","あるふぁべ'っど"
"出","で'"
"何とか","な'んとか"
"バイリンガル","ばいり'んがる"
"片言","かたこと"
"日本語","にほんご"
"saykana","せ'い/かな"
"もん","も'ん"
"テキスト","てき'すと"
"あれ","あ'れ"
"スピーカー","すぴ'ーかー"
"できる","でき'る"
"能力","の'うりょく"
"秘め","ひ'め"
"(","、か'っこ、"
")","、か'っこ/とじ'る、"
"表記","ひょ'うき"
"天気","て'んき"
"番号","ばんご'う"
"各種","か'くしゅ"
"音声","お'んせい"
"案内","あんな'い"
"銀世界","ぎんせ'かい"
"よろしい","よろし'い"
"ターミナル","た'ーみなる"
"すれ","すれ'"
"思う","おも'う"
"以下","い'か"
"読み","よみ'"
"試し","ためし'"
"読む","よ'む"
"ごく","ご'く"
"しゃべっ","しゃべ'っ"
"結構","け'っこう"
"面白い","おもし'ろい"
"無意識","むいし'き"
"部分","ぶ'ぶん"
"列","れ'つ"
"について","につ'いて"
"として","と/して"
"AquesTalk","あくえすと'ーく"
"文章","ぶ'んしょう"
"辞書","じ'しょ"
"たぶん","た'ぶん"
"もらう","もら'う"
"項目","こ'うもく"
"良い","よ'い"
"良く","よ'く"
"そう","そ'う"
"作業","さ'ぎょう"
"ところで","ところ'で"
"以前","い'ぜん"
"今","い'ま"
"お世話","おせ'わ"
"ます","ま'す"
"候補","こ'ーほ"
"出し","だ'し"
"よう","よ'う"
"AppleWorks","あっぷる'わーくす"
"!","、!"
"ただいま","ただい'ま"
"進路","し'んろ"
"とれ","と'れ"
"例えば","たとえ'ば"
"文書","ぶ'んしょ"
"れる","れ'る"
"」","」、"
"ところ","とこ'ろ"
"なる","な'る"
"たい","た'い"
"文","ぶ'ん"
"あと","あ'と"
"にも","に'も"
"入っ","は'いっ"
"親切","し'んせつ"
"処理","しょ'り"
"でわ","で'わ"
"シラミ","しらみ"
"出","で'"
"など","な'ど"
"類","る'い"
"さらに","さ'らに"
"によってわ","によって'わ"
"本来","ほ'んらい"
"ました","ま'した"
"感謝","か'んしゃ"
"まで","ま'で"
"普段","ふ'だん"
"意識","い'しき"
"細部","さ'いぶ"
"垣間見る","かいまみ'る"
"出来","で'き"
"多く","お'おく"
"区別","く'べつ"
"たり","た'り"
"たとは","た'とは"
"話し","はなし'"
"見","み'"
"日本人","にっぽん'じん"
"区切り","くぎ'り"
"読み方","よみ'かた"
"流暢","りゅ'うちょう"
"複数","ふくす'う"
"であった","'で/あ'った"
"つけ","つけ'"
"困難","こ'んなん"
"導く","みちび'く"
"その他","その'た"
"適切","てきせつ"
"知識","ち'しき"
"かなり","か'なり"
"向け","むけ;"
"如何","いか;"
"だろうか","だろ'うか"
"まだ","ま'だ"
"なかった","な'かった"
"前","ま'え"
"道路","ど'ーろ"
"出口","で'ぐち"
"堪え","たえ'"
"聞く","きく'"
"美しい","うつくし'い"
"精算","せいさ'ん"
"NEXCO","ね'くすこ"
"機能","き'のう"
"にとってわ","にと'ってわ"
"大抵","たいて'い"
"事前","じぜ'ん"
"素早く","すば'やく"
"時","と'き"
"おじさん","おじさ'ん"
"尋ねる","たずね'る"
"訪ねる","たずね'る"
"最後","さ'いご"
"のんびり","のんび'り"
"いつも","い'つも"
"ゴニョゴニョ","ゴ'ニョゴニョ"
"声","こ'え"
"遠ざかっ","とおざか'っ"
"終える","いいおえる"
"どう","ど'う"
"たいした","た'いした"
"頃|ころ","こ'ろ"
"あらゆる","あらゆ'る"
"綺麗","き'れい"
"わあ","わあ'"
"わー","わー'"
"年","ね'ん"
"情報","じょ'ーほー"
"でき","でき'"
"ない","な'い"
"風|ふう","ふ'ー"
"ほとんど","ほと'んど"
"範囲","は'んい"
"くれる","くれる'"
"英数字","えーす'ーじ"
"とりあえず","とりあえ'ず"
"見つける","みつける'"
"ところが","ところ'が"
"悪く","わ'るく"
"試す","ため'す"
"名|めい","'めい"
"say","せ'ー"
"形態素","けいたい+そ"
"はず","+はず"
"大","だい'"
"新しい","あたらし'い"
"開い","ひらい'"
"からわ","から'わ"
"する","する"
"操作","そ'うさ"
"方|ほう","ほ'ー"
"疎い","うと'い"
"前回","ぜ'んかい"
"すべて","す'べて"
"もつ","も'つ"
"アクセント","あ'_クせんと"
"無視","む'し"
"不自然","ふし'ぜん"
"はっきり","はっき'り"
"経|けい","きょう"
"読ん","よ'ん"
"べく","べ'く"
"での","で'の"
"たが","'たが"
"ながら","な'がら"
"み","み'","動詞,非自立,*,*"
"割り当て","わりあ'て"
"みた","み'た"
"目指し","めざし'"
"みたが","み'たが"
"もらいたい","もらいた'い"
"もらうにわ","もらう'にわ"
"思い出し","おもいだ'し"
"みると","み'ると"
"今|接頭詞","いま'"
"より","より"
"みたく","みた'く"
"調査","ちょ'うさ"
"制限","せいげ'ん"
"速度","そ'くど"
"なので","な'ので"
"読みで","よみ'で"
"物|ぶつ","ぶ'つ"
"mecab","めかぶ"
"さまざま","さまざ'ま"
"QuickTimePlayer","くいっくたいむぷれ'ーやー"
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment