Last active
December 28, 2015 16:18
-
-
Save sharapeco/7527681 to your computer and use it in GitHub Desktop.
かな入力用ゆとり辞書を作る
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<?php
// Entry point: run the conversion and use its return value as the exit status.
exit(main());
// Download mecab-jumandic-7.0-20130310.tar.gz from
// https://code.google.com/p/mecab/ and extract it;
// this script uses the ContentW.csv file found in the extracted archive.
//
// Sample lines from ContentW.csv:
//
// ああ,1430,1430,8661,感動詞,*,*,*,ああ,ああ,代表表記:ああ/ああ
// 愛,1133,1133,5256,名詞,普通名詞,*,*,愛,あい,代表表記:愛/あい 漢字読み:音 カテゴリ:抽象物
// あい,1133,1133,12206,名詞,普通名詞,*,*,あい,あい,代表表記:愛/あい 漢字読み:音 カテゴリ:抽象物
// 藍,1133,1133,5042,名詞,普通名詞,*,*,藍,あい,代表表記:藍/あい カテゴリ:植物
// あい,1133,1133,12206,名詞,普通名詞,*,*,あい,あい,代表表記:藍/あい カテゴリ:植物
// ...
/**
 * Converts the JUMAN dictionary CSV into a kana-input user dictionary.
 *
 * Reads mecab-jumandic-7.0-20130310/ContentW.csv (expected to be UTF-8) line
 * by line, keeps the entries for which getWord() returns a result, and writes
 * each of them to kana-dic-mac.txt as one line of double-quoted,
 * comma-separated fields converted to Shift_JIS.
 *
 * @return int 0 on success, 1 when the source file cannot be opened,
 *             2 when the output file cannot be opened.
 */
function main() {
    $src = 'mecab-jumandic-7.0-20130310/ContentW.csv'; // UTF-8
    $IN = fopen($src, 'r');
    if (! $IN) {
        fputs(STDERR, 'no data');
        return 1;
    }
    $OUT = fopen('kana-dic-mac.txt', 'w');
    if (! $OUT) {
        // Do not leak the already opened input handle on this error path.
        fclose($IN);
        fputs(STDERR, 'cannot output');
        return 2;
    }
    // Loop on the fgets() result itself: a feof()-guarded loop can still call
    // fgets() once more at end of file and hand its false result to getWord().
    while (($line = fgets($IN)) !== false) {
        $word = getWord($line);
        if (! $word) {
            continue;
        }
        // One entry per line: "reading","word","part of speech"
        $buf = '"' . implode('","', $word) . '"' . "\n";
        $buf = mb_convert_encoding($buf, 'Shift_JIS', 'UTF-8');
        fwrite($OUT, $buf);
    }
    fclose($OUT);
    fclose($IN);
    return 0;
}
/**
 * Turns one line of ContentW.csv into a dictionary entry, or rejects it.
 *
 * Fields used (0-based): 0 = surface form, 4 = part of speech,
 * 5 = part-of-speech subtype, 9 = reading, 10 = miscellaneous information.
 *
 * The line is rejected (null) when:
 *  - it is not a string or does not contain the text 名詞 anywhere;
 *  - it has fewer than 11 comma-separated fields;
 *  - dekkaku() leaves the reading unchanged, i.e. no alternative reading
 *    would be produced for it;
 *  - the miscellaneous field names a representative notation (代表表記)
 *    that differs from the surface form.
 *
 * NOTE(review): the first test is a substring match on the whole line, not a
 * check of the part-of-speech field; confirm that this is intended.
 *
 * @param string|false $line Raw line as returned by fgets().
 * @return array|null array(converted reading, surface form, label from
 *                    convType()), or null when the line is skipped.
 */
function getWord($line) {
    // fgets() yields false at end of file; treat anything non-string as "no entry".
    if (! is_string($line) || strpos($line, '名詞') === false) {
        return null;
    }
    // Limit the split so commas inside the last field do not cut it short.
    $fields = explode(',', $line, 11);
    if (count($fields) < 11) {
        // Malformed or truncated row: destructuring it would raise
        // undefined-offset notices and emit an entry with an empty reading.
        return null;
    }
    list ($word, , , , $type, $stype, , , , $kana, $misc) = $fields;
    $kana_smpl = dekkaku($kana);
    if ($kana_smpl === $kana) {
        return null;
    }
    if (preg_match('{代表表記:(.+?)/}', $misc, $m)) {
        if ($m[1] !== $word) {
            return null;
        }
    }
    return array($kana_smpl, $word, convType($type, $stype));
}
/**
 * Chooses the part-of-speech label written to the dictionary.
 *
 * @param string $type  Part of speech; accepted but not consulted.
 * @param string $stype Part-of-speech subtype.
 * @return string 'サ変名詞' for that subtype, '普通名詞' for every other one.
 */
function convType($type, $stype) {
    // Loose comparison on purpose: it is what the equivalent switch/case performs.
    if ($stype == 'サ変名詞') {
        return 'サ変名詞';
    }
    // Everything else falls back to the generic noun label.
    return '普通名詞';
}
// Voiced sound marks (dakuten) could also be stripped here.
/**
 * Replaces small kana with their full-size counterparts.
 *
 * Covers the hiragana ぁぃぅぇぉっゃゅょ and the katakana ァィゥェォッャュョ;
 * every other character is returned untouched.
 *
 * @param string $str Text to convert.
 * @return string The text with each small kana enlarged.
 */
function dekkaku($str) {
    static $table = null;
    if ($table === null) {
        // Parallel lists: the n-th small kana maps to the n-th full-size kana.
        $small = array(
            'ぁ', 'ぃ', 'ぅ', 'ぇ', 'ぉ', 'っ', 'ゃ', 'ゅ', 'ょ',
            'ァ', 'ィ', 'ゥ', 'ェ', 'ォ', 'ッ', 'ャ', 'ュ', 'ョ',
        );
        $large = array(
            'あ', 'い', 'う', 'え', 'お', 'つ', 'や', 'ゆ', 'よ',
            'ア', 'イ', 'ウ', 'エ', 'オ', 'ツ', 'ヤ', 'ユ', 'ヨ',
        );
        $table = array_combine($small, $large);
    }
    return strtr($str, $table);
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<?php
// Entry point: run the conversion and use its return value as the exit status.
exit(main());
// Download mecab-jumandic-7.0-20130310.tar.gz from
// https://code.google.com/p/mecab/ and extract it;
// this script uses the ContentW.csv file found in the extracted archive.
//
// Sample lines from ContentW.csv:
//
// ああ,1430,1430,8661,感動詞,*,*,*,ああ,ああ,代表表記:ああ/ああ
// 愛,1133,1133,5256,名詞,普通名詞,*,*,愛,あい,代表表記:愛/あい 漢字読み:音 カテゴリ:抽象物
// あい,1133,1133,12206,名詞,普通名詞,*,*,あい,あい,代表表記:愛/あい 漢字読み:音 カテゴリ:抽象物
// 藍,1133,1133,5042,名詞,普通名詞,*,*,藍,あい,代表表記:藍/あい カテゴリ:植物
// あい,1133,1133,12206,名詞,普通名詞,*,*,あい,あい,代表表記:藍/あい カテゴリ:植物
// ...
/**
 * Converts the JUMAN dictionary CSV into a kana-input user dictionary.
 *
 * Reads mecab-jumandic-7.0-20130310/ContentW.csv (expected to be UTF-8) line
 * by line, keeps the entries for which getWord() returns a result, and writes
 * each of them to kana-dic.txt as one line of tab-separated fields converted
 * to Shift_JIS.
 *
 * @return int 0 on success, 1 when the source file cannot be opened,
 *             2 when the output file cannot be opened.
 */
function main() {
    $src = 'mecab-jumandic-7.0-20130310/ContentW.csv'; // UTF-8
    $IN = fopen($src, 'r');
    if (! $IN) {
        fputs(STDERR, 'no data');
        return 1;
    }
    $OUT = fopen('kana-dic.txt', 'w');
    if (! $OUT) {
        // Do not leak the already opened input handle on this error path.
        fclose($IN);
        fputs(STDERR, 'cannot output');
        return 2;
    }
    // Loop on the fgets() result itself: a feof()-guarded loop can still call
    // fgets() once more at end of file and hand its false result to getWord().
    while (($line = fgets($IN)) !== false) {
        $word = getWord($line);
        if (! $word) {
            continue;
        }
        // One entry per line: reading <TAB> word <TAB> part of speech
        $buf = implode("\t", $word) . "\n";
        $buf = mb_convert_encoding($buf, 'Shift_JIS', 'UTF-8');
        fwrite($OUT, $buf);
    }
    fclose($OUT);
    fclose($IN);
    return 0;
}
/**
 * Turns one line of ContentW.csv into a dictionary entry, or rejects it.
 *
 * Fields used (0-based): 0 = surface form, 4 = part of speech,
 * 5 = part-of-speech subtype, 9 = reading, 10 = miscellaneous information.
 *
 * The line is rejected (null) when:
 *  - it is not a string or does not contain the text 名詞 anywhere;
 *  - it has fewer than 11 comma-separated fields;
 *  - dekkaku() leaves the reading unchanged, i.e. no alternative reading
 *    would be produced for it;
 *  - the miscellaneous field names a representative notation (代表表記)
 *    that differs from the surface form.
 *
 * NOTE(review): the first test is a substring match on the whole line, not a
 * check of the part-of-speech field; confirm that this is intended.
 *
 * @param string|false $line Raw line as returned by fgets().
 * @return array|null array(converted reading, surface form, label from
 *                    convType()), or null when the line is skipped.
 */
function getWord($line) {
    // fgets() yields false at end of file; treat anything non-string as "no entry".
    if (! is_string($line) || strpos($line, '名詞') === false) {
        return null;
    }
    // Limit the split so commas inside the last field do not cut it short.
    $fields = explode(',', $line, 11);
    if (count($fields) < 11) {
        // Malformed or truncated row: destructuring it would raise
        // undefined-offset notices and emit an entry with an empty reading.
        return null;
    }
    list ($word, , , , $type, $stype, , , , $kana, $misc) = $fields;
    $kana_smpl = dekkaku($kana);
    if ($kana_smpl === $kana) {
        return null;
    }
    if (preg_match('{代表表記:(.+?)/}', $misc, $m)) {
        if ($m[1] !== $word) {
            return null;
        }
    }
    return array($kana_smpl, $word, convType($type, $stype));
}
/**
 * Chooses the part-of-speech label written to the dictionary.
 *
 * @param string $type  Part of speech; accepted but not consulted.
 * @param string $stype Part-of-speech subtype.
 * @return string 'さ変名詞' for the サ変名詞 subtype, '名詞' for every other one.
 */
function convType($type, $stype) {
    // Loose comparison on purpose: it is what the equivalent switch/case performs.
    if ($stype == 'サ変名詞') {
        return 'さ変名詞';
    }
    // Everything else falls back to the generic noun label.
    return '名詞';
}
// Voiced sound marks (dakuten) could also be stripped here.
/**
 * Replaces small kana with their full-size counterparts.
 *
 * Covers the hiragana ぁぃぅぇぉっゃゅょ and the katakana ァィゥェォッャュョ;
 * every other character is returned untouched.
 *
 * @param string $str Text to convert.
 * @return string The text with each small kana enlarged.
 */
function dekkaku($str) {
    static $table = null;
    if ($table === null) {
        // Parallel lists: the n-th small kana maps to the n-th full-size kana.
        $small = array(
            'ぁ', 'ぃ', 'ぅ', 'ぇ', 'ぉ', 'っ', 'ゃ', 'ゅ', 'ょ',
            'ァ', 'ィ', 'ゥ', 'ェ', 'ォ', 'ッ', 'ャ', 'ュ', 'ョ',
        );
        $large = array(
            'あ', 'い', 'う', 'え', 'お', 'つ', 'や', 'ゆ', 'よ',
            'ア', 'イ', 'ウ', 'エ', 'オ', 'ツ', 'ヤ', 'ユ', 'ヨ',
        );
        $table = array_combine($small, $large);
    }
    return strtr($str, $table);
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment