Last active
February 26, 2019 08:52
-
-
Save lnanase/aa5d0161c2031ae698d470a2fec771c8 to your computer and use it in GitHub Desktop.
アイマスIME辞書をmecabで読み込むcsvに変換する
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# -*- coding: utf-8 -*-
# Converts the iM@S IME dictionary into a CSV file for building a MeCab dictionary.
#
# usage: convert_imasdic_to_csv.rb
#
# Reads `all_dic.txt` (UTF-16LE) from the current directory and writes
# `output.csv` (UTF-8), one dictionary record per line.
require 'csv'

INPUT_PATH = 'all_dic.txt'
OUTPUT_PATH = 'output.csv'

# Feature columns emitted for each kind of entry.
PERSON_NAME_FEATURES = '名詞,固有名詞,人名,一般,*,*'
PROPER_NOUN_FEATURES = '名詞,固有名詞,一般,*,*,*'

# Builds the output record for one line of the IME dictionary.
#
# The line is expected to be tab-separated with the reading in the first field
# and the surface form in the second (NOTE(review): inferred from how the
# fields are used here; confirm against the dictionary file).
#
# The kind of entry is detected by searching the whole line for the part-of-speech
# label, so a label appearing in any field selects that branch.
#
# @param line [String] one raw line of the dictionary
# @return [Array(String, String), nil] `[surface, csv_record]`, or nil when the
#   line is not a person name / proper noun or lacks a reading or surface form
def mecab_entry(line)
  features =
    if line.include?('人名')
      PERSON_NAME_FEATURES
    elsif line.include?('固有名詞')
      PROPER_NOUN_FEATURES
    end
  return unless features

  # Commas inside a field would break the CSV, so they are dropped.
  reading, surface = line.strip.delete(',').split("\t")
  # Skip malformed lines instead of crashing on a missing field.
  return if reading.nil? || reading.empty? || surface.nil? || surface.empty?

  # The three columns after the surface form are left empty; per the original
  # note, the cost is left blank so that it gets estimated.
  [surface, "#{surface},,,,#{features},#{surface},#{reading},#{reading}"]
end

# Keyed by surface form. A later line for the same surface overwrites the
# earlier one, because the hiragana reading tends to appear on the later line.
entries = {}

# `BOM|` strips a leading byte-order mark so it cannot leak into the first field.
File.open(INPUT_PATH, 'rb:BOM|UTF-16LE:UTF-8') do |f|
  f.each_line do |line|
    surface, record = mecab_entry(line)
    entries[surface] = record if surface
  end
end

File.open(OUTPUT_PATH, 'w:UTF-8') do |file|
  # One record per line, each terminated by a newline.
  entries.each_value { |record| file.puts(record) }
end
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment