Created
February 5, 2019 12:39
-
-
Save umaz/aa0f9d419c2d3832823092d4a3bf17d1 to your computer and use it in GitHub Desktop.
マルコフ連鎖による文章生成
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# coding: utf-8
require 'natto'
require 'csv'

# Shared MeCab tagger, configured to emit wakati output (words separated by
# spaces). It is stored in a global because the mecab helper reads it as $nm.
$nm = Natto::MeCab.new(dicdir: "/usr/lib/mecab/dic/mecab-ipadic-neologd", output_format_type: :wakati)
# Tokenizes +txt+ with the shared tagger ($nm) and wraps the result in
# sentence-boundary sentinels.
#
# The tagger output is stripped and the sentinels are joined with explicit
# spaces, so "BOS" and "EOS" always come out as separate tokens when the
# result is split on whitespace, whether or not the tagger output ends with
# trailing whitespace or a newline.
#
# @param txt [String] one line of raw text
# @return [String] "BOS <space-separated tokens> EOS"
def mecab(txt)
  tokens = $nm.parse(txt).strip
  "BOS #{tokens} EOS"
end
# Trigram table: one hash per run of three consecutive tokens in text.txt,
# keyed 'head' / 'middle' / 'end'. Each input line is tokenized on its own,
# so trigrams never span two lines.
data = []
File.foreach("text.txt") do |line|
  # Slide a 3-token window over the line, advancing one token at a time.
  mecab(line).split(" ").each_cons(3) do |head, middle, tail|
    data << { 'head' => head, 'middle' => middle, 'end' => tail }
  end
end
# Dump the trigram table to data.csv, one trigram per row under a header row.
CSV.open("data.csv", "wb") do |csv|
  csv << ['head', 'middle', 'end']
  # values_at pins the column order to the header instead of relying on the
  # insertion order of each hash.
  data.each { |trigram| csv << trigram.values_at('head', 'middle', 'end') }
end
# Generates one sentence by walking the trigram chain.
#
# Starts from a randomly chosen sentence-opening trigram, then repeatedly
# picks a random trigram whose first two words equal the current last two
# words and appends its third word. Stops when "EOS" is appended or when no
# continuation exists.
#
# @param openers [Array<Hash>] trigrams whose 'head' is "BOS" (must be non-empty)
# @param followers [Hash{Array<String> => Array<Hash>}] trigrams grouped by
#   their ['head', 'middle'] pair
# @return [String] the generated text with a trailing "EOS" marker removed
def generate_text(openers, followers)
  first = openers.sample
  t1 = first['middle']
  t2 = first['end']
  text = t1 + t2

  loop do
    candidates = followers[[t1, t2]]
    break unless candidates

    pick = candidates.sample
    text << pick['end']
    break if pick['end'] == "EOS"

    # Shift the two-word context forward by one word.
    t1 = pick['middle']
    t2 = pick['end']
  end

  # Non-bang sub always returns a String; gsub! would return nil when the
  # text does not end in "EOS".
  text.sub(/EOS\z/, '')
end

# Precompute the lookups once instead of rescanning the whole table for every
# generated word.
openers = data.select { |trigram| trigram['head'] == "BOS" }
abort "no sentence-opening trigrams found; is text.txt empty?" if openers.empty?
followers = data.group_by { |trigram| [trigram['head'], trigram['middle']] }

# Print one generated sentence per line of input; stop on "exit" or at EOF.
while (command = gets)
  break if command.chomp == "exit"

  puts generate_text(openers, followers)
end
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment