Skip to content

Instantly share code, notes, and snippets.

@hamach0
Last active April 18, 2020 00:41
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save hamach0/5fdbf63b2c4749b3621844a8052ca381 to your computer and use it in GitHub Desktop.
Save hamach0/5fdbf63b2c4749b3621844a8052ca381 to your computer and use it in GitHub Desktop.
import codecs
import re
import random
from janome.tokenizer import Tokenizer
def buildWordDict(text):
    """Build a first-order Markov transition table from Japanese text.

    The text is cleaned (ruby-start bars, spaces, ruby readings in 《...》,
    bracketed annotations and line breaks are removed), split into words
    with janome, and every word is mapped to the words that directly
    follow it.

    Args:
        text: Raw source text as a single string.

    Returns:
        A dict mapping each word to a list of its successor words. The list
        holds one entry per occurrence, so duplicates are kept and frequent
        successors are proportionally more likely to be drawn from it.
        Empty input yields an empty dict.
    """
    # Remove ruby-start bars and spaces, in both ASCII and full-width forms.
    for symbol in ("|", "|", " ", "\u3000"):
        text = text.replace(symbol, "")

    # Remove ruby readings and bracketed annotations, then line breaks.
    # The matches are non-greedy so that ordinary text lying between two
    # annotations on the same line is preserved. Annotations are removed
    # before the line breaks so a match can never span multiple lines.
    for pattern in (r"《.*?》", r"\[.*?\]", r"[.*?]", r"[\r\n]"):
        text = re.sub(pattern, "", text)

    # Split the text into words. Newer janome releases return a generator
    # from tokenize(), so materialize it before pairing neighbours.
    words = list(Tokenizer().tokenize(text, wakati=True))

    # Record every (word, following word) pair.
    wordDict = {}
    for current, following in zip(words, words[1:]):
        wordDict.setdefault(current, []).append(following)
    return wordDict
def retrieveRandomword(wordList):
    """Return one element of ``wordList`` picked uniformly at random.

    Raises:
        ValueError: If ``wordList`` is empty.
    """
    position = random.randrange(len(wordList))
    return wordList[position]
# Read the source text. codecs.open is given errors='ignore' so that bytes
# which cannot be decoded as Shift-JIS are skipped instead of raising.
# The with-statement closes the file as soon as it has been read.
with codecs.open('/Users/yusuke/Downloads/kokoro.txt', 'r', 'shift-jis', 'ignore') as source_file:
    text = source_file.read()
wordDict = buildWordDict(text)
# Maximum number of words in the generated chain, including the seed word.
length = 100
chain = ["私"]
for _ in range(1, length):
    # A word with no recorded successor (e.g. the final word of the text, or
    # a seed word that never occurs) is a dead end: stop early rather than
    # failing with KeyError.
    candidates = wordDict.get(chain[-1])
    if not candidates:
        break
    chain.append(retrieveRandomword(candidates))
print("".join(chain))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment