Last active
April 18, 2020 00:41
-
-
Save hamach0/5fdbf63b2c4749b3621844a8052ca381 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import codecs | |
import re | |
import random | |
from janome.tokenizer import Tokenizer | |
def buildWordDict(text):
    """Build a first-order Markov transition table from Japanese text.

    The text is first stripped of line breaks, spaces, ruby (furigana)
    readings and bracketed annotations, then split into words with janome.

    Args:
        text: The raw source text as a ``str``.

    Returns:
        A dict mapping each word to the list of words that immediately
        followed it, in order of occurrence. Duplicates are kept, so picking
        uniformly from a list reproduces the observed transition frequencies.
        The final word of the text gets no entry unless it also occurs
        earlier.
    """
    # Remove line breaks, whitespace and furigana markup.
    # The full-width forms are what Aozora Bunko style files use; the ASCII
    # forms are kept so previously handled input is still cleaned.
    delete_symbol = ["|", " ", "\uff5c", "\u3000"]
    # Non-greedy quantifiers: a greedy ".*" would swallow the real text
    # sitting between two annotations on the same line. The brackets are
    # escaped because an unescaped "[" opens a character class.
    delete_expression = [
        r"《.*?》",
        r"\[.*?\]",
        "\uff3b.*?\uff3d",
        "\r",
        "\n",
    ]
    for ds in delete_symbol:
        text = text.replace(ds, "")
    for de in delete_expression:
        text = re.sub(de, "", text)

    # Tokenize the text into a flat sequence of surface forms.
    t = Tokenizer()
    # Materialize the result: some janome versions return a generator, which
    # supports neither len() nor slicing.
    words = list(t.tokenize(text, wakati=True))

    # Record, for every word, each word that directly follows it.
    wordDict = {}
    for previousPhrase, nextPhrase in zip(words, words[1:]):
        wordDict.setdefault(previousPhrase, []).append(nextPhrase)
    return wordDict
def retrieveRandomword(wordList):
    """Return one element of ``wordList`` picked uniformly at random.

    Args:
        wordList: A non-empty indexable sequence of candidate words.

    Returns:
        The element found at a randomly drawn index.

    Raises:
        ValueError: If ``wordList`` is empty (there is no index to draw).
    """
    position = random.randrange(len(wordList))
    return wordList[position]
# Read the source text. codecs.open is given errors='ignore' so that bytes
# which cannot be decoded as Shift-JIS are skipped instead of raising.
# The with-block makes sure the file handle is closed after reading.
with codecs.open('/Users/yusuke/Downloads/kokoro.txt', 'r', 'shift-jis', 'ignore') as f:
    text = f.read()
wordDict = buildWordDict(text)

# Maximum number of words in the generated chain, seed word included.
length = 100
chain = ["私"]
for _ in range(1, length):
    # A word has no entry when it never had a successor in the text (for
    # example the seed is absent, or it only occurs as the final token).
    # Stop the chain there instead of failing with KeyError.
    successors = wordDict.get(chain[-1])
    if not successors:
        break
    newWord = retrieveRandomword(successors)
    chain.append(newWord)
print("".join(chain))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment