Skip to content

Instantly share code, notes, and snippets.

@kakakaya
Last active September 9, 2023 07:14
Show Gist options
  • Star 8 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save kakakaya/141838e738a2fd9667b5e4fd2b79c4c7 to your computer and use it in GitHub Desktop.
Save kakakaya/141838e738a2fd9667b5e4fd2b79c4c7 to your computer and use it in GitHub Desktop.
Learn from aozora bunko and output markov chain
#!/usr/bin/env python3
# -*- coding:utf-8 -*-
from glob import iglob
import re
import MeCab
import markovify
def load_from_file(files_pattern, *, encoding=None):
    """Read and merge every file matching a glob pattern, then clean the text.

    Args:
        files_pattern: glob pattern (e.g. ``'*.txt'``) selecting the input files.
        encoding: text encoding handed to ``open()``. ``None`` (the default)
            keeps the platform default encoding.

    Returns:
        The stripped contents of all matching files, joined by newlines, with
        the unwanted symbols, ruby readings (``《...》``) and bracketed
        ``[#...]`` annotations removed. An empty string if no file matches.
    """
    # read text; sort the paths because glob yields them in arbitrary order
    chunks = []
    for path in sorted(iglob(files_pattern)):
        with open(path, 'r', encoding=encoding) as f:
            chunks.append(f.read().strip())
    # join with a newline so the last line of one file is not fused with the
    # first line of the next one
    text = '\n'.join(chunks)

    # delete some symbols (single pass instead of chained replace calls)
    text = text.translate(str.maketrans('', '', '\r\u3000-|'))

    # delete aozora bunko notations
    unwanted_patterns = (
        # ruby reading; non-greedy so text between two readings on the same
        # line survives
        r'《.*?》',
        # editorial annotation "[#...]", in ASCII or full-width brackets/sharp
        r'[\[\uff3b][#\uff03].*?[\]\uff3d]',
    )
    for pattern in unwanted_patterns:
        text = re.sub(pattern, '', text)
    return text
def split_for_markovify(text):
    """Tokenize text with MeCab into space-separated words, one sentence per line.

    The input is split on whitespace and every piece is parsed by MeCab.
    Each node's surface is emitted followed by a space, with these exceptions:
    '。' and '、' get no trailing space, '。' is followed by a newline that
    ends the sentence, and surfaces listed in ``breaking_chars`` are dropped
    (their separator space is still emitted).

    Args:
        text: source text to tokenize.

    Returns:
        A string with words separated by spaces and sentences terminated by
        newlines.
    """
    # separate words using mecab
    mecab = MeCab.Tagger()
    # these chars might break markovify
    # https://github.com/jsvine/markovify/issues/84
    breaking_chars = frozenset('()[]"\'')
    # collect fragments and join once instead of growing a string with +=
    pieces = []
    # split whole text into chunks on whitespace, and each chunk into words
    for line in text.split():
        node = mecab.parseToNode(line)
        while node:
            try:
                surface = node.surface
            except UnicodeDecodeError:
                # sometimes error occurs; report the offending chunk and
                # carry on with the next node
                print(line)
            else:
                if surface not in breaking_chars:
                    pieces.append(surface)  # skip markovify breaking chars
                if surface != '。' and surface != '、':
                    pieces.append(' ')  # split words by space
                if surface == '。':
                    pieces.append('\n')  # represent sentence end by newline
            node = node.next
    return ''.join(pieces)
def main():
    """Train a Markov model on ``*.txt`` files, print one sentence, save the model.

    Reads every ``*.txt`` file in the current directory, tokenizes the text,
    builds a ``markovify.NewlineText`` model with ``state_size=3``, prints one
    generated sentence and writes the model to ``learned_data.json``.
    """
    import sys  # local import: only needed for the failure message below

    # load text
    rampo_text = load_from_file('*.txt')
    # split text to learnable form
    splitted_text = split_for_markovify(rampo_text)
    # learn model from text.
    text_model = markovify.NewlineText(splitted_text, state_size=3)
    # ... and generate from model.
    sentence = text_model.make_sentence()
    if sentence is None:
        # make_sentence() gives None when it cannot build an acceptable
        # sentence; report it instead of crashing on None.split()
        print('could not generate a sentence from the model', file=sys.stderr)
    else:
        # need to concatenate space-splitted text
        print(''.join(sentence.split()))
    # save learned data
    with open('learned_data.json', 'w') as f:
        f.write(text_model.to_json())
    # later, if you want to reuse learned data:
    #     with open('learned_data.json') as f:
    #         text_model = markovify.NewlineText.from_json(f.read())
# Run the pipeline only when this file is executed as a script.
if __name__ == "__main__":
    main()
@Moonlight6583
Copy link

私は神になる

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment