Skip to content

Instantly share code, notes, and snippets.

@okapies
Last active June 18, 2020 03:07
Show Gist options
  • Save okapies/3eaf220bcdba2b8dd0893d329549d099 to your computer and use it in GitHub Desktop.
Save okapies/3eaf220bcdba2b8dd0893d329549d099 to your computer and use it in GitHub Desktop.
Japanese text generator using a Markov chain algorithm
# -*- coding: utf-8 -*-
# ref. https://qiita.com/k-jimon/items/f02fae75e853a9c02127
from collections import deque, defaultdict, Counter
from itertools import chain, islice, takewhile
import MeCab
import os
import random
import re
import sys
from typing import Deque, Dict, Iterable, List, Tuple
# Number of tokens in one Markov state, i.e. the length of every model key.
ORDER = 2
# Number of sentences requested from gen_sentences() by main().
SENTENCE_NUM = 5
# Option string handed to MeCab.Tagger: wakati (space-separated) output and
# the dictionary directory to load. The path is specific to the author's
# machine.
MECAB_OPTS = ' '.join([
    '-Owakati',
    '-d /usr/lib/x86_64-linux-gnu/mecab/dic/mecab-ipadic-neologd',
])
# Matches a single whitespace character. Written as a raw string so that
# `\s` reaches the regex engine without being an invalid str escape.
SPACES = re.compile(r'[\s]')
# Token window -> every token observed right after it (duplicates kept).
TrainingModel = Dict[Tuple[str, ...], List[str]]
# Token window -> (distinct following tokens, their occurrence counts).
InferenceModel = Dict[Tuple[str, ...], Tuple[List[str], List[int]]]
def train_model(
    texts: Iterable[str],
    order: int,
    tagger: MeCab.Tagger,
) -> TrainingModel:
    """Build the transition table of an ``order``-token Markov chain.

    Args:
        texts: Raw input texts; they are tokenized with ``wakati``.
        order: Number of tokens per state (length of each model key).
        tagger: MeCab tagger forwarded to ``wakati``.

    Returns:
        A mapping from each observed token window to the list of tokens
        that followed it, one entry per occurrence.
    """
    model: Dict[Tuple[str, ...], list] = defaultdict(list)
    # Force a one-shot iterator: the head tokens taken by islice() below
    # must not be replayed by the for-loop if the tokenizer ever returns a
    # re-iterable container such as a list.
    words = iter(wakati(texts, tagger))
    # The first window is the sentence marker plus the first order-1 tokens.
    head_words = ['[BOS]'] + list(islice(words, order - 1))
    queue = deque(head_words, order)
    for markov_value in words:
        # A full stop closed the previous sentence: record the marker as a
        # regular token so the chain learns how sentences begin.
        if queue[-1] == '。':
            update_model(model, queue, '[BOS]')
        update_model(model, queue, markov_value)
    # NOTE: the window left in `queue` after the last token gets no
    # follower recorded here.
    return model
def wakati(texts: Iterable[str], tagger: MeCab.Tagger) -> Iterable[str]:
    """Lazily tokenize ``texts`` into one flat stream of tokens.

    Each text is cut into spans matching ``.*?。``; because ``.`` does not
    match a newline, a span never crosses a line break, and text that is
    not followed by ``。`` on its line is not matched at all. Whitespace
    is removed from every span before it is handed to ``tagger.parse``.

    Args:
        texts: Raw input texts.
        tagger: MeCab tagger; its output is expected to be
            whitespace-separated tokens (``-Owakati``).

    Returns:
        An iterator over the tokens of all sentences, in input order.
    """
    return chain.from_iterable(
        # Wakati output puts a space after every token before the final
        # newline, so splitting on a literal ' ' would leave an empty
        # trailing token in the stream. A whitespace-agnostic split() drops
        # it; the tokens themselves hold no whitespace because it was
        # stripped from the sentence above.
        tagger.parse(SPACES.sub('', sentence)).split()
        for text in texts
        for sentence in re.findall(".*?。", text)
    )
def update_model(model: TrainingModel, queue: Deque[str], value: str) -> None:
    """Record ``value`` as a follower of the current window, then advance.

    Args:
        model: Transition table, mutated in place.
        queue: Current token window, mutated in place.
        value: Token observed right after the window.
    """
    # Snapshot the window as a tuple before it is mutated by the append.
    model[tuple(queue)].append(value)
    queue.append(value)
def optimize_model(model: TrainingModel) -> InferenceModel:
    """Compress follower lists into (distinct tokens, counts) pairs.

    Args:
        model: Mapping from token window to the list of tokens seen after it.

    Returns:
        A new dict with the same keys. Each value is a tuple of two parallel
        tuples: the distinct followers in first-seen order and how often
        each occurred. An empty follower list maps to an empty tuple.
    """
    optimized = {}
    for window, followers in model.items():
        tally = Counter(followers)
        # Transpose [(token, count), ...] into ((tokens...), (counts...)).
        optimized[window] = tuple(zip(*tally.items()))  # type: ignore
    return optimized
def gen_sentences(
    model: InferenceModel,
    order: int,
    sentence_num: int,
    seed: str = '[BOS]',
    max_words: int = 100,
) -> Iterable[str]:
    """Lazily generate ``sentence_num`` sentences from ``model``.

    Args:
        model: Transition table produced by ``optimize_model``.
        order: Window length used for the generation queue.
        sentence_num: Number of sentences to yield.
        seed: Token the starting window must begin with.
        max_words: Upper bound on tokens drawn per sentence.

    Returns:
        An iterator of sentences. If no key starts with ``seed``, a message
        is written to stderr and an empty iterator is returned.
    """
    starts = list(filter(lambda key: key[0] == seed, model))
    if len(starts) == 0:
        print('keyword not found', file=sys.stderr)
        return iter([])

    # One window is shared by all sentences, so each sentence continues
    # from the state in which the previous one stopped.
    window = deque(random.choice(starts), order)
    rounds = range(sentence_num)

    def sentences():
        for _ in rounds:
            yield gen_sentence(model, window, max_words)

    return sentences()
def gen_sentence(
    model: InferenceModel,
    queue: Deque[str],
    max_words: int,
) -> str:
    """Generate one sentence by repeatedly stepping the chain.

    Tokens are drawn with ``gen_word`` until a ``。`` is produced or
    ``max_words`` tokens have been drawn. ``[BOS]`` markers are left out of
    the text, and the result always ends with a single ``。``.

    Args:
        model: Transition table produced by ``optimize_model``.
        queue: Current token window; advanced in place.
        max_words: Maximum number of tokens to draw.

    Returns:
        The generated sentence.
    """
    pieces = []
    for _ in range(max_words):
        token = gen_word(model, queue)
        if token == '。':
            # The terminator is consumed here and re-added once below.
            break
        if token != '[BOS]':
            pieces.append(token)
    pieces.append('。')
    return ''.join(pieces)
def gen_word(model: InferenceModel, queue: Deque[str]) -> str:
    """Advance the chain by one token.

    A follower of the current window is drawn at random, weighted by its
    count, and appended to ``queue``.

    Args:
        model: Maps a window to (followers, counts).
        queue: Current token window; mutated in place.

    Returns:
        The first token of the window as it was before the append, not the
        newly drawn token.
    """
    state = tuple(queue)
    followers = model[state]
    picked = random.choices(*followers)
    queue.append(picked[0])
    return state[0]
def main():
    """Train on the text read from stdin and print generated sentences."""
    tagger = MeCab.Tagger(MECAB_OPTS)
    corpus = sys.stdin.read()
    chain_model = optimize_model(train_model([corpus], ORDER, tagger))
    for line in gen_sentences(chain_model, ORDER, SENTENCE_NUM):
        print(line)


if __name__ == "__main__":
    main()
$ cat dazai.txt | python markov.py
私はこれまで、こんな不思議な表情の子供を見た事が、その子供の笑顔は、最も奇怪なのである。
そうして、ただもう不愉快、イライラしていた。
軽薄と言っても足りない。
あ、こんな不思議な美貌の学生にも、まんざら空お世辞に聞えないくらいの、まことに奇妙な、そうして、どこか怪談じみた気味悪いものが感ぜられて来るのである。
所謂「死相」というものになるであろうか、思い出した、というようなよろこびさえ無い。
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment