Skip to content

Instantly share code, notes, and snippets.

Embed
What would you like to do?
Markov chain n-gram language model generator
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import sys
import argparse
import random
from pathlib import Path
from itertools import chain
# fname = Path('~/Downloads/vpylm/out_2018-04-01T040258.txt')
def Markov_table(wakati, n=2, stop='\n'):
    """Build an n-gram transition table from a sequence of words.

    Args:
        wakati: sequence of words (strings), in corpus order.
        n: n-gram order; each key holds the preceding ``n - 1`` words.
        stop: accepted for interface compatibility; it does not influence
            the resulting table.

    Returns:
        A dict mapping each ``(n - 1)``-word tuple to the list of words
        observed immediately after it (duplicates are kept, so frequent
        followers appear more often).

    Raises:
        ValueError: if ``n`` is smaller than 2.
    """
    if n < 2:
        raise ValueError('n must be >= 2 integer')
    order = n - 1
    table = {}
    # The first (n - 1) words seed the sliding context window.
    context = tuple(wakati[:order])
    for token in wakati[order:]:
        table.setdefault(context, []).append(token)
        # Slide the window: drop the oldest word, append the newest.
        context = context[1:] + (token,)
    return table
def gen_phrase(Markov, stop='\n', Max=10000, seed=None):
    """Generate one phrase by a random walk over an n-gram table.

    Args:
        Markov: dict mapping word tuples to lists of following words
            (the structure returned by ``Markov_table``).
        stop: stop word; the walk starts from a key whose first element
            equals it and ends as soon as it is drawn.
        Max: maximum length, in characters, of the returned string.
        seed: if not None, the module-level ``random`` generator is
            re-seeded with it before any word is drawn.

    Returns:
        The generated words joined by single spaces. When the walk ends
        by drawing the stop word, the stop word is the last element.

    Raises:
        IndexError: if no key in ``Markov`` starts with ``stop``.
    """
    if seed is not None:
        random.seed(seed)
    # Choose the initial context among the keys that begin with the stop word.
    starts = [key for key in Markov if key[0] == stop]
    if not starts:
        raise IndexError('no n-gram in the table starts with the stop word')
    w = random.choice(starts)
    n = len(w)
    # The leading stop word only marks the start; it is not part of the output.
    phrase = list(w)[1:]
    plen = len(' '.join(phrase))
    while plen <= Max:
        candidates = Markov.get(w)
        if not candidates:
            # Dead end: this context was never followed by anything in the
            # corpus, so finish the phrase instead of raising KeyError.
            break
        nw = random.choice(candidates)
        # A separating space is only needed when the phrase is non-empty.
        cost = len(nw) + (1 if phrase else 0)
        if plen + cost > Max:
            break
        phrase.append(nw)
        if nw == stop:
            break
        plen += cost
        w = tuple(phrase[-n:])
    return ' '.join(phrase)
if __name__ == '__main__':
    # Command-line entry point: read a space-separated corpus, build the
    # n-gram table once, and print the requested number of phrases.
    parser = argparse.ArgumentParser()
    # The input file is mandatory; without it there is nothing to read.
    parser.add_argument('-f',
                        type=str,
                        required=True,
                        help='入力テキスト')
    parser.add_argument('-n',
                        type=int,
                        default=10,
                        help='生成する文章の数.')
    parser.add_argument('--n-gram',
                        type=int,
                        default=2,
                        help='n-gram の n.')
    parser.add_argument('-s', '--stop-word',
                        type=str,
                        default='\n',
                        help='ストップワード. デフォルト: \\n')
    parser.add_argument('-M', '--max',
                        type=int,
                        default=10000,
                        help='文章の最大文字数. デフォルト:10000')
    parser.add_argument('-S', '--seed',
                        type=int,
                        help='乱数の種')
    args = parser.parse_args()

    with Path(args.f).expanduser().open('r') as f:
        # Split every line on single spaces and flatten into one word list.
        # Line endings are kept, so a line ending in " \n" yields a
        # standalone '\n' token.
        words = list(chain.from_iterable(line.split(' ') for line in f))

    # The table depends only on the corpus and options, so build it once
    # rather than once per generated phrase.
    table = Markov_table(words, n=args.n_gram, stop=args.stop_word)

    # Derive one seed per phrase from the master seed, so a run with a fixed
    # --seed is reproducible while each phrase still differs.
    random.seed(args.seed)
    seeds = [random.randint(-sys.maxsize - 1, sys.maxsize)
             for _ in range(args.n)]
    for s in seeds:
        print(gen_phrase(table, stop=args.stop_word, Max=args.max, seed=s))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
You can’t perform that action at this time.