starovoitovs/markov.py

## markov.py
import itertools
import random
import nltk


def shuffled(language, term):
    """Yield the words of a random walk through ``language`` starting at ``term``.

    Args:
        language: mapping from a ``(word, tag)`` pair to the list of
            ``(word, tag)`` pairs that may follow it.
        term: the ``(word, tag)`` pair the walk starts from.

    Yields:
        The word (first element) of every visited pair, stopping before the
        first pair whose tag is ``"."`` or ``","``, or after a pair that has
        no continuation in ``language``.
    """

    # while part of speech tag is not a punctuation mark
    while term[1] not in (".", ","):
        yield term[0]

        # a pair without any recorded continuation (e.g. the last token of the
        # corpus) ends the sentence instead of raising KeyError / IndexError
        followers = language.get(term)
        if not followers:
            return
        term = random.choice(followers)


def get_sentence(language, seed=("the", "DT")):
    """Return the words sampled by ``shuffled`` from ``seed`` as one string.

    Args:
        language: continuation mapping passed through to ``shuffled``.
        seed: the ``(word, tag)`` pair the sentence starts with.

    Returns:
        The sampled words separated by single spaces.
    """

    # the generator is consumed lazily by join, one sampled word at a time
    sampled_words = shuffled(language, seed)
    return " ".join(sampled_words)


def get_language(text):
    """Build the continuation table of ``text``.

    Args:
        text: raw text to tokenize with ``nltk.word_tokenize``.

    Returns:
        A dict mapping each ``(word, tag)`` pair to the list of
        ``(word, tag)`` pairs that directly follow it in ``text``.
    """

    # lower-case every token, then tag it: yields pairs like ("the", "DT")
    tokens = [token.lower() for token in nltk.word_tokenize(text)]
    tagged = nltk.pos_tag(tokens)

    def preceding(bigram):
        # grouping key: the first (preceding) pair of a bigram
        return bigram[0]

    # groupby only merges adjacent items, so order the bigrams by that key first
    bigrams = sorted(zip(tagged, tagged[1:]), key=preceding)

    # collect, per preceding pair, every pair that followed it
    language = {}
    for current, members in itertools.groupby(bigrams, preceding):
        language[current] = [following for _, following in members]
    return language
	import itertools
	import random
	import nltk


	def shuffled(language, term):
		"""Yield the words of a random walk through ``language`` from ``term``.

		``language`` maps a ``(word, tag)`` pair to the list of pairs that may
		follow it. The walk stops before the first pair tagged ``"."`` or
		``","``, or after a pair that has no continuation.
		"""

		# while part of speech tag is not a punctuation mark
		while term[1] not in (".", ","):
			yield term[0]

			# stop cleanly on a pair with no continuation rather than raising
			followers = language.get(term)
			if not followers:
				return
			term = random.choice(followers)


	def get_sentence(language, seed=("the", "DT")):
		"""Return the words ``shuffled`` samples from ``seed``, joined by spaces."""

		# use generator to sample words and join with space
		return " ".join(shuffled(language, seed))


	def get_language(text):
		"""Map each ``(word, tag)`` pair of ``text`` to the pairs that follow it.

		``text`` is tokenized with ``nltk.word_tokenize``, lower-cased and
		tagged with ``nltk.pos_tag``. The result is a dict whose values are
		lists of the directly following ``(word, tag)`` pairs.
		"""

		# splits text into pairs like ("the", "DT"); word and corresponding part-of-speech tag
		words = nltk.pos_tag([word.lower() for word in nltk.word_tokenize(text)])

		# create pairs of subsequent words and group by first element (preceding word)
		groups = itertools.groupby(sorted(zip(words, words[1:]), key=lambda x: x[0]), lambda x: x[0])

		# return dictionary with possible continuations
		return {k: [x[1] for x in v] for k, v in groups}