Created
February 1, 2018 20:28
-
-
Save starovoitovs/042ef58aafd205ad621d29614805de6f to your computer and use it in GitHub Desktop.
Uninspiring quotes with Markov chains
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import itertools | |
import random | |
import nltk | |
def shuffled(language, term):
    """Yield words from a random walk over the word-transition table.

    Starting at ``term``, yield the word of the current ``(word, tag)`` pair
    and then move to a randomly chosen continuation listed in ``language``
    for that pair. The walk ends when the current pair's part-of-speech tag
    is ``"."`` or ``","`` (that pair's word is not yielded), or when the
    current pair has no recorded continuation.

    Args:
        language: mapping from a ``(word, tag)`` pair to a list of the
            ``(word, tag)`` pairs that may follow it.
        term: the ``(word, tag)`` pair to start from.

    Yields:
        The word (first element) of every pair visited before the walk ends.
    """
    # Part-of-speech tags that terminate the sentence.
    stop_tags = (".", ",")
    while term[1] not in stop_tags:
        yield term[0]
        # A pair that only occurs as the very last token of the corpus (or a
        # seed that never occurs in it) has no entry in the table; end the
        # sentence there instead of raising KeyError out of the generator.
        continuations = language.get(term)
        if not continuations:
            return
        term = random.choice(continuations)
def get_sentence(language, seed=("the", "DT")):
    """Return one generated sentence as a single space-separated string.

    Args:
        language: the continuation table passed through to ``shuffled``.
        seed: the ``(word, tag)`` pair the sentence starts from; defaults
            to ``("the", "DT")``.

    Returns:
        The words produced by ``shuffled(language, seed)`` joined with
        single spaces.
    """
    sampled_words = shuffled(language, seed)
    return " ".join(sampled_words)
def get_language(text):
    """Build the table of possible continuations for each tagged word.

    The text is tokenized with ``nltk.word_tokenize``, every token is
    lowercased, and the tokens are tagged with ``nltk.pos_tag``, giving
    pairs such as ``("the", "DT")`` (word and part-of-speech tag).

    Args:
        text: the source text to learn from.

    Returns:
        A dict mapping each tagged pair to the list of tagged pairs that
        directly follow it in the text, in order of occurrence. Keys appear
        in sorted order.
    """
    lowered_tokens = [token.lower() for token in nltk.word_tokenize(text)]
    tagged = nltk.pos_tag(lowered_tokens)

    # Collect, for every pair, the pairs seen immediately after it.
    followers = {}
    for current, following in zip(tagged, tagged[1:]):
        followers.setdefault(current, []).append(following)

    # Emit keys in sorted order so the resulting dict is laid out exactly as
    # a sort-then-group over the adjacent pairs would produce it.
    return {pair: followers[pair] for pair in sorted(followers)}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment