Skip to content

Instantly share code, notes, and snippets.

What would you like to do?
import numpy as np
import glob
import markdown
import re
from bs4 import BeautifulSoup
def make_pairs(corpus):
    """Yield each pair of adjacent items in *corpus*, in order.

    Args:
        corpus: A sliceable sequence (e.g. a list of words).

    Yields:
        Tuples ``(corpus[i], corpus[i + 1])`` for every valid ``i``.
        Nothing is yielded when *corpus* has fewer than two items.
    """
    # zip stops at the shorter operand, so the last item is never paired
    # with a non-existent successor.
    yield from zip(corpus, corpus[1:])
def visible(element):
    """Return True if a parsed text node should count as readable text.

    Args:
        element: A text node such as those produced by
            ``soup.findAll(text=True)``; it must convert to its text via
            ``str()`` and is expected to expose ``.parent.name``.

    Returns:
        False when the node sits inside a non-rendered tag (style, script,
        document root, head, title) or when its text starts with an HTML
        comment; True otherwise.
    """
    # The operand of this membership test was missing in the original
    # (``if in [...]``); the tag-name list implies the parent tag's name.
    parent = getattr(element, 'parent', None)
    if getattr(parent, 'name', None) in ['style', 'script', '[document]', 'head', 'title']:
        return False
    # Match on the text itself: in Python 3, str(bytes) yields "b'...'",
    # so matching against the encoded form could never succeed.
    # NOTE(review): BeautifulSoup Comment nodes may not carry the
    # "<!-- -->" delimiters in str(); an isinstance(element, Comment)
    # check may be needed as well -- confirm against the bs4 docs.
    if re.match(r'<!--.*-->', str(element)):
        return False
    return True
# --- Build the corpus: every visible word from every markdown post. ---
corpus = []
for f in glob.glob("path/to/blogposts/*.md"):
    # Context manager so the file handle is closed even if parsing fails.
    with open(f, encoding='utf8') as post:
        data = post.read()
    # Posts are expected to start with a header block terminated by a blank
    # line. If no blank line exists, treat the whole file as body instead
    # of crashing on tuple unpacking.
    header, separator, rest = data.partition("\n\n")
    if not separator:
        rest = data
    html = markdown.markdown(rest)
    soup = BeautifulSoup(html, "lxml")
    data = soup.findAll(text=True)
    result = filter(visible, data)
    corpus.extend(" ".join(result).split())

# --- Build the transition table: word -> list of words seen right after. ---
pairs = make_pairs(corpus)
word_dict = {}
for word_1, word_2 in pairs:
    # The original only assigned when the key already existed, so the table
    # stayed empty. Followers are appended (duplicates kept) so that more
    # frequent successors are proportionally more likely to be sampled.
    word_dict.setdefault(word_1, []).append(word_2)

# --- Generate text by walking the chain from a capitalized word. ---
# str.split() never yields empty strings, so word[0] is always valid.
starters = [x for x in corpus if "A" <= x[0] <= "Z"]
if not starters:
    raise SystemExit("No capitalized word found in the corpus; nothing to generate.")
first_word = np.random.choice(starters)
chain = [first_word]
n_words = 30
for i in range(n_words):
    # A word that only occurs at the very end of the corpus has no recorded
    # follower; stop early rather than raise KeyError.
    followers = word_dict.get(chain[-1])
    if not followers:
        break
    chain.append(np.random.choice(followers))
print(' '.join(chain))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment