Skip to content

Instantly share code, notes, and snippets.

@stuartlangridge
Created June 15, 2018 00:56
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save stuartlangridge/be7b608c5a0d1a3356082c8efdecfc7b to your computer and use it in GitHub Desktop.
Save stuartlangridge/be7b608c5a0d1a3356082c8efdecfc7b to your computer and use it in GitHub Desktop.
import glob
import re

import markdown
import numpy as np
from bs4 import BeautifulSoup, Comment
def make_pairs(corpus):
    """Yield every pair of adjacent items in *corpus*, in order.

    For ``[a, b, c]`` this yields ``(a, b)`` and then ``(b, c)``.  An input
    with fewer than two items yields nothing.  Any iterable is accepted; it
    is walked once and never copied or indexed.
    """
    items = iter(corpus)
    try:
        previous = next(items)
    except StopIteration:
        # Empty input: there are no pairs to produce.
        return
    for current in items:
        yield (previous, current)
        previous = current
def visible(element):
    """Return True if *element* is a text node that should go into the corpus.

    A node is rejected when its parent tag is one of ``style``, ``script``,
    ``[document]``, ``head`` or ``title``, or when the node is an HTML
    comment.

    *element* is expected to be a Beautiful Soup string node, i.e. something
    with a ``parent`` attribute whose ``name`` is the enclosing tag name.
    """
    if element.parent.name in ('style', 'script', '[document]', 'head', 'title'):
        return False
    # Comments are detected by type rather than by matching '<!--...-->'
    # against the node text: on Python 3 str(bytes) starts with "b'", so the
    # old anchored regex could never match, and a Comment's text does not
    # include the delimiters in the first place.
    return not isinstance(element, Comment)
# Build the word corpus from every Markdown post, then print a short random
# chain of words in which each word is drawn from the words that followed
# the previous one somewhere in the corpus.
corpus = []
for f in glob.glob("path/to/blogposts/*.md"):
    # The context manager closes the file even if reading fails.
    with open(f, encoding='utf8') as post:
        data = post.read()
    # Everything up to the first blank line is treated as a header and
    # dropped (presumably post metadata -- confirm against the real posts).
    # partition() never raises: a file with no blank line simply
    # contributes no body text instead of aborting the whole run.
    _, _, rest = data.partition("\n\n")
    html = markdown.markdown(rest)
    soup = BeautifulSoup(html, "lxml")
    data = soup.findAll(text=True)
    result = filter(visible, data)
    corpus += " ".join(result).split()

# Map each word to the list of words seen immediately after it. Duplicates
# are kept on purpose so that frequent successors are picked more often.
pairs = make_pairs(corpus)
word_dict = {}
for word_1, word_2 in pairs:
    word_dict.setdefault(word_1, []).append(word_2)

# Start from a word beginning with an ASCII capital letter. Words produced
# by split() are never empty, so indexing x[0] is safe.
starters = [x for x in corpus if x[0] in "ABCDEFGHIJKLMNOPQRSTUVWXYZ"]
if not starters:
    raise SystemExit("No capitalised words found in the corpus; nothing to generate.")
first_word = np.random.choice(starters)
chain = [first_word]
n_words = 30
for _ in range(n_words):
    # A word that only ever appears at the very end of the corpus has no
    # recorded successor; end the chain there rather than raise KeyError.
    followers = word_dict.get(chain[-1])
    if not followers:
        break
    chain.append(np.random.choice(followers))
print(' '.join(chain))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment