Skip to content

Instantly share code, notes, and snippets.

@shaldengeki
Created June 25, 2013 01:04
Show Gist options
  • Save shaldengeki/5855100 to your computer and use it in GitHub Desktop.
markov post model. horribly unoptimized but you know whatever
import bs4
import re
import random
# Matches a single apostrophe, for deleting apostrophes before tokenizing.
# Only referenced by the commented-out normalization step in tokenize().
APOSTROPHE_REGEX = re.compile("'")
# Matches runs of non-alphanumeric characters, for collapsing punctuation
# into spaces.  Also only used by the commented-out step in tokenize().
NON_ALPHANUMERIC_REGEX = re.compile('[^a-zA-Z0-9]+')
def strip_tags(text, valid_tags):
    """Strip markup from ``text``, keeping only tags named in ``valid_tags``.

    Tags whose name appears in ``valid_tags`` (membership test) are replaced
    by the flattened text of their contents (recursively stripped); all other
    tags are removed entirely, contents included.

    Returns the result as a unicode string.
    """
    text = bs4.BeautifulSoup(text)
    # Descend through single-child wrappers until we reach a node with
    # multiple children; if we bottom out at a bare string, step back up
    # to its parent tag so findChildren() below is valid.
    while len(text.contents) == 1:
        text = text.contents[0]
        if isinstance(text, bs4.NavigableString):
            text = text.parent
            break
    for child in text.findChildren(True):
        if child.name in valid_tags:
            # Flatten an allowed tag into the stripped text of its contents.
            flattened = ""
            for piece in child.contents:
                if not isinstance(piece, bs4.NavigableString):
                    piece = strip_tags(unicode(piece), valid_tags)
                flattened += unicode(piece)
            child.replaceWith(flattened)
        else:
            # Disallowed tag: drop it and everything inside it.
            child.extract()
    return unicode(text)
def tokenize(text, valid_tags=False):
    """Strip markup from ``text`` and split it into word tokens.

    Args:
        text: a markup (HTML-ish) document string.
        valid_tags: falsy for the default allowed set ({'b', 'i', 'u'});
            a list of tag names to allow; or any container of tag names
            supporting ``in`` membership (dict, set).

    Returns:
        List of whitespace-delimited tokens from the tag-stripped text.
    """
    if not valid_tags:
        valid_tags = {'b': 1, 'i': 1, 'u': 1}
    elif isinstance(valid_tags, list):
        # Convert a list of tag names to the dict form used internally.
        valid_tags = dict.fromkeys(valid_tags, 1)
    # else: pass the container through unchanged.  Previously any truthy
    # non-list value was silently replaced by [], which stripped ALL tags.
    text = strip_tags(text, valid_tags)
    # A second BeautifulSoup pass collapses what's left to plain text.
    text = bs4.BeautifulSoup(text).text
    #text = re.sub(APOSTROPHE_REGEX, '', text)
    #text = re.sub(NON_ALPHANUMERIC_REGEX, ' ', text)
    return text.split()
class MarkovModel(object):
    """First-order Markov chain over word tokens in a set of documents.

    Typical usage::

        model = MarkovModel()
        model.store(docs)
        model.phrases(num=5)

    Internal state:
      docs     -- list of raw document strings.
      _tokens  -- dict mapping word -> {next_word: raw count}.  The empty
                  string '' serves as both the start-of-sentence state and
                  the end-of-sentence token.
      _freqs   -- dict mapping word -> list of (next_word, cumulative
                  frequency) pairs, used for roulette-wheel sampling.
    """

    def __init__(self, min_freq=0.01):
        # min_freq: transitions rarer than this fraction of a word's total
        # outgoing count are pruned during normalize().
        self.docs = []
        self.min_freq = float(min_freq)
        self._tokens = self._freqs = self._model = None

    def reset(self):
        """Clear stored documents and all derived state.  Returns self."""
        self.docs = []
        # Also clear _freqs: the original left it stale, so phrases() after
        # reset() would sample from the old documents' model.
        self._tokens = self._freqs = self._model = None
        return self

    def store(self, docs):
        """Replace the document store with the given documents.  Returns self."""
        # Materialize into a list: the original stored a generator, which
        # is exhausted after one pass and has no append() for add() to use.
        self.docs = list(docs)
        return self

    def add(self, doc):
        """Append a single document to the store."""
        self.docs.append(doc)

    def tokenize(self, docs=None):
        """Build the raw transition-count table (self._tokens).

        Args:
            docs: optional iterable of documents; defaults to self.docs.
                (The original accepted but silently ignored this parameter.)

        Returns:
            self, for chaining.
        """
        if docs is None:
            docs = self.docs
        self._tokens = {}
        for doc in docs:
            words = tokenize(doc)  # module-level tokenizer
            if not words:
                continue
            prev = ''  # '' marks the start-of-sentence state
            for word in words:
                counts = self._tokens.setdefault(prev, {})
                counts[word] = counts.get(word, 0) + 1
                prev = word
            # Record the sentence end as a transition to ''.
            last = self._tokens.setdefault(words[-1], {})
            last[''] = last.get('', 0) + 1
        return self

    def normalize(self, min_freq=None):
        """Convert raw counts into cumulative frequency tables (self._freqs).

        Transitions whose count falls below min_freq * (word's total
        outgoing count) are discarded; the survivors are renormalized so
        their frequencies sum to 1.0 and stored cumulatively for sampling.

        Args:
            min_freq: pruning threshold; defaults to self.min_freq.

        Returns:
            self, for chaining.
        """
        if min_freq is None:
            min_freq = self.min_freq
        if self._tokens is None:
            self.tokenize()
        self._freqs = {}
        for word, nexts in self._tokens.items():
            total = sum(nexts.values())
            min_count = int(min_freq * total)
            # Filter first, then re-sum, so surviving frequencies total 1.0.
            kept = [w for w in nexts if nexts[w] >= min_count]
            kept_total = float(sum(nexts[w] for w in kept))
            self._freqs[word] = []
            running = 0.0
            for w in kept:
                running += nexts[w]
                self._freqs[word].append((w, running / kept_total))
        return self

    def phrases(self, num=1, word=None):
        """Print ``num`` Markov-generated phrases.

        Args:
            num: number of phrases to generate.
            word: optional seed word; when None, a start word is sampled
                from the start-of-sentence ('') distribution.
        """
        if self._freqs is None:
            self.normalize()
        random.seed()
        for _ in range(num):
            current = word
            # Roulette-wheel sample a starting word from the '' state.
            while current is None:
                roll = random.random()
                for candidate, cum_freq in self._freqs['']:
                    if roll <= cum_freq:
                        current = candidate
                        break
            sentence = [current]
            # Walk the chain until the end-of-sentence token ''.
            # (Fixed: the original compared with `is not ''`, an identity
            # test against a string literal.)
            while current != '':
                if current not in self._freqs:
                    break
                roll = random.random()
                advanced = False
                for candidate, cum_freq in self._freqs[current]:
                    if roll < cum_freq:
                        sentence.append(candidate)
                        current = candidate
                        advanced = True
                        break
                if not advanced:
                    # All of this word's transitions were pruned; end the
                    # phrase instead of looping forever (original bug).
                    break
            print(' '.join(sentence).strip())
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment