Skip to content

Instantly share code, notes, and snippets.

@ktibb
Created April 9, 2012 14:57
Show Gist options
  • Save ktibb/2344051 to your computer and use it in GitHub Desktop.
Save ktibb/2344051 to your computer and use it in GitHub Desktop.
Not Working: feeding webtext into Markov code
import urllib
import BeautifulSoup
import re
from random import choice
#import context_free
import sys
import markov
# Download the page and parse it into a BeautifulSoup tree.
html = urllib.urlopen('http://www.george-orwell.org/1984/0.html').read()
soup = BeautifulSoup.BeautifulSoup(html)

# Walk the nested layout tables down to one specific node of the page.
# NOTE(review): this path is hard-coded positional navigation; presumably it
# targets the cell holding the body text -- confirm against the live page,
# since any markup change on the site will break it.
texts = (
    soup.html.body.table.contents[1]
    .td.contents[3]
    .tr.contents[3]
    .table.contents[5]
    .tr.td.table.tr.td.tr.td.table.contents[3]
    .td.contents[4]
)
def visible(element):
    """Return True if *element* should be treated as visible page text.

    An element is rejected when its parent's ``name`` is one of 'style',
    'script', '[document]', 'head' or 'title', or when its string form is an
    HTML comment (``<!-- ... -->``). Everything else is accepted.
    """
    # A detached node has no parent; treat it as having no parent name
    # instead of failing on ``None.name``.
    parent_name = getattr(element.parent, 'name', None)
    if parent_name in ['style', 'script', '[document]', 'head', 'title']:
        return False
    # DOTALL lets '.' cross newlines, so comments spanning several lines are
    # recognised as well as single-line ones.
    if re.match(r'<!--.*-->', str(element), re.DOTALL):
        return False
    return True
# Keep only the children of the selected node that count as visible text.
# A list comprehension (rather than filter) guarantees an indexable list.
visible_texts = [element for element in texts if visible(element)]
text = visible_texts[0].getText()

generator = markov.MarkovGenerator(n=5, max=500)

# ``text`` is one string. Iterating over it directly yields single
# characters, each of which feed() throws away as shorter than n tokens, so
# the generator never learns anything. Feed it line by line instead.
# Encode once up front, then split, so every line handed to feed() is a
# UTF-8 byte string as before.
for stringline in text.encode('utf-8').splitlines():
    if stringline.strip():  # skip blank lines; they carry no tokens
        generator.feed(stringline)

print(generator.generate())
def feed(self, text):
    """Learn n-gram transitions from one line of text.

    The line is split with ``self.tokenize``. If it has fewer than ``self.n``
    tokens it is ignored. Otherwise its first ``self.n`` tokens are appended
    to ``self.beginnings`` as a tuple, and for every run of ``self.n``
    consecutive tokens that has a successor, that successor is appended to
    the list stored under the run (as a tuple) in ``self.ngrams``.
    """
    tokens = self.tokenize(text)
    order = self.n

    # discard this line if it's too short
    if len(tokens) < order:
        return

    # store the first ngram of this line
    self.beginnings.append(tuple(tokens[:order]))

    # tokens[order:] are exactly the tokens that have a full gram before
    # them; the gram preceding the follower at offset ``start`` begins at
    # ``start`` in the full token sequence.
    for start, follower in enumerate(tokens[order:]):
        gram = tuple(tokens[start:start + order])
        # setdefault creates the list the first time this gram is seen
        self.ngrams.setdefault(gram, []).append(follower)
# generate a text from the information in self.ngrams
def generate(self):
    """Return one randomly generated text built from the learned n-grams.

    Starts from a random entry of ``self.beginnings`` and then, at most
    ``self.max`` times, looks up the last ``self.n`` output tokens in
    ``self.ngrams`` and appends a random recorded successor. Generation
    stops early when the current n-gram has no recorded successor. The
    collected tokens are passed to ``self.concatenate`` and its result is
    returned.

    Raises:
        IndexError: if ``self.beginnings`` is empty, i.e. nothing usable has
            been fed yet.
    """
    from random import choice

    # choice() on an empty sequence raises an opaque error; raise the same
    # exception type with a message that points at the real cause.
    if not self.beginnings:
        raise IndexError(
            "cannot generate text: no line beginnings recorded "
            "(every fed line was shorter than n tokens?)"
        )

    # get a random line beginning; convert to a list.
    current = choice(self.beginnings)
    output = list(current)

    for _ in range(self.max):
        if current not in self.ngrams:
            break
        output.append(choice(self.ngrams[current]))
        # the last N entries of the output form the ngram to look up in the
        # next iteration of the loop
        current = tuple(output[-self.n:])

    return self.concatenate(output)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment