Skip to content

Instantly share code, notes, and snippets.

@ktibb
Created April 11, 2012 02:47
Show Gist options
  • Save ktibb/2356525 to your computer and use it in GitHub Desktop.
Save ktibb/2356525 to your computer and use it in GitHub Desktop.
Markov code applied to the text of 1984
import urllib
import BeautifulSoup
import re
from random import choice
#import context_free
import sys
import markov
# Download the page and always release the HTTP handle, even if read() fails.
response = urllib.urlopen('http://www.george-orwell.org/1984/0.html')
try:
    html = response.read()
finally:
    response.close()

soup = BeautifulSoup.BeautifulSoup(html)

# Walk a fixed path through the page's nested layout tables down to one node.
# NOTE(review): this path is tied to the exact markup of the page; presumably
# the selected node holds the chapter text -- confirm against the live page.
# Any change to the site's layout will make this raise (AttributeError or
# IndexError) instead of returning the wrong node.
texts = soup.html.body.table.contents[1].td.contents[3].tr.contents[3].table.contents[5].tr.td.table.tr.td.tr.td.table.contents[3].td.contents[4]
def visible(element):
    """Return True if *element* should be treated as visible page text.

    An element is rejected when its parent's ``name`` is one of ``style``,
    ``script``, ``[document]``, ``head`` or ``title``, or when its string
    form starts with an HTML comment (``<!-- ... -->``).

    :param element: a parse-tree node exposing ``parent.name``; ``str()`` of
        the node is what gets checked for comment markup.
    :returns: ``False`` for the rejected cases above, ``True`` otherwise.
    """
    if element.parent.name in ['style', 'script', '[document]', 'head', 'title']:
        return False
    # re.DOTALL lets ``.`` cross newlines, so comments spanning several lines
    # are rejected as well as single-line ones.
    if re.match('<!--.*-->', str(element), re.DOTALL):
        return False
    return True
# Keep only the children of the selected node that pass visible(). On
# Python 2, which this script targets, filter() returns a list.
visible_texts = filter(visible, texts)
if not visible_texts:
    # Exit with a readable message instead of a bare IndexError on [0] below.
    sys.exit('No visible text found in the downloaded page.')

# The text of the first visible child is what gets fed to the generator.
text = visible_texts[0].getText()

generator = markov.MarkovGenerator(n=2, max=500)
# Feed the text in chunks split on '. ', each encoded to a UTF-8 byte string.
for line in text.split('. '):
    generator.feed(line.encode('utf-8'))

# A single parenthesised argument prints exactly like the print statement.
print(generator.generate())
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment