# ktibb/Web_to_Markov.py

## Web_to_Markov.py
import urllib
import BeautifulSoup
import re
from random import choice
#import context_free
import sys
import markov


# Download the page at the URL below and parse it into a BeautifulSoup tree.
html = urllib.urlopen(
    'http://www.george-orwell.org/1984/0.html'
).read()
soup = BeautifulSoup.BeautifulSoup(html)

# Walk the page's nested tables hop by hop down to one specific node.  The
# path is hard-coded, so it only works while the page keeps this structure.
# (soup.findAll(text=True) is the alternative that collects every text node.)
texts = (
    soup.html.body.table.contents[1]
    .td.contents[3]
    .tr.contents[3]
    .table.contents[5]
    .tr.td.table.tr.td.tr.td.table.contents[3]
    .td.contents[4]
)

def visible(element):
    """Return True when *element* should be treated as visible page text.

    An element is rejected when the ``name`` of its parent is one of the
    containers listed below, or when its string form is an HTML comment
    (``<!-- ... -->``).  Anything else is accepted.

    element: any object exposing ``parent.name`` and convertible with
        ``str()`` (presumably a BeautifulSoup node -- confirm with callers).
    """
    if element.parent.name in ('style', 'script', '[document]', 'head', 'title'):
        return False
    # re.DOTALL lets ``.`` cross newlines, so a comment that spans several
    # lines is rejected as well, not only single-line comments.
    if re.match('<!--.*-->', str(element), re.DOTALL):
        return False
    return True

# Keep only the nodes that visible() accepts and take the text of the first.
# A list is built explicitly so that indexing works whether or not filter()
# returns a list on the running interpreter.
visible_texts = [node for node in texts if visible(node)]
text = visible_texts[0].getText()

generator = markov.MarkovGenerator(n=5, max=500)

# Train on whole lines.  Looping over ``text`` itself yields one character
# per iteration; the feed() copy further down in this file discards any input
# with fewer than n tokens, so (assuming markov.MarkovGenerator.feed behaves
# the same way -- confirm) single characters would never be learned.
for line in text.splitlines():
    generator.feed(line.encode('utf-8'))

# Earlier experiment, not active: strip ";:#-?.," from each line before
# feeding it, and split the line on " " into words.

# Print one generated passage.  The parenthesised form prints the same text
# under the Python 2 print statement and the Python 3 print function.
for i in range(1):
    print(generator.generate())


def feed(self, text):
    """Record the n-grams of *text* on *self*.

    ``self.tokenize(text)`` is called to obtain the tokens.  When fewer than
    ``self.n`` tokens come back, the input is ignored.  Otherwise the first
    ``self.n`` tokens are appended (as a tuple) to ``self.beginnings``, and
    for every window of ``self.n`` consecutive tokens that has a successor,
    that successor is appended to the list stored under the window's tuple
    in ``self.ngrams``.

    text: the input handed to ``self.tokenize``.
    Returns None; ``self.beginnings`` and ``self.ngrams`` are updated in
    place.
    """
    tokens = self.tokenize(text)
    # discard this line if it's too short
    if len(tokens) < self.n:
        return

    # store the first ngram of this line
    self.beginnings.append(tuple(tokens[:self.n]))

    for i in range(len(tokens) - self.n):
        gram = tuple(tokens[i:i + self.n])
        following = tokens[i + self.n]  # the element right after the gram
        # The update must run once per window (inside the loop); otherwise
        # only the final gram of each input would ever be stored.
        # setdefault() creates the list the first time a gram is seen.
        self.ngrams.setdefault(gram, []).append(following)


# generate a text from the information in self.ngrams
def generate(self):
    """Build and return one generated text from the data held on *self*.

    A starting n-gram is picked at random from ``self.beginnings``.  Then, up
    to ``self.max`` times, a random successor of the current n-gram is looked
    up in ``self.ngrams`` and appended; generation stops early when the
    current n-gram has no recorded successors.  The collected tokens are
    passed to ``self.concatenate`` and its result is returned.

    Raises IndexError (from ``random.choice``) when ``self.beginnings`` is
    empty.
    """
    from random import choice

    # get a random line beginning; convert to a list.
    current = choice(self.beginnings)
    output = list(current)

    for _ in range(self.max):
        if current not in self.ngrams:
            break
        output.append(choice(self.ngrams[current]))
        # the last N entries of the output form the lookup key for the next
        # iteration of the loop
        current = tuple(output[-self.n:])

    # Join only after the loop has finished, so the text is not cut off after
    # the first step and a result is returned even when the loop breaks.
    return self.concatenate(output)
	import urllib
	import BeautifulSoup
	import re
	from random import choice
	#import context_free
	import sys
	import markov


	# Download the page at the URL below and parse it into a BeautifulSoup tree.
	html = urllib.urlopen(
	    'http://www.george-orwell.org/1984/0.html'
	).read()
	soup = BeautifulSoup.BeautifulSoup(html)

	# Walk the page's nested tables hop by hop down to one specific node.  The
	# path is hard-coded, so it only works while the page keeps this structure.
	# (soup.findAll(text=True) is the alternative that collects every text node.)
	texts = (
	    soup.html.body.table.contents[1]
	    .td.contents[3]
	    .tr.contents[3]
	    .table.contents[5]
	    .tr.td.table.tr.td.tr.td.table.contents[3]
	    .td.contents[4]
	)

	def visible(element):
	    """Return True when *element* should be treated as visible page text.

	    An element is rejected when the ``name`` of its parent is one of the
	    containers listed below, or when its string form is an HTML comment
	    (``<!-- ... -->``).  Anything else is accepted.

	    element: any object exposing ``parent.name`` and convertible with
	        ``str()`` (presumably a BeautifulSoup node -- confirm with callers).
	    """
	    if element.parent.name in ('style', 'script', '[document]', 'head', 'title'):
	        return False
	    # re.DOTALL lets ``.`` cross newlines, so a comment that spans several
	    # lines is rejected as well, not only single-line comments.
	    if re.match('<!--.*-->', str(element), re.DOTALL):
	        return False
	    return True

	# Keep only the nodes that visible() accepts and take the text of the first.
	# A list is built explicitly so that indexing works whether or not filter()
	# returns a list on the running interpreter.
	visible_texts = [node for node in texts if visible(node)]
	text = visible_texts[0].getText()

	generator = markov.MarkovGenerator(n=5, max=500)

	# Train on whole lines.  Looping over ``text`` itself yields one character
	# per iteration; the feed() copies in this file discard any input with
	# fewer than n tokens, so (assuming markov.MarkovGenerator.feed behaves the
	# same way -- confirm) single characters would never be learned.
	for line in text.splitlines():
	    generator.feed(line.encode('utf-8'))

	# Earlier experiment, not active: strip ";:#-?.," from each line before
	# feeding it, and split the line on " " into words.

	# Print one generated passage.  The parenthesised form prints the same text
	# under the Python 2 print statement and the Python 3 print function.
	for i in range(1):
	    print(generator.generate())


	def feed(self, text):
	    """Record the n-grams of *text* on *self*.

	    ``self.tokenize(text)`` is called to obtain the tokens.  When fewer than
	    ``self.n`` tokens come back, the input is ignored.  Otherwise the first
	    ``self.n`` tokens are appended (as a tuple) to ``self.beginnings``, and
	    for every window of ``self.n`` consecutive tokens that has a successor,
	    that successor is appended to the list stored under the window's tuple
	    in ``self.ngrams``.

	    text: the input handed to ``self.tokenize``.
	    Returns None; ``self.beginnings`` and ``self.ngrams`` are updated in
	    place.
	    """
	    tokens = self.tokenize(text)
	    # discard this line if it's too short
	    if len(tokens) < self.n:
	        return

	    # store the first ngram of this line
	    self.beginnings.append(tuple(tokens[:self.n]))

	    for i in range(len(tokens) - self.n):
	        gram = tuple(tokens[i:i + self.n])
	        following = tokens[i + self.n]  # the element right after the gram
	        # if we've already seen this ngram, append; otherwise start a new
	        # list for this key (setdefault does both in one step)
	        self.ngrams.setdefault(gram, []).append(following)



	# generate a text from the information in self.ngrams
	def generate(self):
	    """Build and return one generated text from the data held on *self*.

	    A starting n-gram is picked at random from ``self.beginnings``.  Then, up
	    to ``self.max`` times, a random successor of the current n-gram is looked
	    up in ``self.ngrams`` and appended; generation stops early when the
	    current n-gram has no recorded successors.  The collected tokens are
	    passed to ``self.concatenate`` and its result is returned.

	    Raises IndexError (from ``random.choice``) when ``self.beginnings`` is
	    empty.
	    """
	    from random import choice

	    # get a random line beginning; convert to a list.
	    current = choice(self.beginnings)
	    output = list(current)

	    for _ in range(self.max):
	        if current not in self.ngrams:
	            break
	        output.append(choice(self.ngrams[current]))
	        # the last N entries of the output form the lookup key for the next
	        # iteration of the loop
	        current = tuple(output[-self.n:])

	    # Join once the loop has finished so the full sequence is returned.
	    return self.concatenate(output)