# StephenKrewson/Twitter Random Language Poetry

## Twitter Random Language Poetry
import re, sys, nltk, string, matplotlib, HTMLParser
from nltk.corpus import cmudict
from random import choice, randint
from nltk.tokenize import *
from nltk import FreqDist
from twython import Twython

# Twitter client. The four empty strings are credential placeholders;
# presumably real keys have to be supplied before the search below works.
# NOTE(review): four positional credentials looks like Twython's OAuth 1
# calling form rather than OAuth 2 -- confirm against the Twython docs.
t = Twython('',
            '',
            '',
            '')

# Search Twitter using argv[1] as the query and argv[2] as the result count,
# keeping only the text of each returned status, encoded as UTF-8.
tweets = [s['text'].encode('utf-8') for s in t.search(q=sys.argv[1], count=sys.argv[2])['statuses']]

# (1.2) Load the NLTK resources once at module level (runtime atrocious if
# these are loaded inside the functions)
books = nltk.corpus.gutenberg.fileids()
lexicon = nltk.corpus.cmudict.dict()

### SECTION 2: FUNCTIONS ###
############################
def stripWord(word):
    """Return ``(word, stress_count)`` for a token, or ``None`` if filtered.

    The token is HTML-unescaped, then rejected when it starts with optional
    double quotes followed by ``@`` or ``#``, or starts with ``http`` or
    ``RT``.  For accepted tokens a lower-cased, punctuation-free key is
    looked up in the module-level ``lexicon``; the stress count is the
    number of ``1``/``2`` markers in the first listed pronunciation.

    Args:
        word: One whitespace-delimited token, either a UTF-8 encoded byte
            string or a unicode string.

    Returns:
        A tuple of the unescaped (not punctuation-stripped) token encoded as
        UTF-8 and its stress count (``1`` when the key is not in
        ``lexicon``), or ``None`` for a filtered token.
    """
    # Work in unicode throughout: unescape() can hand back unicode, and
    # byte-string-only operations fail on unicode or non-ASCII input.
    if isinstance(word, bytes):
        word = word.decode('utf-8', 'replace')
    word = HTMLParser.HTMLParser().unescape(word)

    # Regex filters out mentions, hashtags, URLs and retweet markers
    if re.match(r'\"*[@#]|http|RT', word):
        return None

    stripped = ''.join(ch for ch in word.lower()
                       if ch not in string.punctuation)
    if stripped not in lexicon:
        # Unknown words approximated as 1 total stress
        return word.encode('utf-8'), 1

    # CMU dict has stress values of 1 or 2; count them in the first
    # pronunciation only.
    phonemes = ''.join(lexicon[stripped][0])
    stress = sum(1 for ch in phonemes if ch in ('1', '2'))
    return word.encode('utf-8'), stress

def buildBackground(num):
    """Return a random chunk of text from a randomly chosen Gutenberg book.

    Args:
        num: Desired chunk length in characters; anything ``int()`` accepts.
            Negative values are treated as 0.

    Returns:
        A slice of the chosen book (newlines replaced by spaces) that is
        ``num`` characters long, or the whole book when it is shorter than
        ``num``.
    """
    length = max(0, int(num))
    # Grab a whole book with random 'choice' method
    background = nltk.corpus.gutenberg.raw(choice(books)).replace('\n', ' ')
    # Random start point with enough room for a selection of 'length'
    # characters.  randint() raises ValueError on an empty range, so the
    # upper bound is clamped at 0 for books shorter than the request.
    start = randint(0, max(0, len(background) - length))
    return background[start:start + length]

def buildPoem(tweets):
    """Return a list of poem lines broken by accumulated lexical stress.

    The tweets are joined, split on whitespace, and every token is passed
    through ``stripWord``.  Tokens are added to the current line while the
    running stress total stays below 7; the token that would reach 7 closes
    the line and opens the next one.

    Args:
        tweets: Iterable of tweet texts that ``' '.join`` accepts.

    Returns:
        List of lines.  Each line consists of its words, every word followed
        by a single space.  The last, possibly short, line is included.
    """
    raw_poem, words, stress = [], [], 0
    for word in ' '.join(tweets).split():
        try:
            result = stripWord(word)
        except (TypeError, ValueError):
            # Best effort: a token that cannot be normalised is skipped
            # rather than aborting the whole poem.
            continue
        if result is None:  # stripWord returns None for tokens it filters
            continue

        text, word_stress = result
        if stress + word_stress < 7:
            words.append(text)
            stress += word_stress
        else:
            if words:
                raw_poem.append(' '.join(words) + ' ')
            # The overflowing word starts the next line and carries its own
            # stress count with it.
            words, stress = [text], word_stress

    # Flush the trailing partial line instead of dropping it
    if words:
        raw_poem.append(' '.join(words) + ' ')
    return raw_poem

def storeResults(poem):
    """Append the space-joined poem lines to the ``XS_data.txt`` data file.

    Args:
        poem: Sequence of line strings, e.g. the output of ``buildPoem``.
    """
    with open('XS_data.txt', 'ab') as data_file:
        joined = ' '.join(poem)
        data_file.write(joined)

def dataAnalyze():
    """Chart the 10 most frequent tokens stored in ``XS_data.txt``.

    The file is read, decoded as UTF-8 and flattened to one line before it
    is tokenized with ``wordpunct_tokenize``.

    Returns:
        Whatever ``FreqDist.plot(10)`` returns.
    """
    with open('XS_data.txt', 'r') as source:
        raw = source.read()
    flattened = raw.decode('utf-8').replace('\n', ' ')
    frequencies = FreqDist(wordpunct_tokenize(flattened))
    return frequencies.plot(10)

def superFunction(string, num):
    """Search for tweets, build a poem from them and print it.

    Runs the same steps as the module-level script code, using the arguments
    in place of the command-line values, then prints a blank line.

    Args:
        string: Search query passed to ``t.search`` as ``q``.  The name
            shadows the ``string`` module inside this function only.
        num: Number of tweets to request, passed as ``count``.

    NOTE(review): the earlier body called ``buildPoem(string, num)`` and
    indexed a two-part result; neither matches the one-argument,
    list-returning ``buildPoem``, so every call raised TypeError.  Reading
    the parameters as query/count is inferred from the module-level search
    call -- confirm.
    """
    statuses = t.search(q=string, count=num)['statuses']
    poem = buildPoem([s['text'].encode('utf-8') for s in statuses])
    for line in poem:
        print(line)
    print('')
    # Persisting the poem (storeResults) and charting (dataAnalyze) are
    # intentionally left out here, as they were disabled before.

###Main Code###
# Build the poem from the fetched tweets and print it one line per row
for line in buildPoem(tweets):
    print(line)
	import re, sys, nltk, string, matplotlib, HTMLParser
	from nltk.corpus import cmudict
	from random import choice, randint
	from nltk.tokenize import *
	from nltk import FreqDist
	from twython import Twython

	# Twitter client. The four empty strings are credential placeholders;
	# presumably real keys have to be supplied before the search below works.
	# NOTE(review): four positional credentials looks like Twython's OAuth 1
	# calling form rather than OAuth 2 -- confirm against the Twython docs.
	t = Twython('',
	'',
	'',
	'')

	# Search Twitter using argv[1] as the query and argv[2] as the result count,
	# keeping only the text of each returned status, encoded as UTF-8.
	tweets = [s['text'].encode('utf-8') for s in t.search(q=sys.argv[1], count=sys.argv[2])['statuses']]

	# (1.2) Load the NLTK resources once at module level (runtime atrocious if
	# these are loaded inside the functions)
	books = nltk.corpus.gutenberg.fileids()
	lexicon = nltk.corpus.cmudict.dict()

	### SECTION 2: FUNCTIONS ###
	############################
	def stripWord(word):
		"""Return ``(word, stress_count)`` for a token, or ``None`` if filtered.

		The token is HTML-unescaped, then rejected when it starts with optional
		double quotes followed by ``@`` or ``#``, or starts with ``http`` or
		``RT``.  For accepted tokens a lower-cased, punctuation-free key is
		looked up in the module-level ``lexicon``; the stress count is the
		number of ``1``/``2`` markers in the first listed pronunciation.

		Args:
			word: One whitespace-delimited token, either a UTF-8 encoded byte
				string or a unicode string.

		Returns:
			A tuple of the unescaped (not punctuation-stripped) token encoded
			as UTF-8 and its stress count (``1`` when the key is not in
			``lexicon``), or ``None`` for a filtered token.
		"""
		# Work in unicode throughout: unescape() can hand back unicode, and
		# byte-string-only operations fail on unicode or non-ASCII input.
		if isinstance(word, bytes):
			word = word.decode('utf-8', 'replace')
		word = HTMLParser.HTMLParser().unescape(word)

		# Regex filters out mentions, hashtags, URLs and retweet markers.
		# The alternation bars must be unescaped; an escaped bar would only
		# match a literal pipe character.
		if re.match(r'\"*[@#]|http|RT', word):
			return None

		stripped = ''.join(ch for ch in word.lower()
			if ch not in string.punctuation)
		if stripped not in lexicon:
			# Unknown words approximated as 1 total stress
			return word.encode('utf-8'), 1

		# CMU dict has stress values of 1 or 2; count them in the first
		# pronunciation only.
		phonemes = ''.join(lexicon[stripped][0])
		stress = sum(1 for ch in phonemes if ch in ('1', '2'))
		return word.encode('utf-8'), stress

	def buildBackground(num):
		"""Return a random chunk of text from a randomly chosen Gutenberg book.

		Args:
			num: Desired chunk length in characters; anything ``int()``
				accepts.  Negative values are treated as 0.

		Returns:
			A slice of the chosen book (newlines replaced by spaces) that is
			``num`` characters long, or the whole book when it is shorter
			than ``num``.
		"""
		length = max(0, int(num))
		# Grab a whole book with random 'choice' method
		background = nltk.corpus.gutenberg.raw(choice(books)).replace('\n', ' ')
		# Random start point with enough room for a selection of 'length'
		# characters.  randint() raises ValueError on an empty range, so the
		# upper bound is clamped at 0 for books shorter than the request.
		start = randint(0, max(0, len(background) - length))
		return background[start:start + length]

	def buildPoem(tweets):
		"""Return a list of poem lines broken by accumulated lexical stress.

		The tweets are joined, split on whitespace, and every token is passed
		through ``stripWord``.  Tokens are added to the current line while the
		running stress total stays below 7; the token that would reach 7
		closes the line and opens the next one.

		Args:
			tweets: Iterable of tweet texts that ``' '.join`` accepts.

		Returns:
			List of lines.  Each line consists of its words, every word
			followed by a single space.  The last, possibly short, line is
			included.
		"""
		raw_poem, words, stress = [], [], 0
		for word in ' '.join(tweets).split():
			try:
				result = stripWord(word)
			except (TypeError, ValueError):
				# Best effort: a token that cannot be normalised is skipped
				# rather than aborting the whole poem.
				continue
			if result is None:  # stripWord returns None for tokens it filters
				continue

			text, word_stress = result
			if stress + word_stress < 7:
				words.append(text)
				stress += word_stress
			else:
				if words:
					raw_poem.append(' '.join(words) + ' ')
				# The overflowing word starts the next line and carries its
				# own stress count with it.
				words, stress = [text], word_stress

		# Flush the trailing partial line instead of dropping it
		if words:
			raw_poem.append(' '.join(words) + ' ')
		return raw_poem

	def storeResults(poem):
		"""Append the space-joined poem lines to the ``XS_data.txt`` data file.

		Args:
			poem: Sequence of line strings, e.g. the output of ``buildPoem``.
		"""
		with open('XS_data.txt', 'ab') as f:
			f.write(' '.join(poem))

	def dataAnalyze():
		"""Chart the 10 most frequent tokens stored in ``XS_data.txt``.

		The file is read, decoded as UTF-8 and flattened to one line before
		it is tokenized with ``wordpunct_tokenize``.

		Returns:
			Whatever ``FreqDist.plot(10)`` returns.
		"""
		with open('XS_data.txt', 'r') as f:
			data = f.read().decode('utf-8').replace('\n', ' ')
		tokens = wordpunct_tokenize(data)
		fdist = FreqDist(tokens)
		return fdist.plot(10)

	def superFunction(string, num):
		"""Search for tweets, build a poem from them and print it.

		Runs the same steps as the module-level script code, using the
		arguments in place of the command-line values, then prints a blank
		line.

		Args:
			string: Search query passed to ``t.search`` as ``q``.  The name
				shadows the ``string`` module inside this function only.
			num: Number of tweets to request, passed as ``count``.

		NOTE(review): the earlier body called ``buildPoem(string, num)`` and
		indexed a two-part result; neither matches the one-argument,
		list-returning ``buildPoem``, so every call raised TypeError.  Reading
		the parameters as query/count is inferred from the module-level
		search call -- confirm.
		"""
		statuses = t.search(q=string, count=num)['statuses']
		poem = buildPoem([s['text'].encode('utf-8') for s in statuses])
		for line in poem:
			print(line)
		print('')
		# Persisting the poem (storeResults) and charting (dataAnalyze) are
		# intentionally left out here, as they were disabled before.

	###Main Code###
	# Build the poem from the fetched tweets and print it one line per row
	for line in buildPoem(tweets):
		print(line)