# kshepp/Token_Words.py

## Token_Words.py
from __future__ import division
import nltk
import re
import pprint
from nltk import word_tokenize
from nltk.tokenize import RegexpTokenizer
import numpy as np


# Tokenizer that keeps only runs of word characters (\w+), so punctuation
# never shows up in the resulting tokens.
tokenizer = RegexpTokenizer(r'\w+')

# Read the whole input file into memory. The `with` block closes the file
# handle as soon as the text has been read instead of leaving it open.
with open('Statuses.txt') as status_file:
    f = status_file.read()

# Split the text into individual statuses on the literal ' | ' separator.
statuses = f.split(' | ')


# Words that are dropped from every status before printing. The list keeps
# its original order and its duplicate entries; only membership matters.
# Every entry is followed by a comma: two adjacent string literals without one
# are silently concatenated by Python ("around" "as" -> "aroundas").
exclude_words = [
    "http", "rt", "co", "in", "of", "is", "you", "me", "my", "mine", "to", "the", "i", "them", "so", "t", "by", "?", "it",
    "so", "continue", "will", "probably", "was", "one", "two", "aboard", "about", "above", "across", "after", "against",
    "along", "amid", "among", "anti", "around", "as", "at", "before", "behind", "below", "beneath", "beside", "besides",
    "between", "btw", "beyond", "but", "by", "concerning", "considering", "despite", "down", "during", "except", "excepting",
    "excluding", "following", "for", "from", "in", "inside", "into", "like", "minus", "near", "of", "off", "on", "onto",
    "opposite", "outside", "over", "past", "per", "plus", "regarding", "round", "since", "than", "through", "to", "toward",
    "towards", "under", "underneath", "unlike", "until", "up", "upon", "versus", "via", "with", "within", "without", "?",
    "!", "?", "out", "it", "as", "when", "will", "not", "probably", "was", "have", "has", "this", "that", "a", "for",
    "htt", "https", "many", "we", "st", "if", "ok", "okay", "all", "and", "just", "did", "amp", "what", "your", "or", "either",
    "k", "ain", "ain't", "here", "are", "there", "their", "htt?", "need", "basically", "way", "why", "who", "thru",
    "can", "be", "get", "today", "guy", "day", "help", "time", "tomorrow", "tonight", "now", "try", "please",
]


# Print, for every status, the words worth pasting into Wordle: lower-cased,
# de-duplicated, sorted alphabetically, with excluded words, words of one or
# two characters, and words containing a digit removed.

# Built once outside the loop so each membership test is a set lookup.
excluded = set(exclude_words)

for status in statuses:  # each status is tokenized on its own
    tokens = tokenizer.tokenize(status)  # split the status into word tokens
    words = [w.lower() for w in tokens]  # compare case-insensitively
    # Build a new list instead of removing from a list while looping over it:
    # in-place removal skips the element after each removed one, and a second
    # remove() of the same word raises ValueError.
    sentence = [
        s for s in sorted(set(words))
        if s not in excluded and len(s) > 2
    ]

    for wordle_words in sentence:
        # Tokens with digits come from split-up URLs and from screen names.
        if any(char.isdigit() for char in wordle_words):
            continue
        # A parenthesised single argument prints identically under the
        # Python 2 print statement and the Python 3 print function.
        print(wordle_words)
	from __future__ import division
	import nltk
	import re
	import pprint
	from nltk import word_tokenize
	from nltk.tokenize import RegexpTokenizer
	import numpy as np


	# NOTE(review): this tab-indented section repeats the script at the top of
	# the file -- confirm whether the duplicate should be kept at all.

	# Tokenizer that keeps only runs of word characters (\w+), so punctuation
	# never shows up in the resulting tokens.
	tokenizer = RegexpTokenizer(r'\w+')

	# Read the whole input file into memory. The `with` block closes the file
	# handle as soon as the text has been read instead of leaving it open.
	with open('Statuses.txt') as status_file:
	    f = status_file.read()

	# Split on the plain ' | ' separator. A backslash in front of the pipe is
	# kept literally by Python, so ' \| ' would never match "a | b" text.
	statuses = f.split(' | ')


	# Words that are dropped from every status before printing. The list keeps
	# its original order and its duplicate entries; only membership matters.
	# Every entry is followed by a comma: two adjacent string literals without
	# one are silently concatenated by Python ("around" "as" -> "aroundas").
	exclude_words = [
	    "http", "rt", "co", "in", "of", "is", "you", "me", "my", "mine", "to", "the", "i", "them", "so", "t", "by", "?", "it",
	    "so", "continue", "will", "probably", "was", "one", "two", "aboard", "about", "above", "across", "after", "against",
	    "along", "amid", "among", "anti", "around", "as", "at", "before", "behind", "below", "beneath", "beside", "besides",
	    "between", "btw", "beyond", "but", "by", "concerning", "considering", "despite", "down", "during", "except", "excepting",
	    "excluding", "following", "for", "from", "in", "inside", "into", "like", "minus", "near", "of", "off", "on", "onto",
	    "opposite", "outside", "over", "past", "per", "plus", "regarding", "round", "since", "than", "through", "to", "toward",
	    "towards", "under", "underneath", "unlike", "until", "up", "upon", "versus", "via", "with", "within", "without", "?",
	    "!", "?", "out", "it", "as", "when", "will", "not", "probably", "was", "have", "has", "this", "that", "a", "for",
	    "htt", "https", "many", "we", "st", "if", "ok", "okay", "all", "and", "just", "did", "amp", "what", "your", "or", "either",
	    "k", "ain", "ain't", "here", "are", "there", "their", "htt?", "need", "basically", "way", "why", "who", "thru",
	    "can", "be", "get", "today", "guy", "day", "help", "time", "tomorrow", "tonight", "now", "try", "please",
	]


	# Print, for every status, the words worth pasting into Wordle: lower-cased,
	# de-duplicated, sorted alphabetically, with excluded words, words of one or
	# two characters, and words containing a digit removed.

	# Built once outside the loop so each membership test is a set lookup.
	excluded = set(exclude_words)

	for status in statuses:  # each status is tokenized on its own
	    tokens = tokenizer.tokenize(status)  # split the status into word tokens
	    words = [w.lower() for w in tokens]  # compare case-insensitively
	    # Build a new list instead of removing from a list while looping over
	    # it: in-place removal skips the element after each removed one, and a
	    # second remove() of the same word raises ValueError.
	    sentence = [
	        s for s in sorted(set(words))
	        if s not in excluded and len(s) > 2
	    ]

	    for wordle_words in sentence:
	        # Tokens with digits come from split-up URLs and from screen names.
	        if any(char.isdigit() for char in wordle_words):
	            continue
	        # A parenthesised single argument prints identically under the
	        # Python 2 print statement and the Python 3 print function.
	        print(wordle_words)