Summer Rankin 1fmusic

## ted_talk1_output.txt
Good morning. How are you?(Laughter)It\'s been great, hasn\'t it?
I\'ve been blown away by the whole thing. In fact, I\'m leaving.
(Laughter)There have been three themes running through the conference
which are relevant to what I want to talk about. One is the extraordinary
evidence of human creativity

## ted_stopwords.py
from nltk.corpus import stopwords

# Base stop list: NLTK's English stop words, extended with punctuation tokens
# and spoken-language fillers.
# NOTE(review): " \'" is a space followed by an apostrophe — confirm the
# leading space is intended.
stop = stopwords.words('english')
stop += ['.', " \'", 'ok', 'okay', 'yeah', 'ya', 'stuff', '?']

## ted_count_vect.py
# Bag-of-words model over the cleaned talks (assumes `CountVectorizer` is
# scikit-learn's; its import is not part of this snippet): unigrams through
# trigrams, the built-in English stop list, terms appearing in more than 60%
# of the documents ignored, and at most 10,000 features kept.
c_vectorizer = CountVectorizer(
    ngram_range=(1, 3),
    stop_words='english',
    max_df=0.6,
    max_features=10000,
)

# Learn the vocabulary from the corpus ...
c_vectorizer.fit(cleaned_talks)

# ... then encode the same corpus as a bag-of-words matrix.
c_x = c_vectorizer.transform(cleaned_talks)

## ted_trigram_output.py
    OUTPUT:
    new york city 236
    000 year ago 135
    new york times 123
    10 year ago 118
    every single day 109
    million year ago 109
    people around world 101
    two year ago 100
    world war ii 99

## ted_count_bigrams.py

    from collections import Counter
    from operator import itemgetter

    # Tally intended to hold n-gram counts.
    # NOTE(review): nothing in the visible snippet adds to `counter` or uses
    # the imported `itemgetter`; the snippet appears to be cut off.
    counter = Counter()

    # n-gram size (2 = bigrams).
    n = 2
    for doc in cleaned_talks:
        # Words of the talk as extracted by TextBlob.
        words = TextBlob(doc).words
        # Build the n-grams from the word list (`ngrams` is imported
        # elsewhere — presumably NLTK's; confirm).
        bigrams = ngrams(words, n)

## ted_lemmatize.py
from nltk.stem import WordNetLemmatizer

# Lemmatizer instance shared by the cleaning loop.
lemmizer = WordNetLemmatizer()
# Presumably collects the cleaned tokens (not used in the visible snippet).
clean_words = []

# Walk the tokens in `docwords2`: skip stop words, lemmatize what remains,
# then check the lemma against the stop list again.
# NOTE(review): the tokenization snippet names its result `doc_words2`;
# confirm that `docwords2` is the intended name here.
for word in docwords2:

    # Skip tokens whose lowercase form is in the stop list.
     if word.lower() not in stop:
         # NOTE(review): despite its name, `low_word` is the lemma of `word`
         # as given; the token is not lowercased before lemmatizing.
         low_word = lemmizer.lemmatize(word)

         # Second pass: the lemma itself may be a stop word.
         # NOTE(review): the snippet ends on this check; its body is not shown.
         if low_word.lower() not in stop:

## tokenize_ted.py
from nltk.tokenize import wordpunct_tokenize

# Tokenize each document, looked up in `docs` by its file id, with NLTK's
# wordpunct_tokenize; the result holds one tokenization per file id.
doc_words2 = [wordpunct_tokenize(docs[fileid]) for fileid in fileids]

# Preview: tokenize docs[1][0] and print its tokens separated by "-----" lines.
# NOTE(review): this indexes `docs` two levels deep, whereas the line above
# passes docs[fileid] straight to the tokenizer — confirm the shape of `docs`.
print('\n-----\n'.join(wordpunct_tokenize(docs[1][0])))

OUTPUT:

Good
-----

## topics.txt
Topic  0
woman men child girl family community young black mother sex boy home man country white school story female father gender
Topic  1
food plant water eat farmer product plastic waste grow seed feed farm produce crop egg soil diet eating percent agriculture
Topic  2
universe earth planet space light star science mars physic particle galaxy sun theory billion matter hole black number image away
Topic  3
water ocean specie animal 000 tree sea forest fish earth planet ice area year ago river million coral bird foot shark
Topic  4
country percent money dollar business africa company million market economy billion 000 government cost global economic job india growth china

## topic_names.py
# Replace each numeric topic id in the labels with a readable name.
# NOTE(review): `topic_names` is the same object as `tsne_labels`, not a copy,
# so these in-place assignments also change `tsne_labels` — confirm intended.
topic_names = tsne_labels
for topic_id, label in enumerate((
    "family",
    "agriculture",
    "space",
    "environment",
    "global economy",
    "writing",
    "sounds",
    "belief, mortality",
    "transportation",
)):
    # Position in the tuple is the topic id (0 through 8).
    topic_names[topic_names == topic_id] = label

## ted_tsne.py
from sklearn.manifold import TSNE

# t-SNE model producing a 3-component embedding.
# An angle value close to 1 means sacrificing accuracy for speed.
# PCA initialization usually leads to better results; it is left disabled
# here (pass init='pca' to enable it).
tsne_model = TSNE(
    n_components=3,
    verbose=1,
    random_state=44,
    angle=.50,
    perplexity=18,
    early_exaggeration=1,
    learning_rate=50.0,
)

# 20-D -> 3-D
	Good morning. How are you?(Laughter)It\'s been great, hasn\'t it?
	I\'ve been blown away by the whole thing. In fact, I\'m leaving.
	(Laughter)There have been three themes running through the conference
	which are relevant to what I want to talk about. One is the extraordinary
	evidence of human creativity
	from nltk.corpus import stopwords
	# Start from NLTK's English stop words, then append punctuation tokens
	# and spoken-language fillers in place.
	stop = stopwords.words('english')
	stop.extend(['.', " \'", 'ok', 'okay', 'yeah', 'ya', 'stuff', '?'])
	# Bag-of-words model over the cleaned talks (assumes `CountVectorizer` is
	# scikit-learn's; its import is not part of this snippet): unigrams through
	# trigrams, the built-in English stop list, terms appearing in more than
	# 60% of the documents ignored, and at most 10,000 features kept.
	c_vectorizer = CountVectorizer(
	    ngram_range=(1, 3),
	    stop_words='english',
	    max_df=0.6,
	    max_features=10000,
	)

	# Learn the vocabulary from the corpus ...
	c_vectorizer.fit(cleaned_talks)

	# ... then encode the same corpus as a bag-of-words matrix.
	c_x = c_vectorizer.transform(cleaned_talks)
	OUTPUT:
	new york city 236
	000 year ago 135
	new york times 123
	10 year ago 118
	every single day 109
	million year ago 109
	people around world 101
	two year ago 100
	world war ii 99

	from collections import Counter
	from operator import itemgetter

	# Tally intended to hold n-gram counts.
	# NOTE(review): nothing in the visible snippet adds to `counter`; the
	# snippet appears to be cut off after the n-grams are built.
	counter = Counter()

	# n-gram size (2 = bigrams).
	n = 2
	for doc in cleaned_talks:
	    # Words of the talk as extracted by TextBlob.
	    words = TextBlob(doc).words
	    # Build the n-grams from the word list (`ngrams` is imported
	    # elsewhere — presumably NLTK's; confirm).
	    bigrams = ngrams(words, n)
	from nltk.stem import WordNetLemmatizer

	# Lemmatizer instance shared by the cleaning loop.
	lemmizer = WordNetLemmatizer()
	# Presumably collects the cleaned tokens (not used in the visible snippet).
	clean_words = []

	# Walk the tokens in `docwords2`: skip stop words, lemmatize what remains,
	# then check the lemma against the stop list again.
	# NOTE(review): the indentation of the loop and `if` bodies has been lost
	# in this copy, and the snippet ends before the body of the last check.
	for word in docwords2:

	# Skip tokens whose lowercase form is in the stop list.
	if word.lower() not in stop:
	low_word = lemmizer.lemmatize(word)

	# Second pass: the lemma itself may be a stop word.
	if low_word.lower() not in stop:
	from nltk.tokenize import wordpunct_tokenize

	# Tokenize each document, looked up in `docs` by its file id, with NLTK's
	# wordpunct_tokenize; the result holds one tokenization per file id.
	doc_words2 = [wordpunct_tokenize(docs[fileid]) for fileid in fileids]

	# Preview: tokenize docs[1][0] and print its tokens separated by "-----" lines.
	# NOTE(review): this indexes `docs` two levels deep, whereas the line above
	# passes docs[fileid] straight to the tokenizer — confirm the shape of `docs`.
	print('\n-----\n'.join(wordpunct_tokenize(docs[1][0])))

	OUTPUT:

	Good
	-----
	Topic 0
	woman men child girl family community young black mother sex boy home man country white school story female father gender
	Topic 1
	food plant water eat farmer product plastic waste grow seed feed farm produce crop egg soil diet eating percent agriculture
	Topic 2
	universe earth planet space light star science mars physic particle galaxy sun theory billion matter hole black number image away
	Topic 3
	water ocean specie animal 000 tree sea forest fish earth planet ice area year ago river million coral bird foot shark
	Topic 4
	country percent money dollar business africa company million market economy billion 000 government cost global economic job india growth china
	# Replace each numeric topic id in the labels with a readable name.
	# NOTE(review): `topic_names` is the same object as `tsne_labels`, not a
	# copy, so these in-place assignments also change `tsne_labels` — confirm.
	topic_names = tsne_labels
	for topic_id, label in enumerate((
	    "family",
	    "agriculture",
	    "space",
	    "environment",
	    "global economy",
	    "writing",
	    "sounds",
	    "belief, mortality",
	    "transportation",
	)):
	    # Position in the tuple is the topic id (0 through 8).
	    topic_names[topic_names == topic_id] = label
	from sklearn.manifold import TSNE

	# t-SNE model producing a 3-component embedding.
	# An angle value close to 1 means sacrificing accuracy for speed.
	# PCA initialization usually leads to better results; it is left disabled
	# here (pass init='pca' to enable it).
	tsne_model = TSNE(
	    n_components=3,
	    verbose=1,
	    random_state=44,
	    angle=.50,
	    perplexity=18,
	    early_exaggeration=1,
	    learning_rate=50.0,
	)

	# 20-D -> 3-D