Skip to content

Instantly share code, notes, and snippets.

Good morning. How are you?(Laughter)It\'s been great, hasn\'t it?
I\'ve been blown away by the whole thing. In fact, I\'m leaving.
(Laughter)There have been three themes running through the conference
which are relevant to what I want to talk about. One is the extraordinary
evidence of human creativity
from nltk.corpus import stopwords
# Start from NLTK's English stop-word list, then add corpus-specific noise:
# stray punctuation tokens and conversational filler words.
stop = stopwords.words('english')
stop.extend(['.', " \'", 'ok', 'okay', 'yeah', 'ya', 'stuff', '?'])
# Bag-of-words features for the cleaned talks, counting 1- to 3-word n-grams.
#   stop_words='english' -> use the vectorizer's built-in English stop list
#   max_df=0.6           -> drop terms found in more than 60% of the documents
#   max_features=10000   -> cap the vocabulary at 10,000 terms
c_vectorizer = CountVectorizer(
    ngram_range=(1, 3),
    stop_words='english',
    max_df=0.6,
    max_features=10000,
)
# `fit_transform` builds the vocabulary and converts the text to a bag of
# words in a single pass. Calling `fit` and then `transform` gives the same
# fitted vectorizer and matrix but tokenizes the whole corpus twice.
c_x = c_vectorizer.fit_transform(cleaned_talks)
OUTPUT:
new york city 236
000 year ago 135
new york times 123
10 year ago 118
every single day 109
million year ago 109
people around world 101
two year ago 100
world war ii 99
from collections import Counter
from operator import itemgetter
# Tally of n-grams seen across all talks.
counter = Counter()
# Size of the n-grams to extract (2 = bigrams).
n = 2
for doc in cleaned_talks:
    # Word tokens of this talk, then its consecutive n-word sequences.
    words = TextBlob(doc).words
    bigrams = ngrams(words, n)
    # NOTE(review): the visible snippet ends here; presumably `bigrams` is
    # then added to `counter` -- confirm against the full script.
from nltk.stem import WordNetLemmatizer

# Lemmatizer instance shared by the word-cleaning loop that follows.
lemmizer = WordNetLemmatizer()
# Starts empty; presumably collects the words that survive stop-word
# filtering -- the visible snippet is cut off before anything is appended.
clean_words = []
for word in docwords2:
#remove stop words
if word.lower() not in stop:
low_word = lemmizer.lemmatize(word)
#another shot at removing stopwords
if low_word.lower() not in stop:
@1fmusic
1fmusic / tokenize_ted.py
Last active June 12, 2019 21:14
tokenize ted
from nltk.tokenize import wordpunct_tokenize
# Tokenize every document, producing one token list per file id.
doc_words2 = [wordpunct_tokenize(docs[key]) for key in fileids]
# Print the tokens of one sample text, separated by dashed divider lines.
print(*wordpunct_tokenize(docs[1][0]), sep='\n-----\n')
OUTPUT:
Good
-----
Topic 0
woman men child girl family community young black mother sex boy home man country white school story female father gender
Topic 1
food plant water eat farmer product plastic waste grow seed feed farm produce crop egg soil diet eating percent agriculture
Topic 2
universe earth planet space light star science mars physic particle galaxy sun theory billion matter hole black number image away
Topic 3
water ocean specie animal 000 tree sea forest fish earth planet ice area year ago river million coral bird foot shark
Topic 4
country percent money dollar business africa company million market economy billion 000 government cost global economic job india growth china
# Replace numeric topic ids with human-readable topic names.
# NOTE: this binds a second name to the same object as `tsne_labels` (no copy
# is made), so the masked assignments below also change `tsne_labels`.
# NOTE(review): assumes `tsne_labels` is an array-like that supports boolean-mask
# assignment and can hold strings (e.g. object dtype) -- confirm.
topic_names = tsne_labels
# Each line selects the entries still equal to a topic id and overwrites them
# with that topic's name.
topic_names[topic_names==0] = "family"
topic_names[topic_names==1] = "agriculture"
topic_names[topic_names==2] = "space"
topic_names[topic_names==3] = "environment"
topic_names[topic_names==4] = "global economy"
topic_names[topic_names==5] = "writing"
topic_names[topic_names==6] = "sounds"
topic_names[topic_names==7] = "belief, mortality"
topic_names[topic_names==8] = "transportation"
@1fmusic
1fmusic / ted_tsne.py
Created June 12, 2019 21:00
ted_tsne
from sklearn.manifold import TSNE
# Configure the t-SNE model.
# `angle` trades accuracy for speed: a value close to 1 is faster but less accurate.
# PCA initialization usually leads to better results, but `init='pca'` is not
# passed here, so the library's default initialization applies.
tsne_model = TSNE(
    n_components=3,
    verbose=1,
    random_state=44,
    angle=0.5,
    perplexity=18,
    early_exaggeration=1,
    learning_rate=50.0,
)
# 20-D -> 3-D