Skip to content

Instantly share code, notes, and snippets.

Embed
What would you like to do?
# Source of code: https://medium.com/@cristhianboujon/how-to-list-the-most-common-words-from-text-corpus-using-scikit-learn-dad4d0cab41d
from sklearn.feature_extraction.text import CountVectorizer
def get_top_n_gram(corpus, ngram_range, n=None):
    """Return the most frequent n-grams in *corpus*, most frequent first.

    English stop words are removed (``stop_words='english'``) before the
    n-grams are counted.

    Args:
        corpus: Iterable of text documents. It is traversed only once, so a
            one-shot iterable such as a generator is counted correctly.
        ngram_range: ``(min_n, max_n)`` tuple forwarded to
            ``CountVectorizer``, e.g. ``(2, 2)`` for bigrams only.
        n: Number of entries to return; ``None`` returns every n-gram.

    Returns:
        A list of ``(ngram, count)`` tuples sorted by descending count.
    """
    vec = CountVectorizer(ngram_range=ngram_range, stop_words='english')
    # Learn the vocabulary and build the count matrix in a single pass.
    # Separate fit() and transform() calls tokenize the corpus twice, and the
    # second call sees an already exhausted iterator when corpus is one-shot.
    bag_of_words = vec.fit_transform(corpus)
    # Summing over rows gives the total occurrences of each vocabulary entry.
    sum_words = bag_of_words.sum(axis=0)
    words_freq = [
        (word, sum_words[0, idx]) for word, idx in vec.vocabulary_.items()
    ]
    # Stable sort: n-grams with equal counts keep their vocabulary order.
    words_freq.sort(key=lambda pair: pair[1], reverse=True)
    return words_freq[:n]
# One 'text' selection per emotion label. Label ids 0..5 correspond, in order,
# to the names on the left-hand side (0 -> sadness, ..., 5 -> surprise).
# NOTE(review): presumably `train` is a pandas DataFrame with 'label' and
# 'text' columns - confirm against where it is loaded.
sadness, joy, love, anger, fear, surprise = (
    train[train['label'] == label_id]['text']
    for label_id in (0, 1, 2, 3, 4, 5)
)
# Calculating unigrams, bigrams and trigrams for different emotions.
# Each list drops its two most frequent entries via [2:]; for the unigrams
# these are "feel" and "feeling" according to the original author's note.
# The unigram call goes through get_top_n_gram with ngram_range=(1, 1):
# the previously referenced get_top_n_words is not defined in this file.
sadness_unigrams = get_top_n_gram(sadness.values, (1, 1), 15)[2:]
sad_bigrams = get_top_n_gram(sadness.values, (2, 2), 7)[2:]
sad_trigrams = get_top_n_gram(sadness.values, (3, 3), 7)[2:]
# Repeat the above code for other emotions
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment