Last active
September 17, 2021 01:58
-
-
Save parulnith/e2cacbf9a67ddb069d45905f480b8e15 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Source of code: https://medium.com/@cristhianboujon/how-to-list-the-most-common-words-from-text-corpus-using-scikit-learn-dad4d0cab41d
from sklearn.feature_extraction.text import CountVectorizer | |
def get_top_n_gram(corpus, ngram_range, n=None):
    """Return the most frequent n-grams in *corpus*, highest count first.

    Args:
        corpus: Iterable of text documents passed to ``CountVectorizer``.
        ngram_range: ``(min_n, max_n)`` tuple forwarded to
            ``CountVectorizer``; e.g. ``(2, 2)`` counts bigrams only.
        n: Number of leading entries to keep. ``None`` (the default) keeps
            every n-gram in the vocabulary.

    Returns:
        A list of ``(ngram, count)`` tuples sorted by descending count.
        English stop words are removed before the n-grams are counted.
    """
    vectorizer = CountVectorizer(ngram_range=ngram_range, stop_words='english')
    vectorizer.fit(corpus)

    # Summing the document-term matrix over axis 0 collapses all documents
    # into a single row holding the total count of each vocabulary column.
    totals = vectorizer.transform(corpus).sum(axis=0)

    ranked = []
    for term, column in vectorizer.vocabulary_.items():
        ranked.append((term, totals[0, column]))

    # Stable sort: n-grams with equal counts keep their vocabulary order.
    ranked.sort(key=lambda pair: pair[1], reverse=True)
    return ranked[:n]
# Split the training text into one Series per emotion label.
# NOTE(review): the label -> emotion mapping (0=sadness ... 5=surprise) is
# read off the variable names here; confirm against the dataset description.
sadness = train[train['label'] == 0]['text']
joy = train[train['label'] == 1]['text']
love = train[train['label'] == 2]['text']
anger = train[train['label'] == 3]['text']
fear = train[train['label'] == 4]['text']
surprise = train[train['label'] == 5]['text']

# Calculating unigrams, bigrams and trigrams for different emotions.
# The unigram call goes through get_top_n_gram with ngram_range=(1, 1):
# the previously referenced get_top_n_words is not defined in this file and
# would raise NameError.
# Leaving out the top 2 unigrams as they are "feel" and "feeling".
sadness_unigrams = get_top_n_gram(sadness.values, (1, 1), 15)[2:]
# The bigram and trigram lists likewise skip their two most frequent entries.
sad_bigrams = get_top_n_gram(sadness.values, (2, 2), 7)[2:]
sad_trigrams = get_top_n_gram(sadness.values, (3, 3), 7)[2:]

# Repeat the above code for other emotions
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment