Created
August 2, 2021 13:19
-
-
Save arimitramaiti/7a790e4e605dc4b01bd6ae93b7ea7268 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def pre_processing(dataset, my_stop_list):
    """Clean review text, score its sentiment three ways and tag its emotions.

    The pipeline expands contractions, tokenizes, lower-cases, drops
    punctuation and stop words, lemmatizes with part-of-speech hints, and
    then derives TextBlob, AFINN and SentiWordNet sentiment classes plus the
    text2emotion labels for every row.

    Args:
        dataset: pandas DataFrame with at least the columns ``review``,
            ``UniversalMessageId``, ``Sentiment`` and ``review_date``.
            NOTE: the intermediate columns (``review_modified``,
            ``review_mod_tokens``, ``review_tags``, ...) are added to this
            frame in place, exactly as before.
        my_stop_list: iterable of extra stop words that are removed in
            addition to NLTK's English stop-word list.

    Returns:
        A new DataFrame with the columns ``UniversalMessageId``,
        ``Sentiment``, ``review_date``, ``review_lemmatized``, ``review_str``,
        ``sentiment_score_textblob``, ``sentiment_score_sentinet``,
        ``sentiment_score_afinn``, ``word_count``, ``review_length``,
        ``Index_col``, ``Emotions`` and ``Max_score``. The three sentiment
        columns hold the strings "0" (negative), "1" (zero score) or
        "2" (positive).
    """

    def get_wordnet_pos(tag):
        # Map the first letter of a POS tag to a WordNet constant; anything
        # unrecognised is treated as a noun.
        if tag.startswith('J'):
            return wordnet.ADJ
        if tag.startswith('V'):
            return wordnet.VERB
        if tag.startswith('R'):
            return wordnet.ADV
        return wordnet.NOUN

    def score_sentiwordnet(doc):
        # Sum, over every token that WordNet knows, the polarity of its first
        # synset weighted by how subjective that synset is.
        total = 0
        for token in nltk.word_tokenize(doc):
            synsets = wordnet.synsets(token)
            if not synsets:
                continue
            senti = swn.senti_synset(synsets[0].name())
            total += (1 - senti.obj_score()) * (senti.pos_score() - senti.neg_score())
        return total

    def ifformula(score):
        # Collapse a numeric score into a class label (kept as strings).
        if score < 0:
            return "0"
        elif score == 0:
            return "1"
        else:
            return "2"

    def dominant_emotions(text):
        # Rank the text2emotion scores and keep emotions until their
        # cumulative share reaches 100%. Returns (emotions, max cumulative %).
        scores = pd.DataFrame(list(te.get_emotion(text).items()),
                              columns=['Emotion', 'Score'])
        scores = scores.sort_values(by=['Score'], ascending=False)
        scores = scores.reset_index(drop=True)
        # 0/0 (no emotion detected) yields NaN, which is reported as 0%.
        cum_percent = (100 * (scores['Score'].cumsum() / scores['Score'].sum())).fillna(0)
        picked = []
        for emotion, percent in zip(scores['Emotion'], cum_percent):
            picked.append(emotion)
            # Tolerant comparison: the cumulative ratio is a float and may
            # miss 100.0 by a rounding error.
            if np.isclose(percent, 100.0):
                break
        return picked, cum_percent.max()

    ## Use contractions
    dataset['review_modified'] = dataset['review'].apply(
        lambda x: [contractions.fix(word) for word in x.split()])
    ## convert list to string
    dataset['review_modified_unlist'] = [
        ' '.join(map(str, words)) for words in dataset['review_modified']]
    ## convert string to tokens
    dataset['review_mod_tokens'] = dataset['review_modified_unlist'].apply(nltk.word_tokenize)
    ## convert token to lower case
    dataset['review_mod_tokens'] = dataset['review_mod_tokens'].apply(
        lambda x: [word.lower() for word in x])
    ## remove punctuations (substring test against string.punctuation, so
    ## runs such as "()" are dropped as well)
    dataset['review_mod_tokens'] = dataset['review_mod_tokens'].apply(
        lambda x: [word for word in x if word not in string.punctuation])
    ## remove default and custom stop words
    stop_set = set(nltk.corpus.stopwords.words('english'))
    stop_set.update(my_stop_list)
    dataset['review_mod_tokens'] = dataset['review_mod_tokens'].apply(
        lambda x: [word for word in x if word not in stop_set])
    ## extract parts of speech tags
    dataset['review_tags'] = dataset['review_mod_tokens'].apply(nltk.tag.pos_tag)
    dataset['review_wordnet'] = dataset['review_tags'].apply(
        lambda x: [(word, get_wordnet_pos(pos_tag)) for (word, pos_tag) in x])
    ## use lemmatizer on parts of speech tags
    lm = WordNetLemmatizer()
    dataset['review_lemmatized'] = dataset['review_wordnet'].apply(
        lambda x: [lm.lemmatize(word, tag) for word, tag in x])
    dataset['review_str'] = [
        ' '.join(map(str, lemmas)) for lemmas in dataset['review_lemmatized']]

    ## extract sentiment scores from the lemmatized text
    dataset['sentiment_score_textblob'] = dataset['review_str'].apply(
        lambda x: TextBlob(x).sentiment.polarity)
    # Build the AFINN scorer once instead of once per row.
    afinn = Afinn()
    dataset['sentiment_score_afinn'] = dataset['review_str'].apply(afinn.score)
    dataset['sentiment_score_sentinet'] = dataset['review_str'].apply(score_sentiwordnet)
    for column in ('sentiment_score_afinn',
                   'sentiment_score_textblob',
                   'sentiment_score_sentinet'):
        dataset[column] = dataset[column].apply(ifformula)

    # Work on an explicit copy so the new columns below are not written to a
    # slice of the caller's frame.
    dataset = dataset[['UniversalMessageId', 'Sentiment', 'review_date',
                       'review_lemmatized', 'review_str',
                       'sentiment_score_textblob', 'sentiment_score_sentinet',
                       'sentiment_score_afinn']].copy()
    # Count the lemmas themselves; an empty review has zero words.
    dataset['word_count'] = dataset['review_lemmatized'].apply(len)
    dataset['review_length'] = dataset['review_str'].astype(str).apply(len)

    ### Process to extract text2emotion for every row
    # Iterate over values (not labels) so any index on `dataset` works.
    emotion_labels = []
    max_scores = []
    for text in dataset['review_str']:
        picked, max_percent = dominant_emotions(text)
        emotion_labels.append(','.join(map(str, picked)))
        max_scores.append(max_percent)

    # Attach the results positionally; rows stay aligned whatever the index.
    dataset['Index_col'] = list(range(len(dataset)))
    dataset['Emotions'] = [
        'None' if max_percent == 0 else label
        for label, max_percent in zip(emotion_labels, max_scores)]
    dataset['Max_score'] = max_scores
    return dataset
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment