import string

import contractions
import nltk
import numpy as np
import pandas as pd
import text2emotion as te
from afinn import Afinn
from nltk.corpus import sentiwordnet as swn
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer
from textblob import TextBlob

##Requires NLTK data (download once): punkt, stopwords, wordnet,
##averaged_perceptron_tagger, sentiwordnet


def pre_processing(dataset, my_stop_list):
    store = []
    ##Expand contractions word by word (e.g. "can't" -> "cannot")
    dataset['review_modified'] = dataset['review'].apply(lambda x: [contractions.fix(word) for word in x.split()])
    ##Convert the list of words back to a single string
    dataset['review_modified_unlist'] = [' '.join(map(str, l)) for l in dataset['review_modified']]
    ##Split the string into tokens
    dataset['review_mod_tokens'] = dataset['review_modified_unlist'].apply(nltk.word_tokenize)
    ##Lower-case every token
    dataset['review_mod_tokens'] = dataset['review_mod_tokens'].apply(lambda x: [word.lower() for word in x])
    ##Remove punctuation tokens
    dataset['review_mod_tokens'] = dataset['review_mod_tokens'].apply(lambda x: [word for word in x if word not in string.punctuation])
    ##Remove default and custom stop words
    stop_set = set(nltk.corpus.stopwords.words('english'))
    stop_set.update(my_stop_list)
    dataset['review_mod_tokens'] = dataset['review_mod_tokens'].apply(lambda x: [word for word in x if word not in stop_set])
    ##Extract part-of-speech tags
    dataset['review_tags'] = dataset['review_mod_tokens'].apply(nltk.tag.pos_tag)
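    ##pos_tag returns Penn Treebank tags (NN, VBD, JJ, ...); WordNetLemmatizer
    ##expects one of WordNet's four classes (noun, verb, adjective, adverb), so
    ##map the first letter of each Treebank tag, defaulting to noun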
    def get_wordnet_pos(tag):
        if tag.startswith('J'):
            return wordnet.ADJ
        elif tag.startswith('V'):
            return wordnet.VERB
        elif tag.startswith('N'):
            return wordnet.NOUN
        elif tag.startswith('R'):
            return wordnet.ADV
        else:
            return wordnet.NOUN
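    ##Example: pos_tag(['running']) tags the word as VBG, get_wordnet_pos maps
    ##'VBG' to wordnet.VERB, and lemmatize('running', wordnet.VERB) yields 'run'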
    dataset['review_wordnet'] = dataset['review_tags'].apply(lambda x: [(word, get_wordnet_pos(pos_tag)) for (word, pos_tag) in x])
    ##Lemmatize each token using its mapped part-of-speech tag
    lm = WordNetLemmatizer()
    dataset['review_lemmatized'] = dataset['review_wordnet'].apply(lambda x: [lm.lemmatize(word, tag) for word, tag in x])
    dataset['review_str'] = [' '.join(map(str, l)) for l in dataset['review_lemmatized']]
    ##TextBlob polarity of the lemmatized string: a float in [-1.0, 1.0]
    dataset['sentiment_score_textblob'] = dataset['review_str'].apply(lambda x: TextBlob(x).sentiment.polarity)
    afinn = Afinn()  ##instantiate the scorer once instead of on every call
    def afinn_polarity(text):
        return afinn.score(text)
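    ##score_sentiwordnet rates each token through its first (most frequent)
    ##WordNet synset; every SentiWordNet entry carries positive, negative and
    ##objectivity scores, and each term contributes (1 - obj) * (pos - neg),
    ##i.e. its polarity weighted by how subjective the word is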
    def score_sentiwordnet(doc):
        ##tokenize and look up each word's synsets in WordNet
        synsets = (wordnet.synsets(w) for w in nltk.word_tokenize(doc))
        ##use the first (most common) synset, skipping words WordNet lacks
        words = (s[0].name() for s in synsets if s)
        ##fetch the matching SentiWordNet entries
        words = (swn.senti_synset(s) for s in words)
        ##weight each word's polarity by its subjectivity
        scores = ((1 - w.obj_score()) * (w.pos_score() - w.neg_score()) for w in words)
        ##the document counts as positive if the summed score is positive
        return sum(scores)
    def ifformula(score):
        ##discretize a raw score: "0" = negative, "1" = neutral, "2" = positive
        if score < 0:
            return "0"
        elif score == 0:
            return "1"
        else:
            return "2"
    dataset['sentiment_score_afinn'] = dataset['review_str'].apply(afinn_polarity)
    dataset['sentiment_score_sentinet'] = dataset['review_str'].apply(score_sentiwordnet)
    dataset['sentiment_score_afinn'] = dataset['sentiment_score_afinn'].apply(ifformula)
    dataset['sentiment_score_textblob'] = dataset['sentiment_score_textblob'].apply(ifformula)
    dataset['sentiment_score_sentinet'] = dataset['sentiment_score_sentinet'].apply(ifformula)
    dataset = dataset[['UniversalMessageId', 'Sentiment', 'review_date', 'review_lemmatized', 'review_str', 'sentiment_score_textblob', 'sentiment_score_sentinet', 'sentiment_score_afinn']]
    dataset['word_count'] = dataset['review_lemmatized'].apply(lambda x: len(str(x).split()))
    dataset['review_length'] = dataset['review_str'].astype(str).apply(len)
    ###Extract text2emotion scores for every row
    for rx in range(0, dataset.shape[0]):
        text = dataset['review_str'].iloc[rx]
        result = te.get_emotion(text)
        result = pd.DataFrame(result.items(), columns=['Emotion', 'Score'])
        result = result.sort_values(by=['Score'], ascending=False)
        result['cum_percent'] = 100 * (result.Score.cumsum() / result.Score.sum())
        ##an all-zero emotion vector divides by zero and yields NaN; treat as 0
        result['cum_percent'] = result['cum_percent'].fillna(0)
        result.reset_index(drop=True, inplace=True)
        print(rx)  ##progress indicator
        ##Collect emotions until the cumulative share reaches 100%
        store1 = []
        for i in range(0, result.shape[0]):
            store1.append(result['Emotion'][i])
            if result['cum_percent'][i] == 100.0:
                break
        mylist = [rx, store1, result['cum_percent'].max()]
        store.append(mylist)
    result_emotions = pd.DataFrame(store, columns=['Index_col', 'Emotions', 'Max_score'])
    result_emotions['Emotions'] = [','.join(map(str, l)) for l in result_emotions['Emotions']]
    ##rows where text2emotion found no emotion at all get the label 'None'
    result_emotions['Emotions'] = np.where(result_emotions['Max_score'] == 0, 'None', result_emotions['Emotions'])
    ##merge on index; assumes dataset carries a default RangeIndex so positions line up
    dataset = pd.merge(dataset, result_emotions, how="left", left_index=True, right_index=True)
    return dataset
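

##Example usage -- a minimal sketch, not part of the original gist. It assumes
##a DataFrame carrying the columns the function reads ('review',
##'UniversalMessageId', 'Sentiment', 'review_date') and that the NLTK data
##listed above has been downloaded.
if __name__ == "__main__":
    sample = pd.DataFrame({
        'UniversalMessageId': [101, 102],
        'Sentiment': ['Positive', 'Negative'],
        'review_date': ['2021-08-01', '2021-08-02'],
        'review': ["I can't believe how good this product is!",
                   "Worst purchase ever, truly awful."],
    })
    processed = pre_processing(sample, my_stop_list=['product'])
    print(processed[['review_str', 'sentiment_score_textblob',
                     'sentiment_score_afinn', 'sentiment_score_sentinet', 'Emotions']])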