Created
August 11, 2022 06:38
-
-
Save AkhilRD/904da4728512e475c37c2461fca5f16f to your computer and use it in GitHub Desktop.
A function that helps pre-process textual data and retrieve sentiment scores in a pandas data-frame.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from nltk.tokenize import sent_tokenize | |
from nltk.stem import WordNetLemmatizer | |
from nltk.corpus import stopwords | |
import unicodedata | |
import nltk | |
import re | |
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer | |
def preprocessing(text):
    """Normalize, clean, and lemmatize one raw text string.

    Steps: strip characters that have no ASCII representation (via NFKD
    normalization), lower-case, remove punctuation, split on whitespace,
    drop English stopwords, and lemmatize each surviving word.

    Parameters
    ----------
    text : str
        Raw input text.

    Returns
    -------
    list[str]
        Lemmatized, stopword-free tokens.
    """
    # set() turns the stopword list into O(1) membership tests.
    # (Original had a stray '.' after words('english') — a syntax error.)
    stop = set(stopwords.words('english'))
    lem = WordNetLemmatizer()  # initialize the WordNet lemmatizer
    # Drop accents / non-ASCII characters, then lower-case.
    text = (unicodedata.normalize('NFKD', text)
            .encode('ascii', 'ignore')
            .decode('utf-8', 'ignore')
            .lower())
    # Strip punctuation, then tokenize on whitespace.
    words = re.sub(r'[^\w\s]', '', text).split()
    return [lem.lemmatize(w) for w in words if w not in stop]
# Clean every row of the text column: each string becomes a list of
# lemmatized tokens. Series.apply avoids the per-row overhead of the
# original frame-wide apply(..., axis=1).
df['text'] = df['text'].apply(preprocessing)
def final(lem_col):
    """Join a list of lemmatized tokens back into one space-separated string."""
    return " ".join(lem_col)
# Re-join each token list so the text column holds plain strings again.
# Series.apply replaces the needlessly row-wise df.apply(..., axis=1).
df['text'] = df['text'].apply(final)
# --- Sentiment scoring ---
sent_analyzer = SentimentIntensityAnalyzer()  # VADER analyzer
cs = []  # accumulator: one compound score per row, filled by senti()
def senti(text):
    """Append the VADER compound score of every entry of `text` to the
    module-level list `cs` (side-effect only; returns None).

    Parameters
    ----------
    text : pandas.Series of str
        Cleaned text column.
    """
    # Iterate values directly instead of positional range(len)/.iloc
    # lookups — same scores, same order, regardless of the index.
    cs.extend(sent_analyzer.polarity_scores(t)['compound'] for t in text)
senti(df['text'])                    # fill cs with one score per row
df['sentiment_score'] = cs
# Drop rows whose compound score is exactly 0 (i.e. neutral sentiment).
# A single-column boolean mask is equivalent to the original
# (df[['sentiment_score']] != 0).all(axis=1) but simpler.
df = df[df['sentiment_score'] != 0]
df['sentiment_score']
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment