Skip to content

Instantly share code, notes, and snippets.

@AkhilRD
Created August 11, 2022 06:38
Show Gist options
  • Save AkhilRD/904da4728512e475c37c2461fca5f16f to your computer and use it in GitHub Desktop.
Save AkhilRD/904da4728512e475c37c2461fca5f16f to your computer and use it in GitHub Desktop.
A function that pre-processes textual data and retrieves a sentiment score for each row of a pandas DataFrame.
from nltk.tokenize import sent_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
import unicodedata
import nltk
import re
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
#function
_STOPWORDS = None  # lazily-built frozenset of English stopwords, shared across calls


def _english_stopwords():
    """Return NLTK's English stopwords as a frozenset, loading the corpus once."""
    global _STOPWORDS
    if _STOPWORDS is None:
        # A frozenset gives O(1) membership tests; the original list was
        # re-read from the corpus and scanned linearly on every call.
        _STOPWORDS = frozenset(nltk.corpus.stopwords.words('english'))
    return _STOPWORDS


def preprocessing(text):
    """Normalise *text* and return its lemmatised, stopword-free tokens.

    Steps, in order:
      1. NFKD-normalise, then drop every character that cannot be encoded
         as ASCII.
      2. Lower-case the result.
      3. Remove every character that is neither a word character nor
         whitespace (regex ``[^\\w\\s]``), then split on whitespace.
      4. Discard English stopwords and lemmatise the remaining words with
         ``WordNetLemmatizer``.

    Args:
        text: The raw string to clean. Must be a ``str``;
            ``unicodedata.normalize`` raises ``TypeError`` otherwise.

    Returns:
        A list of lemmatised tokens (possibly empty).

    Note:
        Requires the NLTK ``stopwords`` and ``wordnet`` corpora to be
        available locally.
    """
    stop = _english_stopwords()
    lem = WordNetLemmatizer()
    text = (
        unicodedata.normalize('NFKD', text)
        .encode('ascii', 'ignore')  # drop anything that is not plain ASCII
        .decode('utf-8', 'ignore')
        .lower()
    )
    words = re.sub(r'[^\w\s]', '', text).split()
    return [lem.lemmatize(w) for w in words if w not in stop]
# Replace each entry of the text column with its list of cleaned tokens.
# Series.apply works column-wise, so no per-row Series has to be built as
# with DataFrame.apply(axis=1).
df['text'] = df['text'].apply(preprocessing)
def final(lem_col):
    """Join the tokens in *lem_col* into one space-separated string.

    Args:
        lem_col: An iterable of strings (e.g. the token list produced by
            ``preprocessing``).

    Returns:
        The tokens joined by single spaces; ``""`` for an empty iterable.
    """
    return " ".join(lem_col)
# Turn each token list in the text column back into a single string.
# Series.apply works column-wise, so no per-row Series has to be built as
# with DataFrame.apply(axis=1).
df['text'] = df['text'].apply(final)
#sentiment
sent_analyzer = SentimentIntensityAnalyzer()  # shared VADER analyzer used by senti()
cs = []  # compound scores appended by senti(); later assigned to df['sentiment_score']
def senti(text):
    """Score every entry of *text* and record the compound sentiment scores.

    Each entry is passed to ``sent_analyzer.polarity_scores`` and the
    ``'compound'`` value of the result is kept.

    Args:
        text: An iterable of strings, e.g. a pandas Series of cleaned text.

    Returns:
        The list of compound scores computed by this call, in input order.

    Side effects:
        The scores are also appended to the module-level list ``cs``, which
        existing callers read. ``cs`` is never cleared here, so calling this
        function more than once accumulates scores from every call.
    """
    scores = [
        sent_analyzer.polarity_scores(entry)['compound'] for entry in text
    ]
    cs.extend(scores)
    return scores
# Score the cleaned text; senti() appends its results to the module-level `cs`.
senti(df['text'])
df['sentiment_score'] = cs
# Keep only rows whose score is non-zero (neutral sentiment is discarded).
df = df.loc[df['sentiment_score'].ne(0)]
df['sentiment_score']
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment