Skip to content

Instantly share code, notes, and snippets.

@quetzaluz
Created October 11, 2018 17:45
Show Gist options
  • Save quetzaluz/7f5b2f9f2cc089b321756da629e85220 to your computer and use it in GitHub Desktop.
Example of predicting NPS score from feedback comments; it also provides a means of bucketizing customer sentiment.
import os
import pandas as pd
from sklearn.model_selection import train_test_split
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
import time
# Record the wall-clock start so total runtime can be reported at the end.
start_time = time.time()
print('Starting run of analytic model...')

# Load the Medallia NPS export that sits alongside this script's package.
csv_path = os.path.join(os.path.dirname(__file__), '../datasets/medallia.csv')
data = pd.read_csv(csv_path)
def bucketize_score(x, detractor_max=5, passive_max=7):
    """Map a raw 0-10 NPS score to a sentiment bucket label.

    Args:
        x: numeric NPS score.
        detractor_max: highest score still labelled 'Negative' (default 5).
        passive_max: highest score still labelled 'Neutral' (default 7).

    Returns:
        'Negative', 'Neutral', or 'Positive'.

    Fix: the original tested ``x == 6 or x == 7`` for the neutral bucket,
    so any non-integer score between 5 and 6 (e.g. 5.5) fell through to
    'Positive'; threshold comparisons close that gap. Integer inputs are
    bucketed exactly as before.
    NOTE(review): standard NPS buckets are 0-6 / 7-8 / 9-10 — confirm the
    0-5 / 6-7 / 8-10 split here is intentional.
    """
    if x <= detractor_max:
        return 'Negative'
    if x <= passive_max:
        return 'Neutral'
    return 'Positive'
# Replace the numeric NPS score column with its sentiment bucket label.
# (The original wrapped bucketize_score in a redundant lambda; .apply can
# take the function directly.)
data['NPS_score'] = data['NPS_score'].apply(bucketize_score)

# Hold out 15% of rows for evaluation.
train, test = train_test_split(data, test_size=0.15)

comments = []  # accumulates (token_list, bucket_label) pairs from training rows
stopwords_set = set(stopwords.words('english'))  # You need to download this via nltk.download(), see readme
tokenizer = RegexpTokenizer(r'\w+')  # Strip punctuation and special characters from words
def tokenize_words(strng):
    r"""Tokenize a raw comment string into lowercase keyword tokens.

    The module-level ``RegexpTokenizer(r'\w+')`` splits on word-character
    runs (punctuation is discarded); tokens shorter than 3 characters,
    tokens containing 'http' (URL fragments), and English stopwords are
    then dropped.

    Fix: the original first pre-spaced '!', '?' and '.' "so that
    punctuation can be tokenized", but an ``r'\w+'`` tokenizer can never
    emit a punctuation token, so that pass was dead code; removing it
    leaves the output unchanged.
    """
    words_filtered = [tok.lower() for tok in tokenizer.tokenize(strng) if len(tok) >= 3]
    return [word for word in words_filtered
            if 'http' not in word
            and word not in stopwords_set]
# Build the (token_list, bucket_label) training pairs from each training row.
for _, row in train.iterrows():
    comments.append((tokenize_words(row.NPS_comment), row.NPS_score))

# Per-bucket Series of raw test comments, used for the accuracy report below.
test_pos = test[test['NPS_score'] == 'Positive']['NPS_comment']
test_neu = test[test['NPS_score'] == 'Neutral']['NPS_comment']
test_neg = test[test['NPS_score'] == 'Negative']['NPS_comment']
# Extracting word features
def get_words_in_comments(comment):
    """Flatten (words, sentiment) pairs into a single list of words.

    Args:
        comment: iterable of (word_list, sentiment_label) tuples; the
            sentiment element is ignored.

    Returns:
        One list containing every word, in order of appearance.

    Fix: the original accumulated into a variable named ``all``, shadowing
    the builtin; a flattening comprehension avoids the shadow entirely.
    """
    return [word for words, _sentiment in comment for word in words]
def get_word_features(wordlist):
    """Return the unique words of *wordlist* as a dict key view.

    Only vocabulary membership is used downstream (``extract_features``
    iterates the result); the word frequencies the original computed via
    ``nltk.FreqDist`` were discarded. ``dict.fromkeys`` produces the same
    deduplicated, first-seen-ordered key view without needing nltk.
    """
    return dict.fromkeys(wordlist).keys()
# Vocabulary of unique training-comment words; each becomes one Boolean feature.
w_features = get_word_features(get_words_in_comments(comments))
def extract_features(document, word_features=None):
    """Build a Boolean bag-of-words feature dict for a tokenized document.

    Args:
        document: iterable of word tokens.
        word_features: vocabulary to test membership against; defaults to
            the module-level ``w_features`` built from the training set,
            preserving the original call signature.

    Returns:
        dict mapping 'contains(<word>)' to True/False for each vocabulary
        word.
    """
    if word_features is None:
        word_features = w_features
    document_words = set(document)
    return {'contains(%s)' % word: word in document_words
            for word in word_features}
# apply_features materializes the Boolean feature dicts lazily over the
# (tokens, label) training pairs, then a Naive Bayes classifier is fit on them.
training_set = nltk.classify.apply_features(extract_features, comments)
classifier = nltk.NaiveBayesClassifier.train(training_set)
def _bucket_accuracy(bucket_comments, expected_label):
    """Classify each comment in one test bucket; return (hits, total)."""
    hits = 0
    for comment in bucket_comments:
        predicted = classifier.classify(extract_features(tokenize_words(comment)))
        if predicted == expected_label:
            hits += 1
    return hits, len(bucket_comments)


def _pct(hits, total):
    """Percentage of hits over total; 0.0 for an empty bucket (no crash)."""
    return (float(hits) / float(total)) * 100 if total else 0.0


neg_cnt, neg_total = _bucket_accuracy(test_neg, 'Negative')
neu_cnt, neu_total = _bucket_accuracy(test_neu, 'Neutral')
pos_cnt, pos_total = _bucket_accuracy(test_pos, 'Positive')

# BUG FIX: the original's overall figure divided by len(test_pos) +
# len(test_neg) only — the neutral bucket was silently dropped from the
# denominator — and ``all_cnt`` was incremented once per comment rather
# than once per correct prediction. Overall accuracy now means "correct
# predictions over all test comments", across all three buckets.
all_cnt = neg_cnt + neu_cnt + pos_cnt
all_total = neg_total + neu_total + pos_total

print('Accuracy of model predicting bucketized NPS Score from comments:')
print('[Negative]: %s/%s (%s%%)' % (neg_cnt, neg_total, _pct(neg_cnt, neg_total)))
print('[Neutral]: %s/%s (%s%%)' % (neu_cnt, neu_total, _pct(neu_cnt, neu_total)))
print('[Positive]: %s/%s (%s%%)' % (pos_cnt, pos_total, _pct(pos_cnt, pos_total)))
print('[Overall]: %s/%s (%s%%)' % (all_cnt, all_total, _pct(all_cnt, all_total)))
print('Ending run of analytic model.')
print('Runtime was %s seconds' % (time.time() - start_time))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment