# anirudhr95/topic_analysis.py

## topic_analysis.py
import pandas as pd
import xlrd
from collections import defaultdict
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
import string
import re


# NOTE: xlrd 2.0 and later read only legacy .xls workbooks; opening this
# .xlsx file requires xlrd < 2.0.
excel_object = xlrd.open_workbook('sample.xlsx')

# Conversation key -> {'bot': text, 'cust': text}. Both levels are
# defaultdicts, so a speaker that never spoke reads back as ''.
key_wise_feedback = defaultdict(lambda: defaultdict(str))

# Key of the conversation currently being read; rows that appear before the
# first key are grouped under ''.
key = ''

chat_logs = excel_object.sheet_by_name('chatLogs')
user_feedback = excel_object.sheet_by_name('UserFeedback')

# Read each column once: Sheet.col() builds a fresh list of cells on every
# call, so fetching column 0 inside the loop made the pass quadratic in the
# number of rows.
key_cells = chat_logs.col(0)
message_cells = chat_logs.col(4)

for i, row in enumerate(message_cells):
    if i == 0:
        # Row 0 is skipped (presumably the header row).
        continue

    # str() keeps a non-text (e.g. numeric) message cell from crashing on
    # .strip().
    text = str(row.value).strip()

    # Take the cell's value directly instead of parsing repr(cell): the repr
    # approach raised IndexError on numeric cells and mangled keys that
    # contain an apostrophe.
    cell_key = key_cells[i].value
    if cell_key != '':
        key = str(cell_key)
        # The message on a row that starts a new conversation is discarded.
        text = ''

    # Bot lines carry the literal prefix 'VA:  ' (two spaces after the colon);
    # everything else is attributed to the customer.
    speaker = 'bot' if text.startswith('VA:  ') else 'cust'
    so_far = key_wise_feedback[key][speaker]
    if so_far and text:
        # Separate consecutive messages so the last word of one message does
        # not fuse with the first word of the next.
        text = ' ' + text
    key_wise_feedback[key][speaker] = so_far + text


# Shared text-normalization resources used by clean().
stop = set(stopwords.words('english'))   # English stopwords from NLTK
exclude = set(string.punctuation)        # ASCII punctuation characters
lemma = WordNetLemmatizer()


def clean(doc):
    """Normalize raw text into a space-separated string of lemmatized tokens.

    The text is lowercased and split on whitespace; stopwords are dropped,
    punctuation characters are deleted from each remaining token, stopwords
    are dropped a second time, and the surviving tokens are lemmatized.

    The first stopword pass runs before punctuation removal so that
    contractions present in the stopword list (e.g. "don't") still match.
    The second pass catches stopwords that were attached to punctuation in
    the input (e.g. "the," or "is."), which previously slipped through.

    Args:
        doc: The raw text to normalize.

    Returns:
        The normalized tokens joined by single spaces ('' for empty input).
    """
    candidates = [token for token in doc.lower().split() if token not in stop]
    stripped = (
        ''.join(ch for ch in token if ch not in exclude) for token in candidates
    )
    # Tokens made only of punctuation become '' and are discarded here.
    kept = [token for token in stripped if token and token not in stop]
    return " ".join(lemma.lemmatize(token) for token in kept)

# Replace each conversation's raw 'bot' and 'cust' text with a one-element
# list holding its cleaned token list (one "document" per speaker).
for conversation in key_wise_feedback.values():
    bot_tokens = clean(conversation['bot']).split()
    conversation['bot'] = [bot_tokens]
    cust_tokens = clean(conversation['cust']).split()
    conversation['cust'] = [cust_tokens]

# Preview: show the customer tokens of the first conversation only.
for first_key, first_conversation in key_wise_feedback.items():
    print(first_key, first_conversation['cust'])
    break

import gensim
from gensim import corpora

def return_corpus(text):
    """Build a gensim ``corpora.Dictionary`` from ``text``.

    Despite the name, the result is a token-to-id dictionary, not a corpus.

    Args:
        text: Passed straight to ``corpora.Dictionary``; presumably an
            iterable of tokenized documents (lists of strings).

    Returns:
        The ``corpora.Dictionary`` built from ``text``.
    """
    vocabulary = corpora.Dictionary(text)
    return vocabulary

# for i in key_wise_feedback:
# 	key_wise_feedback[i]['bot_corpus'] = return_corpus(key_wise_feedback[i]['bot'])
# 	key_wise_feedback[i]['cust_corpus'] = return_corpus(key_wise_feedback[i]['cust'])

# Flatten every conversation's token lists into one token stream per speaker.
# Working on token lists (rather than concatenating joined strings) keeps the
# last token of one conversation from fusing with the first token of the next.
bot_tokens = []
cust_tokens = []
for conversation in key_wise_feedback.values():
    for words in conversation['bot']:
        bot_tokens.extend(words)
    for words in conversation['cust']:
        cust_tokens.extend(words)

# The 'VA:' speaker prefix survives cleaning as the token 'va'. Drop only that
# exact token: a substring replace would also cut "va" out of real words
# (e.g. "available" -> "ailable") and make the dictionary disagree with the
# documents. Tokens absent from the dictionary are ignored by doc2bow, so
# leaving 'va' out here also keeps it out of the bag-of-words vectors.
bot_tokens = [token for token in bot_tokens if token != u'va']

# Space-joined views of the two vocabularies, kept as module-level strings.
corpus_bot = u' '.join(bot_tokens)
corpus_cust = u' '.join(cust_tokens)

# One dictionary per speaker, each built from a single combined document.
dictionary_bot = corpora.Dictionary([bot_tokens])
dictionary_cust = corpora.Dictionary([cust_tokens])

# def assign_doc_term_matrix(bot_dict,cust_dict,bot,cust):

# 	# Converting list of documents (corpus) into Document Term Matrix using dictionary prepared above.
# 	doc_term_matrix_bot = [bot_dict.doc2bow(doc) for doc in bot]
# 	doc_term_matrix_cust = [cust_dict.doc2bow(doc) for doc in cust]

# 	return doc_term_matrix_bot,doc_term_matrix_cust

# for i in key_wise_feedback:
# 	key_wise_feedback[i]['bot_matrix'], key_wise_feedback[i]['cust_matrix'] = assign_doc_term_matrix(key_wise_feedback[i]['bot_corpus'],key_wise_feedback[i]['cust_corpus'],key_wise_feedback[i]['bot'],key_wise_feedback[i]['cust'])

# Convert each conversation's token lists into bag-of-words vectors, using the
# speaker-specific dictionary for the id mapping.
for conversation in key_wise_feedback.values():
    conversation['bot_matrix'] = list(map(dictionary_bot.doc2bow, conversation['bot']))
    conversation['cust_matrix'] = list(map(dictionary_cust.doc2bow, conversation['cust']))

# for i in key_wise_feedback:
# 	print(key_wise_feedback[i]['bot_matrix'])
# 	break

# Short alias for the gensim LDA model class.
Lda = gensim.models.ldamodel.LdaModel

# Train two 2-topic models per conversation (bot first, then customer), each
# on that conversation's own bag-of-words matrix with 50 passes.
_lda_jobs = (
    ('bot_matrix', 'bot_lda', dictionary_bot),
    ('cust_matrix', 'cust_lda', dictionary_cust),
)
for conversation in key_wise_feedback.values():
    for matrix_key, model_key, vocabulary in _lda_jobs:
        conversation[model_key] = Lda(
            conversation[matrix_key],
            num_topics=2,
            id2word=vocabulary,
            passes=50,
        )


# Matches the double-quoted words in a formatted topic string such as
# '0.120*"refund" + 0.080*"order" + ...'.
_quoted_word = re.compile(r'"([^"]*)"')

# Store and print the top three words of topic 0 for the bot and customer
# models. The loop exits after its first iteration, so only the first
# conversation gets 'bot_words' / 'cust_words'.
#
# An unreachable, indentation-mangled copy of the whole script used to follow
# the `break` inside this loop; it made the file unparseable and was removed.
for i in key_wise_feedback:
    # print_topics(...) yields (topic_id, formatted_string) pairs; [0][1] is
    # the formatted string of the first topic.
    bot_topic = key_wise_feedback[i]['bot_lda'].print_topics(num_topics=2, num_words=3)[0][1]
    cust_topic = key_wise_feedback[i]['cust_lda'].print_topics(num_topics=2, num_words=3)[0][1]

    key_wise_feedback[i]['bot_words'] = " ".join(_quoted_word.findall(bot_topic))
    key_wise_feedback[i]['cust_words'] = " ".join(_quoted_word.findall(cust_topic))

    print(key_wise_feedback[i]['bot_words'])
    print(key_wise_feedback[i]['cust_words'])
    break