@rishisidhu
Last active January 30, 2020 02:09
A retrieval-based Python chatbot that acts as an FAQ answerer
#Imports
import string
import random
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import PunktSentenceTokenizer
import sklearn
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd
from termcolor import colored
#NLTK Downloads (Need to do only once)
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')
nltk.download('nps_chat')
#Global Constants
GREETING_INPUTS = ("hello", "hi")
GREETING_RESPONSES = ["hi", "hey", "*nods*", "hi there", "Talkin' to me?"]
FILENAME = "canada_faq.txt"
#Global Variables
lem = nltk.stem.WordNetLemmatizer()
remove_punctuation = dict((ord(punct), None) for punct in string.punctuation)
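# Illustration (assumed input, not from the FAQ file): str.translate with the
# table above maps every punctuation character to None, e.g.
#   "hello, world!".translate(remove_punctuation)  ->  "hello world"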
#Functions
'''
fetch_features transforms a chat message into a classifier-friendly feature dict
'''
def fetch_features(chat):
    features = {}
    for word in nltk.word_tokenize(chat):
        features['contains({})'.format(word.lower())] = True
    return features
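# Illustration (hypothetical chat line): fetch_features("Hi there?") yields
#   {'contains(hi)': True, 'contains(there)': True, 'contains(?)': True}
# i.e. a bag-of-words feature dict in the format nltk classifiers expect.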
'''
lemmatise performs lemmatization on words
'''
def lemmatise(tokens):
    return [lem.lemmatize(token) for token in tokens]
'''
tokenise lowercases, strips punctuation, tokenizes, and lemmatizes the text
'''
def tokenise(text):
    return lemmatise(nltk.word_tokenize(text.lower().translate(remove_punctuation)))
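# Illustration (hypothetical input): tokenise("Dogs are running!") lowercases,
# strips punctuation, tokenizes, then lemmatizes, giving ['dog', 'are', 'running'].
# (WordNetLemmatizer defaults to noun POS, so 'running' is left unchanged.)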
'''
greet checks the sentence for a known greeting word and returns a canned greeting response
'''
def greet(sentence):
    for word in sentence.split():
        if word.lower() in GREETING_INPUTS:
            return random.choice(GREETING_RESPONSES)
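# Illustration: greet("hello bot") returns a random GREETING_RESPONSES entry
# because "hello" is in GREETING_INPUTS; greet("visa fees?") returns None,
# which the main loop below uses to fall through to match().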
'''
match matches a user input to the existing set of questions
'''
def match(user_response):
    resp = ''
    q_list.append(user_response)
    TfidfVec = TfidfVectorizer(tokenizer=tokenise, stop_words='english')
    tfidf = TfidfVec.fit_transform(q_list)
    vals = cosine_similarity(tfidf[-1], tfidf)
    idx = vals.argsort()[0][-2]
    flat = vals.flatten()
    flat.sort()
    req_tfidf = flat[-2]
    if req_tfidf == 0:
        resp = resp + "Sorry! I don't know the answer to this. Would you like to try again? Type Ciao to exit."
        return resp
    else:
        resp_ids = qa_dict[idx]
        resp_str = ''
        s_id = resp_ids[0]
        end = resp_ids[1]
        while s_id < end:
            resp_str = resp_str + " " + sent_tokens[s_id]
            s_id += 1
        resp = resp + resp_str
        return resp
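# Why the -2 indices above: the user's query is the last row appended to q_list,
# so its cosine similarity with itself is always the maximum (1.0); the
# second-largest value/index therefore points at the closest stored question.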
#Training the classifier
chats = nltk.corpus.nps_chat.xml_posts()[:10000]
featuresets = [(fetch_features(chat.text), chat.get('class')) for chat in chats]
size = int(len(featuresets) * 0.1)
train_set, test_set = featuresets[size:], featuresets[:size]
classifier = nltk.MaxentClassifier.train(train_set)
#classifier = nltk.NaiveBayesClassifier.train(train_set) #If you need to test Naive Bayes as well
print(nltk.classify.accuracy(classifier, test_set))
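# Note: the nps_chat dialogue-act labels include 'whQuestion' and 'ynQuestion',
# which is why the extraction loop below only checks for "question" in the label.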
#Question Bank Creation
with open(FILENAME, 'r', errors='ignore') as ques_bank:
    qb_text = ques_bank.read()
qb_text = qb_text.lower()
sent_tokens = nltk.sent_tokenize(qb_text)  # converts to list of sentences
word_tokens = nltk.word_tokenize(qb_text)  # converts to list of words
qa_dict = {} #The Dictionary to store questions and corresponding answers
q_list = [] #List of all questions
s_count = 0 #Sentence counter
#Extract questions and answers
#Answer is all the content between 2 questions [assumption]
while s_count < len(sent_tokens):
    result = classifier.classify(fetch_features(sent_tokens[s_count]))
    if "question" in result.lower():
        next_question_id = s_count + 1
        if next_question_id >= len(sent_tokens):
            break  # the question bank ends on a question with no answer after it
        next_question = classifier.classify(fetch_features(sent_tokens[next_question_id]))
        while not ("question" in next_question.lower()) and next_question_id < len(sent_tokens) - 1:
            next_question_id += 1
            next_question = classifier.classify(fetch_features(sent_tokens[next_question_id]))
        q_list.append(sent_tokens[s_count])
        end = next_question_id
        if next_question_id - s_count > 5:
            end = s_count + 5  # cap answers at 5 sentences
        qa_dict.update({len(q_list) - 1: [s_count + 1, end]})
        s_count = next_question_id
    else:
        s_count += 1
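# Illustration (hypothetical indices): if sentence 4 is classified as a question
# and the next question is sentence 9, qa_dict maps that question's q_list index
# to [5, 9]; the 5-sentence cap keeps very long answers from being returned whole.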
#Response Fetching
flag = True
print(colored("NEO:\nI am Neo, I have all the answers. If you want to exit, type Ciao.", 'blue', attrs=['bold']))
while flag:
    print(colored("\nYOU:", 'red', attrs=['bold']))
    u_input = input()
    u_input = u_input.lower()
    if u_input != 'ciao':
        # Call greet() once so the None-check and the printed reply agree
        greeting = greet(u_input)
        if greeting is not None:
            print(colored("\nNEO:", 'blue', attrs=['bold']))
            print(greeting)
        else:
            print(colored("\nNEO:", 'blue', attrs=['bold']))
            print(colored(match(u_input).strip().capitalize(), 'blue'))
            q_list.remove(u_input)  # drop the query that match() appended to q_list
    else:
        flag = False
        print(colored("\nNEO: Bye! Take care.", 'blue', attrs=['bold']))
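# A sample session might look like this (illustrative; answers depend on canada_faq.txt):
#   YOU: hi                            NEO: hi there
#   YOU: how long is the visa valid?   NEO: <the sentences that follow the
#                                            closest matching question in the FAQ>
#   YOU: ciao                          NEO: Bye! Take care.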
@pratik-dani

Hi, I just wanted to ask what the response function on line 137, in colored(response(u_input)), is, as I was not able to figure that part out. Would you mind explaining it to me?

Never mind, I got it. You need to update your code: the response function on line 137 should be changed to the match function.

@rishisidhu
Author


Thanks @devildani. Corrected it.
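
For reference, the corrected call in the response-fetching loop, as it now appears in the updated code above:

print(colored(match(u_input).strip().capitalize(), 'blue'))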
