# importing the libraries
import tensorflow as tf
import numpy as np
import pandas as pd
import json
import random
import string
import nltk
from speak import speak  # assumed local text-to-speech helper, not a standard package
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Input, Embedding, LSTM, Dense, GlobalMaxPooling1D, Flatten
from tensorflow.keras.models import Model
from sklearn.preprocessing import LabelEncoder
import matplotlib.pyplot as plt
# importing the dataset
with open('content.json') as content:
    data1 = json.load(content)

# collecting the data into lists
tags = []
inputs = []
responses = {}
for intent in data1['intents']:
    responses[intent['tag']] = intent['responses']
    for line in intent['input']:
        inputs.append(line)
        tags.append(intent['tag'])
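# Assumed shape of content.json, inferred from the loop above (the tags and
# phrases here are illustrative, not the actual dataset):
# {
#   "intents": [
#     {"tag": "greeting",
#      "input": ["hi", "hello", "hey there"],
#      "responses": ["Hello!", "Hi there, how can I help?"]},
#     {"tag": "goodbye",
#      "input": ["bye", "see you later"],
#      "responses": ["Goodbye!", "Talk to you soon."]}
#   ]
# }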
# converting to a dataframe and shuffling the rows
data = pd.DataFrame({"inputs": inputs,
                     "tags": tags})
data = data.sample(frac=1)  # frac=1 returns all rows in random order
# removing punctuation and lowercasing
data['inputs'] = data['inputs'].apply(
    lambda wrd: [ltrs.lower() for ltrs in wrd if ltrs not in string.punctuation])
data['inputs'] = data['inputs'].apply(lambda wrd: ''.join(wrd))
data  # displays the cleaned dataframe when run in a notebook
# tokenizing the data
tokenizer = Tokenizer(num_words=2000)
tokenizer.fit_on_texts(data['inputs'])
train = tokenizer.texts_to_sequences(data['inputs'])
# applying padding so every sequence has the same length
x_train = pad_sequences(train)
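# Quick shape check (added for clarity): each row of x_train is one input
# phrase as a zero-padded sequence of word ids, so the shape is
# (number_of_samples, longest_sequence_length).
print("x_train shape:", x_train.shape)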
# encoding the output tags as integers
le = LabelEncoder()
y_train = le.fit_transform(data['tags'])

input_shape = x_train.shape[1]
print(input_shape)
# defining the vocabulary size and the number of output classes
vocabulary = len(tokenizer.word_index)
print("number of unique words:", vocabulary)
output_length = le.classes_.shape[0]
print("output length:", output_length)
# creating the model
i = Input(shape=(input_shape,))
x = Embedding(vocabulary + 1, 10)(i)  # +1 because Keras word indices start at 1 and 0 is reserved for padding
x = LSTM(10, return_sequences=True)(x)
x = Flatten()(x)
x = Dense(output_length, activation="softmax")(x)
model = Model(i, x)
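# Optional sanity check: print a layer-by-layer overview of the network,
# including each layer's output shape and parameter count.
model.summary()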
# compiling the model; sparse_categorical_crossentropy matches the integer labels produced by LabelEncoder
model.compile(loss="sparse_categorical_crossentropy", optimizer='adam', metrics=['accuracy'])

# training the model
history = model.fit(x_train, y_train, epochs=200)
# plotting training accuracy and loss
plt.plot(history.history['accuracy'], label='training set accuracy')
plt.plot(history.history['loss'], label='training set loss')
plt.legend()
plt.show()
# chatting
while True:
    texts_p = []
    prediction_input = input('You : ')

    # removing punctuation and converting to lowercase
    prediction_input = [letters.lower() for letters in prediction_input
                        if letters not in string.punctuation]
    prediction_input = ''.join(prediction_input)
    texts_p.append(prediction_input)

    # tokenizing and padding to the training sequence length
    prediction_input = tokenizer.texts_to_sequences(texts_p)
    prediction_input = np.array(prediction_input).reshape(-1)
    prediction_input = pad_sequences([prediction_input], input_shape)

    # getting the predicted class from the model
    output = model.predict(prediction_input)
    output = output.argmax()

    # mapping the predicted class back to its tag and picking one response
    response_tag = le.inverse_transform([output])[0]
    response = random.choice(responses[response_tag])  # choose once so the printed and spoken replies match
    print("Dew :", response)
    speak(response)
    if response_tag == "goodbye":
        break