Created
June 29, 2018 13:46
-
-
Save johndpope/b0d9a025c6e54dc1e07ab6100c34a24a to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# -*- coding: utf-8 -*- | |
__author__ = 'Oswaldo Ludwig' | |
__version__ = '1.01' | |
from tensorflow.python.keras.layers import Dense, Reshape, Flatten, \ | |
Dropout, Input, concatenate, Conv2D, MaxPooling2D, UpSampling2D, Conv2DTranspose, Activation | |
from keras.layers import Input, Embedding, LSTM, Dense, RepeatVector, Dropout, merge,concatenate | |
from keras.optimizers import Adam | |
from keras.models import Model | |
from keras.models import Sequential | |
from keras.layers import Activation, Dense | |
from keras.preprocessing import sequence | |
from six.moves import input | |
from read_activations import get_activations, display_activations | |
import keras.backend as K | |
import numpy as np | |
np.random.seed(1234) # for reproducibility | |
import pickle | |
import theano | |
import os.path | |
import sys | |
import nltk | |
import re | |
import time | |
from keras.utils import plot_model | |
from chatterbot.trainers import ChatterBotCorpusTrainer | |
import chatterbot | |
from chatterbot import ChatBot | |
# Width of each word vector produced by the shared Embedding layer.
WORD_EMBED_HIDDEN_SIZE = 100
# Number of units in each of the two LSTMs (context and partial answer).
SENTENCE_EMBEDDING_SIZE = 300
# Vocabulary size: Embedding input_dim and width of the softmax output.
DICTIONARY_SIZE = 7000
# Fixed token length of both model inputs; tokenize() left-pads with zeros.
MAX_SEQUENCE_LENGTH_INPUT = 50
# Pickled vocabulary; the first element of every entry is the word string.
vocabulary_file = 'vocabulary_movie'
# Weights loaded for the first decoding pass of every chat turn.
weights_file = 'my_model_weights20.h5'
# Weights loaded for the second decoding pass, whose answer is printed.
weights_file_GAN = 'my_model_weights.h5'
# Word substituted by tokenize() for anything missing from the vocabulary.
unknown_token = 'something'
# Output files the chat loop writes contexts and answers to, one per line.
file_saved_context = 'saved_context'
file_saved_answer = 'saved_answer'
# Name given to the ChatterBot instance and used in the greeting and in
# preprocess() when rewriting names.
name_of_computer = 'john'
def greedy_decoder(input):
    """Decode an answer one token at a time, always taking the likeliest one.

    The partial answer is a sliding window of MAX_SEQUENCE_LENGTH_INPUT
    token indices that starts as zeros with BOS (index 2) in the last slot.
    On every step the module-level ``model`` scores the next token, the
    window shifts left by one and the winning index is written at the end.
    All MAX_SEQUENCE_LENGTH_INPUT - 1 steps are always executed.

    Args:
        input: Encoded context passed as the first input of
            ``model.predict``.

    Returns:
        A ``(text, prob)`` tuple. ``text`` is the space-terminated words of
        every index left in the window that is below DICTIONARY_SIZE - 2.
        ``prob`` is the product of the winning probabilities of the steps
        before EOS (index 3) was first produced.
    """
    eos_seen = False
    prob = 1
    ans_partial = np.zeros((1, MAX_SEQUENCE_LENGTH_INPUT))
    ans_partial[0, -1] = 2  # the index of the symbol BOS (begin of sentence)

    for _ in range(MAX_SEQUENCE_LENGTH_INPUT - 1):
        scores = model.predict([input, ans_partial])
        best_score = np.max(scores[0, :])
        best_index = np.argmax(scores)

        # Slide the window one slot to the left and append the new token.
        ans_partial[0, 0:-1] = ans_partial[0, 1:]
        ans_partial[0, -1] = best_index

        if best_index == 3:  # the index of the symbol EOS (end of sentence)
            eos_seen = True
        if not eos_seen:
            # Only tokens generated before EOS contribute to the confidence.
            prob = prob * best_score

    pieces = []
    for raw_index in ans_partial[0]:
        index = raw_index.astype(int)
        if index < (DICTIONARY_SIZE - 2):
            pieces.append(vocabulary[index][0] + ' ')
    text = ''.join(pieces)
    return (text, prob)
# (pattern, replacement) pairs applied in order by preprocess(): contractions
# are expanded and punctuation is split off as separate tokens.
# NOTE(review): the text is lower-cased before these run, so the upper-case
# patterns ('B: ', 'A: ', 'EOS', 'BOS') can never match; they are kept as-is.
_PREPROCESS_REWRITES = (
    ('won’t', 'will not'),
    ('won\'t', 'will not'),
    ('wouldn’t', 'would not'),
    ('wouldn\'t', 'would not'),
    ('’m', ' am'),
    ('’re', ' are'),
    ('’ve', ' have'),
    ('’ll', ' will'),
    ('’s', ' is'),
    ('’d', ' had'),
    ('n’t', ' not'),
    ('\'m', ' am'),
    ('\'re', ' are'),
    ('\'ve', ' have'),
    ('\'ll', ' will'),
    ('\'s', ' is'),
    ('\'d', ' had'),
    ('can\'t', 'can not'),
    ('n\'t', ' not'),
    ('B: ', ''),
    ('A: ', ''),
    (',', ' ,'),
    (';', ' ;'),
    ('.', ' .'),
    ('?', ' ?'),
    ('!', ' !'),
    (':', ' :'),
    ('. ?', '? '),
    (', .', '.'),
    ('. ,', ','),
    ('EOS', ''),
    ('BOS', ''),
    ('eos', ''),
    ('bos', ''),
)

# Fragments that preprocess() replaces with a single space.
_PREPROCESS_NOISE = (
    '-', '_', ' *', ' /', '* ', '/ ', '\"', ' \\"', '\\ ', '--', '...',
    '. . .',
)

# Person names that preprocess() swaps for the speaker's or the bot's name.
# 'i\\97' is the same string the invalid escape 'i\97' used to produce.
_PREPROCESS_PERSON_NAMES = (
    'jeffrey', 'fred', 'benjamin', 'paula', 'walter', 'rachel', 'andy',
    'helen', 'harrington', 'kathy', 'ronnie', 'carl', 'annie', 'cole', 'ike',
    'milo', 'cole', 'rick', 'johnny', 'loretta', 'cornelius', 'claire',
    'romeo', 'casey', 'johnson', 'rudy', 'stanzi', 'cosgrove', 'wolfi',
    'kevin', 'paulie', 'cindy', 'paulie', 'enzo', 'mikey', 'i\\97', 'davis',
    'jeffrey', 'norman', 'johnson', 'dolores', 'tom', 'brian', 'bruce',
    'john', 'laurie', 'stella', 'dignan', 'elaine', 'jack', 'christ',
    'george', 'frank', 'mary', 'amon', 'david', 'tom', 'joe', 'paul', 'sam',
    'charlie', 'bob', 'marry', 'walter', 'james', 'jimmy', 'michael', 'rose',
    'jim', 'peter', 'nick', 'eddie', 'johnny', 'jake', 'ted', 'mike',
    'billy', 'louis', 'ed', 'jerry', 'alex', 'charles', 'tommy', 'bobby',
    'betty', 'sid', 'dave', 'jeffrey', 'jeff', 'marty', 'richard', 'otis',
    'gale', 'fred', 'bill', 'jones', 'smith', 'mickey',
)

# Endings that mean the sentence already carries its final punctuation.
_PREPROCESS_TERMINATORS = ('!', '?', '.', '! ', '? ', '. ')


def preprocess(raw_word, name):
    """Normalise a sentence into the spaced, lower-case form the model uses.

    Steps, in order: lower-case; drop a vocative use of the bot's own name
    (``name_of_computer``); expand contractions and detach punctuation;
    blank out noise fragments; swap known person names; remove runs of
    dots; collapse repeated spaces; make sure the sentence ends with
    punctuation. All rewrites are plain substring replacements.

    Args:
        raw_word: Sentence to clean. May be empty.
        name: Name substituted for a known person name that is being
            addressed (``', fred'`` or ``' fred ,'``). Self-introductions
            (``'i am fred'``, ``'my name is fred'``) are rewritten to
            ``name_of_computer`` instead.

    Returns:
        The cleaned sentence. A sentence reduced to a lone punctuation mark
        (or to nothing) becomes ``'what ?'``; one reduced to a single space
        becomes ``'i do not want to talk about it .'``.
    """
    raw_word = raw_word.lower()
    raw_word = raw_word.replace(', ' + name_of_computer, '')
    raw_word = raw_word.replace(name_of_computer + ' ,', '')

    for pattern, replacement in _PREPROCESS_REWRITES:
        raw_word = raw_word.replace(pattern, replacement)

    for fragment in _PREPROCESS_NOISE:
        raw_word = raw_word.replace(fragment, ' ')

    for term in _PREPROCESS_PERSON_NAMES:
        raw_word = raw_word.replace(', ' + term, ', ' + name)
        raw_word = raw_word.replace(' ' + term + ' ,', ' ' + name + ' ,')
        raw_word = raw_word.replace('i am ' + term, 'i am ' + name_of_computer)
        # The space after "is" was missing, so this rule could never match.
        raw_word = raw_word.replace('my name is ' + term,
                                    'my name is ' + name_of_computer)

    # Strip dot runs until nothing changes (previously a fixed 30 passes).
    # The two-space variant is accepted as well as the one-space one.
    previous = None
    while previous != raw_word:
        previous = raw_word
        for dots in ('. .', '.  .', '..'):
            raw_word = raw_word.replace(dots, '')

    # Collapse runs of spaces; the old loop replaced ' ' with ' ' (a no-op).
    while '  ' in raw_word:
        raw_word = raw_word.replace('  ', ' ')

    # endswith() is safe on an empty string, unlike raw_word[-1].
    if not raw_word.endswith(_PREPROCESS_TERMINATORS):
        raw_word = raw_word + ' .'

    if raw_word in (' !', ' ?', ' .', ' ! ', ' ? ', ' . '):
        raw_word = 'what ?'

    # A remainder of one space plus the appended ' .' gives two spaces and a
    # dot: nothing was left of the sentence, so fall back to a stock reply.
    if raw_word in ('  .', '  . '):
        raw_word = 'i do not want to talk about it .'

    return raw_word
def tokenize(sentences):
    """Turn a sentence into a fixed-width row of vocabulary indices.

    The sentence is split with ``nltk.word_tokenize``; words missing from
    the module-level ``vocabulary`` are replaced by ``unknown_token``. The
    indices are right-aligned in a zero-filled row, and only the last
    MAX_SEQUENCE_LENGTH_INPUT tokens are kept when the sentence is longer.

    Args:
        sentences: The text to encode.

    Returns:
        A float array of shape ``(1, MAX_SEQUENCE_LENGTH_INPUT)``. It is all
        zeros when the sentence yields no tokens.

    Raises:
        KeyError: If ``unknown_token`` itself is missing from the vocabulary
            and an out-of-vocabulary word is encountered.
    """
    # Tokenizing the sentences into words:
    words = nltk.word_tokenize(sentences)
    word_to_index = {entry[0]: i for i, entry in enumerate(vocabulary)}
    tokenized_sentences = [w if w in word_to_index else unknown_token for w in words]
    print("tokenized_sentences:",tokenized_sentences)
    X = np.asarray([word_to_index[w] for w in tokenized_sentences])

    Q = np.zeros((1, MAX_SEQUENCE_LENGTH_INPUT))
    # With zero tokens, Q[0, -0:] would address the whole row and the
    # assignment would fail to broadcast, so an empty input is skipped.
    if X.size:
        tail = X[-MAX_SEQUENCE_LENGTH_INPUT:]
        Q[0, -tail.size:] = tail
    return Q
# Open files to save the conversation for further training:
# Both are opened in write mode, so the logs of a previous run are
# overwritten. They are closed after the chat loop at the end of the script.
qf = open(file_saved_context, 'w')
af = open(file_saved_answer, 'w')
print('Loading Chatterbot...')
# ChatterBot instance; in the chat loop its responses take the place of the
# text a human user would type.
chatbot = ChatBot(
    name_of_computer,
    trainer='chatterbot.trainers.ChatterBotCorpusTrainer'
)
# Train based on the english corpus
chatbot.train("chatterbot.corpus.english")
print('Starting the model...')
# *******************************************************************
# Keras model of the chatbot:
# *******************************************************************
# Optimizer handed to compile(); this script itself only calls
# load_weights() and predict() on the model.
ad = Adam(lr=0.00005)
# Two integer inputs of MAX_SEQUENCE_LENGTH_INPUT tokens each: the context
# and the answer generated so far.
input_context = Input(shape=(MAX_SEQUENCE_LENGTH_INPUT,), dtype='int32', name='thecontexttext')
input_answer = Input(shape=(MAX_SEQUENCE_LENGTH_INPUT,), dtype='int32', name='theanswertextuptothecurrenttoken')
# One LSTM per input; each is applied to the embedded sequence below.
LSTM_encoder = LSTM(SENTENCE_EMBEDDING_SIZE, kernel_initializer= 'lecun_uniform', name='Encodecontext')
LSTM_decoder = LSTM(SENTENCE_EMBEDDING_SIZE, kernel_initializer= 'lecun_uniform', name='Encodeansweruptothecurrenttoken')
# A single Embedding layer is applied to both inputs, so they share weights.
Shared_Embedding = Embedding(output_dim=WORD_EMBED_HIDDEN_SIZE, input_dim=DICTIONARY_SIZE, input_length=MAX_SEQUENCE_LENGTH_INPUT, name='Shared')
word_embedding_context = Shared_Embedding(input_context)
context_embedding = LSTM_encoder(word_embedding_context)
word_embedding_answer = Shared_Embedding(input_answer)
answer_embedding = LSTM_decoder(word_embedding_answer)
# Join the two LSTM outputs, then a ReLU layer of half the dictionary size
# followed by a softmax over the whole dictionary.
merge_layer = concatenate([context_embedding, answer_embedding], axis=1)
out = Dense(int(DICTIONARY_SIZE/2), activation="relu", name='reluactivation')(merge_layer)
out = Dense(DICTIONARY_SIZE, activation="softmax")(out)
# out = (merge_layer)
# out = Dense(DICTIONARY_SIZE, activation="softmax", name='likelihoodofthecurrenttokenusingsoftmaxactivation')(out)
model = Model(inputs=[input_context, input_answer], outputs = [out])
model.compile(loss='categorical_crossentropy', optimizer=ad)
# Loading the data:
# NOTE(review): pickle can run arbitrary code while loading, so
# vocabulary_file must only ever point at a file you created or trust.
# The context manager closes the handle, which the bare open() never did.
with open(vocabulary_file, 'rb') as vocabulary_handle:
    vocabulary = pickle.load(vocabulary_handle)
print("\n \n \n \n CHAT: \n \n")
# Processing the user query:
# Initial state for the chat loop: `prob` is the confidence of the previous
# answer and `text` is the previous answer itself.
prob = 0
que = ''
last_query = ' '
last_last_query = ''
text = ' '
last_text = ''
print('computer: hi ! please type your name.\n')
name = input("user: ")
print('computer: hi , ' + name +' ! My name is ' + name_of_computer + '.\n')
# Seed the conversation with ChatterBot's reply to the greeting.
last_query = chatbot.get_response('hi , ' + name +' ! My name is ' + name_of_computer + '.\n').text # hi computer my name is
# Chat loop: ChatterBot plays the user and the Keras model answers it.
# The loop only ends when ChatterBot happens to say 'exit .', so in practice
# it is left through Ctrl-C or an error. The log files are therefore closed
# in `finally`, which guarantees the collected conversation reaches disk.
try:
    while que != 'exit .':
        # que = input("user: ")
        # Get a response to an input statement
        que = chatbot.get_response(last_query).text
        print("chatbot.get_response:",que)
        que = preprocess(que, name_of_computer)
        # Collecting data for training:
        q = last_query + ' ' + text
        a = que
        qf.write(q + '\n')
        af.write(a + '\n')
        # Composing the context:
        # The previous answer is prepended only when it was decoded with
        # enough confidence.
        if prob > 0.2:
            query = text + ' ' + que
        else:
            query = que
        last_text = text
        Q = tokenize(query)
        # Using the trained model to predict the answer:
        # NOTE(review): the result of this first pass is overwritten by the
        # second pass below and is only shown by the commented-out print.
        model.load_weights(weights_file)
        predout, prob = greedy_decoder(Q[0:1])
        start_index = predout.find('EOS')
        text = preprocess(predout[0:start_index], name)
        # print(('computer_BP: ' + text + ' (with probability of %f)'%prob))
        model.load_weights(weights_file_GAN)
        predout, prob = greedy_decoder(Q[0:1])
        start_index = predout.find('EOS')
        text = preprocess(predout[0:start_index], name)
        print(('computer_GAN: ' + text + ' (with probability of %f)'%prob))
        # a = get_activations(model, "I like to play cards", print_shape_only = True, layer_name = 'reluactivation')
        # model.summary()
        last_last_query = last_query
        last_query = que
finally:
    qf.close()
    af.close()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment