# Gist johndpope/62c3e2117c9de625b7f82f3356a7430d (last active June 28, 2018 18:48).
# NOTE: GitHub flagged this file as containing bidirectional Unicode text that may be
# interpreted or compiled differently than what appears below. To review, open the
# file in an editor that reveals hidden Unicode characters.
# -*- coding: utf-8 -*-
# PYTHON 3 - upgrades
__author__ = 'Oswaldo Ludwig'
__version__ = '1.02'

import os.path
import pickle
import re
import sys
import time

# NOTE: the third-party imports keep their original relative order on purpose:
# the later `keras` imports rebind names (Input, Dense, Dropout, concatenate,
# Activation) that are first imported from `tensorflow.python.keras`.
from tensorflow.python.keras.layers import Dense, Reshape, Flatten, \
    Dropout, Input, concatenate, Conv2D, MaxPooling2D, UpSampling2D, Conv2DTranspose, Activation
from keras.layers import Input, Embedding, LSTM, Dense, RepeatVector, Dropout, merge, concatenate
from keras.optimizers import Adam
from keras.models import Model
from keras.models import Sequential
from keras.layers import Activation, Dense
from keras.preprocessing import sequence
from six.moves import input
import keras.backend as K
import numpy as np
np.random.seed(1234)  # for reproducibility
import theano
import nltk
from keras.utils import plot_model
# Model dimensions.
WORD_EMBED_HIDDEN_SIZE = 100     # output dimension of the shared word embedding
SENTENCE_EMBEDDING_SIZE = 300    # number of units in each of the two LSTM layers
DICTIONARY_SIZE = 7000           # embedding input dimension and size of the softmax output
MAX_SEQUENCE_LENGTH_INPUT = 50   # number of token positions in the context and answer inputs

# Files read by the script.
vocabulary_file = 'vocabulary_movie'       # pickled vocabulary
weights_file = 'my_model_weights20.h5'     # weights used for the "computer_BP" answer
weights_file_GAN = 'my_model_weights.h5'   # weights used for the "computer_GAN" answer

# Word substituted for any token that is not in the vocabulary.
unknown_token = 'something'

# Files the conversation is logged to for further training.
file_saved_context = 'saved_context'
file_saved_answer = 'saved_answer'

# Name the chatbot introduces itself with.
name_of_computer = 'john'
def greedy_decoder(input):
    """Greedily decode an answer for one encoded context.

    Starting from a window that holds only the BOS index, the global ``model``
    is queried ``MAX_SEQUENCE_LENGTH_INPUT - 1`` times; at every step the
    highest-scoring token is appended on the right of the window while the
    window slides left by one position.

    Args:
        input: The context passed to ``model.predict`` as its first input.
            The caller in this file passes the ``(1, MAX_SEQUENCE_LENGTH_INPUT)``
            array produced by ``tokenize``.

    Returns:
        A ``(text, prob)`` tuple. ``text`` is the decoded window rendered as
        vocabulary words, each followed by a single space. ``prob`` is the
        product of the top scores of the steps that precede the first EOS
        token (the EOS step itself is not included).
    """
    bos_index = 2  # the index of the symbol BOS (begin of sentence)
    eos_index = 3  # the index of the symbol EOS (end of sentence)

    ans_partial = np.zeros((1, MAX_SEQUENCE_LENGTH_INPUT))
    ans_partial[0, -1] = bos_index

    prob = 1
    eos_seen = False
    for _ in range(MAX_SEQUENCE_LENGTH_INPUT - 1):
        # Only the first row is decoded; taking max and argmax from the same
        # row keeps the two consistent.
        scores = model.predict([input, ans_partial])[0, :]
        best = np.argmax(scores)

        # Slide the window one position to the left and append the new token.
        ans_partial[0, 0:-1] = ans_partial[0, 1:]
        ans_partial[0, -1] = best

        if best == eos_index:
            eos_seen = True
        if not eos_seen:
            prob = prob * np.max(scores)

    # Indices at or above DICTIONARY_SIZE - 2 are skipped, as in the original.
    words = [
        vocabulary[index][0]
        for index in ans_partial[0].astype(int)
        if index < (DICTIONARY_SIZE - 2)
    ]
    text = ''.join(word + ' ' for word in words)
    return (text, prob)
# (pattern, replacement) pairs applied in this order: contractions are
# expanded, speaker tags and BOS/EOS markers are removed, and punctuation is
# separated from the preceding word by a space.
_SUBSTITUTIONS = (
    ('won’t', 'will not'), ('won\'t', 'will not'),
    ('wouldn’t', 'would not'), ('wouldn\'t', 'would not'),
    ('’m', ' am'), ('’re', ' are'), ('’ve', ' have'), ('’ll', ' will'),
    ('’s', ' is'), ('’d', ' had'), ('n’t', ' not'),
    ('\'m', ' am'), ('\'re', ' are'), ('\'ve', ' have'), ('\'ll', ' will'),
    ('\'s', ' is'), ('\'d', ' had'), ('can\'t', 'can not'), ('n\'t', ' not'),
    ('B: ', ''), ('A: ', ''),
    (',', ' ,'), (';', ' ;'), ('.', ' .'), ('?', ' ?'), ('!', ' !'), (':', ' :'),
    ('. ?', '? '), (', .', '.'), ('. ,', ','),
    ('EOS', ''), ('BOS', ''), ('eos', ''), ('bos', ''),
)

# Fragments that are replaced by a single space.
_NOISE = ('-', '_', ' *', ' /', '* ', '/ ', '\"', ' \\"', '\\ ', '--', '...', '. . .')

# Proper names that are swapped for the name of the person being addressed
# or for the chatbot's own name.
_NAMES = (
    'jeffrey', 'fred', 'benjamin', 'paula', 'walter', 'rachel', 'andy', 'helen',
    'harrington', 'kathy', 'ronnie', 'carl', 'annie', 'cole', 'ike', 'milo',
    'cole', 'rick', 'johnny', 'loretta', 'cornelius', 'claire', 'romeo', 'casey',
    'johnson', 'rudy', 'stanzi', 'cosgrove', 'wolfi', 'kevin', 'paulie', 'cindy',
    'paulie', 'enzo', 'mikey',
    # Same value the original 'i\97' evaluated to (backslash, '9', '7'), but
    # without the invalid-escape warning.
    # NOTE(review): presumably a mangled 'i\x97' - confirm.
    'i\\97',
    'davis', 'jeffrey', 'norman', 'johnson', 'dolores', 'tom', 'brian', 'bruce',
    'john', 'laurie', 'stella', 'dignan', 'elaine', 'jack', 'christ', 'george',
    'frank', 'mary', 'amon', 'david', 'tom', 'joe', 'paul', 'sam', 'charlie',
    'bob', 'marry', 'walter', 'james', 'jimmy', 'michael', 'rose', 'jim', 'peter',
    'nick', 'eddie', 'johnny', 'jake', 'ted', 'mike', 'billy', 'louis', 'ed',
    'jerry', 'alex', 'charles', 'tommy', 'bobby', 'betty', 'sid', 'dave',
    'jeffrey', 'jeff', 'marty', 'richard', 'otis', 'gale', 'fred', 'bill',
    'jones', 'smith', 'mickey',
)


def preprocess(raw_word, name):
    """Normalise a sentence before tokenisation or display.

    The text is lower-cased, direct addresses to the chatbot are removed,
    contractions are expanded, punctuation is spaced out, known proper names
    are replaced, repeated dots and double spaces are collapsed, and a final
    ' .' is appended when the text does not already end with '!', '?' or '.'.

    Args:
        raw_word: The sentence to normalise. An empty string is accepted.
        name: The name substituted for known proper names that appear in a
            direct address (", <name>" or " <name> ,").

    Returns:
        The normalised sentence. Text that reduces to bare punctuation becomes
        'what ?' or 'i do not want to talk about it .'.
    """
    text = raw_word.lower()
    text = text.replace(', ' + name_of_computer, '')
    text = text.replace(name_of_computer + ' ,', '')

    for pattern, replacement in _SUBSTITUTIONS:
        text = text.replace(pattern, replacement)

    for fragment in _NOISE:
        text = text.replace(fragment, ' ')

    for term in _NAMES:
        text = text.replace(', ' + term, ', ' + name)
        text = text.replace(' ' + term + ' ,', ' ' + name + ' ,')
        text = text.replace('i am ' + term, 'i am ' + name_of_computer)
        # The pattern needs the space after "is"; without it the pattern
        # could never match a real sentence.
        text = text.replace('my name is ' + term, 'my name is ' + name_of_computer)

    # NOTE(review): the two-space literals below ('.  .', '  ', '  .', '  . ')
    # are restored on the assumption that whitespace was collapsed when this
    # file was scraped; as single-space literals they were duplicates or
    # no-ops. Confirm against the upstream file.
    for _ in range(30):
        text = text.replace('. .', '')
        text = text.replace('.  .', '')
        text = text.replace('..', '')

    for _ in range(5):
        text = text.replace('  ', ' ')

    # endswith() also covers the empty string, which previously raised
    # IndexError on raw_word[-1].
    if not text.endswith(('!', '?', '.', '! ', '? ', '. ')):
        text = text + ' .'

    if text in (' !', ' ?', ' .', ' ! ', ' ? ', ' . '):
        text = 'what ?'

    if text in ('  .', '  . '):
        text = 'i do not want to talk about it .'

    return text
def tokenize(sentences):
    """Convert a sentence into a right-aligned row of vocabulary indices.

    Args:
        sentences: The text to tokenise with ``nltk.word_tokenize``.

    Returns:
        A ``(1, MAX_SEQUENCE_LENGTH_INPUT)`` array of zeros whose rightmost
        positions hold the indices of the last ``MAX_SEQUENCE_LENGTH_INPUT``
        tokens. Text that yields no tokens gives an all-zero row.

    Raises:
        KeyError: If ``unknown_token`` itself is not in the vocabulary and an
            out-of-vocabulary word is encountered.
    """
    # The first element of every vocabulary entry is the word itself.
    word_to_index = {entry[0]: i for i, entry in enumerate(vocabulary)}

    # Tokenizing the sentences into words; unknown words map to unknown_token.
    tokens = [
        w if w in word_to_index else unknown_token
        for w in nltk.word_tokenize(sentences)
    ]
    X = np.asarray([word_to_index[w] for w in tokens])

    Q = np.zeros((1, MAX_SEQUENCE_LENGTH_INPUT))
    # Keep at most the last MAX_SEQUENCE_LENGTH_INPUT indices. The size guard
    # avoids `Q[0, -0:] = X`, which raised a broadcast error for empty input.
    tail = X[-MAX_SEQUENCE_LENGTH_INPUT:]
    if tail.size:
        Q[0, -tail.size:] = tail
    return Q
# Open files to save the conversation for further training:
qf = open(file_saved_context, 'w')  # one context per line
af = open(file_saved_answer, 'w')   # the user's reply to that context, one per line
print('Starting the model...')

# *******************************************************************
# Keras model of the chatbot:
# *******************************************************************

ad = Adam(lr=0.00005)

# Two integer inputs of the same length: the context and the answer so far.
input_context = Input(shape=(MAX_SEQUENCE_LENGTH_INPUT,), dtype='int32', name='thecontexttext')
input_answer = Input(shape=(MAX_SEQUENCE_LENGTH_INPUT,), dtype='int32', name='theanswertextuptothecurrenttoken')

LSTM_encoder = LSTM(SENTENCE_EMBEDDING_SIZE, kernel_initializer='lecun_uniform', name='Encodecontext')
LSTM_decoder = LSTM(SENTENCE_EMBEDDING_SIZE, kernel_initializer='lecun_uniform', name='Encodeansweruptothecurrenttoken')

# A single embedding layer is applied to both inputs.
Shared_Embedding = Embedding(output_dim=WORD_EMBED_HIDDEN_SIZE, input_dim=DICTIONARY_SIZE, input_length=MAX_SEQUENCE_LENGTH_INPUT, name='Shared')

word_embedding_context = Shared_Embedding(input_context)
context_embedding = LSTM_encoder(word_embedding_context)

word_embedding_answer = Shared_Embedding(input_answer)
answer_embedding = LSTM_decoder(word_embedding_answer)

# Both sentence embeddings are concatenated and mapped to a distribution over
# the dictionary through one hidden ReLU layer.
merge_layer = concatenate([context_embedding, answer_embedding], axis=1)
out = Dense(DICTIONARY_SIZE // 2, activation="relu", name='reluactivation')(merge_layer)
out = Dense(DICTIONARY_SIZE, activation="softmax")(out)
# NOTE: alternative head without the hidden ReLU layer, kept from the original:
# out = (merge_layer)
# out = Dense(DICTIONARY_SIZE, activation="softmax", name='likelihoodofthecurrenttokenusingsoftmaxactivation')(out)

model = Model(inputs=[input_context, input_answer], outputs=[out])

model.compile(loss='categorical_crossentropy', optimizer=ad)
# Loading the data:
# NOTE: pickle can execute arbitrary code while loading; only use a trusted
# vocabulary file.
with open(vocabulary_file, 'rb') as vocabulary_handle:
    vocabulary = pickle.load(vocabulary_handle)

print("\n \n \n \n CHAT: \n \n")

# Processing the user query:
prob = 0
que = ''
last_query = ' '
last_last_query = ''
text = ' '
last_text = ''

print('computer: hi ! please type your name.\n')
name = input("user: ")
print('computer: hi , ' + name + ' ! My name is ' + name_of_computer + '.\n')

# try/finally guarantees that the log files are closed even if the loop is
# interrupted or raises.
try:
    while que != 'exit .':
        que = input("user: ")
        que = preprocess(que, name_of_computer)

        # Collecting data for training:
        q = last_query + ' ' + text
        a = que
        qf.write(q + '\n')
        af.write(a + '\n')

        # Composing the context: the previous answer is prepended only when
        # it was produced with a probability above 0.2.
        if prob > 0.2:
            query = text + ' ' + que
        else:
            query = que

        last_text = text

        Q = tokenize(query)

        # Using the trained model to predict the answer, once per weights
        # file. After the loop, `text` and `prob` hold the values of the last
        # ("computer_GAN") answer, as before.
        for label, weights in (('computer_BP', weights_file),
                               ('computer_GAN', weights_file_GAN)):
            model.load_weights(weights)
            predout, prob = greedy_decoder(Q[0:1])
            # partition() keeps the whole text when no 'EOS' is present;
            # find() returned -1 there and silently dropped the last character.
            text = preprocess(predout.partition('EOS')[0], name)
            print((label + ': ' + text + ' (with probability of %f)' % prob))

        last_last_query = last_query
        last_query = que
finally:
    qf.close()
    af.close()
# (GitHub page footer: "Sign up for free to join this conversation on GitHub.
# Already have an account? Sign in to comment")