Method 1: Text generation by gradient ascent on a filter's activation function
import numpy as np

# assumes a pre-trained gensim word2vec model `sg` and an `embedding_dimension` of 100 are defined globally

def make_random_input_sentence(sequence_length=300):
    import random
    input_words = list()
    input_sentence = list()
    for i in range(0, sequence_length):
        # pick a random token from the word2vec vocabulary and look up its embedding
        word = random.choice(list(sg.wv.vocab.keys()))
        input_words.append(word)
        vector = sg[word]
        input_sentence.append(vector)
    # extra column flags disease tokens; all zeros for a random sentence
    disease_token_bit = np.zeros((sequence_length, 1))
    input_sentence = np.array(input_sentence)
    # there is surely an easier way to turn the (sequence_length, 101) sentence into a (1, sequence_length, 101) batch!
    input_sentence = np.hstack([input_sentence, disease_token_bit])
    input_sentence = np.stack([input_sentence, input_sentence])
    return np.delete(input_sentence, [0], axis=0), input_words
# maps each token vector back to its nearest word2vec token and prints the resulting sentence
def print_most_similar_sentence(word2vec, sentence_embeddings):
    most_similar_words = list()
    for feature_vector in sentence_embeddings[0]:
        token_vector = feature_vector[0:embedding_dimension]
        closest_token = word2vec.similar_by_vector(token_vector, topn=1)
        most_similar_words.append(closest_token[0][0])
    print(' '.join(most_similar_words))
# tries to generate an input sentence which maximises the activation function of each filter in a convolutional layer
def generate_sentences_via_gradient_ascent(model, word2vec,
                                           print_words_at_each_iteration=False,
                                           input_sentence=np.random.normal(loc=0.0, scale=1.0, size=(1, 25, 101)),
                                           number_of_iterations_of_gradient_ascent=2000,
                                           number_of_filters_in_layer=3,
                                           learning_rate=0.001,
                                           layer_name='conv1d_4'):
    import time
    from keras import backend as K

    # get the symbolic outputs of each "key" layer (we gave them unique names)
    layer_dict = dict([(layer.name, layer) for layer in model.layers])

    # this is the placeholder for the input into the layer we are maximising
    input_to_model = layer_dict[layer_name].input  # model.inputs[0]

    def normalize(x):
        # utility function to normalize a tensor by its L2 norm
        return x / (K.sqrt(K.mean(K.square(x))) + 1e-5)

    kept_filters = []
    print('Started generating sentences for each filter...')
    for filter_index in range(0, number_of_filters_in_layer):
        start_time = time.time()

        # we build a loss function that maximises the activation
        # of the nth filter of the layer considered
        layer_output = layer_dict[layer_name].output
        loss = K.mean(layer_output[:, :, filter_index])

        # we compute the gradient of the input sentence wrt this loss
        grads = K.gradients(loss, input_to_model)[0]

        # normalization trick: we normalize the gradient
        grads = normalize(grads)

        # this function returns the loss and grads given the input sentence
        iterate = K.function([input_to_model], [loss, grads])

        generated_sentence = np.copy(input_sentence)

        # we run gradient ascent for this many steps
        for i in range(number_of_iterations_of_gradient_ascent):
            loss_value, grads_value = iterate([generated_sentence])
            generated_sentence += grads_value * learning_rate

            # snapping the sentence to the nearest token in the vector space at each step
            # works better than the unconstrained version, but still doesn't seem to
            # produce reasonable matches to real sentences
            #sentence = generated_sentence
            #generated_sentence_2 = list()
            #for feature_vector in sentence[0]:
            #    token_vector = feature_vector[0:100]
            #    closest_token = sg.similar_by_vector(token_vector, topn=1)[0][0]
            #    closest_vector = sg[closest_token]
            #    generated_sentence_2.append(np.concatenate((closest_vector, [0])))
            #generated_sentence = np.stack(generated_sentence_2)

            if print_words_at_each_iteration:
                sentence = generated_sentence
                for feature_vector in sentence[0]:
                    token_vector = feature_vector[0:100]
                    closest_tokens = sg.similar_by_vector(token_vector, topn=1)
                    for close_token in closest_tokens:
                        print(close_token)

            if loss_value <= 0.:
                # some filters get stuck at 0, we can skip them
                break

        # keep the generated sentence if the filter's activation is positive
        if loss_value > 0:
            kept_filters.append(('xx', loss_value, generated_sentence))
        end_time = time.time()
        print('Filter %d sentence generation completed in %ds' % (filter_index, end_time - start_time))

    print('keeping best generated sentences')
    # the filters that reached the highest loss are assumed to give the best sentences
    kept_filters.sort(key=lambda x: x[1], reverse=True)
    #kept_filters = kept_filters[:64]
    for idx, filt in enumerate(kept_filters):
        print("filter %s" % idx)
        sentence = filt[2]
        print_most_similar_sentence(word2vec, sentence)
    return kept_filters
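
A minimal usage sketch follows (not part of the original gist). It assumes a trained Keras model `model` containing a Conv1D layer named 'conv1d_4' with input shape (25, 101), and the pre-trained gensim word2vec model `sg` used above; the parameter values are illustrative only.

# usage sketch: start from a random bag of real tokens rather than Gaussian noise
random_sentence, random_words = make_random_input_sentence(sequence_length=25)

kept = generate_sentences_via_gradient_ascent(model, sg,
                                              input_sentence=random_sentence,
                                              number_of_iterations_of_gradient_ascent=500,
                                              number_of_filters_in_layer=3,
                                              learning_rate=0.001,
                                              layer_name='conv1d_4')

# each kept entry is (label, final_activation, generated_sentence_matrix)
for label, activation, sentence in kept:
    print('activation %.4f' % activation)
    print_most_similar_sentence(sg, sentence)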