Method 3: rank input sentences by a filter's activation score
# converts an array of embedding integers to a list of the tokens those integers represent
# the disease token is shown in upper case
# x_1 = array of embedding integers
# x_2 = array of disease phrase indicators (1 marks a disease token)
def embeddings_to_tokens(x_1, x_2, retain_null=True):
    tokens = [embedding_to_token_map[embedding_id] for embedding_id in x_1]
    for i in range(x_2.shape[0]):
        if x_2[i] == 1:
            tokens[i] = tokens[i].upper()
    if retain_null:
        return tokens
    else:
        return [token for token in tokens if token != 'NULL']
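
# --- illustrative usage of embeddings_to_tokens (a sketch; embedding_to_token_map
# --- and the toy arrays below are assumptions for illustration, not defined in this gist):
#   import numpy as np
#   embedding_to_token_map = {0: 'NULL', 1: 'the', 2: 'dog', 3: 'has', 4: 'arthritis'}
#   x_1 = np.array([1, 2, 3, 4, 0])  # embedding integers, 0 is the 'NULL' padding id
#   x_2 = np.array([0, 0, 0, 1, 0])  # 1 marks the disease phrase token
#   embeddings_to_tokens(x_1, x_2)                     # ['the', 'dog', 'has', 'ARTHRITIS', 'NULL']
#   embeddings_to_tokens(x_1, x_2, retain_null=False)  # ['the', 'dog', 'has', 'ARTHRITIS']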

# remember, my problem set-up has two input vectors per training example (x_train and x_train_2)
# word2vec is a gensim word2vec model
def find_highest_activation_ngrams(x_train, x_train_2, model, word2vec, top_n,
                                   layer_name='conv1d_1',
                                   number_of_filters=128,
                                   print_ngrams=True  # only possible on the first convolutional layer
                                   ):
    from keras import backend as K
    import numpy as np

    # get the symbolic outputs of each "key" layer (we gave them unique names)
    layer_dict = dict([(layer.name, layer) for layer in model.layers])

    # these are the placeholders for the input training example; my CNN has 2 input vectors per example
    input_sentence, input_disease_phrase_marker = model.inputs

    # interrogate the model for some values used in processing later
    layer_output = layer_dict[layer_name].output
    output_dimension = layer_output.shape[1].value
    max_sequence_length = x_train.shape[1]
    # with 'valid' convolution, output length = input length - filter size + 1
    filter_size = (max_sequence_length - output_dimension) + 1
    for filter_index in range(number_of_filters):
        print('Processing filter %d' % filter_index)
        matched_on_padding = False  # flag set when the filter matches padding, which is a bug

        # manipulate the tensors to perform our task
        features = K.identity(layer_output[:, :, filter_index])
        extract_features = K.function([input_sentence, input_disease_phrase_marker], [features])

        # execute feature extraction
        features_extracted = extract_features([x_train, x_train_2])

        # get the top_n scores; argpartition avoids a full sort
        flattened = np.array(features_extracted).flatten()
        top_n_idx = np.argpartition(flattened, -top_n)[-top_n:]
        top_n_scores = [flattened[i] for i in top_n_idx]
        score_cutoff = min(top_n_scores)

        # now go through the training set and pull out the top_n n-grams which maximise the filter
        count = 0
        output_printing = []
        for i, sentence_features in enumerate(features_extracted[0]):
            # if the highest match is at least as good as the worst top_n score, record it
            max_idx = np.argmax(sentence_features)
            highest_match = sentence_features[max_idx]
            if highest_match == 0.0:
                continue  # sometimes filters learn nothing, just skip
            if highest_match >= score_cutoff:
                # this sentence is a strong match for the filter
                if print_ngrams:
                    # get the tokens from the input data
                    sentence_tokens = embeddings_to_tokens(x_train[i], x_train_2[i], retain_null=True)
                    matched_tokens = sentence_tokens[max_idx:max_idx + filter_size]
                    matched_string = ' '.join(matched_tokens)
                    output_printing.append((matched_string, highest_match))
                    count = count + 1
                    if matched_string == '' or matched_string == ' ':
                        # something's wrong here, debug print
                        padding_count = sum(1 for token in sentence_tokens if token == 'NULL')
                        print('somehow no matched tokens?')
                        print(sentence_tokens)
                        print('length %s' % len(sentence_tokens))
                        print('max_idx %s' % max_idx)
                        print('padding_count %s' % padding_count)
                        print('index %s' % i)
                        print('filter length %s' % filter_size)
                        matched_on_padding = True
                else:
                    # can't focus on a particular token sequence, so print the whole sentence
                    sentence_tokens = embeddings_to_tokens(x_train[i], x_train_2[i], retain_null=False)
                    output_printing.append((' '.join(sentence_tokens), highest_match))

        # print the matches for this filter, strongest first
        output_printing = sorted(output_printing, key=lambda x: x[1], reverse=True)
        for matched in output_printing:
            print(matched)
        if matched_on_padding:
            return features_extracted[0]
        print(count)
        print(score_cutoff)
        print('Finished filter %d' % filter_index)
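
# note on the top_n selection above: np.argpartition(flattened, -top_n) moves the
# indices of the top_n largest activations into the last top_n positions without
# performing a full sort. Toy illustration (values made up for this note):
#   flattened = np.array([0.1, 0.9, 0.3, 0.7])
#   np.argpartition(flattened, -2)[-2:]  # the indices of 0.7 and 0.9, i.e. 3 and 1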
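
# --- illustrative call (a sketch under assumptions: a trained two-input Keras model
# --- with a Conv1D layer named 'conv1d_1', padded integer arrays x_train/x_train_2,
# --- and a gensim word2vec model; none of these objects are built in this gist):
#   find_highest_activation_ngrams(x_train, x_train_2, model, word2vec,
#                                  top_n=10,
#                                  layer_name='conv1d_1',
#                                  number_of_filters=128,
#                                  print_ngrams=True)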