Method 3: rank input sentences by a filter's activation score
# converts an array of embedding integers to a list of the tokens those integers represent
# the disease token is shown in upper case
# x_1 = array of embedding integers
# x_2 = array of disease phrase indicators (1 marks a disease token)
def embeddings_to_tokens(x_1, x_2, retain_null=True):
    tokens = [embedding_to_token_map[embedding_id] for embedding_id in x_1]
    for i in range(x_2.shape[0]):
        if x_2[i] == 1:
            tokens[i] = tokens[i].upper()
    if retain_null:
        return tokens
    else:
        return [token for token in tokens if token != 'NULL']
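
# --- illustrative usage of embeddings_to_tokens (a sketch; embedding_to_token_map
# --- and the toy arrays below are assumptions for illustration, not defined in this gist):
#   import numpy as np
#   embedding_to_token_map = {0: 'NULL', 1: 'the', 2: 'dog', 3: 'has', 4: 'arthritis'}
#   x_1 = np.array([1, 2, 3, 4, 0])  # embedding integers, 0 is the 'NULL' padding id
#   x_2 = np.array([0, 0, 0, 1, 0])  # 1 marks the disease phrase token
#   embeddings_to_tokens(x_1, x_2)                     # ['the', 'dog', 'has', 'ARTHRITIS', 'NULL']
#   embeddings_to_tokens(x_1, x_2, retain_null=False)  # ['the', 'dog', 'has', 'ARTHRITIS']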

# remember, my problem set-up has two input vectors per training example (x_train and x_train_2)
# word2vec is a gensim word2vec model
def find_highest_activation_ngrams(x_train, x_train_2, model, word2vec, top_n,
                                   layer_name='conv1d_1',
                                   number_of_filters=128,
                                   print_ngrams=True  # only possible on the first convolutional layer
                                   ):
    from keras import backend as K
    import numpy as np

    # get the symbolic outputs of each "key" layer (we gave them unique names)
    layer_dict = dict([(layer.name, layer) for layer in model.layers])

    # these are the placeholders for the input training example; my CNN has 2 input vectors per example
    input_sentence, input_disease_phrase_marker = model.inputs

    # interrogate the model for some values used in processing later
    layer_output = layer_dict[layer_name].output
    output_dimension = layer_output.shape[1].value
    max_sequence_length = x_train.shape[1]
    # with 'valid' convolution, output length = input length - filter size + 1
    filter_size = (max_sequence_length - output_dimension) + 1
    for filter_index in range(number_of_filters):
        print('Processing filter %d' % filter_index)
        matched_on_padding = False  # flag set when the filter matches padding, which is a bug

        # manipulate the tensors to perform our task
        features = K.identity(layer_output[:, :, filter_index])
        extract_features = K.function([input_sentence, input_disease_phrase_marker], [features])

        # execute feature extraction
        features_extracted = extract_features([x_train, x_train_2])

        # get the top_n scores; argpartition avoids a full sort
        flattened = np.array(features_extracted).flatten()
        top_n_idx = np.argpartition(flattened, -top_n)[-top_n:]
        top_n_scores = [flattened[i] for i in top_n_idx]
        score_cutoff = min(top_n_scores)

        # now go through the training set and pull out the top_n n-grams which maximise the filter
        count = 0
        output_printing = []
        for i, sentence_features in enumerate(features_extracted[0]):
            # if the highest match is at least as good as the worst top_n score, record it
            max_idx = np.argmax(sentence_features)
            highest_match = sentence_features[max_idx]
            if highest_match == 0.0:
                continue  # sometimes filters learn nothing, just skip
            if highest_match >= score_cutoff:
                # this sentence is a strong match for the filter
                if print_ngrams:
                    # get the tokens from the input data
                    sentence_tokens = embeddings_to_tokens(x_train[i], x_train_2[i], retain_null=True)
                    matched_tokens = sentence_tokens[max_idx:max_idx + filter_size]
                    matched_string = ' '.join(matched_tokens)
                    output_printing.append((matched_string, highest_match))
                    count = count + 1
                    if matched_string == '' or matched_string == ' ':
                        # something's wrong here, debug print
                        padding_count = sum(1 for token in sentence_tokens if token == 'NULL')
                        print('somehow no matched tokens?')
                        print(sentence_tokens)
                        print('length %s' % len(sentence_tokens))
                        print('max_idx %s' % max_idx)
                        print('padding_count %s' % padding_count)
                        print('index %s' % i)
                        print('filter length %s' % filter_size)
                        matched_on_padding = True
                else:
                    # can't focus on a particular token sequence, so print the whole sentence
                    sentence_tokens = embeddings_to_tokens(x_train[i], x_train_2[i], retain_null=False)
                    output_printing.append((' '.join(sentence_tokens), highest_match))

        # print the matches for this filter, strongest first
        output_printing = sorted(output_printing, key=lambda x: x[1], reverse=True)
        for matched in output_printing:
            print(matched)
        if matched_on_padding:
            return features_extracted[0]
        print(count)
        print(score_cutoff)
        print('Finished filter %d' % filter_index)
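
# note on the top_n selection above: np.argpartition(flattened, -top_n) moves the
# indices of the top_n largest activations into the last top_n positions without
# performing a full sort. Toy illustration (values made up for this note):
#   flattened = np.array([0.1, 0.9, 0.3, 0.7])
#   np.argpartition(flattened, -2)[-2:]  # the indices of 0.7 and 0.9, i.e. 3 and 1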
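
# --- illustrative call (a sketch under assumptions: a trained two-input Keras model
# --- with a Conv1D layer named 'conv1d_1', padded integer arrays x_train/x_train_2,
# --- and a gensim word2vec model; none of these objects are built in this gist):
#   find_highest_activation_ngrams(x_train, x_train_2, model, word2vec,
#                                  top_n=10,
#                                  layer_name='conv1d_1',
#                                  number_of_filters=128,
#                                  print_ngrams=True)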