@NoelKennedy · Last active May 18, 2018 15:31
Method 2: Hide each token in the input sentence one by one and see how this changes the likelihood of the sentence's class
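# Overview: for each token position in the sentence, the token id is replaced by the
# padding id 0, the model re-scores the sentence, and the change (in percentage points)
# in the predicted likelihood of the true class is plotted per token.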
import math
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
from matplotlib.ticker import PercentFormatter

# x is a vector representing a sentence: a 1-D array of token integers.
# Returns a matrix z of shape (number_of_tokens_in_sentence, number_of_tokens_in_sentence)
# such that z[i][j] == x[j] except when i == j, where z[i][j] == 0,
# i.e. the token is blanked out with the padding indicator.
def create_occlusion_batch(x):
    number_tokens_in_sentence = x.shape[0]
    z = list()
    for j in range(number_tokens_in_sentence):
        z.append(np.copy(x))
        z[j][j] = 0  # blank out this token
    return np.vstack(z)
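# Quick sanity check with toy token ids: each row of the occlusion batch equals the
# sentence with one position zeroed out. Uncomment to verify:
# print(create_occlusion_batch(np.array([5, 9, 2])))
# -> [[0 9 2]
#     [5 0 2]
#     [5 9 0]]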
# Pick the dev-set sentence to run the analysis on (x_dev, x_dev_2 and y_dev are
# assumed to be defined earlier in the notebook: dev-set token ids, disease-phrase
# ids and labels).
sentence_id = np.random.randint(0, y_dev.shape[0])
#sentence_id = 38439  # uncomment to pin a specific sentence
print('sentence_id %s' % sentence_id)

# Minimum absolute change in the likelihood of the true class that we consider
# worth plotting (e.g. 0.01 = 1%).
significant_difference = 0.000001

original_sentence = x_dev[sentence_id]
#original_sentence = cp
original_sentence_2 = x_dev_2[sentence_id]
original_sentence_label = y_dev[sentence_id]
# Create a batch for the sentence with each token occluded in turn.
z = create_occlusion_batch(original_sentence)

# The second model input (the disease phrase) is the same for every occluded row.
z_disease_phrase = list()
for i in range(z.shape[0]):
    z_disease_phrase.append(original_sentence_2)
z_disease_phrase = np.stack(z_disease_phrase)

# Get the model to make a prediction for each occluded variant of the sentence
# (cpu_model is assumed to be the trained two-input model).
y_pred_occlusion = cpu_model.predict([z, z_disease_phrase])
# Measure how occluding each token changes the predicted likelihood of the true class.
true_class = np.argmax(original_sentence_label)
# base_predictions (computed earlier) holds the model's predictions on the unmodified dev sentences.
base_prediction = base_predictions[sentence_id][true_class]
tokens_no_null = embeddings_to_tokens(original_sentence, original_sentence_2, retain_null=False)
tokens = embeddings_to_tokens(original_sentence, original_sentence_2, retain_null=True)
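# embeddings_to_tokens is defined elsewhere in the notebook. A minimal sketch of what it is
# assumed to do (map token ids back to strings via a reversed vocabulary, optionally
# dropping padding/null positions); 'vocabulary' and '<NULL>' are hypothetical names:
#
# def embeddings_to_tokens(sentence_ids, phrase_ids, retain_null=True):
#     id_to_token = {idx: tok for tok, idx in vocabulary.items()}
#     tokens = [id_to_token.get(int(i), '<NULL>') for i in sentence_ids]
#     return tokens if retain_null else [t for t in tokens if t != '<NULL>']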
# code for anonymising output
#tokens[277]='day'
#tokens[278]='month'
#tokens[279]='year'
#tokens[283]='patient_name'
# Recover the diagnosis phrase: the all-uppercase alphabetic tokens in the sentence.
diagnosis = ' '.join([token.lower() for token in tokens_no_null if token.upper() == token and token.isalpha()])
print('true class: %s %s, model prediction %s' % (class_labels[true_class], diagnosis, base_prediction))
print('** original sentence')
print(' '.join(tokens_no_null))
print('**')

important_tokens = list()
impact = list()
for i in range(z.shape[0]):
    # skip padding positions
    if original_sentence[i] == 0:
        continue
    token_occluded_score = y_pred_occlusion[i][true_class]
    # guard: skip tokens whose occlusion makes no meaningful difference to the prediction
    if math.isclose(token_occluded_score, base_prediction, abs_tol=significant_difference):
        continue
    important_tokens.append(tokens[i])
    impact.append(100 * (token_occluded_score - base_prediction))
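# Interpretation: impact > 0 means the true-class likelihood rises when the token is
# occluded (the token was pulling the prediction away from the true class);
# impact < 0 means the token was supporting the true class.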
#important_tokens = important_tokens[0:20]
#impact = impact[0:20]

# Reverse so the first token of the sentence appears at the top of the horizontal bar chart.
important_tokens.reverse()
impact.reverse()

y_pos = np.arange(len(important_tokens))
matplotlib.rcParams['figure.figsize'] = [7, 10]
plt.barh(y_pos, impact, align='center', alpha=0.5)
plt.yticks(y_pos, important_tokens)
plt.ylabel('Occluded token')
plt.xlabel('Change in % likelihood of true class when token is occluded')
plt.title('True class: %s %s' % (class_labels[true_class], diagnosis))
plt.show()