@NoelKennedy · Last active May 18, 2018 15:31
Method 2: Hide each token in the input sentence one by one and see how this changes the likelihood of the sentence's class
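# Overview: for each token position in the sentence, the token id is replaced by the
# padding id 0, the model re-scores the sentence, and the change (in percentage points)
# in the predicted likelihood of the true class is plotted per token.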
import math
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
from matplotlib.ticker import PercentFormatter

# x is a vector representing a sentence: a 1-D array of token integers.
# Returns a matrix z of shape (number_of_tokens_in_sentence, number_of_tokens_in_sentence)
# such that z[i][j] == x[j] except when i == j, where z[i][j] == 0,
# i.e. the token is blanked out with the padding indicator.
def create_occlusion_batch(x):
    number_tokens_in_sentence = x.shape[0]
    z = list()
    for j in range(number_tokens_in_sentence):
        z.append(np.copy(x))
        z[j][j] = 0  # blank out this token
    return np.vstack(z)
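# Quick sanity check with toy token ids: each row of the occlusion batch equals the
# sentence with one position zeroed out. Uncomment to verify:
# print(create_occlusion_batch(np.array([5, 9, 2])))
# -> [[0 9 2]
#     [5 0 2]
#     [5 9 0]]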
# Pick the dev-set sentence to run the analysis on (x_dev, x_dev_2 and y_dev are
# assumed to be defined earlier in the notebook: dev-set token ids, disease-phrase
# ids and labels).
sentence_id = np.random.randint(0, y_dev.shape[0])
#sentence_id = 38439  # uncomment to pin a specific sentence
print('sentence_id %s' % sentence_id)

# Minimum absolute change in the likelihood of the true class that we consider
# worth plotting (e.g. 0.01 = 1%).
significant_difference = 0.000001

original_sentence = x_dev[sentence_id]
#original_sentence = cp
original_sentence_2 = x_dev_2[sentence_id]
original_sentence_label = y_dev[sentence_id]
# Create a batch for the sentence with each token occluded in turn.
z = create_occlusion_batch(original_sentence)

# The second model input (the disease phrase) is the same for every occluded row.
z_disease_phrase = list()
for i in range(z.shape[0]):
    z_disease_phrase.append(original_sentence_2)
z_disease_phrase = np.stack(z_disease_phrase)

# Get the model to make a prediction for each occluded variant of the sentence
# (cpu_model is assumed to be the trained two-input model).
y_pred_occlusion = cpu_model.predict([z, z_disease_phrase])
# Measure how occluding each token changes the predicted likelihood of the true class.
true_class = np.argmax(original_sentence_label)
# base_predictions (computed earlier) holds the model's predictions on the unmodified dev sentences.
base_prediction = base_predictions[sentence_id][true_class]
tokens_no_null = embeddings_to_tokens(original_sentence, original_sentence_2, retain_null=False)
tokens = embeddings_to_tokens(original_sentence, original_sentence_2, retain_null=True)
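# embeddings_to_tokens is defined elsewhere in the notebook. A minimal sketch of what it is
# assumed to do (map token ids back to strings via a reversed vocabulary, optionally
# dropping padding/null positions); 'vocabulary' and '<NULL>' are hypothetical names:
#
# def embeddings_to_tokens(sentence_ids, phrase_ids, retain_null=True):
#     id_to_token = {idx: tok for tok, idx in vocabulary.items()}
#     tokens = [id_to_token.get(int(i), '<NULL>') for i in sentence_ids]
#     return tokens if retain_null else [t for t in tokens if t != '<NULL>']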
# code for anonymising output
#tokens[277]='day'
#tokens[278]='month'
#tokens[279]='year'
#tokens[283]='patient_name'
# Recover the diagnosis phrase: the all-uppercase alphabetic tokens in the sentence.
diagnosis = ' '.join([token.lower() for token in tokens_no_null if token.upper() == token and token.isalpha()])
print('true class: %s %s, model prediction %s' % (class_labels[true_class], diagnosis, base_prediction))
print('** original sentence')
print(' '.join(tokens_no_null))
print('**')

important_tokens = list()
impact = list()
for i in range(z.shape[0]):
    # skip padding positions
    if original_sentence[i] == 0:
        continue
    token_occluded_score = y_pred_occlusion[i][true_class]
    # guard: skip tokens whose occlusion makes no meaningful difference to the prediction
    if math.isclose(token_occluded_score, base_prediction, abs_tol=significant_difference):
        continue
    important_tokens.append(tokens[i])
    impact.append(100 * (token_occluded_score - base_prediction))
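# Interpretation: impact > 0 means the true-class likelihood rises when the token is
# occluded (the token was pulling the prediction away from the true class);
# impact < 0 means the token was supporting the true class.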
#important_tokens = important_tokens[0:20]
#impact = impact[0:20]

# Reverse so the first token of the sentence appears at the top of the horizontal bar chart.
important_tokens.reverse()
impact.reverse()

y_pos = np.arange(len(important_tokens))
matplotlib.rcParams['figure.figsize'] = [7, 10]
plt.barh(y_pos, impact, align='center', alpha=0.5)
plt.yticks(y_pos, important_tokens)
plt.ylabel('Occluded token')
plt.xlabel('Change in % likelihood of true class when token is occluded')
plt.title('True class: %s %s' % (class_labels[true_class], diagnosis))
plt.show()