sachinsdate/schoenfeld_residuals.py

## schoenfeld_residuals.py
import numpy as np
import pandas as pd
from lifelines import CoxPHFitter
from lifelines.statistics import proportional_hazard_test
import statsmodels.stats.diagnostic as diag
from matplotlib import pyplot as plt

#dataset link
#https://statistics.stanford.edu/research/covariance-analysis-heart-transplant-survival-data
#http://www.stat.rice.edu/~sneeley/STAT553/Datasets/survivaldata.txt


#Load the data set into memory
df = pd.read_csv('stanford_heart_transplant_dataset_full.csv')

#sort the data by survival time
df = df.sort_values(by='SURVIVAL_TIME')

#print out the columns
df.columns

#Let's carve out a vertical slice of the data set containing only columns of our interest
df2 = df[['AGE', 'PRIOR_SURGERY', 'TRANSPLANT_STATUS', 'SURVIVAL_TIME', 'SURVIVAL_STATUS']].copy()
df2.head()

#Create and train the Cox model on the training set:
cph_model = CoxPHFitter()
cph_model.fit(df=df2, duration_col='SURVIVAL_TIME', event_col='SURVIVAL_STATUS')

#Display the model training summary:
cph_model.print_summary()

#Let's carve out the X matrix consisting of only the patients in R_30:
X30 = np.array(df2[['AGE',  'PRIOR_SURGERY',  'TRANSPLANT_STATUS']])[23:,]

#Let's calculate the expected age of patients in R30 for our sample data set.

#The regression coefficients vector β of shape (3 x 1)
Beta = np.array(cph_model.params_).reshape(-1,1)
#exp(X30.Beta). A vector of size (80 x 1). Note that X30 has a shape (80 x 1)
exp_X30_Beta = np.exp(np.matmul(X30, Beta))
#The summation in the denominator (a scaler quantity)
sum_exp_X30_Beta = np.sum(exp_X30_Beta)
#The Cox probability of the kth individual in R30 dying0at T=30. A vector of shape (80 x 1)
p_k = exp_X30_Beta/sum_exp_X30_Beta
#Column 0 (Age) in X30, transposed to shape (1 x 80)
X30_0 = X30[:,0].reshape(1,-1)
#Expected Age of volunteers in R30
expected_age_at_R30 = np.matmul(X30_0, p_k)
print('Expected age of volunteers in R30='+str(expected_age_at_R30))

#subtract the observed age from the expected value of age to get the vector of Schoenfeld residuals r_i_0
# corresponding to T=t_i and risk set R_i. r_i_0 is a vector of shape (1 x 80)
r_i_0 = X30_0-expected_age_at_R30

#The value of the Schoenfeld residual for Age at T=30 days is the mean value of r_i_0:
mean_r_i_0 = np.mean(r_i_0)
print(mean_r_i_0)

#Use Lifelines to calculate the variance scaled Schoenfeld residuals for all regression variables in one go:
scaled_schoenfeld_residuals = cph_model.compute_residuals(training_dataframe=df2, kind='scaled_schoenfeld')
print(scaled_schoenfeld_residuals)

#Let's plot the residuals for AGE against time:
plt.plot(scaled_schoenfeld_residuals.index, scaled_schoenfeld_residuals['AGE'])
plt.show()

#Run the Ljung-Box test to test for auto-correlation in residuals up to lag 40
diag.acorr_ljungbox(x=scaled_schoenfeld_residuals['AGE'], lags=[40], boxpierce=True, model_df=0, period=None, return_df=None)

#Let's also run the same two tests on the residuals for PRIOR_SURGERY:
diag.acorr_ljungbox(x=scaled_schoenfeld_residuals['PRIOR_SURGERY'], lags=[40], boxpierce=True, model_df=0, period=None, return_df=None)

#And for TRANSPLANT_STATUS:
diag.acorr_ljungbox(x=scaled_schoenfeld_residuals['TRANSPLANT_STATUS'], lags=[40], boxpierce=True, model_df=0, period=None, return_df=None)

#Run the CPHFitter.proportional_hazards_test on the scaled Schoenfeld residuals
proportional_hazard_test(fitted_cox_model=cph_model, training_df=df2, time_transform='log', precomputed_residuals=scaled_schoenfeld_residuals)
	import numpy as np
	import pandas as pd
	from lifelines import CoxPHFitter
	from lifelines.statistics import proportional_hazard_test
	import statsmodels.stats.diagnostic as diag
	from matplotlib import pyplot as plt

	#dataset link
	#https://statistics.stanford.edu/research/covariance-analysis-heart-transplant-survival-data
	#http://www.stat.rice.edu/~sneeley/STAT553/Datasets/survivaldata.txt


	#Load the data set into memory
	df = pd.read_csv('stanford_heart_transplant_dataset_full.csv')

	#sort the data by survival time
	df = df.sort_values(by='SURVIVAL_TIME')

	#print out the columns
	df.columns

	#Let's carve out a vertical slice of the data set containing only columns of our interest
	df2 = df[['AGE', 'PRIOR_SURGERY', 'TRANSPLANT_STATUS', 'SURVIVAL_TIME', 'SURVIVAL_STATUS']].copy()
	df2.head()

	#Create and train the Cox model on the training set:
	cph_model = CoxPHFitter()
	cph_model.fit(df=df2, duration_col='SURVIVAL_TIME', event_col='SURVIVAL_STATUS')

	#Display the model training summary:
	cph_model.print_summary()

	#Let's carve out the X matrix consisting of only the patients in R_30:
	X30 = np.array(df2[['AGE', 'PRIOR_SURGERY', 'TRANSPLANT_STATUS']])[23:,]

	#Let's calculate the expected age of patients in R30 for our sample data set.

	#The regression coefficients vector β of shape (3 x 1)
	Beta = np.array(cph_model.params_).reshape(-1,1)
	#exp(X30.Beta). A vector of size (80 x 1). Note that X30 has a shape (80 x 1)
	exp_X30_Beta = np.exp(np.matmul(X30, Beta))
	#The summation in the denominator (a scaler quantity)
	sum_exp_X30_Beta = np.sum(exp_X30_Beta)
	#The Cox probability of the kth individual in R30 dying0at T=30. A vector of shape (80 x 1)
	p_k = exp_X30_Beta/sum_exp_X30_Beta
	#Column 0 (Age) in X30, transposed to shape (1 x 80)
	X30_0 = X30[:,0].reshape(1,-1)
	#Expected Age of volunteers in R30
	expected_age_at_R30 = np.matmul(X30_0, p_k)
	print('Expected age of volunteers in R30='+str(expected_age_at_R30))

	#subtract the observed age from the expected value of age to get the vector of Schoenfeld residuals r_i_0
	# corresponding to T=t_i and risk set R_i. r_i_0 is a vector of shape (1 x 80)
	r_i_0 = X30_0-expected_age_at_R30

	#The value of the Schoenfeld residual for Age at T=30 days is the mean value of r_i_0:
	mean_r_i_0 = np.mean(r_i_0)
	print(mean_r_i_0)

	#Use Lifelines to calculate the variance scaled Schoenfeld residuals for all regression variables in one go:
	scaled_schoenfeld_residuals = cph_model.compute_residuals(training_dataframe=df2, kind='scaled_schoenfeld')
	print(scaled_schoenfeld_residuals)

	#Let's plot the residuals for AGE against time:
	plt.plot(scaled_schoenfeld_residuals.index, scaled_schoenfeld_residuals['AGE'])
	plt.show()

	#Run the Ljung-Box test to test for auto-correlation in residuals up to lag 40
	diag.acorr_ljungbox(x=scaled_schoenfeld_residuals['AGE'], lags=[40], boxpierce=True, model_df=0, period=None, return_df=None)

	#Let's also run the same two tests on the residuals for PRIOR_SURGERY:
	diag.acorr_ljungbox(x=scaled_schoenfeld_residuals['PRIOR_SURGERY'], lags=[40], boxpierce=True, model_df=0, period=None, return_df=None)

	#And for TRANSPLANT_STATUS:
	diag.acorr_ljungbox(x=scaled_schoenfeld_residuals['TRANSPLANT_STATUS'], lags=[40], boxpierce=True, model_df=0, period=None, return_df=None)

	#Run the CPHFitter.proportional_hazards_test on the scaled Schoenfeld residuals
	proportional_hazard_test(fitted_cox_model=cph_model, training_df=df2, time_transform='log', precomputed_residuals=scaled_schoenfeld_residuals)