Skip to content

Instantly share code, notes, and snippets.

@sachinsdate
Last active April 19, 2021 20:15
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 1 You must be signed in to fork a gist
  • Save sachinsdate/a8ef6feb40357c97cab207a2a8f5bfab to your computer and use it in GitHub Desktop.
Save sachinsdate/a8ef6feb40357c97cab207a2a8f5bfab to your computer and use it in GitHub Desktop.
How to compute Schoenfeld Residuals and how to use them to test the assumptions of the Cox Proportional Hazards model
import numpy as np
import pandas as pd
from lifelines import CoxPHFitter
from lifelines.statistics import proportional_hazard_test
import statsmodels.stats.diagnostic as diag
from matplotlib import pyplot as plt
#dataset link
#https://statistics.stanford.edu/research/covariance-analysis-heart-transplant-survival-data
#http://www.stat.rice.edu/~sneeley/STAT553/Datasets/survivaldata.txt
#Load the data set into memory
df = pd.read_csv('stanford_heart_transplant_dataset_full.csv')
#sort the data by survival time
df = df.sort_values(by='SURVIVAL_TIME')
#print out the columns
df.columns
#Let's carve out a vertical slice of the data set containing only columns of our interest
df2 = df[['AGE', 'PRIOR_SURGERY', 'TRANSPLANT_STATUS', 'SURVIVAL_TIME', 'SURVIVAL_STATUS']].copy()
df2.head()
#Create and train the Cox model on the training set:
cph_model = CoxPHFitter()
cph_model.fit(df=df2, duration_col='SURVIVAL_TIME', event_col='SURVIVAL_STATUS')
#Display the model training summary:
cph_model.print_summary()
#Let's carve out the X matrix consisting of only the patients in R_30:
X30 = np.array(df2[['AGE', 'PRIOR_SURGERY', 'TRANSPLANT_STATUS']])[23:,]
#Let's calculate the expected age of patients in R30 for our sample data set.
#The regression coefficients vector β of shape (3 x 1)
Beta = np.array(cph_model.params_).reshape(-1,1)
#exp(X30.Beta). A vector of size (80 x 1). Note that X30 has a shape (80 x 1)
exp_X30_Beta = np.exp(np.matmul(X30, Beta))
#The summation in the denominator (a scaler quantity)
sum_exp_X30_Beta = np.sum(exp_X30_Beta)
#The Cox probability of the kth individual in R30 dying0at T=30. A vector of shape (80 x 1)
p_k = exp_X30_Beta/sum_exp_X30_Beta
#Column 0 (Age) in X30, transposed to shape (1 x 80)
X30_0 = X30[:,0].reshape(1,-1)
#Expected Age of volunteers in R30
expected_age_at_R30 = np.matmul(X30_0, p_k)
print('Expected age of volunteers in R30='+str(expected_age_at_R30))
#subtract the observed age from the expected value of age to get the vector of Schoenfeld residuals r_i_0
# corresponding to T=t_i and risk set R_i. r_i_0 is a vector of shape (1 x 80)
r_i_0 = X30_0-expected_age_at_R30
#The value of the Schoenfeld residual for Age at T=30 days is the mean value of r_i_0:
mean_r_i_0 = np.mean(r_i_0)
print(mean_r_i_0)
#Use Lifelines to calculate the variance scaled Schoenfeld residuals for all regression variables in one go:
scaled_schoenfeld_residuals = cph_model.compute_residuals(training_dataframe=df2, kind='scaled_schoenfeld')
print(scaled_schoenfeld_residuals)
#Let's plot the residuals for AGE against time:
plt.plot(scaled_schoenfeld_residuals.index, scaled_schoenfeld_residuals['AGE'])
plt.show()
#Run the Ljung-Box test to test for auto-correlation in residuals up to lag 40
diag.acorr_ljungbox(x=scaled_schoenfeld_residuals['AGE'], lags=[40], boxpierce=True, model_df=0, period=None, return_df=None)
#Let's also run the same two tests on the residuals for PRIOR_SURGERY:
diag.acorr_ljungbox(x=scaled_schoenfeld_residuals['PRIOR_SURGERY'], lags=[40], boxpierce=True, model_df=0, period=None, return_df=None)
#And for TRANSPLANT_STATUS:
diag.acorr_ljungbox(x=scaled_schoenfeld_residuals['TRANSPLANT_STATUS'], lags=[40], boxpierce=True, model_df=0, period=None, return_df=None)
#Run the CPHFitter.proportional_hazards_test on the scaled Schoenfeld residuals
proportional_hazard_test(fitted_cox_model=cph_model, training_df=df2, time_transform='log', precomputed_residuals=scaled_schoenfeld_residuals)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment