Last active
April 19, 2021 20:15
-
-
Save sachinsdate/a8ef6feb40357c97cab207a2a8f5bfab to your computer and use it in GitHub Desktop.
How to compute Schoenfeld Residuals and how to use them to test the assumptions of the Cox Proportional Hazards model
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import numpy as np | |
import pandas as pd | |
from lifelines import CoxPHFitter | |
from lifelines.statistics import proportional_hazard_test | |
import statsmodels.stats.diagnostic as diag | |
from matplotlib import pyplot as plt | |
#dataset link | |
#https://statistics.stanford.edu/research/covariance-analysis-heart-transplant-survival-data | |
#http://www.stat.rice.edu/~sneeley/STAT553/Datasets/survivaldata.txt | |
#Load the data set into memory | |
df = pd.read_csv('stanford_heart_transplant_dataset_full.csv') | |
#sort the data by survival time | |
df = df.sort_values(by='SURVIVAL_TIME') | |
#print out the columns | |
df.columns | |
#Let's carve out a vertical slice of the data set containing only columns of our interest | |
df2 = df[['AGE', 'PRIOR_SURGERY', 'TRANSPLANT_STATUS', 'SURVIVAL_TIME', 'SURVIVAL_STATUS']].copy() | |
df2.head() | |
#Create and train the Cox model on the training set: | |
cph_model = CoxPHFitter() | |
cph_model.fit(df=df2, duration_col='SURVIVAL_TIME', event_col='SURVIVAL_STATUS') | |
#Display the model training summary: | |
cph_model.print_summary() | |
#Let's carve out the X matrix consisting of only the patients in R_30: | |
X30 = np.array(df2[['AGE', 'PRIOR_SURGERY', 'TRANSPLANT_STATUS']])[23:,] | |
#Let's calculate the expected age of patients in R30 for our sample data set. | |
#The regression coefficients vector β of shape (3 x 1) | |
Beta = np.array(cph_model.params_).reshape(-1,1) | |
#exp(X30.Beta). A vector of size (80 x 1). Note that X30 has a shape (80 x 1) | |
exp_X30_Beta = np.exp(np.matmul(X30, Beta)) | |
#The summation in the denominator (a scaler quantity) | |
sum_exp_X30_Beta = np.sum(exp_X30_Beta) | |
#The Cox probability of the kth individual in R30 dying0at T=30. A vector of shape (80 x 1) | |
p_k = exp_X30_Beta/sum_exp_X30_Beta | |
#Column 0 (Age) in X30, transposed to shape (1 x 80) | |
X30_0 = X30[:,0].reshape(1,-1) | |
#Expected Age of volunteers in R30 | |
expected_age_at_R30 = np.matmul(X30_0, p_k) | |
print('Expected age of volunteers in R30='+str(expected_age_at_R30)) | |
#subtract the observed age from the expected value of age to get the vector of Schoenfeld residuals r_i_0 | |
# corresponding to T=t_i and risk set R_i. r_i_0 is a vector of shape (1 x 80) | |
r_i_0 = X30_0-expected_age_at_R30 | |
#The value of the Schoenfeld residual for Age at T=30 days is the mean value of r_i_0: | |
mean_r_i_0 = np.mean(r_i_0) | |
print(mean_r_i_0) | |
#Use Lifelines to calculate the variance scaled Schoenfeld residuals for all regression variables in one go: | |
scaled_schoenfeld_residuals = cph_model.compute_residuals(training_dataframe=df2, kind='scaled_schoenfeld') | |
print(scaled_schoenfeld_residuals) | |
#Let's plot the residuals for AGE against time: | |
plt.plot(scaled_schoenfeld_residuals.index, scaled_schoenfeld_residuals['AGE']) | |
plt.show() | |
#Run the Ljung-Box test to test for auto-correlation in residuals up to lag 40 | |
diag.acorr_ljungbox(x=scaled_schoenfeld_residuals['AGE'], lags=[40], boxpierce=True, model_df=0, period=None, return_df=None) | |
#Let's also run the same two tests on the residuals for PRIOR_SURGERY: | |
diag.acorr_ljungbox(x=scaled_schoenfeld_residuals['PRIOR_SURGERY'], lags=[40], boxpierce=True, model_df=0, period=None, return_df=None) | |
#And for TRANSPLANT_STATUS: | |
diag.acorr_ljungbox(x=scaled_schoenfeld_residuals['TRANSPLANT_STATUS'], lags=[40], boxpierce=True, model_df=0, period=None, return_df=None) | |
#Run the CPHFitter.proportional_hazards_test on the scaled Schoenfeld residuals | |
proportional_hazard_test(fitted_cox_model=cph_model, training_df=df2, time_transform='log', precomputed_residuals=scaled_schoenfeld_residuals) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment