Created
November 20, 2020 18:37
-
-
Save sachinsdate/3b551b7e8e68fc1b52fc72d182e947d3 to your computer and use it in GitHub Desktop.
Survival analysis using Python and Lifelines using the Stanford heart transplant dataset
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import numpy as np | |
import pandas as pd | |
from lifelines import KaplanMeierFitter | |
from lifelines.statistics import survival_difference_at_fixed_point_in_time_test | |
from lifelines import CoxPHFitter | |
from matplotlib import pyplot as plt | |
#dataset link | |
#https://statistics.stanford.edu/research/covariance-analysis-heart-transplant-survival-data | |
#http://www.stat.rice.edu/~sneeley/STAT553/Datasets/survivaldata.txt | |
#Load the data set into memory | |
df = pd.read_csv('stanford_heart_transplant_dataset_full.csv') | |
#sort the data by survival time | |
df = df.sort_values(by='SURVIVAL_TIME') | |
#print out the columns | |
df.columns | |
#print the top 15 rows of the two columns we are interested in | |
df[['SURVIVAL_TIME', 'SURVIVAL_STATUS']].head(15) | |
#Construct the Kaplan-Meier Estimator | |
kmf = KaplanMeierFitter(label='Stanford heart transplant dataset') | |
#fit it to the data set | |
kmf = kmf.fit(durations=df['SURVIVAL_TIME'], event_observed=df['SURVIVAL_STATUS']) | |
#plot the Kaplan-Meier plot showing S(T>t) against t | |
kmf.plot() | |
#Mark the point (6, 0.9126) corresponding to S(T > 6) | |
plt.plot(6, 0.9126, color='red', marker='+', markeredgewidth=3, markersize=16) | |
plt.show() | |
#create a differentiated index based on PRIOR_SURGERY=1 | |
idx = df['PRIOR_SURGERY'] == 1 | |
#Create the respective time series | |
survival_time_with_prior_surgery, survival_status_with_prior_surgery = df.loc[idx, 'SURVIVAL_TIME'], df.loc[idx, 'SURVIVAL_STATUS'] | |
survival_time_without_prior_surgery, survival_status_without_prior_surgery = df.loc[~idx, 'SURVIVAL_TIME'], df.loc[~idx, 'SURVIVAL_STATUS'] | |
#Fit a Kaplan-Meier plot to the respective sample, with and without prior surgery | |
kmf_with_prior_surgery = KaplanMeierFitter(label='Group with prior heart surgery').fit(durations=survival_time_with_prior_surgery, event_observed=survival_status_with_prior_surgery) | |
kmf_without_prior_surgery = KaplanMeierFitter(label='Group without prior heart surgery').fit(durations=survival_time_without_prior_surgery, event_observed=survival_status_without_prior_surgery) | |
#plot the two curves | |
kmf_with_prior_surgery.plot() | |
kmf_without_prior_surgery.plot() | |
plt.show() | |
#Perform the point in time at t=365 days | |
results = survival_difference_at_fixed_point_in_time_test(point_in_time=365, fitterA=kmf_with_prior_surgery, fitterB=kmf_without_prior_surgery) | |
#Print the test statistic and it's p-value for 1 degree of freedom on the Chi-square table | |
print('Chi-squared(1) Test statistic='+str(results.test_statistic) + ' p-value='+str(results.p_value)) | |
#Let's create a subset that contains the columns of our interest | |
df2 = df[['AGE', 'PRIOR_SURGERY', 'TRANSPLANT_STATUS', 'SURVIVAL_TIME', 'SURVIVAL_STATUS']] | |
#Carve out the training and test data sets | |
mask = np.random.rand(len(df2)) < 0.8 | |
df2_train = df2[mask] | |
df2_test = df2[~mask] | |
print('Training data set length='+str(len(df2_train))) | |
print('Testing data set length='+str(len(df2_test))) | |
#Create and train the Cox model on the training set | |
cph_model = CoxPHFitter() | |
cph_model.fit(df=df2_train, duration_col='SURVIVAL_TIME', event_col='SURVIVAL_STATUS') | |
#Display the model training summary | |
cph_model.print_summary() | |
#here is the way to get the estimated expectation of the survival times on the test data set | |
expected_survival_times = cph_model.predict_expectation(df2_test) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment