Skip to content

Instantly share code, notes, and snippets.

@sachinsdate
Created November 20, 2020 18:37
Show Gist options
  • Save sachinsdate/3b551b7e8e68fc1b52fc72d182e947d3 to your computer and use it in GitHub Desktop.
Save sachinsdate/3b551b7e8e68fc1b52fc72d182e947d3 to your computer and use it in GitHub Desktop.
Survival analysis using Python and Lifelines using the Stanford heart transplant dataset
import numpy as np
import pandas as pd
from lifelines import KaplanMeierFitter
from lifelines.statistics import survival_difference_at_fixed_point_in_time_test
from lifelines import CoxPHFitter
from matplotlib import pyplot as plt
#dataset link
#https://statistics.stanford.edu/research/covariance-analysis-heart-transplant-survival-data
#http://www.stat.rice.edu/~sneeley/STAT553/Datasets/survivaldata.txt
#Load the data set into memory
df = pd.read_csv('stanford_heart_transplant_dataset_full.csv')
#sort the data by survival time
df = df.sort_values(by='SURVIVAL_TIME')
#print out the columns
df.columns
#print the top 15 rows of the two columns we are interested in
df[['SURVIVAL_TIME', 'SURVIVAL_STATUS']].head(15)
#Construct the Kaplan-Meier Estimator
kmf = KaplanMeierFitter(label='Stanford heart transplant dataset')
#fit it to the data set
kmf = kmf.fit(durations=df['SURVIVAL_TIME'], event_observed=df['SURVIVAL_STATUS'])
#plot the Kaplan-Meier plot showing S(T>t) against t
kmf.plot()
#Mark the point (6, 0.9126) corresponding to S(T > 6)
plt.plot(6, 0.9126, color='red', marker='+', markeredgewidth=3, markersize=16)
plt.show()
#create a differentiated index based on PRIOR_SURGERY=1
idx = df['PRIOR_SURGERY'] == 1
#Create the respective time series
survival_time_with_prior_surgery, survival_status_with_prior_surgery = df.loc[idx, 'SURVIVAL_TIME'], df.loc[idx, 'SURVIVAL_STATUS']
survival_time_without_prior_surgery, survival_status_without_prior_surgery = df.loc[~idx, 'SURVIVAL_TIME'], df.loc[~idx, 'SURVIVAL_STATUS']
#Fit a Kaplan-Meier plot to the respective sample, with and without prior surgery
kmf_with_prior_surgery = KaplanMeierFitter(label='Group with prior heart surgery').fit(durations=survival_time_with_prior_surgery, event_observed=survival_status_with_prior_surgery)
kmf_without_prior_surgery = KaplanMeierFitter(label='Group without prior heart surgery').fit(durations=survival_time_without_prior_surgery, event_observed=survival_status_without_prior_surgery)
#plot the two curves
kmf_with_prior_surgery.plot()
kmf_without_prior_surgery.plot()
plt.show()
#Perform the point in time at t=365 days
results = survival_difference_at_fixed_point_in_time_test(point_in_time=365, fitterA=kmf_with_prior_surgery, fitterB=kmf_without_prior_surgery)
#Print the test statistic and it's p-value for 1 degree of freedom on the Chi-square table
print('Chi-squared(1) Test statistic='+str(results.test_statistic) + ' p-value='+str(results.p_value))
#Let's create a subset that contains the columns of our interest
df2 = df[['AGE', 'PRIOR_SURGERY', 'TRANSPLANT_STATUS', 'SURVIVAL_TIME', 'SURVIVAL_STATUS']]
#Carve out the training and test data sets
mask = np.random.rand(len(df2)) < 0.8
df2_train = df2[mask]
df2_test = df2[~mask]
print('Training data set length='+str(len(df2_train)))
print('Testing data set length='+str(len(df2_test)))
#Create and train the Cox model on the training set
cph_model = CoxPHFitter()
cph_model.fit(df=df2_train, duration_col='SURVIVAL_TIME', event_col='SURVIVAL_STATUS')
#Display the model training summary
cph_model.print_summary()
#here is the way to get the estimated expectation of the survival times on the test data set
expected_survival_times = cph_model.predict_expectation(df2_test)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment