sachinsdate/survival_analysis.py

## survival_analysis.py
import numpy as np
import pandas as pd
from lifelines import KaplanMeierFitter
from lifelines.statistics import survival_difference_at_fixed_point_in_time_test
from lifelines import CoxPHFitter
from matplotlib import pyplot as plt

#dataset link
#https://statistics.stanford.edu/research/covariance-analysis-heart-transplant-survival-data
#http://www.stat.rice.edu/~sneeley/STAT553/Datasets/survivaldata.txt


#Load the data set into memory
df = pd.read_csv('stanford_heart_transplant_dataset_full.csv')

#sort the data by survival time
df = df.sort_values(by='SURVIVAL_TIME')

#print out the columns
df.columns

#print the top 15 rows of the two columns we are interested in
df[['SURVIVAL_TIME', 'SURVIVAL_STATUS']].head(15)

#Construct the Kaplan-Meier Estimator
kmf = KaplanMeierFitter(label='Stanford heart transplant dataset')

#fit it to the data set
kmf = kmf.fit(durations=df['SURVIVAL_TIME'], event_observed=df['SURVIVAL_STATUS'])

#plot the Kaplan-Meier plot showing S(T>t) against t
kmf.plot()

#Mark the point (6, 0.9126) corresponding to S(T > 6)
plt.plot(6, 0.9126, color='red', marker='+', markeredgewidth=3, markersize=16)

plt.show()

#create a differentiated index based on PRIOR_SURGERY=1
idx = df['PRIOR_SURGERY'] == 1

#Create the respective time series
survival_time_with_prior_surgery, survival_status_with_prior_surgery = df.loc[idx, 'SURVIVAL_TIME'], df.loc[idx, 'SURVIVAL_STATUS']

survival_time_without_prior_surgery, survival_status_without_prior_surgery = df.loc[~idx, 'SURVIVAL_TIME'], df.loc[~idx, 'SURVIVAL_STATUS']

#Fit a Kaplan-Meier plot to the respective sample, with and without prior surgery
kmf_with_prior_surgery = KaplanMeierFitter(label='Group with prior heart surgery').fit(durations=survival_time_with_prior_surgery, event_observed=survival_status_with_prior_surgery)

kmf_without_prior_surgery = KaplanMeierFitter(label='Group without prior heart surgery').fit(durations=survival_time_without_prior_surgery, event_observed=survival_status_without_prior_surgery)

#plot the two curves
kmf_with_prior_surgery.plot()
kmf_without_prior_surgery.plot()
plt.show()

#Perform the point in time at t=365 days
results = survival_difference_at_fixed_point_in_time_test(point_in_time=365, fitterA=kmf_with_prior_surgery, fitterB=kmf_without_prior_surgery)

#Print the test statistic and it's p-value for 1 degree of freedom on the Chi-square table
print('Chi-squared(1) Test statistic='+str(results.test_statistic) + ' p-value='+str(results.p_value))

#Let's create a subset that contains the columns of our interest
df2 = df[['AGE', 'PRIOR_SURGERY', 'TRANSPLANT_STATUS', 'SURVIVAL_TIME', 'SURVIVAL_STATUS']]

#Carve out the training and test data sets
mask = np.random.rand(len(df2)) < 0.8
df2_train = df2[mask]
df2_test = df2[~mask]
print('Training data set length='+str(len(df2_train)))
print('Testing data set length='+str(len(df2_test)))

#Create and train the Cox model on the training set
cph_model = CoxPHFitter()
cph_model.fit(df=df2_train, duration_col='SURVIVAL_TIME', event_col='SURVIVAL_STATUS')

#Display the model training summary
cph_model.print_summary()

#here is the way to get the estimated expectation of the survival times on the test data set
expected_survival_times = cph_model.predict_expectation(df2_test)
	import numpy as np
	import pandas as pd
	from lifelines import KaplanMeierFitter
	from lifelines.statistics import survival_difference_at_fixed_point_in_time_test
	from lifelines import CoxPHFitter
	from matplotlib import pyplot as plt

	#dataset link
	#https://statistics.stanford.edu/research/covariance-analysis-heart-transplant-survival-data
	#http://www.stat.rice.edu/~sneeley/STAT553/Datasets/survivaldata.txt


	#Load the data set into memory
	df = pd.read_csv('stanford_heart_transplant_dataset_full.csv')

	#sort the data by survival time
	df = df.sort_values(by='SURVIVAL_TIME')

	#print out the columns
	df.columns

	#print the top 15 rows of the two columns we are interested in
	df[['SURVIVAL_TIME', 'SURVIVAL_STATUS']].head(15)

	#Construct the Kaplan-Meier Estimator
	kmf = KaplanMeierFitter(label='Stanford heart transplant dataset')

	#fit it to the data set
	kmf = kmf.fit(durations=df['SURVIVAL_TIME'], event_observed=df['SURVIVAL_STATUS'])

	#plot the Kaplan-Meier plot showing S(T>t) against t
	kmf.plot()

	#Mark the point (6, 0.9126) corresponding to S(T > 6)
	plt.plot(6, 0.9126, color='red', marker='+', markeredgewidth=3, markersize=16)

	plt.show()

	#create a differentiated index based on PRIOR_SURGERY=1
	idx = df['PRIOR_SURGERY'] == 1

	#Create the respective time series
	survival_time_with_prior_surgery, survival_status_with_prior_surgery = df.loc[idx, 'SURVIVAL_TIME'], df.loc[idx, 'SURVIVAL_STATUS']

	survival_time_without_prior_surgery, survival_status_without_prior_surgery = df.loc[~idx, 'SURVIVAL_TIME'], df.loc[~idx, 'SURVIVAL_STATUS']

	#Fit a Kaplan-Meier plot to the respective sample, with and without prior surgery
	kmf_with_prior_surgery = KaplanMeierFitter(label='Group with prior heart surgery').fit(durations=survival_time_with_prior_surgery, event_observed=survival_status_with_prior_surgery)

	kmf_without_prior_surgery = KaplanMeierFitter(label='Group without prior heart surgery').fit(durations=survival_time_without_prior_surgery, event_observed=survival_status_without_prior_surgery)

	#plot the two curves
	kmf_with_prior_surgery.plot()
	kmf_without_prior_surgery.plot()
	plt.show()

	#Perform the point in time at t=365 days
	results = survival_difference_at_fixed_point_in_time_test(point_in_time=365, fitterA=kmf_with_prior_surgery, fitterB=kmf_without_prior_surgery)

	#Print the test statistic and it's p-value for 1 degree of freedom on the Chi-square table
	print('Chi-squared(1) Test statistic='+str(results.test_statistic) + ' p-value='+str(results.p_value))

	#Let's create a subset that contains the columns of our interest
	df2 = df[['AGE', 'PRIOR_SURGERY', 'TRANSPLANT_STATUS', 'SURVIVAL_TIME', 'SURVIVAL_STATUS']]

	#Carve out the training and test data sets
	mask = np.random.rand(len(df2)) < 0.8
	df2_train = df2[mask]
	df2_test = df2[~mask]
	print('Training data set length='+str(len(df2_train)))
	print('Testing data set length='+str(len(df2_test)))

	#Create and train the Cox model on the training set
	cph_model = CoxPHFitter()
	cph_model.fit(df=df2_train, duration_col='SURVIVAL_TIME', event_col='SURVIVAL_STATUS')

	#Display the model training summary
	cph_model.print_summary()

	#here is the way to get the estimated expectation of the survival times on the test data set
	expected_survival_times = cph_model.predict_expectation(df2_test)