### AGE_PREDICTORS_ONLY EM DATA
import re
import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
from matplotlib import pylab
import seaborn as sns
from ggplot import *  # the ggplot-style scatter plots below assume yhat's ggplot package
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import Ridge, RidgeCV, ElasticNetCV, LassoCV, LassoLarsCV
sns.set(style='ticks')
def rmse_cv(model, X_train, y):
    rmse = np.sqrt(-cross_val_score(model, X_train, y, scoring="neg_mean_squared_error", cv=5))
    return rmse
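## Example use of the helper above (a minimal sketch; LassoCV and the csf_* arrays are built further
## down, so this call only makes sense after those steps have run; .ravel() flattens the (n, 1) age column):
# rmse_cv(LassoCV(cv=5), csf_train, csf_age_train.ravel()).mean()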
### read in the files
em_csf = pd.read_csv('/home/labcomp/Desktop/Age_preds/EM_CSF_JULY18.csv', header=0, sep=";")
em_csf=em_csf.loc[em_csf.Age != 0]
em_csf.reset_index(drop=True, inplace=True)
em_csf.drop(['C34 gp41 HIV Fragment'], inplace=True, axis=1) ## non human protein control
### adjust for races
em_csf.Race.value_counts()
em_csf.loc[em_csf.Race.str.contains(r'(American Indian|Other|Caucasian/Asian|\.|Caucasian/American Indian|Indian)'), 'Race'] = 'other'
csf_race_covs=pd.get_dummies(em_csf.Race)
em_csf_covars=csf_race_covs
em_csf_covars.loc[:, ('sex')] = np.where(em_csf.Gender == 'F', 1, 2)
em_csf_scaled = pd.DataFrame(StandardScaler().fit_transform(em_csf.iloc[:, 11:]), columns=em_csf.columns[11:]) ### scale the values
em_csf_scaled_tes = pd.DataFrame(StandardScaler().fit_transform(2**(em_csf.iloc[:, 11:])), columns=em_csf.columns[11:]) ### scale the values
em_csf_scaled2 = pd.concat([em_csf_covars, em_csf_scaled], axis=1) ## add the race covariates
em_csf_scaled2_tes = pd.concat([em_csf_covars, em_csf_scaled_tes], axis=1) ## add the race covariates
em_serum = pd.read_csv('/home/labcomp/Desktop/Age_preds/SERUM_FINAL_Jan18.csv', header=0)
### change dbid to object
em_serum.DbID=em_serum.DbID.astype('object')
em_serum.loc[em_serum.DbID.isin(em_csf.DbID), ('DbID')]
### get individuals that are only present in both csf and serum
em_serum=em_serum.loc[em_serum.DbID.isin(em_csf.DbID)]
em_serum.reset_index(drop=True, inplace=True)
em_serum.Age=em_serum.Age.astype('float64')
em_serum.loc[:, 'sex'] = np.where(em_serum.Gender == 'F', 1, 2)
## make numpy arrays
csf_vars = np.zeros(em_csf_scaled2.shape)
csf_vars2 = np.zeros(em_csf_scaled2.shape)
serum_vars = np.zeros(em_serum.iloc[:, 10:].shape)
csf_age = np.zeros((em_csf.Age.shape[0], 1))
serum_age = np.zeros((em_serum.Age.shape[0], 1))
for i in range(len(em_csf_scaled2)):
    csf_vars[i] = em_csf_scaled2.iloc[i]
    csf_vars2[i] = em_csf_scaled2_tes.iloc[i]
    csf_age[i] = em_csf.Age[i]
for i in range(len(em_serum)):
    serum_vars[i] = em_serum.iloc[i, 10:]
    serum_age[i] = em_serum.Age[i]
#csf_vars_sc = StandardScaler().fit_transform(csf_vars)
serum_vars_sc = StandardScaler().fit_transform(serum_vars)
## split into test train for csf and serum
csf_train, csf_test, csf_age_train, csf_age_test = train_test_split(csf_vars, csf_age, random_state=42,test_size=0.33)
csf_train2, csf_test2, csf_age_train2, csf_age_test2 = train_test_split(csf_vars2, csf_age, random_state=42,test_size=0.33)
serum_train, serum_test, serum_age_train, serum_age_test = train_test_split(serum_vars_sc, serum_age, random_state=42,test_size=0.33)
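## Note: the StandardScaler above is fit on all samples before the split, so the test fold leaks into the
## scaling statistics. A leakage-free variant (sketch only, race/sex covariates omitted for brevity, same seed):
# raw_csf = em_csf.iloc[:, 11:].values
# raw_train, raw_test, age_train, age_test = train_test_split(raw_csf, csf_age, random_state=42, test_size=0.33)
# scaler = StandardScaler().fit(raw_train)  # fit on the training fold only
# raw_train_sc, raw_test_sc = scaler.transform(raw_train), scaler.transform(raw_test)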
################# CSF from EM cohort
pca = PCA(n_components=2, random_state=42)
pca_pass1=pca.fit_transform(csf_vars)
tsne = TSNE(n_components=2, verbose=1)
tsne_p1 = tsne.fit_transform(pca_pass1)
tsne_p1_df = pd.DataFrame(index=range(0, len(tsne_p1)), columns=['P1', 'P2', 'Age'])
tsne_p1_df.loc[:,('P1')] = tsne_p1[:,0]
tsne_p1_df.loc[:,('P2')] = tsne_p1[:,1]
tsne_p1_df.loc[:,('Age')] = em_csf.Age
tsne_p1_df.loc[:, ('Race')] = em_csf.Race
#tsne_p1_df.loc[:, ('Race')] = comb.Race
fg = sns.FacetGrid(data=tsne_p1_df, hue='Age', palette="Set1")
fg.map(plt.scatter, 'P1', 'P2').add_legend()
ggplot(aes(x='P1', y='P2', color = 'Age', shape='Race'), data=tsne_p1_df)+scale_color_gradient(low='green', high='red')+geom_point(aes(size=100, alpha=0.5))
### build a lasso model
model_lasso = LassoCV(n_jobs=-1, verbose=2, cv=5).fit(csf_train, csf_age_train)
model_lasso2 = LassoCV(n_jobs=-1, verbose=2, cv=5).fit(csf_train2, csf_age_train2)
model_enet = ElasticNetCV(n_jobs=-1, verbose=2).fit(csf_train, csf_age_train)
rmse_cv(model_lasso, csf_train, csf_age_train).mean()
rmse_cv(model_enet, csf_train, csf_age_train).mean()
print("Lasso picked " + str(sum(coef != 0)) + " variables and eliminated the other " + str(sum(coef == 0)) + " variables")
coef = pd.Series(model_lasso.coef_, index = em_csf_scaled2.columns)
coef2 = pd.Series(model_lasso2.coef_, index = em_csf_scaled2.columns)
imp_coef = pd.concat([coef.sort_values().head(12), coef.sort_values().tail(12)])
imp_coef2 = pd.concat([coef2.sort_values().head(12), coef2.sort_values().tail(12)])
### correlation matrix with sns
csf_scaled=pd.DataFrame(csf_vars, columns=em_csf_scaled2.columns)
csf_scaled.loc[:, 'Age'] = em_csf.Age
lasso_csf_sel=csf_scaled.loc[:, np.append(imp_coef.index.values, 'Age')]
corrmat = lasso_csf_sel.corr()
#f, ax = plt.subplots(figsize=(12, 9))
g = sns.heatmap(corrmat, vmax=0.8, square=True)
plt.xticks(rotation=90)
plt.yticks(rotation=0)
plt.title('Lasso model features highly correlated with Age')
pylab.savefig('/home/labcomp/Desktop/csf_all_cor_with_age_heatmap.png', bbox_inches='tight')
plt.show()
### top 10 correlation matrix variables
k = 15 #number of variables for heatmap
cols = corrmat.nlargest(k, 'Age')['Age'].index
cm = np.corrcoef(csf_scaled[cols].values.T)
sns.set(font_scale=1)
hm = sns.heatmap(cm, cbar=True, annot=True, square=True, fmt='.2f', annot_kws={'size': 8}, yticklabels=cols.values, xticklabels=cols.values)
plt.xticks(rotation=90)
plt.yticks(rotation=0)
pylab.savefig('/home/labcomp/Desktop/csf_top_cor_with_age.png', bbox_inches='tight')
plt.show()
#### coef importance ###
matplotlib.rcParams['figure.figsize'] = (8.0, 10.0)
imp_coef.plot(kind = "barh")
plt.title("Coefficients in the Lasso Model -CSF")
pylab.savefig('/home/labcomp/Desktop/csf_imp_coefs.png', bbox_inches='tight')
#ggplot(aes(x='Age', y='JAM-B'), data=em_csf_scaled2)+geom_point(size=100, alpha=0.5)
### how do residuals plot against each other
matplotlib.rcParams['figure.figsize'] = (6.0, 6.0)
preds = pd.DataFrame({"preds":model_lasso.predict(csf_test), "true":[i[0] for i in csf_age_test.tolist()]})
preds["residuals"] = preds["true"] - preds["preds"]
#preds.plot(x = "true", y = "preds",kind = "scatter")
fig, ax = plt.subplots()
ax.scatter(preds.true, preds.preds, edgecolors=(0, 0, 0))
ax.plot([csf_age_test.min(), csf_age_test.max()], [csf_age_test.min(), csf_age_test.max()], 'k--', lw=4)
ax.set_xlabel('Measured')
ax.set_ylabel('Predicted')
plt.title('Predicted Vs True CSF')
pylab.savefig('/home/labcomp/Desktop/csf_Pred_Vs_True.png', bbox_inches='tight')
plt.show()
### RMSE on the test set
from sklearn.metrics import mean_squared_error
mean_squared_error(csf_age_test, model_lasso.predict(csf_test))
from sklearn.metrics import mean_absolute_error
mean_absolute_error(csf_age_test, model_lasso.predict(csf_test))
from sklearn.metrics import r2_score
r2_score(csf_age_test, model_lasso.predict(csf_test))
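## A small helper (not part of the original analysis) that reports the three test-set metrics in one call;
## sklearn metrics expect (y_true, y_pred) order, which only changes the result for r2_score.
def report_test_metrics(model, X_test, y_test):
    y_pred = model.predict(X_test)
    return {'rmse': np.sqrt(mean_squared_error(y_test, y_pred)),
            'mae': mean_absolute_error(y_test, y_pred),
            'r2': r2_score(y_test, y_pred)}
report_test_metrics(model_lasso, csf_test, csf_age_test)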
#################### ensemble of XGBoost and lasso models
import xgboost as xgb
from sklearn.model_selection import KFold, train_test_split, GridSearchCV
xgb_model = xgb.XGBRegressor()
#n_estimators = range(50, 700, 50)
#reg_alpha= np.linspace(0.01, 10, num=25)
clf = GridSearchCV(xgb_model, {'max_depth': [2, 4, 6, 8], 'n_estimators': [50, 100, 200, 400, 600, 800], 'reg_alpha': [1e-5, 1e-2, 0.1, 1, 100]}, verbose=1, scoring='r2')
clf.fit(csf_train,csf_age_train)
print(clf.best_score_)
print(clf.best_params_)
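## Optional: inspect the whole grid rather than just the best point (sketch; cv_results_ is a standard
## GridSearchCV attribute).
pd.DataFrame(clf.cv_results_).sort_values('mean_test_score', ascending=False)[['params', 'mean_test_score', 'std_test_score']].head()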
xgb_preds = clf.predict(csf_test)  ## predictions from the refit best estimator
lasso_preds=model_lasso.predict(csf_test)
lasso_preds
predictions = pd.DataFrame({"xgb":xgb_preds, "lasso":lasso_preds})
predictions
predictions.plot(x = "xgb", y = "lasso", kind = "scatter")
preds = 0.7*lasso_preds + 0.3*xgb_preds
preds
r2_score(csf_age_test, preds)
mean_absolute_error(csf_age_test, preds)
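## The 0.7/0.3 blend above is hand-picked; a quick scan over the weight (sketch, scored on the same test
## fold, so treat it as exploratory rather than a tuned setting):
for w in np.linspace(0, 1, 11):
    blend = w * lasso_preds + (1 - w) * xgb_preds
    print(w, mean_absolute_error(csf_age_test, blend))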
### plot the ensemble model
fig, ax = plt.subplots()
ax.scatter(preds, csf_age_test, edgecolors=(0, 0, 0))
ax.plot([csf_age_test.min(), csf_age_test.max()], [csf_age_test.min(), csf_age_test.max()], 'k--', lw=4)
ax.set_xlabel('Predicted')
ax.set_ylabel('True')
plt.title('Predicted Vs True CSF')
pylab.savefig('/home/labcomp/Desktop/csf_Pred_Vs_True.png', bbox_inches='tight')
plt.show()
############################################ serum from EM
tsne_serum = TSNE(n_components=2, verbose=1)
tsne_serum_p1 = tsne_serum.fit_transform(serum_vars_sc)
tsne_serum_p1_df = pd.DataFrame(index=range(0, len(tsne_serum_p1)), columns=['P1', 'P2', 'Age'])
tsne_serum_p1_df.loc[:,('P1')] = tsne_serum_p1[:,0]
tsne_serum_p1_df.loc[:,('P2')] = tsne_serum_p1[:,1]
tsne_serum_p1_df.loc[:,('Age')] = em_serum.Age
#tsne_p1_df.loc[:, ('Race')] = comb.Race
ggplot(aes(x='P1', y='P2', color='Age'), data=tsne_serum_p1_df)+scale_color_gradient(low='green', high='red')+geom_point(aes(size=100, alpha=0.5))
### build a lasso model
model_lasso_ser = LassoCV(n_jobs=-1, verbose=2).fit(serum_train, serum_age_train)
model_enet_ser = ElasticNetCV(n_jobs=-1, verbose=2).fit(serum_train, serum_age_train)
rmse_cv(model_lasso_ser, serum_train, serum_age_train).mean()
print("Lasso picked " + str(sum(coef_ser != 0)) + " variables and eliminated the other " + str(sum(coef_ser == 0)) + " variables")
coef_ser = pd.Series(model_lasso_ser.coef_, index = em_serum.columns[10:1120])
imp_coef_ser = pd.concat([coef_ser.sort_values().head(15), coef_ser.sort_values().tail(15)])
### correlation matrix with sns
serum_scaled=pd.DataFrame(serum_vars_sc, columns=em_serum.columns[10:])
serum_scaled.loc[:, 'Age'] = em_serum.Age
lasso_serum_sel=serum_scaled.loc[:, np.append(imp_coef_ser.index.values, 'Age')]
corrmat_ser = lasso_serum_sel.corr()
#f, ax = plt.subplots(figsize=(12, 9))
g_ser = sns.heatmap(corrmat_ser, vmax=0.8, square=True)
plt.xticks(rotation=90)
plt.yticks(rotation=0)
pylab.savefig('/home/labcomp/Desktop/serum_correlation_lasso_sel.png', bbox_inches='tight')
plt.show()
### top 10 correlation matrix variables
k = 15 #number of variables for heatmap
cols_ser = corrmat_ser.nlargest(k, 'Age')['Age'].index
cm_ser = np.corrcoef(serum_scaled[cols_ser].values.T)
sns.set(font_scale=1)
hm_ser = sns.heatmap(cm_ser, cbar=True, annot=True, square=True, fmt='.2f', annot_kws={'size': 8}, yticklabels=cols_ser.values, xticklabels=cols_ser.values)
plt.xticks(rotation=90)
plt.yticks(rotation=0)
pylab.savefig('/home/labcomp/Desktop/serum_corr_vals_lasso_sel.png', bbox_inches='tight')
plt.show()
#### coef importance ###
matplotlib.rcParams['figure.figsize'] = (8.0, 10.0)
imp_coef_ser.plot(kind = "barh")
plt.title("Coefficients in the Lasso Model Serum")
pylab.savefig('/home/labcomp/Desktop/serum_ceof_importance.png', bbox_inches='tight')
ggplot(aes(x='Age', y='TIMP-2'), data=em_serum)+geom_point(size=100, alpha=0.5)
### how do residuals plot against each other
matplotlib.rcParams['figure.figsize'] = (6.0, 6.0)
preds_ser = pd.DataFrame({"preds":model_lasso_ser.predict(serum_test), "true":[i[0] for i in serum_age_test.tolist()]})
preds_ser["residuals"] = preds_ser["true"] - preds_ser["preds"]
#preds.plot(x = "true", y = "residuals",kind = "scatter")
fig, ax = plt.subplots()
ax.scatter(model_lasso_ser.predict(serum_test), serum_age_test, edgecolors=(0, 0, 0))
ax.plot([serum_age_test.min(), serum_age_test.max()], [serum_age_test.min(), serum_age_test.max()], 'k--', lw=4)
ax.set_xlabel('Predicted')
ax.set_ylabel('True')
plt.title('Predicted Vs True SERUM')
pylab.savefig('/home/labcomp/Desktop/SERUM_Pred_Vs_True.png', bbox_inches='tight')
plt.show()
### RMSE on the test set
from sklearn.metrics import mean_squared_error
mean_squared_error(serum_age_test, model_lasso_ser.predict(serum_test))
from sklearn.metrics import mean_absolute_error
mean_absolute_error(serum_age_test, model_lasso_ser.predict(serum_test))
from sklearn.metrics import r2_score
r2_score(serum_age_test, model_lasso_ser.predict(serum_test))
clf_ser = GridSearchCV(xgb_model, {'max_depth': [2, 4, 6, 8], 'n_estimators': [50, 100, 200, 400, 600, 800, 900, 1000], 'reg_alpha': [1e-5, 1e-2, 0.1, 1, 100]}, verbose=1, scoring='r2')
clf_ser.fit(serum_train,serum_age_train)
print(clf_ser.best_score_)
print(clf_ser.best_params_)
xgb_preds_ser = clf_ser.predict(serum_test)  ## predictions from the refit best estimator
lasso_preds_ser=model_lasso_ser.predict(serum_test)
#lasso_preds
predictions_ser = pd.DataFrame({"xgb":xgb_preds_ser, "lasso":lasso_preds_ser})
predictions_ser
predictions_ser.plot(x = "xgb", y = "lasso", kind = "scatter")
preds_ser = 0.7*lasso_preds_ser + 0.3*xgb_preds_ser
r2_score(serum_age_test, preds_ser)
mean_absolute_error(serum_age_test, preds_ser)
### plot the ensemble model ############### LASSO performs better than an ensemble model
fig, ax = plt.subplots()
ax.scatter(lasso_preds_ser, serum_age_test, edgecolors=(0, 0, 0))
ax.plot([serum_age_test.min(), serum_age_test.max()], [serum_age_test.min(), serum_age_test.max()], 'k--', lw=4)
ax.set_xlabel('Predicted')
ax.set_ylabel('True')
plt.title('Predicted Vs True SERUM')
pylab.savefig('/home/labcomp/Desktop/SERUM_Pred_Vs_True.png', bbox_inches='tight')
plt.show()
########## keras
from keras.models import Sequential
from keras.layers import Dense, Activation, Dropout
model = Sequential()
model.add(Dense(64, input_dim=serum_train.shape[1]))
model.add(Activation('sigmoid'))
#model.add(Dropout(0.2))
#model.add(Dense(64/2))
#model.add(Activation('sigmoid'))
#model.add(Dropout(0.2))
model.add(Dense(1))
model.add(Activation('linear'))
model.compile(optimizer='rmsprop', loss='mse', metrics=['accuracy'])
his=model.fit(serum_train, serum_age_train, epochs=1000, batch_size=32, validation_data=(serum_test, serum_age_test), shuffle=True)
plt.plot(his.history['acc'])
plt.plot(his.history['val_acc'])
plt.title('model accuracy')
plt.ylabel('accuracy')
plt.xlabel('epoch')
plt.legend(['train', 'test'], loc='upper left')
plt.show()
plt.plot(his.history['loss'])
plt.plot(his.history['val_loss'])
plt.title('model loss')
plt.ylabel('loss')
plt.xlabel('epoch')
plt.legend(['train', 'test'], loc='upper left')
plt.show()
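## For comparison with the lasso numbers above, the network's held-out error in years (sketch; the
## 'accuracy' metric from compile() is not meaningful for regression, so loss/MAE is what matters):
keras_preds = model.predict(serum_test).flatten()
mean_absolute_error(serum_age_test, keras_preds)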
from sklearn.model_selection import cross_val_predict
from sklearn import linear_model
import matplotlib.pyplot as plt
y = serum_age_train  # used below only for the identity-line plot limits
# cross_val_predict returns an array of the same size as `y` where each entry
# is a prediction obtained by cross validation:
predicted = cross_val_predict(LassoCV(), serum_train, serum_age_train, cv=10)
fig, ax = plt.subplots()
ax.scatter(serum_age_train, predicted, edgecolors=(0, 0, 0))
ax.plot([y.min(), y.max()], [y.min(), y.max()], 'k--', lw=4)
ax.set_xlabel('Measured')
ax.set_ylabel('Predicted')
plt.show()
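## Summarise the cross-validated predictions numerically as well (sketch):
r2_score(serum_age_train, predicted)
mean_absolute_error(serum_age_train, predicted)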
## EXTRA SCRIPTS FOR DEBUGGING
########## keras
from keras.models import Sequential
from keras.layers import Dense, Activation, Dropout
model = Sequential()
model.add(Dense(600, input_dim=csf_train.shape[1]))
model.add(Activation('sigmoid'))
model.add(Dropout(0.2))
model.add(Dense(300))
model.add(Activation('sigmoid'))
model.add(Dropout(0.2))
model.add(Dense(1))
model.add(Activation('linear'))
model.compile(optimizer='adam', loss='mse', metrics=['accuracy'])
his=model.fit(csf_train, csf_age_train, epochs=500, batch_size=16, validation_data=(csf_test, csf_age_test), shuffle=True)
plt.plot(his.history['acc'])
plt.plot(his.history['val_acc'])
plt.title('model accuracy')
plt.ylabel('accuracy')
plt.xlabel('epoch')
plt.legend(['train', 'test'], loc='upper left')
plt.show()
plt.plot(his.history['loss'])
plt.plot(his.history['val_loss'])
plt.title('model loss')
plt.ylabel('loss')
plt.xlabel('epoch')
plt.legend(['train', 'test'], loc='upper left')
plt.show()
from sklearn import ensemble
model = ensemble.RandomForestRegressor(n_estimators=200, max_depth=10, min_samples_leaf=4, max_features=0.2, n_jobs=-1, random_state=0)
model.fit(csf_train, csf_age_train)
feat_names = em_csf_scaled2.columns.values
## plot the importances ##
importances = model.feature_importances_
std = np.std([tree.feature_importances_ for tree in model.estimators_], axis=0)
indices = np.argsort(importances)[::-1][:20]
plt.figure(figsize=(12,12))
plt.title("Feature importances")
plt.bar(range(len(indices)), importances[indices], color="r", align="center")
plt.xticks(range(len(indices)), feat_names[indices], rotation='vertical')
plt.xlim([-1, len(indices)])
plt.show()
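## Held-out performance of the random forest, for comparison with the lasso/ensemble results above (sketch):
rf_preds = model.predict(csf_test)
mean_absolute_error(csf_age_test, rf_preds)
r2_score(csf_age_test, rf_preds)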