import pandas as pd
import numpy as np
import datetime

from sklearn import preprocessing
import category_encoders as ce
from sklearn.experimental import enable_iterative_imputer  # noqa: F401 -- must precede the IterativeImputer import
from sklearn.impute import IterativeImputer
from sklearn.linear_model import BayesianRidge
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV, train_test_split
import xgboost as xgb
from xgboost import XGBRegressor

np.random.seed(3123)
def get_year(date_time_str):
    return datetime.datetime.strptime(date_time_str, '%Y-%m-%d %H:%M:%S').year
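# A vectorized alternative (a sketch, not how this script does it): pandas can
# parse a whole datetime column at once, which avoids per-row strptime calls:
#   pd.to_datetime(d['ADMITTIME']).dt.year - pd.to_datetime(d['DOB']).dt.year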
def clean_data(d):
    # Derive age at admission; clamp implausible ages (>= 150, a
    # de-identification artifact) to the median of the 85-149 group.
    age = d.ADMITTIME.apply(get_year) - d.DOB.apply(get_year)
    median_old_age = age[(age < 150) & (age > 84)].median()
    d['age'] = np.where(age < 150, age, median_old_age)
    # Attach the number of visits per subject.
    visits_per_subject = pd.DataFrame(d.subject_id.value_counts().reset_index())
    visits_per_subject.columns = ['subject_id', 'visits_per_subject']
    d = d.merge(visits_per_subject, on='subject_id', how='left')
    return d
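# A merge-free way to get the same per-subject visit count inside clean_data
# (a sketch; groupby/transform keeps the original row order):
#   d['visits_per_subject'] = d.groupby('subject_id')['subject_id'].transform('count')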
# Numeric feature columns (vital-sign summaries plus age), shared by
# x_fields() and scale_numericals().
NUMERICALS = ['HeartRate_Min', 'HeartRate_Max', 'HeartRate_Mean', 'SysBP_Min', 'SysBP_Max', 'SysBP_Mean',
              'DiasBP_Min', 'DiasBP_Max', 'DiasBP_Mean', 'MeanBP_Min', 'MeanBP_Max', 'MeanBP_Mean',
              'RespRate_Min', 'RespRate_Max', 'RespRate_Mean', 'TempC_Min', 'TempC_Max', 'TempC_Mean',
              'SpO2_Min', 'SpO2_Max', 'SpO2_Mean', 'Glucose_Min', 'Glucose_Max', 'Glucose_Mean', 'age']

def x_fields(d):
    # Select the model's features: the numeric vitals plus demographic categoricals.
    categoricals = ['GENDER', 'ADMISSION_TYPE', 'INSURANCE', 'RELIGION', 'MARITAL_STATUS', 'ETHNICITY',
                    'FIRST_CAREUNIT', 'ICD9_diagnosis']
    return d[NUMERICALS + categoricals]
def dummies(d):
    # One-hot encode the low-cardinality categoricals.
    dummy = ['GENDER', 'ADMISSION_TYPE', 'FIRST_CAREUNIT', 'INSURANCE']
    return pd.get_dummies(d, columns=dummy)
def scale_numericals(d):
    # Standardize the numeric columns to zero mean and unit variance.
    scaler = preprocessing.StandardScaler().fit(d[NUMERICALS])
    # Assign the transformed array directly so row alignment is preserved
    # even when the frame's index is not the default 0..n-1.
    d[NUMERICALS] = scaler.transform(d[NUMERICALS])
    return d
def imputation(d):
    # Fill missing values with model-based iterative imputation.
    imputer = IterativeImputer(random_state=0, estimator=BayesianRidge(), max_iter=15, add_indicator=False)
    d_filled = imputer.fit_transform(d)
    return pd.DataFrame(d_filled, columns=d.columns)
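# These helpers could equally be composed with sklearn's Pipeline utilities
# (a sketch under the same column lists, not how this script wires things up):
#   from sklearn.pipeline import make_pipeline
#   numeric_prep = make_pipeline(preprocessing.StandardScaler(),
#                                IterativeImputer(random_state=0, estimator=BayesianRidge()))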
# Training data
data = pd.read_csv('../../Data/mimic_train.csv')
data_clean = clean_data(data)

# Test dataset (to produce predictions)
data_test = pd.read_csv('../../Data/mimic_test_los.csv')
data_test_clean = clean_data(data_test)

# Target (length of stay) and feature frames, taken from the cleaned data
y = pd.DataFrame(data_clean['LOS'])
x = x_fields(data_clean)
x_test_pred = x_fields(data_test_clean)

x_dirty = x.copy()
x_test_pred_dirty = x_test_pred.copy()
# Target-encode the high-cardinality categoricals against LOS.
encoder = ce.TargetEncoder(cols=['RELIGION', 'MARITAL_STATUS', 'ETHNICITY', 'ICD9_diagnosis'],
                           handle_missing='return_nan')
# Fit the encoder on the training features only, then transform both sets.
encoder.fit(x_dirty, y)
x_clean = encoder.transform(x_dirty)
x_test_pred_clean = encoder.transform(x_test_pred_dirty)
# One-hot encode the remaining categoricals.
x_clean = dummies(x_clean)
x_test_pred_clean = dummies(x_test_pred_clean)
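# Caveat (not handled here): get_dummies applied separately to train and test
# can yield different column sets if a category is absent from one frame.
# Reindexing the test frame against the training columns guards against that:
#   x_test_pred_clean = x_test_pred_clean.reindex(columns=x_clean.columns, fill_value=0)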
# Scale the numeric features, then impute remaining missing values.
# Note: each call refits on the frame it is given, so train and test are
# scaled and imputed with their own statistics.
x_clean = scale_numericals(x_clean)
x_test_pred_clean = scale_numericals(x_test_pred_clean)

x_clean = imputation(x_clean)
x_test_pred_clean = imputation(x_test_pred_clean)
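# A leakage-safe variant (a sketch with a hypothetical helper, not this
# script's original flow): fit the scaler and imputer once on the training
# frame and reuse the fitted objects on the test frame, so both are
# transformed with the same statistics.
def fit_preprocess_pair(train_df, test_df):
    train_df, test_df = train_df.copy(), test_df.copy()
    # Fit the scaler on the training numerics only; apply it to both frames.
    scaler = preprocessing.StandardScaler().fit(train_df[NUMERICALS])
    train_df[NUMERICALS] = scaler.transform(train_df[NUMERICALS])
    test_df[NUMERICALS] = scaler.transform(test_df[NUMERICALS])
    # Fit the imputer on the training frame only; apply it to both.
    imputer = IterativeImputer(random_state=0, estimator=BayesianRidge(), max_iter=15)
    train_out = pd.DataFrame(imputer.fit_transform(train_df), columns=train_df.columns)
    test_out = pd.DataFrame(imputer.transform(test_df), columns=test_df.columns)
    return train_out, test_out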
# Grid search over XGBoost hyperparameters
# (2 * 2 * 2 * 1 * 2 * 3 = 48 combinations, each cross-validated over 5 folds).
model = XGBRegressor()
grid_values = {'learning_rate': [0.05, 0.1],
               'min_child_weight': [10, 15],
               'gamma': [1.5, 2],
               'subsample': [0.8],
               'colsample_bytree': [0.3, 0.6],
               'max_depth': [5, 7, 10],
               'objective': ['reg:squarederror']}
grid_xgb_acc = GridSearchCV(model, param_grid=grid_values, cv=5,
                            scoring='neg_mean_squared_error', n_jobs=-1)
# Fit the grid search on the full training set.
grid_xgb_acc.fit(x_clean, y)

# Print the best parameters.
print('Best learning rate: ' + str(grid_xgb_acc.best_estimator_.learning_rate))
print('Best minimum child weight: ' + str(grid_xgb_acc.best_estimator_.min_child_weight))
print('Best gamma: ' + str(grid_xgb_acc.best_estimator_.gamma))
print('Best subsample: ' + str(grid_xgb_acc.best_estimator_.subsample))
print('Best colsample by tree: ' + str(grid_xgb_acc.best_estimator_.colsample_bytree))
print('Best max depth: ' + str(grid_xgb_acc.best_estimator_.max_depth))
# #### XG BOOST 1: fixed-parameter baseline, evaluated on a held-out split.
# Split first so the evaluation rows are unseen during training.
x_train, x_test, y_train, y_test = train_test_split(x_clean, y, random_state=42)
xg_reg = xgb.XGBRegressor(learning_rate=0.05, max_depth=3, alpha=10, n_estimators=200)
xg_reg.fit(x_train, y_train)
preds = xg_reg.predict(x_test)
rmse = np.sqrt(mean_squared_error(y_test, preds))
print("RMSE: %f" % rmse)
# #### Output results to CSV, using the grid-search model's predictions.
y_pred = grid_xgb_acc.predict(x_test_pred_clean)
# y_pred = xg_reg.predict(x_test_pred_clean)  # alternative: the fixed-parameter baseline
data_test['LOS'] = y_pred
output = data_test[['icustay_id', 'LOS']].set_index('icustay_id')
output.to_csv('decision_trees_predictions.csv')