Created
December 17, 2020 15:11
-
-
Save polinabee/eb4a069a34fbe2def1679587974d15a4 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import datetime

import numpy as np
import pandas as pd

import category_encoders as ce
from sklearn import preprocessing
# enable_iterative_imputer must be imported BEFORE IterativeImputer:
# IterativeImputer is an experimental sklearn feature and the plain
# `from sklearn.impute import IterativeImputer` raises ImportError without it.
from sklearn.experimental import enable_iterative_imputer  # noqa: F401
from sklearn.impute import IterativeImputer
from sklearn.linear_model import BayesianRidge

np.random.seed(3123)
def get_year(date_time_str):
    """Return the year component of a 'YYYY-MM-DD HH:MM:SS' timestamp string."""
    parsed = datetime.datetime.strptime(date_time_str, '%Y-%m-%d %H:%M:%S')
    return parsed.year
def clean_data(d):
    """Add derived columns to an admissions dataframe.

    Creates:
      * 'age' -- admission year minus birth year.  Implausible values
        (>= 150; presumably MIMIC's de-identified placeholder for patients
        aged 89+ -- TODO confirm) are replaced with the median of the
        observed 85-149 ages.
      * 'visits_per_subject' -- number of rows sharing each subject_id.

    Returns the dataframe produced by the merge (row order preserved).
    """
    # Timestamps are fixed-format 'YYYY-MM-DD HH:MM:SS' (the format the
    # original row-wise strptime enforced), so the year is the first four
    # characters.  Vectorized slice instead of two .apply() passes; also
    # avoids pd.to_datetime, which raises on DOBs before 1677 (outside the
    # nanosecond Timestamp range).
    age = d.ADMITTIME.str[:4].astype(int) - d.DOB.str[:4].astype(int)
    # Median of the plausible "very old" ages, used to replace placeholders.
    # Single combined mask instead of chained boolean indexing.
    median_old_age = age[(age > 84) & (age < 150)].median()
    d['age'] = np.where(age < 150, age, median_old_age)
    # Admission count per subject, joined back onto every row.
    counts = d.subject_id.value_counts()
    visits_per_subject = pd.DataFrame({'subject_id': counts.index,
                                       'visits_per_subject': counts.values})
    d = d.merge(visits_per_subject, on='subject_id', how='left')
    return d
def x_fields(d):
    """Select the model's feature columns: vital-sign statistics plus age,
    followed by the demographic/diagnosis categoricals."""
    vitals = ['HeartRate', 'SysBP', 'DiasBP', 'MeanBP',
              'RespRate', 'TempC', 'SpO2', 'Glucose']
    # Min/Max/Mean for each vital, in the same order as the original list.
    numericals = ['%s_%s' % (v, stat) for v in vitals
                  for stat in ('Min', 'Max', 'Mean')]
    numericals.append('age')
    categoricals = ['GENDER', 'ADMISSION_TYPE', 'INSURANCE', 'RELIGION',
                    'MARITAL_STATUS', 'ETHNICITY', 'FIRST_CAREUNIT',
                    'ICD9_diagnosis']
    return d[numericals + categoricals]
def dummies(d):
    """One-hot encode the low-cardinality categorical columns via get_dummies."""
    one_hot_cols = ['GENDER', 'ADMISSION_TYPE', 'FIRST_CAREUNIT', 'INSURANCE']
    return pd.get_dummies(d, columns=one_hot_cols)
def scale_numericals(d):
    """Standardize (zero mean, unit variance) the numeric feature columns in place.

    Fits a StandardScaler on d's own numeric columns and overwrites them with
    the scaled values; returns d.
    NOTE(review): fitting on the frame being transformed means train and test
    are scaled with different statistics -- consider fitting once on train
    and reusing the scaler on test.
    """
    numericals = ['HeartRate_Min', 'HeartRate_Max', 'HeartRate_Mean', 'SysBP_Min', 'SysBP_Max', 'SysBP_Mean',
                  'DiasBP_Min', 'DiasBP_Max', 'DiasBP_Mean', 'MeanBP_Min', 'MeanBP_Max', 'MeanBP_Mean',
                  'RespRate_Min', 'RespRate_Max', 'RespRate_Mean', 'TempC_Min', 'TempC_Max', 'TempC_Mean',
                  'SpO2_Min', 'SpO2_Max', 'SpO2_Mean', 'Glucose_Min', 'Glucose_Max', 'Glucose_Mean', 'age']
    scaler = preprocessing.StandardScaler().fit(d[numericals])
    d_scaled = scaler.transform(d[numericals])
    # Pass index=d.index: the original built the DataFrame with a fresh
    # RangeIndex, so the assignment silently misaligns (producing NaNs) on
    # any frame whose index is not already 0..n-1.
    d[numericals] = pd.DataFrame(d_scaled, columns=numericals, index=d.index)
    return d
def imputation(d):
    """Fill missing values via iterative (MICE-style) imputation with BayesianRidge.

    Returns a new dataframe with the same columns and index as d.
    NOTE(review): importing IterativeImputer requires
    `from sklearn.experimental import enable_iterative_imputer` first.
    """
    imputer = IterativeImputer(random_state=0, estimator=BayesianRidge(),
                               max_iter=15, add_indicator=False)
    d_filled = imputer.fit_transform(d)
    # Preserve d's index: the original returned a default RangeIndex, which
    # breaks alignment with any frame not indexed 0..n-1.
    return pd.DataFrame(d_filled, columns=d.columns, index=d.index)
# ---------------------------------------------------------------------------
# Script entry: load the MIMIC data, build features, tune an XGBoost
# regressor for length of stay (LOS), sanity-check it on a held-out split,
# and write test-set predictions to CSV.
# ---------------------------------------------------------------------------
# Imports the original omitted: `xgb` was used but never imported (NameError),
# and train_test_split / mean_squared_error were never imported at all.
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import mean_squared_error
import xgboost as xgb
from xgboost import XGBRegressor

# Training data.
data = pd.read_csv('../../Data/mimic_train.csv')
data_clean = clean_data(data)

# Test dataset (to produce predictions).
data_test = pd.read_csv('../../Data/mimic_test_los.csv')
data_test_clean = clean_data(data_test)

y = pd.DataFrame(data_clean['LOS'])
# Select features from the CLEANED frames: the original passed `data` /
# `data_test`, which only worked because clean_data happens to add 'age'
# to its argument in place.
x = x_fields(data_clean)
x_test_pred = x_fields(data_test_clean)

x_dirty = x.copy()
x_test_pred_dirty = x_test_pred.copy()

# Target-encode the high-cardinality categoricals; fit on train only so the
# test set never leaks into the encoding.
encoder = ce.TargetEncoder(cols=['RELIGION', 'MARITAL_STATUS', 'ETHNICITY', 'ICD9_diagnosis'],
                           handle_missing='return_nan')
encoder.fit(x_dirty, y)
x_clean = encoder.transform(x_dirty)
x_test_pred_clean = encoder.transform(x_test_pred_dirty)

# One-hot encode the low-cardinality categoricals.
x_clean = dummies(x_clean)
x_test_pred_clean = dummies(x_test_pred_clean)

# Standardize numerical features.
x_clean = scale_numericals(x_clean)
x_test_pred_clean = scale_numericals(x_test_pred_clean)

# Impute remaining missing values.
x_clean = imputation(x_clean)
x_test_pred_clean = imputation(x_test_pred_clean)

# Grid search over XGBoost hyperparameters, scored by (negative) MSE.
model = XGBRegressor()
grid_values = {'learning_rate': [0.05, 0.1],
               'min_child_weight': [10, 15],
               'gamma': [1.5, 2],
               'subsample': [0.8],
               'colsample_bytree': [0.3, 0.6],
               'max_depth': [5, 7, 10],
               'objective': ['reg:squarederror']}
grid_xgb_acc = GridSearchCV(model, param_grid=grid_values, cv=5,
                            scoring='neg_mean_squared_error', n_jobs=-1)
grid_xgb_acc.fit(x_clean, y)

# Report the best hyperparameters found.
best = grid_xgb_acc.best_estimator_
print('Best learning rate : ' + str(best.learning_rate))
print('Best minimum child weight : ' + str(best.min_child_weight))
print('Best gamma : ' + str(best.gamma))
print('Best subsample : ' + str(best.subsample))
print('Best colsample by tree : ' + str(best.colsample_bytree))
print('Best max depth : ' + str(best.max_depth))

# #### XG BOOST 1: quick sanity-check model on a held-out split.
# Split BEFORE fitting so RMSE is measured on unseen rows -- the original
# trained on all of x_clean and then "evaluated" on a subset of the very
# same training rows, which understates the true error.
x_train, x_test, y_train, y_test = train_test_split(x_clean, y, random_state=42)
xg_reg = xgb.XGBRegressor(learning_rate=0.05, max_depth=3, alpha=10, n_estimators=200)
xg_reg.fit(x_train, y_train)
preds = xg_reg.predict(x_test)
rmse = np.sqrt(mean_squared_error(y_test, preds))
print("RMSE: %f" % (rmse))

# ##### Output grid-search predictions for the competition test set.
y_pred = grid_xgb_acc.predict(x_test_pred_clean)
data_test['LOS'] = y_pred
output = data_test[['icustay_id', 'LOS']].set_index('icustay_id')
output.to_csv('decision_trees_predictions.csv')
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment