For a Medium post!
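# Run locally with `streamlit run <this-script>.py` (use whatever filename you saved it under)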
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import streamlit as st
from sklearn import datasets, ensemble
from sklearn.inspection import permutation_importance
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
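# set_page_config must be the first Streamlit call in the script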
st.set_page_config(
    page_title="Working with streamlit and sklearn",
    page_icon="🧊",
    # layout="wide",
    initial_sidebar_state="collapsed",
)
st.header("Working with streamlit and sklearn")
st.markdown("""
With Streamlit, if you don't have any awkward dependency, a single file can be enough!
But if your script depends on other files, such as images or models, you will need some repository.

Here, we are using the sklearn example
[sklearn-GradientBoostingRegressor](https://scikit-learn.org/stable/auto_examples/ensemble/plot_gradient_boosting_regression.html#sphx-glr-auto-examples-ensemble-plot-gradient-boosting-regression-py).

Visualize the data from the well-known diabetes dataset:
""")
diabetes = datasets.load_diabetes(return_X_y=True, as_frame=True)
X, y = diabetes
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.1, random_state=13
)
# st.write renders the DataFrame as an interactive, sortable table
st.write(pd.concat([X, y], axis=1))
st.subheader("Hyperparameters")
st.markdown("""
You can tune whichever hyperparameters you need; here we expose:
- "n_estimators"
- "max_depth"
- "min_samples_split"
- "learning_rate"
- "loss"
""")
col1, col2 = st.beta_columns([3, 3])
params = {
    "n_estimators": col1.slider("n_estimators", 100, 1000, 500, 50),
    "max_depth": col1.slider("max_depth", 1, 25, 8, 3),
    "min_samples_split": col1.slider("min_samples_split", 2, 10, 5, 1),
    "learning_rate": col2.slider("learning_rate", 0.001, 1.000, 0.010, 0.001),
    "loss": col2.select_slider(
        "loss",
        options=["ls", "lad", "huber", "quantile"],
    ),
}
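# Every widget interaction reruns the script top to bottom, so the model is
# retrained with the newly selected hyperparameters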
# Train the model with the chosen hyperparameters
reg = ensemble.GradientBoostingRegressor(**params)
reg.fit(X_train, y_train)

# Compare train vs. test error to get a feel for over- or underfitting
st.write("The mean squared error (MSE) on the train set: {:.4f}".format(mean_squared_error(y_train, reg.predict(X_train))))
st.write("The mean squared error (MSE) on the test set: {:.4f}".format(mean_squared_error(y_test, reg.predict(X_test))))
test_score = np.zeros((params["n_estimators"],), dtype=np.float64)
for i, y_pred in enumerate(reg.staged_predict(X_test)):
    test_score[i] = reg.loss_(y_test, y_pred)
# Plot train and test deviance against the number of boosting iterations
fig = plt.figure(figsize=(6, 6))
plt.subplot(1, 1, 1)
plt.title("Deviance")
plt.plot(
    np.arange(params["n_estimators"]) + 1,
    reg.train_score_,
    "b-",
    label="Training Set Deviance",
)
plt.plot(
    np.arange(params["n_estimators"]) + 1, test_score, "r-", label="Test Set Deviance"
)
plt.legend(loc="upper right")
plt.xlabel("Boosting Iterations")
plt.ylabel("Deviance")

col1, col2 = st.beta_columns([3, 3])
col1.pyplot(fig)
# Compute impurity-based (MDI) feature importance
feature_importance = reg.feature_importances_
sorted_idx = np.argsort(feature_importance)
pos = np.arange(sorted_idx.shape[0]) + 0.5

# Plot feature importance
fig = plt.figure(figsize=(6, 6))
plt.subplot(1, 1, 1)
plt.barh(pos, feature_importance[sorted_idx], align="center")
plt.yticks(pos, np.array(X_train.columns)[sorted_idx])
plt.title("Feature Importance (MDI)")
col2.pyplot(fig)
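
# --- Optional extension (not in the original script): permutation importance ---
# permutation_importance is imported above but never used; this is a minimal
# sketch of how it could be shown below the MDI plot, following the sklearn
# example this script is based on. n_repeats and random_state are assumptions.
result = permutation_importance(
    reg, X_test, y_test, n_repeats=10, random_state=42
)
perm_sorted_idx = result.importances_mean.argsort()
fig = plt.figure(figsize=(6, 6))
plt.boxplot(
    result.importances[perm_sorted_idx].T,
    vert=False,
    labels=np.array(X_test.columns)[perm_sorted_idx],
)
plt.title("Permutation Importance (test set)")
st.pyplot(fig)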