@JonathanLoscalzo
Last active July 21, 2021 14:08
For a Medium post!
import matplotlib.pyplot as plt
import numpy as np
from sklearn import datasets, ensemble
from sklearn.inspection import permutation_importance
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
import pandas as pd
import streamlit as st
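# To run locally (assuming this file is saved as app.py; the name is arbitrary):
#   streamlit run app.py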
st.set_page_config(
    page_title="Working with streamlit and sklearn",
    page_icon="🧊",
    # layout="wide",
    initial_sidebar_state="collapsed",
)
st.header("Working with streamlit and sklearn")
st.markdown("""
With streamlit, if you don't have any akward dependency, you could have just one file!
But if your script depends on ther files, such images or models, you will need some repository.
Here, we are using the sklearn example from here:
[sklearn-GradientBoostingRegressor](https://scikit-learn.org/stable/auto_examples/ensemble/plot_gradient_boosting_regression.html#sphx-glr-auto-examples-ensemble-plot-gradient-boosting-regression-py)
Visualize the data from well-known diabetes dataset:
""")
diabetes = datasets.load_diabetes(return_X_y=True, as_frame=True)
X, y = diabetes
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.1, random_state=13
)
st.write(pd.concat([X, y], axis=1))
st.subheader("Hyperparameters")
st.markdown("""
You could test whatever hyperparameter you need, here we are using
- "n_estimators"
- "max_depth"
- "min_samples_split"
- "learning_rate"
- "loss"
""")
col1, col2 = st.beta_columns([3, 3])
params = {
    "n_estimators": col1.slider("n_estimators", 100, 1000, 500, 50),
    "max_depth": col1.slider("max_depth", 1, 25, 8, 3),
    # min_samples_split must be at least 2, or sklearn raises a ValueError
    "min_samples_split": col1.slider("min_samples_split", 2, 10, 5, 1),
    "learning_rate": col2.slider("learning_rate", 0.001, 1.000, 0.010, 0.001),
    "loss": col2.select_slider(
        "Loss",
        options=["ls", "lad", "huber", "quantile"],
    ),
}
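# Note: the loss names "ls" and "lad" match the sklearn version this gist
# targets; newer sklearn releases rename them to "squared_error" and
# "absolute_error".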
# Train model
reg = ensemble.GradientBoostingRegressor(**params)
reg.fit(X_train, y_train)
# evaluate bias-variance
st.write("The mean squared error (MSE) on train set: {:.4f}".format(mean_squared_error(y_train, reg.predict(X_train))))
st.write("The mean squared error (MSE) on test set: {:.4f}".format(mean_squared_error(y_test, reg.predict(X_test))))
test_score = np.zeros((params["n_estimators"],), dtype=np.float64)
for i, y_pred in enumerate(reg.staged_predict(X_test)):
    test_score[i] = reg.loss_(y_test, y_pred)
# calc and plot Deviance
fig = plt.figure(figsize=(6, 6))
plt.subplot(1, 1, 1)
plt.title("Deviance")
plt.plot(
    np.arange(params["n_estimators"]) + 1,
    reg.train_score_,
    "b-",
    label="Training Set Deviance",
)
plt.plot(
    np.arange(params["n_estimators"]) + 1, test_score, "r-", label="Test Set Deviance"
)
plt.legend(loc="upper right")
plt.xlabel("Boosting Iterations")
plt.ylabel("Deviance")
col1, col2 = st.beta_columns([3, 3])
col1.pyplot(fig)
# calc feature importance
feature_importance = reg.feature_importances_
sorted_idx = np.argsort(feature_importance)
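# positions at 0.5, 1.5, ... center each horizontal bar on its y-tick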
pos = np.arange(sorted_idx.shape[0]) + 0.5
# plot feature importance
fig = plt.figure(figsize=(6, 6))
plt.subplot(1, 1, 1)
plt.barh(pos, feature_importance[sorted_idx], align='center')
plt.yticks(pos, np.array(X_train.columns)[sorted_idx])
plt.title('Feature Importance (MDI)')
col2.pyplot(fig)
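# Optional extension (a sketch, not in the original gist): the linked sklearn
# example also plots permutation importance on the held-out set, which is what
# the permutation_importance import at the top is for. Something along these
# lines would mirror it:
result = permutation_importance(
    reg, X_test, y_test, n_repeats=10, random_state=42, n_jobs=2
)
perm_sorted_idx = result.importances_mean.argsort()
fig = plt.figure(figsize=(6, 6))
plt.boxplot(
    result.importances[perm_sorted_idx].T,
    vert=False,
    labels=np.array(X_test.columns)[perm_sorted_idx],
)
plt.title("Permutation Importance (test set)")
st.pyplot(fig)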