For a Medium post!
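# Run locally with `streamlit run <this-script>.py` (use whatever filename you saved it under)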
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import streamlit as st
from sklearn import datasets, ensemble
from sklearn.inspection import permutation_importance
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
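# set_page_config must be the first Streamlit call in the script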
st.set_page_config(
    page_title="Working with streamlit and sklearn",
    page_icon="🧊",
    # layout="wide",
    initial_sidebar_state="collapsed",
)
st.header("Working with streamlit and sklearn")
st.markdown("""
With Streamlit, if you don't have any awkward dependency, a single file can be enough!
But if your script depends on other files, such as images or models, you will need some repository.

Here, we are using the sklearn example
[sklearn-GradientBoostingRegressor](https://scikit-learn.org/stable/auto_examples/ensemble/plot_gradient_boosting_regression.html#sphx-glr-auto-examples-ensemble-plot-gradient-boosting-regression-py).

Visualize the data from the well-known diabetes dataset:
""")
diabetes = datasets.load_diabetes(return_X_y=True, as_frame=True)
X, y = diabetes
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.1, random_state=13
)
# st.write renders the DataFrame as an interactive, sortable table
st.write(pd.concat([X, y], axis=1))
st.subheader("Hyperparameters")
st.markdown("""
You can tune whichever hyperparameters you need; here we expose:
- "n_estimators"
- "max_depth"
- "min_samples_split"
- "learning_rate"
- "loss"
""")
col1, col2 = st.beta_columns([3, 3])
params = {
    "n_estimators": col1.slider("n_estimators", 100, 1000, 500, 50),
    "max_depth": col1.slider("max_depth", 1, 25, 8, 3),
    "min_samples_split": col1.slider("min_samples_split", 2, 10, 5, 1),
    "learning_rate": col2.slider("learning_rate", 0.001, 1.000, 0.010, 0.001),
    "loss": col2.select_slider(
        "loss",
        options=["ls", "lad", "huber", "quantile"],
    ),
}
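# Every widget interaction reruns the script top to bottom, so the model is
# retrained with the newly selected hyperparameters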
# Train the model with the chosen hyperparameters
reg = ensemble.GradientBoostingRegressor(**params)
reg.fit(X_train, y_train)

# Compare train vs. test error to get a feel for over- or underfitting
st.write("The mean squared error (MSE) on the train set: {:.4f}".format(mean_squared_error(y_train, reg.predict(X_train))))
st.write("The mean squared error (MSE) on the test set: {:.4f}".format(mean_squared_error(y_test, reg.predict(X_test))))
test_score = np.zeros((params["n_estimators"],), dtype=np.float64)
for i, y_pred in enumerate(reg.staged_predict(X_test)):
    test_score[i] = reg.loss_(y_test, y_pred)
# Plot train and test deviance against the number of boosting iterations
fig = plt.figure(figsize=(6, 6))
plt.subplot(1, 1, 1)
plt.title("Deviance")
plt.plot(
    np.arange(params["n_estimators"]) + 1,
    reg.train_score_,
    "b-",
    label="Training Set Deviance",
)
plt.plot(
    np.arange(params["n_estimators"]) + 1, test_score, "r-", label="Test Set Deviance"
)
plt.legend(loc="upper right")
plt.xlabel("Boosting Iterations")
plt.ylabel("Deviance")

col1, col2 = st.beta_columns([3, 3])
col1.pyplot(fig)
# Compute impurity-based (MDI) feature importance
feature_importance = reg.feature_importances_
sorted_idx = np.argsort(feature_importance)
pos = np.arange(sorted_idx.shape[0]) + 0.5

# Plot feature importance
fig = plt.figure(figsize=(6, 6))
plt.subplot(1, 1, 1)
plt.barh(pos, feature_importance[sorted_idx], align="center")
plt.yticks(pos, np.array(X_train.columns)[sorted_idx])
plt.title("Feature Importance (MDI)")
col2.pyplot(fig)
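
# --- Optional extension (not in the original script): permutation importance ---
# permutation_importance is imported above but never used; this is a minimal
# sketch of how it could be shown below the MDI plot, following the sklearn
# example this script is based on. n_repeats and random_state are assumptions.
result = permutation_importance(
    reg, X_test, y_test, n_repeats=10, random_state=42
)
perm_sorted_idx = result.importances_mean.argsort()
fig = plt.figure(figsize=(6, 6))
plt.boxplot(
    result.importances[perm_sorted_idx].T,
    vert=False,
    labels=np.array(X_test.columns)[perm_sorted_idx],
)
plt.title("Permutation Importance (test set)")
st.pyplot(fig)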