Yu Chen ychennay

import numpy as np

# GaussianProcess is a custom class, presumably defined elsewhere in the gist
X = np.array([[0, 0]])
y = np.array([[1.2]])
gp = GaussianProcess(X, y)
gp.update([[1.5, -1.5]], [[2.3]])  # second data point
gp.update([[-2, 1.5]], [[-1.0]])  # third data point
gp.update([[2.1, 1.3]], [[-0.6]])  # fourth data point

delta = 0.05  # changes granularity of the contour map
x = np.arange(-3.0, 3.0, delta)  # plotting grid (note: reuses the names x and y)
y = np.arange(-2.0, 2.0, delta)
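The GaussianProcess class used above is not shown in this snippet. Below is a minimal sketch of what such a class could look like, assuming an RBF kernel, a small noise jitter, and a naive update that simply refits on all accumulated points; only the constructor and update signatures mirror the usage above, everything else is an assumption.

import numpy as np

class GaussianProcess:
    # minimal GP regressor with an RBF kernel (an assumed stand-in, not the original class)
    def __init__(self, X, y, length_scale=1.0, noise=1e-6):
        self.X = np.asarray(X, dtype=float)
        self.y = np.asarray(y, dtype=float).reshape(-1, 1)
        self.length_scale = length_scale
        self.noise = noise

    def _kernel(self, A, B):
        # squared-exponential (RBF) kernel matrix between two sets of points
        sq_dists = np.sum(A ** 2, axis=1)[:, None] + np.sum(B ** 2, axis=1)[None, :] - 2 * A @ B.T
        return np.exp(-0.5 * sq_dists / self.length_scale ** 2)

    def update(self, X_new, y_new):
        # naive update: append the new observation(s) and refit on the full data
        self.X = np.vstack([self.X, np.asarray(X_new, dtype=float)])
        self.y = np.vstack([self.y, np.asarray(y_new, dtype=float).reshape(-1, 1)])

    def predict(self, X_star):
        # exact GP posterior mean and covariance at the query points
        X_star = np.asarray(X_star, dtype=float)
        K = self._kernel(self.X, self.X) + self.noise * np.eye(len(self.X))
        K_s = self._kernel(self.X, X_star)
        alpha = np.linalg.solve(K, self.y)
        mean = K_s.T @ alpha
        cov = self._kernel(X_star, X_star) - K_s.T @ np.linalg.solve(K, K_s)
        return mean, cov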
import numpy as np
import time
from sklearn.metrics.pairwise import rbf_kernel

x = np.random.rand(10000)
x = x.reshape(100, 100)  # 100 samples with 100 features each
distances = []
γ = -.5
start = time.time()
for i in x:
    for j in x:
        # loop body reconstructed (an assumption): accumulate the RBF kernel value exp(γ·‖i − j‖²)
        distances.append(np.exp(γ * np.sum((i - j) ** 2)))
print(f"naive double loop took {time.time() - start:.4f} seconds")
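For comparison, the same 100×100 kernel matrix can be computed in a single vectorized call to sklearn's rbf_kernel (gamma=0.5 matches exp(γ·d²) with γ = -0.5 above); a self-contained timing sketch:

import time
import numpy as np
from sklearn.metrics.pairwise import rbf_kernel

x = np.random.rand(10000).reshape(100, 100)
start = time.time()
K = rbf_kernel(x, gamma=0.5)  # K[i, j] = exp(-0.5 * ||x_i - x_j||^2), shape (100, 100)
print(f"vectorized rbf_kernel took {time.time() - start:.4f} seconds")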
import statsmodels.api as sm
import numpy as np
from sklearn.linear_model import LinearRegression

# closed-form OLS solution via the normal equations: β = (XᵀX)⁻¹Xᵀy
β = np.linalg.inv(X.T.dot(X)).dot(X.T).dot(y).flatten()

# get sklearn's LinearRegression implementation weights
# (note: LinearRegression fits an intercept by default, so the two only match
# if X already includes a constant column or fit_intercept=False is passed)
lr = LinearRegression()
sklearn_coefficients = lr.fit(X, y).coef_.flatten()
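A quick way to confirm the two approaches agree, using synthetic data made up purely for illustration (fit_intercept=False keeps sklearn's solution comparable to the plain normal-equation β):

import numpy as np
from sklearn.linear_model import LinearRegression

rng = np.random.default_rng(0)
X_demo = rng.normal(size=(100, 3))
y_demo = X_demo @ np.array([1.0, -2.0, 0.5]) + rng.normal(scale=0.1, size=100)

beta_normal_eq = np.linalg.inv(X_demo.T @ X_demo) @ X_demo.T @ y_demo
beta_sklearn = LinearRegression(fit_intercept=False).fit(X_demo, y_demo).coef_
print(np.allclose(beta_normal_eq, beta_sklearn))  # expected: True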
# generate 200 different models with different λ (ridge) penalty values, from 0 to 50
lambdas = np.linspace(0, 50, 200)
bias = []
variance = []
for λ in lambdas:
    # refit the model in closed form with the new λ hyperparameter: β = (XᵀX + λI)⁻¹Xᵀy
    β = np.linalg.inv(X.T.dot(X) + np.diag([λ] * len(X.T))).dot(X.T).dot(y).flatten()
    bias.append(np.mean(np.abs(X.dot(β) - y.values.flatten())))  # mean absolute error of the fitted values
    variance.append(np.var(X.dot(β)))  # variance of the fitted values
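The closed-form line above is the standard ridge estimator; as a sanity check on synthetic data (made up here), it can be compared against sklearn's Ridge, which minimizes the same penalized objective:

import numpy as np
from sklearn.linear_model import Ridge

rng = np.random.default_rng(1)
X_demo = rng.normal(size=(100, 3))
y_demo = X_demo @ np.array([1.0, -2.0, 0.5]) + rng.normal(scale=0.1, size=100)

lam = 10.0
beta_closed_form = np.linalg.inv(X_demo.T @ X_demo + lam * np.eye(3)) @ X_demo.T @ y_demo
beta_ridge = Ridge(alpha=lam, fit_intercept=False).fit(X_demo, y_demo).coef_
print(np.allclose(beta_closed_form, beta_ridge))  # expected: True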
ychennay / sklearn_random_forest.py
Created May 12, 2019 17:36
A brief summary of sklearn's random forest class
class RandomForestRegressor(ForestRegressor):
    def __init__(self,
                 n_estimators='warn', criterion="mse", max_depth=None, min_samples_split=2, min_samples_leaf=1,
                 min_weight_fraction_leaf=0., max_features="auto", max_leaf_nodes=None, min_impurity_decrease=0.,
                 min_impurity_split=None, bootstrap=True, oob_score=False, n_jobs=None, random_state=None, verbose=0,
                 warm_start=False):
        super().__init__(
            base_estimator=DecisionTreeRegressor(),
            ...  # pass through to the ForestRegressor parent class; many other constructor parameters omitted
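The summary above shows that RandomForestRegressor is essentially a thin wrapper that hands a DecisionTreeRegressor base estimator to the ForestRegressor parent. Typical usage of the public class, on synthetic data invented only for illustration:

import numpy as np
from sklearn.ensemble import RandomForestRegressor

rng = np.random.default_rng(42)
X_toy = rng.normal(size=(200, 4))
y_toy = 2.0 * X_toy[:, 0] + np.sin(X_toy[:, 1]) + rng.normal(scale=0.1, size=200)

# each tree is fit on a bootstrap sample of the rows (bootstrap=True by default)
rf = RandomForestRegressor(n_estimators=100, max_depth=5, random_state=0)
rf.fit(X_toy, y_toy)
print(rf.predict(X_toy[:3]))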
import matplotlib.pyplot as plt

fig, ax1 = plt.subplots()
color = 'tab:red'
ax1.set_xlabel('λ (Inverse Model Complexity)')
ax1.set_ylabel('Prediction Bias', color=color)
ax1.plot(lambdas, bias, color=color)
ax1.tick_params(axis='y', labelcolor=color)
ax2 = ax1.twinx()  # second y-axis on the same x-axis, presumably for the variance curve
true_median = np.median(X["Impressions"])
errors = []
variances = []
for b in range(1, 500):
    # draw b bootstrap resamples and take the median of each
    sample_medians = [np.median(X.sample(len(X), replace=True)["Impressions"]) for i in range(b)]
    error = np.mean(sample_medians) - true_median  # bias of the bootstrap estimate relative to the sample median
    variances.append(np.std(sample_medians) ** 2)  # record the variance of the bootstrap estimates
    errors.append(error)
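The same idea works without pandas; a self-contained sketch (with made-up, right-skewed "Impressions"-like data) that bootstraps the standard error of the median:

import numpy as np

rng = np.random.default_rng(0)
impressions = rng.lognormal(mean=8, sigma=1, size=1_000)  # synthetic stand-in data

B = 2_000
boot_medians = np.array([np.median(rng.choice(impressions, size=impressions.size, replace=True))
                         for _ in range(B)])
print("sample median:", np.median(impressions))
print("bootstrap standard error of the median:", boot_medians.std())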
import seaborn as sns

corr = X.corr()
# randomly pick 30 of the interest columns to drop, to make the visualization more readable
drops = np.random.choice(X.filter(regex=('interest_1?[0-9]{2}')).columns,
                         size=30, replace=False)
corr = corr.loc[~corr.index.isin(drops)]
corr.drop(columns=drops, inplace=True)
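seaborn is imported above but the rendering call is not part of the visible snippet; presumably the pruned correlation matrix is then drawn as a heatmap, along these lines (the styling choices here are assumptions):

import matplotlib.pyplot as plt

fig, ax = plt.subplots(figsize=(12, 10))
sns.heatmap(corr, cmap="coolwarm", center=0, ax=ax)  # diverging palette centered at zero correlation
plt.show()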
import pandas as pd

# use a dictionary to convert columns into categorical data types
convert_dict = {"gender": "category",
                "interest": "category",
                "age": "category"}
conversions_df = conversions_df.astype(convert_dict)
dummified_data = pd.get_dummies(conversions_df, drop_first=True)  # one-hot encode the categorical variables
# make gender the target variable for classification
TARGET = ["gender_M"]
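With gender_M as the target, the remaining dummified columns become the feature matrix. A sketch of how a classifier might then be fit; the train/test split and the choice of LogisticRegression are assumptions, not taken from the original gist:

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

features = dummified_data.drop(columns=TARGET)
labels = dummified_data[TARGET[0]]

X_train, X_test, y_train, y_test = train_test_split(features, labels, test_size=0.2, random_state=0)
clf = LogisticRegression(max_iter=1000).fit(X_train, y_train)
print("held-out accuracy:", clf.score(X_test, y_test))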
import numpy as np

def plot_decision_boundaries(X, y, model_class, bootstrap=False,
                             x_label=None, y_label=None, title=None, **model_params):
    # adapted from https://gist.github.com/anandology/772d44d291a9daa198d4
    reduced_data = X[:, :2]  # take only the first two feature columns (since we are plotting a 2D contour map)
    # recombine the data and target so that we can bootstrap-sample rows from them with replacement