
Yu Chen ychennay

import numpy as np

X = np.array([[0, 0]])
y = np.array([[1.2]])
gp = GaussianProcess(X, y)  # GaussianProcess is a custom class (not shown in this excerpt)
gp.update([[1.5, -1.5]], [[2.3]])  # second data point
gp.update([[-2, 1.5]], [[-1.0]])  # third data point
gp.update([[2.1, 1.3]], [[-0.6]])  # fourth data point
delta = 0.05  # controls the granularity of the contour map
x = np.arange(-3.0, 3.0, delta)  # grid coordinates along the first feature
y = np.arange(-2.0, 2.0, delta)  # grid coordinates along the second feature
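The two coordinate vectors are presumably fed into a contour plot of the Gaussian process posterior. A minimal sketch of that step, assuming the custom GaussianProcess class exposes a predict method (a hypothetical name, not shown in the excerpt) that returns a posterior mean for a single test point:

import matplotlib.pyplot as plt

xx, yy = np.meshgrid(x, y)                   # grid of test locations
test_points = np.c_[xx.ravel(), yy.ravel()]  # (n_points, 2) array of 2D inputs
# `predict` is a hypothetical method on the custom GaussianProcess class above
means = np.array([gp.predict(p) for p in test_points]).reshape(xx.shape)
plt.contourf(xx, yy, means)
plt.colorbar()
plt.show()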
import numpy as np
import time
from sklearn.metrics.pairwise import rbf_kernel

x = np.random.rand(10000)
x = x.reshape(100, 100)  # 100 observations with 100 features each
distances = []
γ = -.5  # exponent coefficient; equivalent to gamma=0.5 in sklearn's convention
start = time.time()
for i in x:
    for j in x:
        # plausible completion of the truncated loop body: squared Euclidean distance
        # between the two rows, pushed through the RBF kernel exp(γ * ||i - j||²)
        distances.append(np.exp(γ * np.sum((i - j) ** 2)))
print(f"naive double loop took {time.time() - start:.4f} seconds")
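For comparison, sklearn's rbf_kernel computes the same 100×100 kernel matrix in a single vectorized call (gamma=0.5 matches the γ = -.5 used in the exponent above):

start = time.time()
K = rbf_kernel(x, gamma=0.5)  # pairwise exp(-0.5 * ||xi - xj||²) for all rows of x
print(f"sklearn rbf_kernel took {time.time() - start:.4f} seconds; K shape: {K.shape}")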
@ychennay
ychennay / preprocessing_sales_conversion_data.py
Last active May 20, 2019 00:05
Script to quickly process and load into memory anonymous sales conversion data from Kaggle
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import requests
from io import StringIO
COLUMNS_TO_DROP = ["ad_id", "xyz_campaign_id", "fb_campaign_id", "Total_Conversion"] # drop to avoid potential data leakage
TARGET = ["Approved_Conversion"]
# a hosted version of sales optimization conversion data from an anonymous organization, originally from Kaggle
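The script presumably pulls the hosted CSV down over HTTP and reads it into a DataFrame before splitting features from the target; a minimal sketch of that step, with DATA_URL standing in for the hosted file (the real URL is not shown in this excerpt):

DATA_URL = "https://example.com/conversion_data.csv"  # placeholder for the hosted copy of the Kaggle file
response = requests.get(DATA_URL)
conversions_df = pd.read_csv(StringIO(response.text))
# one plausible way to split features from target; the original script presumably also
# one-hot encodes categorical columns (e.g. the interest codes referenced later), not shown here
X = conversions_df.drop(columns=COLUMNS_TO_DROP + TARGET)
y = conversions_df[TARGET]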
import statsmodels.api as sm
import numpy as np
from sklearn.linear_model import LinearRegression
# closed-form OLS weights via the normal equations: β = (XᵀX)⁻¹Xᵀy
β = np.linalg.inv(X.T.dot(X)).dot(X.T).dot(y).flatten()
# get sklearn's LinearRegression implementation weights
lr = LinearRegression()
sklearn_coefficients = lr.fit(X,y).coef_.flatten()
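A quick sanity check that the closed-form weights and sklearn's agree (this assumes both solve the same design matrix, e.g. LinearRegression(fit_intercept=False) or an X that already contains a constant column):

print(np.allclose(β, sklearn_coefficients))  # True when the two fits solve the identical problem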
# generate 200 different models with different λ values, from 0 to 50
lambdas = np.linspace(0, 50, 200)
bias = []
variance = []
for λ in lambdas:
    # refit the ridge model with the new λ hyperparameter: β = (XᵀX + λI)⁻¹Xᵀy
    β = np.linalg.inv(X.T.dot(X) + np.diag([λ] * len(X.T))).dot(X.T).dot(y).flatten()
    bias.append(np.mean(np.abs(X.dot(β) - y.values.flatten())))  # mean absolute error as a bias proxy
    variance.append(np.var(X.dot(β)))  # variance of the fitted predictions
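The penalized normal equation above, β = (XᵀX + λI)⁻¹Xᵀy, is the closed-form ridge solution, so the weights can be cross-checked against sklearn's Ridge (with fit_intercept=False so both solve the identical problem); they should agree up to numerical tolerance:

from sklearn.linear_model import Ridge

ridge_coefficients = Ridge(alpha=10.0, fit_intercept=False).fit(X, y).coef_.flatten()
β_closed_form = np.linalg.inv(X.T.dot(X) + np.diag([10.0] * len(X.T))).dot(X.T).dot(y).flatten()
print(np.allclose(ridge_coefficients, β_closed_form))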
@ychennay
ychennay / sklearn_random_forest.py
Created May 12, 2019 17:36
A brief summary of sklearn's random forest class
class RandomForestRegressor(ForestRegressor):
    def __init__(self,
                 n_estimators='warn', criterion="mse", max_depth=None, min_samples_split=2,
                 min_samples_leaf=1, min_weight_fraction_leaf=0., max_features="auto",
                 max_leaf_nodes=None, min_impurity_decrease=0., min_impurity_split=None,
                 bootstrap=True, oob_score=False, n_jobs=None, random_state=None, verbose=0,
                 warm_start=False):
        super().__init__(
            base_estimator=DecisionTreeRegressor(),
            ...)  # the many other constructor parameters passed through to the ForestRegressor parent class are omitted
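In everyday use the constructor above is reached through the public sklearn API; a minimal usage sketch on the conversion data, assuming X has been reduced to numeric columns by this point:

from sklearn.ensemble import RandomForestRegressor

rf = RandomForestRegressor(n_estimators=100, max_depth=5, bootstrap=True, oob_score=True, random_state=42)
rf.fit(X, y.values.ravel())
print(rf.oob_score_)              # out-of-bag R² estimate
print(rf.feature_importances_[:5])  # impurity-based importances of the first five features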
fig, ax1 = plt.subplots()
color = 'tab:red'
ax1.set_xlabel('λ (Inverse Model Complexity)')
ax1.set_ylabel('Prediction Bias', color=color)
ax1.plot(lambdas, bias, color=color)
ax1.tick_params(axis='y', labelcolor=color)
ax2 = ax1.twinx()  # second y-axis sharing the same λ axis
# plausible continuation of the truncated snippet: plot the prediction variance on the second axis
color = 'tab:blue'
ax2.set_ylabel('Prediction Variance', color=color)
ax2.plot(lambdas, variance, color=color)
ax2.tick_params(axis='y', labelcolor=color)
true_median = np.median(X["Impressions"])
errors = []
variances = []
for b in range(1, 500):
    # draw b bootstrap resamples and record the median of each
    sample_medians = [np.median(X.sample(len(X), replace=True)["Impressions"]) for i in range(b)]
    error = np.mean(sample_medians) - true_median  # bias of the bootstrapped median estimate
    variances.append(np.std(sample_medians) ** 2)  # variance of the bootstrap estimates
    errors.append(error)
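One way to see the bias/variance picture from these bootstrap runs is to plot both recorded series against the number of resamples b; a minimal sketch using the matplotlib import from earlier:

fig, ax = plt.subplots()
ax.plot(range(1, 500), errors, label="bias of bootstrapped median")
ax.plot(range(1, 500), variances, label="variance of bootstrapped median")
ax.set_xlabel("number of bootstrap resamples (b)")
ax.legend()
plt.show()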
import seaborn as sns
corr = X.corr()
# randomly pick 30 of the interest columns to drop to make the visualization more readable
drops = np.random.choice(X.filter(regex=('interest_1?[0-9]{2}')).columns,
                         size=30, replace=False)
corr = corr.loc[~corr.index.isin(drops)]
corr.drop(columns=drops, inplace=True)
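With the 30 randomly chosen interest columns removed from both axes, the trimmed correlation matrix can be rendered with seaborn's heatmap; a minimal sketch:

fig, ax = plt.subplots(figsize=(12, 10))
sns.heatmap(corr, cmap="coolwarm", center=0, ax=ax)
plt.show()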
import numpy as np
def plot_decision_boundaries(X, y, model_class, bootstrap=False,
                             x_label=None, y_label=None, title=None, **model_params):
    # adapted from https://gist.github.com/anandology/772d44d291a9daa198d4
    reduced_data = X[:, :2]  # take only the first two feature columns (since we are plotting a 2D contour map)
    # we need to recombine the data and target together since we need
    # to bootstrap sample from them with replacement