View contour.py
import numpy as np  # assumes the GaussianProcess class is defined earlier in the gist

X = np.array([[0, 0]])   # first observation location
y = np.array([[1.2]])    # first observed value
gp = GaussianProcess(X, y)
gp.update([[1.5, -1.5]], [[2.3]])  # second data point
gp.update([[-2, 1.5]], [[-1.0]])   # third data point
gp.update([[2.1, 1.3]], [[-0.6]])  # fourth data point
delta = 0.05  # changes granularity of the contour map
x = np.arange(-3.0, 3.0, delta)
y = np.arange(-2.0, 2.0, delta)
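The excerpt stops before the plotting step. A minimal sketch of how the grid could be evaluated and drawn follows, assuming the gist's GaussianProcess class exposes a posterior-mean method (called predict here, a hypothetical name) that accepts a single 2D query point.

import matplotlib.pyplot as plt

# sketch only: gp.predict is an assumed method name on the gist's GaussianProcess class
xx, yy = np.meshgrid(x, y)                # grid spanning the plotting window
grid = np.c_[xx.ravel(), yy.ravel()]      # (N, 2) array of query locations
posterior_mean = np.array([gp.predict([p]) for p in grid]).reshape(xx.shape)

plt.contourf(xx, yy, posterior_mean, levels=20, cmap="viridis")
plt.colorbar(label="GP posterior mean")
plt.scatter([0, 1.5, -2, 2.1], [0, -1.5, 1.5, 1.3], c="red", marker="x")  # observed locations
plt.show()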
View runtimes.py
import numpy as np
import time
from sklearn.metrics.pairwise import rbf_kernel

x = np.random.rand(10000)
x = x.reshape(100, 100)  # 100 observations, 100 features each
distances = []
γ = -.5  # exponent coefficient; equivalent to rbf_kernel's gamma=0.5
start = time.time()
# naive nested-loop, entry-by-entry evaluation of the squared-exponential (RBF) kernel
# (the excerpt cuts off here; this is the natural loop body for the timing comparison)
for i in x:
    for j in x:
        distances.append(np.exp(γ * np.linalg.norm(i - j) ** 2))
print(f"nested loops took {time.time() - start:.5f} seconds")
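For the vectorized comparison the timing script is presumably building toward, sklearn's rbf_kernel computes the same matrix in one call; the sketch below assumes the nested loops above evaluate exp(-0.5·‖i−j‖²), which corresponds to gamma=0.5.

# vectorized equivalent of the nested loops above (sketch; gamma=0.5 mirrors γ = -0.5 in the exponent)
start = time.time()
K = rbf_kernel(x, gamma=0.5)  # (100, 100) kernel matrix in a single call
print(f"rbf_kernel took {time.time() - start:.5f} seconds")
# sanity check: the loop results should match the vectorized kernel matrix
assert np.allclose(np.array(distances).reshape(100, 100), K)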
ychennay / preprocessing_sales_conversion_data.py
Last active May 20, 2019
Script to quickly process anonymous sales conversion data from Kaggle and load it into memory
View preprocessing_sales_conversion_data.py
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import requests
from io import StringIO
COLUMNS_TO_DROP = ["ad_id", "xyz_campaign_id", "fb_campaign_id", "Total_Conversion"] # drop to avoid potential data leakage
TARGET = ["Approved_Conversion"]
# a hosted version of sales optimization conversion data from an anonymous organization, originally from Kaggle
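The excerpt ends at the comment describing the hosted CSV. A minimal sketch of the load-and-split step follows; the URL is a placeholder, since the real hosted location is not shown here.

DATA_URL = "https://example.com/KAG_conversion_data.csv"  # placeholder for the hosted CSV mentioned above

response = requests.get(DATA_URL)
conversion_data = pd.read_csv(StringIO(response.text))

X = conversion_data.drop(columns=COLUMNS_TO_DROP + TARGET)  # features, minus leakage-prone columns
y = conversion_data[TARGET]                                 # Approved_Conversion target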
View compare_ols.py
import statsmodels.api as sm
import numpy as np
from sklearn.linear_model import LinearRegression
# closed-form OLS weights with raw NumPy via the normal equation: β = (XᵀX)⁻¹Xᵀy
β = np.linalg.inv(X.T.dot(X)).dot(X.T).dot(y).flatten()
# get sklearn's LinearRegression implementation weights
lr = LinearRegression()
sklearn_coefficients = lr.fit(X,y).coef_.flatten()
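statsmodels is imported but unused in the visible lines; a natural third comparison is its OLS estimator, sketched here with the same X and y and no intercept term, matching the manual normal-equation solution. (Note that LinearRegression fits an intercept by default, so its weights only match the other two if an intercept column is present or fit_intercept=False is set.)

# statsmodels OLS fit on the same design matrix (sketch; no constant column added here)
statsmodels_coefficients = sm.OLS(y, X).fit().params
print(β)
print(sklearn_coefficients)
print(statsmodels_coefficients)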
View explore_l2_regularization.py
# fit 200 different models with different λ (ridge penalty) values, from 0 to 50
lambdas = np.linspace(0, 50, 200)
bias = []
variance = []
for λ in lambdas:
    # refit the closed-form ridge solution with the new λ hyperparameter
    β = np.linalg.inv(X.T.dot(X) + np.diag([λ] * len(X.T))).dot(X.T).dot(y).flatten()
    bias.append(np.mean(np.abs(X.dot(β) - y.values.flatten())))
    variance.append(np.var(X.dot(β)))
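As a cross-check on the closed-form solution, sklearn's Ridge solves the same penalized least-squares problem when fit_intercept=False; a minimal sketch for a single λ, assuming the same X and y:

from sklearn.linear_model import Ridge

λ_check = 10.0
ridge = Ridge(alpha=λ_check, fit_intercept=False).fit(X, y)
β_closed_form = np.linalg.inv(X.T.dot(X) + np.diag([λ_check] * len(X.T))).dot(X.T).dot(y).flatten()
print(np.allclose(ridge.coef_.flatten(), β_closed_form))  # should print True up to numerical tolerance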
ychennay / sklearn_random_forest.py
Created May 12, 2019
A brief summary of sklearn's random forest class
View sklearn_random_forest.py
class RandomForestRegressor(ForestRegressor):
    def __init__(self,
                 n_estimators='warn', criterion="mse", max_depth=None, min_samples_split=2, min_samples_leaf=1,
                 min_weight_fraction_leaf=0., max_features="auto", max_leaf_nodes=None, min_impurity_decrease=0.,
                 min_impurity_split=None, bootstrap=True, oob_score=False, n_jobs=None, random_state=None, verbose=0,
                 warm_start=False):
        super().__init__(
            base_estimator=DecisionTreeRegressor(),
            ...)  # the many remaining constructor parameters are passed through to the ForestRegressor parent class
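For context, the class is used like any other sklearn estimator; a small self-contained example on synthetic data (not from the gist):

import numpy as np
from sklearn.ensemble import RandomForestRegressor

rng = np.random.RandomState(0)
X_demo = rng.rand(200, 4)                        # 200 samples, 4 features
y_demo = 3 * X_demo[:, 0] + 0.1 * rng.randn(200)

forest = RandomForestRegressor(n_estimators=100, oob_score=True, random_state=0)
forest.fit(X_demo, y_demo)
print(forest.oob_score_)         # out-of-bag R², available because bootstrap=True by default
print(forest.predict(X_demo[:5]))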
View visualize_bias_variance.py
import matplotlib.pyplot as plt

fig, ax1 = plt.subplots()
color = 'tab:red'
ax1.set_xlabel('λ (Inverse Model Complexity)')
ax1.set_ylabel('Prediction Bias', color=color)
ax1.plot(lambdas, bias, color=color)
ax1.tick_params(axis='y', labelcolor=color)
ax2 = ax1.twinx()  # second y-axis sharing the same x-axis
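The excerpt ends at the second axis; a plausible completion that plots the variance series from the regularization gist on the right-hand axis:

# sketch: mirrors the bias axis on the left, using the variance list computed earlier
color = 'tab:blue'
ax2.set_ylabel('Prediction Variance', color=color)
ax2.plot(lambdas, variance, color=color)
ax2.tick_params(axis='y', labelcolor=color)
fig.tight_layout()
plt.show()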
View bootstrap_sampling.py
true_median = np.median(X["Impressions"])
errors = []
variances = []
for b in range(1, 500):
    sample_medians = [np.median(X.sample(len(X), replace=True)["Impressions"]) for i in range(b)]
    error = np.mean(sample_medians) - true_median
    variances.append(np.std(sample_medians) ** 2)  # record the variance of the bootstrap estimates
    errors.append(error)
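A quick way to see both series settle down as the number of bootstrap resamples grows (sketch, assuming matplotlib.pyplot is imported as plt):

plt.plot(range(1, 500), errors, label="mean(bootstrap medians) - true median")
plt.plot(range(1, 500), variances, label="variance of bootstrap medians")
plt.xlabel("number of bootstrap resamples (b)")
plt.legend()
plt.show()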
View heatmap.py
import seaborn as sns
corr = X.corr()
# randomly pick 30 of the interest columns to drop to make the visualization more readable
drops = np.random.choice(X.filter(regex=('interest_1?[0-9]{2}')).columns,
                         size=30, replace=False)
corr = corr.loc[~corr.index.isin(drops)]
corr.drop(columns=drops, inplace=True)
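The excerpt stops before the plot call itself; a minimal sketch of rendering the pruned correlation matrix (styling choices are illustrative):

import matplotlib.pyplot as plt

fig, ax = plt.subplots(figsize=(12, 10))
sns.heatmap(corr, cmap="coolwarm", center=0, square=True, ax=ax)
plt.show()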
View bootstrap_decision_boundary.py
import numpy as np
def plot_decision_boundaries(X, y, model_class, bootstrap=False,
                             x_label=None, y_label=None, title=None, **model_params):
    # adapted from https://gist.github.com/anandology/772d44d291a9daa198d4
    reduced_data = X[:, :2]  # take only the first two feature columns (since we are plotting a 2D contour map)
    # we need to recombine the data and target together since we need
    # to bootstrap sample from them with replacement
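    # --- plausible continuation (sketch): the gist excerpt is truncated at this point ---
    # recombine features and target so rows stay aligned, then resample with replacement if requested
    data = np.c_[reduced_data, y]
    if bootstrap:
        data = data[np.random.choice(len(data), size=len(data), replace=True)]

    # fit the supplied model class on the (possibly resampled) 2D data
    model = model_class(**model_params)
    model.fit(data[:, :2], data[:, 2])

    # evaluate the fitted model over a grid and draw the decision regions
    # (assumes matplotlib.pyplot has been imported as plt)
    x_min, x_max = reduced_data[:, 0].min() - 1, reduced_data[:, 0].max() + 1
    y_min, y_max = reduced_data[:, 1].min() - 1, reduced_data[:, 1].max() + 1
    xx, yy = np.meshgrid(np.arange(x_min, x_max, 0.02), np.arange(y_min, y_max, 0.02))
    Z = model.predict(np.c_[xx.ravel(), yy.ravel()]).reshape(xx.shape)
    plt.contourf(xx, yy, Z, alpha=0.4)
    plt.scatter(reduced_data[:, 0], reduced_data[:, 1], c=y, edgecolors="k")
    plt.xlabel(x_label)
    plt.ylabel(y_label)
    plt.title(title)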