
Yu Chen ychennay

import numpy as np

X = np.array([[0, 0]])
y = np.array([[1.2]])
gp = GaussianProcess(X, y)  # GaussianProcess is a custom class (not shown in this excerpt)
gp.update([[1.5, -1.5]], [[2.3]])  # second data point
gp.update([[-2, 1.5]], [[-1.0]])  # third data point
gp.update([[2.1, 1.3]], [[-0.6]])  # fourth data point
delta = 0.05  # controls the granularity of the contour map
x = np.arange(-3.0, 3.0, delta)  # grid coordinates along the first feature
y = np.arange(-2.0, 2.0, delta)  # grid coordinates along the second feature
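The two coordinate vectors are presumably fed into a contour plot of the Gaussian process posterior. A minimal sketch of that step, assuming the custom GaussianProcess class exposes a predict method (a hypothetical name, not shown in the excerpt) that returns a posterior mean for a single test point:

import matplotlib.pyplot as plt

xx, yy = np.meshgrid(x, y)                   # grid of test locations
test_points = np.c_[xx.ravel(), yy.ravel()]  # (n_points, 2) array of 2D inputs
# `predict` is a hypothetical method on the custom GaussianProcess class above
means = np.array([gp.predict(p) for p in test_points]).reshape(xx.shape)
plt.contourf(xx, yy, means)
plt.colorbar()
plt.show()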
import numpy as np
import time
from sklearn.metrics.pairwise import rbf_kernel

x = np.random.rand(10000)
x = x.reshape(100, 100)  # 100 observations with 100 features each
distances = []
γ = -.5  # exponent coefficient; equivalent to gamma=0.5 in sklearn's convention
start = time.time()
for i in x:
    for j in x:
        # plausible completion of the truncated loop body: squared Euclidean distance
        # between the two rows, pushed through the RBF kernel exp(γ * ||i - j||²)
        distances.append(np.exp(γ * np.sum((i - j) ** 2)))
print(f"naive double loop took {time.time() - start:.4f} seconds")
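For comparison, sklearn's rbf_kernel computes the same 100×100 kernel matrix in a single vectorized call (gamma=0.5 matches the γ = -.5 used in the exponent above):

start = time.time()
K = rbf_kernel(x, gamma=0.5)  # pairwise exp(-0.5 * ||xi - xj||²) for all rows of x
print(f"sklearn rbf_kernel took {time.time() - start:.4f} seconds; K shape: {K.shape}")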
@ychennay
ychennay / preprocessing_sales_conversion_data.py
Last active May 20, 2019 00:05
Script to quickly process and load into memory anonymous sales conversion data from Kaggle
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import requests
from io import StringIO
COLUMNS_TO_DROP = ["ad_id", "xyz_campaign_id", "fb_campaign_id", "Total_Conversion"] # drop to avoid potential data leakage
TARGET = ["Approved_Conversion"]
# a hosted version of sales optimization conversion data from an anonymous organization, originally from Kaggle
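The script presumably pulls the hosted CSV down over HTTP and reads it into a DataFrame before splitting features from the target; a minimal sketch of that step, with DATA_URL standing in for the hosted file (the real URL is not shown in this excerpt):

DATA_URL = "https://example.com/conversion_data.csv"  # placeholder for the hosted copy of the Kaggle file
response = requests.get(DATA_URL)
conversions_df = pd.read_csv(StringIO(response.text))
# one plausible way to split features from target; the original script presumably also
# one-hot encodes categorical columns (e.g. the interest codes referenced later), not shown here
X = conversions_df.drop(columns=COLUMNS_TO_DROP + TARGET)
y = conversions_df[TARGET]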
import statsmodels.api as sm
import numpy as np
from sklearn.linear_model import LinearRegression
# closed-form OLS weights via the normal equations: β = (XᵀX)⁻¹Xᵀy
β = np.linalg.inv(X.T.dot(X)).dot(X.T).dot(y).flatten()
# get sklearn's LinearRegression implementation weights
lr = LinearRegression()
sklearn_coefficients = lr.fit(X,y).coef_.flatten()
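A quick sanity check that the closed-form weights and sklearn's agree (this assumes both solve the same design matrix, e.g. LinearRegression(fit_intercept=False) or an X that already contains a constant column):

print(np.allclose(β, sklearn_coefficients))  # True when the two fits solve the identical problem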
# generate 200 different models with different λ values, from 0 to 50
lambdas = np.linspace(0, 50, 200)
bias = []
variance = []
for λ in lambdas:
    # refit the ridge model with the new λ hyperparameter: β = (XᵀX + λI)⁻¹Xᵀy
    β = np.linalg.inv(X.T.dot(X) + np.diag([λ] * len(X.T))).dot(X.T).dot(y).flatten()
    bias.append(np.mean(np.abs(X.dot(β) - y.values.flatten())))  # mean absolute error as a bias proxy
    variance.append(np.var(X.dot(β)))  # variance of the fitted predictions
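The penalized normal equation above, β = (XᵀX + λI)⁻¹Xᵀy, is the closed-form ridge solution, so the weights can be cross-checked against sklearn's Ridge (with fit_intercept=False so both solve the identical problem); they should agree up to numerical tolerance:

from sklearn.linear_model import Ridge

ridge_coefficients = Ridge(alpha=10.0, fit_intercept=False).fit(X, y).coef_.flatten()
β_closed_form = np.linalg.inv(X.T.dot(X) + np.diag([10.0] * len(X.T))).dot(X.T).dot(y).flatten()
print(np.allclose(ridge_coefficients, β_closed_form))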
@ychennay
ychennay / sklearn_random_forest.py
Created May 12, 2019 17:36
A brief summary of sklearn's random forest class
class RandomForestRegressor(ForestRegressor):
    def __init__(self,
                 n_estimators='warn', criterion="mse", max_depth=None, min_samples_split=2,
                 min_samples_leaf=1, min_weight_fraction_leaf=0., max_features="auto",
                 max_leaf_nodes=None, min_impurity_decrease=0., min_impurity_split=None,
                 bootstrap=True, oob_score=False, n_jobs=None, random_state=None, verbose=0,
                 warm_start=False):
        super().__init__(
            base_estimator=DecisionTreeRegressor(),
            ...)  # the many other constructor parameters passed through to the ForestRegressor parent class are omitted
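In everyday use the constructor above is reached through the public sklearn API; a minimal usage sketch on the conversion data, assuming X has been reduced to numeric columns by this point:

from sklearn.ensemble import RandomForestRegressor

rf = RandomForestRegressor(n_estimators=100, max_depth=5, bootstrap=True, oob_score=True, random_state=42)
rf.fit(X, y.values.ravel())
print(rf.oob_score_)              # out-of-bag R² estimate
print(rf.feature_importances_[:5])  # impurity-based importances of the first five features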
fig, ax1 = plt.subplots()
color = 'tab:red'
ax1.set_xlabel('λ (Inverse Model Complexity)')
ax1.set_ylabel('Prediction Bias', color=color)
ax1.plot(lambdas, bias, color=color)
ax1.tick_params(axis='y', labelcolor=color)
ax2 = ax1.twinx()  # second y-axis sharing the same λ axis
# plausible continuation of the truncated snippet: plot the prediction variance on the second axis
color = 'tab:blue'
ax2.set_ylabel('Prediction Variance', color=color)
ax2.plot(lambdas, variance, color=color)
ax2.tick_params(axis='y', labelcolor=color)
true_median = np.median(X["Impressions"])
errors = []
variances = []
for b in range(1, 500):
    # draw b bootstrap resamples and record the median of each
    sample_medians = [np.median(X.sample(len(X), replace=True)["Impressions"]) for i in range(b)]
    error = np.mean(sample_medians) - true_median  # bias of the bootstrapped median estimate
    variances.append(np.std(sample_medians) ** 2)  # variance of the bootstrap estimates
    errors.append(error)
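One way to see the bias/variance picture from these bootstrap runs is to plot both recorded series against the number of resamples b; a minimal sketch using the matplotlib import from earlier:

fig, ax = plt.subplots()
ax.plot(range(1, 500), errors, label="bias of bootstrapped median")
ax.plot(range(1, 500), variances, label="variance of bootstrapped median")
ax.set_xlabel("number of bootstrap resamples (b)")
ax.legend()
plt.show()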
import seaborn as sns
corr = X.corr()
# randomly pick 30 of the interest columns to drop to make the visualization more readable
drops = np.random.choice(X.filter(regex=('interest_1?[0-9]{2}')).columns,
                         size=30, replace=False)
corr = corr.loc[~corr.index.isin(drops)]
corr.drop(columns=drops, inplace=True)
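With the 30 randomly chosen interest columns removed from both axes, the trimmed correlation matrix can be rendered with seaborn's heatmap; a minimal sketch:

fig, ax = plt.subplots(figsize=(12, 10))
sns.heatmap(corr, cmap="coolwarm", center=0, ax=ax)
plt.show()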
import numpy as np
def plot_decision_boundaries(X, y, model_class, bootstrap=False,
                             x_label=None, y_label=None, title=None, **model_params):
    # adapted from https://gist.github.com/anandology/772d44d291a9daa198d4
    reduced_data = X[:, :2]  # take only the first two feature columns (since we are plotting a 2D contour map)
    # we need to recombine the data and target together since we need
    # to bootstrap sample from them with replacement