Skip to content

Instantly share code, notes, and snippets.

@shivammehta25
Created January 19, 2022 09:07
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save shivammehta25/d60cc1319af6cdd318b9c7295745108e to your computer and use it in GitHub Desktop.
Save shivammehta25/d60cc1319af6cdd318b9c7295745108e to your computer and use it in GitHub Desktop.
Helperfile for the second programming assignment
from scipy.stats import multivariate_normal
import numpy as np
import matplotlib.pyplot as plt
from collections import namedtuple
import random
import numpy as np
from sklearn.datasets import make_blobs
from matplotlib.colors import ListedColormap
from math import ceil, floor
from matplotlib import cm, gridspec
import warnings
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
import pandas as pd
from sklearn.mixture import GaussianMixture
RANDOM_SEED=1234
random.seed(RANDOM_SEED)
np.random.seed(RANDOM_SEED)
Point = namedtuple('Point', ['x', 'y'])
grid_region_min = -8
grid_region_max = 8
sup_title = "Problem Setup"
x_label = "x1"
y_label = "x2"
z_label = "Probability"
fig_title_2D = "Distributions in feature space (2D)"
fig_title_3D = "Distributions in feature space (3D)"
def assert_between(p):
if not (grid_region_min < p.x < grid_region_max):
warnings.warn("Values must be inside the grid i.e between ({}, {})! Therefore they are clipped.".format(grid_region_min, grid_region_max))
p = Point(grid_region_min if p.x < 0 else grid_region_max, p.y)
if not (grid_region_min < p.y < grid_region_max):
warnings.warn("Values must be inside the grid i.e between ({}, {})! Therefore they are clipped.".format(grid_region_min, grid_region_max))
p = Point(p.x, grid_region_min if p.y < 0 else grid_region_max)
return p
def generate_data_space():
x = np.linspace(grid_region_min, grid_region_max, 1000)
y = np.linspace(grid_region_min, grid_region_max, 1000)
xx, yy = np.meshgrid(x, y)
pos = np.dstack((xx, yy))
return x, y, xx, yy, pos
def problem_1_data(scaling_factor, pos):
# Two Gaussians
mu1 = np.array([-2, -2])
mu3 = np.array([2, 2])
sigma = np.array([3, 3])
z1 = multivariate_normal(mu1, sigma).pdf(pos) * scaling_factor[0]
z2 = multivariate_normal(mu3, sigma).pdf(pos) * scaling_factor[1]
peak1, peak2 = np.max(z1), np.max(z2)
return z1, z2, (mu1, [], mu3), sigma, peak1, peak2
def problem_2_data(scaling_factor, pos):
# 0 is mixture of gaussians now
# 0 gaussian
mu1 = np.array([0, -3.5])
mu2 = np.array([-3.5, 0])
sigma = np.array([3, 3])
n_components = 2
weights = np.ones(n_components, dtype=np.float32) / n_components
z01 = multivariate_normal(mu1, sigma).pdf(pos) * (scaling_factor[0] * weights[0])
z02 = multivariate_normal(mu2, sigma).pdf(pos) * (scaling_factor[0] * weights[1])
z1 = z01 + z02
# 1 gaussian
mu3 = np.array([1.5, 1.5])
sigma = np.array([3, 3])
z2 = multivariate_normal(mu3, sigma).pdf(pos) * scaling_factor[1]
peak1, peak2 = np.max(z1), np.max(z2)
return z1, z2, (mu1, mu2, mu3), sigma, peak1, peak2
def plot_feature_space(p1=None, p2=None, figsize=(10, 4), scaling_factor=(1, 1), D3=False, optimal_boundary=False, cost_function=None, problem_v=1):
decision_boundary = False
if (p1 and p2):
decision_boundary = True
if not ((isinstance(p1, tuple) and len(p1) == 2) and (isinstance(p2, tuple) and len(p2) == 2)):
raise ValueError("P1 and P2 should either be of type Point or tuple of len 2")
p1 = assert_between(Point(p1[0], p1[1]))
p2 = assert_between(Point(p2[0], p2[1]))
x, y, xx, yy, pos = generate_data_space()
if problem_v == 1:
data_function = problem_1_data
else:
data_function = problem_2_data
z1, z2, mu, sigma, peak1, peak2 = data_function(scaling_factor, pos)
fig = plt.figure(figsize=figsize)
plt.suptitle(sup_title)
gs = gridspec.GridSpec(1, 2)
# Plotting in 3D
if D3:
ax1 = fig.add_subplot(gs[0], projection='3d')
with warnings.catch_warnings():
warnings.simplefilter("ignore")
ax1.plot_surface(xx, yy, z1, cmap=cm.coolwarm,
linewidth=0, antialiased=False, alpha=0.5, vmin=np.nanmin(z1), vmax=np.nanmax(z1))
ax1.plot_surface(xx, yy, z2, cmap=cm.Spectral,
linewidth=0, antialiased=False, alpha=0.5, vmin=np.nanmin(z2), vmax=np.nanmax(z2))
mu1 = mu[0]
mu2 = mu[-1]
ax1.text(mu1[0], mu1[1], peak1, "class 0", (1, 1,0))
ax1.text(mu2[0], mu2[1], peak2, "class 1", (1, 1,0))
ax1.set_title(fig_title_3D)
ax1.set_xlabel(x_label)
ax1.set_ylabel(y_label)
ax1.set_zlabel(z_label)
# Plotting in 2D
ax2 = fig.add_subplot(gs[1])
else:
ax2 = fig.add_subplot(gs[0])
ax2.contour(xx, yy, z1, cmap=cm.coolwarm)
ax2.contour(xx, yy, z2, cmap=cm.Spectral)
if decision_boundary:
r1, r2 = xx.flatten(), yy.flatten()
r1, r2 = r1.reshape((len(r1), 1)), r2.reshape((len(r2), 1))
grid = np.hstack((r1,r2))
yhat = np.where(((p2.x - p1.x)*( grid[:, 1] - p1.y) - (p2.y - p1.y)*(grid[:, 0] - p1.x)) > 0, 1, 0)
zz = yhat.reshape(xx.shape)
ax2.contourf(xx, yy, zz, cmap=ListedColormap(['tab:blue', 'tab:orange']), alpha=0.2)
ax2.axline((p1.x, p1.y), (p2.x, p2.y), c="k")
ax2.scatter(p1.x, p1.y, marker='X', s=500, c='k')
ax2.scatter(p2.x, p2.y, marker='X', s=500, c='k')
mu1 = mu[0]
mu2 = mu[-1]
ax2.text(mu1[0], mu1[1], "0", color="b")
ax2.text(mu2[0], mu2[1], "1", color="k")
ax2.set_title(fig_title_2D)
ax2.set_xlabel(x_label)
ax2.set_ylabel(y_label)
# Monte Carlo estimation of values
if decision_boundary:
n = 5000
r1 = scaling_factor[0] / sum(scaling_factor)
r2 = scaling_factor[1] / sum(scaling_factor)
if problem_v == 1:
mu1, mu2 = mu[0], mu[-1]
X, y = make_blobs(n_samples=[floor(n * r1), ceil(n * r2)], centers=[mu1, mu2], cluster_std=sigma, random_state=RANDOM_SEED)
else:
mu1, mu2, mu3 = mu[0], mu[1], mu[-1]
X, y = make_blobs(n_samples=[floor(n * r1 * 0.5), ceil(n * r1 * 0.5), n - floor(n * r1 * 0.5) - ceil(n * r1 * 0.5)], centers=np.array([mu1, mu2, mu3]), cluster_std=[sigma, sigma, sigma], random_state=RANDOM_SEED)
y[y == 1] = 0 # Converting the 2nd guassian output to 0
y[y == 2] = 1 # convering the 3rd guassian output to 1
X, y = monte_carlo_estimates_decision_boundary(X, y, p1, p2, cost_function)
if optimal_boundary:
boundary = np.where(z2 >= z1, 1, 0)
gmm_boundary = ax2.contour(xx, yy, boundary, colors='r')
ax2.contourf(xx, yy, boundary, cmap=ListedColormap(['tab:green', 'tab:red']), alpha=0.1)
plt.show()
def print_metrices(y, prediction):
metrics = {
"accuracy": round(accuracy_score(y, prediction), 4),
"precision": round(precision_score(y, prediction), 4),
"recall": round(recall_score(y, prediction), 4),
"F1": round(f1_score(y, prediction, average="micro"), 4),
"macro F1": round(f1_score(y, prediction, average="macro"), 4)
}
{print(f"{k}: {v}", end=' | ') for k,v in metrics.items()}
print('')
def monte_carlo_estimates_decision_boundary(X, y, p1, p2, cost_function=None, n=5000):
prediction = np.where(((p2.x - p1.x)*( X[:, 1] - p1.y) - (p2.y - p1.y)*(X[:, 0] - p1.x)) > 0, 1, 0)
TN, FP, FN, TP = confusion_matrix(y, prediction).ravel()
print_metrices(y, prediction)
if cost_function:
cost = cost_function(TN, FP, FN, TP)
print(f'Cost: {cost:,.3f}')
return X, y
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment