shivammehta25/helperfile.py

## helperfile.py
from scipy.stats import multivariate_normal
import numpy as np
import matplotlib.pyplot as plt
from collections import namedtuple
import random
import numpy as np
from sklearn.datasets import make_blobs
from matplotlib.colors import ListedColormap
from math import ceil, floor
from matplotlib import cm, gridspec
import warnings
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
import pandas as pd
from sklearn.mixture import GaussianMixture


RANDOM_SEED=1234

random.seed(RANDOM_SEED)
np.random.seed(RANDOM_SEED)

Point = namedtuple('Point', ['x', 'y'])

grid_region_min = -8
grid_region_max = 8
sup_title = "Problem Setup"
x_label = "x1"
y_label = "x2"
z_label = "Probability"
fig_title_2D = "Distributions in feature space (2D)"
fig_title_3D = "Distributions in feature space (3D)"


def assert_between(p):
    if not (grid_region_min < p.x < grid_region_max):
        warnings.warn("Values must be inside the grid i.e between ({}, {})! Therefore they are clipped.".format(grid_region_min, grid_region_max))
        p = Point(grid_region_min if p.x < 0 else grid_region_max, p.y)

    if not (grid_region_min < p.y < grid_region_max):
        warnings.warn("Values must be inside the grid i.e between ({}, {})! Therefore they are clipped.".format(grid_region_min, grid_region_max))
        p = Point(p.x, grid_region_min if p.y < 0 else grid_region_max)

    return p


def generate_data_space():
    x = np.linspace(grid_region_min, grid_region_max, 1000)
    y = np.linspace(grid_region_min, grid_region_max, 1000)
    xx, yy = np.meshgrid(x, y)
    pos = np.dstack((xx, yy))
    return x, y, xx, yy, pos


def problem_1_data(scaling_factor, pos):
    # Two Gaussians
    mu1 = np.array([-2, -2])
    mu3 = np.array([2, 2])
    sigma = np.array([3, 3])
    z1 = multivariate_normal(mu1, sigma).pdf(pos) * scaling_factor[0]
    z2 = multivariate_normal(mu3, sigma).pdf(pos) * scaling_factor[1]
    peak1, peak2 = np.max(z1), np.max(z2)

    return z1, z2, (mu1, [], mu3), sigma, peak1, peak2


def problem_2_data(scaling_factor, pos):
    # 0 is mixture of gaussians now
    # 0 gaussian
    mu1 = np.array([0, -3.5])
    mu2 = np.array([-3.5, 0])
    sigma = np.array([3, 3])
    n_components = 2
    weights = np.ones(n_components, dtype=np.float32) / n_components
    z01 = multivariate_normal(mu1, sigma).pdf(pos) * (scaling_factor[0] * weights[0])
    z02 = multivariate_normal(mu2, sigma).pdf(pos) * (scaling_factor[0] * weights[1])
    z1 = z01 + z02

    # 1 gaussian
    mu3 = np.array([1.5, 1.5])
    sigma = np.array([3, 3])
    z2 = multivariate_normal(mu3, sigma).pdf(pos) * scaling_factor[1]

    peak1, peak2 = np.max(z1), np.max(z2)

    return z1, z2, (mu1, mu2, mu3), sigma, peak1, peak2


def plot_feature_space(p1=None, p2=None, figsize=(10, 4), scaling_factor=(1, 1), D3=False, optimal_boundary=False, cost_function=None, problem_v=1):

    decision_boundary = False
    if (p1 and p2):
        decision_boundary = True
        if not ((isinstance(p1, tuple) and len(p1) == 2) and (isinstance(p2, tuple) and len(p2) == 2)):
            raise ValueError("P1 and P2 should either be of type Point or tuple of len 2")

        p1 = assert_between(Point(p1[0], p1[1]))
        p2 = assert_between(Point(p2[0], p2[1]))


    x, y, xx, yy, pos = generate_data_space()


    if problem_v == 1:
        data_function = problem_1_data
    else:
        data_function = problem_2_data


    z1, z2, mu, sigma, peak1, peak2 = data_function(scaling_factor, pos)

    fig = plt.figure(figsize=figsize)
    plt.suptitle(sup_title)
    gs = gridspec.GridSpec(1, 2)

    # Plotting in 3D
    if D3:
        ax1 = fig.add_subplot(gs[0], projection='3d')

        with warnings.catch_warnings():
            warnings.simplefilter("ignore")

            ax1.plot_surface(xx, yy,  z1, cmap=cm.coolwarm,
                        linewidth=0, antialiased=False, alpha=0.5, vmin=np.nanmin(z1), vmax=np.nanmax(z1))

            ax1.plot_surface(xx, yy, z2, cmap=cm.Spectral,
                                    linewidth=0, antialiased=False, alpha=0.5, vmin=np.nanmin(z2), vmax=np.nanmax(z2))

        mu1 = mu[0]
        mu2 = mu[-1]
        ax1.text(mu1[0], mu1[1], peak1, "class 0", (1, 1,0))
        ax1.text(mu2[0], mu2[1], peak2, "class 1", (1, 1,0))
        ax1.set_title(fig_title_3D)
        ax1.set_xlabel(x_label)
        ax1.set_ylabel(y_label)
        ax1.set_zlabel(z_label)


    # Plotting in 2D
        ax2 = fig.add_subplot(gs[1])

    else:
        ax2 = fig.add_subplot(gs[0])

    ax2.contour(xx, yy, z1, cmap=cm.coolwarm)
    ax2.contour(xx, yy, z2, cmap=cm.Spectral)

    if decision_boundary:
        r1, r2 = xx.flatten(), yy.flatten()
        r1, r2 = r1.reshape((len(r1), 1)), r2.reshape((len(r2), 1))

        grid = np.hstack((r1,r2))

        yhat = np.where(((p2.x - p1.x)*( grid[:, 1] - p1.y) - (p2.y - p1.y)*(grid[:, 0] - p1.x)) > 0, 1, 0)

        zz = yhat.reshape(xx.shape)

        ax2.contourf(xx, yy, zz, cmap=ListedColormap(['tab:blue', 'tab:orange']), alpha=0.2)
        ax2.axline((p1.x, p1.y), (p2.x, p2.y), c="k")
        ax2.scatter(p1.x, p1.y, marker='X', s=500, c='k')
        ax2.scatter(p2.x, p2.y, marker='X', s=500, c='k')

    mu1 = mu[0]
    mu2 = mu[-1]
    ax2.text(mu1[0], mu1[1], "0", color="b")
    ax2.text(mu2[0], mu2[1], "1", color="k")
    ax2.set_title(fig_title_2D)
    ax2.set_xlabel(x_label)
    ax2.set_ylabel(y_label)


    # Monte Carlo estimation of values
    if decision_boundary:
        n = 5000
        r1 = scaling_factor[0] / sum(scaling_factor)
        r2 = scaling_factor[1] / sum(scaling_factor)


        if problem_v == 1:
            mu1, mu2 = mu[0], mu[-1]
            X, y = make_blobs(n_samples=[floor(n * r1), ceil(n * r2)], centers=[mu1, mu2], cluster_std=sigma, random_state=RANDOM_SEED)
        else:
            mu1, mu2, mu3 = mu[0], mu[1], mu[-1]
            X, y = make_blobs(n_samples=[floor(n * r1 * 0.5), ceil(n * r1 * 0.5), n - floor(n * r1 * 0.5) - ceil(n * r1 * 0.5)], centers=np.array([mu1, mu2, mu3]), cluster_std=[sigma, sigma, sigma], random_state=RANDOM_SEED)
            y[y == 1] = 0 # Converting the 2nd guassian output to 0
            y[y == 2] = 1 # convering the 3rd guassian output to 1

        X, y = monte_carlo_estimates_decision_boundary(X, y, p1, p2, cost_function)

    if optimal_boundary:
        boundary = np.where(z2 >= z1, 1, 0)
        gmm_boundary = ax2.contour(xx, yy, boundary, colors='r')
        ax2.contourf(xx, yy, boundary, cmap=ListedColormap(['tab:green', 'tab:red']), alpha=0.1)


    plt.show()


def print_metrices(y, prediction):
    metrics = {
        "accuracy": round(accuracy_score(y, prediction), 4),
        "precision": round(precision_score(y, prediction), 4),
        "recall": round(recall_score(y, prediction), 4),
        "F1": round(f1_score(y, prediction, average="micro"), 4),
        "macro F1": round(f1_score(y, prediction, average="macro"), 4)
    }
    {print(f"{k}: {v}", end=' | ') for k,v in metrics.items()}
    print('')


def monte_carlo_estimates_decision_boundary(X, y, p1, p2, cost_function=None, n=5000):
    prediction = np.where(((p2.x - p1.x)*( X[:, 1] - p1.y) - (p2.y - p1.y)*(X[:, 0] - p1.x)) > 0, 1, 0)

    TN, FP, FN, TP = confusion_matrix(y, prediction).ravel()

    print_metrices(y, prediction)

    if cost_function:
        cost = cost_function(TN, FP, FN, TP)
        print(f'Cost: {cost:,.3f}')

    return X, y
	from scipy.stats import multivariate_normal
	import numpy as np
	import matplotlib.pyplot as plt
	from collections import namedtuple
	import random
	import numpy as np
	from sklearn.datasets import make_blobs
	from matplotlib.colors import ListedColormap
	from math import ceil, floor
	from matplotlib import cm, gridspec
	import warnings
	from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
	import pandas as pd
	from sklearn.mixture import GaussianMixture


	RANDOM_SEED=1234

	random.seed(RANDOM_SEED)
	np.random.seed(RANDOM_SEED)

	Point = namedtuple('Point', ['x', 'y'])

	grid_region_min = -8
	grid_region_max = 8
	sup_title = "Problem Setup"
	x_label = "x1"
	y_label = "x2"
	z_label = "Probability"
	fig_title_2D = "Distributions in feature space (2D)"
	fig_title_3D = "Distributions in feature space (3D)"



	def assert_between(p):
	if not (grid_region_min < p.x < grid_region_max):
	warnings.warn("Values must be inside the grid i.e between ({}, {})! Therefore they are clipped.".format(grid_region_min, grid_region_max))
	p = Point(grid_region_min if p.x < 0 else grid_region_max, p.y)

	if not (grid_region_min < p.y < grid_region_max):
	warnings.warn("Values must be inside the grid i.e between ({}, {})! Therefore they are clipped.".format(grid_region_min, grid_region_max))
	p = Point(p.x, grid_region_min if p.y < 0 else grid_region_max)

	return p


	def generate_data_space():
	x = np.linspace(grid_region_min, grid_region_max, 1000)
	y = np.linspace(grid_region_min, grid_region_max, 1000)
	xx, yy = np.meshgrid(x, y)
	pos = np.dstack((xx, yy))
	return x, y, xx, yy, pos



	def problem_1_data(scaling_factor, pos):
	# Two Gaussians
	mu1 = np.array([-2, -2])
	mu3 = np.array([2, 2])
	sigma = np.array([3, 3])
	z1 = multivariate_normal(mu1, sigma).pdf(pos) * scaling_factor[0]
	z2 = multivariate_normal(mu3, sigma).pdf(pos) * scaling_factor[1]
	peak1, peak2 = np.max(z1), np.max(z2)

	return z1, z2, (mu1, [], mu3), sigma, peak1, peak2


	def problem_2_data(scaling_factor, pos):
	# 0 is mixture of gaussians now
	# 0 gaussian
	mu1 = np.array([0, -3.5])
	mu2 = np.array([-3.5, 0])
	sigma = np.array([3, 3])
	n_components = 2
	weights = np.ones(n_components, dtype=np.float32) / n_components
	z01 = multivariate_normal(mu1, sigma).pdf(pos) * (scaling_factor[0] * weights[0])
	z02 = multivariate_normal(mu2, sigma).pdf(pos) * (scaling_factor[0] * weights[1])
	z1 = z01 + z02

	# 1 gaussian
	mu3 = np.array([1.5, 1.5])
	sigma = np.array([3, 3])
	z2 = multivariate_normal(mu3, sigma).pdf(pos) * scaling_factor[1]

	peak1, peak2 = np.max(z1), np.max(z2)

	return z1, z2, (mu1, mu2, mu3), sigma, peak1, peak2



	def plot_feature_space(p1=None, p2=None, figsize=(10, 4), scaling_factor=(1, 1), D3=False, optimal_boundary=False, cost_function=None, problem_v=1):

	decision_boundary = False
	if (p1 and p2):
	decision_boundary = True
	if not ((isinstance(p1, tuple) and len(p1) == 2) and (isinstance(p2, tuple) and len(p2) == 2)):
	raise ValueError("P1 and P2 should either be of type Point or tuple of len 2")

	p1 = assert_between(Point(p1[0], p1[1]))
	p2 = assert_between(Point(p2[0], p2[1]))


	x, y, xx, yy, pos = generate_data_space()


	if problem_v == 1:
	data_function = problem_1_data
	else:
	data_function = problem_2_data


	z1, z2, mu, sigma, peak1, peak2 = data_function(scaling_factor, pos)

	fig = plt.figure(figsize=figsize)
	plt.suptitle(sup_title)
	gs = gridspec.GridSpec(1, 2)

	# Plotting in 3D
	if D3:
	ax1 = fig.add_subplot(gs[0], projection='3d')

	with warnings.catch_warnings():
	warnings.simplefilter("ignore")

	ax1.plot_surface(xx, yy, z1, cmap=cm.coolwarm,
	linewidth=0, antialiased=False, alpha=0.5, vmin=np.nanmin(z1), vmax=np.nanmax(z1))

	ax1.plot_surface(xx, yy, z2, cmap=cm.Spectral,
	linewidth=0, antialiased=False, alpha=0.5, vmin=np.nanmin(z2), vmax=np.nanmax(z2))

	mu1 = mu[0]
	mu2 = mu[-1]
	ax1.text(mu1[0], mu1[1], peak1, "class 0", (1, 1,0))
	ax1.text(mu2[0], mu2[1], peak2, "class 1", (1, 1,0))
	ax1.set_title(fig_title_3D)
	ax1.set_xlabel(x_label)
	ax1.set_ylabel(y_label)
	ax1.set_zlabel(z_label)


	# Plotting in 2D
	ax2 = fig.add_subplot(gs[1])

	else:
	ax2 = fig.add_subplot(gs[0])

	ax2.contour(xx, yy, z1, cmap=cm.coolwarm)
	ax2.contour(xx, yy, z2, cmap=cm.Spectral)

	if decision_boundary:
	r1, r2 = xx.flatten(), yy.flatten()
	r1, r2 = r1.reshape((len(r1), 1)), r2.reshape((len(r2), 1))

	grid = np.hstack((r1,r2))

	yhat = np.where(((p2.x - p1.x)( grid[:, 1] - p1.y) - (p2.y - p1.y)(grid[:, 0] - p1.x)) > 0, 1, 0)

	zz = yhat.reshape(xx.shape)

	ax2.contourf(xx, yy, zz, cmap=ListedColormap(['tab:blue', 'tab:orange']), alpha=0.2)
	ax2.axline((p1.x, p1.y), (p2.x, p2.y), c="k")
	ax2.scatter(p1.x, p1.y, marker='X', s=500, c='k')
	ax2.scatter(p2.x, p2.y, marker='X', s=500, c='k')

	mu1 = mu[0]
	mu2 = mu[-1]
	ax2.text(mu1[0], mu1[1], "0", color="b")
	ax2.text(mu2[0], mu2[1], "1", color="k")
	ax2.set_title(fig_title_2D)
	ax2.set_xlabel(x_label)
	ax2.set_ylabel(y_label)


	# Monte Carlo estimation of values
	if decision_boundary:
	n = 5000
	r1 = scaling_factor[0] / sum(scaling_factor)
	r2 = scaling_factor[1] / sum(scaling_factor)


	if problem_v == 1:
	mu1, mu2 = mu[0], mu[-1]
	X, y = make_blobs(n_samples=[floor(n * r1), ceil(n * r2)], centers=[mu1, mu2], cluster_std=sigma, random_state=RANDOM_SEED)
	else:
	mu1, mu2, mu3 = mu[0], mu[1], mu[-1]
	X, y = make_blobs(n_samples=[floor(n * r1 * 0.5), ceil(n * r1 * 0.5), n - floor(n * r1 * 0.5) - ceil(n * r1 * 0.5)], centers=np.array([mu1, mu2, mu3]), cluster_std=[sigma, sigma, sigma], random_state=RANDOM_SEED)
	y[y == 1] = 0 # Converting the 2nd guassian output to 0
	y[y == 2] = 1 # convering the 3rd guassian output to 1

	X, y = monte_carlo_estimates_decision_boundary(X, y, p1, p2, cost_function)

	if optimal_boundary:
	boundary = np.where(z2 >= z1, 1, 0)
	gmm_boundary = ax2.contour(xx, yy, boundary, colors='r')
	ax2.contourf(xx, yy, boundary, cmap=ListedColormap(['tab:green', 'tab:red']), alpha=0.1)


	plt.show()


	def print_metrices(y, prediction):
	metrics = {
	"accuracy": round(accuracy_score(y, prediction), 4),
	"precision": round(precision_score(y, prediction), 4),
	"recall": round(recall_score(y, prediction), 4),
	"F1": round(f1_score(y, prediction, average="micro"), 4),
	"macro F1": round(f1_score(y, prediction, average="macro"), 4)
	}
	{print(f"{k}: {v}", end=' \| ') for k,v in metrics.items()}
	print('')


	def monte_carlo_estimates_decision_boundary(X, y, p1, p2, cost_function=None, n=5000):
	prediction = np.where(((p2.x - p1.x)( X[:, 1] - p1.y) - (p2.y - p1.y)(X[:, 0] - p1.x)) > 0, 1, 0)

	TN, FP, FN, TP = confusion_matrix(y, prediction).ravel()

	print_metrices(y, prediction)

	if cost_function:
	cost = cost_function(TN, FP, FN, TP)
	print(f'Cost: {cost:,.3f}')

	return X, y