duarteocarmo/3_1_Outliers.py

## 3_1_Outliers.py
# exercise 11.4.1
import numpy as np
from matplotlib.pyplot import (figure, imshow, bar, title, xticks, yticks, cm,
                               subplot, show, legend, hold)
from matplotlib.pyplot import (figure, hold, subplot, plot, xlabel, ylabel,
                               xticks, yticks,legend,show)
from scipy.io import loadmat
from toolbox_02450 import gausKernelDensity
from sklearn.neighbors import NearestNeighbors
from Project_Clean_data import raw, header, standardize_this
import statistics
import matplotlib.pyplot as plt
import seaborn as sns
import math
from collections import Counter
from textwrap import wrap


toggle_plot = True
target_attribute_name = 'Dx'
target_index = list(header).index(target_attribute_name)
X = raw
X = standardize_this(X)
y = X[:, target_index]
attributeNames = np.delete(header, target_index)
classNames = ['Non-Outlier', 'Possible Outlier']
N, M = X.shape

window = N

density_matrix = np.zeros((N, 1))


### Gausian Kernel density estimator
# cross-validate kernel width by leave-one-out-cross-validation
# (efficient implementation in gausKernelDensity function)
# evaluate for range of kernel widths
widths = X.var(axis=0).max() * (2.0 ** np.arange(-10, 3))
logP = np.zeros(np.size(widths))
for i, w in enumerate(widths):
    print('Fold {:2d}, w={:f}'.format(i, w))
    density, log_density = gausKernelDensity(X, w)
    logP[i] = log_density.sum()

val = logP.max()
ind = logP.argmax()

width = widths[ind]
print('Optimal estimated width is: {0}'.format(width))

# evaluate density for estimated width
density, log_density = gausKernelDensity(X, width)

density_matrix = np.hstack((density_matrix, density))


# Sort the densities
i = (density.argsort(axis=0)).ravel()
density = density[i].reshape(-1, )

# Plot density estimate of outlier score
if toggle_plot:
    figure(1)
    bar(range(window), density[:window])
    title('Gaussian Kernel: Density estimate')

show()

### K-neighbors density estimator
# Neighbor to use:
K = 5

# Find the k nearest neighbors
knn = NearestNeighbors(n_neighbors=K).fit(X)
D, i = knn.kneighbors(X)

density = 1. / (D.sum(axis=1) / K)
density_toappend = 1. / (D.sum(axis=1) / K)
density_toappend.shape = (density_toappend.shape[0], 1)
density_matrix = np.hstack((density_matrix, density_toappend))

# Sort the scores
i = density.argsort()
density = density[i]

# Plot k-neighbor estimate of outlier score (distances)
if toggle_plot:
    figure(3)
    bar(range(window), density[:window])
    title('KNN density: Outlier score')
    show()

### K-nearest neigbor average relative density
# Compute the average relative density

knn = NearestNeighbors(n_neighbors=K).fit(X)
D, i = knn.kneighbors(X)
density = 1. / (D.sum(axis=1) / K)
avg_rel_density = density / (density[i[:, 1:]].sum(axis=1) / K)
avg_rel_density_to_append = density / (density[i[:, 1:]].sum(axis=1) / K)
avg_rel_density_to_append.shape = (avg_rel_density_to_append.shape[0], 1)
density_matrix = np.hstack((density_matrix, avg_rel_density_to_append))

# Sort the avg.rel.densities
i_avg_rel = avg_rel_density.argsort()
avg_rel_density = avg_rel_density[i_avg_rel]

# Plot k-neighbor estimate of outlier score (distances)
if toggle_plot:
    figure(5)
    bar(range(window), avg_rel_density[:window])
    title('KNN average relative density: Outlier score')
    show()

density_matrix = np.delete(density_matrix, 0, axis=1)

indexes = np.argsort(density_matrix, axis=0)

Check_the_last = 50

method = ['GK', 'KN', 'KNA']

outs = []

for i in range(density_matrix.shape[1]):
    print('The outliers for method {}:'.format(method[i]))
    printer = []
    for j in range(density_matrix.shape[0]):
        if indexes[j, i] <= 40:
            printer.append(j)
            outs.append(j)
    print(printer)


regularity = Counter(outs)
plotthese = []
for element in regularity.most_common(30):
    if element[1] > 1:
        print('Observation {} is an outlier because it appears the lists of lower densities {} times.'
              .format(element[0], element[1]))
        plotthese.append(element[0])


outlier_features = X[list(set(plotthese)), :]

if toggle_plot:
    sns.heatmap(outlier_features.transpose(), cbar=False, yticklabels=header, xticklabels=plotthese)
    plt.yticks(rotation=0)
    plt.show()

C = 2

# Keep main attributes and header
important_attributes = [0, 1, 2, 3, 5, 8, 11]
main_raw = raw[:,important_attributes]
main_header = header[important_attributes]


main_header = [ '\n'.join(wrap(l, 20)) for l in main_header ]


# Compute values of N, M and C.
M = len(important_attributes)
X = main_raw
z = [0 for x in range(X.shape[0])]

for i in range(len(z)):
    if i in plotthese:
        z[i] = 1

z = np.asarray(z)

flatui = ["#457b9d", "#e63946", "#95a5a6", "#e74c3c", "#34495e", "#2ecc71"]

sns.set_palette(sns.color_palette(flatui))

figure(figsize=(12,10))
hold(True)
for m1 in range(M):
    for m2 in range(M):
        subplot(M, M, m1*M + m2 + 1)
        for c in range(C):
            class_mask = (z == c)
            plot(np.array(X[class_mask, m2]), np.array(X[class_mask, m1]), '.')
            if m1==M-1:
                xlabel(main_header[m2], fontsize=7)
            else:
                xticks([])
            if m2==0:
                ylabel(main_header[m1], fontsize=7)
            else:
                yticks([])

legend(classNames)

show()
	# exercise 11.4.1
	import numpy as np
	from matplotlib.pyplot import (figure, imshow, bar, title, xticks, yticks, cm,
	subplot, show, legend, hold)
	from matplotlib.pyplot import (figure, hold, subplot, plot, xlabel, ylabel,
	xticks, yticks,legend,show)
	from scipy.io import loadmat
	from toolbox_02450 import gausKernelDensity
	from sklearn.neighbors import NearestNeighbors
	from Project_Clean_data import raw, header, standardize_this
	import statistics
	import matplotlib.pyplot as plt
	import seaborn as sns
	import math
	from collections import Counter
	from textwrap import wrap


	toggle_plot = True
	target_attribute_name = 'Dx'
	target_index = list(header).index(target_attribute_name)
	X = raw
	X = standardize_this(X)
	y = X[:, target_index]
	attributeNames = np.delete(header, target_index)
	classNames = ['Non-Outlier', 'Possible Outlier']
	N, M = X.shape

	window = N

	density_matrix = np.zeros((N, 1))


	### Gausian Kernel density estimator
	# cross-validate kernel width by leave-one-out-cross-validation
	# (efficient implementation in gausKernelDensity function)
	# evaluate for range of kernel widths
	widths = X.var(axis=0).max() * (2.0 ** np.arange(-10, 3))
	logP = np.zeros(np.size(widths))
	for i, w in enumerate(widths):
	print('Fold {:2d}, w={:f}'.format(i, w))
	density, log_density = gausKernelDensity(X, w)
	logP[i] = log_density.sum()

	val = logP.max()
	ind = logP.argmax()

	width = widths[ind]
	print('Optimal estimated width is: {0}'.format(width))

	# evaluate density for estimated width
	density, log_density = gausKernelDensity(X, width)

	density_matrix = np.hstack((density_matrix, density))


	# Sort the densities
	i = (density.argsort(axis=0)).ravel()
	density = density[i].reshape(-1, )

	# Plot density estimate of outlier score
	if toggle_plot:
	figure(1)
	bar(range(window), density[:window])
	title('Gaussian Kernel: Density estimate')

	show()

	### K-neighbors density estimator
	# Neighbor to use:
	K = 5

	# Find the k nearest neighbors
	knn = NearestNeighbors(n_neighbors=K).fit(X)
	D, i = knn.kneighbors(X)

	density = 1. / (D.sum(axis=1) / K)
	density_toappend = 1. / (D.sum(axis=1) / K)
	density_toappend.shape = (density_toappend.shape[0], 1)
	density_matrix = np.hstack((density_matrix, density_toappend))

	# Sort the scores
	i = density.argsort()
	density = density[i]

	# Plot k-neighbor estimate of outlier score (distances)
	if toggle_plot:
	figure(3)
	bar(range(window), density[:window])
	title('KNN density: Outlier score')
	show()

	### K-nearest neigbor average relative density
	# Compute the average relative density

	knn = NearestNeighbors(n_neighbors=K).fit(X)
	D, i = knn.kneighbors(X)
	density = 1. / (D.sum(axis=1) / K)
	avg_rel_density = density / (density[i[:, 1:]].sum(axis=1) / K)
	avg_rel_density_to_append = density / (density[i[:, 1:]].sum(axis=1) / K)
	avg_rel_density_to_append.shape = (avg_rel_density_to_append.shape[0], 1)
	density_matrix = np.hstack((density_matrix, avg_rel_density_to_append))

	# Sort the avg.rel.densities
	i_avg_rel = avg_rel_density.argsort()
	avg_rel_density = avg_rel_density[i_avg_rel]

	# Plot k-neighbor estimate of outlier score (distances)
	if toggle_plot:
	figure(5)
	bar(range(window), avg_rel_density[:window])
	title('KNN average relative density: Outlier score')
	show()

	density_matrix = np.delete(density_matrix, 0, axis=1)

	indexes = np.argsort(density_matrix, axis=0)

	Check_the_last = 50

	method = ['GK', 'KN', 'KNA']

	outs = []

	for i in range(density_matrix.shape[1]):
	print('The outliers for method {}:'.format(method[i]))
	printer = []
	for j in range(density_matrix.shape[0]):
	if indexes[j, i] <= 40:
	printer.append(j)
	outs.append(j)
	print(printer)


	regularity = Counter(outs)
	plotthese = []
	for element in regularity.most_common(30):
	if element[1] > 1:
	print('Observation {} is an outlier because it appears the lists of lower densities {} times.'
	.format(element[0], element[1]))
	plotthese.append(element[0])


	outlier_features = X[list(set(plotthese)), :]

	if toggle_plot:
	sns.heatmap(outlier_features.transpose(), cbar=False, yticklabels=header, xticklabels=plotthese)
	plt.yticks(rotation=0)
	plt.show()

	C = 2

	# Keep main attributes and header
	important_attributes = [0, 1, 2, 3, 5, 8, 11]
	main_raw = raw[:,important_attributes]
	main_header = header[important_attributes]


	main_header = [ '\n'.join(wrap(l, 20)) for l in main_header ]


	# Compute values of N, M and C.
	M = len(important_attributes)
	X = main_raw
	z = [0 for x in range(X.shape[0])]

	for i in range(len(z)):
	if i in plotthese:
	z[i] = 1

	z = np.asarray(z)

	flatui = ["#457b9d", "#e63946", "#95a5a6", "#e74c3c", "#34495e", "#2ecc71"]

	sns.set_palette(sns.color_palette(flatui))

	figure(figsize=(12,10))
	hold(True)
	for m1 in range(M):
	for m2 in range(M):
	subplot(M, M, m1*M + m2 + 1)
	for c in range(C):
	class_mask = (z == c)
	plot(np.array(X[class_mask, m2]), np.array(X[class_mask, m1]), '.')
	if m1==M-1:
	xlabel(main_header[m2], fontsize=7)
	else:
	xticks([])
	if m2==0:
	ylabel(main_header[m1], fontsize=7)
	else:
	yticks([])

	legend(classNames)

	show()