Skip to content

Instantly share code, notes, and snippets.

@duarteocarmo
Created December 3, 2017 14:11
Show Gist options
  • Save duarteocarmo/5da6851dfd0f7b6351622d674ad3ff38 to your computer and use it in GitHub Desktop.
Save duarteocarmo/5da6851dfd0f7b6351622d674ad3ff38 to your computer and use it in GitHub Desktop.
# exercise 11.4.1
import numpy as np
from matplotlib.pyplot import (figure, imshow, bar, title, xticks, yticks, cm,
subplot, show, legend, hold)
from matplotlib.pyplot import (figure, hold, subplot, plot, xlabel, ylabel,
xticks, yticks,legend,show)
from scipy.io import loadmat
from toolbox_02450 import gausKernelDensity
from sklearn.neighbors import NearestNeighbors
from Project_Clean_data import raw, header, standardize_this
import statistics
import matplotlib.pyplot as plt
import seaborn as sns
import math
from collections import Counter
from textwrap import wrap
toggle_plot = True
target_attribute_name = 'Dx'
target_index = list(header).index(target_attribute_name)
X = raw
X = standardize_this(X)
y = X[:, target_index]
attributeNames = np.delete(header, target_index)
classNames = ['Non-Outlier', 'Possible Outlier']
N, M = X.shape
window = N
density_matrix = np.zeros((N, 1))
### Gausian Kernel density estimator
# cross-validate kernel width by leave-one-out-cross-validation
# (efficient implementation in gausKernelDensity function)
# evaluate for range of kernel widths
widths = X.var(axis=0).max() * (2.0 ** np.arange(-10, 3))
logP = np.zeros(np.size(widths))
for i, w in enumerate(widths):
print('Fold {:2d}, w={:f}'.format(i, w))
density, log_density = gausKernelDensity(X, w)
logP[i] = log_density.sum()
val = logP.max()
ind = logP.argmax()
width = widths[ind]
print('Optimal estimated width is: {0}'.format(width))
# evaluate density for estimated width
density, log_density = gausKernelDensity(X, width)
density_matrix = np.hstack((density_matrix, density))
# Sort the densities
i = (density.argsort(axis=0)).ravel()
density = density[i].reshape(-1, )
# Plot density estimate of outlier score
if toggle_plot:
figure(1)
bar(range(window), density[:window])
title('Gaussian Kernel: Density estimate')
show()
### K-neighbors density estimator
# Neighbor to use:
K = 5
# Find the k nearest neighbors
knn = NearestNeighbors(n_neighbors=K).fit(X)
D, i = knn.kneighbors(X)
density = 1. / (D.sum(axis=1) / K)
density_toappend = 1. / (D.sum(axis=1) / K)
density_toappend.shape = (density_toappend.shape[0], 1)
density_matrix = np.hstack((density_matrix, density_toappend))
# Sort the scores
i = density.argsort()
density = density[i]
# Plot k-neighbor estimate of outlier score (distances)
if toggle_plot:
figure(3)
bar(range(window), density[:window])
title('KNN density: Outlier score')
show()
### K-nearest neigbor average relative density
# Compute the average relative density
knn = NearestNeighbors(n_neighbors=K).fit(X)
D, i = knn.kneighbors(X)
density = 1. / (D.sum(axis=1) / K)
avg_rel_density = density / (density[i[:, 1:]].sum(axis=1) / K)
avg_rel_density_to_append = density / (density[i[:, 1:]].sum(axis=1) / K)
avg_rel_density_to_append.shape = (avg_rel_density_to_append.shape[0], 1)
density_matrix = np.hstack((density_matrix, avg_rel_density_to_append))
# Sort the avg.rel.densities
i_avg_rel = avg_rel_density.argsort()
avg_rel_density = avg_rel_density[i_avg_rel]
# Plot k-neighbor estimate of outlier score (distances)
if toggle_plot:
figure(5)
bar(range(window), avg_rel_density[:window])
title('KNN average relative density: Outlier score')
show()
density_matrix = np.delete(density_matrix, 0, axis=1)
indexes = np.argsort(density_matrix, axis=0)
Check_the_last = 50
method = ['GK', 'KN', 'KNA']
outs = []
for i in range(density_matrix.shape[1]):
print('The outliers for method {}:'.format(method[i]))
printer = []
for j in range(density_matrix.shape[0]):
if indexes[j, i] <= 40:
printer.append(j)
outs.append(j)
print(printer)
regularity = Counter(outs)
plotthese = []
for element in regularity.most_common(30):
if element[1] > 1:
print('Observation {} is an outlier because it appears the lists of lower densities {} times.'
.format(element[0], element[1]))
plotthese.append(element[0])
outlier_features = X[list(set(plotthese)), :]
if toggle_plot:
sns.heatmap(outlier_features.transpose(), cbar=False, yticklabels=header, xticklabels=plotthese)
plt.yticks(rotation=0)
plt.show()
C = 2
# Keep main attributes and header
important_attributes = [0, 1, 2, 3, 5, 8, 11]
main_raw = raw[:,important_attributes]
main_header = header[important_attributes]
main_header = [ '\n'.join(wrap(l, 20)) for l in main_header ]
# Compute values of N, M and C.
M = len(important_attributes)
X = main_raw
z = [0 for x in range(X.shape[0])]
for i in range(len(z)):
if i in plotthese:
z[i] = 1
z = np.asarray(z)
flatui = ["#457b9d", "#e63946", "#95a5a6", "#e74c3c", "#34495e", "#2ecc71"]
sns.set_palette(sns.color_palette(flatui))
figure(figsize=(12,10))
hold(True)
for m1 in range(M):
for m2 in range(M):
subplot(M, M, m1*M + m2 + 1)
for c in range(C):
class_mask = (z == c)
plot(np.array(X[class_mask, m2]), np.array(X[class_mask, m1]), '.')
if m1==M-1:
xlabel(main_header[m2], fontsize=7)
else:
xticks([])
if m2==0:
ylabel(main_header[m1], fontsize=7)
else:
yticks([])
legend(classNames)
show()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment