Created
December 3, 2017 14:11
-
-
Save duarteocarmo/5da6851dfd0f7b6351622d674ad3ff38 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# exercise 11.4.1 | |
import numpy as np | |
from matplotlib.pyplot import (figure, imshow, bar, title, xticks, yticks, cm, | |
subplot, show, legend, hold) | |
from matplotlib.pyplot import (figure, hold, subplot, plot, xlabel, ylabel, | |
xticks, yticks,legend,show) | |
from scipy.io import loadmat | |
from toolbox_02450 import gausKernelDensity | |
from sklearn.neighbors import NearestNeighbors | |
from Project_Clean_data import raw, header, standardize_this | |
import statistics | |
import matplotlib.pyplot as plt | |
import seaborn as sns | |
import math | |
from collections import Counter | |
from textwrap import wrap | |
toggle_plot = True | |
target_attribute_name = 'Dx' | |
target_index = list(header).index(target_attribute_name) | |
X = raw | |
X = standardize_this(X) | |
y = X[:, target_index] | |
attributeNames = np.delete(header, target_index) | |
classNames = ['Non-Outlier', 'Possible Outlier'] | |
N, M = X.shape | |
window = N | |
density_matrix = np.zeros((N, 1)) | |
### Gausian Kernel density estimator | |
# cross-validate kernel width by leave-one-out-cross-validation | |
# (efficient implementation in gausKernelDensity function) | |
# evaluate for range of kernel widths | |
widths = X.var(axis=0).max() * (2.0 ** np.arange(-10, 3)) | |
logP = np.zeros(np.size(widths)) | |
for i, w in enumerate(widths): | |
print('Fold {:2d}, w={:f}'.format(i, w)) | |
density, log_density = gausKernelDensity(X, w) | |
logP[i] = log_density.sum() | |
val = logP.max() | |
ind = logP.argmax() | |
width = widths[ind] | |
print('Optimal estimated width is: {0}'.format(width)) | |
# evaluate density for estimated width | |
density, log_density = gausKernelDensity(X, width) | |
density_matrix = np.hstack((density_matrix, density)) | |
# Sort the densities | |
i = (density.argsort(axis=0)).ravel() | |
density = density[i].reshape(-1, ) | |
# Plot density estimate of outlier score | |
if toggle_plot: | |
figure(1) | |
bar(range(window), density[:window]) | |
title('Gaussian Kernel: Density estimate') | |
show() | |
### K-neighbors density estimator | |
# Neighbor to use: | |
K = 5 | |
# Find the k nearest neighbors | |
knn = NearestNeighbors(n_neighbors=K).fit(X) | |
D, i = knn.kneighbors(X) | |
density = 1. / (D.sum(axis=1) / K) | |
density_toappend = 1. / (D.sum(axis=1) / K) | |
density_toappend.shape = (density_toappend.shape[0], 1) | |
density_matrix = np.hstack((density_matrix, density_toappend)) | |
# Sort the scores | |
i = density.argsort() | |
density = density[i] | |
# Plot k-neighbor estimate of outlier score (distances) | |
if toggle_plot: | |
figure(3) | |
bar(range(window), density[:window]) | |
title('KNN density: Outlier score') | |
show() | |
### K-nearest neigbor average relative density | |
# Compute the average relative density | |
knn = NearestNeighbors(n_neighbors=K).fit(X) | |
D, i = knn.kneighbors(X) | |
density = 1. / (D.sum(axis=1) / K) | |
avg_rel_density = density / (density[i[:, 1:]].sum(axis=1) / K) | |
avg_rel_density_to_append = density / (density[i[:, 1:]].sum(axis=1) / K) | |
avg_rel_density_to_append.shape = (avg_rel_density_to_append.shape[0], 1) | |
density_matrix = np.hstack((density_matrix, avg_rel_density_to_append)) | |
# Sort the avg.rel.densities | |
i_avg_rel = avg_rel_density.argsort() | |
avg_rel_density = avg_rel_density[i_avg_rel] | |
# Plot k-neighbor estimate of outlier score (distances) | |
if toggle_plot: | |
figure(5) | |
bar(range(window), avg_rel_density[:window]) | |
title('KNN average relative density: Outlier score') | |
show() | |
density_matrix = np.delete(density_matrix, 0, axis=1) | |
indexes = np.argsort(density_matrix, axis=0) | |
Check_the_last = 50 | |
method = ['GK', 'KN', 'KNA'] | |
outs = [] | |
for i in range(density_matrix.shape[1]): | |
print('The outliers for method {}:'.format(method[i])) | |
printer = [] | |
for j in range(density_matrix.shape[0]): | |
if indexes[j, i] <= 40: | |
printer.append(j) | |
outs.append(j) | |
print(printer) | |
regularity = Counter(outs) | |
plotthese = [] | |
for element in regularity.most_common(30): | |
if element[1] > 1: | |
print('Observation {} is an outlier because it appears the lists of lower densities {} times.' | |
.format(element[0], element[1])) | |
plotthese.append(element[0]) | |
outlier_features = X[list(set(plotthese)), :] | |
if toggle_plot: | |
sns.heatmap(outlier_features.transpose(), cbar=False, yticklabels=header, xticklabels=plotthese) | |
plt.yticks(rotation=0) | |
plt.show() | |
C = 2 | |
# Keep main attributes and header | |
important_attributes = [0, 1, 2, 3, 5, 8, 11] | |
main_raw = raw[:,important_attributes] | |
main_header = header[important_attributes] | |
main_header = [ '\n'.join(wrap(l, 20)) for l in main_header ] | |
# Compute values of N, M and C. | |
M = len(important_attributes) | |
X = main_raw | |
z = [0 for x in range(X.shape[0])] | |
for i in range(len(z)): | |
if i in plotthese: | |
z[i] = 1 | |
z = np.asarray(z) | |
flatui = ["#457b9d", "#e63946", "#95a5a6", "#e74c3c", "#34495e", "#2ecc71"] | |
sns.set_palette(sns.color_palette(flatui)) | |
figure(figsize=(12,10)) | |
hold(True) | |
for m1 in range(M): | |
for m2 in range(M): | |
subplot(M, M, m1*M + m2 + 1) | |
for c in range(C): | |
class_mask = (z == c) | |
plot(np.array(X[class_mask, m2]), np.array(X[class_mask, m1]), '.') | |
if m1==M-1: | |
xlabel(main_header[m2], fontsize=7) | |
else: | |
xticks([]) | |
if m2==0: | |
ylabel(main_header[m1], fontsize=7) | |
else: | |
yticks([]) | |
legend(classNames) | |
show() | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment