@sharonwoo
Forked from ivan-mitb/outlier.py
Created October 5, 2018 10:06
anomaly detection using GMM
# load READY.DAT (56 cols)
from dataload import load_object
x_train, x_test, y_train, y_test = load_object('ready.dat')
x_train.max(axis=0)
del x_test, y_test # we don't need the test set for now
# under-sample the big classes to make the set manageable
from imblearn.under_sampling import RandomUnderSampler
rus = RandomUnderSampler(ratio={'normal':50000, 'dos':50000}, random_state=4129)
x_train, y_train = rus.fit_sample(x_train, y_train.attack_type)
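# note: ratio= and fit_sample() follow the imbalanced-learn API current when this
# gist was written; newer releases rename them sampling_strategy= and fit_resample()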
# from collections import Counter
# Counter(y_train)   # check the per-class counts after resampling
from sklearn.mixture import GaussianMixture
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
##############################################
# figure out the optimal value of k (no. of components/centroids)
# build an AIC/BIC table for n_components in [5, 20)
# n=15 seems about right
res = np.zeros((15, 2))
for n in range(5, 20):
    gmm = GaussianMixture(n_components=n, n_init=1, random_state=4129, verbose=0)
    gmm.fit(x_train)
    res[n-5] = [gmm.aic(x_train), gmm.bic(x_train)]
plt.plot(res)
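# (sketch, not in the original gist) pick k straight off the table:
# column 1 holds the BIC values and rows start at n_components=5
best_n = int(np.argmin(res[:, 1])) + 5
print('lowest-BIC n_components:', best_n)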
##############################################
gmm = GaussianMixture(n_components=15, n_init=3, random_state=4129, verbose=1)
gmm.fit(x_train)
# do PCA onto 2 princomps, purely for visualisation
from sklearn.decomposition import PCA
pca = PCA(2, random_state=4129)
x_train_pca = pca.fit_transform(x_train)
# plot the datapoints, and the k-centroids overlaid
y_cat = pd.Series(y_train).astype('category')
plt.figure(figsize=(10,6))
plt.scatter(x_train_pca[:,0], x_train_pca[:,1], c=y_cat.cat.codes, marker='.', alpha=0.3)
gmm_means_pca = pca.transform(gmm.means_)
plt.scatter(gmm_means_pca[:,0], gmm_means_pca[:,1], marker='x', s=100, c='orange')
for i in range(15):
    plt.text(gmm_means_pca[:, 0][i], gmm_means_pca[:, 1][i], str(i), color='orange', size='x-large')
# visualise the rare class 'r2l' 'u2r'
plt.figure(figsize=(8,5))
plt.scatter(x_train_pca[y_train == 'r2l', 0], x_train_pca[y_train == 'r2l', 1])
plt.scatter(x_train_pca[y_train == 'u2r', 0], x_train_pca[y_train == 'u2r', 1])
plt.xlim((-1,2.5))
plt.ylim((-.8,2.1))
plt.scatter(gmm_means_pca[:,0], gmm_means_pca[:,1], marker='x', s=100, c='orange')
for i in range(15):
    plt.text(gmm_means_pca[:, 0][i], gmm_means_pca[:, 1][i], str(i), color='brown', size='x-large')
#####################################
# the actual outlier analysis starts here
# 4 ways to compute an 'outlier score':
# i. weighted log probability (score_samples())
# ii. Mahalanobis distance
# iii. LOF
# iv. 1-class SVM
# ii. Mahalanobis distance
from scipy.spatial.distance import mahalanobis
maha_min = np.apply_along_axis(
    lambda a: [mahalanobis(a, gmm.means_[i], gmm.precisions_[i]) for i in range(gmm.n_components)],
    axis=1, arr=x_train).min(axis=1)
# chop maha_min up by y_train
plt.boxplot([maha_min[y_train == i] for i in np.unique(y_train)], labels=np.unique(y_train), vert=False)
plt.show()
maha_min[np.logical_or(y_train == 'u2r', y_train == 'r2l')]
pd.DataFrame([maha_min[y_train == i].mean() for i in np.unique(y_train)], index=np.unique(y_train), columns=['mean Maha']).T
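# (sketch, not in the original gist) turn the distances into a hard flag by
# thresholding; the 99th-percentile cut-off is an assumed choice for illustration
maha_cut = np.percentile(maha_min, 99)
pd.Series(y_train[maha_min > maha_cut]).value_counts()   # which classes get flagged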
# iii. LOF
from sklearn.neighbors import LocalOutlierFactor
lof = LocalOutlierFactor(n_jobs=-1)
lof_score = lof.fit_predict(x_train)
# -1 for outliers, 1 for inliers
[np.mean(lof_score[y_train == i]) for i in np.unique(y_train)]
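# (sketch, not in the original gist) the raw factors are also available after
# fitting via negative_outlier_factor_; more negative => more outlying
pd.DataFrame([lof.negative_outlier_factor_[y_train == i].mean() for i in np.unique(y_train)],
             index=np.unique(y_train), columns=['mean -LOF']).T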
# 2-dim multivariate Gaussian
mahalanobis([.1,.1], [0,0], np.linalg.inv(np.array([[1,.4],[.4,1]])))
mahalanobis([.1,.1], [0,0], np.linalg.inv(np.array([[1,.6],[.6,1]])))
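# (sketch, not in the original gist) the same number via the defining quadratic
# form sqrt((x-mu)' S^-1 (x-mu)), to make explicit what mahalanobis() computes
d = np.array([.1, .1]) - np.array([0, 0])
S_inv = np.linalg.inv(np.array([[1, .4], [.4, 1]]))
np.sqrt(d @ S_inv @ d)   # matches the first mahalanobis() call above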
x = np.random.multivariate_normal([0,0], np.array([[1,.4],[.4,1]]), 2000)
plt.scatter(x[:,0], x[:,1], marker='.')
plt.xlim((-3,3)); plt.ylim((-3,3))
# weighted sum of Gaussian pdfs
from scipy.stats import norm
norm.pdf(1.96)
from scipy.stats import multivariate_normal   # multivariate counterpart of norm
sum(w * multivariate_normal.pdf(gmm.means_[0], mean=m, cov=c, allow_singular=True)
    for w, m, c in zip(gmm.weights_, gmm.means_, gmm.covariances_))  # mixture density at a point; equals exp(score_samples) there
gmm.weights_
# i. weighted log probability (score_samples())
# score_samples() gives the weighted log probability of each sample.
# very small values => very low probability under the mixture => likely outlier
plt.plot(gmm.score_samples(x_train))
gmm.score_samples(x_train[y_train == 'r2l'][:9])
gmm.score_samples(x_train[y_train == 'u2r'])
gmm.weights_, gmm.means_
gmm.predict_proba(x_train[:2])
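# (sketch, not in the original gist) one simple decision rule: flag the
# lowest-scoring 1% of samples as outliers; the 1% cut-off is an assumption
score = gmm.score_samples(x_train)
cut = np.percentile(score, 1)
pd.Series(y_train[score < cut]).value_counts()   # class breakdown of the flagged points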
#####################################
# (placed in a different section as 1-class SVM does not use the fitted GMM from above)
# iv. 1-class SVM
# http://scikit-learn.org/stable/modules/svm.html#svm-outlier-detection
# One-class SVM is used for novelty detection, that is, given a set of samples, it will detect the soft boundary of that set so as to classify new points as belonging to that set or not. The class that implements this is called OneClassSVM.
#
# In this case, as it is a type of unsupervised learning, the fit method will only take as input an array X, as there are no class labels.
from sklearn.svm import OneClassSVM
# 'linear' 17min to fit
sv1 = OneClassSVM(kernel='linear', max_iter=100, verbose=1, random_state=4129)
%time sv1.fit(x_train)
sv1.predict(x_train) # inlier +1, outlier -1
sv1.decision_function(x_train) # Signed distance is positive for an inlier and negative for an outlier.
sv1.support_vectors_ # [nSV, n_features]
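# (sketch, not in the original gist) cross-tabulate the SVM's inlier/outlier
# calls against the known attack labels to see which classes it flags
pd.crosstab(pd.Series(y_train, name='class'), pd.Series(sv1.predict(x_train), name='svm_pred'))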
#####################################
# yet another way to perform outlier detection
# Isolation Forest
# http://scikit-learn.org/stable/modules/outlier_detection.html#isolation-forest
from sklearn.ensemble import IsolationForest
# 3 seconds to fit !
iso = IsolationForest(n_estimators=100, max_features=1, n_jobs=1, random_state=4129, verbose=1)
%time iso.fit(x_train)
iso.decision_function(x_train) # The anomaly score of the input samples. The lower, the more abnormal.
iso.predict(x_train) # inlier +1, outlier -1
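# (sketch, not in the original gist) per-class mean anomaly score; the rare
# classes should come out lower if the forest is doing its job
iso_score = iso.decision_function(x_train)
pd.DataFrame([iso_score[y_train == i].mean() for i in np.unique(y_train)],
             index=np.unique(y_train), columns=['mean iso score']).T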