# anomaly detection using GMM
# load READY.DAT (56 cols)
from dataload import load_object
x_train, x_test, y_train, y_test = load_object('ready.dat')
x_train.max(axis=0)
del x_test, y_test    # we don't need the test set for now
# under-sample the big classes to make the set manageable
from imblearn.under_sampling import RandomUnderSampler
# note: in newer imblearn releases the 'ratio' argument is 'sampling_strategy'
# and fit_sample() is fit_resample()
rus = RandomUnderSampler(sampling_strategy={'normal': 50000, 'dos': 50000}, random_state=4129)
x_train, y_train = rus.fit_resample(x_train, y_train.attack_type)
from collections import Counter
Counter(y_train)    # check the resulting class balance
from sklearn.mixture import GaussianMixture
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
##############################################
# figure out the optimal value of k (number of components)
# build an AIC/BIC table for n_components in [5:20)
# n=15 seems to be about right
res = np.zeros((15, 2))
for n in range(5, 20):
    gmm = GaussianMixture(n_components=n, n_init=1, random_state=4129, verbose=0)
    gmm.fit(x_train)
    res[n - 5] = [gmm.aic(x_train), gmm.bic(x_train)]
plt.plot(range(5, 20), res)
plt.legend(['AIC', 'BIC'])
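# Pick the component count programmatically (a sketch, not in the original
# gist): take the n with the lowest BIC; column 1 of res holds the BIC values.
best_n = 5 + int(np.argmin(res[:, 1]))
print('BIC-optimal n_components:', best_n)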
##############################################
gmm = GaussianMixture(n_components=15, n_init=3, random_state=4129, verbose=1)
gmm.fit(x_train)
# do PCA onto 2 principal components, purely for visualisation
from sklearn.decomposition import PCA
pca = PCA(2, random_state=4129)
x_train_pca = pca.fit_transform(x_train)
# plot the datapoints, with the GMM component means overlaid
y_cat = pd.Series(y_train).astype('category')
plt.figure(figsize=(10, 6))
plt.scatter(x_train_pca[:, 0], x_train_pca[:, 1], c=y_cat.cat.codes, marker='.', alpha=0.3)
gmm_means_pca = pca.transform(gmm.means_)
plt.scatter(gmm_means_pca[:, 0], gmm_means_pca[:, 1], marker='x', s=100, c='orange')
for i in range(15):
    plt.text(gmm_means_pca[i, 0], gmm_means_pca[i, 1], str(i), color='orange', size='x-large')
# visualise the rare classes 'r2l' and 'u2r'
plt.figure(figsize=(8, 5))
plt.scatter(x_train_pca[y_train == 'r2l', 0], x_train_pca[y_train == 'r2l', 1])
plt.scatter(x_train_pca[y_train == 'u2r', 0], x_train_pca[y_train == 'u2r', 1])
plt.xlim((-1, 2.5))
plt.ylim((-.8, 2.1))
plt.scatter(gmm_means_pca[:, 0], gmm_means_pca[:, 1], marker='x', s=100, c='orange')
for i in range(15):
    plt.text(gmm_means_pca[i, 0], gmm_means_pca[i, 1], str(i), color='brown', size='x-large')
#####################################
# the actual outlier analysis starts here
# 4 ways to compute an 'outlier score':
#   i.   weighted log probability (score_samples())
#   ii.  Mahalanobis distance
#   iii. LOF
#   iv.  1-class SVM

# ii. Mahalanobis distance
from scipy.spatial.distance import mahalanobis
k = gmm.n_components
# Mahalanobis distance of each sample to its nearest component mean
# (precisions_ holds the inverse covariance matrices, which is what mahalanobis() expects)
maha_min = np.apply_along_axis(
    lambda a: [mahalanobis(a, gmm.means_[i], gmm.precisions_[i]) for i in range(k)],
    axis=1, arr=x_train).min(axis=1)
# chop maha_min up by y_train
plt.boxplot([maha_min[y_train == i] for i in np.unique(y_train)], labels=np.unique(y_train), vert=False)
plt.show()
maha_min[np.logical_or(y_train == 'u2r', y_train == 'r2l')]
pd.DataFrame([maha_min[y_train == i].mean() for i in np.unique(y_train)], index=np.unique(y_train), columns=['mean Maha']).T
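# One way to turn maha_min into a hard outlier flag (a sketch, not in the
# original gist): under a single Gaussian the squared Mahalanobis distance is
# chi-square distributed with n_features degrees of freedom, so a high quantile
# of that distribution gives a principled cutoff. The 0.99 level is arbitrary.
from scipy.stats import chi2
cutoff = np.sqrt(chi2.ppf(0.99, df=x_train.shape[1]))
maha_flag = maha_min > cutoff
pd.Series(maha_flag).groupby(y_train).mean()    # fraction flagged per class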
# iii. LOF
from sklearn.neighbors import LocalOutlierFactor
lof = LocalOutlierFactor(n_jobs=-1)
lof_score = lof.fit_predict(x_train)
# -1 for outliers, 1 for inliers
[np.mean(lof_score[y_train == i]) for i in np.unique(y_train)]
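# The binary labels hide how extreme each point is; LOF also exposes a
# continuous score after fitting (a sketch, not in the original gist):
# negative_outlier_factor_ sits near -1 for inliers and is much more negative
# for outliers.
lof_nof = lof.negative_outlier_factor_
pd.DataFrame([lof_nof[y_train == i].mean() for i in np.unique(y_train)],
             index=np.unique(y_train), columns=['mean NOF']).T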
# 2-dim multivariate Gaussian: for a point lying along the correlated
# direction, the Mahalanobis distance shrinks as the correlation increases
mahalanobis([.1, .1], [0, 0], np.linalg.inv(np.array([[1, .4], [.4, 1]])))
mahalanobis([.1, .1], [0, 0], np.linalg.inv(np.array([[1, .6], [.6, 1]])))
x = np.random.multivariate_normal([0, 0], np.array([[1, .4], [.4, 1]]), 2000)
plt.scatter(x[:, 0], x[:, 1], marker='.')
plt.xlim((-3, 3)); plt.ylim((-3, 3))
# weighted sum of Gaussian pdfs
from scipy.stats import norm
norm.pdf(1.96)    # univariate standard-normal density
# norm.pdf() is univariate only; for a full component density use
# multivariate_normal with that component's mean and covariance
from scipy.stats import multivariate_normal
multivariate_normal.pdf(gmm.means_[0], mean=gmm.means_[0], cov=gmm.covariances_[0], allow_singular=True)
gmm.weights_    # mixture weights, one per component
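# Sanity check (a sketch, assuming the default covariance_type='full'): the
# mixture density is the weighted sum of component densities and should match
# np.exp(gmm.score_samples(...)) up to floating-point error.
pt = x_train[:1]
mix_pdf = sum(w * multivariate_normal.pdf(pt, mean=m, cov=c, allow_singular=True)
              for w, m, c in zip(gmm.weights_, gmm.means_, gmm.covariances_))
np.allclose(mix_pdf, np.exp(gmm.score_samples(pt)))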
# i. weighted log probability (score_samples())
# score_samples() gives the weighted log probability of each sample.
# very small values => very low probability under the mixture => likely outlier
plt.plot(gmm.score_samples(x_train))
gmm.score_samples(x_train[y_train == 'r2l'][:9])
gmm.score_samples(x_train[y_train == 'u2r'])
gmm.weights_, gmm.means_
gmm.predict_proba(x_train[:2])    # fit_resample() returns arrays, so plain slicing rather than .iloc
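# A simple way to get outlier flags from score_samples() (a sketch, not in the
# original gist): flag the lowest 1% of log-probabilities; the 1% level is an
# arbitrary choice.
log_prob = gmm.score_samples(x_train)
score_flag = log_prob < np.percentile(log_prob, 1)
pd.Series(score_flag).groupby(y_train).mean()    # fraction flagged per class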
#####################################
# (placed in a different section as 1-class SVM does not use the fitted GMM from above)
# iv. 1-class SVM
# http://scikit-learn.org/stable/modules/svm.html#svm-outlier-detection
# "One-class SVM is used for novelty detection, that is, given a set of samples,
# it will detect the soft boundary of that set so as to classify new points as
# belonging to that set or not. The class that implements this is called OneClassSVM.
#
# In this case, as it is a type of unsupervised learning, the fit method will only
# take as input an array X, as there are no class labels."
from sklearn.svm import OneClassSVM
# kernel='linear' took ~17 min to fit
# (note: random_state was removed from OneClassSVM in newer scikit-learn releases)
sv1 = OneClassSVM(kernel='linear', max_iter=100, verbose=1, random_state=4129)
%time sv1.fit(x_train)
sv1.predict(x_train)             # inlier +1, outlier -1
sv1.decision_function(x_train)   # signed distance: positive for an inlier, negative for an outlier
sv1.support_vectors_             # [nSV, n_features]
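# Mirror the earlier Mahalanobis boxplot with the SVM's signed distances
# (a sketch, not in the original gist): classes whose boxes sit below zero are
# the ones the one-class SVM treats as outliers.
sv1_dist = sv1.decision_function(x_train).ravel()
plt.boxplot([sv1_dist[y_train == i] for i in np.unique(y_train)],
            labels=np.unique(y_train), vert=False)
plt.show()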
#####################################
# yet another way to perform outlier detection:
# Isolation Forest
# http://scikit-learn.org/stable/modules/outlier_detection.html#isolation-forest
from sklearn.ensemble import IsolationForest
# 3 seconds to fit!
iso = IsolationForest(n_estimators=100, max_features=1, n_jobs=1, random_state=4129, verbose=1)
%time iso.fit(x_train)
iso.decision_function(x_train)   # anomaly score of the input samples; the lower, the more abnormal
iso.predict(x_train)             # inlier +1, outlier -1
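# Compare the Isolation Forest scores across classes (a sketch, not in the
# original gist): lower mean scores for the rare attack classes would support
# using it as the outlier detector here.
iso_score = iso.decision_function(x_train)
pd.DataFrame([iso_score[y_train == i].mean() for i in np.unique(y_train)],
             index=np.unique(y_train), columns=['mean iso score']).T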