Skip to content

Instantly share code, notes, and snippets.

@tam17aki
Last active August 30, 2021 08:53
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save tam17aki/1b3d250915600d75a0c9c9980023ea00 to your computer and use it in GitHub Desktop.
Save tam17aki/1b3d250915600d75a0c9c9980023ea00 to your computer and use it in GitHub Desktop.
# -*- coding: utf-8 -*-
"""
Outlier detection based on Gaussian Mixture Model (GMM).
Copyright (C) 2021 by Akira TAMAMORI
Copyright (C) Wei Xue
Copyright (c) 2018, Yue Zhao
BSD 3-Clause License
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright notice, this
list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright notice,
this list of conditions and the following disclaimer in the documentation
and/or other materials provided with the distribution.
* Neither the name of the copyright holder nor the names of its contributors
may be used to endorse or promote products derived from this software without
specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
"""
from typing import NamedTuple, Optional
import numpy
from pyod.models.base import BaseDetector
from pyod.utils.utility import invert_order
from sklearn.mixture import GaussianMixture
from sklearn.utils import check_array
from sklearn.utils.validation import check_is_fitted
class GmmConfig(NamedTuple):
    """Class for configuration of Gaussian Mixture Model.

    Immutable bundle of every hyperparameter forwarded verbatim to
    sklearn.mixture.GaussianMixture by the GMM wrapper's fit().
    Field names and defaults mirror the scikit-learn estimator.
    """

    # Number of mixture components.
    n_components: int = 1
    # One of {'full', 'tied', 'diag', 'spherical'}.
    covariance_type: str = "full"
    # EM convergence threshold on the lower-bound average gain.
    tol: float = 1e-3
    # Non-negative regularization added to covariance diagonals.
    reg_covar: float = 1e-6
    # Maximum number of EM iterations.
    max_iter: int = 100
    # Number of random initializations; the best result is kept.
    n_init: int = 1
    # Initialization method: 'kmeans' or 'random'.
    init_params: str = "kmeans"
    # Optional user-provided initial weights, shape (n_components,).
    weights_init: Optional[numpy.ndarray] = None
    # Optional user-provided initial means, shape (n_components, n_features).
    means_init: Optional[numpy.ndarray] = None
    # Optional user-provided initial precision matrices.
    precisions_init: Optional[numpy.ndarray] = None
    # Seed controlling the initialization randomness.
    random_state: Optional[int] = None
    # If True, reuse the previous fit's solution as initialization.
    warm_start: bool = False
    # Verbosity level for the underlying estimator.
    verbose: int = 0
    # Iterations between verbose prints.
    verbose_interval: int = 10
class GMM(BaseDetector):
    """Wrapper of scikit-learn Gaussian Mixture Model with more functionalities.

    Unsupervised Outlier Detection.
    Estimate the support of a high-dimensional distribution: samples with a
    low log-likelihood under the fitted mixture receive a high outlier score.

    Parameters
    ----------
    n_components : int, default=1
        The number of mixture components.
    covariance_type : {'full', 'tied', 'diag', 'spherical'}, default='full'
        String describing the type of covariance parameters to use.
    tol : float, default=1e-3
        The convergence threshold. EM iterations will stop when the
        lower bound average gain is below this threshold.
    reg_covar : float, default=1e-6
        Non-negative regularization added to the diagonal of covariance.
        Allows to assure that the covariance matrices are all positive.
    max_iter : int, default=100
        The number of EM iterations to perform.
    n_init : int, default=1
        The number of initializations to perform. The best results are kept.
    init_params : {'kmeans', 'random'}, default='kmeans'
        The method used to initialize the weights, the means and the
        precisions.
    weights_init : array-like of shape (n_components, ), default=None
        The user-provided initial weights.
        If it is None, weights are initialized using the `init_params` method.
    means_init : array-like of shape (n_components, n_features), default=None
        The user-provided initial means.
        If it is None, means are initialized using the `init_params` method.
    precisions_init : array-like, default=None
        The user-provided initial precisions (inverse of the covariance
        matrices).
        If it is None, precisions are initialized using the 'init_params'
        method.
    random_state : int, RandomState instance or None, default=None
        Controls the random seed given to the method chosen to initialize the
        parameters.
    warm_start : bool, default=False
        If 'warm_start' is True, the solution of the last fitting is used as
        initialization for the next call of fit().
    verbose : int, default=0
        Enable verbose output.
    verbose_interval : int, default=10
        Number of iteration done before the next print.
    contamination : float in (0., 0.5), optional (default=0.1)
        The amount of contamination of the data set.

    Attributes
    ----------
    weights_ : array-like of shape (n_components,)
        The weights of each mixture components.
    means_ : array-like of shape (n_components, n_features)
        The mean of each mixture component.
    covariances_ : array-like
        The covariance of each mixture component.
    precisions_ : array-like
        The precision matrices for each component in the mixture.
    precisions_cholesky_ : array-like
        The cholesky decomposition of the precision matrices of each mixture
        component.
    converged_ : bool
        True when convergence was reached in fit(), False otherwise.
    n_iter_ : int
        Number of step used by the best fit of EM to reach the convergence.
    lower_bound_ : float
        Lower bound value on the log-likelihood (of the training data with
        respect to the model) of the best fit of EM.
    decision_scores_ : numpy array of shape (n_samples,)
        The outlier scores of the training data.
    threshold_ : float
        The threshold is based on ``contamination``. It is the
        ``n_samples * contamination`` most abnormal samples in
        ``decision_scores_``. The threshold is calculated for generating
        binary outlier labels.
    labels_ : int, either 0 or 1
        The binary labels of the training data. 0 stands for inliers
        and 1 for outliers/anomalies. It is generated by applying
        ``threshold_`` on ``decision_scores_``.
    """

    def __init__(self, n_components=1, covariance_type='full', tol=1e-3,
                 reg_covar=1e-6, max_iter=100, n_init=1, init_params='kmeans',
                 weights_init=None, means_init=None, precisions_init=None,
                 random_state=None, warm_start=False, verbose=0,
                 verbose_interval=10, contamination=0.1):
        super().__init__(contamination=contamination)
        # Collect all GaussianMixture hyperparameters into one immutable
        # record; the estimator itself is built lazily in fit().
        self._config = GmmConfig(
            n_components,
            covariance_type,
            tol,
            reg_covar,
            max_iter,
            n_init,
            init_params,
            weights_init, means_init, precisions_init,
            random_state, warm_start,
            verbose, verbose_interval
        )
        self.detector_ = None           # underlying GaussianMixture, set by fit()
        self.decision_scores_ = None    # training outlier scores, set by fit()

    def fit(self, X, y=None):
        """Fit detector. y is ignored in unsupervised methods.

        Parameters
        ----------
        X : numpy array of shape (n_samples, n_features)
            The input samples.
        y : Ignored
            Not used, present for API consistency by convention.

        Returns
        -------
        self : object
            Fitted estimator.
        """
        # validate inputs X and y (optional)
        X = check_array(X)
        self._set_n_classes(y)
        # Build a fresh estimator from the stored configuration so that
        # repeated fit() calls start from a clean state (unless warm_start).
        self.detector_ = GaussianMixture(
            n_components=self._config.n_components,
            covariance_type=self._config.covariance_type,
            tol=self._config.tol,
            reg_covar=self._config.reg_covar,
            max_iter=self._config.max_iter,
            n_init=self._config.n_init,
            init_params=self._config.init_params,
            weights_init=self._config.weights_init,
            means_init=self._config.means_init,
            precisions_init=self._config.precisions_init,
            random_state=self._config.random_state,
            warm_start=self._config.warm_start,
            verbose=self._config.verbose,
            verbose_interval=self._config.verbose_interval,
        )
        self.detector_.fit(X=X, y=y)
        # invert decision_scores_. Outliers comes with higher outlier scores
        self.decision_scores_ = invert_order(self.detector_.score_samples(X))
        self._process_decision_scores()
        return self

    def decision_function(self, X):
        """Predict raw anomaly score of X using the fitted detector.

        The anomaly score of an input sample is computed based on different
        detector algorithms. For consistency, outliers are assigned with
        larger anomaly scores.

        Parameters
        ----------
        X : numpy array of shape (n_samples, n_features)
            The training input samples. Sparse matrices are accepted only
            if they are supported by the base estimator.

        Returns
        -------
        anomaly_scores : numpy array of shape (n_samples,)
            The anomaly score of the input samples.
        """
        check_is_fitted(self, ['decision_scores_', 'threshold_', 'labels_'])
        # Invert outlier scores. Outliers comes with higher outlier scores
        return invert_order(self.detector_.score_samples(X))

    @property
    def weights_(self):
        """The weights of each mixture components.

        Decorator for scikit-learn Gaussian Mixture Model attributes.
        """
        return self.detector_.weights_

    @property
    def means_(self):
        """The mean of each mixture component.

        Decorator for scikit-learn Gaussian Mixture Model attributes.
        """
        return self.detector_.means_

    @property
    def covariances_(self):
        """The covariance of each mixture component.

        Decorator for scikit-learn Gaussian Mixture Model attributes.
        """
        # Fixed: was 'convariances_' (typo), which raised AttributeError
        # on every access since GaussianMixture has no such attribute.
        return self.detector_.covariances_

    @property
    def precisions_(self):
        """The precision matrices for each component in the mixture.

        Decorator for scikit-learn Gaussian Mixture Model attributes.
        """
        return self.detector_.precisions_

    @property
    def precisions_cholesky_(self):
        """The cholesky decomposition of the precision matrices
        of each mixture component.

        Decorator for scikit-learn Gaussian Mixture Model attributes.
        """
        return self.detector_.precisions_cholesky_

    @property
    def converged_(self):
        """True when convergence was reached in fit(), False otherwise.

        Decorator for scikit-learn Gaussian Mixture Model attributes.
        """
        return self.detector_.converged_

    @property
    def n_iter_(self):
        """Number of step used by the best fit of EM to reach the convergence.

        Decorator for scikit-learn Gaussian Mixture Model attributes.
        """
        return self.detector_.n_iter_

    @property
    def lower_bound_(self):
        """Lower bound value on the log-likelihood of the best fit of EM.

        Decorator for scikit-learn Gaussian Mixture Model attributes.
        """
        return self.detector_.lower_bound_
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment