Last active
August 30, 2021 08:53
-
-
Save tam17aki/1b3d250915600d75a0c9c9980023ea00 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# -*- coding: utf-8 -*- | |
""" | |
Outlier detection based on Gaussian Mixture Model (GMM). | |
Copyright (C) 2021 by Akira TAMAMORI | |
Copyright (C) Wei Xue | |
Copyright (c) 2018, Yue Zhao | |
BSD 3-Clause License | |
Redistribution and use in source and binary forms, with or without | |
modification, are permitted provided that the following conditions are met: | |
* Redistributions of source code must retain the above copyright notice, this | |
list of conditions and the following disclaimer. | |
* Redistributions in binary form must reproduce the above copyright notice, | |
this list of conditions and the following disclaimer in the documentation | |
and/or other materials provided with the distribution. | |
* Neither the name of the copyright holder nor the names of its contributors | |
may be used to endorse or promote products derived from this software without | |
specific prior written permission. | |
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND | |
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED | |
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE | |
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE | |
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE | |
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |
""" | |
from typing import NamedTuple, Optional | |
import numpy | |
from pyod.models.base import BaseDetector | |
from pyod.utils.utility import invert_order | |
from sklearn.mixture import GaussianMixture | |
from sklearn.utils import check_array | |
from sklearn.utils.validation import check_is_fitted | |
class GmmConfig(NamedTuple):
    """Class for configuration of Gaussian Mixture Model.

    Holds the hyper-parameters forwarded verbatim to
    sklearn.mixture.GaussianMixture. NOTE: GMM.__init__ constructs this
    tuple with *positional* arguments, so the field order below is part
    of the interface — do not reorder fields.
    """

    # Number of mixture components.
    n_components: int = 1
    # Covariance parameterization: 'full', 'tied', 'diag' or 'spherical'.
    covariance_type: str = "full"
    # EM convergence threshold on the lower-bound average gain.
    tol: float = 1e-3
    # Non-negative regularization added to the covariance diagonal.
    reg_covar: float = 1e-6
    # Maximum number of EM iterations.
    max_iter: int = 100
    # Number of random initializations; the best result is kept.
    n_init: int = 1
    # Initialization method: 'kmeans' or 'random'.
    init_params: str = "kmeans"
    # Optional user-provided initial weights, shape (n_components,).
    weights_init: Optional[numpy.ndarray] = None
    # Optional user-provided initial means, shape (n_components, n_features).
    means_init: Optional[numpy.ndarray] = None
    # Optional user-provided initial precision matrices.
    precisions_init: Optional[numpy.ndarray] = None
    # Seed for the initialization method.
    random_state: Optional[int] = None
    # Reuse the previous fit's solution as the next fit's initialization.
    warm_start: bool = False
    # Verbosity level of the EM loop.
    verbose: int = 0
    # Iterations between verbose prints.
    verbose_interval: int = 10
class GMM(BaseDetector):
    """Wrapper of scikit-learn Gaussian Mixture Model with more functionalities.

    Unsupervised Outlier Detection.

    Estimate the support of a high-dimensional distribution.

    Parameters
    ----------
    n_components : int, default=1
        The number of mixture components.
    covariance_type : {'full', 'tied', 'diag', 'spherical'}, default='full'
        String describing the type of covariance parameters to use.
    tol : float, default=1e-3
        The convergence threshold. EM iterations will stop when the
        lower bound average gain is below this threshold.
    reg_covar : float, default=1e-6
        Non-negative regularization added to the diagonal of covariance.
        Allows to assure that the covariance matrices are all positive.
    max_iter : int, default=100
        The number of EM iterations to perform.
    n_init : int, default=1
        The number of initializations to perform. The best results are kept.
    init_params : {'kmeans', 'random'}, default='kmeans'
        The method used to initialize the weights, the means and the
        precisions.
    weights_init : array-like of shape (n_components, ), default=None
        The user-provided initial weights.
        If it is None, weights are initialized using the `init_params` method.
    means_init : array-like of shape (n_components, n_features), default=None
        The user-provided initial means.
        If it is None, means are initialized using the `init_params` method.
    precisions_init : array-like, default=None
        The user-provided initial precisions (inverse of the covariance
        matrices).
        If it is None, precisions are initialized using the 'init_params'
        method.
    random_state : int, RandomState instance or None, default=None
        Controls the random seed given to the method chosen to initialize the
        parameters.
    warm_start : bool, default=False
        If 'warm_start' is True, the solution of the last fitting is used as
        initialization for the next call of fit().
    verbose : int, default=0
        Enable verbose output.
    verbose_interval : int, default=10
        Number of iteration done before the next print.
    contamination : float in (0., 0.5), optional (default=0.1)
        The amount of contamination of the data set.

    Attributes
    ----------
    weights_ : array-like of shape (n_components,)
        The weights of each mixture components.
    means_ : array-like of shape (n_components, n_features)
        The mean of each mixture component.
    covariances_ : array-like
        The covariance of each mixture component.
    precisions_ : array-like
        The precision matrices for each component in the mixture.
    precisions_cholesky_ : array-like
        The cholesky decomposition of the precision matrices of each mixture
        component.
    converged_ : bool
        True when convergence was reached in fit(), False otherwise.
    n_iter_ : int
        Number of step used by the best fit of EM to reach the convergence.
    lower_bound_ : float
        Lower bound value on the log-likelihood (of the training data with
        respect to the model) of the best fit of EM.
    decision_scores_ : numpy array of shape (n_samples,)
        The outlier scores of the training data.
    threshold_ : float
        The threshold is based on ``contamination``. It is the
        ``n_samples * contamination`` most abnormal samples in
        ``decision_scores_``. The threshold is calculated for generating
        binary outlier labels.
    labels_ : int, either 0 or 1
        The binary labels of the training data. 0 stands for inliers
        and 1 for outliers/anomalies. It is generated by applying
        ``threshold_`` on ``decision_scores_``.
    """

    def __init__(self, n_components=1, covariance_type='full', tol=1e-3,
                 reg_covar=1e-6, max_iter=100, n_init=1, init_params='kmeans',
                 weights_init=None, means_init=None, precisions_init=None,
                 random_state=None, warm_start=False, verbose=0,
                 verbose_interval=10, contamination=0.1):
        super().__init__(contamination=contamination)
        # Hyper-parameters are kept in an immutable config; the underlying
        # sklearn estimator itself is only created in fit().
        self._config = GmmConfig(
            n_components,
            covariance_type,
            tol,
            reg_covar,
            max_iter,
            n_init,
            init_params,
            weights_init, means_init, precisions_init,
            random_state, warm_start,
            verbose, verbose_interval
        )
        self.detector_ = None
        self.decision_scores_ = None

    def fit(self, X, y=None):
        """Fit detector. y is ignored in unsupervised methods.

        Parameters
        ----------
        X : numpy array of shape (n_samples, n_features)
            The input samples.
        y : Ignored
            Not used, present for API consistency by convention.

        Returns
        -------
        self : object
            Fitted estimator.
        """
        # validate input X (y is optional and unused)
        X = check_array(X)
        self._set_n_classes(y)
        # GmmConfig fields mirror GaussianMixture's keyword arguments
        # one-to-one, so the config expands directly into the constructor.
        self.detector_ = GaussianMixture(**self._config._asdict())
        self.detector_.fit(X=X, y=y)
        # Invert decision_scores_: outliers come with higher outlier scores,
        # whereas score_samples() returns log-likelihoods (higher = inlier).
        self.decision_scores_ = invert_order(self.detector_.score_samples(X))
        self._process_decision_scores()
        return self

    def decision_function(self, X):
        """Predict raw anomaly score of X using the fitted detector.

        The anomaly score of an input sample is computed based on different
        detector algorithms. For consistency, outliers are assigned with
        larger anomaly scores.

        Parameters
        ----------
        X : numpy array of shape (n_samples, n_features)
            The training input samples. Sparse matrices are accepted only
            if they are supported by the base estimator.

        Returns
        -------
        anomaly_scores : numpy array of shape (n_samples,)
            The anomaly score of the input samples.
        """
        check_is_fitted(self, ['decision_scores_', 'threshold_', 'labels_'])
        # Invert outlier scores. Outliers come with higher outlier scores.
        return invert_order(self.detector_.score_samples(X))

    @property
    def weights_(self):
        """The weights of each mixture components.

        Decorator for scikit-learn Gaussian Mixture Model attributes.
        """
        return self.detector_.weights_

    @property
    def means_(self):
        """The mean of each mixture component.

        Decorator for scikit-learn Gaussian Mixture Model attributes.
        """
        return self.detector_.means_

    @property
    def covariances_(self):
        """The covariance of each mixture component.

        Decorator for scikit-learn Gaussian Mixture Model attributes.
        """
        # BUGFIX: was `self.detector_.convariances_` (typo), which raised
        # AttributeError on every access; GaussianMixture exposes
        # `covariances_`.
        return self.detector_.covariances_

    @property
    def precisions_(self):
        """The precision matrices for each component in the mixture.

        Decorator for scikit-learn Gaussian Mixture Model attributes.
        """
        return self.detector_.precisions_

    @property
    def precisions_cholesky_(self):
        """The cholesky decomposition of the precision matrices
        of each mixture component.

        Decorator for scikit-learn Gaussian Mixture Model attributes.
        """
        return self.detector_.precisions_cholesky_

    @property
    def converged_(self):
        """True when convergence was reached in fit(), False otherwise.

        Decorator for scikit-learn Gaussian Mixture Model attributes.
        """
        return self.detector_.converged_

    @property
    def n_iter_(self):
        """Number of step used by the best fit of EM to reach the convergence.

        Decorator for scikit-learn Gaussian Mixture Model attributes.
        """
        return self.detector_.n_iter_

    @property
    def lower_bound_(self):
        """Lower bound value on the log-likelihood of the best fit of EM.

        Decorator for scikit-learn Gaussian Mixture Model attributes.
        """
        return self.detector_.lower_bound_
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment.