Yeo-Johnson Transformation
#!/usr/bin/env python | |
# -*- coding: UTF-8 -*- | |
import warnings | |
import numpy as np | |
import pandas as pd | |
import sys | |
__author__ = "Mohsen Mesgarpour" | |
__copyright__ = "Copyright 2016, https://github.com/mesgarpour" | |
__credits__ = ["Mohsen Mesgarpour"] | |
__license__ = "GPL" | |
__version__ = "1.0" | |
__maintainer__ = "Mohsen Mesgarpour" | |
__email__ = "mohsen.mesgarpour@gmail.com" | |
class YeoJohnson: | |
""" | |
Computing Yeo-Johnson transofrmation, which is an extension of Box-Cox transformation | |
but can handle both positive and negative values. | |
References: | |
Weisberg, S. (2001). Yeo-Johnson Power Transformations. | |
Department of Applied Statistics, University of Minnesota. Retrieved June, 1, 2003. | |
https://www.stat.umn.edu/arc/yjpower.pdf | |
Adapted from CRAN - Package VGAM | |
""" | |
def fit(self, y, lmbda, derivative=0, epsilon=np.finfo(np.float).eps, inverse=False): | |
""" | |
:param y: The variable to be transformed (numeric array). | |
:param lmbda: The function's Lambda value (numeric value or array). | |
:param derivative: The derivative with respect to lambda | |
(non-negative integer; default: ordinary function evaluation). | |
:param epsilon: The lambda's tolerance (positive value). | |
:param inverse: The inverse transformation option (logical value). | |
:return: The Yeo-Johnson transformation or its inverse, or its derivatives with respect to lambda, of y. | |
""" | |
# Validate arguments | |
self.__validate(y, lmbda, derivative, epsilon, inverse) | |
# initialise | |
y = np.array(y, dtype=float) | |
result = y | |
if not (isinstance(lmbda, list) or isinstance(lmbda, np.ndarray)): | |
lmbda, y = np.broadcast_arrays(lmbda, y) | |
lmbda = np.array(lmbda, dtype=float) | |
l0 = np.abs(lmbda) > epsilon | |
l2 = np.abs(lmbda - 2) > epsilon | |
# Inverse | |
with warnings.catch_warnings(): # suppress warnings | |
warnings.simplefilter("ignore") | |
if inverse is True: | |
mask = np.where(((y >= 0) & l0) == True) | |
result[mask] = np.power(np.multiply(y[mask], lmbda[mask]) + 1, 1 / lmbda[mask]) - 1 | |
mask = np.where(((y >= 0) & ~l0) == True) | |
result[mask] = np.expm1(y[mask]) | |
mask = np.where(((y < 0) & l2) == True) | |
result[mask] = 1 - np.power(np.multiply(-(2 - lmbda[mask]), y[mask]) + 1, 1 / (2 - lmbda[mask])) | |
mask = np.where(((y < 0) & ~l2) == True) | |
result[mask] = -np.expm1(-y[mask]) | |
# Derivative | |
else: | |
if derivative == 0: | |
mask = np.where(((y >= 0) & l0) == True) | |
result[mask] = np.divide(np.power(y[mask] + 1, lmbda[mask]) - 1, lmbda[mask]) | |
mask = np.where(((y >= 0) & ~l0) == True) | |
result[mask] = np.log1p(y[mask]) | |
mask = np.where(((y < 0) & l2) == True) | |
result[mask] = np.divide(-(np.power(-y[mask] + 1, 2 - lmbda[mask]) - 1), 2 - lmbda[mask]) | |
mask = np.where(((y < 0) & ~l2) == True) | |
result[mask] = -np.log1p(-y[mask]) | |
# Not Derivative | |
else: | |
p = self.fit(y, lmbda, derivative=derivative - 1, epsilon=epsilon, inverse=inverse) | |
mask = np.where(((y >= 0) & l0) == True) | |
result[mask] = np.divide(np.multiply(np.power(y[mask] + 1, lmbda[mask]), | |
np.power(np.log1p(y[mask]), derivative)) - | |
np.multiply(derivative, p[mask]), lmbda[mask]) | |
mask = np.where(((y >= 0) & ~l0) == True) | |
result[mask] = np.divide(np.power(np.log1p(y[mask]), derivative + 1), derivative + 1) | |
mask = np.where(((y < 0) & l2) == True) | |
result[mask] = np.divide(-(np.multiply(np.power(-y[mask] + 1, 2 - lmbda[mask]), | |
np.power(-np.log1p(-y[mask]), derivative)) - | |
np.multiply(derivative, p[mask])), 2 - lmbda[mask]) | |
mask = np.where(((y < 0) & ~l2) == True) | |
result[mask] = np.divide(np.power(-np.log1p(-y[mask]), derivative + 1), derivative + 1) | |
return result | |
@staticmethod | |
def __validate(y, lmbda, derivative, epsilon, inverse): | |
try: | |
if not isinstance(y, (list, np.ndarray, pd.Series)): | |
raise Exception("Argument 'y' must be a list!") | |
if not isinstance(lmbda, (int, float, np.int, np.float)): | |
if not isinstance(lmbda, (list, np.ndarray, pd.Series)) or len(lmbda) != len(y): | |
raise Exception("Argument 'lmbda' must be a number " | |
"or a list, which its length matches 'y' argument!") | |
if not isinstance(derivative, (int, float, np.int, np.float)) or derivative < 0: | |
raise Exception("Argument 'derivative' must be a non-negative integer!") | |
if not isinstance(epsilon, (int, float, np.int, np.float)) or epsilon <= 0: | |
raise Exception("Argument 'epsilon' must be a positive number!") | |
if not isinstance(inverse, bool): | |
raise Exception("Argument 'inverse' must be boolean!") | |
if inverse is True and derivative != 0: | |
raise Exception("Argument 'derivative' must be zero " | |
"when argument 'inverse' is 'True'!") | |
except (): | |
sys.exit() |
This comment has been minimized.
This comment has been minimized.
Sorry @croessert, I was not monitoring the comments. The Lambda value indicates the power to which all data should be raised to transform it to normal distribution. You have three principal options:
Some general notes:
|
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
This comment has been minimized.
Could you tell me how to extract lambda from R using rpy2? This value should be somewhere in r_scale.rx('yj'), right?
Thanks!