|
import numpy as np |
|
from numpy.linalg import solve |
|
import logging |
|
logging.basicConfig(level = logging.DEBUG) |
|
from scipy.stats import moment,norm |
|
|
|
def fleishman(b, c, d): |
|
"""calculate the variance, skew and kurtois of a Fleishman distribution |
|
F = -c + bZ + cZ^2 + dZ^3, where Z ~ N(0,1) |
|
""" |
|
b2 = b * b |
|
c2 = c * c |
|
d2 = d * d |
|
bd = b * d |
|
var = b2 + 6*bd + 2*c2 + 15*d2 |
|
skew = 2 * c * (b2 + 24*bd + 105*d2 + 2) |
|
kurt = 24 * (bd + c2 * (1 + b2 + 28*bd) + |
|
d2 * (12 + 48*bd + 141*c2 + 225*d2)) |
|
return (var, skew, kurt) |
|
|
|
def flfunc(b, c, d, skew, kurtosis): |
|
""" |
|
Given the fleishman coefficients, and a target skew and kurtois |
|
this function will have a root if the coefficients give the desired skew and kurtosis |
|
""" |
|
x,y,z = fleishman(b,c,d) |
|
return (x - 1, y - skew, z - kurtosis) |
|
|
|
def flderiv(b, c, d): |
|
""" |
|
The deriviative of the flfunc above |
|
returns a matrix of partial derivatives |
|
""" |
|
b2 = b * b |
|
c2 = c * c |
|
d2 = d * d |
|
bd = b * d |
|
df1db = 2*b + 6*d |
|
df1dc = 4*c |
|
df1dd = 6*b + 30*d |
|
df2db = 4*c * (b + 12*d) |
|
df2dc = 2 * (b2 + 24*bd + 105*d2 + 2) |
|
df2dd = 4 * c * (12*b + 105*d) |
|
df3db = 24 * (d + c2 * (2*b + 28*d) + 48 * d**3) |
|
df3dc = 48 * c * (1 + b2 + 28*bd + 141*d2) |
|
df3dd = 24 * (b + 28*b * c2 + 2 * d * (12 + 48*bd + |
|
141*c2 + 225*d2) + d2 * (48*b + 450*d)) |
|
return np.matrix([[df1db, df1dc, df1dd], |
|
[df2db, df2dc, df2dd], |
|
[df3db, df3dc, df3dd]]) |
|
|
|
def newton(a, b, c, skew, kurtosis, max_iter=25, converge=1e-5): |
|
"""Implements newtons method to find a root of flfunc.""" |
|
f = flfunc(a, b, c, skew, kurtosis) |
|
for i in range(max_iter): |
|
if max(map(abs, f)) < converge: |
|
break |
|
J = flderiv(a, b, c) |
|
delta = -solve(J, f) |
|
(a, b, c) = delta + (a,b,c) |
|
f = flfunc(a, b, c, skew, kurtosis) |
|
return (a, b, c) |
|
|
|
|
|
def fleishmanic(skew, kurt): |
|
"""Find an initial estimate of the fleisman coefficients, to feed to newtons method""" |
|
c1 = 0.95357 - 0.05679 * kurt + 0.03520 * skew**2 + 0.00133 * kurt**2 |
|
c2 = 0.10007 * skew + 0.00844 * skew**3 |
|
c3 = 0.30978 - 0.31655 * c1 |
|
logging.debug("inital guess {},{},{}".format(c1,c2,c3)) |
|
return (c1, c2, c3) |
|
|
|
|
|
def fit_fleishman_from_sk(skew, kurt): |
|
"""Find the fleishman distribution with given skew and kurtosis |
|
mean =0 and stdev =1 |
|
|
|
Returns None if no such distribution can be found |
|
""" |
|
if kurt < -1.13168 + 1.58837 * skew**2: |
|
return None |
|
a, b, c = fleishmanic(skew, kurt) |
|
coef = newton(a, b, c, skew, kurt) |
|
return(coef) |
|
|
|
def fit_fleishman_from_standardised_data(data): |
|
"""Fit a fleishman distribution to standardised data.""" |
|
skew = moment(data,3) |
|
kurt = moment(data,4) |
|
coeff = fit_fleishman_from_sk(skew,kurt) |
|
return coeff |
|
|
|
"""Data in form of one number per line""" |
|
with open('data.txt') as f: |
|
data = np.array([float(line) for line in f]) |
|
mean = sum(data)/len(data) |
|
std = moment(data,2)**0.5 |
|
std_data = (data - mean)/std |
|
|
|
coeff = fit_fleishman_from_standardised_data(std_data) |
|
print(coeff) |
|
|
|
def describe(data): |
|
"""Return summary statistics of as set of data""" |
|
mean = sum(data)/len(data) |
|
var = moment(data,2) |
|
skew = moment(data,3)/var**1.5 |
|
kurt = moment(data,4)/var**2 |
|
return (mean,var,skew,kurt) |
|
|
|
def generate_fleishman(a,b,c,d,N=100): |
|
"""Generate N data items from fleishman's distribution with given coefficents""" |
|
Z = norm.rvs(size=N) |
|
F = a + Z*(b +Z*(c+ Z*d)) |
|
return F |
|
|
|
print(describe(data)) |
|
sim = (generate_fleishman(-coeff[1],*coeff,N=10000))*std+mean |
|
print(describe(sim)) |
I beleive there is an issue with kurtosis and excess kurtosis in this implementation. For example, I executed the last three lines with a larger sample size, and the expected kurtosis is 3 less than the realised kurtosis.
print(describe(data))
sim = (generate_fleishman(-coeff[1],*coeff,N=1000000))*std+mean
print(describe(sim))
(2.6576000000000004, 4.361090239999999, 1.8834004146338363, 6.508247433133436)
(2.660625610458773, 4.375291622768023, 1.8875322656830191, 9.531098999229004)
If we make this adjustment by converting kurtosis to excess kurtosis, then the example you have considered won't work.
def fit_fleishman_from_standardised_data_2(data):
"""Fit a fleishman distribution to standardised data."""
skew = moment(data,3)
kurt = moment(data,4)
coeff = fit_fleishman_from_sk(skew,kurt-3)
return coeff
coeff_2 = fit_fleishman_from_standardised_data_2(std_data)
print(coeff_2)
None
This is caused by the fact that fit_fleishman_from_sk will not solve with the particular skew and kurtosis from the test set.