Skip to content

Instantly share code, notes, and snippets.

@jukujala
Created December 5, 2023 07:42
Show Gist options
  • Save jukujala/cbaadd3233b2637a12ccdf6e3be464d7 to your computer and use it in GitHub Desktop.
Save jukujala/cbaadd3233b2637a12ccdf6e3be464d7 to your computer and use it in GitHub Desktop.
Generate data from different distributions, fit a log-normal and show how much fitted mean overvaluates data mean.
"""
Generate data from different distributions, fit a log-normal and
show how much fitted mean overvaluates data mean.
Overvaluation 0.05 = 5% higher fitted log-norm mean than data mean.
Copy-paste of output:
* log normal data
- mean(empirical data): 4.22, mean(fitted long-norm): 4.26, overvaluation: 0.01
* uniform data
- mean(empirical data): 0.50, mean(fitted long-norm): 0.64, overvaluation: 0.29
* positive normal data
- mean(empirical data): 1.33, mean(fitted long-norm): 1.53, overvaluation: 0.15
* truncated positive normal data
- mean(empirical data): 1.18, mean(fitted long-norm): 1.34, overvaluation: 0.14
* ratio of normals data
- mean(empirical data): 6.13, mean(fitted long-norm): 2.68, overvaluation: -0.56
* exp data
- mean(empirical data): 0.50, mean(fitted long-norm): 0.64, overvaluation: 0.29
* gamma data
- mean(empirical data): 2.03, mean(fitted long-norm): 2.50, overvaluation: 0.23
* power law data
- mean(empirical data): 0.08, mean(fitted long-norm): 1976909492656863.75, overvaluation: 23310471692441804.00
* poisson data
- mean(empirical data): 3.95, mean(fitted long-norm): 4.01, overvaluation: 0.01
* log normal data + few big outliers
- mean(empirical data): 4.78, mean(fitted long-norm): 4.53, overvaluation: -0.05
"""
import numpy as np
import scipy.stats as stats
def fit_lognorm(some_data):
# fit some_data to log-norm an compare mean if fitted distribution vs empirical data
shape, loc, scale = stats.lognorm.fit(some_data, floc=0)
# log-norm distribution parameter explanations:
# https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.lognorm.html
sigma = shape
mu = np.log(scale)
lognorm_mean = np.exp(mu + sigma**2 / 2)
overval = lognorm_mean/np.mean(some_data)-1
print(f" - mean(empirical data): {np.mean(some_data):.2f}, mean(fitted long-norm): {lognorm_mean:.2f}, overvaluation: {overval:.2f}")
# sanity check for log norm fitting:
# lognorm_samples = stats.lognorm.rvs(shape, loc=loc, scale=scale, size=1000)
# print(f"log-norm sample mean {np.mean(lognorm_samples)}")
return shape, loc, scale
print("Generate data from different distributions, fit a log-normal and show how much fitted mean overvaluates data mean.")
print("Overvaluation 0.05 = 5% higher log-normal mean vs data\n\n")
print(" * log normal data")
some_data = np.random.lognormal(mean=1.0, sigma=1.0, size=1000)
fit_lognorm(some_data)
print(" * uniform data")
some_data = np.random.uniform(low=0.0, high=1.0, size=1000)
fit_lognorm(some_data)
print(" * positive normal data")
some_data = []
while len(some_data) < 1000:
sample = np.random.normal(1.0, 1.0)
if sample > 0:
some_data.append(sample)
fit_lognorm(some_data)
print(" * truncated positive normal data")
some_data = []
while len(some_data) < 1000:
sample = np.random.normal(1.0, 1.0)
sample = min(sample, 2.0)
if sample > 0:
some_data.append(sample)
fit_lognorm(some_data)
print(" * ratio of normals data")
some_data = []
while len(some_data) < 1000:
sample1 = np.random.normal(1.0, 1.0)
sample2 = np.random.normal(1.0, 1.0)
sample = sample1/sample2
if sample > 0:
some_data.append(sample)
fit_lognorm(some_data)
print(" * exp data")
mean = 2.0 # for example
scale = 1.0 / mean
some_data = np.random.exponential(scale, 1000)
fit_lognorm(some_data)
print(" * gamma data")
shape = 1.0 # Example value for k (alpha)
scale = 2.0 # Example value for theta (beta)
some_data = np.random.gamma(shape, scale, 1000)
fit_lognorm(some_data)
print(" * power law data")
shape_parameter = 0.1
some_data = np.random.power(shape_parameter, 1000)
fit_lognorm(some_data)
print(" * poisson data")
lambda_param = 3.0
some_data = [x+1 for x in np.random.poisson(lambda_param, 1000)]
fit_lognorm(some_data)
print(" * log normal data + few big outliers")
some_data = np.random.lognormal(mean=1.0, sigma=1.0, size=1000)
some_data[5] = 100.0
some_data[500] = 100.0
some_data[678] = 100.0
fit_lognorm(some_data)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment