Created
December 5, 2023 07:42
-
-
Save jukujala/cbaadd3233b2637a12ccdf6e3be464d7 to your computer and use it in GitHub Desktop.
Generate data from different distributions, fit a log-normal and show how much fitted mean overvaluates data mean.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
""" | |
Generate data from different distributions, fit a log-normal and | |
show how much fitted mean overvaluates data mean. | |
Overvaluation 0.05 = 5% higher fitted log-norm mean than data mean. | |
Copy-paste of output: | |
* log normal data | |
- mean(empirical data): 4.22, mean(fitted long-norm): 4.26, overvaluation: 0.01 | |
* uniform data | |
- mean(empirical data): 0.50, mean(fitted long-norm): 0.64, overvaluation: 0.29 | |
* positive normal data | |
- mean(empirical data): 1.33, mean(fitted long-norm): 1.53, overvaluation: 0.15 | |
* truncated positive normal data | |
- mean(empirical data): 1.18, mean(fitted long-norm): 1.34, overvaluation: 0.14 | |
* ratio of normals data | |
- mean(empirical data): 6.13, mean(fitted long-norm): 2.68, overvaluation: -0.56 | |
* exp data | |
- mean(empirical data): 0.50, mean(fitted long-norm): 0.64, overvaluation: 0.29 | |
* gamma data | |
- mean(empirical data): 2.03, mean(fitted long-norm): 2.50, overvaluation: 0.23 | |
* power law data | |
- mean(empirical data): 0.08, mean(fitted long-norm): 1976909492656863.75, overvaluation: 23310471692441804.00 | |
* poisson data | |
- mean(empirical data): 3.95, mean(fitted long-norm): 4.01, overvaluation: 0.01 | |
* log normal data + few big outliers | |
- mean(empirical data): 4.78, mean(fitted long-norm): 4.53, overvaluation: -0.05 | |
""" | |
import numpy as np | |
import scipy.stats as stats | |
def fit_lognorm(some_data): | |
# fit some_data to log-norm an compare mean if fitted distribution vs empirical data | |
shape, loc, scale = stats.lognorm.fit(some_data, floc=0) | |
# log-norm distribution parameter explanations: | |
# https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.lognorm.html | |
sigma = shape | |
mu = np.log(scale) | |
lognorm_mean = np.exp(mu + sigma**2 / 2) | |
overval = lognorm_mean/np.mean(some_data)-1 | |
print(f" - mean(empirical data): {np.mean(some_data):.2f}, mean(fitted long-norm): {lognorm_mean:.2f}, overvaluation: {overval:.2f}") | |
# sanity check for log norm fitting: | |
# lognorm_samples = stats.lognorm.rvs(shape, loc=loc, scale=scale, size=1000) | |
# print(f"log-norm sample mean {np.mean(lognorm_samples)}") | |
return shape, loc, scale | |
print("Generate data from different distributions, fit a log-normal and show how much fitted mean overvaluates data mean.") | |
print("Overvaluation 0.05 = 5% higher log-normal mean vs data\n\n") | |
print(" * log normal data") | |
some_data = np.random.lognormal(mean=1.0, sigma=1.0, size=1000) | |
fit_lognorm(some_data) | |
print(" * uniform data") | |
some_data = np.random.uniform(low=0.0, high=1.0, size=1000) | |
fit_lognorm(some_data) | |
print(" * positive normal data") | |
some_data = [] | |
while len(some_data) < 1000: | |
sample = np.random.normal(1.0, 1.0) | |
if sample > 0: | |
some_data.append(sample) | |
fit_lognorm(some_data) | |
print(" * truncated positive normal data") | |
some_data = [] | |
while len(some_data) < 1000: | |
sample = np.random.normal(1.0, 1.0) | |
sample = min(sample, 2.0) | |
if sample > 0: | |
some_data.append(sample) | |
fit_lognorm(some_data) | |
print(" * ratio of normals data") | |
some_data = [] | |
while len(some_data) < 1000: | |
sample1 = np.random.normal(1.0, 1.0) | |
sample2 = np.random.normal(1.0, 1.0) | |
sample = sample1/sample2 | |
if sample > 0: | |
some_data.append(sample) | |
fit_lognorm(some_data) | |
print(" * exp data") | |
mean = 2.0 # for example | |
scale = 1.0 / mean | |
some_data = np.random.exponential(scale, 1000) | |
fit_lognorm(some_data) | |
print(" * gamma data") | |
shape = 1.0 # Example value for k (alpha) | |
scale = 2.0 # Example value for theta (beta) | |
some_data = np.random.gamma(shape, scale, 1000) | |
fit_lognorm(some_data) | |
print(" * power law data") | |
shape_parameter = 0.1 | |
some_data = np.random.power(shape_parameter, 1000) | |
fit_lognorm(some_data) | |
print(" * poisson data") | |
lambda_param = 3.0 | |
some_data = [x+1 for x in np.random.poisson(lambda_param, 1000)] | |
fit_lognorm(some_data) | |
print(" * log normal data + few big outliers") | |
some_data = np.random.lognormal(mean=1.0, sigma=1.0, size=1000) | |
some_data[5] = 100.0 | |
some_data[500] = 100.0 | |
some_data[678] = 100.0 | |
fit_lognorm(some_data) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment