Last active
January 19, 2022 10:30
-
-
Save pierdom/235278048ab7127b64c1f87ca7297c87 to your computer and use it in GitHub Desktop.
[Find best fitting distributions] Find the best fitting PDFs (power distribution functions) from a list of well-known distributions in scipy. Inspired by this: https://stackoverflow.com/questions/6620471/fitting-empirical-distribution-to-theoretical-ones-with-scipy-python #datascience #python #matplotlib #visualization #statistics
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# import matplotlib and set inline | |
import matplotlib | |
matplotlib.use("Agg") | |
import matplotlib.pyplot as plt | |
%matplotlib inline | |
# import other libraries | |
import scipy.stats as st | |
import seaborn as sns | |
import pandas as pd | |
import numpy as np | |
import warnings | |
# set seaborn style | |
sns.set_style("whitegrid") | |
# my distribution | |
mydistr = ... | |
# X axis with custom binning (change here, depending on what you need) | |
mydistr_x = np.linspace(mydistr.min(),mydistr.max(),int(mydistr.max())) | |
# define a set of distributions to check | |
distribution_list = [ | |
st.alpha,st.anglit,st.arcsine,st.beta,st.betaprime,st.bradford,st.burr,st.cauchy,st.chi,st.chi2,st.cosine, | |
st.dgamma,st.dweibull,st.erlang,st.expon,st.exponnorm,st.exponweib,st.exponpow,st.f,st.fatiguelife,st.fisk, | |
st.foldcauchy,st.foldnorm,st.frechet_r,st.frechet_l,st.genlogistic,st.genpareto,st.gennorm,st.genexpon, | |
st.genextreme,st.gausshyper,st.gamma,st.gengamma,st.genhalflogistic,st.gilbrat,st.gompertz,st.gumbel_r, | |
st.gumbel_l,st.halfcauchy,st.halflogistic,st.halfnorm,st.halfgennorm,st.hypsecant,st.invgamma,st.invgauss, | |
st.invweibull,st.johnsonsb,st.johnsonsu,st.ksone,st.kstwobign,st.laplace,st.levy,st.levy_l,st.levy_stable, | |
st.logistic,st.loggamma,st.loglaplace,st.lognorm,st.lomax,st.maxwell,st.mielke,st.nakagami,st.ncx2,st.ncf, | |
st.nct,st.norm,st.pareto,st.pearson3,st.powerlaw,st.powerlognorm,st.powernorm,st.rdist,st.reciprocal, | |
st.rayleigh,st.rice,st.recipinvgauss,st.semicircular,st.t,st.triang,st.truncexpon,st.truncnorm,st.tukeylambda, | |
st.uniform,st.vonmises,st.vonmises_line,st.wald,st.weibull_min,st.weibull_max,st.wrapcauchy | |
] | |
# this list will contain the result of the fittings | |
distribution_fitting = [] | |
# fit every distribution in the list above | |
for ref_distr in distribution_list: | |
# pick distribution name | |
distr_name = type(ref_distr).__name__.split("_")[0] | |
try: | |
# ignore warnings | |
with warnings.catch_warnings(): | |
warnings.simplefilter("ignore") | |
# get parameters from the best fit | |
params = ref_distr.fit(mydistr) | |
arg = params[:-2] | |
loc = params[-2] | |
scale = params[-1] | |
# build the PDF from previous parameters | |
pdf_fitted = ref_distr.pdf(mydistr_x,loc=loc,scale=scale,*arg) | |
# calculate maximum likelihood estimator | |
mle = ref_distr.nnlf(params, mydistr) | |
# as an alternative: sum of square error | |
sse = np.sum(np.power(mydistr_x - pdf_fitted, 2.0)) | |
# add results to list | |
distribution_fitting.append({ | |
"distr_name": distr_name, "mle": mle, "sse":sse, | |
"pdf_fitted": pdf_fitted, "params": params}) | |
# ignore distributions that could not be fitted | |
except Exception: | |
print("Discarded function: {}".format(distr_name)) | |
# plot data histograms | |
fig, ax = plt.subplots(1,1,dpi=120) | |
_ = ax.hist(mydistr, mydistr_x, normed=True, color="gray", alpha=0.4) | |
# set stuff for this axis | |
ax.set_prop_cycle('color',plt.cm.rainbow(np.linspace(0,1,top))) | |
ax.set_title("Best fitting PDFs") | |
# just print TOP distributions according to maxim. likelihood estimator | |
top = 5 | |
sort_by = "mle" # or sse | |
for d in sorted(distribution_fitting, key=lambda k: k[sort_by])[:top]: | |
distr_name = d['distr_name'] | |
mle = d['mle'] | |
pdf_fitted = d['pdf_fitted'] | |
params = d['params'] | |
label = "{} [{:.2f}]".format(distr_name,mle) | |
_ = ax.plot(mydistr_x, pdf_fitted, label=label) | |
_ = ax.legend() |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment