Skip to content

Instantly share code, notes, and snippets.

@hkawabata
Last active February 7, 2023 07:52
いろいろな分布で中心極限定理を実験して確かめる
import numpy as np
import math
from matplotlib import pyplot as plt
from abc import ABC, abstractmethod
def ln_factorial(n):
"""階乗の log を計算するための補助関数"""
ret = 0
for i in range(1, n+1):
ret += np.log(i)
return ret
def gauss(x, mu, sigma):
"""x を引数として平均 mu, 標準偏差 sigma の正規分布の確率密度関数を計算"""
ret = np.exp(- (x-mu)**2 * 0.5 / sigma**2)
ret /= np.sqrt(2.0 * np.pi) * sigma
return ret
class Distribution(ABC):
"""一般的な確率分布を表すクラス"""
@abstractmethod
def get_name(self):
pass
@abstractmethod
def calc_dist_func_xy(self):
"""確率密度関数描画のための x, y を返す"""
pass
@abstractmethod
def get_average_of_samples(self, n):
"""n 件の標本をランダムに抽出し、平均値を返す"""
pass
def get_mu(self):
return self.__mu
def get_sigma(self):
return self.__sigma
def set_mu(self, mu):
self.__mu = mu
def set_sigma(self, sigma):
self.__sigma = sigma
class UniformDist(Distribution):
"""一様分布"""
def __init__(self, a, b):
self.set_mu((a + b) * 0.5)
variance = (b-a)**2/12.0
self.set_sigma(np.sqrt(variance))
self.__a = a
self.__b = b
def get_name(self):
return 'Uniform Distribution ($a={},b={}$)'.format(self.__a, self.__b)
def calc_dist_func_xy(self):
w = self.__b - self.__a
x = np.linspace(self.__a-w, self.__b+w, 300)
y = np.where((self.__a <= x) & (x <= self.__b), 1.0/w, 0)
return x, y
def get_average_of_samples(self, n):
samples = np.random.rand(n) * (self.__b - self.__a) + self.__a
return samples.mean()
class BetaDist(Distribution):
"""ベータ分布"""
def __init__(self, a, b):
self.set_mu(a/(a+b))
variance = a*b/(a+b+1)/(a+b)**2
self.set_sigma(np.sqrt(variance))
self.__a = a
self.__b = b
def get_name(self):
return r'Beta Distribution ($\alpha={}, \beta={}$)'.format(self.__a, self.__b)
def calc_dist_func_xy(self):
def beta(p):
ret = np.zeros(p.shape)
for i in range(len(p)):
if 0 < p[i] < 1:
log_beta = (self.__a-1) * np.log(p[i]) + (self.__b-1) * np.log(1-p[i]) + ln_factorial(self.__a+self.__b-1) - ln_factorial(self.__a-1) - ln_factorial(self.__b-1)
ret[i] = np.exp(log_beta)
return ret
x = np.linspace(0, 1.0, 100)
y = beta(x)
return x, y
def get_average_of_samples(self, n):
samples = np.random.beta(self.__a, self.__b, n)
return samples.mean()
class ChiSquaredDist(Distribution):
"""カイ二乗分布"""
def __init__(self, k):
self.set_mu(k)
variance = 2 * k
self.set_sigma(np.sqrt(variance))
self.__k = k
def get_name(self):
return r'$\chi^2$ Distribution ($k={}$)'.format(self.__k)
def calc_dist_func_xy(self):
x = np.linspace(0, 20.0, 100)
y = np.exp(-x*0.5) / (2**(self.__k*0.5)) / math.gamma(self.__k*0.5) * np.power(x, self.__k*0.5-1)
return x, y
def get_average_of_samples(self, n):
samples = np.random.chisquare(self.__k, n)
return samples.mean()
class MyLinearDist(Distribution):
"""一次関数型の分布"""
def __init__(self, a):
self.set_mu(a * 2.0 / 3.0)
variance = a * a / 18.0
self.set_sigma(np.sqrt(variance))
self.__x_max = a
self.__c = 2.0 / (a*a) # 直線の傾き
self.__y_max = 2.0 / a # 分布内の y の最大値
def get_name(self):
return 'Self-Made Linear Distribution'
def calc_dist_func_xy(self):
x = np.linspace(0-self.__x_max*0.5, self.__x_max*1.5, 100)
y = np.where((0 <= x) & (x <= self.__x_max), self.__c * x, 0)
return x, y
def get_average_of_samples(self, n):
samples = []
while len(samples) < n:
x_tmp = np.random.rand() * self.__x_max
y_tmp = np.random.rand() * self.__y_max
if y_tmp <= self.__c * x_tmp:
samples.append(x_tmp)
return np.mean(samples)
class MyQuadraticDist(Distribution):
"""二次関数型の分布"""
def __init__(self, a):
self.set_mu(0)
variance = a * a * 3.0 / 5.0
self.set_sigma(np.sqrt(variance))
self.__x_max = a
self.__c = 3.0 / (2.0*a*a*a) # 二次関数の係数
self.__y_max = 3.0 / (2.0*a) # 分布内の y の最大値
def get_name(self):
return 'Self-Made Quadratic Distribution'
def calc_dist_func_xy(self):
x = np.linspace(-self.__x_max*1.5, self.__x_max*1.5, 100)
y = np.where((-self.__x_max <= x) & (x <= self.__x_max), x*x*self.__c, 0)
return x, y
def get_average_of_samples(self, n):
samples = []
while len(samples) < n:
x_tmp = (np.random.rand()*2-1.0) * self.__x_max
y_tmp = np.random.rand() * self.__y_max
if y_tmp <= self.__c * x_tmp * x_tmp:
samples.append(x_tmp)
return np.mean(samples)
def func(dist):
"""
与えられた確率分布について色々な標本サイズで標本平均を繰り返し計算し、
標本平均の分布が正規分布に近づくことをグラフで確認する
"""
T = 100000 # n個の標本を抽出して平均を取る操作を何度繰り返すか
n_samples = [2, 4, 8, 32, 128] # 標本サイズ
plt.figure(figsize=(10, 6))
plt.subplots_adjust(wspace=0.3, hspace=0.4)
# 母集団の確率密度関数を描画
plt.subplot(2, 3, 1)
x, y = dist.calc_dist_func_xy()
plt.title('Distribution Function')
plt.grid()
plt.fill_between(x, np.zeros(x.size), y, facecolor='red', alpha=0.3)
plt.plot(x, y, color='black')
# 標本を繰り返し抽出して平均値を記録・ヒストグラム描画
for i in range(len(n_samples)):
n = n_samples[i]
sample_mean = np.empty(T, dtype='float')
for t in range(T):
sample_mean[t] = dist.get_average_of_samples(n)
# ヒストグラム描画のための階級幅を計算
mean_max = sample_mean.max()
mean_min = sample_mean.min()
bins = np.linspace(mean_min, mean_max, 50)
# ヒストグラムを描画
plt.subplot(2, 3, i+2)
plt.hist(sample_mean, bins=bins, density=True)
# 中心極限定理により収束が期待される正規分布を描画
mu = dist.get_mu()
sigma = dist.get_sigma() / np.sqrt(n)
x_norm = np.linspace(mean_min-(mean_max-mean_min)*0.2, mean_max+(mean_max-mean_min)*0.2, 100)
norm = gauss(x_norm, mu=mu, sigma=sigma)
plt.title(r'$n_{{\rm sample}} = {}$'.format(n))
plt.plot(x_norm, norm, lw=1.0, color='red', label=r'$N(\mu, \sigma^2/{})$'.format(n))
plt.legend()
plt.suptitle('Sample Mean of ' + dist.get_name())
plt.show()
func(UniformDist(0, 6.0))
func(BetaDist(9, 3))
func(BetaDist(2, 2))
func(ChiSquaredDist(2))
func(ChiSquaredDist(5))
func(MyLinearDist(1.0))
func(MyQuadraticDist(1.0))
@hkawabata
Copy link
Author

UniformDist

BetaDist-1

BetaDist-2

ChiSquaredDist-1

ChiSquaredDist-2

LinearDist

QuadraticDist

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment