/central-limit-theorem.py Secret
Last active
February 7, 2023 07:52
いろいろな分布で中心極限定理を実験して確かめる
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import numpy as np | |
import math | |
from matplotlib import pyplot as plt | |
from abc import ABC, abstractmethod | |
def ln_factorial(n): | |
"""階乗の log を計算するための補助関数""" | |
ret = 0 | |
for i in range(1, n+1): | |
ret += np.log(i) | |
return ret | |
def gauss(x, mu, sigma): | |
"""x を引数として平均 mu, 標準偏差 sigma の正規分布の確率密度関数を計算""" | |
ret = np.exp(- (x-mu)**2 * 0.5 / sigma**2) | |
ret /= np.sqrt(2.0 * np.pi) * sigma | |
return ret | |
class Distribution(ABC): | |
"""一般的な確率分布を表すクラス""" | |
@abstractmethod | |
def get_name(self): | |
pass | |
@abstractmethod | |
def calc_dist_func_xy(self): | |
"""確率密度関数描画のための x, y を返す""" | |
pass | |
@abstractmethod | |
def get_average_of_samples(self, n): | |
"""n 件の標本をランダムに抽出し、平均値を返す""" | |
pass | |
def get_mu(self): | |
return self.__mu | |
def get_sigma(self): | |
return self.__sigma | |
def set_mu(self, mu): | |
self.__mu = mu | |
def set_sigma(self, sigma): | |
self.__sigma = sigma | |
class UniformDist(Distribution): | |
"""一様分布""" | |
def __init__(self, a, b): | |
self.set_mu((a + b) * 0.5) | |
variance = (b-a)**2/12.0 | |
self.set_sigma(np.sqrt(variance)) | |
self.__a = a | |
self.__b = b | |
def get_name(self): | |
return 'Uniform Distribution ($a={},b={}$)'.format(self.__a, self.__b) | |
def calc_dist_func_xy(self): | |
w = self.__b - self.__a | |
x = np.linspace(self.__a-w, self.__b+w, 300) | |
y = np.where((self.__a <= x) & (x <= self.__b), 1.0/w, 0) | |
return x, y | |
def get_average_of_samples(self, n): | |
samples = np.random.rand(n) * (self.__b - self.__a) + self.__a | |
return samples.mean() | |
class BetaDist(Distribution): | |
"""ベータ分布""" | |
def __init__(self, a, b): | |
self.set_mu(a/(a+b)) | |
variance = a*b/(a+b+1)/(a+b)**2 | |
self.set_sigma(np.sqrt(variance)) | |
self.__a = a | |
self.__b = b | |
def get_name(self): | |
return r'Beta Distribution ($\alpha={}, \beta={}$)'.format(self.__a, self.__b) | |
def calc_dist_func_xy(self): | |
def beta(p): | |
ret = np.zeros(p.shape) | |
for i in range(len(p)): | |
if 0 < p[i] < 1: | |
log_beta = (self.__a-1) * np.log(p[i]) + (self.__b-1) * np.log(1-p[i]) + ln_factorial(self.__a+self.__b-1) - ln_factorial(self.__a-1) - ln_factorial(self.__b-1) | |
ret[i] = np.exp(log_beta) | |
return ret | |
x = np.linspace(0, 1.0, 100) | |
y = beta(x) | |
return x, y | |
def get_average_of_samples(self, n): | |
samples = np.random.beta(self.__a, self.__b, n) | |
return samples.mean() | |
class ChiSquaredDist(Distribution): | |
"""カイ二乗分布""" | |
def __init__(self, k): | |
self.set_mu(k) | |
variance = 2 * k | |
self.set_sigma(np.sqrt(variance)) | |
self.__k = k | |
def get_name(self): | |
return r'$\chi^2$ Distribution ($k={}$)'.format(self.__k) | |
def calc_dist_func_xy(self): | |
x = np.linspace(0, 20.0, 100) | |
y = np.exp(-x*0.5) / (2**(self.__k*0.5)) / math.gamma(self.__k*0.5) * np.power(x, self.__k*0.5-1) | |
return x, y | |
def get_average_of_samples(self, n): | |
samples = np.random.chisquare(self.__k, n) | |
return samples.mean() | |
class MyLinearDist(Distribution): | |
"""一次関数型の分布""" | |
def __init__(self, a): | |
self.set_mu(a * 2.0 / 3.0) | |
variance = a * a / 18.0 | |
self.set_sigma(np.sqrt(variance)) | |
self.__x_max = a | |
self.__c = 2.0 / (a*a) # 直線の傾き | |
self.__y_max = 2.0 / a # 分布内の y の最大値 | |
def get_name(self): | |
return 'Self-Made Linear Distribution' | |
def calc_dist_func_xy(self): | |
x = np.linspace(0-self.__x_max*0.5, self.__x_max*1.5, 100) | |
y = np.where((0 <= x) & (x <= self.__x_max), self.__c * x, 0) | |
return x, y | |
def get_average_of_samples(self, n): | |
samples = [] | |
while len(samples) < n: | |
x_tmp = np.random.rand() * self.__x_max | |
y_tmp = np.random.rand() * self.__y_max | |
if y_tmp <= self.__c * x_tmp: | |
samples.append(x_tmp) | |
return np.mean(samples) | |
class MyQuadraticDist(Distribution): | |
"""二次関数型の分布""" | |
def __init__(self, a): | |
self.set_mu(0) | |
variance = a * a * 3.0 / 5.0 | |
self.set_sigma(np.sqrt(variance)) | |
self.__x_max = a | |
self.__c = 3.0 / (2.0*a*a*a) # 二次関数の係数 | |
self.__y_max = 3.0 / (2.0*a) # 分布内の y の最大値 | |
def get_name(self): | |
return 'Self-Made Quadratic Distribution' | |
def calc_dist_func_xy(self): | |
x = np.linspace(-self.__x_max*1.5, self.__x_max*1.5, 100) | |
y = np.where((-self.__x_max <= x) & (x <= self.__x_max), x*x*self.__c, 0) | |
return x, y | |
def get_average_of_samples(self, n): | |
samples = [] | |
while len(samples) < n: | |
x_tmp = (np.random.rand()*2-1.0) * self.__x_max | |
y_tmp = np.random.rand() * self.__y_max | |
if y_tmp <= self.__c * x_tmp * x_tmp: | |
samples.append(x_tmp) | |
return np.mean(samples) | |
def func(dist): | |
""" | |
与えられた確率分布について色々な標本サイズで標本平均を繰り返し計算し、 | |
標本平均の分布が正規分布に近づくことをグラフで確認する | |
""" | |
T = 100000 # n個の標本を抽出して平均を取る操作を何度繰り返すか | |
n_samples = [2, 4, 8, 32, 128] # 標本サイズ | |
plt.figure(figsize=(10, 6)) | |
plt.subplots_adjust(wspace=0.3, hspace=0.4) | |
# 母集団の確率密度関数を描画 | |
plt.subplot(2, 3, 1) | |
x, y = dist.calc_dist_func_xy() | |
plt.title('Distribution Function') | |
plt.grid() | |
plt.fill_between(x, np.zeros(x.size), y, facecolor='red', alpha=0.3) | |
plt.plot(x, y, color='black') | |
# 標本を繰り返し抽出して平均値を記録・ヒストグラム描画 | |
for i in range(len(n_samples)): | |
n = n_samples[i] | |
sample_mean = np.empty(T, dtype='float') | |
for t in range(T): | |
sample_mean[t] = dist.get_average_of_samples(n) | |
# ヒストグラム描画のための階級幅を計算 | |
mean_max = sample_mean.max() | |
mean_min = sample_mean.min() | |
bins = np.linspace(mean_min, mean_max, 50) | |
# ヒストグラムを描画 | |
plt.subplot(2, 3, i+2) | |
plt.hist(sample_mean, bins=bins, density=True) | |
# 中心極限定理により収束が期待される正規分布を描画 | |
mu = dist.get_mu() | |
sigma = dist.get_sigma() / np.sqrt(n) | |
x_norm = np.linspace(mean_min-(mean_max-mean_min)*0.2, mean_max+(mean_max-mean_min)*0.2, 100) | |
norm = gauss(x_norm, mu=mu, sigma=sigma) | |
plt.title(r'$n_{{\rm sample}} = {}$'.format(n)) | |
plt.plot(x_norm, norm, lw=1.0, color='red', label=r'$N(\mu, \sigma^2/{})$'.format(n)) | |
plt.legend() | |
plt.suptitle('Sample Mean of ' + dist.get_name()) | |
plt.show() | |
func(UniformDist(0, 6.0)) | |
func(BetaDist(9, 3)) | |
func(BetaDist(2, 2)) | |
func(ChiSquaredDist(2)) | |
func(ChiSquaredDist(5)) | |
func(MyLinearDist(1.0)) | |
func(MyQuadraticDist(1.0)) |
Author
hkawabata
commented
Feb 7, 2023
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment