hkawabata/central-limit-theorem.py Secret

## central-limit-theorem.py
import numpy as np
import math
from matplotlib import pyplot as plt
from abc import ABC, abstractmethod

def ln_factorial(n):
	"""階乗の log を計算するための補助関数"""
	ret = 0
	for i in range(1, n+1):
		ret += np.log(i)
	return ret

def gauss(x, mu, sigma):
	"""x を引数として平均 mu, 標準偏差 sigma の正規分布の確率密度関数を計算"""
	ret = np.exp(- (x-mu)**2 * 0.5 / sigma**2)
	ret /= np.sqrt(2.0 * np.pi) * sigma
	return ret

class Distribution(ABC):
	"""一般的な確率分布を表すクラス"""
	@abstractmethod
	def get_name(self):
		pass

	@abstractmethod
	def calc_dist_func_xy(self):
		"""確率密度関数描画のための x, y を返す"""
		pass

	@abstractmethod
	def get_average_of_samples(self, n):
		"""n 件の標本をランダムに抽出し、平均値を返す"""
		pass

	def get_mu(self):
		return self.__mu

	def get_sigma(self):
		return self.__sigma

	def set_mu(self, mu):
		self.__mu = mu

	def set_sigma(self, sigma):
		self.__sigma = sigma


class UniformDist(Distribution):
	"""一様分布"""
	def __init__(self, a, b):
		self.set_mu((a + b) * 0.5)
		variance = (b-a)**2/12.0
		self.set_sigma(np.sqrt(variance))
		self.__a = a
		self.__b = b

	def get_name(self):
		return 'Uniform Distribution ($a={},b={}$)'.format(self.__a, self.__b)

	def calc_dist_func_xy(self):
		w = self.__b - self.__a
		x = np.linspace(self.__a-w, self.__b+w, 300)
		y = np.where((self.__a <= x) & (x <= self.__b), 1.0/w, 0)
		return x, y

	def get_average_of_samples(self, n):
		samples = np.random.rand(n) * (self.__b - self.__a) + self.__a
		return samples.mean()


class BetaDist(Distribution):
	"""ベータ分布"""
	def __init__(self, a, b):
		self.set_mu(a/(a+b))
		variance = a*b/(a+b+1)/(a+b)**2
		self.set_sigma(np.sqrt(variance))
		self.__a = a
		self.__b = b

	def get_name(self):
		return r'Beta Distribution ($\alpha={}, \beta={}$)'.format(self.__a, self.__b)

	def calc_dist_func_xy(self):
		def beta(p):
			ret = np.zeros(p.shape)
			for i in range(len(p)):
				if 0 < p[i] < 1:
					log_beta = (self.__a-1) * np.log(p[i]) + (self.__b-1) * np.log(1-p[i]) + ln_factorial(self.__a+self.__b-1) - ln_factorial(self.__a-1) - ln_factorial(self.__b-1)
					ret[i] = np.exp(log_beta)
			return ret
		x = np.linspace(0, 1.0, 100)
		y = beta(x)
		return x, y

	def get_average_of_samples(self, n):
		samples = np.random.beta(self.__a, self.__b, n)
		return samples.mean()


class ChiSquaredDist(Distribution):
	"""カイ二乗分布"""
	def __init__(self, k):
		self.set_mu(k)
		variance = 2 * k
		self.set_sigma(np.sqrt(variance))
		self.__k = k

	def get_name(self):
		return r'$\chi^2$ Distribution ($k={}$)'.format(self.__k)

	def calc_dist_func_xy(self):
		x = np.linspace(0, 20.0, 100)
		y = np.exp(-x*0.5) / (2**(self.__k*0.5)) / math.gamma(self.__k*0.5) * np.power(x, self.__k*0.5-1)
		return x, y

	def get_average_of_samples(self, n):
		samples = np.random.chisquare(self.__k, n)
		return samples.mean()


class MyLinearDist(Distribution):
	"""一次関数型の分布"""
	def __init__(self, a):
		self.set_mu(a * 2.0 / 3.0)
		variance = a * a / 18.0
		self.set_sigma(np.sqrt(variance))
		self.__x_max = a
		self.__c = 2.0 / (a*a)  # 直線の傾き
		self.__y_max = 2.0 / a  # 分布内の y の最大値

	def get_name(self):
		return 'Self-Made Linear Distribution'

	def calc_dist_func_xy(self):
		x = np.linspace(0-self.__x_max*0.5, self.__x_max*1.5, 100)
		y = np.where((0 <= x) & (x <= self.__x_max), self.__c * x, 0)
		return x, y

	def get_average_of_samples(self, n):
		samples = []
		while len(samples) < n:
			x_tmp = np.random.rand() * self.__x_max
			y_tmp = np.random.rand() * self.__y_max
			if y_tmp <= self.__c * x_tmp:
				samples.append(x_tmp)
		return np.mean(samples)


class MyQuadraticDist(Distribution):
	"""二次関数型の分布"""
	def __init__(self, a):
		self.set_mu(0)
		variance = a * a * 3.0 / 5.0
		self.set_sigma(np.sqrt(variance))
		self.__x_max = a
		self.__c = 3.0 / (2.0*a*a*a)  # 二次関数の係数
		self.__y_max = 3.0 / (2.0*a)  # 分布内の y の最大値

	def get_name(self):
		return 'Self-Made Quadratic Distribution'

	def calc_dist_func_xy(self):
		x = np.linspace(-self.__x_max*1.5, self.__x_max*1.5, 100)
		y = np.where((-self.__x_max <= x) & (x <= self.__x_max), x*x*self.__c, 0)
		return x, y

	def get_average_of_samples(self, n):
		samples = []
		while len(samples) < n:
			x_tmp = (np.random.rand()*2-1.0) * self.__x_max
			y_tmp = np.random.rand() * self.__y_max
			if y_tmp <= self.__c * x_tmp * x_tmp:
				samples.append(x_tmp)
		return np.mean(samples)


def func(dist):
	"""
	与えられた確率分布について色々な標本サイズで標本平均を繰り返し計算し、
	標本平均の分布が正規分布に近づくことをグラフで確認する
	"""
	T = 100000  # n個の標本を抽出して平均を取る操作を何度繰り返すか
	n_samples = [2, 4, 8, 32, 128]  # 標本サイズ
	plt.figure(figsize=(10, 6))
	plt.subplots_adjust(wspace=0.3, hspace=0.4)
	# 母集団の確率密度関数を描画
	plt.subplot(2, 3, 1)
	x, y = dist.calc_dist_func_xy()
	plt.title('Distribution Function')
	plt.grid()
	plt.fill_between(x, np.zeros(x.size), y, facecolor='red', alpha=0.3)
	plt.plot(x, y, color='black')
	# 標本を繰り返し抽出して平均値を記録・ヒストグラム描画
	for i in range(len(n_samples)):
		n = n_samples[i]
		sample_mean = np.empty(T, dtype='float')
		for t in range(T):
			sample_mean[t] = dist.get_average_of_samples(n)
		# ヒストグラム描画のための階級幅を計算
		mean_max = sample_mean.max()
		mean_min = sample_mean.min()
		bins = np.linspace(mean_min, mean_max, 50)
		# ヒストグラムを描画
		plt.subplot(2, 3, i+2)
		plt.hist(sample_mean, bins=bins, density=True)
		# 中心極限定理により収束が期待される正規分布を描画
		mu = dist.get_mu()
		sigma = dist.get_sigma() / np.sqrt(n)
		x_norm = np.linspace(mean_min-(mean_max-mean_min)*0.2, mean_max+(mean_max-mean_min)*0.2, 100)
		norm = gauss(x_norm, mu=mu, sigma=sigma)
		plt.title(r'$n_{{\rm sample}} = {}$'.format(n))
		plt.plot(x_norm, norm, lw=1.0, color='red', label=r'$N(\mu, \sigma^2/{})$'.format(n))
		plt.legend()
	plt.suptitle('Sample Mean of ' + dist.get_name())
	plt.show()

func(UniformDist(0, 6.0))
func(BetaDist(9, 3))
func(BetaDist(2, 2))
func(ChiSquaredDist(2))
func(ChiSquaredDist(5))
func(MyLinearDist(1.0))
func(MyQuadraticDist(1.0))
	import numpy as np
	import math
	from matplotlib import pyplot as plt
	from abc import ABC, abstractmethod

	def ln_factorial(n):
	"""階乗の log を計算するための補助関数"""
	ret = 0
	for i in range(1, n+1):
	ret += np.log(i)
	return ret

	def gauss(x, mu, sigma):
	"""x を引数として平均 mu, 標準偏差 sigma の正規分布の確率密度関数を計算"""
	ret = np.exp(- (x-mu)*2 0.5 / sigma**2)
	ret /= np.sqrt(2.0 * np.pi) * sigma
	return ret

	class Distribution(ABC):
	"""一般的な確率分布を表すクラス"""
	@abstractmethod
	def get_name(self):
	pass

	@abstractmethod
	def calc_dist_func_xy(self):
	"""確率密度関数描画のための x, y を返す"""
	pass

	@abstractmethod
	def get_average_of_samples(self, n):
	"""n 件の標本をランダムに抽出し、平均値を返す"""
	pass

	def get_mu(self):
	return self.__mu

	def get_sigma(self):
	return self.__sigma

	def set_mu(self, mu):
	self.__mu = mu

	def set_sigma(self, sigma):
	self.__sigma = sigma


	class UniformDist(Distribution):
	"""一様分布"""
	def __init__(self, a, b):
	self.set_mu((a + b) * 0.5)
	variance = (b-a)**2/12.0
	self.set_sigma(np.sqrt(variance))
	self.__a = a
	self.__b = b

	def get_name(self):
	return 'Uniform Distribution ($a={},b={}$)'.format(self.__a, self.__b)

	def calc_dist_func_xy(self):
	w = self.__b - self.__a
	x = np.linspace(self.__a-w, self.__b+w, 300)
	y = np.where((self.__a <= x) & (x <= self.__b), 1.0/w, 0)
	return x, y

	def get_average_of_samples(self, n):
	samples = np.random.rand(n) * (self.__b - self.__a) + self.__a
	return samples.mean()


	class BetaDist(Distribution):
	"""ベータ分布"""
	def __init__(self, a, b):
	self.set_mu(a/(a+b))
	variance = ab/(a+b+1)/(a+b)*2
	self.set_sigma(np.sqrt(variance))
	self.__a = a
	self.__b = b

	def get_name(self):
	return r'Beta Distribution ($\alpha={}, \beta={}$)'.format(self.__a, self.__b)

	def calc_dist_func_xy(self):
	def beta(p):
	ret = np.zeros(p.shape)
	for i in range(len(p)):
	if 0 < p[i] < 1:
	log_beta = (self.__a-1) * np.log(p[i]) + (self.__b-1) * np.log(1-p[i]) + ln_factorial(self.__a+self.__b-1) - ln_factorial(self.__a-1) - ln_factorial(self.__b-1)
	ret[i] = np.exp(log_beta)
	return ret
	x = np.linspace(0, 1.0, 100)
	y = beta(x)
	return x, y

	def get_average_of_samples(self, n):
	samples = np.random.beta(self.__a, self.__b, n)
	return samples.mean()


	class ChiSquaredDist(Distribution):
	"""カイ二乗分布"""
	def __init__(self, k):
	self.set_mu(k)
	variance = 2 * k
	self.set_sigma(np.sqrt(variance))
	self.__k = k

	def get_name(self):
	return r'$\chi^2$ Distribution ($k={}$)'.format(self.__k)

	def calc_dist_func_xy(self):
	x = np.linspace(0, 20.0, 100)
	y = np.exp(-x0.5) / (2(self.__k0.5)) / math.gamma(self.__k0.5) np.power(x, self.__k*0.5-1)
	return x, y

	def get_average_of_samples(self, n):
	samples = np.random.chisquare(self.__k, n)
	return samples.mean()


	class MyLinearDist(Distribution):
	"""一次関数型の分布"""
	def __init__(self, a):
	self.set_mu(a * 2.0 / 3.0)
	variance = a * a / 18.0
	self.set_sigma(np.sqrt(variance))
	self.__x_max = a
	self.__c = 2.0 / (a*a) # 直線の傾き
	self.__y_max = 2.0 / a # 分布内の y の最大値

	def get_name(self):
	return 'Self-Made Linear Distribution'

	def calc_dist_func_xy(self):
	x = np.linspace(0-self.__x_max0.5, self.__x_max1.5, 100)
	y = np.where((0 <= x) & (x <= self.__x_max), self.__c * x, 0)
	return x, y

	def get_average_of_samples(self, n):
	samples = []
	while len(samples) < n:
	x_tmp = np.random.rand() * self.__x_max
	y_tmp = np.random.rand() * self.__y_max
	if y_tmp <= self.__c * x_tmp:
	samples.append(x_tmp)
	return np.mean(samples)


	class MyQuadraticDist(Distribution):
	"""二次関数型の分布"""
	def __init__(self, a):
	self.set_mu(0)
	variance = a * a * 3.0 / 5.0
	self.set_sigma(np.sqrt(variance))
	self.__x_max = a
	self.__c = 3.0 / (2.0aa*a) # 二次関数の係数
	self.__y_max = 3.0 / (2.0*a) # 分布内の y の最大値

	def get_name(self):
	return 'Self-Made Quadratic Distribution'

	def calc_dist_func_xy(self):
	x = np.linspace(-self.__x_max1.5, self.__x_max1.5, 100)
	y = np.where((-self.__x_max <= x) & (x <= self.__x_max), xxself.__c, 0)
	return x, y

	def get_average_of_samples(self, n):
	samples = []
	while len(samples) < n:
	x_tmp = (np.random.rand()2-1.0) self.__x_max
	y_tmp = np.random.rand() * self.__y_max
	if y_tmp <= self.__c * x_tmp * x_tmp:
	samples.append(x_tmp)
	return np.mean(samples)


	def func(dist):
	"""
	与えられた確率分布について色々な標本サイズで標本平均を繰り返し計算し、
	標本平均の分布が正規分布に近づくことをグラフで確認する
	"""
	T = 100000 # n個の標本を抽出して平均を取る操作を何度繰り返すか
	n_samples = [2, 4, 8, 32, 128] # 標本サイズ
	plt.figure(figsize=(10, 6))
	plt.subplots_adjust(wspace=0.3, hspace=0.4)
	# 母集団の確率密度関数を描画
	plt.subplot(2, 3, 1)
	x, y = dist.calc_dist_func_xy()
	plt.title('Distribution Function')
	plt.grid()
	plt.fill_between(x, np.zeros(x.size), y, facecolor='red', alpha=0.3)
	plt.plot(x, y, color='black')
	# 標本を繰り返し抽出して平均値を記録・ヒストグラム描画
	for i in range(len(n_samples)):
	n = n_samples[i]
	sample_mean = np.empty(T, dtype='float')
	for t in range(T):
	sample_mean[t] = dist.get_average_of_samples(n)
	# ヒストグラム描画のための階級幅を計算
	mean_max = sample_mean.max()
	mean_min = sample_mean.min()
	bins = np.linspace(mean_min, mean_max, 50)
	# ヒストグラムを描画
	plt.subplot(2, 3, i+2)
	plt.hist(sample_mean, bins=bins, density=True)
	# 中心極限定理により収束が期待される正規分布を描画
	mu = dist.get_mu()
	sigma = dist.get_sigma() / np.sqrt(n)
	x_norm = np.linspace(mean_min-(mean_max-mean_min)0.2, mean_max+(mean_max-mean_min)0.2, 100)
	norm = gauss(x_norm, mu=mu, sigma=sigma)
	plt.title(r'$n_{{\rm sample}} = {}$'.format(n))
	plt.plot(x_norm, norm, lw=1.0, color='red', label=r'$N(\mu, \sigma^2/{})$'.format(n))
	plt.legend()
	plt.suptitle('Sample Mean of ' + dist.get_name())
	plt.show()

	func(UniformDist(0, 6.0))
	func(BetaDist(9, 3))
	func(BetaDist(2, 2))
	func(ChiSquaredDist(2))
	func(ChiSquaredDist(5))
	func(MyLinearDist(1.0))
	func(MyQuadraticDist(1.0))