Skip to content

Instantly share code, notes, and snippets.

@matsuken92
Last active January 23, 2022 14:45
Show Gist options
  • Save matsuken92/acff9fc19b680531792c to your computer and use it in GitHub Desktop.
Save matsuken92/acff9fc19b680531792c to your computer and use it in GitHub Desktop.
Describe and explain Q-Q plot
%matplotlib inline
import sys
import matplotlib.pyplot as plt
from matplotlib import animation as ani
import numpy as np
import pandas as pd
import scipy.stats as st
from scipy.special import ndtri
# Data import: read the data file (pd.read_table splits on tabs by default)
# and relabel its columns.
df = pd.read_table('Mansion2.data')
df2 = pd.DataFrame(
    df.values,
    columns=['Walk_min', 'distance', 'Price', 'Type', 'Area', 'Direction', 'Year'],
)
data_size = len(df2)

# 2x2 overview figure: price histogram plus pairwise scatter plots.
plt.figure(figsize=(12, 11))
price = df2['Price']
mins = df2['Walk_min']
area = df2['Area']

# Top-left: histogram of the prices.
plt.subplot(221)
plt.hist(price, bins=20)
plt.title("Histogram of House Price")
plt.xlabel("Price")
plt.ylabel("Count")

# Top-right: price against area.
plt.subplot(222)
plt.title("Scatter plot (Price - Area)")
plt.xlim(6000, 20000)
plt.ylabel("Area")
plt.xlabel("Price")
plt.scatter(price, area)

# Bottom-left: price against walking minutes.
plt.subplot(223)
plt.title("Scatter plot (Price - Walk mins)")
plt.scatter(price, mins)
plt.xlim(6000, 20000)
plt.xlabel("Price")
plt.ylabel("Walk mins")

# Bottom-right: area against walking minutes.
plt.subplot(224)
plt.title("Scatter plot (Area - Walk mins)")
plt.scatter(area, mins)
plt.xlabel("Area")
plt.ylabel("Walk mins")
plt.show()
# Compare the price histogram with a normal density whose mean and variance
# are taken from the data.
mu_p = np.mean(price)
var_p = np.var(price)
xx = np.linspace(min(price), max(price), 300)
x_density = st.norm.pdf(xx, loc=mu_p, scale=np.sqrt(var_p))

plt.figure(figsize=(8, 6))
plt.hist(price, bins=20)
plt.title("Histogram of Price")
plt.xlabel("Price")
# The density is drawn on a second y-axis because its scale differs from
# the histogram counts.
ax = plt.twinx()
ax.plot(xx, x_density, "red", linewidth=2, zorder=300)
plt.show()
# Compare the cumulative price histogram with the cumulative normal
# distribution (same mean and variance as the data).
xx = np.linspace(min(price) - 1000, max(price), 300)
x_cdensity = st.norm.cdf(xx, loc=mu_p, scale=np.sqrt(var_p))

plt.figure(figsize=(8, 6))
plt.xlim(min(price) - 1000, max(price))
# The cumulative count tops out at the number of samples, so the count axis
# is tied to data_size rather than a hard-coded sample count.
plt.ylim(0, data_size)
plt.hist(price, bins=20, cumulative=True, histtype='step')
plt.title("Histogram of Price (Cumulative)")
plt.xlabel("Price")
# Second y-axis in [0, 1] for the CDF, sharing the same x-range.
ax = plt.twinx()
ax.set_xlim(min(price) - 1000, max(price))
ax.set_ylim(0, 1)
ax.plot(xx, x_cdensity, "red", linewidth=2, zorder=300)
plt.show()
# Rent: prices in ascending order plotted against evenly spaced positions
# in [0, 1].
# The sort is done here because this section runs before any other
# assignment of price_ordered; without it the scatter call raises NameError.
price_ordered = np.sort(price)
plt.figure(figsize=(7, 6))
plt.xlim(0, 1)
plt.ylim(5900, 19500)
plt.title("House Price(sorted)", size=13)
plt.scatter(np.linspace(0, 1, data_size), price_ordered)
plt.grid(True)
# Standard normal cumulative distribution function, sampled at data_size
# evenly spaced points on [-3, 3].
_z_grid = np.linspace(-3, 3, data_size)
plt.figure(figsize=(7, 6))
plt.title("Cumulative Norm Dist", size=13)
plt.xlim(-3, 3)
plt.ylim(0, 1)
plt.scatter(_z_grid, st.norm.cdf(_z_grid))
plt.grid(True)
# Rent: Q-Q plot of the prices against the standard normal distribution.
# Sort the prices in ascending order.
price_ordered = np.sort(price)
# Inverse of the standard normal CDF, evaluated at data_size evenly spaced
# probabilities covering [0, 1]. The end points 0 and 1 map to -inf and
# +inf, so the smallest and largest samples fall outside the plotted x-range.
inv = ndtri(np.linspace(0, 1, data_size))

# Start a fresh figure so the Q-Q plot is not drawn on top of whatever
# figure happens to be current.
plt.figure(figsize=(7, 6))
plt.title("Q-Q Plot", size=13)
plt.xlabel("Theoretical Quantiles")
plt.ylabel("Price")
plt.ylim(5900, 20000)
plt.xlim(-3, 3)
plt.scatter(inv, price_ordered)
plt.show()

# Sample that animate() reads when rendering the animation frames.
data = price_ordered
def animate(nframe):
    """Draw frame ``nframe`` of the Q-Q plot walkthrough on the current figure.

    Reads the module-level sample ``data`` (expected to be sorted in
    ascending order) and the module-level frame count ``num_frame``. The
    current figure is cleared and four panels are redrawn, with one sample
    (chosen from the frame number) highlighted in red:

    * 222 -- the data against evenly spaced positions in [0, 1]
    * 223 -- the standard normal CDF
    * 221 -- the Q-Q plot (normal quantiles against the data)
    * 224 -- a unit-square helper panel

    Args:
        nframe: Zero-based frame number supplied by ``FuncAnimation``.

    Returns:
        None. No artists are returned, so this callback cannot be used with
        blitting.
    """
    # Progress indicator, e.g. "42%, ".
    sys.stdout.write(str(int(float(nframe) / num_frame * 100)) + "%, ")

    n_points = len(data)
    # The first 90 frames advance two samples per frame; later frames
    # advance one sample per frame.
    if nframe < 90:
        ind = nframe * 2
    else:
        ind = 90 + nframe
    # Never index past the last sample when the data set is shorter than the
    # frame schedule assumes.
    ind = min(ind, n_points - 1)

    plt.clf()

    # Positions in [0, 1] and the matching standard normal quantiles are
    # derived from the sample itself so every array has the same length.
    xx1 = np.linspace(0, 1, n_points)
    quantiles = ndtri(xx1)
    prob = xx1[ind]
    value = data[ind]
    inv_norm = quantiles[ind]
    lo, hi = min(data), max(data)

    # Panel 222: the data in ascending order.
    plt.subplot(222)
    plt.xlim(0, 1)
    plt.ylim(lo, hi)
    plt.scatter(xx1, data)
    plt.scatter(prob, value, color='red', s=100, zorder=300)
    # The vertical marker spans the visible data range instead of a fixed
    # 0..20000 interval.
    plt.plot([prob, prob], [lo, hi], "k", linewidth=2)
    plt.plot([0, 1], [value, value], "k--", linewidth=1)
    plt.title("Data(sorted)=%d" % value, size=13)
    plt.grid(True)

    # Panel 223: the standard normal cumulative distribution.
    plt.subplot(223)
    xx2 = np.linspace(-3, 3, n_points)
    plt.xlim(-3, 3)
    plt.ylim(0, 1)
    plt.scatter(xx2, st.norm.cdf(xx2))
    plt.scatter(inv_norm, prob, color='red', s=100, zorder=300)
    plt.plot([-3, 3], [prob, prob], "k", linewidth=2)
    plt.plot([inv_norm, inv_norm], [0, 1], "k--", linewidth=1)
    plt.title("Cumulative Norm Dist x=%.3f" % inv_norm, size=13)
    plt.grid(True)

    # Panel 221: the Q-Q plot.
    plt.subplot(221)
    plt.title(u"Q-Q Plot (%.3f, %d)" % (inv_norm, value), size=13)
    plt.ylim(lo, hi)
    plt.xlim(-3, 3)
    plt.scatter(quantiles, data)
    plt.scatter(inv_norm, value, color='red', s=100, zorder=300)
    plt.plot([-3, 3], [value, value], "k--", linewidth=1)
    plt.plot([inv_norm, inv_norm], [lo, hi], "k--", linewidth=1)
    plt.grid(True)
    # Diagonal reference line from corner to corner of the panel.
    plt.plot([-3, 3], [lo, hi])

    # Panel 224: information area on the unit square.
    plt.subplot(224)
    plt.xlim(0, 1)
    plt.ylim(0, 1)
    plt.title("(%.3f,%.3f)" % (prob, prob))
    plt.plot([0, 1], [0, 1])
    plt.plot([0, prob], [prob, prob], "k", linewidth=2)
    plt.plot([prob, prob], [prob, 1], "k", linewidth=2)
# Histogram of the current sample, then render the Q-Q animation for it.
plt.hist(data, bins=20)
plt.show()
num_frame = 98
fig = plt.figure(figsize=(10, 10))
# animate() clears and redraws the whole figure and returns no artists, so
# blitting has to be off; with blit=True FuncAnimation requires the callback
# to return a sequence of artists.
anim = ani.FuncAnimation(fig, animate, frames=num_frame, blit=False)
anim.save('Q-Q_plot_House_price.gif', writer='imagemagick', fps=5, dpi=64)
# Normal distribution
# The sample size follows data_size so it matches the other arrays that are
# sized from the data set.
data = np.random.normal(loc=10, scale=3, size=data_size)
data = np.sort(data)
plt.hist(data, bins=20)
plt.show()
num_frame = 98
fig = plt.figure(figsize=(10, 10))
# animate() redraws the whole figure and returns no artists, so blitting has
# to be off.
anim = ani.FuncAnimation(fig, animate, frames=num_frame, blit=False)
anim.save('Q-Q_plot_Norm.gif', writer='imagemagick', fps=5, dpi=64)
# ----------------------
# Exponential distribution (rate lam, i.e. scale 1/lam)
lam = 0.1
# The sample size follows data_size so it matches the other arrays that are
# sized from the data set.
data = np.random.exponential(1. / lam, size=data_size)
data = np.sort(data)
plt.hist(data, bins=20)
plt.show()
num_frame = 98
fig = plt.figure(figsize=(10, 10))
# animate() redraws the whole figure and returns no artists, so blitting has
# to be off.
anim = ani.FuncAnimation(fig, animate, frames=num_frame, blit=False)
anim.save('Q-Q_plot_Exp_Dist.gif', writer='imagemagick', fps=5, dpi=64)
# --------------------------
# F distribution
# The sample size follows data_size so it matches the other arrays that are
# sized from the data set.
data = np.random.f(40, 50, data_size)
data = np.sort(data)
plt.hist(data, bins=20)
plt.show()
num_frame = 98
fig = plt.figure(figsize=(10, 10))
# animate() redraws the whole figure and returns no artists, so blitting has
# to be off.
anim = ani.FuncAnimation(fig, animate, frames=num_frame, blit=False)
anim.save('Q-Q_plot_F_Dist.gif', writer='imagemagick', fps=5, dpi=64)
# --------------------------
# Beta distribution 1: Beta(6, 2)
# The sample size follows data_size so it matches the other arrays that are
# sized from the data set.
data = np.random.beta(6, 2, data_size)
data = np.sort(data)
plt.hist(data, bins=20)
plt.show()
num_frame = 98
fig = plt.figure(figsize=(10, 10))
# animate() redraws the whole figure and returns no artists, so blitting has
# to be off.
anim = ani.FuncAnimation(fig, animate, frames=num_frame, blit=False)
anim.save('Q-Q_plot_Beta_Dist.gif', writer='imagemagick', fps=5, dpi=64)
# --------------------------
# Beta distribution 2: Beta(0.5, 0.5)
# The sample size follows data_size so it matches the other arrays that are
# sized from the data set.
data = np.random.beta(0.5, 0.5, data_size)
data = np.sort(data)
plt.hist(data, bins=20)
plt.show()
num_frame = 98
fig = plt.figure(figsize=(10, 10))
# animate() redraws the whole figure and returns no artists, so blitting has
# to be off.
anim = ani.FuncAnimation(fig, animate, frames=num_frame, blit=False)
anim.save('Q-Q_plot_Beta_Dist2.gif', writer='imagemagick', fps=5, dpi=64)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment