Skip to content

Instantly share code, notes, and snippets.

@yosimox
Created July 26, 2013 06:27
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save yosimox/6086725 to your computer and use it in GitHub Desktop.
Save yosimox/6086725 to your computer and use it in GitHub Desktop.
pandasとscipyでクロス集計表のカイ2乗検定+残差分析。調整済み標準化残差まで出せるように。
# -*- coding: utf-8 -*-
import numpy as np
import pandas as pd
import scipy as sp
import scipy.stats
from pandas import DataFrame, Series
def chi_sq_test(df):
    """Run a chi-square test of independence and a residual analysis.

    Args:
        df: Contingency table of observed counts as a pandas DataFrame.

    Returns:
        dict with the following keys (every DataFrame carries the index
        and columns of ``df``):
            "data"         -- the observed table that was passed in
            "p_val"        -- p-value from ``chi2_contingency``
            "df_exp"       -- expected frequencies
            "df_res"       -- raw residuals (observed - expected)
            "df_per"       -- row proportions (cell / row total)
            "df_res_var"   -- residual variance, (1 - row/N) * (1 - col/N)
            "df_res_final" -- adjusted standardized residuals
    """
    # chi2_contingency yields (chi-square statistic, p-value, degrees of
    # freedom, expected frequencies); only items 1 and 3 are used here.
    df_chi = sp.stats.chi2_contingency(df)

    # Label the expected frequencies like the observed table so that the
    # element-wise arithmetic below aligns cell by cell.
    df_exp = DataFrame(df_chi[3], index=df.index, columns=df.columns)

    # Raw residuals.
    df_res = df - df_exp

    # Marginal totals and grand total (float, so every ratio is a true
    # division even for integer counts).
    row_sum = df.sum(axis=1)
    col_sum = df.sum(axis=0)
    full_sum = float(row_sum.sum())

    # Row proportions: each cell divided by the total of its row.
    df_per = df.div(row_sum.astype(float), axis=0)

    # Residual variance for cell (r, c): (1 - row_r / N) * (1 - col_c / N).
    df_res_var = DataFrame(
        np.outer(1 - row_sum / full_sum, 1 - col_sum / full_sum),
        index=df.index,
        columns=df.columns,
    )

    # Adjusted standardized residual: residual / sqrt(expected * variance).
    df_res_final = df_res / np.sqrt(df_exp * df_res_var)

    return {
        "data": df,
        "p_val": df_chi[1],
        "df_exp": df_exp,
        "df_res": df_res,
        "df_per": df_per,
        "df_res_var": df_res_var,
        "df_res_final": df_res_final,
    }
#データ出力用
def print_chisq(res, verbose=False):
    """Print the result dictionary produced by ``chi_sq_test``.

    Args:
        res: Mapping with the keys "data", "df_per", "df_res_final" and
            "p_val" (plus "df_exp", "df_res" and "df_res_var" when
            ``verbose`` is true).
        verbose: When true, also print the expected frequencies, the raw
            residuals and the residual variances. Defaults to False,
            which gives the same output as before this option existed.
    """
    # Single-argument print(...) calls produce the same output on
    # Python 2 and Python 3.
    print("data:")
    print(res["data"])
    print("\npercentile:")
    print(res["df_per"])
    if verbose:
        # Intermediate tables of the residual analysis.
        print("\nexpectation:")
        print(res["df_exp"])
        print("\nresiduals:")
        print(res["df_res"])
        print("\nresiduals_var:")
        print(res["df_res_var"])
    # Adjusted standardized residuals (label kept as in the original output).
    print("\nchouseizumi_hyoujunnka_zansa:")
    print(res["df_res_final"])
    print("\np_value:")
    print(res["p_val"])
#データ定義
# Sample contingency table: five row categories by four column categories.
data = DataFrame(
    [
        [30, 6, 23, 42],
        [23, 10, 8, 8],
        [32, 12, 2, 5],
        [32, 42, 2, 2],
        [33, 33, 2, 3],
    ],
    index=["01", "02", "03", "04", "05"],
    columns=["a", "b", "c", "d"],
)

if __name__ == "__main__":
    # Call the functions this file actually defines; the previous names
    # (chi_sq / print_all) do not exist and raised NameError.
    res = chi_sq_test(data)
    print_chisq(res)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment