nebuta/stat_test.py

## stat_test.py
import csv
import numpy as np
from scipy import stats
import matplotlib.pyplot as plt

import matplotlib
from matplotlib import rc
font = {'family':'IPAexGothic'}
rc('font', **font)


# https://stackoverflow.com/questions/39239087/run-a-chi-square-test-with-observation-and-expectation-counts-and-get-confidence
def diffprop(obs):
    """
    `obs` must be a 2x2 numpy array.

    Returns:
    delta
        The difference in proportions
    ci
        The Wald 95% confidence interval for delta
    corrected_ci
        Yates continuity correction for the 95% confidence interval of delta.
    """
    n1, n2 = obs.sum(axis=1)
    prop1 = obs[0,0] / n1
    prop2 = obs[1,0] / n2
    delta = prop1 - prop2

    # Wald 95% confidence interval for delta
    se = np.sqrt(prop1*(1 - prop1)/n1 + prop2*(1 - prop2)/n2)
    ci = (delta - 1.96*se, delta + 1.96*se)

    # Yates continuity correction for confidence interval of delta
    correction = 0.5*(1/n1 + 1/n2)
    corrected_ci = (ci[0] - correction, ci[1] + correction)

    return delta, ci, corrected_ci


def one_row(r,j):
    # mt: male total, fp: female pass, ff: female fail, etc.
    v = [int(s.replace(',','').replace('-','0')) for s in r[j:j+6]]
    return {'mt': v[0],'ft': v[1], 'mp': v[3], 'fp': v[4], 'mf': v[0] - v[3], 'ff': v[1] - v[4]}


def get_of_year(v):
    return np.array([[v['mp'],v['mf']],[v['fp'],v['ff']]])


def get_confidence_interval(v):
    m_bottom, m_up = stats.binom.interval(alpha=0.95, n=v['mt'], p=v['mp']/v['mt'], loc=0)
    f_bottom, f_up = stats.binom.interval(alpha=0.95, n=v['ft'], p=v['fp']/v['ft'], loc=0)
    return np.array(
        [[v['mp']/v['mt'],m_bottom/v['mt'],m_up/v['mt']],
        [v['fp']/v['ft'],f_bottom/v['ft'],f_up/v['ft']]])


values = {}
with open('data.csv') as f:
    reader = csv.reader(f)
    for _ in range(3):
        next(reader)
    for row in reader:
        if row[3] == '合計':
            vss = [one_row(row,loc) for i,loc in enumerate([4,12,20,28,36,44])]
            vs_total = {'mt': sum([v['mt'] for v in vss]),'ft': sum([v['ft'] for v in vss]),
                'mp': sum([v['mp'] for v in vss]),'fp': sum([v['fp'] for v in vss]),
                'mf': sum([v['mf'] for v in vss]),'ff': sum([v['ff'] for v in vss])}
            vss.append(vs_total)
            values[row[2]] = vss

count = 1
figcount = 1
plt.subplots_adjust(hspace=0.7)
delta_cis = []
names = []
# plt.figure(figsize=(15,12))
for name,vss in values.items():
    for vs,year in zip(vss,[2018,2017,2016,2015,2014,2013,'Total']):
        if year != 'Total':
            continue
        try:
            if count > 12:
                plt.savefig('fig %d.pdf' % figcount)
                plt.clf()
                figcount += 1
                # plt.show()
                # plt.figure(figsize=(15,12))
                plt.subplots_adjust(hspace=0.7)
                count = 1
            obs = get_of_year(vs)
            chi2, p, dof, ex  = stats.chi2_contingency(obs,correction=False)
            print(year,name,'%.3f' % p)
            print(obs)
            m_ci,f_ci = get_confidence_interval(vs)
            delta,delta_ci,_ = diffprop(obs)
            delta_cis.append(delta_ci)
            names.append(name)
            print(delta,delta_ci)
            ax = plt.subplot(3,4,count)
            if count % 4 != 1:
                ax.get_yaxis().set_visible(False)
            plt.bar([0,1],[m_ci[0],f_ci[0]],tick_label=['男','女'],alpha=0.4)
            plt.errorbar([0,1],[m_ci[0],f_ci[0]], yerr=[m_ci[0]-m_ci[1],f_ci[0]-f_ci[1]],fmt='.',markersize=0)
            plt.title(name,fontsize=10)
            plt.ylim([0,0.5])
            count += 1
        except Exception as e:
            print('error',e)

plt.savefig('fig %d.pdf' % figcount)
plt.clf()
figcount += 1
# plt.show()
plt.close()

# plt.figure()
fig, ax = plt.subplots(figsize=(7,14))
ys = [(ds[0]+ds[1])/2 for ds in delta_cis]
yerr = [(ds[1]-ds[0])/2 for ds in delta_cis]

colors = []

for y,n,e in zip(ys,range(len(names)),yerr):
    c = (0,0,0)
    if y - e > 0:
        c = (0,0,1)
    elif y + e < 0:
        c = (1,0,0)
    colors.append(c)
    plt.errorbar(y,n,xerr=e,fmt='.',c=c,elinewidth=4,alpha=0.4)

ax.yaxis.set_ticks(np.arange(len(names)))
plt.plot([0,0],[-1,len(names)],ls='--',c=(0.3,0.3,0.3))
plt.ylim([-1,len(names)])
plt.xlim([-0.15,0.15])
plt.tick_params(labeltop=True,top=True)
ax.set_yticklabels(names,fontsize=6)
for ytick, c in zip(ax.get_yticklabels(),colors):
    ytick.set_color(c)

ax.invert_yaxis()
ax.set_aspect(aspect=0.01)
plt.savefig('all_difference.pdf')
plt.show()
	import csv
	import numpy as np
	from scipy import stats
	import matplotlib.pyplot as plt

	import matplotlib
	from matplotlib import rc
	font = {'family':'IPAexGothic'}
	rc('font', **font)


	# https://stackoverflow.com/questions/39239087/run-a-chi-square-test-with-observation-and-expectation-counts-and-get-confidence
	def diffprop(obs):
	"""
	`obs` must be a 2x2 numpy array.

	Returns:
	delta
	The difference in proportions
	ci
	The Wald 95% confidence interval for delta
	corrected_ci
	Yates continuity correction for the 95% confidence interval of delta.
	"""
	n1, n2 = obs.sum(axis=1)
	prop1 = obs[0,0] / n1
	prop2 = obs[1,0] / n2
	delta = prop1 - prop2

	# Wald 95% confidence interval for delta
	se = np.sqrt(prop1(1 - prop1)/n1 + prop2(1 - prop2)/n2)
	ci = (delta - 1.96se, delta + 1.96se)

	# Yates continuity correction for confidence interval of delta
	correction = 0.5*(1/n1 + 1/n2)
	corrected_ci = (ci[0] - correction, ci[1] + correction)

	return delta, ci, corrected_ci


	def one_row(r,j):
	# mt: male total, fp: female pass, ff: female fail, etc.
	v = [int(s.replace(',','').replace('-','0')) for s in r[j:j+6]]
	return {'mt': v[0],'ft': v[1], 'mp': v[3], 'fp': v[4], 'mf': v[0] - v[3], 'ff': v[1] - v[4]}


	def get_of_year(v):
	return np.array([[v['mp'],v['mf']],[v['fp'],v['ff']]])


	def get_confidence_interval(v):
	m_bottom, m_up = stats.binom.interval(alpha=0.95, n=v['mt'], p=v['mp']/v['mt'], loc=0)
	f_bottom, f_up = stats.binom.interval(alpha=0.95, n=v['ft'], p=v['fp']/v['ft'], loc=0)
	return np.array(
	[[v['mp']/v['mt'],m_bottom/v['mt'],m_up/v['mt']],
	[v['fp']/v['ft'],f_bottom/v['ft'],f_up/v['ft']]])


	values = {}
	with open('data.csv') as f:
	reader = csv.reader(f)
	for _ in range(3):
	next(reader)
	for row in reader:
	if row[3] == '合計':
	vss = [one_row(row,loc) for i,loc in enumerate([4,12,20,28,36,44])]
	vs_total = {'mt': sum([v['mt'] for v in vss]),'ft': sum([v['ft'] for v in vss]),
	'mp': sum([v['mp'] for v in vss]),'fp': sum([v['fp'] for v in vss]),
	'mf': sum([v['mf'] for v in vss]),'ff': sum([v['ff'] for v in vss])}
	vss.append(vs_total)
	values[row[2]] = vss

	count = 1
	figcount = 1
	plt.subplots_adjust(hspace=0.7)
	delta_cis = []
	names = []
	# plt.figure(figsize=(15,12))
	for name,vss in values.items():
	for vs,year in zip(vss,[2018,2017,2016,2015,2014,2013,'Total']):
	if year != 'Total':
	continue
	try:
	if count > 12:
	plt.savefig('fig %d.pdf' % figcount)
	plt.clf()
	figcount += 1
	# plt.show()
	# plt.figure(figsize=(15,12))
	plt.subplots_adjust(hspace=0.7)
	count = 1
	obs = get_of_year(vs)
	chi2, p, dof, ex = stats.chi2_contingency(obs,correction=False)
	print(year,name,'%.3f' % p)
	print(obs)
	m_ci,f_ci = get_confidence_interval(vs)
	delta,delta_ci,_ = diffprop(obs)
	delta_cis.append(delta_ci)
	names.append(name)
	print(delta,delta_ci)
	ax = plt.subplot(3,4,count)
	if count % 4 != 1:
	ax.get_yaxis().set_visible(False)
	plt.bar([0,1],[m_ci[0],f_ci[0]],tick_label=['男','女'],alpha=0.4)
	plt.errorbar([0,1],[m_ci[0],f_ci[0]], yerr=[m_ci[0]-m_ci[1],f_ci[0]-f_ci[1]],fmt='.',markersize=0)
	plt.title(name,fontsize=10)
	plt.ylim([0,0.5])
	count += 1
	except Exception as e:
	print('error',e)

	plt.savefig('fig %d.pdf' % figcount)
	plt.clf()
	figcount += 1
	# plt.show()
	plt.close()

	# plt.figure()
	fig, ax = plt.subplots(figsize=(7,14))
	ys = [(ds[0]+ds[1])/2 for ds in delta_cis]
	yerr = [(ds[1]-ds[0])/2 for ds in delta_cis]

	colors = []

	for y,n,e in zip(ys,range(len(names)),yerr):
	c = (0,0,0)
	if y - e > 0:
	c = (0,0,1)
	elif y + e < 0:
	c = (1,0,0)
	colors.append(c)
	plt.errorbar(y,n,xerr=e,fmt='.',c=c,elinewidth=4,alpha=0.4)

	ax.yaxis.set_ticks(np.arange(len(names)))
	plt.plot([0,0],[-1,len(names)],ls='--',c=(0.3,0.3,0.3))
	plt.ylim([-1,len(names)])
	plt.xlim([-0.15,0.15])
	plt.tick_params(labeltop=True,top=True)
	ax.set_yticklabels(names,fontsize=6)
	for ytick, c in zip(ax.get_yticklabels(),colors):
	ytick.set_color(c)

	ax.invert_yaxis()
	ax.set_aspect(aspect=0.01)
	plt.savefig('all_difference.pdf')
	plt.show()