ruseel/compare.py

## compare.py
import random
import statsmodels
import pmap

groups = ["control", "treatment"]
POPULATION = 1e6
SAMPLE = 1e4
TRIES_PER_PERSON = 3


def population(size=None):
    """Build a synthetic population mapping person id -> propensity.

    Args:
        size: Number of people to generate. When omitted, the module-level
            ``POPULATION`` constant is used.

    Returns:
        dict: ids ``0 .. size - 1``, each mapped to a float drawn from
        ``random.random()``.
    """
    if size is None:
        size = POPULATION
    # POPULATION is written as a float literal (1e6), so coerce for range().
    return {person_id: random.random() for person_id in range(int(size))}


def predetermined_group(id):
    """Pick a group from the parity of ``id``.

    Even ids select ``groups[0]`` and odd ids select ``groups[1]``, so the
    assignment depends only on the id and never on a random draw.
    """
    parity = id % 2
    return groups[parity]


def justintime_group(id):
    """Pick a group with a fresh coin flip; ``id`` is accepted but unused.

    Returns ``groups[1]`` when the draw is below 0.5, else ``groups[0]``.
    """
    below_half = random.random() < 0.5
    return groups[1] if below_half else groups[0]


def trial(_pop, fn_group, sample_size=None, tries=None, threshold=0.9):
    """Run one simulated experiment and tally observations per group.

    A random sample of people is drawn from ``_pop``; each sampled person is
    assigned to a group once and then contributes ``tries`` observations,
    all of which count as a success when the person's feature exceeds
    ``threshold``.

    Args:
        _pop: Mapping of person id -> feature value.
        fn_group: Callable taking a person id and returning ``'control'``
            or ``'treatment'``.
        sample_size: Number of people to draw. Defaults to the module-level
            ``SAMPLE`` constant.
        tries: Observations recorded per person. Defaults to the
            module-level ``TRIES_PER_PERSON`` constant.
        threshold: Feature value above which an observation is a success.

    Returns:
        dict: ``{"count": {...}, "nobs": {...}}`` with per-group success
        counts and per-group observation counts.

    Raises:
        ValueError: Propagated from ``random.sample`` when ``sample_size``
            exceeds the population size.
    """
    if sample_size is None:
        sample_size = SAMPLE
    if tries is None:
        tries = TRIES_PER_PERSON
    # A per-person loop over range(tries) adds nothing for tries <= 0;
    # clamp so the arithmetic form below behaves the same way.
    repeats = max(tries, 0)

    # SAMPLE is a float literal (1e4); random.sample needs an int.
    _sample = random.sample(list(_pop.items()), int(sample_size))

    count = {'control': 0, 'treatment': 0}
    nobs = {'control': 0, 'treatment': 0}
    for person_id, feature in _sample:
        group = fn_group(person_id)
        # Every repeat of the same person has the same outcome, so the
        # repeats can be added in one step instead of looping.
        nobs[group] += repeats
        if feature > threshold:
            count[group] += repeats
    return {"count": count, "nobs": nobs}


from statsmodels.stats.proportion import proportions_ztest


def pval(x):
    """Return the p-value of ``proportions_ztest`` for one trial result.

    ``x`` must provide ``x["count"]`` and ``x["nobs"]``, each indexed by
    ``"control"`` and ``"treatment"``; control is passed first.
    """
    arms = ("control", "treatment")
    successes = [x["count"][arm] for arm in arms]
    totals = [x["nobs"][arm] for arm in arms]
    _z_stat, p_value = proportions_ztest(successes, totals)
    return p_value


import sys
import pprint


def pval_trials():
    """Generate one population and run 100 paired trials against it.

    Returns:
        list: 100 tuples ``(p_predetermined, p_justintime)``; within each
        pair the predetermined-split trial runs before the just-in-time one.
    """
    people = population()
    return [
        (pval(trial(people, predetermined_group)),
         pval(trial(people, justintime_group)))
        for _ in range(100)
    ]


def main():
    """Run one batch of trials and print the significant-result counts.

    Prints two integers: how many p-values fell below 0.05 for the
    predetermined split and for the just-in-time split, then flushes stdout.
    """
    predetermined_hits = 0
    justintime_hits = 0
    for p_predetermined, p_justintime in pval_trials():
        predetermined_hits += int(p_predetermined < 0.05)
        justintime_hits += int(p_justintime < 0.05)
    print(predetermined_hits, justintime_hits)
    sys.stdout.flush()


# Script driver: runs at module import time (there is no __main__ guard).
# Each main() call builds a new population and prints one line of counts,
# so this emits 100 lines in total.
for _ in range(100):
    main()

## group_spliting.md

      
    Raw
  

              group_spliting.md
            
          
    100만명(1e6)의 사람이 있다. 각각의 사람이 1~1e6 까지의 id를 가진다.

사람마다 성향X 가 있다고 하자. 성향X는 [0,1) 의 값을 가지고 random 한 값이다.
이 그룹에서 만명(1e4)을 랜덤하게 골라서
실험e 에 투입한다고 하자.
실험e는 A/A 테스트이고
하나의 observation에 한 사람이 세 번씩(코드의 TRIES_PER_PERSON = 3) 참여한다.
각각의 참여를 모두 기록한다.
성향X의 값이 0.9가 넘어가면 실험결과가 구입이고 아니면 미구입 이라고 하자. ??
그룹을 나누는 방법은 두 가지이다.

Id % 10이 홀수이면 실험군, 짝수이면 대조군으로 한다. (미리 random 하게 나누는 방법)
(미리 random 하게 나누는 방법)을 쓰지 않고 실험e 에 투입된 사람만 랜덤하게 실험군, 대조군으로 나눈다 (just-in-time 나누기)

"두 group에서 conversion rate간에 차이가 없다"가 귀무가설이다.
결과에 대해 two-sample-proportion test 를 (from statsmodels.stats.proportion import proportions_ztest)했을 때
P-value가 0.05보다 작으면 가설을 기각할만한 증거가 된다.
그러면 차이가 있다고 주장할 수 있다.
이 실험을 n번 돌려봤을 때
차이가 있다라고 주장할 수 있는 경우가
몇 번인가?
양쪽에 차이가 있나?
두 방법이 차이가 있었으면 싶었지만,, 차이가 없었다.
	import random
	import statsmodels
	import pmap

	groups = ["control", "treatment"]
	POPULATION = 1e6
	SAMPLE = 1e4
	TRIES_PER_PERSON = 3


	def population():
	    """Build a dict mapping ids 0..POPULATION-1 to random.random() draws."""
	    h = {}
	    for i in range(int(POPULATION)):
	        v = random.random()
	        h[i] = (v)
	    return h


	def predetermined_group(id):
	    """Pick groups[0] for even ids and groups[1] for odd ids."""
	    return groups[id % 2]


	def justintime_group(id):
	    """Pick a group by a fresh random draw; ``id`` is not used."""
	    return groups[int(random.random() < 0.5)]


	def trial(_pop, fn_group):
	    """Sample SAMPLE people and tally successes/observations per group.

	    Each sampled person is assigned a group once via ``fn_group`` and
	    then recorded TRIES_PER_PERSON times; an observation is a success
	    when the person's feature is above 0.9.
	    """
	    _sample = random.sample(list(_pop.items()), int(SAMPLE))

	    count = {'control': 0, 'treatment': 0}
	    nobs = {'control': 0, 'treatment': 0}
	    for (person_id, feature) in _sample:
	        group = fn_group(person_id)
	        for _ in range(TRIES_PER_PERSON):
	            nobs[group] += 1
	            if feature > 0.9:
	                count[group] += 1
	    return {"count": count, "nobs": nobs}


	from statsmodels.stats.proportion import proportions_ztest


	def pval(x):
	    """Return the proportions_ztest p-value for one trial result ``x``."""
	    counts = [x["count"]["control"], x["count"]["treatment"]]
	    nobs = [x["nobs"]["control"], x["nobs"]["treatment"]]
	    stat, pval = proportions_ztest(counts, nobs)
	    return pval


	import sys
	import pprint


	def pval_trials():
	    """Run 100 paired trials on one population; return the p-value pairs."""
	    _pop = population()
	    lst = []
	    for _ in range(100):
	        pvals = (pval(trial(_pop, predetermined_group)),
	                 pval(trial(_pop, justintime_group)))
	        lst.append(pvals)
	    return lst


	def main():
	    """Print how many p-values fell below 0.05 for each split method."""
	    pvals = pval_trials()
	    counts = list((int(a < 0.05), int(b < 0.05)) for a, b in pvals)
	    print(sum(a for (a, b) in counts), sum(b for (a, b) in counts))
	    sys.stdout.flush()


	# Script driver: repeat the whole experiment 100 times, one line each.
	for _ in range(100):
	    main()