Skip to content

Instantly share code, notes, and snippets.

@timm
Last active November 6, 2022 19:56
Show Gist options
  • Star 2 You must be signed in to star a gist
  • Fork 1 You must be signed in to fork a gist
  • Save timm/41b3a8790c1adce26d63c5874fbea393 to your computer and use it in GitHub Desktop.
Save timm/41b3a8790c1adce26d63c5874fbea393 to your computer and use it in GitHub Desktop.
Scott Knot with nonparametric effect size and significance test
#!/usr/bin/env python3
# vim: sta:et:sw=2:ts=2:sts=2 :
from copy import deepcopy as kopy
import sys,random
"""
Scott-Knot test + non parametric effect size + significance tests.
Tim Menzies, 2019. Share and enjoy. No warranty. Caveat Emptor.
Accepts data as per the following exmaple (you can ignore the "*n"
stuff, that is just there for the purposes of demos on larger
and larger data)
Ouputs treatments, clustered such that things that have similar
results get the same ranks.
For a demo of this code, just run
python3 sk.py
"""
#-----------------------------------------------------
# Examples
def skDemo(n=5) :
#Rx.data is one way to run the code
return Rx.data( x1 =[ 0.34, 0.49 ,0.51, 0.6]*n,
x2 =[0.6 ,0.7 , 0.8 , 0.89]*n,
x3 =[0.13 ,0.23, 0.38 , 0.38]*n,
x4 =[0.6 ,0.7, 0.8 , 0.9]*n,
x5 =[0.1 ,0.2, 0.3 , 0.4]*n)
"""
Another is to make a file
x1 0.34 0.49 0.51 0.6
x2 0.6 0.7 0.8 0.9
x3 0.15 0.25 0.4 0.35
x4 0.6 0.7 0.8 0.9
x5 0.1 0.2 0.3 0.4
Then call
Rx.fileIn( fileName )
"""
#-----------------------------------------------------
# Config
class o:
def __init__(i,**d) : i.__dict__.update(**d)
class THE:
cliffs = o(dull= [0.147, # small
0.33, # medium
0.474 # large
][0])
bs= o( conf=0.05,
b=500)
mine = o( private="_")
char = o( skip="?")
rx = o( show="%4s %10s %s")
tile = o( width=50,
chops=[0.1 ,0.3,0.5,0.7,0.9],
marks=[" " ,"-","-","-"," "],
bar="|",
star="*",
show=" %5.3f")
#-----------------------------------------------------
def cliffsDeltaSlow(lst1,lst2, dull = THE.cliffs.dull):
"""Returns true if there are more than 'dull' difference.
Warning: O(N)^2."""
n= gt = lt = 0.0
for x in lst1:
for y in lst2:
n += 1
if x > y: gt += 1
if x < y: lt += 1
return abs(lt - gt)/n <= dull
def cliffsDelta(lst1, lst2, dull=THE.cliffs.dull):
"By pre-soring the lists, this cliffsDelta runs in NlogN time"
def runs(lst):
for j,two in enumerate(lst):
if j == 0: one,i = two,0
if one!=two:
yield j - i,one
i = j
one=two
yield j - i + 1,two
#---------------------
m, n = len(lst1), len(lst2)
lst2 = sorted(lst2)
j = more = less = 0
for repeats,x in runs(sorted(lst1)):
while j <= (n - 1) and lst2[j] < x: j += 1
more += j*repeats
while j <= (n - 1) and lst2[j] == x: j += 1
less += (n - j)*repeats
d= (more - less) / (m*n)
return abs(d) <= dull
def bootstrap(y0,z0,conf=THE.bs.conf,b=THE.bs.b):
"""
two lists y0,z0 are the same if the same patterns can be seen in all of them, as well
as in 100s to 1000s sub-samples from each.
From p220 to 223 of the Efron text 'introduction to the boostrap'.
Typically, conf=0.05 and b is 100s to 1000s.
"""
class Sum():
def __init__(i,some=[]):
i.sum = i.n = i.mu = 0 ; i.all=[]
for one in some: i.put(one)
def put(i,x):
i.all.append(x);
i.sum +=x; i.n += 1; i.mu = float(i.sum)/i.n
def __add__(i1,i2): return Sum(i1.all + i2.all)
def testStatistic(y,z):
tmp1 = tmp2 = 0
for y1 in y.all: tmp1 += (y1 - y.mu)**2
for z1 in z.all: tmp2 += (z1 - z.mu)**2
s1 = float(tmp1)/(y.n - 1)
s2 = float(tmp2)/(z.n - 1)
delta = z.mu - y.mu
if s1+s2:
delta = delta/((s1/y.n + s2/z.n)**0.5)
return delta
def one(lst): return lst[ int(any(len(lst))) ]
def any(n) : return random.uniform(0,n)
y,z = Sum(y0), Sum(z0)
x = y + z
baseline = testStatistic(y,z)
yhat = [y1 - y.mu + x.mu for y1 in y.all]
zhat = [z1 - z.mu + x.mu for z1 in z.all]
bigger = 0
for i in range(b):
if testStatistic(Sum([one(yhat) for _ in yhat]),
Sum([one(zhat) for _ in zhat])) > baseline:
bigger += 1
return bigger / b >= conf
#-------------------------------------------------------
# misc functions
def same(x): return x
class Mine:
"class that, amongst other times, pretty prints objects"
oid = 0
def identify(i):
Mine.oid += 1
i.oid = Mine.oid
return i.oid
def __repr__(i):
pairs = sorted([(k, v) for k, v in i.__dict__.items()
if k[0] != THE.mine.private])
pre = i.__class__.__name__ + '{'
def q(z):
if isinstance(z,str): return "'%s'" % z
if callable(z): return "fun(%s)" % z.__name__
return str(z)
return pre + ", ".join(['%s=%s' % (k, q(v))])
#-------------------------------------------------------
class Rx(Mine):
"place to manage pairs of (TreatmentName,ListofResults)"
def __init__(i, rx="",vals=[], key=same):
i.rx = rx
i.vals = sorted([x for x in vals if x != THE.char.skip])
i.n = len(i.vals)
i.med = i.vals[int(i.n/2)]
i.mu = sum(i.vals)/i.n
i.rank = 1
def tiles(i,lo=0,hi=1): return xtile(i.vals,lo,hi)
def __lt__(i,j): return i.med < j.med
def __eq__(i,j):
return cliffsDelta(i.vals,j.vals) and \
bootstrap(i.vals,j.vals)
def __repr__(i):
return '%4s %10s %s' % (i.rank, i.rx, i.tiles())
def xpect(i,j,b4):
"Expected value of difference in emans before and after a split"
n = i.n + j.n
return i.n/n * (b4.med- i.med)**2 + j.n/n * (j.med-b4.med)**2
#-- end instance methods --------------------------
@staticmethod
def data(**d):
"convert dictionary to list of treatments"
return [Rx(k,v) for k,v in d.items()]
@staticmethod
def fileIn(f,lo=0,hi=1):
d={}
what=None
for word in words(f):
x = thing(word)
if isinstance(x,str):
what=x
d[what] = d.get(what,[])
else:
d[what] += [x]
Rx.show(Rx.sk(Rx.data(**d)),lo=lo,hi=hi)
@staticmethod
def sum(rxs):
"make a new rx from all the rxs' vals"
all = []
for rx in rxs:
for val in rx.vals:
all += [val]
return Rx(vals=all)
@staticmethod
def show(rxs,lo=0,hi=1):
"pretty print set of treatments"
tmp=Rx.sum(rxs)
lo,hi=tmp.vals[0], tmp.vals[-1]
for rx in sorted(rxs):
print(THE.rx.show % (rx.rank, rx.rx,
rx.tiles(lo=lo,hi=hi)))
@staticmethod
def sk(rxs):
"sort treatments and rank them"
def divide(lo,hi,b4,rank):
cut = left=right=None
best = 0
for j in range(lo+1,hi):
left0 = Rx.sum( rxs[lo:j] )
right0 = Rx.sum( rxs[j:hi] )
now = left0.xpect(right0, b4)
if now > best:
if left0 != right0:
best, cut,left,right = now,j,kopy(left0),kopy(right0)
if cut:
rank = divide(lo, cut, left, rank) + 1
rank = divide(cut ,hi, right,rank)
else:
for rx in rxs[lo:hi]:
rx.rank = rank
return rank
#-- sk main
rxs=sorted(rxs)
divide(0, len(rxs),Rx.sum(rxs),1)
return rxs
#-------------------------------------------------------
def pairs(lst):
"Return all pairs of items i,i+1 from a list."
last=lst[0]
for i in lst[1:]:
yield last,i
last = i
def words(f):
with open(f) as fp:
for line in fp:
for word in line.split():
yield word
def xtile(lst,lo,hi,
width= THE.tile.width,
chops= THE.tile.chops,
marks= THE.tile.marks,
bar= THE.tile.bar,
star= THE.tile.star,
show= THE.tile.show):
"""The function _xtile_ takes a list of (possibly)
unsorted numbers and presents them as a horizontal
xtile chart (in ascii format). The default is a
contracted _quintile_ that shows the
10,30,50,70,90 breaks in the data (but this can be
changed- see the optional flags of the function).
"""
def pos(p) : return ordered[int(len(lst)*p)]
def place(x) :
return int(width*float((x - lo))/(hi - lo+0.00001))
def pretty(lst) :
return ', '.join([show % x for x in lst])
ordered = sorted(lst)
lo = min(lo,ordered[0])
hi = max(hi,ordered[-1])
what = [pos(p) for p in chops]
where = [place(n) for n in what]
out = [" "] * width
for one,two in pairs(where):
for i in range(one,two):
out[i] = marks[0]
marks = marks[1:]
out[int(width/2)] = bar
out[place(pos(0.5))] = star
return '('+''.join(out) + ")," + pretty(what)
def thing(x):
"Numbers become numbers; every other x is a symbol."
try: return int(x)
except ValueError:
try: return float(x)
except ValueError:
return x
#-------------------------------------------------------
def _cliffsDelta():
"demo function"
lst1=[1,2,3,4,5,6,7]*100
n=1
for _ in range(10):
lst2=[x*n for x in lst1]
print("_cliffsDelta",
cliffsDelta(lst1,lst2),n) # should return False
n*=1.03
def bsTest(n=1000,mu1=10,sigma1=1,mu2=10.2,sigma2=1):
def g(mu,sigma) : return random.gauss(mu,sigma)
x = [g(mu1,sigma1) for i in range(n)]
y = [g(mu2,sigma2) for i in range(n)]
return n,mu1,sigma1,mu2,sigma2,\
'same' if bootstrap(x,y) else 'different'
#-------------------------------------------------------
if __name__ == "__main__":
random.seed(1)
Rx.fileIn("sk1.csv")
print("-"*50)
Rx.fileIn("sk2.csv",lo=0,hi=100)
print("-"*50)
_cliffsDelta()
print("-"*50)
print("bootstrap", bsTest(100, 10, .5, 10, .5) )
print("bootstrap", bsTest(100, 10, 1, 20, 1) )
print("bootstrap", bsTest(100, 10, 10, 10.5, 10) )
print("-"*50)
n=1
l1= [1,2,3,4,5,6,7,8,9,10]*10
for _ in range(10):
l2=[n*x for x in l1]
print('same' if bootstrap(l1,l2) else 'different',n)
n*=1.02
n=1
print("-"*50)
for _ in range(4):
print()
print(n*5)
Rx.show(Rx.sk(skDemo(n)))
n*=5
We can make this file beautiful and searchable if this error is corrected: No commas found in this CSV file in line 0.
x1 0.34 0.49 0.51 0.6
x2 0.6 0.7 0.8 0.9
x3 0.15 0.25 0.4 0.35
x4 0.6 0.7 0.8 0.9
x5 0.1 0.2 0.3 0.4
We can make this file beautiful and searchable if this error is corrected: No commas found in this CSV file in line 0.
apples
1 3 5 2 4 6 5 6 2 2 3 4 5 6 4 5 2 2
4 4 5 5 3 2 3 4 5 3 2 2 4 5 5 4 3 2
oranges
4 5 7 8 9 6 5 4 4 5 6 6 7 8 8 7 6 5
4 4 5 6 7 8 9 8 7 5 4
pears
6 7 8 9 0 9 8 7 6 7 8 9 0 9 8 7 6 7 8
bananas
1 12 13 15 14 13 11 12 13 14 15 14 13 11 12 13 14 15
17 18 10 19 10 19 17 16 15 16 15 17 17 19 10 10 18 17
16 15 15 16 17 17 19 19 17
DEcf.5f
79 86 86 77 65 77 72 73 65 56 60 69 75 70
62 56 69 55 67 99
GA.05p10c2
27 27 26 2 25 26 10 20 31 6 17 25 25 19 25 23 16 5 2 23
GA.01p10c1
37 62 55 56 63 36 34 65 56 52 33 46 41 34 40 50 40 61 35 65
5

Input

x1  0.34  0.49  0.51  0.6
x2  0.6   0.7   0.8   0.9
x3  0.15  0.25  0.4   0.35
x4  0.6   0.7   0.8   0.9
x5  0.1   0.2   0.3   0.4

Output:

   1         x5 (         -----*----      |                        ), 0.100,  0.200,  0.300,  0.300,  0.400
   2         x3 (            -----*-      |                        ), 0.150,  0.250,  0.350,  0.350,  0.400
   3         x1 (                        -*---                     ), 0.340,  0.490,  0.510,  0.510,  0.600
   4         x2 (                         |        -----*----      ), 0.600,  0.700,  0.800,  0.800,  0.900
   4         x4 (                         |        -----*----      ), 0.600,  0.700,  0.800,  0.800,  0.900

Output of python3 sk.py

     1	   1         x5 (      ------*-----       |                        ), 0.100,  0.200,  0.300,  0.300,  0.400
     2	   2         x3 (         ------*--       |                        ), 0.150,  0.250,  0.350,  0.350,  0.400
     3	   3         x1 (                        -*-----                   ), 0.340,  0.490,  0.510,  0.510,  0.600
     4	   4         x2 (                         |           ------*----- ), 0.600,  0.700,  0.800,  0.800,  0.900
     5	   4         x4 (                         |           ------*----- ), 0.600,  0.700,  0.800,  0.800,  0.900
     6	--------------------------------------------------
     7	   1     apples ( -*                      |                        ), 2.000,  3.000,  4.000,  5.000,  5.000
     8	   2    oranges (  -*                     |                        ), 4.000,  5.000,  6.000,  7.000,  8.000
     9	   3      pears (   *                     |                        ), 0.000,  7.000,  7.000,  8.000,  9.000
    10	   4    bananas (      -*-                |                        ), 10.000,  13.000,  15.000,  17.000,  19.000
    11	   5 GA.05p10c2 (        ---*-            |                        ), 5.000,  17.000,  23.000,  25.000,  27.000
    12	   6 GA.01p10c1 (                  -----*-|-----                   ), 34.000,  37.000,  46.000,  56.000,  63.000
    13	   7    DEcf.5f (                         |      ---*-------       ), 56.000,  65.000,  70.000,  77.000,  86.000
    14	--------------------------------------------------
    15	_cliffsDelta True 1
    16	_cliffsDelta True 1.03
    17	_cliffsDelta True 1.0609
    18	_cliffsDelta True 1.092727
    19	_cliffsDelta True 1.1255088100000001
    20	_cliffsDelta True 1.1592740743
    21	_cliffsDelta False 1.1940522965290001
    22	_cliffsDelta False 1.2298738654248702
    23	_cliffsDelta False 1.2667700813876164
    24	_cliffsDelta False 1.304773183829245
    25	--------------------------------------------------
    26	bootstrap (100, 10, 0.5, 10, 0.5, 'same')
    27	bootstrap (100, 10, 1, 20, 1, 'different')
    28	bootstrap (100, 10, 10, 10.5, 10, 'same')
    29	--------------------------------------------------
    30	same 1
    31	same 1.02
    32	same 1.0404
    33	same 1.061208
    34	same 1.08243216
    35	same 1.1040808032
    36	same 1.126162419264
    37	different 1.14868566764928
    38	different 1.1716593810022657
    39	different 1.195092568622311
    40	--------------------------------------------------
    41	
    42	5
    43	   1         x5 (      ------*-----       |                        ), 0.100,  0.200,  0.300,  0.300,  0.400
    44	   1         x3 (        ---------*       |                        ), 0.130,  0.230,  0.380,  0.380,  0.380
    45	   2         x1 (                        -*-----                   ), 0.340,  0.490,  0.510,  0.510,  0.600
    46	   3         x2 (                         |           ------*----- ), 0.600,  0.700,  0.800,  0.800,  0.890
    47	   3         x4 (                         |           ------*----- ), 0.600,  0.700,  0.800,  0.800,  0.900
    48	
    49	25
    50	   1         x5 (      ------*-----       |                        ), 0.100,  0.200,  0.300,  0.300,  0.400
    51	   1         x3 (        ---------*       |                        ), 0.130,  0.230,  0.380,  0.380,  0.380
    52	   2         x1 (                        -*-----                   ), 0.340,  0.490,  0.510,  0.510,  0.600
    53	   3         x2 (                         |           ------*----- ), 0.600,  0.700,  0.800,  0.800,  0.890
    54	   3         x4 (                         |           ------*----- ), 0.600,  0.700,  0.800,  0.800,  0.900
    55	
    56	125
    57	   1         x5 (      ------*-----       |                        ), 0.100,  0.200,  0.300,  0.300,  0.400
    58	   2         x3 (        ---------*       |                        ), 0.130,  0.230,  0.380,  0.380,  0.380
    59	   3         x1 (                        -*-----                   ), 0.340,  0.490,  0.510,  0.510,  0.600
    60	   4         x2 (                         |           ------*----- ), 0.600,  0.700,  0.800,  0.800,  0.890
    61	   4         x4 (                         |           ------*----- ), 0.600,  0.700,  0.800,  0.800,  0.900
    62	
    63	625
    64	   1         x5 (      ------*-----       |                        ), 0.100,  0.200,  0.300,  0.300,  0.400
    65	   2         x3 (        ---------*       |                        ), 0.130,  0.230,  0.380,  0.380,  0.380
    66	   3         x1 (                        -*-----                   ), 0.340,  0.490,  0.510,  0.510,  0.600
    67	   4         x2 (                         |           ------*----- ), 0.600,  0.700,  0.800,  0.800,  0.890
    68	   4         x4 (                         |           ------*----- ), 0.600,  0.700,  0.800,  0.800,  0.900
@timm
Copy link
Author

timm commented Jun 23, 2020

Bug fix : line 247. Should be:

divide(0, len(rxs),Rx.sum(rxs),1)

@timm
Copy link
Author

timm commented Aug 21, 2021

To map the above textual results into latex, use this macro:

\newcommand{\quart}[4]{\begin{picture}(80,4)%1
    {\color{black}\put(#3,2){\circle*{4}}\put(#1,2){\line(1,0){#2}}}\end{picture}}

@ahsan2010
Copy link

Hi Prof. @timm. Thanks for sharing this implementation. I am wondering which non-parametric test did you implement here?
From what I understand, I think you used the "Mann-Whitney test, is it right?

@timm
Copy link
Author

timm commented Nov 5, 2022 via email

@ahsan2010
Copy link

Thank you for pointing me in the right direction. :-)

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment