Create a gist now

Instantly share code, notes, and snippets.

Embed
What would you like to do?
import numpy as np
from numpy import log
# we generate two independent "parent" processes, then for each one we independently create thinnings.
# we aim to have a distance measure that correctly clusters the two sets according to their parent.
# by Dan Stowell, copyright Feb 2018
parents = [np.cumsum(np.random.exponential(size=100)) for whichp in range(2)]
thindens = 0.5
children = []
for parent in parents:
children.append([parent[np.nonzero(np.random.rand(len(parent)) < thindens)] for _ in range(10)])
print("parent lengths:")
print([len(par) for par in parents])
print("child lengths:")
print([[len(chi) for chi in chilist] for chilist in children])
childrenconcat = np.concatenate(children)
def thinningdistance(s1, s2):
"Defined as the negative joint log-likelihood of the two sequences, assuming they were both generated by thinning from the same Poisson process realisation. Neglects a constant offset due to the overall likelihood of the union (which means that distances cannot be compared across datasets, but can be optimised or combined triangle-wise, with some care). Also uses a simplifying assumption that the generating process rates are equal to the empirical rates of the observations."
s1 = set(s1)
s2 = set(s2)
if s1 == s2:
return 0 # shortcut
su = s1 | s2
n1 = float(len(s1))
n2 = float(len(s2))
nu = float(len(su))
ll = n1 * log(n1/nu) + n2 * log(n2/nu) + (nu-n1) * log(1 - n1/nu) + (nu-n2) * log(1 - n2/nu)
return -ll
childrenconcat = np.concatenate(children)
dists = [[thinningdistance(a, b) for b in childrenconcat] for a in childrenconcat]
print("Distances:")
for row in dists: print map(int, row)
print("Distances between distinct children of same process:")
somedists = []
for chilist in children:
for whichc, childa in enumerate(chilist):
for childb in chilist[:whichc]:
somedists.append(thinningdistance(childa, childb))
print("Min %.1f, Mean %.1f, Median %.1f, Max %.1f" % (np.min(somedists), np.mean(somedists), np.median(somedists), np.max(somedists), ))
print("Distances between children of different processes:")
somedists = []
for whichc, chilista in enumerate(children):
for chilistb in children[:whichc]:
for childa in chilista:
for childb in chilistb:
somedists.append(thinningdistance(childa, childb))
print("Min %.1f, Mean %.1f, Median %.1f, Max %.1f" % (np.min(somedists), np.mean(somedists), np.median(somedists), np.max(somedists), ))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment