Skip to content

Instantly share code, notes, and snippets.

@tdunning
Last active November 26, 2018 20:05
Show Gist options
  • Star 2 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save tdunning/26ce5f109cd0ea044caaf3ebfd21eb99 to your computer and use it in GitHub Desktop.
Save tdunning/26ce5f109cd0ea044caaf3ebfd21eb99 to your computer and use it in GitHub Desktop.
A simplified implementation of a merging t-digest in R with some visualization of the results
### x is either a vector of numbers or a data frame with sums and weights. Digest is a data frame.
merge = function(x, digest, compression=100) {
## Force the digest to be a data.frame, possibly empty
if (!is.data.frame(digest) && is.na(digest)) {
digest = data.frame(sum=c(), weight=c())
}
## and coerce the incoming data likewise ... a vector of points have default weighting of 1
if (!is.data.frame(x)) {
x = data.frame(sum=x, weight=1)
}
## glue and sort all of the data
merged = rbind(x, digest)
merged = merged[order(merged$sum/merged$weight),]
## and now merge into the result data.frame r
n = sum(merged$weight)
j = 1 # index for output
r = merged[j,] # starts with first row of merged data
q = 0 # weight before now is nil
for (i in 2:dim(merged)[1]) {
## increment in q is out pending output plus a candidate addition
dq = (r[j, "weight"] + merged[i,"weight"])/n
## check to see if it is acceptable
k.size = compression * (q.to.k(q + dq) - q.to.k(q))
if (k.size > 1) {
## nope... commit to this output row start the next one
r = rbind(r, merged[i,])
q = q + r$weight[j]/n
j = j+1
} else {
## yep... it fits
r[j,] = r[j,] + merged[i,]
}
}
return(r)
}
### returns the center of a single centroid
center = function(td, i) {
td[i, "sum"] / td[i, "weight"]
}
interpolate = function(values, weights) {
return (sum(values * weights) / sum(weights))
}
### computes the estimated cdf of a point
cdf = function(td, x) {
if (x < center(td,1)) {
return (0);
} else if (x > center(td,dim(td)[1])) {
return (1);
} else {
i = which(td$sum/td$weight > x)[1] - 1
boundary = interpolate(center(td, i:(i+1)), td$weight[i:(i+1)])
if (x < boundary) {
## on the right side of a centroid
base = sum(td$weight[1:(i-1)])
local = td$weight[i]
## base + local/2 is total weight at the center of centroid i
## base + local is the weight on the right side of that same centroid
return (interpolate(c(base + local/2, base + local)/sum(td$weight), c(boundary-x, x-center(td,i))))
} else {
## on the left side
base = sum(td$weight[1:i])
local = td$weight[i+1]
## base is the center, base - local/2 is the left edge
return (interpolate(c(base - local/2, base)/sum(td$weight), c(center(td,i+1)-x, x-boundary)))
}
}
}
### returns the distribution as estimated by the centroids
dist = function(td) {
n = dim(td)[1]
## each centroid is assumed to be split evenly at its center
d = data.frame(x = td$sum / td$weight, cdf = (cumsum(td$weight) - td$weight/2)/sum(td$weight))
return(d)
}
### Converts q to index function
q.to.k = function(q) {
asin(2*min(q,1)-1)/pi + 0.5
}
M = 200
N = 50
### KS distribution 50%-ile and 95%-iles
### these approximations are accurate to 2-3 significant digits
### relative to values computed by
### Richard Simard's program.
### http://www.iro.umontreal.ca/~simardr/ksdir/KolmogorovSmirnovDist.java
zx = data.frame(n=(1:100), p50=0, p95=0)
for (i in 1:100) {
zx$p50[i] = 0.02599203/sqrt(i*M)*sqrt(M)
zx$p95[i] = 0.0427621/sqrt(i*M)*sqrt(M)
}
### plot the evolution of a t-digest over a bunch of additions of a thousand points
ks.summary = data.frame(i=c(), ks=c(), k=c(), ks1=c(), ks2=c())
for (k in 1:10) {
xx = seq(0,1,by=0.01)
yy = 4 * sqrt(xx * (1-xx))
td.x = NA
ks = rep(0, N+1)
layout(matrix(c(1,2), nrow=1))
ks.details = data.frame(i=rep(0,N), ks1=0, ks2=0)
data.set = runif(N * M)
for (i in 1:N) {
batch = data.set[(i-1)*M + 1:M]
td.x = merge(batch, td.x)
plot(td.x$sum / td.x$weight, td.x$weight, type='b', cex=0.7, main=i, xlab='q', ylab='weight')
lines(xx,sum(td.x$sum) * yy / 100 * pi/2)
dx = dist(td.x)
ks[i] = max(abs(dx$cdf - dx$x))
ks.details$ks1[i] = ks[i]
ks.details$ks2[i] = ks.test(data.set[1:(i*M)], punif)$statistic
plot(ks, type='l', xlab="Step", main=k)
print(c(k, dim(td.x)[1]))
## Mark roughly the median of the KS distribution
lines(p50 ~ n, zx, col='blue')
lines(p95 ~ n, zx, col='lightblue', lty=1)
}
ks.summary = rbind(ks.summary, data.frame(i=1:N, ks=ks[1:N], k=k, ks1=ks.details$ks1, ks2=ks.details$ks2))
}
pdf('ks.evolution.pdf', width=6, height=5)
boxplot(ks ~ i, ks.summary)
lines(p50 ~ n, zx, col='blue')
lines(p95 ~ n, zx, col='lightblue', lty=1)
dev.off()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment