tdunning/td-in-r.r

## td-in-r.r
### x is either a vector of numbers or a data frame with sums and weights. Digest is a data frame.
merge = function(x, digest, compression=100) {
    ## Force the digest to be a data.frame, possibly empty
    if (!is.data.frame(digest) && is.na(digest)) {
        digest = data.frame(sum=c(), weight=c())
    }
    ## and coerce the incoming data likewise ... a vector of points have default weighting of 1
    if (!is.data.frame(x)) {
        x = data.frame(sum=x, weight=1)
    }
    ## glue and sort all of the data
    merged = rbind(x, digest)
    merged = merged[order(merged$sum/merged$weight),]

    ## and now merge into the result data.frame r
    n = sum(merged$weight)
    j = 1                              # index for output
    r = merged[j,]                     # starts with first row of merged data
    q = 0                              # weight before now is nil
    for (i in 2:dim(merged)[1]) {
        ## increment in q is out pending output plus a candidate addition
        dq = (r[j, "weight"] + merged[i,"weight"])/n
        ## check to see if it is acceptable
        k.size = compression * (q.to.k(q + dq) - q.to.k(q))
        if (k.size > 1) {
            ## nope... commit to this output row start the next one
            r = rbind(r, merged[i,])
            q = q + r$weight[j]/n
            j = j+1
        } else {
            ## yep... it fits
            r[j,] = r[j,] + merged[i,]
        }
    }
    return(r)
}

### returns the center of a single centroid
center = function(td, i) {
    td[i, "sum"] / td[i, "weight"]
}

interpolate = function(values, weights) {
    return (sum(values * weights) / sum(weights))
}

### computes the estimated cdf of a point
cdf = function(td, x) {
    if (x < center(td,1)) {
        return (0);
    } else if (x > center(td,dim(td)[1])) {
        return (1);
    } else {
        i = which(td$sum/td$weight > x)[1] - 1
        boundary = interpolate(center(td, i:(i+1)), td$weight[i:(i+1)])
        if (x < boundary) {
            ## on the right side of a centroid
            base = sum(td$weight[1:(i-1)])
            local = td$weight[i]
            ## base + local/2 is total weight at the center of centroid i
            ## base + local is the weight on the right side of that same centroid
            return (interpolate(c(base + local/2, base + local)/sum(td$weight), c(boundary-x, x-center(td,i))))
        } else {
            ## on the left side
            base = sum(td$weight[1:i])
            local = td$weight[i+1]
            ## base is the center, base - local/2 is the left edge
            return (interpolate(c(base - local/2, base)/sum(td$weight), c(center(td,i+1)-x, x-boundary)))
        }
    }
}

### returns the distribution as estimated by the centroids
dist = function(td) {
    n = dim(td)[1]
    ## each centroid is assumed to be split evenly at its center
    d = data.frame(x = td$sum / td$weight, cdf = (cumsum(td$weight) - td$weight/2)/sum(td$weight))
    return(d)
}

### Converts q to index function
q.to.k = function(q) {
    asin(2*min(q,1)-1)/pi + 0.5
}


M = 200
N = 50

### KS distribution 50%-ile and 95%-iles
### these approximations are accurate to 2-3 significant digits
### relative to values computed by
### Richard Simard's program.
### http://www.iro.umontreal.ca/~simardr/ksdir/KolmogorovSmirnovDist.java
zx = data.frame(n=(1:100), p50=0, p95=0)
for (i in 1:100) {
    zx$p50[i] = 0.02599203/sqrt(i*M)*sqrt(M)
    zx$p95[i] = 0.0427621/sqrt(i*M)*sqrt(M)
}


### plot the evolution of a t-digest over a bunch of additions of a thousand points
ks.summary = data.frame(i=c(), ks=c(), k=c(), ks1=c(), ks2=c())
for (k in 1:10) {
    xx = seq(0,1,by=0.01)
    yy = 4 * sqrt(xx * (1-xx))
    td.x = NA
    ks = rep(0, N+1)
    layout(matrix(c(1,2), nrow=1))
    ks.details = data.frame(i=rep(0,N), ks1=0, ks2=0)
    data.set = runif(N * M)
    for (i in 1:N) {
        batch = data.set[(i-1)*M + 1:M]
        td.x = merge(batch, td.x)
        plot(td.x$sum / td.x$weight, td.x$weight, type='b', cex=0.7, main=i, xlab='q', ylab='weight')
        lines(xx,sum(td.x$sum) * yy / 100 * pi/2)
        dx = dist(td.x)
        ks[i] = max(abs(dx$cdf - dx$x))
        ks.details$ks1[i] = ks[i]
        ks.details$ks2[i] = ks.test(data.set[1:(i*M)], punif)$statistic
        plot(ks, type='l', xlab="Step", main=k)
        print(c(k, dim(td.x)[1]))
        ## Mark roughly the median of the KS distribution
        lines(p50 ~ n, zx, col='blue')
        lines(p95 ~ n, zx, col='lightblue', lty=1)
    }
    ks.summary = rbind(ks.summary, data.frame(i=1:N, ks=ks[1:N], k=k, ks1=ks.details$ks1, ks2=ks.details$ks2))
}

pdf('ks.evolution.pdf', width=6, height=5)
boxplot(ks ~ i, ks.summary)
lines(p50 ~ n, zx, col='blue')
lines(p95 ~ n, zx, col='lightblue', lty=1)
dev.off()
	### x is either a vector of numbers or a data frame with sums and weights. Digest is a data frame.
	merge = function(x, digest, compression=100) {
	## Force the digest to be a data.frame, possibly empty
	if (!is.data.frame(digest) && is.na(digest)) {
	digest = data.frame(sum=c(), weight=c())
	}
	## and coerce the incoming data likewise ... a vector of points have default weighting of 1
	if (!is.data.frame(x)) {
	x = data.frame(sum=x, weight=1)
	}
	## glue and sort all of the data
	merged = rbind(x, digest)
	merged = merged[order(merged$sum/merged$weight),]

	## and now merge into the result data.frame r
	n = sum(merged$weight)
	j = 1 # index for output
	r = merged[j,] # starts with first row of merged data
	q = 0 # weight before now is nil
	for (i in 2:dim(merged)[1]) {
	## increment in q is out pending output plus a candidate addition
	dq = (r[j, "weight"] + merged[i,"weight"])/n
	## check to see if it is acceptable
	k.size = compression * (q.to.k(q + dq) - q.to.k(q))
	if (k.size > 1) {
	## nope... commit to this output row start the next one
	r = rbind(r, merged[i,])
	q = q + r$weight[j]/n
	j = j+1
	} else {
	## yep... it fits
	r[j,] = r[j,] + merged[i,]
	}
	}
	return(r)
	}

	### returns the center of a single centroid
	center = function(td, i) {
	td[i, "sum"] / td[i, "weight"]
	}

	interpolate = function(values, weights) {
	return (sum(values * weights) / sum(weights))
	}

	### computes the estimated cdf of a point
	cdf = function(td, x) {
	if (x < center(td,1)) {
	return (0);
	} else if (x > center(td,dim(td)[1])) {
	return (1);
	} else {
	i = which(td$sum/td$weight > x)[1] - 1
	boundary = interpolate(center(td, i:(i+1)), td$weight[i:(i+1)])
	if (x < boundary) {
	## on the right side of a centroid
	base = sum(td$weight[1:(i-1)])
	local = td$weight[i]
	## base + local/2 is total weight at the center of centroid i
	## base + local is the weight on the right side of that same centroid
	return (interpolate(c(base + local/2, base + local)/sum(td$weight), c(boundary-x, x-center(td,i))))
	} else {
	## on the left side
	base = sum(td$weight[1:i])
	local = td$weight[i+1]
	## base is the center, base - local/2 is the left edge
	return (interpolate(c(base - local/2, base)/sum(td$weight), c(center(td,i+1)-x, x-boundary)))
	}
	}
	}

	### returns the distribution as estimated by the centroids
	dist = function(td) {
	n = dim(td)[1]
	## each centroid is assumed to be split evenly at its center
	d = data.frame(x = td$sum / td$weight, cdf = (cumsum(td$weight) - td$weight/2)/sum(td$weight))
	return(d)
	}

	### Converts q to index function
	q.to.k = function(q) {
	asin(2*min(q,1)-1)/pi + 0.5
	}


	M = 200
	N = 50

	### KS distribution 50%-ile and 95%-iles
	### these approximations are accurate to 2-3 significant digits
	### relative to values computed by
	### Richard Simard's program.
	### http://www.iro.umontreal.ca/~simardr/ksdir/KolmogorovSmirnovDist.java
	zx = data.frame(n=(1:100), p50=0, p95=0)
	for (i in 1:100) {
	zx$p50[i] = 0.02599203/sqrt(iM)sqrt(M)
	zx$p95[i] = 0.0427621/sqrt(iM)sqrt(M)
	}


	### plot the evolution of a t-digest over a bunch of additions of a thousand points
	ks.summary = data.frame(i=c(), ks=c(), k=c(), ks1=c(), ks2=c())
	for (k in 1:10) {
	xx = seq(0,1,by=0.01)
	yy = 4 * sqrt(xx * (1-xx))
	td.x = NA
	ks = rep(0, N+1)
	layout(matrix(c(1,2), nrow=1))
	ks.details = data.frame(i=rep(0,N), ks1=0, ks2=0)
	data.set = runif(N * M)
	for (i in 1:N) {
	batch = data.set[(i-1)*M + 1:M]
	td.x = merge(batch, td.x)
	plot(td.x$sum / td.x$weight, td.x$weight, type='b', cex=0.7, main=i, xlab='q', ylab='weight')
	lines(xx,sum(td.x$sum) * yy / 100 * pi/2)
	dx = dist(td.x)
	ks[i] = max(abs(dx$cdf - dx$x))
	ks.details$ks1[i] = ks[i]
	ks.details$ks2[i] = ks.test(data.set[1:(i*M)], punif)$statistic
	plot(ks, type='l', xlab="Step", main=k)
	print(c(k, dim(td.x)[1]))
	## Mark roughly the median of the KS distribution
	lines(p50 ~ n, zx, col='blue')
	lines(p95 ~ n, zx, col='lightblue', lty=1)
	}
	ks.summary = rbind(ks.summary, data.frame(i=1:N, ks=ks[1:N], k=k, ks1=ks.details$ks1, ks2=ks.details$ks2))
	}

	pdf('ks.evolution.pdf', width=6, height=5)
	boxplot(ks ~ i, ks.summary)
	lines(p50 ~ n, zx, col='blue')
	lines(p95 ~ n, zx, col='lightblue', lty=1)
	dev.off()