Skip to content

Instantly share code, notes, and snippets.

Created December 12, 2015 07:01
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save anonymous/b182c1a175171117579b to your computer and use it in GitHub Desktop.
Save anonymous/b182c1a175171117579b to your computer and use it in GitHub Desktop.
bandi2 <- function(y,cap=4,uneven=5,reps=40,cut=0.14){
## Scale the data to mean 0, sd 1 (so we have a common scale) and
## fix outliers, by capping values
## to say +/4
y1 <- pmin(pmax(scale((y)),-cap),cap);
## Compute K-means with 2 centers. Note this can give false
## positivees Give the kmeans function a set of random uniform
## points and it _will_ return two clusters Also since kmeans is
## senstive to centers, we replicate and take medians
m <- do.call(rbind,lapply(1:reps,function(i){
km <- kmeans( y1,centers=2)
clusters <- km$cluster; center <- km$center;diffc <- abs(diff(center))
## To handle the false positive, we need to check for
## separation of clusters. The approach we use is to compute
## the distance of every point to both centers and take the
## ratio of the distance to its cluster center to the sum of
## the distances to the two centers
sepration <- mean(unlist(Map(function(p, cc){
abs( p -center[cc,]) / (sum(abs(p-center)))
}, y1, clusters)),trim=0.05)
## We want mixtures to be not too skewed, one group cannot be
## 'uneven' times larger than another e.g. no worse than 10:90
## split
spl <- prop.table(table(clusters)); r <- max(spl)/min(spl)
if(r> uneven) return(c(sep=sepration,isBandy=0))
## Are the cluster means significantly different?
(t1 <- t.test( y1[ clusters==1], y1[ clusters==2] ,alternative="two.sided",var.equal=TRUE))
## Do the clusters correspond to a level shift? I.e. not
## banded but one group followed by a level shift in the mean
## not overlapping in time. This is like a Wilcoxon test
icdf <- qnorm(seq_along(y)/(length(y)+1))
(t2 <- t.test( icdf[ clusters==1], icdf[clusters==2] ,alternative="two.sided",var.equal=FALSE))
## Thus a test is bandy if the centers are signficantly
## different and there is no level shift and sepration is <
## the cuttof. The default of 12 is based on URL:
## In essence:
## small bandy (<cut) & isBandy == 1 ==> bandiness
## small bandy (<cut) & isBandy == 0 ==> strong level shift(not gradual)
return(c(sep=sepration, isBandy = 1*(t1$p.value <0.01 && t2$p.value >=0.01 && sepration<=cut) ))
}))
apply(m,2,median)
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment