Created
December 12, 2015 07:01
-
-
Save anonymous/b182c1a175171117579b to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
bandi2 <- function(y,cap=4,uneven=5,reps=40,cut=0.14){ | |
## Scale the data to mean 0, sd 1 (so we have a common scale) and | |
## fix outliers, by capping values | |
## to say +/4 | |
y1 <- pmin(pmax(scale((y)),-cap),cap); | |
## Compute K-means with 2 centers. Note this can give false | |
## positivees Give the kmeans function a set of random uniform | |
## points and it _will_ return two clusters Also since kmeans is | |
## senstive to centers, we replicate and take medians | |
m <- do.call(rbind,lapply(1:reps,function(i){ | |
km <- kmeans( y1,centers=2) | |
clusters <- km$cluster; center <- km$center;diffc <- abs(diff(center)) | |
## To handle the false positive, we need to check for | |
## separation of clusters. The approach we use is to compute | |
## the distance of every point to both centers and take the | |
## ratio of the distance to its cluster center to the sum of | |
## the distances to the two centers | |
sepration <- mean(unlist(Map(function(p, cc){ | |
abs( p -center[cc,]) / (sum(abs(p-center))) | |
}, y1, clusters)),trim=0.05) | |
## We want mixtures to be not too skewed, one group cannot be | |
## 'uneven' times larger than another e.g. no worse than 10:90 | |
## split | |
spl <- prop.table(table(clusters)); r <- max(spl)/min(spl) | |
if(r> uneven) return(c(sep=sepration,isBandy=0)) | |
## Are the cluster means significantly different? | |
(t1 <- t.test( y1[ clusters==1], y1[ clusters==2] ,alternative="two.sided",var.equal=TRUE)) | |
## Do the clusters correspond to a level shift? I.e. not | |
## banded but one group followed by a level shift in the mean | |
## not overlapping in time. This is like a Wilcoxon test | |
icdf <- qnorm(seq_along(y)/(length(y)+1)) | |
(t2 <- t.test( icdf[ clusters==1], icdf[clusters==2] ,alternative="two.sided",var.equal=FALSE)) | |
## Thus a test is bandy if the centers are signficantly | |
## different and there is no level shift and sepration is < | |
## the cuttof. The default of 12 is based on URL: | |
## In essence: | |
## small bandy (<cut) & isBandy == 1 ==> bandiness | |
## small bandy (<cut) & isBandy == 0 ==> strong level shift(not gradual) | |
return(c(sep=sepration, isBandy = 1*(t1$p.value <0.01 && t2$p.value >=0.01 && sepration<=cut) )) | |
})) | |
apply(m,2,median) | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment