Created
November 11, 2016 06:36
-
-
Save Orbifold/5816c46c599f8dc6f97bcd330e9b756c to your computer and use it in GitHub Desktop.
Noise reduction using chi2. A synthetic proof by example.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
probabilityInsideCluster = 0.5 | |
probabilityOutsideCluster = 0.2 | |
probabilityNoise = 0.3 | |
clusterBoundaries = c(10, 25, 56, 81, 151, 293) | |
clientTypeCount = c(0, 0, 0, 0) | |
library(dplyr) | |
library(rpart) | |
# this create a frame of clients | |
makeClientsFrame = function(attribCount = 1000, | |
clientCount = 1000, | |
typeDistribution = c(0.0, 0.3, 0.3, 0.4)) { | |
# a stochastic cluster variable | |
# if k is null or 0 the noise is uniform | |
stoch = function(k, i) { | |
if (is.null(k) || k == 0) | |
{ | |
return(ifelse(runif(1) < probabilityNoise, 1, 0)) | |
} | |
if (clusterBoundaries[2 * k - 1] < i && | |
i <= clusterBoundaries[2 * k]) { | |
return(ifelse(runif(1) < probabilityInsideCluster, 1, 0)) | |
} | |
return(ifelse(runif(1) < probabilityOutsideCluster, 1, 0)) | |
} | |
# make a single row of values | |
makeClient = function(clus) { | |
if (is.null(clus) || | |
clus == 0) | |
clientTypeCount[1] <<- clientTypeCount[1] + 1 | |
else | |
clientTypeCount[clus] <<- clientTypeCount[clus] + 1 | |
r = sapply(1:attribCount, function(x) { | |
stoch(clus, x) | |
}) | |
r = c(r, clus) # keep the type of client at the end | |
return(r) | |
} | |
# create the initial frame | |
df = data_frame(makeClient(0)) | |
addClient = function(clus) { | |
df <<- data.frame(df, data_frame(makeClient(clus))) | |
} | |
invisible(replicate(clientCount, addClient(sample(0:3, 1, prob = typeDistribution)))) | |
df = data.frame(t(df)) | |
# adding a target column | |
#df = mutate(df, Target = sample(c(1,0), clientCount + 1, replace = TRUE)) | |
colnames(df) = c(paste("A" , 1:attribCount, sep = ""), "Target") | |
return(df) | |
} | |
baseFrame = makeClientsFrame() | |
chi = function(frame, index) { | |
colName = paste("A", index, sep = "") | |
colIndex = as.numeric(which(colnames(frame) == colName)) | |
targetIndex = as.numeric(which(colnames(frame) == "Target")) | |
q = frame[, c(colIndex, targetIndex)] | |
col = as.symbol(colnames(q)[1]) | |
c1 = count_(filter(q, Target == 1), colName)$n | |
c2 = count_(filter(q, Target == 2), colName)$n | |
c3 = count_(filter(q, Target == 3), colName)$n | |
df = data.frame(c1, c2, c3) | |
chisq.test(df)$p.value | |
} | |
# look at the Pearson coefficient for each of the first 500 attribute | |
chis = lapply(1:1000, function(x) { | |
chi(baseFrame, x) | |
}) | |
plot(1:500, chis[1:500], xlab = "Feature #", ylab = "Pearson coefficient") | |
abline(v = clusterBoundaries, col = "red") | |
# extremely small Pearson means high correlation | |
goodchis = lapply(chis, function(x) { | |
if (x < 0.003) | |
x | |
else | |
NA | |
}) | |
# they are at which position? | |
which(!is.na(goodchis)) | |
plot(500:1000, chis[500:1000], xlab = "Feature #", ylab = "Pearson coefficient") | |
#-------------------------------------------------------------------------------- | |
# Randomly distributed signals | |
#-------------------------------------------------------------------------------- | |
options = list( | |
attribCount = 1000, | |
# number of attributes | |
clientCount = 15000, | |
# number of clients | |
foldingAttribCount = 50, | |
# number of significant attribs | |
targetCount = 3, | |
# number of targets or classes | |
percentageUnclassified = 50 # percent of clientCount not classified, i.e. value 0 | |
) | |
makeRandomPartition = function(options) { | |
splitIntervalTo = ceiling((1 - options$percentageUnclassified / 100) * options$clientCount) | |
starts = as.vector(sort(sample( | |
2:(splitIntervalTo - 1), options$targetCount - 1 | |
))) | |
starts = c(1, starts, splitIntervalTo) | |
r = c() | |
for (k in 1:(options$targetCount)) { | |
r = c(r, rep(k, starts[k + 1] - starts[k])) | |
} | |
r = c(r, rep(0, options$clientCount - splitIntervalTo + 1)) | |
return(r) | |
} | |
# creates one attrib vector for all clients | |
makeAttrib = function(target, partition, options) { | |
if (is.null(target) || target == 0) { | |
sample(0:1, options$clientCount, replace = TRUE) | |
} else{ | |
series = lapply(partition, function(x) { | |
if (x == target) { | |
sample(0:1, 1, prob = c(0.3, 0.7)) | |
} | |
else{ | |
sample(0:1, 1) | |
} | |
}) | |
return(as.numeric(series)) | |
} | |
} | |
makeRandomFrame = function(options){ | |
mem = list() # keeps track of attribs contributing to which target | |
significantColumns = sample(1:options$attribCount, options$foldingAttribCount) | |
partition = makeRandomPartition(options) # the target column | |
df = data.frame(matrix(NA, nrow = options$clientCount, ncol = 0)) | |
for(k in 1:options$attribCount){ | |
if(is.element(k, significantColumns)) { | |
whichTarget = sample(1:options$targetCount,1) | |
memName = paste("T", whichTarget, sep="") | |
mem[[memName]] = c(mem[[memName]], k) # col k leads to target whichTarget | |
mem[["All"]] = c(mem[["All"]], k) | |
attrib = makeAttrib(whichTarget, partition, options) | |
df = bind_cols(df, as.data.frame(attrib)) | |
} | |
else{ | |
attrib = makeAttrib(0, partition, options) | |
df = bind_cols(df, as.data.frame(attrib)) | |
} | |
} | |
df = bind_cols(df, as.data.frame(partition)) | |
colnames(df) = c(paste("A" , 1:options$attribCount, sep=""), "Target") | |
result = list(data = df, contrib = mem) | |
return(result) | |
} | |
chi = function(frame, index, options){ | |
colName = paste("A", index, sep = "") | |
colIndex = as.numeric( which(colnames(frame) == colName)) | |
targetIndex = as.numeric( which(colnames(frame) == "Target")) | |
q = frame[,c(colIndex, targetIndex)] | |
df = data.frame(matrix(NA, nrow = 2, ncol = 0)) | |
#for(k in 1:options$targetCount){ | |
for(k in c(0,2)){ | |
count01s = count_(filter(q, Target == k), colName )$n | |
tcolName = paste("T", index, sep = "") | |
appen = as.data.frame(count01s) | |
colnames(appen) = c(tcolName) | |
df = bind_cols(df, appen) | |
} | |
chisq.test(df)$p.value | |
} | |
frame = makeRandomFrame(options) | |
frameData = frame$data | |
contrib = frame$contrib | |
chis = lapply(1:options$attribCount, function(x){chi(frameData, x, options)}) | |
#hoera! | |
plot(1:options$attribCount, chis, ylim=c(0, 0.05)) | |
abline(v= contrib[["All"]], col = "red") |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment