Skip to content

Instantly share code, notes, and snippets.

@Orbifold
Created November 11, 2016 06:36
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save Orbifold/5816c46c599f8dc6f97bcd330e9b756c to your computer and use it in GitHub Desktop.
Save Orbifold/5816c46c599f8dc6f97bcd330e9b756c to your computer and use it in GitHub Desktop.
Noise reduction using chi2. A synthetic proof by example.
probabilityInsideCluster = 0.5
probabilityOutsideCluster = 0.2
probabilityNoise = 0.3
clusterBoundaries = c(10, 25, 56, 81, 151, 293)
clientTypeCount = c(0, 0, 0, 0)
library(dplyr)
library(rpart)
# this create a frame of clients
makeClientsFrame = function(attribCount = 1000,
clientCount = 1000,
typeDistribution = c(0.0, 0.3, 0.3, 0.4)) {
# a stochastic cluster variable
# if k is null or 0 the noise is uniform
stoch = function(k, i) {
if (is.null(k) || k == 0)
{
return(ifelse(runif(1) < probabilityNoise, 1, 0))
}
if (clusterBoundaries[2 * k - 1] < i &&
i <= clusterBoundaries[2 * k]) {
return(ifelse(runif(1) < probabilityInsideCluster, 1, 0))
}
return(ifelse(runif(1) < probabilityOutsideCluster, 1, 0))
}
# make a single row of values
makeClient = function(clus) {
if (is.null(clus) ||
clus == 0)
clientTypeCount[1] <<- clientTypeCount[1] + 1
else
clientTypeCount[clus] <<- clientTypeCount[clus] + 1
r = sapply(1:attribCount, function(x) {
stoch(clus, x)
})
r = c(r, clus) # keep the type of client at the end
return(r)
}
# create the initial frame
df = data_frame(makeClient(0))
addClient = function(clus) {
df <<- data.frame(df, data_frame(makeClient(clus)))
}
invisible(replicate(clientCount, addClient(sample(0:3, 1, prob = typeDistribution))))
df = data.frame(t(df))
# adding a target column
#df = mutate(df, Target = sample(c(1,0), clientCount + 1, replace = TRUE))
colnames(df) = c(paste("A" , 1:attribCount, sep = ""), "Target")
return(df)
}
baseFrame = makeClientsFrame()
chi = function(frame, index) {
colName = paste("A", index, sep = "")
colIndex = as.numeric(which(colnames(frame) == colName))
targetIndex = as.numeric(which(colnames(frame) == "Target"))
q = frame[, c(colIndex, targetIndex)]
col = as.symbol(colnames(q)[1])
c1 = count_(filter(q, Target == 1), colName)$n
c2 = count_(filter(q, Target == 2), colName)$n
c3 = count_(filter(q, Target == 3), colName)$n
df = data.frame(c1, c2, c3)
chisq.test(df)$p.value
}
# look at the Pearson coefficient for each of the first 500 attribute
chis = lapply(1:1000, function(x) {
chi(baseFrame, x)
})
plot(1:500, chis[1:500], xlab = "Feature #", ylab = "Pearson coefficient")
abline(v = clusterBoundaries, col = "red")
# extremely small Pearson means high correlation
goodchis = lapply(chis, function(x) {
if (x < 0.003)
x
else
NA
})
# they are at which position?
which(!is.na(goodchis))
plot(500:1000, chis[500:1000], xlab = "Feature #", ylab = "Pearson coefficient")
#--------------------------------------------------------------------------------
# Randomly distributed signals
#--------------------------------------------------------------------------------
options = list(
attribCount = 1000,
# number of attributes
clientCount = 15000,
# number of clients
foldingAttribCount = 50,
# number of significant attribs
targetCount = 3,
# number of targets or classes
percentageUnclassified = 50 # percent of clientCount not classified, i.e. value 0
)
makeRandomPartition = function(options) {
splitIntervalTo = ceiling((1 - options$percentageUnclassified / 100) * options$clientCount)
starts = as.vector(sort(sample(
2:(splitIntervalTo - 1), options$targetCount - 1
)))
starts = c(1, starts, splitIntervalTo)
r = c()
for (k in 1:(options$targetCount)) {
r = c(r, rep(k, starts[k + 1] - starts[k]))
}
r = c(r, rep(0, options$clientCount - splitIntervalTo + 1))
return(r)
}
# creates one attrib vector for all clients
makeAttrib = function(target, partition, options) {
if (is.null(target) || target == 0) {
sample(0:1, options$clientCount, replace = TRUE)
} else{
series = lapply(partition, function(x) {
if (x == target) {
sample(0:1, 1, prob = c(0.3, 0.7))
}
else{
sample(0:1, 1)
}
})
return(as.numeric(series))
}
}
makeRandomFrame = function(options){
mem = list() # keeps track of attribs contributing to which target
significantColumns = sample(1:options$attribCount, options$foldingAttribCount)
partition = makeRandomPartition(options) # the target column
df = data.frame(matrix(NA, nrow = options$clientCount, ncol = 0))
for(k in 1:options$attribCount){
if(is.element(k, significantColumns)) {
whichTarget = sample(1:options$targetCount,1)
memName = paste("T", whichTarget, sep="")
mem[[memName]] = c(mem[[memName]], k) # col k leads to target whichTarget
mem[["All"]] = c(mem[["All"]], k)
attrib = makeAttrib(whichTarget, partition, options)
df = bind_cols(df, as.data.frame(attrib))
}
else{
attrib = makeAttrib(0, partition, options)
df = bind_cols(df, as.data.frame(attrib))
}
}
df = bind_cols(df, as.data.frame(partition))
colnames(df) = c(paste("A" , 1:options$attribCount, sep=""), "Target")
result = list(data = df, contrib = mem)
return(result)
}
chi = function(frame, index, options){
colName = paste("A", index, sep = "")
colIndex = as.numeric( which(colnames(frame) == colName))
targetIndex = as.numeric( which(colnames(frame) == "Target"))
q = frame[,c(colIndex, targetIndex)]
df = data.frame(matrix(NA, nrow = 2, ncol = 0))
#for(k in 1:options$targetCount){
for(k in c(0,2)){
count01s = count_(filter(q, Target == k), colName )$n
tcolName = paste("T", index, sep = "")
appen = as.data.frame(count01s)
colnames(appen) = c(tcolName)
df = bind_cols(df, appen)
}
chisq.test(df)$p.value
}
frame = makeRandomFrame(options)
frameData = frame$data
contrib = frame$contrib
chis = lapply(1:options$attribCount, function(x){chi(frameData, x, options)})
#hoera!
plot(1:options$attribCount, chis, ylim=c(0, 0.05))
abline(v= contrib[["All"]], col = "red")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment