Orbifold/Chi2NoiseReduction.R

## Chi2NoiseReduction.R
probabilityInsideCluster = 0.5
probabilityOutsideCluster = 0.2
probabilityNoise = 0.3
clusterBoundaries = c(10, 25, 56, 81, 151, 293)
clientTypeCount = c(0, 0, 0, 0)
library(dplyr)
library(rpart)
# this create a frame of clients
makeClientsFrame = function(attribCount = 1000,
                            clientCount = 1000,
                            typeDistribution = c(0.0, 0.3, 0.3, 0.4)) {
  # a stochastic cluster variable
  # if k is null or 0 the noise is uniform
  stoch = function(k, i) {
    if (is.null(k) || k == 0)
    {
      return(ifelse(runif(1) < probabilityNoise, 1, 0))
    }
    if (clusterBoundaries[2 * k - 1] < i &&
        i <= clusterBoundaries[2 * k]) {
      return(ifelse(runif(1) < probabilityInsideCluster, 1, 0))
    }
    return(ifelse(runif(1) < probabilityOutsideCluster, 1, 0))
  }
  # make a single row of values
  makeClient = function(clus) {
    if (is.null(clus) ||
        clus == 0)
      clientTypeCount[1] <<- clientTypeCount[1] + 1
    else
      clientTypeCount[clus] <<- clientTypeCount[clus] + 1
    r = sapply(1:attribCount, function(x) {
      stoch(clus, x)
    })
    r = c(r, clus) # keep the type of client at the end
    return(r)
  }

  # create the initial frame
  df = data_frame(makeClient(0))
  addClient = function(clus) {
    df <<- data.frame(df, data_frame(makeClient(clus)))
  }

  invisible(replicate(clientCount, addClient(sample(0:3, 1, prob = typeDistribution))))
  df = data.frame(t(df))
  # adding a target column
  #df = mutate(df, Target = sample(c(1,0), clientCount + 1, replace = TRUE))
  colnames(df) = c(paste("A" , 1:attribCount, sep = ""), "Target")
  return(df)
}
baseFrame = makeClientsFrame()

chi = function(frame, index) {
  colName = paste("A", index, sep = "")
  colIndex = as.numeric(which(colnames(frame) == colName))
  targetIndex = as.numeric(which(colnames(frame) == "Target"))
  q = frame[, c(colIndex, targetIndex)]
  col = as.symbol(colnames(q)[1])
  c1 = count_(filter(q, Target == 1), colName)$n
  c2 = count_(filter(q, Target == 2), colName)$n
  c3 = count_(filter(q, Target == 3), colName)$n
  df = data.frame(c1, c2, c3)
  chisq.test(df)$p.value
}
# look at the Pearson coefficient for each of the first 500 attribute
chis = lapply(1:1000, function(x) {
  chi(baseFrame, x)
})
plot(1:500, chis[1:500], xlab = "Feature #", ylab = "Pearson coefficient")
abline(v = clusterBoundaries, col = "red")

# extremely small Pearson means high correlation
goodchis = lapply(chis, function(x) {
  if (x < 0.003)
    x
  else
    NA
})
# they are at which position?
which(!is.na(goodchis))

plot(500:1000, chis[500:1000], xlab = "Feature #", ylab = "Pearson coefficient")
#--------------------------------------------------------------------------------
# Randomly distributed signals
#--------------------------------------------------------------------------------
options = list(
  attribCount = 1000,
  # number of attributes
  clientCount = 15000,
  # number of clients
  foldingAttribCount = 50,
  # number of significant attribs
  targetCount = 3,
  # number of targets or classes
  percentageUnclassified = 50 # percent of clientCount not classified, i.e. value 0
)

makeRandomPartition = function(options) {
  splitIntervalTo = ceiling((1 - options$percentageUnclassified / 100) * options$clientCount)
  starts = as.vector(sort(sample(
    2:(splitIntervalTo - 1), options$targetCount - 1
  )))
  starts = c(1, starts, splitIntervalTo)
  r = c()
  for (k in 1:(options$targetCount)) {
    r = c(r, rep(k, starts[k + 1] - starts[k]))
  }
  r = c(r, rep(0, options$clientCount - splitIntervalTo + 1))
  return(r)
}
# creates one attrib vector for all clients
makeAttrib = function(target, partition, options) {
  if (is.null(target) || target == 0) {
    sample(0:1, options$clientCount, replace = TRUE)
  } else{
    series = lapply(partition, function(x) {
      if (x == target) {
        sample(0:1, 1, prob = c(0.3, 0.7))
      }
      else{
        sample(0:1, 1)
      }
    })
    return(as.numeric(series))
  }
}

makeRandomFrame = function(options){
  mem = list() # keeps track of attribs contributing to which target
  significantColumns = sample(1:options$attribCount, options$foldingAttribCount)
  partition = makeRandomPartition(options) # the target column
  df = data.frame(matrix(NA, nrow = options$clientCount, ncol = 0))
  for(k in 1:options$attribCount){
    if(is.element(k, significantColumns)) {
      whichTarget = sample(1:options$targetCount,1)
      memName = paste("T", whichTarget, sep="")
      mem[[memName]] = c(mem[[memName]], k) # col k leads to target whichTarget
      mem[["All"]] = c(mem[["All"]], k)
      attrib = makeAttrib(whichTarget, partition, options)
      df = bind_cols(df, as.data.frame(attrib))
    }
    else{
      attrib = makeAttrib(0, partition, options)
      df = bind_cols(df, as.data.frame(attrib))
    }
  }
  df = bind_cols(df, as.data.frame(partition))
  colnames(df) = c(paste("A" , 1:options$attribCount, sep=""), "Target")
  result = list(data = df, contrib = mem)
  return(result)
}


chi = function(frame, index, options){
  colName = paste("A", index, sep = "")
  colIndex = as.numeric( which(colnames(frame) == colName))
  targetIndex = as.numeric( which(colnames(frame) == "Target"))
  q = frame[,c(colIndex, targetIndex)]
  df = data.frame(matrix(NA, nrow = 2, ncol = 0))
  #for(k in 1:options$targetCount){
  for(k in c(0,2)){
    count01s = count_(filter(q, Target == k), colName )$n
    tcolName = paste("T", index, sep = "")
    appen = as.data.frame(count01s)
    colnames(appen) = c(tcolName)
    df = bind_cols(df, appen)
  }
  chisq.test(df)$p.value
}


frame = makeRandomFrame(options)
frameData = frame$data
contrib = frame$contrib
chis = lapply(1:options$attribCount, function(x){chi(frameData, x, options)})
#hoera!
plot(1:options$attribCount, chis, ylim=c(0, 0.05))
abline(v= contrib[["All"]], col = "red")
	probabilityInsideCluster = 0.5
	probabilityOutsideCluster = 0.2
	probabilityNoise = 0.3
	clusterBoundaries = c(10, 25, 56, 81, 151, 293)
	clientTypeCount = c(0, 0, 0, 0)
	library(dplyr)
	library(rpart)
	# this create a frame of clients
	makeClientsFrame = function(attribCount = 1000,
	clientCount = 1000,
	typeDistribution = c(0.0, 0.3, 0.3, 0.4)) {
	# a stochastic cluster variable
	# if k is null or 0 the noise is uniform
	stoch = function(k, i) {
	if (is.null(k) \|\| k == 0)
	{
	return(ifelse(runif(1) < probabilityNoise, 1, 0))
	}
	if (clusterBoundaries[2 * k - 1] < i &&
	i <= clusterBoundaries[2 * k]) {
	return(ifelse(runif(1) < probabilityInsideCluster, 1, 0))
	}
	return(ifelse(runif(1) < probabilityOutsideCluster, 1, 0))
	}
	# make a single row of values
	makeClient = function(clus) {
	if (is.null(clus) \|\|
	clus == 0)
	clientTypeCount[1] <<- clientTypeCount[1] + 1
	else
	clientTypeCount[clus] <<- clientTypeCount[clus] + 1
	r = sapply(1:attribCount, function(x) {
	stoch(clus, x)
	})
	r = c(r, clus) # keep the type of client at the end
	return(r)
	}

	# create the initial frame
	df = data_frame(makeClient(0))
	addClient = function(clus) {
	df <<- data.frame(df, data_frame(makeClient(clus)))
	}

	invisible(replicate(clientCount, addClient(sample(0:3, 1, prob = typeDistribution))))
	df = data.frame(t(df))
	# adding a target column
	#df = mutate(df, Target = sample(c(1,0), clientCount + 1, replace = TRUE))
	colnames(df) = c(paste("A" , 1:attribCount, sep = ""), "Target")
	return(df)
	}
	baseFrame = makeClientsFrame()

	chi = function(frame, index) {
	colName = paste("A", index, sep = "")
	colIndex = as.numeric(which(colnames(frame) == colName))
	targetIndex = as.numeric(which(colnames(frame) == "Target"))
	q = frame[, c(colIndex, targetIndex)]
	col = as.symbol(colnames(q)[1])
	c1 = count_(filter(q, Target == 1), colName)$n
	c2 = count_(filter(q, Target == 2), colName)$n
	c3 = count_(filter(q, Target == 3), colName)$n
	df = data.frame(c1, c2, c3)
	chisq.test(df)$p.value
	}
	# look at the Pearson coefficient for each of the first 500 attribute
	chis = lapply(1:1000, function(x) {
	chi(baseFrame, x)
	})
	plot(1:500, chis[1:500], xlab = "Feature #", ylab = "Pearson coefficient")
	abline(v = clusterBoundaries, col = "red")

	# extremely small Pearson means high correlation
	goodchis = lapply(chis, function(x) {
	if (x < 0.003)
	x
	else
	NA
	})
	# they are at which position?
	which(!is.na(goodchis))

	plot(500:1000, chis[500:1000], xlab = "Feature #", ylab = "Pearson coefficient")
	#--------------------------------------------------------------------------------
	# Randomly distributed signals
	#--------------------------------------------------------------------------------
	options = list(
	attribCount = 1000,
	# number of attributes
	clientCount = 15000,
	# number of clients
	foldingAttribCount = 50,
	# number of significant attribs
	targetCount = 3,
	# number of targets or classes
	percentageUnclassified = 50 # percent of clientCount not classified, i.e. value 0
	)

	makeRandomPartition = function(options) {
	splitIntervalTo = ceiling((1 - options$percentageUnclassified / 100) * options$clientCount)
	starts = as.vector(sort(sample(
	2:(splitIntervalTo - 1), options$targetCount - 1
	)))
	starts = c(1, starts, splitIntervalTo)
	r = c()
	for (k in 1:(options$targetCount)) {
	r = c(r, rep(k, starts[k + 1] - starts[k]))
	}
	r = c(r, rep(0, options$clientCount - splitIntervalTo + 1))
	return(r)
	}
	# creates one attrib vector for all clients
	makeAttrib = function(target, partition, options) {
	if (is.null(target) \|\| target == 0) {
	sample(0:1, options$clientCount, replace = TRUE)
	} else{
	series = lapply(partition, function(x) {
	if (x == target) {
	sample(0:1, 1, prob = c(0.3, 0.7))
	}
	else{
	sample(0:1, 1)
	}
	})
	return(as.numeric(series))
	}
	}

	makeRandomFrame = function(options){
	mem = list() # keeps track of attribs contributing to which target
	significantColumns = sample(1:options$attribCount, options$foldingAttribCount)
	partition = makeRandomPartition(options) # the target column
	df = data.frame(matrix(NA, nrow = options$clientCount, ncol = 0))
	for(k in 1:options$attribCount){
	if(is.element(k, significantColumns)) {
	whichTarget = sample(1:options$targetCount,1)
	memName = paste("T", whichTarget, sep="")
	mem[[memName]] = c(mem[[memName]], k) # col k leads to target whichTarget
	mem[["All"]] = c(mem[["All"]], k)
	attrib = makeAttrib(whichTarget, partition, options)
	df = bind_cols(df, as.data.frame(attrib))
	}
	else{
	attrib = makeAttrib(0, partition, options)
	df = bind_cols(df, as.data.frame(attrib))
	}
	}
	df = bind_cols(df, as.data.frame(partition))
	colnames(df) = c(paste("A" , 1:options$attribCount, sep=""), "Target")
	result = list(data = df, contrib = mem)
	return(result)
	}


	chi = function(frame, index, options){
	colName = paste("A", index, sep = "")
	colIndex = as.numeric( which(colnames(frame) == colName))
	targetIndex = as.numeric( which(colnames(frame) == "Target"))
	q = frame[,c(colIndex, targetIndex)]
	df = data.frame(matrix(NA, nrow = 2, ncol = 0))
	#for(k in 1:options$targetCount){
	for(k in c(0,2)){
	count01s = count_(filter(q, Target == k), colName )$n
	tcolName = paste("T", index, sep = "")
	appen = as.data.frame(count01s)
	colnames(appen) = c(tcolName)
	df = bind_cols(df, appen)
	}
	chisq.test(df)$p.value
	}


	frame = makeRandomFrame(options)
	frameData = frame$data
	contrib = frame$contrib
	chis = lapply(1:options$attribCount, function(x){chi(frameData, x, options)})
	#hoera!
	plot(1:options$attribCount, chis, ylim=c(0, 0.05))
	abline(v= contrib[["All"]], col = "red")