# pamp gini
# Normalised mutual information ("Ilic correlation") between two vectors.
# Uses table.p() (defined below) to build the joint frequency table in parallel.
cor.ilic <- function(x, y) {
  bt <- table.p(data.frame(x = x, y = y), 32)
  # reshape() to a wide table is far too slow here, so work on the long format
  t <- bt$Freq
  sumt <- sum(t)
  t1 <- t / sumt
  # joint entropy H(X,Y); 0*log(0) is treated as 0
  hxy <- sum(-t1 * ifelse(t1 == 0, 0, log(t1)))
  # marginal distributions of x and y (NA kept as its own level)
  x.margin <- aggregate(bt$Freq / sumt, by = list(addNA(as.factor(bt[[1]]))), sum)$x
  y.margin <- aggregate(bt$Freq / sumt, by = list(addNA(as.factor(bt[[2]]))), sum)$x
  hx <- sum(-x.margin * ifelse(x.margin == 0, 0, log(x.margin)))
  hy <- sum(-y.margin * ifelse(y.margin == 0, 0, log(y.margin)))
  # mutual information I(X;Y) = H(X) + H(Y) - H(X,Y), scaled by the smaller entropy
  (hx + hy - hxy) / min(hx, hy)
}
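# A minimal sanity-check sketch (my addition, not part of the original gist):
# a variable paired with itself should score ~1 and an independent shuffle ~0.
# Assumes table.p() below has been sourced and a doSNOW cluster is registered
# as at the bottom of this file.
# v <- sample(letters[1:4], 1e4, replace = TRUE)
# cor.ilic(v, v)          # expect ~1
# cor.ilic(v, sample(v))  # expect ~0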
z = d$default
x = d1$AVG_CASA_AGG_BAL_L12MWs
y = d1$UTL_OD_L12MWs
x1 = x[z==1]
y1 = y[z==1]
cor.ilic(x,y)
x0 = x[z==0]
y0 = y[z==0] # was x[z==0], a typo
x = runif(length(d3$UTL_OD_L12M))
y = runif(length(d3$UTL_OD_L12M))
d3$AVG_CASA_AGG_BAL_L12M
x <- d3$AVG_CASA_BAL_L6MW
y <- d3$AVG_CASA_AGG_BAL_L12MW
y = runif(length(unique(d3$AVG_CASA_BAL_L6MW)))
x = runif(length(unique(d3$AVG_CASA_BAL_L6MW)))
cor.ilic(x,y)
d4 <- subset(d3,select=c("AVG_CASA_AGG_BAL_L12M","AVG_CASA_BAL_L6M"))
system.time(bt <- table.p(d,chunks=32))
#require(bigtabulate)
cor.ilic(x,y)
#d2 <- subset(d1,z == 0)
d2 <- data
all.pairs <- combn(names(d2),2)
# normalised MI for every pair of columns in d2
icorr <- apply(all.pairs, 2, function(n2) {
  xy <- d2[n2]
  cor.ilic(xy[[1]], xy[[2]])
})
ns <- apply(all.pairs,2,function(n2) {paste(n2,collapse="+")})
names(icorr) <- ns
icorr <- sort(icorr,decreasing=TRUE)
icorr.df <- as.data.frame(icorr)
fix(icorr.df)
#d1[all.pairs[,1]]
# Three-variable variant: total correlation H(X)+H(Y)+H(Z)-H(X,Y,Z),
# scaled by the smallest marginal entropy
t <- table(x, y, z, exclude = NULL, useNA = "ifany")
t1 <- t / sum(t)
hxy <- sum(-t1 * ifelse(t1 == 0, 0, log(t1)))
x.margin <- apply(t1, 1, sum)
y.margin <- apply(t1, 2, sum)
z.margin <- apply(t1, 3, sum)
hx <- sum(-x.margin * ifelse(x.margin == 0, 0, log(x.margin)))
hy <- sum(-y.margin * ifelse(y.margin == 0, 0, log(y.margin)))
hz <- sum(-z.margin * ifelse(z.margin == 0, 0, log(z.margin)))
ilic.corr <- (hx + hy + hz - hxy) / min(hx, hy, hz)
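# A hedged sketch (my addition): the same three-way computation wrapped as a
# reusable function, so it can be applied over triples the way cor.ilic() is
# applied over pairs. Purely illustrative; the original gist runs it inline.
cor.ilic3 <- function(x, y, z) {
  t1 <- table(x, y, z, exclude = NULL, useNA = "ifany")
  t1 <- t1 / sum(t1)
  h <- function(p) sum(-p * ifelse(p == 0, 0, log(p))) # entropy with 0*log(0)=0
  hxyz <- h(t1)
  hx <- h(apply(t1, 1, sum))
  hy <- h(apply(t1, 2, sum))
  hz <- h(apply(t1, 3, sum))
  (hx + hy + hz - hxyz) / min(hx, hy, hz)
}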
# check: each column of s is a random 10-part partition of [0,1], so every
# column must sum to 1 (up to floating-point error)
set.seed(3)
system.time(s <- replicate(10000,diff(c(0,sort(sample(seq(0,1,0.01),9,replace=TRUE)),1))))
which(apply(s,2,sum)==1)
all(abs(apply(s,2,sum)-1) < 0.001)
# d <- read.csv("c:/temp/gini.csv")
# pred <- d$score
# actual <- d$bad_outcome
d3 <- read.csv("c:/temp/datalah.csv")
d <- subset(d3,select=c("AVG_CASA_AGG_BAL_L12M","AVG_CASA_BAL_L6M"))
d <- subset(d3,select=c("AVG_CASA_AGG_BAL_L12M","default"))
bt <- table.p(d,chunks=32)
# Parallelised contingency table: splits the rows of d into chunks, tabulates
# each chunk on a worker, then merges the partial tables and sums the counts.
table.p <- function(d, chunks = NULL) {
  require(parallel)
  require(doSNOW)
  # determine the number of workers available
  n.threads <- parallel::detectCores()
  if (is.null(chunks)) chunks <- n.threads
  grouping <- rep(1:chunks, length.out = length(d[[1]]))
  # tabulate each chunk in parallel
  print("multiple tables")
  chunk.tables <- foreach(i = 1:chunks) %dopar% (
    table(d[grouping == i, ], exclude = NULL, useNA = "always")
  )
  print("FINISHED: multiple tables")
  base <- NULL
  print("merging datasets")
  for (i in 1:chunks) {
    ddd <- data.frame(chunk.tables[[i]])
    names.l <- length(names(ddd))
    names(ddd) <- c(names(ddd)[-names.l], "Freq")
    # for sparse tables, dropping zero cells vastly improves merge performance
    ddd <- subset(ddd, Freq != 0)
    names(ddd) <- c(names(ddd)[-names.l], paste("Freq", i))
    if (i == 1) base <- ddd
    else base <- merge(base, ddd, all = TRUE)
    print(paste(i, chunks, sep = " out of "))
    gc()
  }
  print("FINISHED: merging datasets")
  # sum the per-chunk counts into a single Freq column, treating NA as 0
  print("Creating Freq")
  freq.i <- base[[names.l]]
  base$Freq <- ifelse(is.na(freq.i), 0, freq.i)
  for (i in 2:chunks) {
    freq.i <- base[[names.l - 1 + i]]
    base$Freq <- base$Freq + ifelse(is.na(freq.i), 0, freq.i)
  }
  print("FINISHED: Creating Freq")
  # clean up: drop the per-chunk columns; each deletion shifts the next
  # "Freq i" column into position names.l, so the same index is removed
  print("clean up")
  for (i in 1:chunks) {
    base[[names.l]] <- NULL
  }
  print("FINISHED: clean up")
  return(base)
}
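# A quick check sketch (my addition, assuming a doSNOW cluster is registered
# as at the bottom of this file): every row is counted exactly once across
# the chunks, so the Freq column must total nrow(dd).
# dd <- data.frame(a = sample(letters[1:3], 1000, replace = TRUE),
#                  b = sample(1:4, 1000, replace = TRUE))
# bt <- table.p(dd, chunks = 4)
# sum(bt$Freq) == nrow(dd)  # expect TRUE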
# Gini coefficient via the trapezoidal rule on the cumulative good/bad curve;
# pred and actual are assumed to be long vectors (millions of rows)
gini.pamp <- function(pred, actual, chunks = NULL, .parallel = FALSE) {
  if (.parallel) {
    require(parallel)
    require(doSNOW)
    require(reshape2) # for acast()
    # determine the number of workers available
    n.threads <- getDoParWorkers()
  }
  # compute the pred x actual contingency table first
  if (.parallel) {
    system.time(table.d1 <- acast(table.p(data.frame(pred = pred, actual = actual)),
                                  pred ~ actual, sum, value.var = "Freq"))
    table.d <- table.d1[order(as.numeric(dimnames(table.d1)[[1]])), ]
  } else {
    system.time(table.d <- table(data.frame(pred = pred, actual = actual)))
  }
  # cumulative shares of goods (column 1) and bads (column 2) by score
  good.cumsum <- cumsum(table.d[, 1]) / sum(table.d[, 1])
  bad.cumsum <- cumsum(table.d[, 2]) / sum(table.d[, 2])
  tmp.l <- length(good.cumsum)
  # trapezoidal rule: 1 - sum(diff(bad) * (good[i] + good[i-1]))
  1 - crossprod(diff(bad.cumsum), (good.cumsum[-1] + good.cumsum[-tmp.l]))
}
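# Sanity check sketch (my addition, illustrative only): with bads concentrated
# at the low end of the score the Gini should come out near 1, and near 0 for
# an uninformative random score.
# set.seed(1)
# actual <- rbinom(1e5, 1, 0.3)
# gini.pamp(runif(1e5) + (actual == 0), actual)  # perfect separation: ~1
# gini.pamp(runif(1e5), actual)                  # random score: ~0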
#system.time(print(gini.pamp(d$score, d$bad_outcome, .parallel = TRUE)))
#system.time(print(gini.pamp(d$score, d$bad_outcome)))
gc()
x <- runif(2^23)
y <- ifelse(runif(2^23)<0.5,0,1)
#x <- runif(100000)
#y <- ifelse(runif(100000)<0.5,0,1)
system.time(print(gini.pamp(x, y)))
# packages that provide the parallel backend
require(parallel)
require(doSNOW)
require(reshape2)
# number of worker cores to use (this machine has 8)
cores = 2
# start a cluster with at most 8 workers and register it as the foreach backend
cl = makeCluster(min(8,cores))
registerDoSNOW(cl)
getDoParWorkers()
system.time(print(gini.pamp(x, y, .parallel = TRUE)))
#seems like the merging step takes the longest time
stopCluster(cl)