# pamp gini
# Normalised mutual information ("Ilic correlation") between two vectors.
# Uses table.p() (defined below) to build the joint frequency table in parallel.
cor.ilic <- function(x, y) {
  bt <- table.p(data.frame(x = x, y = y), 32)
  # reshape() to a wide table is far too slow here, so work on the long format
  t <- bt$Freq
  sumt <- sum(t)
  t1 <- t / sumt
  # joint entropy H(X,Y); 0*log(0) is treated as 0
  hxy <- sum(-t1 * ifelse(t1 == 0, 0, log(t1)))
  # marginal distributions of x and y (NA kept as its own level)
  x.margin <- aggregate(bt$Freq / sumt, by = list(addNA(as.factor(bt[[1]]))), sum)$x
  y.margin <- aggregate(bt$Freq / sumt, by = list(addNA(as.factor(bt[[2]]))), sum)$x
  hx <- sum(-x.margin * ifelse(x.margin == 0, 0, log(x.margin)))
  hy <- sum(-y.margin * ifelse(y.margin == 0, 0, log(y.margin)))
  # mutual information I(X;Y) = H(X) + H(Y) - H(X,Y), scaled by the smaller entropy
  (hx + hy - hxy) / min(hx, hy)
}
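# A minimal sanity-check sketch (my addition, not part of the original gist):
# a variable paired with itself should score ~1 and an independent shuffle ~0.
# Assumes table.p() below has been sourced and a doSNOW cluster is registered
# as at the bottom of this file.
# v <- sample(letters[1:4], 1e4, replace = TRUE)
# cor.ilic(v, v)          # expect ~1
# cor.ilic(v, sample(v))  # expect ~0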
z = d$default
x = d1$AVG_CASA_AGG_BAL_L12MWs
y = d1$UTL_OD_L12MWs
x1 = x[z==1]
y1 = y[z==1]
cor.ilic(x,y)
x0 = x[z==0]
y0 = y[z==0] # was x[z==0], a typo
x = runif(length(d3$UTL_OD_L12M))
y = runif(length(d3$UTL_OD_L12M))
d3$AVG_CASA_AGG_BAL_L12M
x <- d3$AVG_CASA_BAL_L6MW
y <- d3$AVG_CASA_AGG_BAL_L12MW
y = runif(length(unique(d3$AVG_CASA_BAL_L6MW)))
x = runif(length(unique(d3$AVG_CASA_BAL_L6MW)))
cor.ilic(x,y)
d4 <- subset(d3,select=c("AVG_CASA_AGG_BAL_L12M","AVG_CASA_BAL_L6M"))
system.time(bt <- table.p(d,chunks=32))
#require(bigtabulate)
cor.ilic(x,y)
#d2 <- subset(d1,z == 0)
d2 <- data
all.pairs <- combn(names(d2),2)
# normalised MI for every pair of columns in d2
icorr <- apply(all.pairs, 2, function(n2) {
  xy <- d2[n2]
  cor.ilic(xy[[1]], xy[[2]])
})
ns <- apply(all.pairs,2,function(n2) {paste(n2,collapse="+")})
names(icorr) <- ns
icorr <- sort(icorr,decreasing=TRUE)
icorr.df <- as.data.frame(icorr)
fix(icorr.df)
#d1[all.pairs[,1]]
# Three-variable variant: total correlation H(X)+H(Y)+H(Z)-H(X,Y,Z),
# scaled by the smallest marginal entropy
t <- table(x, y, z, exclude = NULL, useNA = "ifany")
t1 <- t / sum(t)
hxy <- sum(-t1 * ifelse(t1 == 0, 0, log(t1)))
x.margin <- apply(t1, 1, sum)
y.margin <- apply(t1, 2, sum)
z.margin <- apply(t1, 3, sum)
hx <- sum(-x.margin * ifelse(x.margin == 0, 0, log(x.margin)))
hy <- sum(-y.margin * ifelse(y.margin == 0, 0, log(y.margin)))
hz <- sum(-z.margin * ifelse(z.margin == 0, 0, log(z.margin)))
ilic.corr <- (hx + hy + hz - hxy) / min(hx, hy, hz)
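# A hedged sketch (my addition): the same three-way computation wrapped as a
# reusable function, so it can be applied over triples the way cor.ilic() is
# applied over pairs. Purely illustrative; the original gist runs it inline.
cor.ilic3 <- function(x, y, z) {
  t1 <- table(x, y, z, exclude = NULL, useNA = "ifany")
  t1 <- t1 / sum(t1)
  h <- function(p) sum(-p * ifelse(p == 0, 0, log(p))) # entropy with 0*log(0)=0
  hxyz <- h(t1)
  hx <- h(apply(t1, 1, sum))
  hy <- h(apply(t1, 2, sum))
  hz <- h(apply(t1, 3, sum))
  (hx + hy + hz - hxyz) / min(hx, hy, hz)
}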
# check: each column of s is a random 10-part partition of [0,1], so every
# column must sum to 1 (up to floating-point error)
set.seed(3)
system.time(s <- replicate(10000,diff(c(0,sort(sample(seq(0,1,0.01),9,replace=TRUE)),1))))
which(apply(s,2,sum)==1)
all(abs(apply(s,2,sum)-1) < 0.001)
# d <- read.csv("c:/temp/gini.csv")
# pred <- d$score
# actual <- d$bad_outcome
d3 <- read.csv("c:/temp/datalah.csv")
d <- subset(d3,select=c("AVG_CASA_AGG_BAL_L12M","AVG_CASA_BAL_L6M"))
d <- subset(d3,select=c("AVG_CASA_AGG_BAL_L12M","default"))
bt <- table.p(d,chunks=32)
# Parallelised contingency table: splits the rows of d into chunks, tabulates
# each chunk on a worker, then merges the partial tables and sums the counts.
table.p <- function(d, chunks = NULL) {
  require(parallel)
  require(doSNOW)
  # determine the number of workers available
  n.threads <- parallel::detectCores()
  if (is.null(chunks)) chunks <- n.threads
  grouping <- rep(1:chunks, length.out = length(d[[1]]))
  # tabulate each chunk in parallel
  print("multiple tables")
  chunk.tables <- foreach(i = 1:chunks) %dopar% (
    table(d[grouping == i, ], exclude = NULL, useNA = "always")
  )
  print("FINISHED: multiple tables")
  base <- NULL
  print("merging datasets")
  for (i in 1:chunks) {
    ddd <- data.frame(chunk.tables[[i]])
    names.l <- length(names(ddd))
    names(ddd) <- c(names(ddd)[-names.l], "Freq")
    # for sparse tables, dropping zero cells vastly improves merge performance
    ddd <- subset(ddd, Freq != 0)
    names(ddd) <- c(names(ddd)[-names.l], paste("Freq", i))
    if (i == 1) base <- ddd
    else base <- merge(base, ddd, all = TRUE)
    print(paste(i, chunks, sep = " out of "))
    gc()
  }
  print("FINISHED: merging datasets")
  # sum the per-chunk counts into a single Freq column, treating NA as 0
  print("Creating Freq")
  freq.i <- base[[names.l]]
  base$Freq <- ifelse(is.na(freq.i), 0, freq.i)
  for (i in 2:chunks) {
    freq.i <- base[[names.l - 1 + i]]
    base$Freq <- base$Freq + ifelse(is.na(freq.i), 0, freq.i)
  }
  print("FINISHED: Creating Freq")
  # clean up: drop the per-chunk columns; each deletion shifts the next
  # "Freq i" column into position names.l, so the same index is removed
  print("clean up")
  for (i in 1:chunks) {
    base[[names.l]] <- NULL
  }
  print("FINISHED: clean up")
  return(base)
}
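# A quick check sketch (my addition, assuming a doSNOW cluster is registered
# as at the bottom of this file): every row is counted exactly once across
# the chunks, so the Freq column must total nrow(dd).
# dd <- data.frame(a = sample(letters[1:3], 1000, replace = TRUE),
#                  b = sample(1:4, 1000, replace = TRUE))
# bt <- table.p(dd, chunks = 4)
# sum(bt$Freq) == nrow(dd)  # expect TRUE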
# Gini coefficient via the trapezoidal rule on the cumulative good/bad curve;
# pred and actual are assumed to be long vectors (millions of rows)
gini.pamp <- function(pred, actual, chunks = NULL, .parallel = FALSE) {
  if (.parallel) {
    require(parallel)
    require(doSNOW)
    require(reshape2) # for acast()
    # determine the number of workers available
    n.threads <- getDoParWorkers()
  }
  # compute the pred x actual contingency table first
  if (.parallel) {
    system.time(table.d1 <- acast(table.p(data.frame(pred = pred, actual = actual)),
                                  pred ~ actual, sum, value.var = "Freq"))
    table.d <- table.d1[order(as.numeric(dimnames(table.d1)[[1]])), ]
  } else {
    system.time(table.d <- table(data.frame(pred = pred, actual = actual)))
  }
  # cumulative shares of goods (column 1) and bads (column 2) by score
  good.cumsum <- cumsum(table.d[, 1]) / sum(table.d[, 1])
  bad.cumsum <- cumsum(table.d[, 2]) / sum(table.d[, 2])
  tmp.l <- length(good.cumsum)
  # trapezoidal rule: 1 - sum(diff(bad) * (good[i] + good[i-1]))
  1 - crossprod(diff(bad.cumsum), (good.cumsum[-1] + good.cumsum[-tmp.l]))
}
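# Sanity check sketch (my addition, illustrative only): with bads concentrated
# at the low end of the score the Gini should come out near 1, and near 0 for
# an uninformative random score.
# set.seed(1)
# actual <- rbinom(1e5, 1, 0.3)
# gini.pamp(runif(1e5) + (actual == 0), actual)  # perfect separation: ~1
# gini.pamp(runif(1e5), actual)                  # random score: ~0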
#system.time(print(gini.pamp(d$score, d$bad_outcome, .parallel = TRUE)))
#system.time(print(gini.pamp(d$score, d$bad_outcome)))
gc()
x <- runif(2^23)
y <- ifelse(runif(2^23)<0.5,0,1)
#x <- runif(100000)
#y <- ifelse(runif(100000)<0.5,0,1)
system.time(print(gini.pamp(x, y)))
# packages that provide the parallel backend
require(parallel)
require(doSNOW)
require(reshape2)
# number of worker cores to use (this machine has 8)
cores = 2
# start a cluster with at most 8 workers and register it as the foreach backend
cl = makeCluster(min(8,cores))
registerDoSNOW(cl)
getDoParWorkers()
system.time(print(gini.pamp(x, y, .parallel = TRUE)))
#seems like the merging step takes the longest time
stopCluster(cl)