Skip to content

Instantly share code, notes, and snippets.

Embed
What would you like to do?
pamp gini
#
#
cor.ilic <- function(x,y) {
# tryCatch(t <- table(x,y,exclude=NULL,useNA="ifany")
# , error = function(e) {
#
# }
# )
bt <- table.p(data.frame(x=x,y=y),32)
#t <- reshape(bt,idvar=names(bt)[1],timevar=names(bt)[2],direction="wide") #takes too fucking long
t <- bt$Freq
sumt <- sum(t)
t1 <- t / sumt
hxy <- sum(-t1 * ifelse(t1==0,0,log(t1)))
#x.margin <- apply(t1,1,sum)
#y.margin <- apply(t1,2,sum)
x.margin <- aggregate(bt$Freq/sumt,by=list(addNA(as.factor(bt[[1]]))),sum)$x
y.margin <- aggregate(bt$Freq/sumt,by=list(addNA(as.factor(bt[[2]]))),sum)$x
hx <- sum(-x.margin * ifelse(x.margin==0,0,log(x.margin)))
hy <- sum(-y.margin * ifelse(y.margin==0,0,log(y.margin)))
(hx + hy - hxy) / min(hx,hy)
}
z = d$default
x = d1$AVG_CASA_AGG_BAL_L12MWs
y = d1$UTL_OD_L12MWs
x1 = x[z==1]
y1 = y[z==1]
cor.ilic(x,y)
x0 = x[z==0]
y0 = x[z==0]
x = runif(length(d3$UTL_OD_L12M))
y = runif(length(d3$UTL_OD_L12M))
d3$AVG_CASA_AGG_BAL_L12M
x <- d3$AVG_CASA_BAL_L6MW
y <- d3$AVG_CASA_AGG_BAL_L12MW
y = runif(length(unique(d3$AVG_CASA_BAL_L6MW)))
x = runif(length(unique(d3$AVG_CASA_BAL_L6MW)))
cor.ilic(x,y)
d4 <- subset(d3,select=c("AVG_CASA_AGG_BAL_L12M","AVG_CASA_BAL_L6M"))
system.time(bt <- table.p(d,chunks=32))
#require(bigtabulate)
cor.ilic(x,y)
#d2 <- subset(d1,z == 0)
d2 <-data
all.pairs <- combn(names(d2),2)
icorr <- apply(all.pairs,2,function(n2) {xy <- d2[n2]; x<-xy[[1]]; y<-xy[[2]];cor.ilic(x,y)})
ns <- apply(all.pairs,2,function(n2) {paste(n2,collapse="+")})
names(icorr) <- ns
icorr <- sort(icorr,decreasing=TRUE)
icorr.df <- as.data.frame(icorr)
fix(icorr.df)
#d1[all.pairs[,1]]
t <- table(x,y,z,exclude=NULL,useNA="ifany")
t1 <- t / sum(t)
hxy <- sum(-t1 * ifelse(t1==0,0,log(t1)))
x.margin <- apply(t1,1,sum)
y.margin <- apply(t1,2,sum)
z.margin <- apply(t1,3,sum)
hx <- sum(-x.margin * ifelse(x.margin==0,0,log(x.margin)))
hy <- sum(-y.margin * ifelse(y.margin==0,0,log(y.margin)))
hz <- sum(-z.margin * ifelse(z.margin==0,0,log(z.margin)))
ilic.corr <- (hx + hy + hz - hxy) / min(hx,hy,hz)
set.seed(3)
system.time(s <- replicate(10000,diff(c(0,sort(sample(seq(0,1,0.01),9,replace=TRUE)),1))))
which(apply(s,2,sum)==1)
all(apply(s,2,sum)-1 < 0.001)
# d <- read.csv("c:/temp/gini.csv")
# pred <- d$score
# actual <- d$bad_outcome
d3 <- read.csv("c:/temp/datalah.csv")
d <- subset(d3,select=c("AVG_CASA_AGG_BAL_L12M","AVG_CASA_BAL_L6M"))
d <- subset(d3,select=c("AVG_CASA_AGG_BAL_L12M","default"))
bt <- table.p(d,chunks=32)
#paralellized table
table.p <- function(d,chunks=NULL) {
require(parallel)
require(doSNOW)
# determine the number of workers available
n.threads <- parallel::detectCores()
if (is.null(chunks)) chunks <- n.threads
#grouping <- rep_len(1:n.threads,length(d[[1]]))
grouping <- rep(1:chunks,length.out=length(d[[1]]))
#doing multiple tables
print("multiple table")
haha <- foreach(i = 1:chunks) %dopar% (
table(d[grouping==i,],exclude=NULL, useNA ="always")
)
print("FINISHED: multiple tables")
base = NULL
print("merging datasets")
for (i in 1:chunks) {
ddd <- data.frame(haha[[i]])
names.l <- length(names(ddd))
names(ddd) <- c(names(ddd)[-names.l],"Freq")
ddd <- subset(ddd,Freq!=0) #for sparse matrix this will vastly improve the performance
names(ddd) <- c(names(ddd)[-names.l],paste("Freq",i))
if (i==1) {
base <- ddd
}
else base <- merge(base,ddd,all=TRUE)
print(paste(i,chunks,sep=" out of "))
gc()
}
print("FINISHED: merging datasets")
print("Creating Freq")
freq.i <- base[[names.l - 1 +1]]
base$Freq <- ifelse(is.na(freq.i),rep(0,length(freq.i)),freq.i)
for (i in 2:chunks) {
freq.i <- base[[names.l - 1 +i]]
base$Freq <- base$Freq + ifelse(is.na(freq.i),rep(0,length(freq.i)),freq.i)
#print(i)
}
print("FINISHED: Creating Freq")
#clean up
print("clean up")
for (i in 1:chunks) {
base[[names.l - 1 +1]] <- NULL
}
print("FINISHED: clean up")
#
return(base)
}
# it is assumed that d has a number of data
gini.pamp <- function(pred, actual,chunks=NULL .parallel = FALSE) {
if (.parallel) {
require(parallel)
require(doSNOW)
# determine the number of workers available
n.threads <- getDoParWorkers()
}
# compute the table first
if (.parallel) {
system.time(table.d1 <- acast(table.p(data.frame(pred=pred,actual=actual)),pred~actual,sum,value.var="Freq"))
table.d <- table.d1[order(as.numeric(dimnames(table.d1)[[1]])),]
}
else
system.time(table.d <- table(data.frame(pred=pred,actual=actual)))
#print("cumsum")
# cumsum
good.cumsum <- cumsum(table.d[,1]) / sum(table.d[,1])
bad.cumsum <- cumsum(table.d[,2]) / sum(table.d[,2])
tmp.l <- length(good.cumsum)
#1 - sum(diff(bad.cumsum) * (good.cumsum[-1] + good.cumsum[-tmp.l]))
1 - crossprod(diff(bad.cumsum) , (good.cumsum[-1] + good.cumsum[-tmp.l]))
# divide the data up into threads and compute the Gini
}
#system.time(print(gini.pamp(d$score, d$bad_outcome, .parallel = TRUE)))
#system.time(print(gini.pamp(d$score, d$bad_outcome)))
gc()
x <- runif(2^23)
y <- ifelse(runif(2^23)<0.5,0,1)
#x <- runif(100000)
#y <- ifelse(runif(100000)<0.5,0,1)
system.time(print(gini.pamp(x, y)))
# the package that allows for parallel processing
require(parallel)
require(doSNOW)
require(reshape2)
# i have 8 cpu cores
cores = 2
# some generic code to tell R to use the 8 cores
#browser()
cl = makeCluster(min(8,cores))
registerDoSNOW(cl)
getDoParWorkers()
system.time(print(gini.pamp(x, y, .parallel = TRUE)))
#seems like merging takes the longest time
stopCluster(cl)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
You can’t perform that action at this time.