Skip to content

Instantly share code, notes, and snippets.

@yabyzq
Created November 14, 2016 13:59
Show Gist options
  • Save yabyzq/3ea4b234504cb947f8c73fe494c0c1f3 to your computer and use it in GitHub Desktop.
Save yabyzq/3ea4b234504cb947f8c73fe494c0c1f3 to your computer and use it in GitHub Desktop.
# R - caret data handling
library(caret)
library(dplyr)
# Looking at missing values: one row per iris column with the number of NA
# entries and a "typical" value.
options(digits = 2)
stats <- data.frame(
  # Count of NA entries in each column.
  missing = vapply(iris, function(x) sum(is.na(x)), integer(1)),
  # Mean for numeric columns, most frequent value otherwise. Both branches
  # return a string so the column has one stable type (the mixed
  # numeric/character result was being coerced to character anyway).
  mean = vapply(
    iris,
    function(x) {
      if (is.numeric(x)) {
        as.character(mean(x, na.rm = TRUE))
      } else {
        counts <- table(x)
        # First name after sorting by descending frequency; NA when the
        # column has no non-missing values.
        names(counts)[order(counts, decreasing = TRUE)][1]
      }
    },
    character(1)
  )
)
# Generate dummy variables: build the encoder from the formula on iris,
# apply it back to iris and preview the first rows of the result.
head(
  predict(
    object = dummyVars(formula = Sepal.Length ~ ., data = iris),
    newdata = iris
  )
)
# nearZeroVar: per-column metrics for iris (saveMetrics = TRUE returns the
# metrics table rather than column positions).
nzv <- nearZeroVar(iris, freqCut = 50 / 51, saveMetrics = TRUE)
# Show at most the first five flagged columns. head() is used instead of
# `[1:5, ]` because indexing past the last row pads the output with all-NA
# rows whenever fewer than five columns are flagged.
head(nzv[nzv$nzv, , drop = FALSE], 5)
# Find correlation: positions of columns whose pairwise correlation exceeds
# the cutoff, computed on iris without its fifth column.
cor.index <- findCorrelation(cor(iris[, -5]), cutoff = 0.75)
# The positions refer to the columns of iris[, -5], so look the names up
# there. Indexing the name vector (rather than the data frame) also avoids
# getting NULL when a single position collapses the selection to a vector.
names(iris[, -5])[cor.index]
# Find linear combos among the columns of iris without its fifth column.
linear.index <- findLinearCombos(iris[, -5])
# findLinearCombos() returns a list (linearCombos, remove), which cannot be
# used directly as a column subscript. `remove` holds the column positions
# suggested for removal; it is NULL when there are none, which yields an
# empty character vector here.
names(iris[, -5])[linear.index$remove]
#
# preProcess: estimate centering and scaling on the first 100 rows of iris
# only, then apply that same fitted transformation to both parts.
pre.value <- preProcess(head(iris, 100), method = c("center", "scale"))
# Rows 1-100, transformed with the parameters estimated above.
iris.train.trans <- predict(pre.value, head(iris, 100))
# Rows 101-150 (the last 50), transformed with the same parameters.
iris.test.trans <- predict(pre.value, tail(iris, 50))
# Age group: anomalous values are
# assigned to the normal threshold.
# Assign missing values by group: fill NA `renta` with the median `renta`
# of the row's `nomprov` group.
# The fill is done with a grouped mutate so each row is matched to its own
# group's median directly, instead of relying on two separately sorted
# tables lining up row by row.
df <- df %>%
  arrange(nomprov) %>%
  group_by(nomprov) %>%
  mutate(renta = ifelse(is.na(renta), median(renta, na.rm = TRUE), renta)) %>%
  ungroup()
# Groups with no observed `renta` at all are still NA here; fall back to
# the overall median.
df$renta[is.na(df$renta)] <- median(df$renta, na.rm = TRUE)
# Final ordering by `fecha_dato` (ties keep the `nomprov` order from above).
df <- arrange(df, fecha_dato)
# Print out the unique values of every categorical column of iris.
# Factor columns are selected as well as character ones: iris stores
# Species as a factor, so a character-only filter selects nothing and the
# loop below would never run.
char.cols <- names(iris)[
  vapply(iris, function(x) is.character(x) || is.factor(x), logical(1))
]
for (name in char.cols) {
  print(sprintf("Unique values for %s:", name))
  print(unique(iris[[name]]))
  cat("\n")
}
# Convert to binary then apply below
head(diamonds)
# Encode color and cut of diamonds.
# NOTE(review): `drop2nd` does not appear among dummyVars' documented
# arguments and is presumably ignored -- kept as in the original; confirm.
a <- predict(
  dummyVars(~ color + cut, data = diamonds, fullRank = TRUE, drop2nd = TRUE),
  newdata = diamonds
)
# Binarise: 1 where the encoded value is positive, 0 otherwise. The
# vectorised comparison keeps the shape and column names of `a`, whereas an
# element-wise sapply() flattens the result into a plain vector.
head((a > 0) * 1)
# Dummy columns for the single Species column of iris.
dummies <- predict(dummyVars(~ Species, data = iris), newdata = iris)
head(dummies, n = 10)
# Dummy encoding applied to every column of diamonds.
dummies <- predict(dummyVars(~ ., data = diamonds), newdata = diamonds)
head(dummies, n = 10)
# A: number of rows of df for each (month, feature) pair.
totals.by.feature <- df %>%
  group_by(month, feature) %>%
  summarise(counts = n()) %>%
  # summarise() only peels off the last grouping level; drop the remaining
  # grouping so later operations on this table are not silently per-month.
  ungroup()
# Source: https://www.kaggle.com/apryor6/santander-product-recommendation/detailed-cleaning-visualization
# Bar chart of each status' share of rows per month, one panel per feature.
# Rows are counted per (month, feature, status), joined to the
# per-(month, feature) totals in totals.by.feature, and divided by them.
df %>%
  group_by(month, feature, status) %>%
  summarise(counts = n()) %>%
  ungroup() %>%
  inner_join(totals.by.feature, by = c("month", "feature")) %>%
  # counts.x is the per-status count from this pipeline, counts.y the total
  # brought in by the join.
  mutate(counts = counts.x / counts.y) %>%
  ggplot(
    aes(
      # Month abbreviations with levels in reverse calendar order.
      x = factor(month.abb[month], levels = rev(month.abb)),
      y = counts
    )
  ) +
  geom_bar(aes(fill = status), stat = "identity") +
  facet_wrap(~feature, ncol = 6) +
  coord_flip() +
  my_theme_dark +
  labs(
    x = "",
    y = "Count",
    title = "Relative Service \nChanges by Month"
  ) +
  ylim(limits = c(0, 1)) +
  theme(
    axis.text = element_text(size = 10),
    legend.text = element_text(size = 14),
    legend.title = element_blank(),
    strip.text = element_text(face = "bold")
  ) +
  scale_fill_manual(values = c("cyan", "magenta"))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment