Created
November 14, 2016 13:59
-
-
Save yabyzq/3ea4b234504cb947f8c73fe494c0c1f3 to your computer and use it in GitHub Desktop.
R - caret data handling
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
library(caret) | |
# Looking at missing values: per-column summary of iris with the number of
# NAs and a "typical" value (mean for numeric columns, most frequent value
# otherwise).
options(digits = 2)  # global option: print numbers with 2 significant digits

# Most frequent value of x as a string. The first value wins on ties; returns
# NA when there is nothing to tabulate.
most.frequent <- function(x) {
  counts <- table(x)
  if (length(counts) == 0L) {
    return(NA_character_)
  }
  names(counts)[which.max(counts)]
}

stats <- data.frame(
  missing = vapply(iris, function(x) sum(is.na(x)), integer(1)),
  # The column is called `mean` but holds the mode for non-numeric columns.
  # Both kinds share one column, so every entry is stored as text.
  mean = vapply(
    iris,
    function(x) {
      if (is.numeric(x)) {
        as.character(mean(x, na.rm = TRUE))
      } else {
        most.frequent(x)
      }
    },
    character(1)
  )
)
# Generate dummy variables: build an encoder from every column to the right
# of `~`, apply it to iris, and show the first rows of the result.
iris.dummy.spec <- dummyVars(Sepal.Length ~ ., data = iris)
iris.dummy.matrix <- predict(iris.dummy.spec, newdata = iris)
head(iris.dummy.matrix)
# nearZeroVar: compute per-column near-zero-variance metrics for iris.
# saveMetrics = TRUE returns a data frame of metrics (one row per column)
# whose logical `nzv` column marks the flagged predictors.
nzv <- nearZeroVar(iris, freqCut = 50 / 51, saveMetrics = TRUE)
# Show at most the first five flagged columns. head() is used instead of
# `[1:5, ]` because indexing past the end of the flagged subset pads the
# output with all-NA rows whenever fewer than five columns are flagged.
head(nzv[nzv$nzv, ], 5)
# Find correlation: positions of numeric iris columns that findCorrelation()
# suggests removing at an absolute pairwise-correlation cutoff of 0.75.
# Column 5 (Species) is not numeric, so it is excluded.
iris.cor <- cor(iris[, -5])
cor.index <- findCorrelation(iris.cor, cutoff = 0.75)
# The indices refer to columns of the correlation matrix, so the names are
# looked up there. Subsetting the name vector (rather than the data frame)
# also stays correct when exactly one index is returned, where `iris[, i]`
# would drop to an unnamed vector and names() would give NULL.
colnames(iris.cor)[cor.index]
# Find linear combos: detect exact linear dependencies among the numeric
# iris columns (column 5, Species, is excluded).
# findLinearCombos() returns a list: `linearCombos` (the dependent column
# sets) and `remove` (column positions suggested for removal).
linear.index <- findLinearCombos(iris[, -5])
# Only `$remove` is a valid column index; using the whole list as a subscript
# raises an error. The result is character(0) when nothing needs removing.
names(iris[, -5])[linear.index$remove]
# | |
# preProcess: estimate centering/scaling parameters on the first 100 rows
# only, then apply those same parameters to both subsets.
# NOTE(review): iris rows appear to be ordered by Species, so this positional
# split may put whole classes on one side -- confirm this is intended.
iris.train <- iris[1:100, ]
iris.test <- iris[101:150, ]

pre.value <- preProcess(iris.train, method = c("center", "scale"))
iris.train.trans <- predict(pre.value, iris.train)
iris.test.trans <- predict(pre.value, iris.test)
# Age-group anomalous values:
# assign them to the normal threshold
# Assign missing values by group: fill each missing `renta` with the median
# `renta` of the row's `nomprov` group, then fall back to the overall median
# for rows whose group has no observed value.
nomprov.medians <- df %>%
  group_by(nomprov) %>%
  summarise(med.income = median(renta, na.rm = TRUE))

# Look the group median up by key. This avoids depending on two separately
# sorted data frames staying row-aligned and avoids materialising a merged
# copy with one row per row of df.
renta.missing <- is.na(df$renta)
df$renta[renta.missing] <- nomprov.medians$med.income[
  match(df$nomprov[renta.missing], nomprov.medians$nomprov)
]
rm(nomprov.medians, renta.missing)

# Groups whose values are all missing have an NA median; use the global one.
df$renta[is.na(df$renta)] <- median(df$renta, na.rm = TRUE)

# Final order is by date. `nomprov` is kept as the tie-breaker to reproduce
# the row order of the previous two-step sort (nomprov first, then date).
df <- arrange(df, fecha_dato, nomprov)
# Print out the unique values of every categorical column.
# Factor columns are included as well as character ones: iris stores Species
# as a factor, so a character-only test selects nothing and the loop never
# runs.
char.cols <- names(iris)[
  vapply(iris, function(x) is.character(x) || is.factor(x), logical(1))
]
for (name in char.cols) {
  print(sprintf("Unique values for %s:", name))
  print(unique(iris[[name]]))
  cat("\n")
}
# Convert to binary, then apply below.
head(diamonds)

# Encode `color` and `cut` with a full-rank (no redundant column) design.
# NOTE(review): `drop2nd` does not look like a documented dummyVars()
# argument and may be silently ignored -- confirm before relying on it.
a <- predict(
  dummyVars(~ color + cut, data = diamonds, fullRank = TRUE, drop2nd = TRUE),
  newdata = diamonds
)
# Binarise: 1 where the encoded value is positive, 0 otherwise. The vectorised
# comparison keeps the matrix shape and column names; looping over the matrix
# element by element would flatten it into one long unnamed vector.
head((a > 0) * 1)

# One indicator column per Species level.
dummies <- predict(dummyVars(~ Species, data = iris), newdata = iris)
head(dummies, n = 10)

# Encode every column of diamonds at once.
dummies <- predict(dummyVars(~ ., data = diamonds), newdata = diamonds)
head(dummies, n = 10)
# A: number of rows in df for each (month, feature) pair, stored in `counts`.
totals.by.feature <- summarise(group_by(df, month, feature), counts = n())
# Source: https://www.kaggle.com/apryor6/santander-product-recommendation/detailed-cleaning-visualization
# Share of each status within every (month, feature) pair: the per-status row
# count divided by the pair's total from totals.by.feature. After the join the
# two `counts` columns are suffixed .x (per status) and .y (pair total).
status.share <- df %>%
  group_by(month, feature, status) %>%
  summarise(counts = n()) %>%
  ungroup() %>%
  inner_join(totals.by.feature, by = c("month", "feature")) %>%
  mutate(counts = counts.x / counts.y)

# Horizontal bars, one facet per feature. Month levels are reversed so that,
# after coord_flip(), January ends up at the top.
ggplot(
  status.share,
  aes(x = factor(month.abb[month], levels = rev(month.abb)), y = counts)
) +
  geom_bar(aes(fill = status), stat = "identity") +
  facet_wrap(facets = ~feature, ncol = 6) +
  coord_flip() +
  my_theme_dark +
  # Tweaks must come after my_theme_dark so they override it.
  theme(
    axis.text = element_text(size = 10),
    legend.text = element_text(size = 14),
    legend.title = element_blank(),
    strip.text = element_text(face = "bold")
  ) +
  scale_fill_manual(values = c("cyan", "magenta")) +
  labs(x = "", y = "Count", title = "Relative Service \nChanges by Month") +
  ylim(0, 1)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment