Created
March 26, 2015 21:54
-
-
Save motus/cabe36187c4b7fa751ea to your computer and use it in GitHub Desktop.
Winning solution for https://inclass.kaggle.com/c/competition-1-mipt-fivt-ml-spring-2015
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env Rscript | |
# library(cluster) | |
# library(Boruta) | |
# library(ggplot2) | |
clean <- function(dd) return (within(dd, { | |
AWKSTAT <- NULL | |
WKSWORK <- NULL | |
levels(ARACE) <- c(levels(ARACE), "Latino") | |
ARACE[AREORGN != "Allother" & AREORGN != "Donotknow"] <- "Latino" | |
})) | |
removeFields <- function(dd, fields) { | |
for (f in fields) { | |
dd[,f] <- NULL | |
} | |
return (dd) | |
} | |
na.strings <- NA | |
# na.strings <- c("Not_in_universe", "Not_in_universeunder1yearold") | |
############################################################### | |
train <- clean(read.csv("train.csv.bz2", na.strings=na.strings)) | |
test <- clean(read.csv("test.csv.bz2", na.strings=na.strings)) | |
############################################################### | |
states <- read.csv("AmericanHumanDevelopmentProject_States2013-14.csv.bz2") | |
states <- states[states$Year == "2010" & !is.na(states$Year),] | |
states$Year <- NULL | |
for (i in 5:ncol(states)) { | |
states[,i] <- as.numeric(gsub("[, ]", "", as.character(states[,i]))) | |
} | |
names(states)[names(states) == "Median.Earnings..2010.dollars."] <- "MedianEarnings" | |
# statesDataFields <- names(states)[4:ncol(states)] | |
states$State <- as.factor(gsub(" ", "", as.character(states$State))) | |
# ------------------------------------------------------------- | |
# Cluster US states: | |
# | |
# allStates <- states[states$Gender == "ALL" & states$Race.Ethnicity == "ALL",] | |
# | |
# allStates$group <- cutree(hclust( | |
# daisy(allStates[,5:ncol(allStates)], metric="euclidean"), | |
# method="complete"), k=30) | |
# | |
# extraStateFactors <- c("Abroad", "Not_in_universe") | |
# allStates$State <- factor( | |
# allStates$State, levels=c(levels(allStates$State), extraStateFactors)) | |
# | |
# allStates <- rbind( | |
# allStates[,c("State", "group")], | |
# data.frame(State=extraStateFactors, | |
# group=factor(extraStateFactors))) | |
# | |
# allStates$group <- factor(allStates$group) | |
# | |
# ------------------------------------------------------------- | |
states$Sex <- factor(NA, levels=c("Female", "Male")) | |
states$Sex[states$Gender == "WOMEN"] <- "Female" | |
states$Sex[states$Gender == "MEN"] <- "Male" | |
states$Race <- factor("Other", levels=c("ALL", levels(train$ARACE))) | |
states$Race[states$Race.Ethnicity == "WHITE"] <- "White" | |
states$Race[states$Race.Ethnicity == "AFRICAN AMERICAN"] <- "Black" | |
states$Race[states$Race.Ethnicity == "ASIAN AMERICAN"] <- "AsianorPacificIslander" | |
states$Race[states$Race.Ethnicity == "NATIVE AMERICAN"] <- "AmerIndianAleutorEskimo" | |
states$Race[states$Race.Ethnicity == "LATINO"] <- "Latino" | |
states$Race[states$Race.Ethnicity == "ALL"] <- "ALL" | |
# Pick fields to augment the input data set: | |
# statesDataFields <- names(states)[sapply( | |
# states[!is.na(states$MedianEarnings),], | |
# function(col) is.numeric(col) && all(!is.na(col)))] | |
statesDataFields <- c( | |
"MedianEarnings", "HD.Index", "Health.Index", "Income.Index", "Education.Index") | |
states <- states[,c("Sex", "Race", "State", statesDataFields)] | |
############################################################### | |
mergeStatePriors <- function(dd) { | |
# dd <- train | |
fields <- statesDataFields | |
fields.y <- as.vector(sapply(fields, function(s) paste(sep=".", s, "y"))) | |
# Assign average US numbers to every row: | |
dd[,fields] <- states[states$State == "UNITEDSTATES", fields][1,] | |
# Assign averages by state only: | |
dd <- merge( | |
dd, states[is.na(states$Sex) & states$Race == "ALL", 3:ncol(states)], | |
by.x="GRINST", by.y="State", all.x=T, all.y=F, suffixes=c("", ".y")) | |
dd[!is.na(dd$MedianEarnings.y), fields] <- dd[!is.na(dd$MedianEarnings.y), fields.y] | |
dd <- removeFields(dd, fields.y) | |
# Assign state + gender: | |
dd <- merge( | |
dd, states[!is.na(states$Sex) & states$Race == "ALL", c(1, 3:ncol(states))], | |
by.x=c("ASEX", "GRINST"), by.y=c("Sex", "State"), | |
all.x=T, all.y=F, suffixes=c("", ".y")) | |
dd[!is.na(dd$MedianEarnings.y), fields] <- dd[!is.na(dd$MedianEarnings.y), fields.y] | |
dd <- removeFields(dd, fields.y) | |
# Assign state + gender + race: | |
dd <- merge( | |
dd, states[!is.na(states$Sex) & states$Race != "ALL",], | |
by.x=c("ASEX", "ARACE", "GRINST"), by.y=c("Sex", "Race", "State"), | |
all.x=T, all.y=F, suffixes=c("", ".y")) | |
dd[!is.na(dd$MedianEarnings.y), fields] <- dd[!is.na(dd$MedianEarnings.y), fields.y] | |
dd <- removeFields(dd, fields.y) | |
return (dd) | |
} | |
############################################################### | |
printLargeFactors <- function(dd, count=30) { | |
for (n in names(dd)) { | |
if (is.factor(dd[,n]) && length(levels(dd[,n])) >= count) | |
print(c(n, length(levels(dd[,n])))) | |
} | |
} | |
printDegenerateFactors <- function(dd) { | |
for (n in names(dd)) { | |
if (is.factor(dd[,n]) && length(levels(dd[,n])) <= 2) | |
print(c(n, length(levels(dd[,n])))) | |
} | |
} | |
# state: GRINST | |
# country: PEFNTVTY, PEMNTVTY, PENATVTY | |
# countries <- data.frame( | |
# country=union(levels(train$PEFNTVTY), | |
# union(levels(train$PEMNTVTY), levels(train$PENATVTY))), | |
# group=NA) | |
# write.csv(countries, "countries.csv") | |
# stateGroups <- data.frame(state=levels(train$GRINST), group=NA) | |
# write.csv(stateGroups, "states.csv") | |
# educationLevels <- data.frame(state=levels(train$AHGA), group=NA) | |
# write.csv(educationLevels, "education.csv") | |
mergeGazetteer <- function(dd, gazetteer, fields, field.y) { | |
for (f in fields) { | |
dd <- merge(dd, gazetteer, by.x=f, by.y=field.y, all.x=T, all.y=F) | |
dd[,paste(sep=".", f, "group")] <- dd$group | |
dd[,"group"] <- NULL | |
dd[,field.y] <- NULL | |
dd[,f] <- NULL | |
} | |
return (dd) | |
} | |
############################################################### | |
countries <- read.csv("countries.csv") | |
stateGroups <- read.csv("states.csv") | |
# educationLevels <- read.csv("education.csv") | |
countryFields <- c("PEFNTVTY", "PEMNTVTY", "PENATVTY") | |
train <- mergeStatePriors(train) | |
test <- mergeStatePriors(test) | |
train <- mergeGazetteer(train, countries, countryFields, "country") | |
test <- mergeGazetteer(test, countries, countryFields, "country") | |
train <- mergeGazetteer(train, stateGroups, "GRINST", "state") | |
test <- mergeGazetteer(test, stateGroups, "GRINST", "state") | |
# train <- mergeGazetteer(train, educationLevels, "AHGA", "education") | |
# test <- mergeGazetteer(test, educationLevels, "AHGA", "education") | |
# train <- mergeGazetteer(train, allStates, "GRINST", "State") | |
# test <- mergeGazetteer(test, allStates, "GRINST", "State") | |
for (f in intersect(names(test), names(train))) { | |
if (is.factor(train[,f])) { | |
lev <- union(levels(train[,f]), levels(test[,f])) | |
levels(train[,f]) <- lev | |
levels(test[,f]) <- lev | |
} | |
} | |
############################################################### | |
# Takes > 2 hours: | |
# features <- Boruta(CLASS ~ ., train) | |
# | |
# importantFeatures <- data.frame( | |
# feature=labels(tail(features$ImpHistory, 1)), | |
# weight=c(tail(features$ImpHistory, 1)))[,2:3] | |
# | |
# names(importantFeatures) <- c("feature", "weight") | |
# | |
# importantFeatures$feature <- factor( | |
# importantFeatures$feature, | |
# levels=importantFeatures$feature[order(importantFeatures$weight, decreasing=T)]) | |
# | |
# ggplot(importantFeatures[importantFeatures$weight >= 10,], | |
# aes(x=feature, y=weight)) + | |
# geom_bar(stat="identity", position="dodge") | |
# | |
# selectFeatures <- function(features, weight) { | |
# return (as.vector(features$feature[features$weight >= weight])) | |
# } |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env Rscript | |
set.seed(987654) | |
source("loadData.R") | |
library(cvTools) | |
library(randomForest) | |
# train <- train[,c("CLASS", selectFeatures(importantFeatures, 20))] | |
# test <- test[,c("Id", selectFeatures(importantFeatures, 20))] | |
############################################################### | |
# pcaFields <- names(train)[sapply( | |
# train[!is.na(train$MedianEarnings),], | |
# function(col) is.numeric(col) && all(!is.na(col)))] | |
# | |
# pca <- prcomp(rbind(train[,pcaFields], test[,pcaFields]), | |
# center=T, scale.=T) | |
# | |
# pcaPick <- 10 | |
# nTrain <- nrow(train) | |
# | |
# train[,colnames(pca$x)[1:pcaPick]] <- pca$x[1:nTrain, 1:pcaPick] | |
# test[,colnames(pca$x)[1:pcaPick]] <- pca$x[(nTrain + 1):nrow(pca$x), 1:pcaPick] | |
# train <- removeFields(train, pcaFields) | |
# test <- removeFields(test, pcaFields) | |
############################################################### | |
ntreeRange <- 600 | |
mtryRange <- 7 | |
# ntreeRange <- c(50, 100, 400, 600, 1000) | |
# mtryRange <- c(2, 4, 5, 6, 7, 8, 10, 12) | |
# Best results so far: | |
best_error <- Inf | |
best_ntree <- 600 | |
best_mtry <- 7 | |
nFolds <- 10 | |
folds <- cvFolds(nrow(train), nFolds, 1, "random") | |
for (ntree in ntreeRange) { | |
for (mtry in mtryRange) { | |
error <- 0 | |
for (i in 1:nFolds) { | |
trainFold <- train[folds$subsets[folds$which != i],] | |
testFold <- train[folds$subsets[folds$which == i],] | |
fit <- randomForest(CLASS ~ ., trainFold, ntree=ntree, mtry=mtry) | |
res <- predict(fit, testFold) | |
foldErr <- sum(testFold$CLASS != res) / length(res) | |
error <- error + foldErr | |
# cat("\nFold", i, "error", foldErr) | |
} | |
error <- error / nFolds | |
cat("\nntree =", ntree, "mtry =", mtry, "XV error =", error) | |
if (error < best_error) { | |
best_error <- error | |
best_ntree <- ntree | |
best_mtry <- mtry | |
} | |
} | |
} | |
fit <- randomForest(CLASS ~ ., train, ntree=best_ntree, mtry=best_mtry) | |
res <- predict(fit, train) | |
cat("\nRandom forest:\n best mean", nFolds, "fold XV error =", best_error, | |
"\n mean training error =", sum(train$CLASS != res) / nrow(train), | |
"\n ntree =", best_ntree, "mtry =", best_mtry) | |
############################################################### | |
res <- predict(fit, test) | |
test$Prediction <- as.numeric(res == "50000+") | |
test$res <- res | |
write.csv(test[order(test$Id), c("Id", "Prediction")], | |
"answer.csv", quote=F, row.names=F) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment