Skip to content

Instantly share code, notes, and snippets.

@motus
Created March 26, 2015 21:54
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save motus/cabe36187c4b7fa751ea to your computer and use it in GitHub Desktop.
Save motus/cabe36187c4b7fa751ea to your computer and use it in GitHub Desktop.
#!/usr/bin/env Rscript
# library(cluster)
# library(Boruta)
# library(ggplot2)
clean <- function(dd) return (within(dd, {
AWKSTAT <- NULL
WKSWORK <- NULL
levels(ARACE) <- c(levels(ARACE), "Latino")
ARACE[AREORGN != "Allother" & AREORGN != "Donotknow"] <- "Latino"
}))
removeFields <- function(dd, fields) {
for (f in fields) {
dd[,f] <- NULL
}
return (dd)
}
na.strings <- NA
# na.strings <- c("Not_in_universe", "Not_in_universeunder1yearold")
###############################################################
train <- clean(read.csv("train.csv.bz2", na.strings=na.strings))
test <- clean(read.csv("test.csv.bz2", na.strings=na.strings))
###############################################################
states <- read.csv("AmericanHumanDevelopmentProject_States2013-14.csv.bz2")
states <- states[states$Year == "2010" & !is.na(states$Year),]
states$Year <- NULL
for (i in 5:ncol(states)) {
states[,i] <- as.numeric(gsub("[, ]", "", as.character(states[,i])))
}
names(states)[names(states) == "Median.Earnings..2010.dollars."] <- "MedianEarnings"
# statesDataFields <- names(states)[4:ncol(states)]
states$State <- as.factor(gsub(" ", "", as.character(states$State)))
# -------------------------------------------------------------
# Cluster US states:
#
# allStates <- states[states$Gender == "ALL" & states$Race.Ethnicity == "ALL",]
#
# allStates$group <- cutree(hclust(
# daisy(allStates[,5:ncol(allStates)], metric="euclidean"),
# method="complete"), k=30)
#
# extraStateFactors <- c("Abroad", "Not_in_universe")
# allStates$State <- factor(
# allStates$State, levels=c(levels(allStates$State), extraStateFactors))
#
# allStates <- rbind(
# allStates[,c("State", "group")],
# data.frame(State=extraStateFactors,
# group=factor(extraStateFactors)))
#
# allStates$group <- factor(allStates$group)
#
# -------------------------------------------------------------
states$Sex <- factor(NA, levels=c("Female", "Male"))
states$Sex[states$Gender == "WOMEN"] <- "Female"
states$Sex[states$Gender == "MEN"] <- "Male"
states$Race <- factor("Other", levels=c("ALL", levels(train$ARACE)))
states$Race[states$Race.Ethnicity == "WHITE"] <- "White"
states$Race[states$Race.Ethnicity == "AFRICAN AMERICAN"] <- "Black"
states$Race[states$Race.Ethnicity == "ASIAN AMERICAN"] <- "AsianorPacificIslander"
states$Race[states$Race.Ethnicity == "NATIVE AMERICAN"] <- "AmerIndianAleutorEskimo"
states$Race[states$Race.Ethnicity == "LATINO"] <- "Latino"
states$Race[states$Race.Ethnicity == "ALL"] <- "ALL"
# Pick fields to augment the input data set:
# statesDataFields <- names(states)[sapply(
# states[!is.na(states$MedianEarnings),],
# function(col) is.numeric(col) && all(!is.na(col)))]
statesDataFields <- c(
"MedianEarnings", "HD.Index", "Health.Index", "Income.Index", "Education.Index")
states <- states[,c("Sex", "Race", "State", statesDataFields)]
###############################################################
mergeStatePriors <- function(dd) {
# dd <- train
fields <- statesDataFields
fields.y <- as.vector(sapply(fields, function(s) paste(sep=".", s, "y")))
# Assign average US numbers to every row:
dd[,fields] <- states[states$State == "UNITEDSTATES", fields][1,]
# Assign averages by state only:
dd <- merge(
dd, states[is.na(states$Sex) & states$Race == "ALL", 3:ncol(states)],
by.x="GRINST", by.y="State", all.x=T, all.y=F, suffixes=c("", ".y"))
dd[!is.na(dd$MedianEarnings.y), fields] <- dd[!is.na(dd$MedianEarnings.y), fields.y]
dd <- removeFields(dd, fields.y)
# Assign state + gender:
dd <- merge(
dd, states[!is.na(states$Sex) & states$Race == "ALL", c(1, 3:ncol(states))],
by.x=c("ASEX", "GRINST"), by.y=c("Sex", "State"),
all.x=T, all.y=F, suffixes=c("", ".y"))
dd[!is.na(dd$MedianEarnings.y), fields] <- dd[!is.na(dd$MedianEarnings.y), fields.y]
dd <- removeFields(dd, fields.y)
# Assign state + gender + race:
dd <- merge(
dd, states[!is.na(states$Sex) & states$Race != "ALL",],
by.x=c("ASEX", "ARACE", "GRINST"), by.y=c("Sex", "Race", "State"),
all.x=T, all.y=F, suffixes=c("", ".y"))
dd[!is.na(dd$MedianEarnings.y), fields] <- dd[!is.na(dd$MedianEarnings.y), fields.y]
dd <- removeFields(dd, fields.y)
return (dd)
}
###############################################################
printLargeFactors <- function(dd, count=30) {
for (n in names(dd)) {
if (is.factor(dd[,n]) && length(levels(dd[,n])) >= count)
print(c(n, length(levels(dd[,n]))))
}
}
printDegenerateFactors <- function(dd) {
for (n in names(dd)) {
if (is.factor(dd[,n]) && length(levels(dd[,n])) <= 2)
print(c(n, length(levels(dd[,n]))))
}
}
# state: GRINST
# country: PEFNTVTY, PEMNTVTY, PENATVTY
# countries <- data.frame(
# country=union(levels(train$PEFNTVTY),
# union(levels(train$PEMNTVTY), levels(train$PENATVTY))),
# group=NA)
# write.csv(countries, "countries.csv")
# stateGroups <- data.frame(state=levels(train$GRINST), group=NA)
# write.csv(stateGroups, "states.csv")
# educationLevels <- data.frame(state=levels(train$AHGA), group=NA)
# write.csv(educationLevels, "education.csv")
mergeGazetteer <- function(dd, gazetteer, fields, field.y) {
for (f in fields) {
dd <- merge(dd, gazetteer, by.x=f, by.y=field.y, all.x=T, all.y=F)
dd[,paste(sep=".", f, "group")] <- dd$group
dd[,"group"] <- NULL
dd[,field.y] <- NULL
dd[,f] <- NULL
}
return (dd)
}
###############################################################
countries <- read.csv("countries.csv")
stateGroups <- read.csv("states.csv")
# educationLevels <- read.csv("education.csv")
countryFields <- c("PEFNTVTY", "PEMNTVTY", "PENATVTY")
train <- mergeStatePriors(train)
test <- mergeStatePriors(test)
train <- mergeGazetteer(train, countries, countryFields, "country")
test <- mergeGazetteer(test, countries, countryFields, "country")
train <- mergeGazetteer(train, stateGroups, "GRINST", "state")
test <- mergeGazetteer(test, stateGroups, "GRINST", "state")
# train <- mergeGazetteer(train, educationLevels, "AHGA", "education")
# test <- mergeGazetteer(test, educationLevels, "AHGA", "education")
# train <- mergeGazetteer(train, allStates, "GRINST", "State")
# test <- mergeGazetteer(test, allStates, "GRINST", "State")
for (f in intersect(names(test), names(train))) {
if (is.factor(train[,f])) {
lev <- union(levels(train[,f]), levels(test[,f]))
levels(train[,f]) <- lev
levels(test[,f]) <- lev
}
}
###############################################################
# Takes > 2 hours:
# features <- Boruta(CLASS ~ ., train)
#
# importantFeatures <- data.frame(
# feature=labels(tail(features$ImpHistory, 1)),
# weight=c(tail(features$ImpHistory, 1)))[,2:3]
#
# names(importantFeatures) <- c("feature", "weight")
#
# importantFeatures$feature <- factor(
# importantFeatures$feature,
# levels=importantFeatures$feature[order(importantFeatures$weight, decreasing=T)])
#
# ggplot(importantFeatures[importantFeatures$weight >= 10,],
# aes(x=feature, y=weight)) +
# geom_bar(stat="identity", position="dodge")
#
# selectFeatures <- function(features, weight) {
# return (as.vector(features$feature[features$weight >= weight]))
# }
#!/usr/bin/env Rscript
set.seed(987654)
source("loadData.R")
library(cvTools)
library(randomForest)
# train <- train[,c("CLASS", selectFeatures(importantFeatures, 20))]
# test <- test[,c("Id", selectFeatures(importantFeatures, 20))]
###############################################################
# pcaFields <- names(train)[sapply(
# train[!is.na(train$MedianEarnings),],
# function(col) is.numeric(col) && all(!is.na(col)))]
#
# pca <- prcomp(rbind(train[,pcaFields], test[,pcaFields]),
# center=T, scale.=T)
#
# pcaPick <- 10
# nTrain <- nrow(train)
#
# train[,colnames(pca$x)[1:pcaPick]] <- pca$x[1:nTrain, 1:pcaPick]
# test[,colnames(pca$x)[1:pcaPick]] <- pca$x[(nTrain + 1):nrow(pca$x), 1:pcaPick]
# train <- removeFields(train, pcaFields)
# test <- removeFields(test, pcaFields)
###############################################################
ntreeRange <- 600
mtryRange <- 7
# ntreeRange <- c(50, 100, 400, 600, 1000)
# mtryRange <- c(2, 4, 5, 6, 7, 8, 10, 12)
# Best results so far:
best_error <- Inf
best_ntree <- 600
best_mtry <- 7
nFolds <- 10
folds <- cvFolds(nrow(train), nFolds, 1, "random")
for (ntree in ntreeRange) {
for (mtry in mtryRange) {
error <- 0
for (i in 1:nFolds) {
trainFold <- train[folds$subsets[folds$which != i],]
testFold <- train[folds$subsets[folds$which == i],]
fit <- randomForest(CLASS ~ ., trainFold, ntree=ntree, mtry=mtry)
res <- predict(fit, testFold)
foldErr <- sum(testFold$CLASS != res) / length(res)
error <- error + foldErr
# cat("\nFold", i, "error", foldErr)
}
error <- error / nFolds
cat("\nntree =", ntree, "mtry =", mtry, "XV error =", error)
if (error < best_error) {
best_error <- error
best_ntree <- ntree
best_mtry <- mtry
}
}
}
fit <- randomForest(CLASS ~ ., train, ntree=best_ntree, mtry=best_mtry)
res <- predict(fit, train)
cat("\nRandom forest:\n best mean", nFolds, "fold XV error =", best_error,
"\n mean training error =", sum(train$CLASS != res) / nrow(train),
"\n ntree =", best_ntree, "mtry =", best_mtry)
###############################################################
res <- predict(fit, test)
test$Prediction <- as.numeric(res == "50000+")
test$res <- res
write.csv(test[order(test$Id), c("Id", "Prediction")],
"answer.csv", quote=F, row.names=F)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment