motus/loadData.R

## loadData.R
#!/usr/bin/env Rscript

# library(cluster)
# library(Boruta)
# library(ggplot2)

clean <- function(dd) return (within(dd, {
    AWKSTAT <- NULL
    WKSWORK <- NULL
    levels(ARACE) <- c(levels(ARACE), "Latino")
    ARACE[AREORGN != "Allother" & AREORGN != "Donotknow"] <- "Latino"
}))

removeFields <- function(dd, fields) {
  for (f in fields) {
    dd[,f] <- NULL
  }
  return (dd)
}

na.strings <- NA
# na.strings <- c("Not_in_universe", "Not_in_universeunder1yearold")

###############################################################

train <- clean(read.csv("train.csv.bz2", na.strings=na.strings))
test <- clean(read.csv("test.csv.bz2", na.strings=na.strings))

###############################################################

states <- read.csv("AmericanHumanDevelopmentProject_States2013-14.csv.bz2")
states <- states[states$Year == "2010" & !is.na(states$Year),]
states$Year <- NULL

for (i in 5:ncol(states)) {
  states[,i] <- as.numeric(gsub("[, ]", "", as.character(states[,i])))
}

names(states)[names(states) == "Median.Earnings..2010.dollars."] <- "MedianEarnings"

# statesDataFields <- names(states)[4:ncol(states)]

states$State <- as.factor(gsub(" ", "", as.character(states$State)))

# -------------------------------------------------------------
# Cluster US states:
#
# allStates <- states[states$Gender == "ALL" & states$Race.Ethnicity == "ALL",]
#
# allStates$group <- cutree(hclust(
#   daisy(allStates[,5:ncol(allStates)], metric="euclidean"),
#   method="complete"), k=30)
#
# extraStateFactors <- c("Abroad", "Not_in_universe")
# allStates$State <- factor(
#   allStates$State, levels=c(levels(allStates$State), extraStateFactors))
#
# allStates <- rbind(
#   allStates[,c("State", "group")],
#   data.frame(State=extraStateFactors,
#              group=factor(extraStateFactors)))
#
# allStates$group <- factor(allStates$group)
#
# -------------------------------------------------------------

states$Sex <- factor(NA, levels=c("Female", "Male"))
states$Sex[states$Gender == "WOMEN"] <- "Female"
states$Sex[states$Gender == "MEN"] <- "Male"

states$Race <- factor("Other", levels=c("ALL", levels(train$ARACE)))
states$Race[states$Race.Ethnicity == "WHITE"] <- "White"
states$Race[states$Race.Ethnicity == "AFRICAN AMERICAN"] <- "Black"
states$Race[states$Race.Ethnicity == "ASIAN AMERICAN"] <- "AsianorPacificIslander"
states$Race[states$Race.Ethnicity == "NATIVE AMERICAN"] <- "AmerIndianAleutorEskimo"
states$Race[states$Race.Ethnicity == "LATINO"] <- "Latino"
states$Race[states$Race.Ethnicity == "ALL"] <- "ALL"

# Pick fields to augment the input data set:

# statesDataFields <- names(states)[sapply(
#   states[!is.na(states$MedianEarnings),],
#   function(col) is.numeric(col) && all(!is.na(col)))]

statesDataFields <- c(
  "MedianEarnings", "HD.Index", "Health.Index", "Income.Index", "Education.Index")

states <- states[,c("Sex", "Race", "State", statesDataFields)]

###############################################################

mergeStatePriors <- function(dd) {

  # dd <- train
  fields <- statesDataFields
  fields.y <- as.vector(sapply(fields, function(s) paste(sep=".", s, "y")))

  # Assign average US numbers to every row:
  dd[,fields] <- states[states$State == "UNITEDSTATES", fields][1,]

  # Assign averages by state only:
  dd <- merge(
    dd, states[is.na(states$Sex) & states$Race == "ALL", 3:ncol(states)],
    by.x="GRINST", by.y="State", all.x=T, all.y=F, suffixes=c("", ".y"))

  dd[!is.na(dd$MedianEarnings.y), fields] <- dd[!is.na(dd$MedianEarnings.y), fields.y]
  dd <- removeFields(dd, fields.y)

  # Assign state + gender:
  dd <- merge(
    dd, states[!is.na(states$Sex) & states$Race == "ALL", c(1, 3:ncol(states))],
    by.x=c("ASEX", "GRINST"), by.y=c("Sex", "State"),
    all.x=T, all.y=F, suffixes=c("", ".y"))

  dd[!is.na(dd$MedianEarnings.y), fields] <- dd[!is.na(dd$MedianEarnings.y), fields.y]
  dd <- removeFields(dd, fields.y)

  # Assign state + gender + race:
  dd <- merge(
    dd, states[!is.na(states$Sex) & states$Race != "ALL",],
    by.x=c("ASEX", "ARACE", "GRINST"), by.y=c("Sex", "Race", "State"),
    all.x=T, all.y=F, suffixes=c("", ".y"))

  dd[!is.na(dd$MedianEarnings.y), fields] <- dd[!is.na(dd$MedianEarnings.y), fields.y]
  dd <- removeFields(dd, fields.y)

  return (dd)
}

###############################################################

printLargeFactors <- function(dd, count=30) {
  for (n in names(dd)) {
    if (is.factor(dd[,n]) && length(levels(dd[,n])) >= count)
    print(c(n, length(levels(dd[,n]))))
  }
}

printDegenerateFactors <- function(dd) {
  for (n in names(dd)) {
    if (is.factor(dd[,n]) && length(levels(dd[,n])) <= 2)
      print(c(n, length(levels(dd[,n]))))
  }
}

# state: GRINST
# country: PEFNTVTY, PEMNTVTY, PENATVTY

# countries <- data.frame(
#   country=union(levels(train$PEFNTVTY),
#       union(levels(train$PEMNTVTY), levels(train$PENATVTY))),
#   group=NA)

# write.csv(countries, "countries.csv")

# stateGroups <- data.frame(state=levels(train$GRINST), group=NA)
# write.csv(stateGroups, "states.csv")

# educationLevels <- data.frame(state=levels(train$AHGA), group=NA)
# write.csv(educationLevels, "education.csv")

mergeGazetteer <- function(dd, gazetteer, fields, field.y) {
  for (f in fields) {
    dd <- merge(dd, gazetteer, by.x=f, by.y=field.y, all.x=T, all.y=F)
    dd[,paste(sep=".", f, "group")] <- dd$group
    dd[,"group"] <- NULL
    dd[,field.y] <- NULL
    dd[,f] <- NULL
  }
  return (dd)
}

###############################################################

countries <- read.csv("countries.csv")
stateGroups <- read.csv("states.csv")
# educationLevels <- read.csv("education.csv")

countryFields <- c("PEFNTVTY", "PEMNTVTY", "PENATVTY")

train <- mergeStatePriors(train)
test <- mergeStatePriors(test)

train <- mergeGazetteer(train, countries, countryFields, "country")
test <- mergeGazetteer(test, countries, countryFields, "country")

train <- mergeGazetteer(train, stateGroups, "GRINST", "state")
test <- mergeGazetteer(test, stateGroups, "GRINST", "state")

# train <- mergeGazetteer(train, educationLevels, "AHGA", "education")
# test <- mergeGazetteer(test, educationLevels, "AHGA", "education")

# train <- mergeGazetteer(train, allStates, "GRINST", "State")
# test <- mergeGazetteer(test, allStates, "GRINST", "State")

for (f in intersect(names(test), names(train))) {
  if (is.factor(train[,f])) {
    lev <- union(levels(train[,f]), levels(test[,f]))
    levels(train[,f]) <- lev
    levels(test[,f]) <- lev
  }
}

###############################################################

# Takes > 2 hours:
# features <- Boruta(CLASS ~ ., train)
#
# importantFeatures <- data.frame(
#   feature=labels(tail(features$ImpHistory, 1)),
#   weight=c(tail(features$ImpHistory, 1)))[,2:3]
#
# names(importantFeatures) <- c("feature", "weight")
#
# importantFeatures$feature <- factor(
#   importantFeatures$feature,
#   levels=importantFeatures$feature[order(importantFeatures$weight, decreasing=T)])
#
# ggplot(importantFeatures[importantFeatures$weight >= 10,],
#        aes(x=feature, y=weight)) +
#   geom_bar(stat="identity", position="dodge")
#
# selectFeatures <- function(features, weight) {
#   return (as.vector(features$feature[features$weight >= weight]))
# }

## take2_rf.R
#!/usr/bin/env Rscript

set.seed(987654)

source("loadData.R")

library(cvTools)
library(randomForest)

# train <- train[,c("CLASS", selectFeatures(importantFeatures, 20))]
# test <- test[,c("Id", selectFeatures(importantFeatures, 20))]

###############################################################

# pcaFields <- names(train)[sapply(
#   train[!is.na(train$MedianEarnings),],
#   function(col) is.numeric(col) && all(!is.na(col)))]
#
# pca <- prcomp(rbind(train[,pcaFields], test[,pcaFields]),
#               center=T, scale.=T)
#
# pcaPick <- 10
# nTrain <- nrow(train)
#
# train[,colnames(pca$x)[1:pcaPick]] <- pca$x[1:nTrain, 1:pcaPick]
# test[,colnames(pca$x)[1:pcaPick]] <- pca$x[(nTrain + 1):nrow(pca$x), 1:pcaPick]
# train <- removeFields(train, pcaFields)
# test <- removeFields(test, pcaFields)

###############################################################

ntreeRange <- 600
mtryRange <- 7

# ntreeRange <- c(50, 100, 400, 600, 1000)
# mtryRange <- c(2, 4, 5, 6, 7, 8, 10, 12)

# Best results so far:
best_error <- Inf
best_ntree <- 600
best_mtry <- 7

nFolds <- 10
folds <- cvFolds(nrow(train), nFolds, 1, "random")

for (ntree in ntreeRange) {
  for (mtry in mtryRange) {

    error <- 0
    for (i in 1:nFolds) {

      trainFold <- train[folds$subsets[folds$which != i],]
      testFold <- train[folds$subsets[folds$which == i],]

      fit <- randomForest(CLASS ~ ., trainFold, ntree=ntree, mtry=mtry)
      res <- predict(fit, testFold)

      foldErr <- sum(testFold$CLASS != res) / length(res)
      error <- error + foldErr
      # cat("\nFold", i, "error", foldErr)
    }

    error <- error / nFolds
    cat("\nntree =", ntree, "mtry =", mtry, "XV error =",  error)

    if (error < best_error) {
      best_error <- error
      best_ntree <- ntree
      best_mtry <- mtry
    }
  }
}

fit <- randomForest(CLASS ~ ., train, ntree=best_ntree, mtry=best_mtry)
res <- predict(fit, train)
cat("\nRandom forest:\n  best mean", nFolds, "fold XV error =", best_error,
    "\n  mean training error =", sum(train$CLASS != res) / nrow(train),
    "\n  ntree =", best_ntree, "mtry =", best_mtry)

###############################################################

res <- predict(fit, test)
test$Prediction <- as.numeric(res == "50000+")
test$res <- res

write.csv(test[order(test$Id), c("Id", "Prediction")],
          "answer.csv", quote=F, row.names=F)
	#!/usr/bin/env Rscript

	# library(cluster)
	# library(Boruta)
	# library(ggplot2)

	clean <- function(dd) return (within(dd, {
	AWKSTAT <- NULL
	WKSWORK <- NULL
	levels(ARACE) <- c(levels(ARACE), "Latino")
	ARACE[AREORGN != "Allother" & AREORGN != "Donotknow"] <- "Latino"
	}))

	removeFields <- function(dd, fields) {
	for (f in fields) {
	dd[,f] <- NULL
	}
	return (dd)
	}

	na.strings <- NA
	# na.strings <- c("Not_in_universe", "Not_in_universeunder1yearold")

	###############################################################

	train <- clean(read.csv("train.csv.bz2", na.strings=na.strings))
	test <- clean(read.csv("test.csv.bz2", na.strings=na.strings))

	###############################################################

	states <- read.csv("AmericanHumanDevelopmentProject_States2013-14.csv.bz2")
	states <- states[states$Year == "2010" & !is.na(states$Year),]
	states$Year <- NULL

	for (i in 5:ncol(states)) {
	states[,i] <- as.numeric(gsub("[, ]", "", as.character(states[,i])))
	}

	names(states)[names(states) == "Median.Earnings..2010.dollars."] <- "MedianEarnings"

	# statesDataFields <- names(states)[4:ncol(states)]

	states$State <- as.factor(gsub(" ", "", as.character(states$State)))

	# -------------------------------------------------------------
	# Cluster US states:
	#
	# allStates <- states[states$Gender == "ALL" & states$Race.Ethnicity == "ALL",]
	#
	# allStates$group <- cutree(hclust(
	# daisy(allStates[,5:ncol(allStates)], metric="euclidean"),
	# method="complete"), k=30)
	#
	# extraStateFactors <- c("Abroad", "Not_in_universe")
	# allStates$State <- factor(
	# allStates$State, levels=c(levels(allStates$State), extraStateFactors))
	#
	# allStates <- rbind(
	# allStates[,c("State", "group")],
	# data.frame(State=extraStateFactors,
	# group=factor(extraStateFactors)))
	#
	# allStates$group <- factor(allStates$group)
	#
	# -------------------------------------------------------------

	states$Sex <- factor(NA, levels=c("Female", "Male"))
	states$Sex[states$Gender == "WOMEN"] <- "Female"
	states$Sex[states$Gender == "MEN"] <- "Male"

	states$Race <- factor("Other", levels=c("ALL", levels(train$ARACE)))
	states$Race[states$Race.Ethnicity == "WHITE"] <- "White"
	states$Race[states$Race.Ethnicity == "AFRICAN AMERICAN"] <- "Black"
	states$Race[states$Race.Ethnicity == "ASIAN AMERICAN"] <- "AsianorPacificIslander"
	states$Race[states$Race.Ethnicity == "NATIVE AMERICAN"] <- "AmerIndianAleutorEskimo"
	states$Race[states$Race.Ethnicity == "LATINO"] <- "Latino"
	states$Race[states$Race.Ethnicity == "ALL"] <- "ALL"

	# Pick fields to augment the input data set:

	# statesDataFields <- names(states)[sapply(
	# states[!is.na(states$MedianEarnings),],
	# function(col) is.numeric(col) && all(!is.na(col)))]

	statesDataFields <- c(
	"MedianEarnings", "HD.Index", "Health.Index", "Income.Index", "Education.Index")

	states <- states[,c("Sex", "Race", "State", statesDataFields)]

	###############################################################

	mergeStatePriors <- function(dd) {

	# dd <- train
	fields <- statesDataFields
	fields.y <- as.vector(sapply(fields, function(s) paste(sep=".", s, "y")))

	# Assign average US numbers to every row:
	dd[,fields] <- states[states$State == "UNITEDSTATES", fields][1,]

	# Assign averages by state only:
	dd <- merge(
	dd, states[is.na(states$Sex) & states$Race == "ALL", 3:ncol(states)],
	by.x="GRINST", by.y="State", all.x=T, all.y=F, suffixes=c("", ".y"))

	dd[!is.na(dd$MedianEarnings.y), fields] <- dd[!is.na(dd$MedianEarnings.y), fields.y]
	dd <- removeFields(dd, fields.y)

	# Assign state + gender:
	dd <- merge(
	dd, states[!is.na(states$Sex) & states$Race == "ALL", c(1, 3:ncol(states))],
	by.x=c("ASEX", "GRINST"), by.y=c("Sex", "State"),
	all.x=T, all.y=F, suffixes=c("", ".y"))

	dd[!is.na(dd$MedianEarnings.y), fields] <- dd[!is.na(dd$MedianEarnings.y), fields.y]
	dd <- removeFields(dd, fields.y)

	# Assign state + gender + race:
	dd <- merge(
	dd, states[!is.na(states$Sex) & states$Race != "ALL",],
	by.x=c("ASEX", "ARACE", "GRINST"), by.y=c("Sex", "Race", "State"),
	all.x=T, all.y=F, suffixes=c("", ".y"))

	dd[!is.na(dd$MedianEarnings.y), fields] <- dd[!is.na(dd$MedianEarnings.y), fields.y]
	dd <- removeFields(dd, fields.y)

	return (dd)
	}

	###############################################################

	printLargeFactors <- function(dd, count=30) {
	for (n in names(dd)) {
	if (is.factor(dd[,n]) && length(levels(dd[,n])) >= count)
	print(c(n, length(levels(dd[,n]))))
	}
	}

	printDegenerateFactors <- function(dd) {
	for (n in names(dd)) {
	if (is.factor(dd[,n]) && length(levels(dd[,n])) <= 2)
	print(c(n, length(levels(dd[,n]))))
	}
	}

	# state: GRINST
	# country: PEFNTVTY, PEMNTVTY, PENATVTY

	# countries <- data.frame(
	# country=union(levels(train$PEFNTVTY),
	# union(levels(train$PEMNTVTY), levels(train$PENATVTY))),
	# group=NA)

	# write.csv(countries, "countries.csv")

	# stateGroups <- data.frame(state=levels(train$GRINST), group=NA)
	# write.csv(stateGroups, "states.csv")

	# educationLevels <- data.frame(state=levels(train$AHGA), group=NA)
	# write.csv(educationLevels, "education.csv")

	mergeGazetteer <- function(dd, gazetteer, fields, field.y) {
	for (f in fields) {
	dd <- merge(dd, gazetteer, by.x=f, by.y=field.y, all.x=T, all.y=F)
	dd[,paste(sep=".", f, "group")] <- dd$group
	dd[,"group"] <- NULL
	dd[,field.y] <- NULL
	dd[,f] <- NULL
	}
	return (dd)
	}

	###############################################################

	countries <- read.csv("countries.csv")
	stateGroups <- read.csv("states.csv")
	# educationLevels <- read.csv("education.csv")

	countryFields <- c("PEFNTVTY", "PEMNTVTY", "PENATVTY")

	train <- mergeStatePriors(train)
	test <- mergeStatePriors(test)

	train <- mergeGazetteer(train, countries, countryFields, "country")
	test <- mergeGazetteer(test, countries, countryFields, "country")

	train <- mergeGazetteer(train, stateGroups, "GRINST", "state")
	test <- mergeGazetteer(test, stateGroups, "GRINST", "state")

	# train <- mergeGazetteer(train, educationLevels, "AHGA", "education")
	# test <- mergeGazetteer(test, educationLevels, "AHGA", "education")

	# train <- mergeGazetteer(train, allStates, "GRINST", "State")
	# test <- mergeGazetteer(test, allStates, "GRINST", "State")

	for (f in intersect(names(test), names(train))) {
	if (is.factor(train[,f])) {
	lev <- union(levels(train[,f]), levels(test[,f]))
	levels(train[,f]) <- lev
	levels(test[,f]) <- lev
	}
	}

	###############################################################

	# Takes > 2 hours:
	# features <- Boruta(CLASS ~ ., train)
	#
	# importantFeatures <- data.frame(
	# feature=labels(tail(features$ImpHistory, 1)),
	# weight=c(tail(features$ImpHistory, 1)))[,2:3]
	#
	# names(importantFeatures) <- c("feature", "weight")
	#
	# importantFeatures$feature <- factor(
	# importantFeatures$feature,
	# levels=importantFeatures$feature[order(importantFeatures$weight, decreasing=T)])
	#
	# ggplot(importantFeatures[importantFeatures$weight >= 10,],
	# aes(x=feature, y=weight)) +
	# geom_bar(stat="identity", position="dodge")
	#
	# selectFeatures <- function(features, weight) {
	# return (as.vector(features$feature[features$weight >= weight]))
	# }
	#!/usr/bin/env Rscript

	set.seed(987654)

	source("loadData.R")

	library(cvTools)
	library(randomForest)

	# train <- train[,c("CLASS", selectFeatures(importantFeatures, 20))]
	# test <- test[,c("Id", selectFeatures(importantFeatures, 20))]

	###############################################################

	# pcaFields <- names(train)[sapply(
	# train[!is.na(train$MedianEarnings),],
	# function(col) is.numeric(col) && all(!is.na(col)))]
	#
	# pca <- prcomp(rbind(train[,pcaFields], test[,pcaFields]),
	# center=T, scale.=T)
	#
	# pcaPick <- 10
	# nTrain <- nrow(train)
	#
	# train[,colnames(pca$x)[1:pcaPick]] <- pca$x[1:nTrain, 1:pcaPick]
	# test[,colnames(pca$x)[1:pcaPick]] <- pca$x[(nTrain + 1):nrow(pca$x), 1:pcaPick]
	# train <- removeFields(train, pcaFields)
	# test <- removeFields(test, pcaFields)

	###############################################################

	ntreeRange <- 600
	mtryRange <- 7

	# ntreeRange <- c(50, 100, 400, 600, 1000)
	# mtryRange <- c(2, 4, 5, 6, 7, 8, 10, 12)

	# Best results so far:
	best_error <- Inf
	best_ntree <- 600
	best_mtry <- 7

	nFolds <- 10
	folds <- cvFolds(nrow(train), nFolds, 1, "random")

	for (ntree in ntreeRange) {
	for (mtry in mtryRange) {

	error <- 0
	for (i in 1:nFolds) {

	trainFold <- train[folds$subsets[folds$which != i],]
	testFold <- train[folds$subsets[folds$which == i],]

	fit <- randomForest(CLASS ~ ., trainFold, ntree=ntree, mtry=mtry)
	res <- predict(fit, testFold)

	foldErr <- sum(testFold$CLASS != res) / length(res)
	error <- error + foldErr
	# cat("\nFold", i, "error", foldErr)
	}

	error <- error / nFolds
	cat("\nntree =", ntree, "mtry =", mtry, "XV error =", error)

	if (error < best_error) {
	best_error <- error
	best_ntree <- ntree
	best_mtry <- mtry
	}
	}
	}

	fit <- randomForest(CLASS ~ ., train, ntree=best_ntree, mtry=best_mtry)
	res <- predict(fit, train)
	cat("\nRandom forest:\n best mean", nFolds, "fold XV error =", best_error,
	"\n mean training error =", sum(train$CLASS != res) / nrow(train),
	"\n ntree =", best_ntree, "mtry =", best_mtry)

	###############################################################

	res <- predict(fit, test)
	test$Prediction <- as.numeric(res == "50000+")
	test$res <- res

	write.csv(test[order(test$Id), c("Id", "Prediction")],
	"answer.csv", quote=F, row.names=F)