Skip to content

Instantly share code, notes, and snippets.

@szilard
Last active August 29, 2015 14:20
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save szilard/9d75ad09918d338624ad to your computer and use it in GitHub Desktop.
Save szilard/9d75ad09918d338624ad to your computer and use it in GitHub Desktop.
Generate data for machine learning benchmark for Spark
## get the data
## One bzip2-compressed CSV per year is downloaded into the current directory
## and decompressed, leaving 2005.csv, 2006.csv and 2007.csv for the later steps.
for yr in 2005 2006 2007; do
  # Only decompress when the download succeeded, so a failed wget does not
  # trigger a second, misleading bunzip2 error on a missing archive.
  wget "http://stat-computing.org/dataexpo/2009/$yr.csv.bz2" && bunzip2 "$yr.csv.bz2"
done
## install R and data.table
## Needs write access to /etc/apt/sources.list.d and permission to install
## packages (typically run as root).
# register the CRAN apt repository and its signing key
echo "deb http://cran.rstudio.com/bin/linux/ubuntu trusty/" > /etc/apt/sources.list.d/r.list
apt-key adv --keyserver keyserver.ubuntu.com --recv-keys E084DAB9
apt-get update
# -y answers the confirmation prompt; without it apt-get waits for input and,
# when this script is pasted or piped, reads the following lines as the answer.
apt-get install -y r-base-dev libcurl4-openssl-dev
# install the data.table package used by every R step below
R --vanilla << EOF
install.packages(c("data.table"), repos="http://cran.rstudio.com")
EOF
## generate standard format dataset
## Reads 2005.csv, 2006.csv and 2007.csv from the current directory and writes
## train-1m.csv (sampled from 2005+2006) and test.csv (sampled from 2007).
time R --vanilla --quiet << EOF
library(data.table)
set.seed(123)

calendar_cols <- c("Month","DayofMonth","DayOfWeek")
keep_cols <- c("Month", "DayofMonth", "DayOfWeek", "DepTime", "UniqueCarrier",
  "Origin", "Dest", "Distance","dep_delayed_15min")

# Shared clean-up for the training and the test table: drop rows without a
# departure delay, store the calendar fields as strings, add the Y/N label
# (Y when DepDelay >= 15) and keep only the benchmark columns.
prepare <- function(flights) {
  flights <- flights[!is.na(DepDelay)]
  for (col in calendar_cols) {
    flights[[col]] <- as.character(flights[[col]])
  }
  flights[["dep_delayed_15min"]] <- ifelse(flights[["DepDelay"]]>=15,"Y","N")
  flights[, keep_cols, with = FALSE]
}

# 2005 and 2006 are stacked to form the training pool; 2007 is the test pool
train_pool <- prepare(rbind(fread("2005.csv"), fread("2006.csv")))
test_pool <- prepare(fread("2007.csv"))

# training sample(s): the file name carries the row count in millions
for (n_rows in c(1e6)) {
  picked <- sample(nrow(train_pool), n_rows)
  write.table(train_pool[picked,], file = paste0("train-",n_rows/1e6,"m.csv"), row.names = FALSE, sep = ",")
}

# test sample: drawn after the training sample, so the seed above fixes both
picked <- sample(nrow(test_pool), 1e5)
write.table(test_pool[picked,], file = "test.csv", row.names = FALSE, sep = ",")
EOF
## generate 1-hot encoded dataset
## For each SIZE, reads train-<SIZE>m.csv and test.csv and writes
## train-1hot-<SIZE>m.csv and test-1hot-<SIZE>m.csv (no header; first column
## is the 0/1 label, the remaining columns are the model.matrix design matrix).
for SIZE in 1; do
time R --vanilla --quiet << EOF
library(data.table)
d1 <- fread("train-${SIZE}m.csv")
d2 <- fread("test.csv")
# train and test are encoded together so both files share the same columns
d <- rbind(d1,d2)
# fread infers column types from the file contents, so these digit-only
# columns may come back as integers; model.matrix would then keep each as a
# single numeric column instead of expanding it into indicator columns.
for (k in c("Month","DayofMonth","DayOfWeek")) {
d[[k]] <- as.character(d[[k]])
}
X <- model.matrix(dep_delayed_15min ~ ., d)
# model.matrix drops rows with missing values under the default na.action;
# cbind below would then pair labels with the wrong feature rows.
stopifnot(nrow(X) == nrow(d))
y <- ifelse(d[["dep_delayed_15min"]]=="Y",1,0)
dd <- cbind(y,X)
# split back into the original train / test row ranges
dd1 <- dd[1:nrow(d1),]
dd2 <- dd[(nrow(d1)+1):(nrow(d1)+nrow(d2)),]
write.table(dd1, "train-1hot-${SIZE}m.csv", row.names=FALSE, col.names=FALSE, sep=",")
write.table(dd2, "test-1hot-${SIZE}m.csv", row.names=FALSE, col.names=FALSE, sep=",")
EOF
done
## generate integer-encoded categoricals
## For each SIZE, reads train-<SIZE>m.csv and test.csv and writes
## train-intcateg-<SIZE>m.csv and test-intcateg-<SIZE>m.csv (no header).
for SIZE in 1; do
time R --vanilla --quiet << EOF
library(data.table)
train <- as.data.frame(fread("train-${SIZE}m.csv"))
test <- as.data.frame(fread("test.csv"))
n_train <- nrow(train)
n_test <- nrow(test)
# train and test are encoded together so both files share the same codes
all_rows <- rbind(train, test)

categ_cols <- c("Month","DayofMonth","DayOfWeek","UniqueCarrier","Origin","Dest")

# replace every categorical column by zero-based factor codes
all_rows[categ_cols] <- lapply(all_rows[categ_cols], function(v) as.numeric(as.factor(v)) - 1)
# label becomes 1 for "Y" and 0 otherwise
all_rows[["dep_delayed_15min"]] <- ifelse(all_rows[["dep_delayed_15min"]]=="Y",1,0)

# summary string: "<zero-based column position> -> <number of distinct values>, "
# for each categorical column, printed so it can be copied from the R output
col_pos <- match(categ_cols, names(all_rows)) - 1
n_distinct <- vapply(all_rows[categ_cols], function(v) length(unique(v)), integer(1))
s <- paste0(col_pos, " -> ", n_distinct, ", ", collapse = "")
s
## 0 -> 12, 1 -> 31, 2 -> 7, 4 -> 23, 5 -> 307, 6 -> 308

# split back into the original train / test row ranges
train_rows <- all_rows[1:n_train,]
test_rows <- all_rows[(n_train+1):(n_train+n_test),]
write.table(train_rows, "train-intcateg-${SIZE}m.csv", row.names=FALSE, col.names=FALSE, sep=",")
write.table(test_rows, "test-intcateg-${SIZE}m.csv", row.names=FALSE, col.names=FALSE, sep=",")
EOF
done
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment