Skip to content

Instantly share code, notes, and snippets.

@cdeterman
Created August 19, 2015 20:36
Show Gist options
  • Save cdeterman/6c151f1b3f56ee8aaf3f to your computer and use it in GitHub Desktop.
Save cdeterman/6c151f1b3f56ee8aaf3f to your computer and use it in GitHub Desktop.
Load the airlines dataset (1987-2008) into a file-backed big.matrix with bigmemory
## Download any yearly airline data file (1987-2008) that is not already
## present in the working directory.
for (year in 1987:2008) {
  file.name <- paste0(year, ".csv.bz2")
  if (!file.exists(file.name)) {
    url.text <- paste0("http://stat-computing.org/dataexpo/2009/",
                       year, ".csv.bz2")
    cat("Downloading missing data file ", file.name, "\n", sep = "")
    ## A failed or interrupted transfer can leave a partial file behind.
    ## Remove it so the file.exists() check above does not mistake it for a
    ## complete download on the next run.
    status <- tryCatch(
      download.file(url.text, file.name, mode = "wb"),
      error = function(e) {
        unlink(file.name)
        stop(e)
      }
    )
    ## download.file() reports some failures only via a non-zero status.
    if (status != 0L) {
      unlink(file.name)
      stop("Download of ", url.text, " failed with status ", status,
           call. = FALSE)
    }
  }
}
## Read a sample file to get the column names and types.
## stringsAsFactors = TRUE is explicit because R >= 4.0.0 reads strings as
## character by default, which would leave factor.columns all FALSE.
d <- read.csv("2008.csv.bz2", stringsAsFactors = TRUE)
integer.columns <- vapply(d, is.integer, logical(1))
factor.columns <- vapply(d, is.factor, logical(1))
## List-style subsetting keeps a data frame even with a single factor column.
factor.levels <- lapply(d[factor.columns], levels)
n.rows <- 0L
## Force the string columns found in the sample to be read as factors in every
## year; all other columns keep read.csv's automatic type detection (NA).
## Without this, a year in which such a column happens to look numeric or is
## entirely NA would contribute no levels.
level.classes <- rep(NA_character_, length(factor.columns))
level.classes[factor.columns] <- "factor"
## Process each file, accumulating the distinct factor levels and the total
## row count.
## TODO: Combine with next loop
for (year in 1987:2008) {
  file.name <- paste(year, "csv.bz2", sep = ".")
  cat("Processing ", file.name, "\n", sep = "")
  d <- read.csv(file.name, colClasses = level.classes)
  n.rows <- n.rows + NROW(d)
  new.levels <- lapply(d[factor.columns], levels)
  for (i in seq_along(factor.levels)) {
    ## union() keeps each level once, in order of first appearance, so the
    ## level vectors do not fill up with duplicates across the yearly files.
    factor.levels[[i]] <- union(factor.levels[[i]], new.levels[[i]])
  }
  ## Release the year's data frame before reading the next one.
  rm(d)
  gc()
}
## Persist the column metadata and level tables for later use.
save(integer.columns, factor.columns, factor.levels, file = "factors.RData")
## Now convert all factors to integers so we can create a big.matrix of the
## data. Factor columns are read as character and replaced by their position
## in the matching factor.levels entry; everything else is read as integer.
col.classes <- rep("integer", length(integer.columns))
col.classes[factor.columns] <- "character"
cols <- which(factor.columns)
first <- TRUE
csv.file <- "airlines.csv" # Write combined integer-only data to this file
csv.con <- file(csv.file, open = "w")
## finally = guarantees the connection is closed even if a read or write
## fails part-way through; any error still propagates afterwards.
tryCatch({
  for (year in 1987:2008) {
    file.name <- paste(year, "csv.bz2", sep = ".")
    cat("Processing ", file.name, "\n", sep = "")
    d <- read.csv(file.name, colClasses = col.classes)
    ## Convert the strings to integers
    for (i in seq_along(factor.levels)) {
      col <- cols[i]
      d[[col]] <- match(d[[col]], factor.levels[[i]])
    }
    ## Only the first file contributes the header row.
    write.table(d, file = csv.con, sep = ",",
                row.names = FALSE, col.names = first)
    first <- FALSE
    ## Release the year's data frame before reading the next one.
    rm(d)
    gc()
  }
}, finally = close(csv.con))
## Nothing earlier in the script attaches bigmemory, so load it here;
## otherwise read.big.matrix() is not found in a fresh R session.
library(bigmemory)
## Parse the combined integer-only CSV into a big.matrix that uses the
## backing and descriptor files named below.
backing.file <- "airlines.bin"
descriptor.file <- "airlines.des"
## extraCols asks bigmemory for one additional column named "age" beyond
## those present in the CSV.
data <- read.big.matrix("airlines.csv", header = TRUE,
                        type = "integer",
                        backingfile = backing.file,
                        descriptorfile = descriptor.file,
                        extraCols = c("age"))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment