Skip to content

Instantly share code, notes, and snippets.

@e-kotov
Last active September 18, 2015 23:03
Show Gist options
  • Save e-kotov/8403e95759a83d58a415 to your computer and use it in GitHub Desktop.
Save e-kotov/8403e95759a83d58a415 to your computer and use it in GitHub Desktop.
Benchmark the speed of reading and writing spatial data in R — find the best way to store data for analysis (spoiler: spatial objects stored in *.rds files load fastest, and shapefiles are the slowest to load)
# spatial data loading time benchmark
# Warning : you may need to manually install rgdal from source, if it fails to install automatically
# first install missing packages
# thanks to Pratik Patil for the package installation code ( http://stackoverflow.com/a/29622385 )
# Specify the list of required packages to be installed and load
# Packages required by the benchmark script.
# NOTE: rgdal may need to be installed from source if the binary install fails.
Required_Packages <- c("data.table", "rgdal", "sp", "ggplot2", "reshape2")

# Install any of `Required_Packages` that are missing, then attach them all
# with library(). Attaching errors out (rather than silently continuing) if a
# package still cannot be loaded, which is what we want for a benchmark script.
# Credit: Pratik Patil ( http://stackoverflow.com/a/29622385 )
Install_And_Load <- function(Required_Packages) {
  Remaining_Packages <-
    Required_Packages[!(Required_Packages %in% installed.packages()[, "Package"])]
  if (length(Remaining_Packages) > 0) {
    install.packages(Remaining_Packages)
  }
  for (package_name in Required_Packages) {
    library(package_name, character.only = TRUE, quietly = FALSE)
  }
}

Install_And_Load(Required_Packages) # install (if needed) and attach packages
# actual code for benchmarking below
# Benchmark read/write speed of a large random spatial point dataset stored as:
#   * plain data.table in .rds       (saveRDS / readRDS + convert to sp)
#   * plain data.table in .csv       (write.csv / fread + convert to sp)
#   * ESRI Shapefile                 (writeOGR / readOGR)
#   * SpatialPointsDataFrame in .rds (saveRDS / readRDS)
# Prints a ggplot bar chart of elapsed times and returns the timing table.
runSpLoadBench <- function() {
  # ask up front whether the user wants the temp files deleted after the benchmark
  rmTempFiles <- removeTempFiles()

  # random points inside an approximate bounding box for Moscow:
  #   top left     = 55.944657, 37.251006
  #   bottom right = 55.534036, 37.976099
  # 300 000 points is assumed to be enough to notice the difference;
  # change to your liking, but be careful - 1e+05 is already quite large
  n <- 3 * 10^5

  # data table of random points with a single value column
  x <- data.table(lat   = runif(n, min = 55.534036, max = 55.944657),
                  lon   = runif(n, min = 37.251006, max = 37.976099),
                  value = runif(n, min = 0, max = 1))
  xs <- x                                    # copy of the plain table
  coordinates(xs) <- ~ lon + lat             # promote copy to a spatial object
  proj4string(xs) <- CRS("+init=epsg:4326")  # coordinate system WGS84

  # temp dir; showWarnings = FALSE so a leftover dir from a previous run is fine
  dir.create("sp_data_speed_test", showWarnings = FALSE)

  # ---- write timings -------------------------------------------------------
  time_saveRDS      <- system.time(saveRDS(x, "sp_data_speed_test/x.rds"))
  time_write.csv    <- system.time(write.csv(x, "sp_data_speed_test/x.csv"))
  time_writeOGR_shp <- system.time(writeOGR(xs, dsn = "sp_data_speed_test",
                                            layer = "xs",
                                            driver = "ESRI Shapefile",
                                            overwrite_layer = TRUE))
  time_saveRDS_sp   <- system.time(saveRDS(xs, "sp_data_speed_test/xs.rds"))

  rm(x, xs) # drop objects so reads below are not served from memory
  gc()      # force clean up of memory

  # ---- read timings --------------------------------------------------------
  # for plain text formats the timing includes conversion to a spatial object,
  # since that conversion is part of the cost of storing data as text
  time_readRDS <- system.time({
    x <- readRDS("sp_data_speed_test/x.rds")
    coordinates(x) <- ~ lon + lat
    proj4string(x) <- CRS("+init=epsg:4326")
  })
  rm(x)
  gc()

  # fread is used as it is marginally faster than read.csv -
  # no point in comparing read.csv at all
  time_fread <- system.time({
    x <- fread("sp_data_speed_test/x.csv")
    coordinates(x) <- ~ lon + lat
    proj4string(x) <- CRS("+init=epsg:4326")
  })
  rm(x)
  gc()

  time_readOGR_shp <- system.time(xs <- readOGR(dsn = "sp_data_speed_test",
                                                layer = "xs"))
  rm(xs)
  gc()

  # spatial object from .rds needs no conversion step at all
  time_readRDS_sp <- system.time(xs <- readRDS("sp_data_speed_test/xs.rds"))
  rm(xs)
  gc()

  # ---- assemble the run-times table ---------------------------------------
  runTimes <- data.table(rbind(time_saveRDS, time_write.csv,
                               time_writeOGR_shp, time_saveRDS_sp,
                               time_readRDS, time_fread,
                               time_readOGR_shp, time_readRDS_sp))
  runTimes[, action := c("saveRDS", "write.csv", "writeOGR_shp", "saveRDS_sp",
                         "readRDS + convert to sp object",
                         "fread + convert to sp object",
                         "readOGR_shp", "readRDS_sp")]
  runTimes[action %like% "read", rw := "read"]
  runTimes[action %like% "write|save", rw := "write"]

  # plot and print the graph with read-write speed comparison
  print(ggplot(runTimes, aes(x = action, y = elapsed, fill = rw)) +
          geom_bar(stat = "identity") +
          theme_bw() +
          theme(text = element_text(size = 16),
                axis.text = element_text(colour = "black")) +
          ylab("Elapsed time (seconds)") +
          xlab("Action") +
          ggtitle("Comparison of read/write times of large spatial dataframe.\n(Less is better)\n") +
          coord_flip())

  # delete the temp dir if the user asked for a clean-up
  if (rmTempFiles == "y") {
    unlink("sp_data_speed_test", recursive = TRUE)
  }

  runTimes # return the table with run times
}
# Interactively ask whether the benchmark's temporary files should be deleted
# once the run is complete. Re-prompts until the user enters "y" or "n".
# Returns: "y" or "n" (character scalar).
removeTempFiles <- function() {
  print("The test will take about 1-2 minutes depending on your hardware and needs approximately 150 megabytes of free disk space (for a sample of 1 000 000 spatial points). A temporary directory 'sp_data_speed_test' will be created in your current working directory and files 'x.csv', 'xs.dbf', 'xs.prj', 'x.rds', 'xs.rds', 'xs.shp', 'xs.shx' will be created in that directory in the process of benchmarking. Do you want to delete the files and the directory after the benchmark is complete?")
  answ <- ""
  # keep prompting until a valid answer is given
  while (!answ %in% c("y", "n")) {
    answ <- readline(prompt = "Please input 'y' or 'n' and press Enter/Return: ")
    if (!answ %in% c("y", "n")) {
      print("Error, you did not type 'y' or 'n'. Try again.")
    }
  }
  return(answ)
}
runSpLoadBench() # run the benchmark: prints the plot and the timing table
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment