Last active
September 18, 2015 23:03
-
-
Save e-kotov/8403e95759a83d58a415 to your computer and use it in GitHub Desktop.
# Benchmark the speed of reading and writing spatial data in R to find the best way to store data for analysis. Spoiler: spatial objects stored in *.rds files are the fastest to load, and shapefiles are the slowest.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Spatial data loading time benchmark
# Warning: you may need to install rgdal manually from source if it fails to install automatically.
# First, install any missing packages.
# Thanks to Pratik Patil for the package installation code (http://stackoverflow.com/a/29622385).
# Packages that must be installed and attached before the benchmark runs.
# NOTE(review): reshape2 is not referenced anywhere in the visible code - confirm it is still needed.
Required_Packages <- c("data.table", "rgdal", "sp", "ggplot2", "reshape2")
#' Install any missing packages, then attach every requested package.
#'
#' @param Required_Packages Character vector of package names.
#' @return `NULL`, invisibly. Called for its side effects: packages that are
#'   not yet installed are installed with `install.packages()`, and every
#'   package in `Required_Packages` is attached with `library()`.
Install_And_Load <- function(Required_Packages) {
  # Only the packages that are absent from the installed library are installed.
  already_installed <- installed.packages()[, "Package"]
  Remaining_Packages <- Required_Packages[!(Required_Packages %in% already_installed)]
  if (length(Remaining_Packages) > 0) {
    install.packages(Remaining_Packages)
  }

  for (package_name in Required_Packages) {
    # character.only = TRUE because the package name is held in a variable.
    library(package_name, character.only = TRUE, quietly = FALSE)
  }

  invisible(NULL)
}
Install_And_Load(Required_Packages) # install whatever is missing, then attach every required package
# actual code for benchmarking below | |
#' Benchmark read/write times of point data stored in several file formats.
#'
#' Generates `n` random points inside an approximate bounding box for Moscow,
#' writes them as a plain table (rds, csv) and as a spatial object (ESRI
#' shapefile, rds), reads each file back (converting the plain tables to
#' spatial objects), and times every step with `system.time()`.
#'
#' Side effects: asks the user (via `removeTempFiles()`) whether the temporary
#' files should be deleted, creates the directory 'sp_data_speed_test' in the
#' current working directory, and prints a bar chart of the elapsed times.
#'
#' @param n Number of random points to generate. Defaults to 300 000, which is
#'   enough to notice the difference between formats; larger values need
#'   correspondingly more time and disk space.
#' @return A data.table with one row per timed action: the `system.time()`
#'   columns plus `action` (label) and `rw` ("read" or "write").
runSpLoadBench <- function(n = 3 * 10^5) {
  stopifnot(is.numeric(n), length(n) == 1L, !is.na(n), n >= 1)

  # ask if the user wants to delete the temp files after the benchmark
  rmTempFiles <- removeTempFiles()

  bench_dir <- "sp_data_speed_test"
  path_rds <- file.path(bench_dir, "x.rds")
  path_csv <- file.path(bench_dir, "x.csv")
  path_rds_sp <- file.path(bench_dir, "xs.rds")

  # Convert a table with lon/lat columns into a spatial object in WGS84.
  to_spatial <- function(pts) {
    coordinates(pts) <- ~ lon + lat
    proj4string(pts) <- CRS("+init=epsg:4326")
    pts
  }

  # we will be using an approximate bounding box for Moscow
  # top left     = 55.944657 , 37.251006
  # bottom right = 55.534036 , 37.976099
  x <- data.table(
    lat = runif(n, min = 55.534036, max = 55.944657),
    lon = runif(n, min = 37.251006, max = 37.976099),
    value = runif(n, min = 0, max = 1)
  )
  xs <- to_spatial(x) # spatial version of the same points

  # Re-running the benchmark must not warn when the directory already exists.
  dir.create(bench_dir, showWarnings = FALSE)
  # Register the cleanup now so the temp files are removed even if a later
  # step fails, not only when the function reaches its last line.
  if (identical(rmTempFiles, "y")) {
    on.exit(unlink(bench_dir, recursive = TRUE), add = TRUE)
  }

  # time the writes
  time_saveRDS <- system.time(saveRDS(x, path_rds)) # plain table -> rds
  # row.names = FALSE keeps the csv columns identical to the rds table;
  # otherwise fread() reads the row names back as an extra column.
  time_write.csv <- system.time(write.csv(x, path_csv, row.names = FALSE)) # plain table -> csv
  time_writeOGR_shp <- system.time(
    writeOGR(xs,
      dsn = bench_dir,
      layer = "xs",
      driver = "ESRI Shapefile",
      overwrite_layer = TRUE
    )
  ) # sp object -> ESRI shapefile
  time_saveRDS_sp <- system.time(saveRDS(xs, path_rds_sp)) # sp object -> rds

  rm(x, xs) # remove variables from memory
  gc() # force clean up memory

  # time the reads; plain tables are converted to spatial objects inside the
  # timed expression so that all four results are comparable
  time_readRDS <- system.time({
    x <- to_spatial(readRDS(path_rds))
  })
  rm(x)
  gc()

  # fread is used as it is faster than read.csv - no point in comparing read.csv at all
  time_fread <- system.time({
    x <- to_spatial(fread(path_csv))
  })
  rm(x)
  gc()

  time_readOGR_shp <- system.time(
    xs <- readOGR(dsn = bench_dir, layer = "xs")
  ) # read spatial object from ESRI shapefile
  rm(xs)
  gc()

  time_readRDS_sp <- system.time(xs <- readRDS(path_rds_sp)) # read spatial object from rds file
  rm(xs)
  gc()

  # construct the run times table
  runTimes <- data.table(rbind(
    time_saveRDS, time_write.csv, time_writeOGR_shp, time_saveRDS_sp,
    time_readRDS, time_fread, time_readOGR_shp, time_readRDS_sp
  ))
  runTimes[, action := c(
    "saveRDS", "write.csv", "writeOGR_shp", "saveRDS_sp",
    "readRDS + convert to sp object", "fread + convert to sp object",
    "readOGR_shp", "readRDS_sp"
  )]
  runTimes[action %like% "read", rw := "read"]
  runTimes[action %like% "write|save", rw := "write"]

  # plot and print the graph with read-write speed comparison
  print(
    ggplot(runTimes, aes(x = action, y = elapsed, fill = rw)) +
      geom_bar(stat = "identity") +
      theme_bw() +
      theme(
        text = element_text(size = 16),
        axis.text = element_text(colour = "black")
      ) +
      ylab("Elapsed time (seconds)") +
      xlab("Action") +
      ggtitle("Comparison of read/write times of large spatial dataframe.\n(Less is better)\n") +
      coord_flip()
  )

  # `[]` resets data.table's print suppression that follows `:=`, so the table
  # is actually shown when the function is called at top level.
  runTimes[]
}
#' Ask whether the benchmark's temporary files should be deleted afterwards.
#'
#' Prints a notice describing the disk usage and the files the benchmark
#' creates, then keeps prompting until the user answers 'y' or 'n'.
#'
#' @param default Answer to return without prompting when the session is not
#'   interactive (where `readline()` yields "" immediately and the prompt loop
#'   would never terminate). One of "n" (keep the files, the default) or "y".
#' @return A single string, either "y" or "n".
removeTempFiles <- function(default = c("n", "y")) {
  default <- match.arg(default)

  cat(
    "The test will take about 1-2 minutes depending on your hardware and needs ",
    "approximately 150 megabytes of free disk space (for a sample of 1 000 000 ",
    "spatial points). A temporary directory 'sp_data_speed_test' will be created ",
    "in your current working directory and files 'x.csv', 'xs.dbf', 'xs.prj', ",
    "'x.rds', 'xs.rds', 'xs.shp', 'xs.shx' will be created in that directory in ",
    "the process of benchmarking. Do you want to delete the files and the ",
    "directory after the benchmark is complete?\n",
    sep = ""
  )

  if (!interactive()) {
    cat("Non-interactive session: using the default answer '", default, "'.\n", sep = "")
    return(default)
  }

  repeat {
    answ <- readline(prompt = "Please input 'y' or 'n' and press Enter/Return: ")
    # Accept upper-case input and stray whitespace around the answer.
    answ <- tolower(trimws(answ))
    if (answ %in% c("y", "n")) {
      return(answ)
    }
    cat("Error, you did not enter 'y' or 'n'. Try again.\n")
  }
}
runSpLoadBench() # run the benchmark: prompts about temp-file cleanup, prints the comparison plot, returns the timings table
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment