Skip to content

Instantly share code, notes, and snippets.

@e-kotov
Last active September 18, 2015 23:03
Show Gist options
  • Save e-kotov/8403e95759a83d58a415 to your computer and use it in GitHub Desktop.
Save e-kotov/8403e95759a83d58a415 to your computer and use it in GitHub Desktop.
Benchmark the speed of reading and writing spatial data in R — find the best way to store data for analysis (spoiler: spatial objects stored in *.rds files load fastest, and shapefiles are the slowest to load)
# spatial data loading time benchmark
# Warning : you may need to manually install rgdal from source, if it fails to install automatically
# first install missing packages
# thanks to Pratik Patil for the package installation code ( http://stackoverflow.com/a/29622385 )
# Specify the list of required packages to be installed and load
# Packages required by the benchmark script.
# NOTE: rgdal may need to be installed from source if the binary install fails.
Required_Packages <- c("data.table", "rgdal", "sp", "ggplot2", "reshape2")

# Install any of `Required_Packages` that are missing, then attach them all
# with library(). Attaching errors out (rather than silently continuing) if a
# package still cannot be loaded, which is what we want for a benchmark script.
# Credit: Pratik Patil ( http://stackoverflow.com/a/29622385 )
Install_And_Load <- function(Required_Packages) {
  Remaining_Packages <-
    Required_Packages[!(Required_Packages %in% installed.packages()[, "Package"])]
  if (length(Remaining_Packages) > 0) {
    install.packages(Remaining_Packages)
  }
  for (package_name in Required_Packages) {
    library(package_name, character.only = TRUE, quietly = FALSE)
  }
}

Install_And_Load(Required_Packages) # install (if needed) and attach packages
# actual code for benchmarking below
# Benchmark read/write speed of a large random spatial point dataset stored as:
#   * plain data.table in .rds       (saveRDS / readRDS + convert to sp)
#   * plain data.table in .csv       (write.csv / fread + convert to sp)
#   * ESRI Shapefile                 (writeOGR / readOGR)
#   * SpatialPointsDataFrame in .rds (saveRDS / readRDS)
# Prints a ggplot bar chart of elapsed times and returns the timing table.
runSpLoadBench <- function() {
  # ask up front whether the user wants the temp files deleted after the benchmark
  rmTempFiles <- removeTempFiles()

  # random points inside an approximate bounding box for Moscow:
  #   top left     = 55.944657, 37.251006
  #   bottom right = 55.534036, 37.976099
  # 300 000 points is assumed to be enough to notice the difference;
  # change to your liking, but be careful - 1e+05 is already quite large
  n <- 3 * 10^5

  # data table of random points with a single value column
  x <- data.table(lat   = runif(n, min = 55.534036, max = 55.944657),
                  lon   = runif(n, min = 37.251006, max = 37.976099),
                  value = runif(n, min = 0, max = 1))
  xs <- x                                    # copy of the plain table
  coordinates(xs) <- ~ lon + lat             # promote copy to a spatial object
  proj4string(xs) <- CRS("+init=epsg:4326")  # coordinate system WGS84

  # temp dir; showWarnings = FALSE so a leftover dir from a previous run is fine
  dir.create("sp_data_speed_test", showWarnings = FALSE)

  # ---- write timings -------------------------------------------------------
  time_saveRDS      <- system.time(saveRDS(x, "sp_data_speed_test/x.rds"))
  time_write.csv    <- system.time(write.csv(x, "sp_data_speed_test/x.csv"))
  time_writeOGR_shp <- system.time(writeOGR(xs, dsn = "sp_data_speed_test",
                                            layer = "xs",
                                            driver = "ESRI Shapefile",
                                            overwrite_layer = TRUE))
  time_saveRDS_sp   <- system.time(saveRDS(xs, "sp_data_speed_test/xs.rds"))

  rm(x, xs) # drop objects so reads below are not served from memory
  gc()      # force clean up of memory

  # ---- read timings --------------------------------------------------------
  # for plain text formats the timing includes conversion to a spatial object,
  # since that conversion is part of the cost of storing data as text
  time_readRDS <- system.time({
    x <- readRDS("sp_data_speed_test/x.rds")
    coordinates(x) <- ~ lon + lat
    proj4string(x) <- CRS("+init=epsg:4326")
  })
  rm(x)
  gc()

  # fread is used as it is marginally faster than read.csv -
  # no point in comparing read.csv at all
  time_fread <- system.time({
    x <- fread("sp_data_speed_test/x.csv")
    coordinates(x) <- ~ lon + lat
    proj4string(x) <- CRS("+init=epsg:4326")
  })
  rm(x)
  gc()

  time_readOGR_shp <- system.time(xs <- readOGR(dsn = "sp_data_speed_test",
                                                layer = "xs"))
  rm(xs)
  gc()

  # spatial object from .rds needs no conversion step at all
  time_readRDS_sp <- system.time(xs <- readRDS("sp_data_speed_test/xs.rds"))
  rm(xs)
  gc()

  # ---- assemble the run-times table ---------------------------------------
  runTimes <- data.table(rbind(time_saveRDS, time_write.csv,
                               time_writeOGR_shp, time_saveRDS_sp,
                               time_readRDS, time_fread,
                               time_readOGR_shp, time_readRDS_sp))
  runTimes[, action := c("saveRDS", "write.csv", "writeOGR_shp", "saveRDS_sp",
                         "readRDS + convert to sp object",
                         "fread + convert to sp object",
                         "readOGR_shp", "readRDS_sp")]
  runTimes[action %like% "read", rw := "read"]
  runTimes[action %like% "write|save", rw := "write"]

  # plot and print the graph with read-write speed comparison
  print(ggplot(runTimes, aes(x = action, y = elapsed, fill = rw)) +
          geom_bar(stat = "identity") +
          theme_bw() +
          theme(text = element_text(size = 16),
                axis.text = element_text(colour = "black")) +
          ylab("Elapsed time (seconds)") +
          xlab("Action") +
          ggtitle("Comparison of read/write times of large spatial dataframe.\n(Less is better)\n") +
          coord_flip())

  # delete the temp dir if the user asked for a clean-up
  if (rmTempFiles == "y") {
    unlink("sp_data_speed_test", recursive = TRUE)
  }

  runTimes # return the table with run times
}
# Interactively ask whether the benchmark's temporary files should be deleted
# once the run is complete. Re-prompts until the user enters "y" or "n".
# Returns: "y" or "n" (character scalar).
removeTempFiles <- function() {
  print("The test will take about 1-2 minutes depending on your hardware and needs approximately 150 megabytes of free disk space (for a sample of 1 000 000 spatial points). A temporary directory 'sp_data_speed_test' will be created in your current working directory and files 'x.csv', 'xs.dbf', 'xs.prj', 'x.rds', 'xs.rds', 'xs.shp', 'xs.shx' will be created in that directory in the process of benchmarking. Do you want to delete the files and the directory after the benchmark is complete?")
  answ <- ""
  # keep prompting until a valid answer is given
  while (!answ %in% c("y", "n")) {
    answ <- readline(prompt = "Please input 'y' or 'n' and press Enter/Return: ")
    if (!answ %in% c("y", "n")) {
      print("Error, you did not type 'y' or 'n'. Try again.")
    }
  }
  return(answ)
}
runSpLoadBench() # run the benchmark: prints the plot and the timing table
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment