rmarrotte/zonal_stats_parallel.R

## zonal_stats_parallel.R
####################################
# Run zonal statistics in parallel #
####################################

# Clear workspace
rm(list=ls())

# Libraries
library(raster)
library(gstat)
library(scales)
library(ggplot2)
library(snowfall)

# Make some fake data
xy <- expand.grid(1:500, 1:500)
names(xy) <- c("x","y")

# define the gstat object (spatial model)
g.dummy <- gstat(formula=z~1, locations=~x+y, dummy=T, beta=1,
                 model=vgm(psill=0.025,range=100,model="Exp"), nmax=20)

# make a simulation based on the gstat object
yy <- predict(g.dummy, newdata=xy, nsim=1)

# Scale the values between 500 and 1000
yy$sim1 <- rescale(x = yy$sim1, to = c(500,1000))

# Make integer raster
yy$sim1 <- round(yy$sim1,0)

# show one realisation
gridded(yy) <- ~x+y
DEM <- raster(yy)
rm(xy,g.dummy,yy)


# Plot this data
x11()
plot(DEM)

# I need to collect the zonal statistic at each point,
# I am using a 10 unit buffer, I collect the statistic using summary and
# add in the standard deviation also
# n is the number of random points to collect data for
output <- c()
for(n in c(10,100,1000,10000)){
  # Make some fake points
  pnts <- sampleRandom(x = DEM, size = n, sp=T)

  # Turn on timer
  ptm <- proc.time()

  # Extract the values
  vals <- extract(DEM,pnts,buffer=100)

  # Make a table
  stats <- data.frame(do.call("rbind",lapply(vals,summary)))
  stats$std <- unlist(lapply(vals,sd))

  # Print it
  print(head(stats))

  # Output
  output <- rbind(output,c(n,(proc.time() - ptm)[3]))
}

# The output df
output <- data.frame(output)
colnames(output)[1] <- c("Number_pnts")

# Zonal statistics functions,
# rat = raster
# pnts = Your points
# buff = the buffer length
# nthreads = Number of parallel processes
# bsize = The number of elements per thread
zonal_stats <- function(rat, pnts, buff,
                        nthreads = 4){
  # Bunch sizes, how many points per batch
  bsize <- length(pnts) %/% nthreads

  # Turn pnts into list of n batches
  coords <- coordinates(pnts)
  quotient <- dim(coords)[1]%/%bsize # quotient

  # Sequences to make batches of points
  seq1 <- seq(from = 1,to = dim(coords)[1],by = bsize)
  seq2 <- c(seq(from = bsize,to = dim(coords)[1],by = bsize),dim(coords)[1])

  # Make List of batches of points
  pnt_list <- list()
  for(i in 1:length(seq1)){
    # These are the batches of bsize points
    pnt_list[[length(pnt_list)+1]] <- SpatialPoints(coords = coords[seq1[i]:seq2[i],])
  }
  rm(i,seq1,seq2,quotient)

  # Inside function to get statistics for 1 single point within a buffer
  get_pnt_stats <- function(pnts, rat, buff){
    # Libraries
    require(raster)
    require(sp)

    # zonal vals
    pnt_vals <- extract(x = rat, y = pnts, buffer = buff)

    # Get stats for each
    pnt_sums <- lapply(X = pnt_vals, FUN = function(vals){
      # Count NA's
      nberNAs <- length(which(is.na(vals)))
      nbvals <- length(which(!is.na(vals)))
      # Remove NA's
      vals <- vals[!is.na(vals)]
      if(length(vals) > 0){
        stats <- summary(vals)
        stats <- as.numeric(stats)[1:6]
        std <- sd(vals)
        stats <- c(stats,std,nbvals,nberNAs)
      }else{
        stats <- c(rep(NA,7),nbvals,nberNAs)
      }
      names(stats) <- c("Min","Qu1","Median","Mean","Qu3","Max","Std","NbVals","NAs")
      return(stats)
      })

    # Rbind everything together to make a df
    pnt_sums <- data.frame(do.call("rbind",pnt_sums))
    row.names(pnt_sums) <- row.names(pnts)
    return(pnt_sums)
  }

  # Apply this little function over all the points
  sfInit(parallel = T, cpus = nthreads)
  stats_out <- sfLapply(pnt_list,
                        get_pnt_stats,
                        rat = rat,
                        buff = buff)
  sfStop()

  # Fix output
  stats_out <- data.frame(do.call("rbind",stats_out))

  # return it
  return(stats_out)
}

output_p <- c()
for(n in c(10,100,1000,10000)){
  # Make some fake points
  pnts <- sampleRandom(x = DEM, size = n, sp=T)

  # Turn on timer
  ptm <- proc.time()

  # Extract the values
  stats <- zonal_stats(rat = DEM,
                      pnts = pnts,
                      buff = 100,
                      nthreads = 4)

  # Print it
  print(head(stats))

  # Output
  output_p <- rbind(output_p,c(n,(proc.time() - ptm)[3]))
}

# The output df
output_p <- data.frame(output_p)
colnames(output_p)[1] <- c("Number_pnts")

# Bind both together
output_p$Processing <- "Parallel"
output$Processing <- "Sequential"
output_both <- rbind(output,output_p)

# ggplot
x11(12,12)
ggplot(data = output_both, aes(y = elapsed, x = Number_pnts)) +
  geom_point(aes(shape=Processing),size = 3) +
  scale_x_log10() +
  ylab("Processing time") +
  xlab("Number of points") +
  theme_bw()
	####################################
	# Run zonal statistics in parallel #
	####################################

	# Clear workspace
	rm(list=ls())

	# Libraries
	library(raster)
	library(gstat)
	library(scales)
	library(ggplot2)
	library(snowfall)

	# Make some fake data
	xy <- expand.grid(1:500, 1:500)
	names(xy) <- c("x","y")

	# define the gstat object (spatial model)
	g.dummy <- gstat(formula=z~1, locations=~x+y, dummy=T, beta=1,
	model=vgm(psill=0.025,range=100,model="Exp"), nmax=20)

	# make a simulation based on the gstat object
	yy <- predict(g.dummy, newdata=xy, nsim=1)

	# Scale the values between 500 and 1000
	yy$sim1 <- rescale(x = yy$sim1, to = c(500,1000))

	# Make integer raster
	yy$sim1 <- round(yy$sim1,0)

	# show one realisation
	gridded(yy) <- ~x+y
	DEM <- raster(yy)
	rm(xy,g.dummy,yy)


	# Plot this data
	x11()
	plot(DEM)

	# I need to collect the zonal statistic at each point,
	# I am using a 10 unit buffer, I collect the statistic using summary and
	# add in the standard deviation also
	# n is the number of random points to collect data for
	output <- c()
	for(n in c(10,100,1000,10000)){
	# Make some fake points
	pnts <- sampleRandom(x = DEM, size = n, sp=T)

	# Turn on timer
	ptm <- proc.time()

	# Extract the values
	vals <- extract(DEM,pnts,buffer=100)

	# Make a table
	stats <- data.frame(do.call("rbind",lapply(vals,summary)))
	stats$std <- unlist(lapply(vals,sd))

	# Print it
	print(head(stats))

	# Output
	output <- rbind(output,c(n,(proc.time() - ptm)[3]))
	}

	# The output df
	output <- data.frame(output)
	colnames(output)[1] <- c("Number_pnts")

	# Zonal statistics functions,
	# rat = raster
	# pnts = Your points
	# buff = the buffer length
	# nthreads = Number of parallel processes
	# bsize = The number of elements per thread
	zonal_stats <- function(rat, pnts, buff,
	nthreads = 4){
	# Bunch sizes, how many points per batch
	bsize <- length(pnts) %/% nthreads

	# Turn pnts into list of n batches
	coords <- coordinates(pnts)
	quotient <- dim(coords)[1]%/%bsize # quotient

	# Sequences to make batches of points
	seq1 <- seq(from = 1,to = dim(coords)[1],by = bsize)
	seq2 <- c(seq(from = bsize,to = dim(coords)[1],by = bsize),dim(coords)[1])

	# Make List of batches of points
	pnt_list <- list()
	for(i in 1:length(seq1)){
	# These are the batches of bsize points
	pnt_list[[length(pnt_list)+1]] <- SpatialPoints(coords = coords[seq1[i]:seq2[i],])
	}
	rm(i,seq1,seq2,quotient)

	# Inside function to get statistics for 1 single point within a buffer
	get_pnt_stats <- function(pnts, rat, buff){
	# Libraries
	require(raster)
	require(sp)

	# zonal vals
	pnt_vals <- extract(x = rat, y = pnts, buffer = buff)

	# Get stats for each
	pnt_sums <- lapply(X = pnt_vals, FUN = function(vals){
	# Count NA's
	nberNAs <- length(which(is.na(vals)))
	nbvals <- length(which(!is.na(vals)))
	# Remove NA's
	vals <- vals[!is.na(vals)]
	if(length(vals) > 0){
	stats <- summary(vals)
	stats <- as.numeric(stats)[1:6]
	std <- sd(vals)
	stats <- c(stats,std,nbvals,nberNAs)
	}else{
	stats <- c(rep(NA,7),nbvals,nberNAs)
	}
	names(stats) <- c("Min","Qu1","Median","Mean","Qu3","Max","Std","NbVals","NAs")
	return(stats)
	})

	# Rbind everything together to make a df
	pnt_sums <- data.frame(do.call("rbind",pnt_sums))
	row.names(pnt_sums) <- row.names(pnts)
	return(pnt_sums)
	}

	# Apply this little function over all the points
	sfInit(parallel = T, cpus = nthreads)
	stats_out <- sfLapply(pnt_list,
	get_pnt_stats,
	rat = rat,
	buff = buff)
	sfStop()

	# Fix output
	stats_out <- data.frame(do.call("rbind",stats_out))

	# return it
	return(stats_out)
	}

	output_p <- c()
	for(n in c(10,100,1000,10000)){
	# Make some fake points
	pnts <- sampleRandom(x = DEM, size = n, sp=T)

	# Turn on timer
	ptm <- proc.time()

	# Extract the values
	stats <- zonal_stats(rat = DEM,
	pnts = pnts,
	buff = 100,
	nthreads = 4)

	# Print it
	print(head(stats))

	# Output
	output_p <- rbind(output_p,c(n,(proc.time() - ptm)[3]))
	}

	# The output df
	output_p <- data.frame(output_p)
	colnames(output_p)[1] <- c("Number_pnts")

	# Bind both together
	output_p$Processing <- "Parallel"
	output$Processing <- "Sequential"
	output_both <- rbind(output,output_p)

	# ggplot
	x11(12,12)
	ggplot(data = output_both, aes(y = elapsed, x = Number_pnts)) +
	geom_point(aes(shape=Processing),size = 3) +
	scale_x_log10() +
	ylab("Processing time") +
	xlab("Number of points") +
	theme_bw()