Skip to content

Instantly share code, notes, and snippets.

@rmarrotte
Created July 27, 2016 05:13
Show Gist options
  • Save rmarrotte/fe7cccbaea7d921a6381d8574500621b to your computer and use it in GitHub Desktop.
Save rmarrotte/fe7cccbaea7d921a6381d8574500621b to your computer and use it in GitHub Desktop.
####################################
# Run zonal statistics in parallel #
####################################
# Clear workspace
rm(list=ls())
# Libraries
library(raster)
library(gstat)
library(scales)
library(ggplot2)
library(snowfall)
# Make some fake data
xy <- expand.grid(1:500, 1:500)
names(xy) <- c("x","y")
# define the gstat object (spatial model)
g.dummy <- gstat(formula=z~1, locations=~x+y, dummy=T, beta=1,
model=vgm(psill=0.025,range=100,model="Exp"), nmax=20)
# make a simulation based on the gstat object
yy <- predict(g.dummy, newdata=xy, nsim=1)
# Scale the values between 500 and 1000
yy$sim1 <- rescale(x = yy$sim1, to = c(500,1000))
# Make integer raster
yy$sim1 <- round(yy$sim1,0)
# show one realisation
gridded(yy) <- ~x+y
DEM <- raster(yy)
rm(xy,g.dummy,yy)
# Plot this data
x11()
plot(DEM)
# I need to collect the zonal statistic at each point,
# I am using a 10 unit buffer, I collect the statistic using summary and
# add in the standard deviation also
# n is the number of random points to collect data for
output <- c()
for(n in c(10,100,1000,10000)){
# Make some fake points
pnts <- sampleRandom(x = DEM, size = n, sp=T)
# Turn on timer
ptm <- proc.time()
# Extract the values
vals <- extract(DEM,pnts,buffer=100)
# Make a table
stats <- data.frame(do.call("rbind",lapply(vals,summary)))
stats$std <- unlist(lapply(vals,sd))
# Print it
print(head(stats))
# Output
output <- rbind(output,c(n,(proc.time() - ptm)[3]))
}
# The output df
output <- data.frame(output)
colnames(output)[1] <- c("Number_pnts")
# Zonal statistics functions,
# rat = raster
# pnts = Your points
# buff = the buffer length
# nthreads = Number of parallel processes
# bsize = The number of elements per thread
zonal_stats <- function(rat, pnts, buff,
nthreads = 4){
# Bunch sizes, how many points per batch
bsize <- length(pnts) %/% nthreads
# Turn pnts into list of n batches
coords <- coordinates(pnts)
quotient <- dim(coords)[1]%/%bsize # quotient
# Sequences to make batches of points
seq1 <- seq(from = 1,to = dim(coords)[1],by = bsize)
seq2 <- c(seq(from = bsize,to = dim(coords)[1],by = bsize),dim(coords)[1])
# Make List of batches of points
pnt_list <- list()
for(i in 1:length(seq1)){
# These are the batches of bsize points
pnt_list[[length(pnt_list)+1]] <- SpatialPoints(coords = coords[seq1[i]:seq2[i],])
}
rm(i,seq1,seq2,quotient)
# Inside function to get statistics for 1 single point within a buffer
get_pnt_stats <- function(pnts, rat, buff){
# Libraries
require(raster)
require(sp)
# zonal vals
pnt_vals <- extract(x = rat, y = pnts, buffer = buff)
# Get stats for each
pnt_sums <- lapply(X = pnt_vals, FUN = function(vals){
# Count NA's
nberNAs <- length(which(is.na(vals)))
nbvals <- length(which(!is.na(vals)))
# Remove NA's
vals <- vals[!is.na(vals)]
if(length(vals) > 0){
stats <- summary(vals)
stats <- as.numeric(stats)[1:6]
std <- sd(vals)
stats <- c(stats,std,nbvals,nberNAs)
}else{
stats <- c(rep(NA,7),nbvals,nberNAs)
}
names(stats) <- c("Min","Qu1","Median","Mean","Qu3","Max","Std","NbVals","NAs")
return(stats)
})
# Rbind everything together to make a df
pnt_sums <- data.frame(do.call("rbind",pnt_sums))
row.names(pnt_sums) <- row.names(pnts)
return(pnt_sums)
}
# Apply this little function over all the points
sfInit(parallel = T, cpus = nthreads)
stats_out <- sfLapply(pnt_list,
get_pnt_stats,
rat = rat,
buff = buff)
sfStop()
# Fix output
stats_out <- data.frame(do.call("rbind",stats_out))
# return it
return(stats_out)
}
output_p <- c()
for(n in c(10,100,1000,10000)){
# Make some fake points
pnts <- sampleRandom(x = DEM, size = n, sp=T)
# Turn on timer
ptm <- proc.time()
# Extract the values
stats <- zonal_stats(rat = DEM,
pnts = pnts,
buff = 100,
nthreads = 4)
# Print it
print(head(stats))
# Output
output_p <- rbind(output_p,c(n,(proc.time() - ptm)[3]))
}
# The output df
output_p <- data.frame(output_p)
colnames(output_p)[1] <- c("Number_pnts")
# Bind both together
output_p$Processing <- "Parallel"
output$Processing <- "Sequential"
output_both <- rbind(output,output_p)
# ggplot
x11(12,12)
ggplot(data = output_both, aes(y = elapsed, x = Number_pnts)) +
geom_point(aes(shape=Processing),size = 3) +
scale_x_log10() +
ylab("Processing time") +
xlab("Number of points") +
theme_bw()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment