# MarineSPEED.org: filtering occurrence data and adding environmental data
library(sdmpredictors)
library(raster)
library(sp)
library(bit64)
source("utils.R")
# Keep only records whose OBIS quality control (qc) bitmask has all of the
# requested flags set. Flags 8, 9, 20 and flags outside (1, 30] are ignored.
filter_qc <- function(data, qc) {
  qc <- setdiff(qc, c(8, 9, 20))
  qc <- qc[qc > 1 & qc <= 30]
  qc_col <- data$qc
  # represent both the qc column and the required flags as 64-bit bitstrings
  qc_col <- bit64::as.bitstring(as.integer64(qc_col))
  qc <- bit64::as.bitstring(as.integer64(sum(2^(qc-1))))
  # build a regular expression: required bits stay "1", all other bits match anything
  qc <- gsub("0", ".", qc)
  qc_ok <- grepl(qc, qc_col)
  data[qc_ok,]
}
## for(i in 1:30) { print(paste(NROW(filter_qc(data.frame(qc = 267910719), i)) == 1, i)) }
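# Example (hypothetical qc values): with flags 2-6 requested, a record passes
# only if all of those bits are set in its qc bitmask.
## d <- data.frame(id = 1:3, qc = c(62, 2, 1073741823))
## filter_qc(d, qc = c(1, 2, 3, 4, 5, 6)) # keeps rows 1 and 3 (flag 1 is ignored)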
# Combine the occurrence records of one species from the GBIF, OBIS, RLS,
# INVASIVES and LAB sources into one data.frame and write it to
# ../dataprep/1_combined/. Files are looked up by AphiaID in ../dataprep/<source>/.
combine_records <- function(row) {
  add_data <- function(data, new_data, src) {
    if(NROW(new_data) > 0) {
      new_data <- cbind(species=row$ScientificName, new_data[,xycols], src=src)
      data <- rbind(data, new_data)
    }
    data
  }
  simple_add <- function(data, src) {
    file <- list.files(paste0("../dataprep/", src), paste0("^", row$AphiaID, "[.]csv([.]gz)?$"), full.names = TRUE)
    if (length(file) > 1) stop("More than 1 file found")
    if(length(file) == 1) {
      new_data <- read.csv2(file, stringsAsFactors = FALSE)
      data <- add_data(data, new_data, src)
    }
    data
  }
  print("combine records")
  data <- c()
  gbif_file <- list.files("../dataprep/GBIF", paste0("_", row$AphiaID, "[.]csv[.]gz"), full.names = TRUE)
  if(length(gbif_file) > 1) stop("gbif more than 1 file")
  if(length(gbif_file) == 0) print("gbif not found")
  if(length(gbif_file) == 1) {
    gbif_data <- read.csv2(gbif_file, stringsAsFactors = FALSE)
    data <- add_data(data, gbif_data, "GBIF")
  }
  obis_file <- list.files("../dataprep/OBIS", paste0("^", row$AphiaID, "[.]csv[.]gz"), full.names = TRUE)
  if(length(obis_file) > 1) stop("obis more than 1 file")
  if(length(obis_file) == 0) print("obis not found")
  if(length(obis_file) == 1) {
    obis_data <- read.csv2(obis_file, stringsAsFactors = FALSE)
    obis_data <- filter_qc(obis_data, qc = c(1, 2, 3, 4, 5, 6))
    data <- add_data(data, obis_data, "OBIS")
  }
  data <- simple_add(data, "RLS")
  data <- simple_add(data, "INVASIVES")
  data <- simple_add(data, "LAB")
  if(NROW(data) > 0) {
    colnames(data) <- c("species", lonlat, "source")
    write.csv2(data, paste0("../dataprep/1_combined/", row$ScientificName, " ", row$AphiaID, ".csv"), row.names = FALSE)
  }
  data
}
filter_rounded_coordinates <- function(data, precision=1.0) {
  # precision: 10.0 -> all coordinates rounded to a multiple of 10 are filtered
  # precision: 1.0 -> all coordinates without decimal digits are filtered
  # precision: 0.5 -> all coordinates rounded to a multiple of 0.5 are filtered
  precision_check <- function(values) {
    factor <- 1000000
    pf <- as.integer(precision * factor)
    remainder <- as.integer(values * factor) %% pf
    remainder == 0 | remainder == 1 | remainder == (pf-1) # take rounding errors into account
  }
  if (precision > 0.0) {
    # drop records where both longitude and latitude look rounded
    data <- data[!(precision_check(data[,lonlat[1]]) & precision_check(data[,lonlat[2]])),]
  }
  data
}
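# Example (hypothetical coordinates): with the default precision of 1.0 the first
# record is dropped (both coordinates are whole degrees), the second one is kept.
## d <- data.frame(longitude = c(3, 3.215), latitude = c(51, 51.112))
## filter_rounded_coordinates(d, precision = 1.0)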
filter_unique_bad <- function(row, data) {
  print("filter unique")
  data <- data[!duplicated(data[,lonlat]),] ## remove records with identical coordinates
  data <- filter_rounded_coordinates(data, 1.0)
  write.csv2(data, paste0("2_filter_unique_bad/", row$ScientificName, " ", row$AphiaID, ".csv"), row.names = FALSE)
  data
}
add_environment <- function(row, data, layers) {
  print("add environment")
  # extract raster values once per unique cell, then map them back to all records
  cells <- cellFromXY(layers, data[,lonlat])
  unique_cells <- unique(cells)
  matched_cells <- match(cells, unique_cells)
  environment <- extract(layers, unique_cells)
  data <- cbind(data, environment[matched_cells,])
  write.csv2(data, paste0("3_environment/", row$ScientificName, " ", row$AphiaID, ".csv"), row.names = FALSE)
  data
}
# utility function to allow for returning multiple values
# source https://stat.ethz.ch/pipermail/r-help/2004-June/053343.html
# and http://stackoverflow.com/questions/1826519/function-returning-more-than-one-value
list <- structure(NA, class="result")
"[<-.result" <- function(x, ..., value) {
  args <- as.list(match.call())
  args <- args[-c(1:2, length(args))]
  length(value) <- length(args)
  for(i in seq(along=args)) {
    a <- args[[i]]
    if(!missing(a)) eval.parent(substitute(a <- v, list(a=a, v=value[[i]])))
  }
  x
}
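# Example of the multiple-assignment idiom defined above (values chosen purely
# for illustration): each name on the left receives one element of the result.
## list[lo, hi] <- range(c(4, 8, 15, 16, 23, 42))
## # lo == 4, hi == 42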
# For records whose raster cell has no (complete) environmental data, search the
# surrounding cells and move the record to the closest cell that has values in
# all layers, as long as it lies within max.move.distance (metres). Records that
# cannot be moved are written to a separate "missing_envdata" file and dropped.
filter_points_without_environment <- function(row, data, layers, max.move.distance=1000) {
  points <- data[,lonlat]
  coordinates(points) <- lonlat
  point.as.char <- function(p) { paste0("(", coordinates(p)[1], ", ", coordinates(p)[2], ")") }
  # moves points if necessary based on max.move.distance
  grid <- subset(layers, 1)
  na.indexes <- which(!complete.cases(data[,names(layers)]))
  raster_nas <- is.na(values(raster(layers, layer=1)))
  if(length(na.indexes) > 0) {
    na.points <- points[na.indexes]
    na.cells <- cellFromXY(grid, na.points)
    not.na.cells <- unique(cellFromXY(grid, points[-na.indexes]))
    for (i in length(na.points):1) { # descending order to prevent index issues when removing rows
      cell <- na.cells[i]
      point <- na.points[i]
      # margin: correction for the fact that we measure to the center of the adjacent cells
      margin <- spDists(coordinates(point), (coordinates(point) + (res(grid) / 2.4)), longlat=TRUE) * 1000
      visited <- cell
      repeat {
        # grow the search outward: visit the 4-neighbours of all cells seen so far
        visited.adjacent <- adjacent(grid, as.integer(visited), directions=4, id=FALSE)[,2]
        to.visit <- setdiff(visited.adjacent, visited)
        visited <- union(visited, to.visit)
        to.visit.xy <- xyFromCell(grid, as.integer(to.visit), spatial=TRUE)
        distances <- spDistsN1(to.visit.xy, point, longlat=TRUE) * 1000 # measure distance and convert to meters
        distances <- distances - margin # correct for distance from point to center of cell instead of border
        dist.ok <- which(distances < max.move.distance)
        if (length(dist.ok) == 0) {
          print(paste("Reached maximum distance", point.as.char(points[na.indexes[i]]))) # first print then remove
          break # reached maximum distance
        }
        dist.ok.cells <- as.integer(to.visit)[dist.ok]
        # check if the 1st raster has a value
        dist.ok.notna <- !raster_nas[dist.ok.cells]
        if (length(dist.ok.cells[dist.ok.notna]) > 0) {
          # check if all rasters have a value and pick the closest complete cell
          dist.ok.notna.values <- layers[dist.ok.cells[dist.ok.notna]]
          dist.ok.complete <- complete.cases(dist.ok.notna.values)
          min.cell.index <- which.min(distances[dist.ok][dist.ok.notna][dist.ok.complete])
          closest <- dist.ok.cells[dist.ok.notna][dist.ok.complete][min.cell.index]
          if (length(closest) > 0) {
            not.na.cells <- union(not.na.cells, closest) # update set with not NA cells
            # replace point with new coordinates
            newXY <- xyFromCell(grid, closest, spatial=TRUE)
            newXY@proj4string <- points@proj4string
            print(paste("Moved", point.as.char(points[na.indexes[i]]), "to", point.as.char(newXY)))
            points <- rbind(points[-na.indexes[i]], newXY)
            data[na.indexes[i], lonlat] <- coordinates(newXY)
            data[na.indexes[i], names(layers)] <- dist.ok.notna.values[dist.ok.complete, , drop=FALSE][min.cell.index,]
            break # found the closest value
          }
        }
      }
    }
  }
  # records that still lack environmental data are logged separately and dropped
  write.csv2(data[!complete.cases(data[,names(layers)]),], paste0("4_filter_no_environment/missing_envdata ", row$ScientificName, " ", row$AphiaID, ".csv"), row.names = FALSE)
  data <- data[complete.cases(data[,names(layers)]),]
  write.csv2(data, paste0("4_filter_no_environment/", row$ScientificName, " ", row$AphiaID, ".csv"), row.names = FALSE)
  data
}
behrmann <- CRS("+proj=cea +lon_0=0 +lat_ts=30 +x_0=0 +y_0=0 +datum=WGS84 +ellps=WGS84 +units=m +no_defs")
# create a global equal-area (Behrmann) raster with square cells of the given area (in km2)
create_behrmann_raster <- function(cellsize_squarekm) {
  global_behrmann <- extent(-17367500, 17367500, -7342200, 7342200)
  resolution <- (cellsize_squarekm * 1000000)^0.5 # cell area in m2 -> cell side in m
  raster(global_behrmann, resolution = resolution, crs = behrmann)
}
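# Example: 25 km2 cells correspond to sqrt(25 * 1e6) = 5000 m resolution in the
# projected (metre-based) Behrmann grid.
## r <- create_behrmann_raster(25)
## res(r)  # expected: 5000 5000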
project <- function(data, outputcrs) {
  spTransform(data, outputcrs)
}
# Thin the records to at most one per cell of an equal-area grid
# (by default 25 km2, i.e. 5 x 5 km cells).
filter_equalarea_grid <- function(row, data, proj, equalarea_grid, cellsize_squarekm = 25, path = NULL) {
  print("filter grid")
  wgs84 <- CRS("+proj=longlat +datum=WGS84 +ellps=WGS84 +towgs84=0,0,0")
  d <- data
  # make spatial and project to the equal-area CRS
  coordinates(d) <- lonlat
  d@proj4string <- wgs84
  d <- project(d, proj)
  cells <- cellFromXY(equalarea_grid, d)
  data <- data[!duplicated(cells),] # keep the first record in every grid cell
  if(is.null(path)) {
    write.csv2(data, paste0("5_filter_grid_", cellsize_squarekm, "km2/", row$ScientificName, " ", row$AphiaID, ".csv"), row.names = FALSE)
  } else {
    write.csv2(data, path, row.names = FALSE)
  }
  data
}
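# Example (hypothetical records): two points about 0.7 km apart fall in the same
# 5 x 5 km Behrmann cell, so only the first one is kept. The path argument is
# used here to avoid writing into the default output folder.
## d <- data.frame(species = "x", longitude = c(2.90, 2.91), latitude = c(51.20, 51.20), source = "GBIF")
## filter_equalarea_grid(list(ScientificName = "x", AphiaID = 0), d, behrmann,
##                       create_behrmann_raster(25), path = tempfile(fileext = ".csv"))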
## data <- read.csv2("D:/a/Dropbox/Invasives/Aquarium/Aquarium paper/data/3_future/Acanthophora spicifera 211768.csv", stringsAsFactors = F)
## row <- species[1,]
# Main driver: for every species, combine the records from all sources, filter
# them and write the intermediate result of every step to its own folder.
process_records <- function(species) {
  layers <- get_layers()
  equalarea_grid <- create_behrmann_raster(25) ## 5*5 km grid
  for(i in 1:nrow(species)) {
    row <- species[i,]
    print(paste(row$ScientificName, row$AphiaID))
    # skip species for which the output of the last step already exists
    if(!file.exists(paste0("5_filter_grid_25km2/", row$ScientificName, " ", row$AphiaID, ".csv"))) {
      data <- combine_records(row)
      if(NROW(data) > 0) {
        data <- filter_unique_bad(row, data)
        data <- add_environment(row, data, layers)
        data <- filter_points_without_environment(row, data, layers, max.move.distance=1000)
        data <- filter_equalarea_grid(row, data, behrmann, equalarea_grid)
      } else {
        print("No data")
      }
    }
  }
}
# process_records(species)
library(sdmpredictors)
get_species <- function() {
  species <- read.csv2("../dataprep/MarineSpeed_species.csv", stringsAsFactors=F)
  species
}
# return a column of x if it exists, otherwise the default value
valueOrDefault <- function(x, column, default=NA) {
  if(column %in% colnames(x)) {
    x[,column]
  } else {
    default
  }
}
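# Example: return the requested column when present, the default otherwise.
## valueOrDefault(data.frame(a = 1:2), "a")              # 1 2
## valueOrDefault(data.frame(a = 1:2), "b", default = 0) # 0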
xycols <- c("decimalLongitude", "decimalLatitude") # coordinate columns in the raw source files
lonlat <- c("longitude", "latitude") # coordinate columns used after combining the sources
get_layers <- function() {
  options(sdmpredictors_datadir = "D:/a/projects/predictors/results")
  layers <- list_layers(terrestrial = FALSE)
  write.csv2(layers, "../dataprep/layer_info.csv", row.names = FALSE)
  load_layers(layers, equalarea = FALSE, standardized = FALSE)
}