# MarineSPEED.org: filtering occurrence data and adding environmental data
library(sdmpredictors)
library(raster)
library(sp)
library(bit64)
source("utils.R")
# Keep only records whose OBIS quality control (qc) bitmask has all of the
# requested flags set. Flags 8, 9, 20 and flags outside (1, 30] are ignored.
filter_qc <- function(data, qc) {
  qc <- setdiff(qc, c(8, 9, 20))
  qc <- qc[qc > 1 & qc <= 30]
  qc_col <- data$qc
  # represent both the qc column and the required flags as 64-bit bitstrings
  qc_col <- bit64::as.bitstring(as.integer64(qc_col))
  qc <- bit64::as.bitstring(as.integer64(sum(2^(qc-1))))
  # build a regular expression: required bits stay "1", all other bits match anything
  qc <- gsub("0", ".", qc)
  qc_ok <- grepl(qc, qc_col)
  data[qc_ok,]
}
## for(i in 1:30) { print(paste(NROW(filter_qc(data.frame(qc = 267910719), i)) == 1, i)) }
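# Example (hypothetical qc values): with flags 2-6 requested, a record passes
# only if all of those bits are set in its qc bitmask.
## d <- data.frame(id = 1:3, qc = c(62, 2, 1073741823))
## filter_qc(d, qc = c(1, 2, 3, 4, 5, 6)) # keeps rows 1 and 3 (flag 1 is ignored)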
# Combine the occurrence records of one species from the GBIF, OBIS, RLS,
# INVASIVES and LAB sources into one data.frame and write it to
# ../dataprep/1_combined/. Files are looked up by AphiaID in ../dataprep/<source>/.
combine_records <- function(row) {
  add_data <- function(data, new_data, src) {
    if(NROW(new_data) > 0) {
      new_data <- cbind(species=row$ScientificName, new_data[,xycols], src=src)
      data <- rbind(data, new_data)
    }
    data
  }
  simple_add <- function(data, src) {
    file <- list.files(paste0("../dataprep/", src), paste0("^", row$AphiaID, "[.]csv([.]gz)?$"), full.names = TRUE)
    if (length(file) > 1) stop("More than 1 file found")
    if(length(file) == 1) {
      new_data <- read.csv2(file, stringsAsFactors = FALSE)
      data <- add_data(data, new_data, src)
    }
    data
  }
  print("combine records")
  data <- c()
  gbif_file <- list.files("../dataprep/GBIF", paste0("_", row$AphiaID, "[.]csv[.]gz"), full.names = TRUE)
  if(length(gbif_file) > 1) stop("gbif more than 1 file")
  if(length(gbif_file) == 0) print("gbif not found")
  if(length(gbif_file) == 1) {
    gbif_data <- read.csv2(gbif_file, stringsAsFactors = FALSE)
    data <- add_data(data, gbif_data, "GBIF")
  }
  obis_file <- list.files("../dataprep/OBIS", paste0("^", row$AphiaID, "[.]csv[.]gz"), full.names = TRUE)
  if(length(obis_file) > 1) stop("obis more than 1 file")
  if(length(obis_file) == 0) print("obis not found")
  if(length(obis_file) == 1) {
    obis_data <- read.csv2(obis_file, stringsAsFactors = FALSE)
    obis_data <- filter_qc(obis_data, qc = c(1, 2, 3, 4, 5, 6))
    data <- add_data(data, obis_data, "OBIS")
  }
  data <- simple_add(data, "RLS")
  data <- simple_add(data, "INVASIVES")
  data <- simple_add(data, "LAB")
  if(NROW(data) > 0) {
    colnames(data) <- c("species", lonlat, "source")
    write.csv2(data, paste0("../dataprep/1_combined/", row$ScientificName, " ", row$AphiaID, ".csv"), row.names = FALSE)
  }
  data
}
filter_rounded_coordinates <- function(data, precision=1.0) {
  # precision: 10.0 -> all coordinates rounded to a multiple of 10 are filtered
  # precision: 1.0 -> all coordinates without decimal digits are filtered
  # precision: 0.5 -> all coordinates rounded to a multiple of 0.5 are filtered
  precision_check <- function(values) {
    factor <- 1000000
    pf <- as.integer(precision * factor)
    remainder <- as.integer(values * factor) %% pf
    remainder == 0 | remainder == 1 | remainder == (pf-1) # take rounding errors into account
  }
  if (precision > 0.0) {
    # drop records where both longitude and latitude look rounded
    data <- data[!(precision_check(data[,lonlat[1]]) & precision_check(data[,lonlat[2]])),]
  }
  data
}
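# Example (hypothetical coordinates): with the default precision of 1.0 the first
# record is dropped (both coordinates are whole degrees), the second one is kept.
## d <- data.frame(longitude = c(3, 3.215), latitude = c(51, 51.112))
## filter_rounded_coordinates(d, precision = 1.0)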
filter_unique_bad <- function(row, data) {
  print("filter unique")
  data <- data[!duplicated(data[,lonlat]),] ## remove records with identical coordinates
  data <- filter_rounded_coordinates(data, 1.0)
  write.csv2(data, paste0("2_filter_unique_bad/", row$ScientificName, " ", row$AphiaID, ".csv"), row.names = FALSE)
  data
}
add_environment <- function(row, data, layers) {
  print("add environment")
  # extract raster values once per unique cell, then map them back to all records
  cells <- cellFromXY(layers, data[,lonlat])
  unique_cells <- unique(cells)
  matched_cells <- match(cells, unique_cells)
  environment <- extract(layers, unique_cells)
  data <- cbind(data, environment[matched_cells,])
  write.csv2(data, paste0("3_environment/", row$ScientificName, " ", row$AphiaID, ".csv"), row.names = FALSE)
  data
}
# utility function to allow for returning multiple values
# source https://stat.ethz.ch/pipermail/r-help/2004-June/053343.html
# and http://stackoverflow.com/questions/1826519/function-returning-more-than-one-value
list <- structure(NA, class="result")
"[<-.result" <- function(x, ..., value) {
  args <- as.list(match.call())
  args <- args[-c(1:2, length(args))]
  length(value) <- length(args)
  for(i in seq(along=args)) {
    a <- args[[i]]
    if(!missing(a)) eval.parent(substitute(a <- v, list(a=a, v=value[[i]])))
  }
  x
}
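# Example of the multiple-assignment idiom defined above (values chosen purely
# for illustration): each name on the left receives one element of the result.
## list[lo, hi] <- range(c(4, 8, 15, 16, 23, 42))
## # lo == 4, hi == 42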
# For records whose raster cell has no (complete) environmental data, search the
# surrounding cells and move the record to the closest cell that has values in
# all layers, as long as it lies within max.move.distance (metres). Records that
# cannot be moved are written to a separate "missing_envdata" file and dropped.
filter_points_without_environment <- function(row, data, layers, max.move.distance=1000) {
  points <- data[,lonlat]
  coordinates(points) <- lonlat
  point.as.char <- function(p) { paste0("(", coordinates(p)[1], ", ", coordinates(p)[2], ")") }
  # moves points if necessary based on max.move.distance
  grid <- subset(layers, 1)
  na.indexes <- which(!complete.cases(data[,names(layers)]))
  raster_nas <- is.na(values(raster(layers, layer=1)))
  if(length(na.indexes) > 0) {
    na.points <- points[na.indexes]
    na.cells <- cellFromXY(grid, na.points)
    not.na.cells <- unique(cellFromXY(grid, points[-na.indexes]))
    for (i in length(na.points):1) { # descending order to prevent index issues when removing rows
      cell <- na.cells[i]
      point <- na.points[i]
      # margin: correction for the fact that we measure to the center of the adjacent cells
      margin <- spDists(coordinates(point), (coordinates(point) + (res(grid) / 2.4)), longlat=TRUE) * 1000
      visited <- cell
      repeat {
        # grow the search outward: visit the 4-neighbours of all cells seen so far
        visited.adjacent <- adjacent(grid, as.integer(visited), directions=4, id=FALSE)[,2]
        to.visit <- setdiff(visited.adjacent, visited)
        visited <- union(visited, to.visit)
        to.visit.xy <- xyFromCell(grid, as.integer(to.visit), spatial=TRUE)
        distances <- spDistsN1(to.visit.xy, point, longlat=TRUE) * 1000 # measure distance and convert to meters
        distances <- distances - margin # correct for distance from point to center of cell instead of border
        dist.ok <- which(distances < max.move.distance)
        if (length(dist.ok) == 0) {
          print(paste("Reached maximum distance", point.as.char(points[na.indexes[i]]))) # first print then remove
          break # reached maximum distance
        }
        dist.ok.cells <- as.integer(to.visit)[dist.ok]
        # check if the 1st raster has a value
        dist.ok.notna <- !raster_nas[dist.ok.cells]
        if (length(dist.ok.cells[dist.ok.notna]) > 0) {
          # check if all rasters have a value and pick the closest complete cell
          dist.ok.notna.values <- layers[dist.ok.cells[dist.ok.notna]]
          dist.ok.complete <- complete.cases(dist.ok.notna.values)
          min.cell.index <- which.min(distances[dist.ok][dist.ok.notna][dist.ok.complete])
          closest <- dist.ok.cells[dist.ok.notna][dist.ok.complete][min.cell.index]
          if (length(closest) > 0) {
            not.na.cells <- union(not.na.cells, closest) # update set with not NA cells
            # replace point with new coordinates
            newXY <- xyFromCell(grid, closest, spatial=TRUE)
            newXY@proj4string <- points@proj4string
            print(paste("Moved", point.as.char(points[na.indexes[i]]), "to", point.as.char(newXY)))
            points <- rbind(points[-na.indexes[i]], newXY)
            data[na.indexes[i], lonlat] <- coordinates(newXY)
            data[na.indexes[i], names(layers)] <- dist.ok.notna.values[dist.ok.complete, , drop=FALSE][min.cell.index,]
            break # found the closest value
          }
        }
      }
    }
  }
  # records that still lack environmental data are logged separately and dropped
  write.csv2(data[!complete.cases(data[,names(layers)]),], paste0("4_filter_no_environment/missing_envdata ", row$ScientificName, " ", row$AphiaID, ".csv"), row.names = FALSE)
  data <- data[complete.cases(data[,names(layers)]),]
  write.csv2(data, paste0("4_filter_no_environment/", row$ScientificName, " ", row$AphiaID, ".csv"), row.names = FALSE)
  data
}
behrmann <- CRS("+proj=cea +lon_0=0 +lat_ts=30 +x_0=0 +y_0=0 +datum=WGS84 +ellps=WGS84 +units=m +no_defs")
# create a global equal-area (Behrmann) raster with square cells of the given area (in km2)
create_behrmann_raster <- function(cellsize_squarekm) {
  global_behrmann <- extent(-17367500, 17367500, -7342200, 7342200)
  resolution <- (cellsize_squarekm * 1000000)^0.5 # cell area in m2 -> cell side in m
  raster(global_behrmann, resolution = resolution, crs = behrmann)
}
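# Example: 25 km2 cells correspond to sqrt(25 * 1e6) = 5000 m resolution in the
# projected (metre-based) Behrmann grid.
## r <- create_behrmann_raster(25)
## res(r)  # expected: 5000 5000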
project <- function(data, outputcrs) {
  spTransform(data, outputcrs)
}
# Thin the records to at most one per cell of an equal-area grid
# (by default 25 km2, i.e. 5 x 5 km cells).
filter_equalarea_grid <- function(row, data, proj, equalarea_grid, cellsize_squarekm = 25, path = NULL) {
  print("filter grid")
  wgs84 <- CRS("+proj=longlat +datum=WGS84 +ellps=WGS84 +towgs84=0,0,0")
  d <- data
  # make spatial and project to the equal-area CRS
  coordinates(d) <- lonlat
  d@proj4string <- wgs84
  d <- project(d, proj)
  cells <- cellFromXY(equalarea_grid, d)
  data <- data[!duplicated(cells),] # keep the first record in every grid cell
  if(is.null(path)) {
    write.csv2(data, paste0("5_filter_grid_", cellsize_squarekm, "km2/", row$ScientificName, " ", row$AphiaID, ".csv"), row.names = FALSE)
  } else {
    write.csv2(data, path, row.names = FALSE)
  }
  data
}
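# Example (hypothetical records): two points about 0.7 km apart fall in the same
# 5 x 5 km Behrmann cell, so only the first one is kept. The path argument is
# used here to avoid writing into the default output folder.
## d <- data.frame(species = "x", longitude = c(2.90, 2.91), latitude = c(51.20, 51.20), source = "GBIF")
## filter_equalarea_grid(list(ScientificName = "x", AphiaID = 0), d, behrmann,
##                       create_behrmann_raster(25), path = tempfile(fileext = ".csv"))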
## data <- read.csv2("D:/a/Dropbox/Invasives/Aquarium/Aquarium paper/data/3_future/Acanthophora spicifera 211768.csv", stringsAsFactors = F)
## row <- species[1,]
# Main driver: for every species, combine the records from all sources, filter
# them and write the intermediate result of every step to its own folder.
process_records <- function(species) {
  layers <- get_layers()
  equalarea_grid <- create_behrmann_raster(25) ## 5*5 km grid
  for(i in 1:nrow(species)) {
    row <- species[i,]
    print(paste(row$ScientificName, row$AphiaID))
    # skip species for which the output of the last step already exists
    if(!file.exists(paste0("5_filter_grid_25km2/", row$ScientificName, " ", row$AphiaID, ".csv"))) {
      data <- combine_records(row)
      if(NROW(data) > 0) {
        data <- filter_unique_bad(row, data)
        data <- add_environment(row, data, layers)
        data <- filter_points_without_environment(row, data, layers, max.move.distance=1000)
        data <- filter_equalarea_grid(row, data, behrmann, equalarea_grid)
      } else {
        print("No data")
      }
    }
  }
}
# process_records(species)
library(sdmpredictors)
get_species <- function() {
  species <- read.csv2("../dataprep/MarineSpeed_species.csv", stringsAsFactors=F)
  species
}
# return a column of x if it exists, otherwise the default value
valueOrDefault <- function(x, column, default=NA) {
  if(column %in% colnames(x)) {
    x[,column]
  } else {
    default
  }
}
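# Example: return the requested column when present, the default otherwise.
## valueOrDefault(data.frame(a = 1:2), "a")              # 1 2
## valueOrDefault(data.frame(a = 1:2), "b", default = 0) # 0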
xycols <- c("decimalLongitude", "decimalLatitude") # coordinate columns in the raw source files
lonlat <- c("longitude", "latitude") # coordinate columns used after combining the sources
get_layers <- function() {
  options(sdmpredictors_datadir = "D:/a/projects/predictors/results")
  layers <- list_layers(terrestrial = FALSE)
  write.csv2(layers, "../dataprep/layer_info.csv", row.names = FALSE)
  load_layers(layers, equalarea = FALSE, standardized = FALSE)
}