Skip to content

Instantly share code, notes, and snippets.

@mdsumner
Last active March 30, 2024 08:31
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save mdsumner/d52539bfbe2727df51f45960657a8e1c to your computer and use it in GitHub Desktop.
Save mdsumner/d52539bfbe2727df51f45960657a8e1c to your computer and use it in GitHub Desktop.
dsn <- "/vsicurl/https://data.source.coop/vizzuality/hfp-100/hfp_2021_100m_v1-2_cog.tif"
dsn <- basename(dsn)
info <- vapour::vapour_raster_info(dsn)

library(grout)
g <- grout(info$dimension, info$extent, blocksize = c(2048, 2048))

index <- tile_index(g)

## also create the extents that we want
tiles <- tibble::tibble(dsn = sprintf("vrt://%s?srcwin=%i,%i,%i,%i", dsn, 
                 index$offset_x, index$offset_y, index$ncol, index$nrow), 
                 xmin = index$xmin, xmax = index$xmax, ymin = index$ymin, ymax = index$ymax)


## so here's my thinking, each row in this tibble has a tile we set up and an extent for the vector read
## (I think the vector read is all-inclusive but I will check)
for (i in 1:nrow(tiles)) {
s <- dplyr::sample_n(tiles, 1L)
r <- rast(s$dsn)
vcrs <- crs(vect("pad-mobi.parquet", proxy = TRUE))
## project the extent so we can window read the vector
ex <- ext(project(ext(r), to = vcrs, from = crs(r)))

v <- vect("pad-mobi.parquet", extent = ex)



if (nrow(v) > 0) {
  result <- terra::extract(r, project(v, crs(r)))
  stop()
}
}

It took until i = 13 for me to get a result, that looks like this:

str(result)
'data.frame':   384481 obs. of  2 variables:
 $ ID                    : num  1 2 3 4 5 6 7 8 9 10 ...
 $ hfp_2021_100m_v1-2_cog: int  NA NA NA NA NA NA NA NA NA NA ...
> str(na.omit(result))
'data.frame':   377600 obs. of  2 variables:
 $ ID                    : num  39 39 39 39 39 39 39 39 39 39 ...
 $ hfp_2021_100m_v1-2_cog: int  4435 4432 4432 4408 4336 4222 4072 3895 3715 4012 ...
 - attr(*, "na.action")= 'omit' Named int [1:6881] 1 2 3 4 5 6 7 8 9 10 ...
  ..- attr(*, "names")= chr [1:6881] "1" "2" "3" "4" ...

the sample tile looks like this:

nlist(s)
                                                            dsn
"vrt://hfp_2021_100m_v1-2_cog.tif?srcwin=79872,32768,2048,2048"
                                                           xmin
                                                "-10052895.696"
                                                           xmax
                                                 "-9848095.696"
                                                           ymin
                                                  "5343378.852"
                                                           ymax
                                                  "5548178.852"

@mdsumner
Copy link
Author

here's what one of the tiles looks like

raster extent
image

vector extent (so we are getting inclusive window read)
image

@mdsumner
Copy link
Author

mdsumner commented Mar 29, 2024

update the code above to keep the row id and take the mean

dsn <- "/vsicurl/https://data.source.coop/vizzuality/hfp-100/hfp_2021_100m_v1-2_cog.tif"
dsn <- basename(dsn)
info <- vapour::vapour_raster_info(dsn)
library(terra)
library(grout)  ## hypertidy/grout
## create an abstract tile from this raster, native tile size is 512x512
g <- grout(info$dimension, info$extent, blocksize = c(2048, 2048))
## the materialized index of each tile
index <- tile_index(g)

## also create the extents that we want
tiles <- tibble::tibble(dsn = sprintf("vrt://%s?srcwin=%i,%i,%i,%i", dsn, 
                                      index$offset_x, index$offset_y, index$ncol, index$nrow), 
                        xmin = index$xmin, xmax = index$xmax, ymin = index$ymin, ymax = index$ymax)


## so here's my thinking, each row in this tibble has a tile we set up and an extent for the vector read
## (I think the vector read is all-inclusive but I will check)
for (i in 1:nrow(tiles)) {
  s <- dplyr::sample_n(tiles, 1L)
  r <- rast(s$dsn)
  vcrs <- crs(vect("pad-mobi.parquet", proxy = TRUE))
  ## project the extent so we can window read the vector
  ex <- ext(project(ext(r), to = vcrs, from = crs(r)))
  
  v <- vect("pad-mobi.parquet", extent = ex)
  
  
  
  if (nrow(v) > 0) {
    result <- terra::extract(r, project(v, crs(r)))
    names(result) <- c("ID", "value")
    result$row_n <- v$row_n[result$ID]
    val <- dplyr::group_by(result, row_n) |> dplyr::summarize(value = mean(value,  na.rm = TRUE)) |> dplyr::ungroup()
    
    stop()
  }
}

@mdsumner
Copy link
Author

mdsumner commented Mar 29, 2024

updated code run in parallel

Notes:

  • row_n is an unique global row id, so we keep it to map the result back to the actual polygon
  • we're taking a mean by polygon ID, but we aren't coalescing those (it might not be possible to do that without keeping n-statistics)
dsn <- "/vsicurl/https://data.source.coop/vizzuality/hfp-100/hfp_2021_100m_v1-2_cog.tif"
dsn <- basename(dsn)
info <- vapour::vapour_raster_info(dsn)
library(terra)
library(grout) ## hypertidy/grout
g <- grout(info$dimension, info$extent, blocksize = c(2048, 2048))

index <- tile_index(g)

## inline this into the worker function
#vcrs <- crs(vect("pad-mobi.parquet", proxy = TRUE))

## also create the extents that we want
tiles <- tibble::tibble(dsn = sprintf("vrt://%s?srcwin=%i,%i,%i,%i", dsn, 
                                      index$offset_x, index$offset_y, index$ncol, index$nrow), 
                        xmin = index$xmin, xmax = index$xmax, ymin = index$ymin, ymax = index$ymax)


## so here's my thinking, each row in this tibble has a tile we set up and an extent for the vector read
## ( vector read is all-inclusive, if a polygon overlaps the tile at all you get the whole polygon)
fun <- function(tile) {
vcrs <- "PROJCRS[\"USA_Contiguous_Albers_Equal_Area_Conic_USGS_version\",\n    BASEGEOGCRS[\"NAD83\",\n        DATUM[\"North American Datum 1983\",\n            ELLIPSOID[\"GRS 1980\",6378137,298.257222101,\n                LENGTHUNIT[\"metre\",1]]],\n        PRIMEM[\"Greenwich\",0,\n            ANGLEUNIT[\"degree\",0.0174532925199433]],\n        ID[\"EPSG\",4269]],\n    CONVERSION[\"USA_Contiguous_Albers_Equal_Area_Conic_USGS_version\",\n        METHOD[\"Albers Equal Area\",\n            ID[\"EPSG\",9822]],\n        PARAMETER[\"Latitude of false origin\",23,\n            ANGLEUNIT[\"degree\",0.0174532925199433],\n            ID[\"EPSG\",8821]],\n        PARAMETER[\"Longitude of false origin\",-96,\n            ANGLEUNIT[\"degree\",0.0174532925199433],\n            ID[\"EPSG\",8822]],\n        PARAMETER[\"Latitude of 1st standard parallel\",29.5,\n            ANGLEUNIT[\"degree\",0.0174532925199433],\n            ID[\"EPSG\",8823]],\n        PARAMETER[\"Latitude of 2nd standard parallel\",45.5,\n            ANGLEUNIT[\"degree\",0.0174532925199433],\n            ID[\"EPSG\",8824]],\n        PARAMETER[\"Easting at false origin\",0,\n            LENGTHUNIT[\"metre\",1],\n            ID[\"EPSG\",8826]],\n        PARAMETER[\"Northing at false origin\",0,\n            LENGTHUNIT[\"metre\",1],\n            ID[\"EPSG\",8827]]],\n    CS[Cartesian,2],\n        AXIS[\"(E)\",east,\n            ORDER[1],\n            LENGTHUNIT[\"metre\",1]],\n        AXIS[\"(N)\",north,\n            ORDER[2],\n            LENGTHUNIT[\"metre\",1]],\n    USAGE[\n        SCOPE[\"Not known.\"],\n        AREA[\"United States (USA) - CONUS onshore - Alabama; Arizona; Arkansas; California; Colorado; Connecticut; Delaware; Florida; Georgia; Idaho; Illinois; Indiana; Iowa; Kansas; Kentucky; Louisiana; Maine; Maryland; Massachusetts; Michigan; Minnesota; Mississippi; Missouri; Montana; Nebraska; Nevada; New Hampshire; New Jersey; New Mexico; New York; North Carolina; North Dakota; Ohio; Oklahoma; Oregon; Pennsylvania; Rhode Island; South Carolina; South Dakota; Tennessee; Texas; Utah; Vermont; Virginia; Washington; West Virginia; Wisconsin; Wyoming.\"],\n        BBOX[24.41,-124.79,49.38,-66.91]],\n    ID[\"ESRI\",102039]]"
  
  s <- tile
  library(terra)
  library(dplyr)
  r <- terra::rast(s$dsn)
  ## project the extent so we can window read the vector
  ex <- ext(project(ext(r), to = vcrs, from = crs(r)))
  
  v <- vect("pad-mobi.parquet", extent = ex)
  
  if (nrow(v) > 0) {
    result <- terra::extract(r, project(v, crs(r)))
    names(result) <- c("ID", "value")
    result$row_n <- v$row_n[result$ID]
    val <- dplyr::group_by(result, row_n) |> dplyr::summarize(value = mean(value,  na.rm = TRUE)) |> dplyr::ungroup()
    return(val)    
  }
  return(NULL)
}

library(parallel)
cl <- makeCluster(64)
sl <- system.time({
x <<- clusterMap(cl, fun, split(tiles, 1:nrow(tiles)))
})




@mdsumner
Copy link
Author

mdsumner commented Mar 29, 2024

followup suggestions:

for using local files

  • project the vector to the raster first, use "ogr2ogr vectormatchraster.parquet -t_srs "
  • DONE, SEE BELOW: cull the tiles table from the extent of the (re)projected vector layer
  • could potentially tile the vector to the desired raster blocks

for using the remote urls

  • could stream the reprojection from the source url with a bit of crafty GDAL VRT as well as the projected spat extent

@mdsumner
Copy link
Author

mdsumner commented Mar 30, 2024

cull the tiles upfront (there are 450 of them affected compared to nearly 16000 in the entire 2048x2048 view

Now takes 5 minutes, 64 cores.

dsn <- "/vsicurl/https://data.source.coop/vizzuality/hfp-100/hfp_2021_100m_v1-2_cog.tif"
dsn <- basename(dsn)
info <- vapour::vapour_raster_info(dsn)
library(terra)
library(grout) ## hypertidy/grout
g <- grout(info$dimension, info$extent, blocksize = c(2048, 2048))

index <- tile_index(g)

## also create the extents that we want
tiles <- tibble::tibble(dsn = sprintf("vrt://%s?srcwin=%i,%i,%i,%i", dsn, 
                                      index$offset_x, index$offset_y, index$ncol, index$nrow), 
                        xmin = index$xmin, xmax = index$xmax, ymin = index$ymin, ymax = index$ymax)
vcrs <- "PROJCRS[\"USA_Contiguous_Albers_Equal_Area_Conic_USGS_version\",\n    BASEGEOGCRS[\"NAD83\",\n        DATUM[\"North American Datum 1983\",\n            ELLIPSOID[\"GRS 1980\",6378137,298.257222101,\n                LENGTHUNIT[\"metre\",1]]],\n        PRIMEM[\"Greenwich\",0,\n            ANGLEUNIT[\"degree\",0.0174532925199433]],\n        ID[\"EPSG\",4269]],\n    CONVERSION[\"USA_Contiguous_Albers_Equal_Area_Conic_USGS_version\",\n        METHOD[\"Albers Equal Area\",\n            ID[\"EPSG\",9822]],\n        PARAMETER[\"Latitude of false origin\",23,\n            ANGLEUNIT[\"degree\",0.0174532925199433],\n            ID[\"EPSG\",8821]],\n        PARAMETER[\"Longitude of false origin\",-96,\n            ANGLEUNIT[\"degree\",0.0174532925199433],\n            ID[\"EPSG\",8822]],\n        PARAMETER[\"Latitude of 1st standard parallel\",29.5,\n            ANGLEUNIT[\"degree\",0.0174532925199433],\n            ID[\"EPSG\",8823]],\n        PARAMETER[\"Latitude of 2nd standard parallel\",45.5,\n            ANGLEUNIT[\"degree\",0.0174532925199433],\n            ID[\"EPSG\",8824]],\n        PARAMETER[\"Easting at false origin\",0,\n            LENGTHUNIT[\"metre\",1],\n            ID[\"EPSG\",8826]],\n        PARAMETER[\"Northing at false origin\",0,\n            LENGTHUNIT[\"metre\",1],\n            ID[\"EPSG\",8827]]],\n    CS[Cartesian,2],\n        AXIS[\"(E)\",east,\n            ORDER[1],\n            LENGTHUNIT[\"metre\",1]],\n        AXIS[\"(N)\",north,\n            ORDER[2],\n            LENGTHUNIT[\"metre\",1]],\n    USAGE[\n        SCOPE[\"Not known.\"],\n        AREA[\"United States (USA) - CONUS onshore - Alabama; Arizona; Arkansas; California; Colorado; Connecticut; Delaware; Florida; Georgia; Idaho; Illinois; Indiana; Iowa; Kansas; Kentucky; Louisiana; Maine; Maryland; Massachusetts; Michigan; Minnesota; Mississippi; Missouri; Montana; Nebraska; Nevada; New Hampshire; New Jersey; New Mexico; New York; North Carolina; North Dakota; Ohio; Oklahoma; Oregon; Pennsylvania; Rhode Island; South Carolina; South Dakota; Tennessee; Texas; Utah; Vermont; Virginia; Washington; West Virginia; Wisconsin; Wyoming.\"],\n        BBOX[24.41,-124.79,49.38,-66.91]],\n    ID[\"ESRI\",102039]]"

## cull tiles to the overal vector extent

vex <- as.vector(ext(project(ext(vapour::vapour_layer_extent("pad-mobi.parquet")), to = info$projection, from = vcrs)))
inside <- tiles$xmin >= vex[1] & tiles$xmax <= vex[2] & tiles$ymin >= vex[3] & tiles$ymax <= vex[4] 

tiles <- tiles[inside, ]
#vcrs <- crs(vect("pad-mobi.parquet", proxy = TRUE))

## so here's my thinking, each row in this tibble has a tile we set up and an extent for the vector read
## (I think the vector read is all-inclusive but I will check)
fun <- function(tile) {
  vcrs <- "PROJCRS[\"USA_Contiguous_Albers_Equal_Area_Conic_USGS_version\",\n    BASEGEOGCRS[\"NAD83\",\n        DATUM[\"North American Datum 1983\",\n            ELLIPSOID[\"GRS 1980\",6378137,298.257222101,\n                LENGTHUNIT[\"metre\",1]]],\n        PRIMEM[\"Greenwich\",0,\n            ANGLEUNIT[\"degree\",0.0174532925199433]],\n        ID[\"EPSG\",4269]],\n    CONVERSION[\"USA_Contiguous_Albers_Equal_Area_Conic_USGS_version\",\n        METHOD[\"Albers Equal Area\",\n            ID[\"EPSG\",9822]],\n        PARAMETER[\"Latitude of false origin\",23,\n            ANGLEUNIT[\"degree\",0.0174532925199433],\n            ID[\"EPSG\",8821]],\n        PARAMETER[\"Longitude of false origin\",-96,\n            ANGLEUNIT[\"degree\",0.0174532925199433],\n            ID[\"EPSG\",8822]],\n        PARAMETER[\"Latitude of 1st standard parallel\",29.5,\n            ANGLEUNIT[\"degree\",0.0174532925199433],\n            ID[\"EPSG\",8823]],\n        PARAMETER[\"Latitude of 2nd standard parallel\",45.5,\n            ANGLEUNIT[\"degree\",0.0174532925199433],\n            ID[\"EPSG\",8824]],\n        PARAMETER[\"Easting at false origin\",0,\n            LENGTHUNIT[\"metre\",1],\n            ID[\"EPSG\",8826]],\n        PARAMETER[\"Northing at false origin\",0,\n            LENGTHUNIT[\"metre\",1],\n            ID[\"EPSG\",8827]]],\n    CS[Cartesian,2],\n        AXIS[\"(E)\",east,\n            ORDER[1],\n            LENGTHUNIT[\"metre\",1]],\n        AXIS[\"(N)\",north,\n            ORDER[2],\n            LENGTHUNIT[\"metre\",1]],\n    USAGE[\n        SCOPE[\"Not known.\"],\n        AREA[\"United States (USA) - CONUS onshore - Alabama; Arizona; Arkansas; California; Colorado; Connecticut; Delaware; Florida; Georgia; Idaho; Illinois; Indiana; Iowa; Kansas; Kentucky; Louisiana; Maine; Maryland; Massachusetts; Michigan; Minnesota; Mississippi; Missouri; Montana; Nebraska; Nevada; New Hampshire; New Jersey; New Mexico; New York; North Carolina; North Dakota; Ohio; Oklahoma; Oregon; Pennsylvania; Rhode Island; South Carolina; South Dakota; Tennessee; Texas; Utah; Vermont; Virginia; Washington; West Virginia; Wisconsin; Wyoming.\"],\n        BBOX[24.41,-124.79,49.38,-66.91]],\n    ID[\"ESRI\",102039]]"
  s <- tile
  library(terra)
  library(dplyr)
  r <- terra::rast(s$dsn)
 
  ## project the extent so we can window read the vector
  ex <- ext(project(ext(r), to = vcrs, from = crs(r)))
  
  v <- vect("pad-mobi.parquet", extent = ex)
  
  if (nrow(v) > 0) {
    result <- terra::extract(r, project(v, crs(r)))
    names(result) <- c("ID", "value")
    result$row_n <- v$row_n[result$ID]
    val <- dplyr::group_by(result, row_n) |> dplyr::summarize(value = mean(value,  na.rm = TRUE)) |> dplyr::ungroup()
    return(val)    
  }
  return(NULL)
}

library(parallel)
cl <- makeCluster(64)
sl <- system.time({
x <<- clusterMap(cl, fun, split(tiles, 1:nrow(tiles)))
})

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment