obrl-soil/poore_me.R

## poore_me.R
# faffing about with methods to approximate a "lag vertex polygon" (Poore, 1986)
# or "best-fit regular geometric feature" (Stoddard, 1965) for polygon shape
# metric

# References:
# Poore 1986 - MSc Thesis on soil map quantification at
# https://ir.library.oregonstate.edu/downloads/zk51vk04q
# Stoddart, D. (1965). The shape of atolls. Marine Geology, 3(5), 369–383.
# doi:10.1016/0025-3227(65)90025-3 (its on scihub)
# also apparently Fridland 1972 "The pattern of the soil cover" but that's
# not online ;_;

# basically this is looking for a single-polygon simplification that is a)
# as close to n vertices as possible and b) as close to equidistant sides as
# possible. It would appear to assume a relatively high-detail polygon as input,
# with no islands and holes ignored if present.

library(sf)
library(tidyverse)

nc = st_read(system.file("shape/nc.shp", package="sf"))

nc1 <- nc[1, ]
nc1 <- st_transform(nc1, 3857) # shhh don't need max precision, just meters

st_cast(nc1, 'POINT') %>% nrow() # 27 points natively

# add points to a certain level of detail (0.5 km here)
nc1d <- st_segmentize(nc1, 500)
plot(st_cast(nc1d[0], 'POINT'), pch = 20, col = 'red')

# filter to get a set of points x dist apart (20 * 500 here - 10 km)
# but this is counted along the perimeter so the points aren't really
# equidistant as required. More problematic as shape gets more convoluted...

nc1_lp <- st_cast(nc1d, 'POINT') %>%
  # may as well start from a random vertex:
  filter((row_number() + sample(seq(nrow(.)), 1)) %% 20 == 0)

plot(nc1[0], reset = FALSE, border = 'grey60')
plot(nc1_lp[0], pch = 20, col = 'red', add = TRUE)

# surprisingly shape preserving although obs not foolproof
# defo only for simple polygons! try nc[4, ] haha

# back to poly with
lag_ish_poly <-  st_coordinates(nc1_lp) %>%
  rbind(., .[1, ]) %>%
  list(.) %>%
  st_polygon()

plot(lag_ish_poly, col = NA, border = 'red', add = TRUE)

# to get the lag sums after Stoddard's description:
## "The polygon thus formed can be described by summing the distances between
## all the vertices lag-one, lag-two,..., until all unique sums are determined;
## and by squaring and then summing the distances between all vertices lag-one,
## lag-two..., until all unique squared sums are determined. Each shape will
## then have a one-to-one correspondence with one of these sets of sums"
# and
## "Distances lag-one (S1) are given by AC, BD, CE,... HB, i.e., one vertex is
## skipped. Distances lag-two (S2) are given by AD, BE, CF:... HC; i.e., two
## vertices are skipped. Distances lag-three (S3) are given by AE, BF, CG,...
## HD; i.e., three vertices are skipped"
# Poore generalises this to (n-2/2) lag steps, where n is polygon vertices. This
# conveniently(?) gave him ten lag measurements for each 22-sided shape.

lagsums <- function(pts = NULL) {
  nr <- nrow(pts)
  lags <- map_dbl(seq(2, (nr - 2)/2), function(l) {
    # subset in a loop
    start   <- pts[1, ]
    mids    <- pts[seq(nr) %% l == 0, ]
    end     <- pts[l, ]
    lagset  <- rbind(start, mids)
    lagset1 <- rbind(mids, end)
    lag_d   <- as.numeric(st_distance(lagset, lagset1, by_element = TRUE))
    mean(lag_d) # gets around lack of perf. equal side dist
  })
  matrix(c(lags, lags^2), ncol = 2, byrow = FALSE)
}

# anyway apparently this encodes the shape to a given precision...
lagsums(nc1_lp)

df <- as.data.frame(lagsums(nc1_lp)) %>%
  mutate(LAG = seq(n()) + 1)

# but they don't plot to a nice smooth curve like the examples in Poore (p.37ish)
ggplot(df, aes(x = LAG, y = V1)) +
  geom_line() +
  theme_minimal()

# what happens if one segmentises and then runs ms_simplify, not worrying about
# edge length e.g.
nc1_mss <-
  rmapshaper::ms_simplify(nc1d,
                         # gets as close to n vertices as possible:
                         keep = 1 / st_cast(nc1d, 'POINT') %>% nrow() * 25) %>%
  st_cast(., 'POINT')


plot(nc1[0], reset = FALSE, border = 'grey60')
plot(nc1_mss[0], pch = 20, col = 'red', add = TRUE)

# messier!
lagsums(nc1_mss)

df2 <- as.data.frame(lagsums(nc1_mss)) %>%
  mutate(LAG = seq(n()) + 1)

ggplot(df2, aes(x = LAG, y = V1)) +
  geom_line() +
  theme_minimal()

# anyway tbh there are simpler measures of shape/form/complexity but this
# is still intriguing
	# faffing about with methods to approximate a "lag vertex polygon" (Poore, 1986)
	# or "best-fit regular geometric feature" (Stoddard, 1965) for polygon shape
	# metric

	# References:
	# Poore 1986 - MSc Thesis on soil map quantification at
	# https://ir.library.oregonstate.edu/downloads/zk51vk04q
	# Stoddart, D. (1965). The shape of atolls. Marine Geology, 3(5), 369–383.
	# doi:10.1016/0025-3227(65)90025-3 (its on scihub)
	# also apparently Fridland 1972 "The pattern of the soil cover" but that's
	# not online ;_;

	# basically this is looking for a single-polygon simplification that is a)
	# as close to n vertices as possible and b) as close to equidistant sides as
	# possible. It would appear to assume a relatively high-detail polygon as input,
	# with no islands and holes ignored if present.

	library(sf)
	library(tidyverse)

	nc = st_read(system.file("shape/nc.shp", package="sf"))

	nc1 <- nc[1, ]
	nc1 <- st_transform(nc1, 3857) # shhh don't need max precision, just meters

	st_cast(nc1, 'POINT') %>% nrow() # 27 points natively

	# add points to a certain level of detail (0.5 km here)
	nc1d <- st_segmentize(nc1, 500)
	plot(st_cast(nc1d[0], 'POINT'), pch = 20, col = 'red')

	# filter to get a set of points x dist apart (20 * 500 here - 10 km)
	# but this is counted along the perimeter so the points aren't really
	# equidistant as required. More problematic as shape gets more convoluted...

	nc1_lp <- st_cast(nc1d, 'POINT') %>%
	# may as well start from a random vertex:
	filter((row_number() + sample(seq(nrow(.)), 1)) %% 20 == 0)

	plot(nc1[0], reset = FALSE, border = 'grey60')
	plot(nc1_lp[0], pch = 20, col = 'red', add = TRUE)

	# surprisingly shape preserving although obs not foolproof
	# defo only for simple polygons! try nc[4, ] haha

	# back to poly with
	lag_ish_poly <- st_coordinates(nc1_lp) %>%
	rbind(., .[1, ]) %>%
	list(.) %>%
	st_polygon()

	plot(lag_ish_poly, col = NA, border = 'red', add = TRUE)

	# to get the lag sums after Stoddard's description:
	## "The polygon thus formed can be described by summing the distances between
	## all the vertices lag-one, lag-two,..., until all unique sums are determined;
	## and by squaring and then summing the distances between all vertices lag-one,
	## lag-two..., until all unique squared sums are determined. Each shape will
	## then have a one-to-one correspondence with one of these sets of sums"
	# and
	## "Distances lag-one (S1) are given by AC, BD, CE,... HB, i.e., one vertex is
	## skipped. Distances lag-two (S2) are given by AD, BE, CF:... HC; i.e., two
	## vertices are skipped. Distances lag-three (S3) are given by AE, BF, CG,...
	## HD; i.e., three vertices are skipped"
	# Poore generalises this to (n-2/2) lag steps, where n is polygon vertices. This
	# conveniently(?) gave him ten lag measurements for each 22-sided shape.

	lagsums <- function(pts = NULL) {
	nr <- nrow(pts)
	lags <- map_dbl(seq(2, (nr - 2)/2), function(l) {
	# subset in a loop
	start <- pts[1, ]
	mids <- pts[seq(nr) %% l == 0, ]
	end <- pts[l, ]
	lagset <- rbind(start, mids)
	lagset1 <- rbind(mids, end)
	lag_d <- as.numeric(st_distance(lagset, lagset1, by_element = TRUE))
	mean(lag_d) # gets around lack of perf. equal side dist
	})
	matrix(c(lags, lags^2), ncol = 2, byrow = FALSE)
	}

	# anyway apparently this encodes the shape to a given precision...
	lagsums(nc1_lp)

	df <- as.data.frame(lagsums(nc1_lp)) %>%
	mutate(LAG = seq(n()) + 1)

	# but they don't plot to a nice smooth curve like the examples in Poore (p.37ish)
	ggplot(df, aes(x = LAG, y = V1)) +
	geom_line() +
	theme_minimal()

	# what happens if one segmentises and then runs ms_simplify, not worrying about
	# edge length e.g.
	nc1_mss <-
	rmapshaper::ms_simplify(nc1d,
	# gets as close to n vertices as possible:
	keep = 1 / st_cast(nc1d, 'POINT') %>% nrow() * 25) %>%
	st_cast(., 'POINT')


	plot(nc1[0], reset = FALSE, border = 'grey60')
	plot(nc1_mss[0], pch = 20, col = 'red', add = TRUE)

	# messier!
	lagsums(nc1_mss)

	df2 <- as.data.frame(lagsums(nc1_mss)) %>%
	mutate(LAG = seq(n()) + 1)

	ggplot(df2, aes(x = LAG, y = V1)) +
	geom_line() +
	theme_minimal()

	# anyway tbh there are simpler measures of shape/form/complexity but this
	# is still intriguing