EPM contour algorithm racing
# empirical performance model (EPM) contour active learning
I have two or three different algorithms for finding the closest pair of points between two point sets.
I want to know which one is faster, depending on the size of the point sets and the distance between them.
There are several parts to this project:
- [ ] level set method for contour curvature and normal vector / directional derivative of the EPM
- [ ] debug the distance transform (`closest_pairs_dt` below is marked wrong)
- [ ] caret::knn.reg if it exists?
- [ ] shape constrained splines (SCAM) instead of mgcv::gam?
- [ ] `cma_es` might not be necessary: a gradient likely exists, and L-BFGS-B is probably better with an analytical gradient. From the `predict.gam` documentation: `The routine can optionally return the matrix by which the model coefficients must be pre-multiplied in order to yield the values of the linear predictor at the supplied covariate values: this is useful for obtaining credible regions for quantities derived from the model (e.g. derivatives of smooths)`. See the sketch after this list.
- [ ] it is also possible that the algorithms do not all need to be sampled at the same locations. With M different algorithms, I only need to sample where the EPMs cannot yet say that one is better than the other.
- [ ] <https://cran.r-project.org/web/packages/shapr/index.html> `ref/1903.10464_shapley.pdf`
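A minimal sketch of the `lpmatrix` idea, under the assumption that the two fitted models are `ms[[1]]` and `ms[[2]]` from the active-learning chunk further down (so this chunk is not evaluated here). Finite-differencing the `lpmatrix` in each covariate gives the gradient of the fitted time difference, which could be passed as `gr=` to `optim(method = "L-BFGS-B")`; the `se.fit` part of the acquisition criterion is not differentiated here.
```{r, eval=FALSE}
# gradient of predict(ms[[1]]) - predict(ms[[2]]) at par = c(n, delta, width),
# by finite-differencing the lpmatrix (the matrix Xp such that Xp %*% coef(m)
# is the linear predictor) in each covariate
epm_diff_grad <- function(par, h = 1e-3) {
  nd0 <- data.frame(n = par[1], delta = par[2], width = par[3])
  grad_one <- function(m) {
    X0 <- predict(m, newdata = nd0, type = "lpmatrix")
    sapply(seq_along(par), function(j) {
      ndj <- nd0
      ndj[[j]] <- ndj[[j]] + h
      Xj <- predict(m, newdata = ndj, type = "lpmatrix")
      drop((Xj - X0) %*% coef(m)) / h
    })
  }
  grad_one(ms[[1]]) - grad_one(ms[[2]])
}
# could be supplied as gr = epm_diff_grad to optim(..., method = "L-BFGS-B")
```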
```{r}
library(pacman)
p_load(tidyverse, rextendr, glue, assertthat, tictoc,
       emstreeR, lhs, ggplot2, parallel, mgcv, cmaes, ggforce, tripack, RANN,
       caret)
if (!exists("lib_mtime") || lib_mtime < file.info("lib.rs")$mtime) {
  # recompile and load the rust code the first time, or if it has changed,
  # since cargo etc. take at least 1 second
  source("rust_source.R")
  rust_source("lib.rs")
  lib_mtime <- file.info("lib.rs")$mtime
}
# RANN's nn2 is by far the fastest
closest_pairs_nn <- function(p, q) {
  r <- nn2(p, q, k = 1)
  within(list(iq = which.min(r$nn.dists)), {
    ip <- r$nn.idx[iq]
    d <- r$nn.dists[iq]
  })
}
# tripack1 is 10 times slower than emstreeR
# tripack is 4 times slower than emstreeR
closest_pairs_tripack1 <- function(p, q) {
  tri <- tri.mesh(p$x, p$y)
  q %>%
    mutate(iq = seq_len(n())) %>%
    rowwise %>%
    group_map(~ {
      ns <- tri.find(tri, .x$x, .x$y) %>% as.integer
      ds <- (tri$x[ns] - .x$x)^2 + (tri$y[ns] - .x$y)^2
      within(.x, {
        ip <- ns[which.min(ds)]
        d <- sqrt(min(ds))
      })
    }) %>%
    bind_rows %>%
    { .[which.min(.$d), ] }
}
mkP <- function(n) randomLHS(n, 2) %>% as_tibble %>% setNames(c("x", "y"))
pq <- list(mkP(1000), mkP(1000))
# closest pair algorithm for integer points
# https://en.wikipedia.org/wiki/Closest_pair_of_points_problem#Linear-time_randomized_algorithms
# Rabin 1976 is for a single point set.
#
# This adaptation to pairs of point sets is worst-case O(n^2) because
# when the two sets are very far apart, d is large and then
# all points are in 1-3 buckets, and then we have to inspect all pairs.
#
# But if the two sets overlap, then d is small and it should be closer to O(n)
closest_pairs_hash <- function(p, q) {
  np <- nrow(p)
  nq <- nrow(q)
  assert_that(np > 0, nq > 0)
  n <- min(np, nq)
  # initial d: the smallest distance among n random p-q pairs
  sp <- sample(n, replace = TRUE)
  sq <- sample(n, replace = TRUE)
  d <- (p[sp, c('x', 'y')] - q[sq, c('x', 'y')])^2 %>% apply(1, sum) %>% sqrt %>% min
  closest_in_buckets(p$x, p$y, q$x, q$y, d)
}
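# Illustrative R sketch of the bucket step. The real closest_in_buckets lives in
# lib.rs (not shown), so this is only my reading of the algorithm, not the actual
# implementation: hash each point into a grid of cell size d, then compare each q
# point only against p points in its own and the 8 adjacent cells.
closest_in_buckets_r <- function(px, py, qx, qy, d) {
  key <- function(x, y) paste(floor(x / d), floor(y / d))
  p_buckets <- split(seq_along(px), key(px, py))
  best <- list(d = Inf, ip = NA_integer_, iq = NA_integer_)
  for (iq in seq_along(qx)) {
    cx <- floor(qx[iq] / d)
    cy <- floor(qy[iq] / d)
    for (ox in -1:1) for (oy in -1:1) {
      # missing buckets give NULL, so the inner loop runs zero times
      for (ip in p_buckets[[paste(cx + ox, cy + oy)]]) {
        dij <- sqrt((px[ip] - qx[iq])^2 + (py[ip] - qy[iq])^2)
        if (dij < best$d) best <- list(d = dij, ip = ip, iq = iq)
      }
    }
  }
  best
}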
# closest pair via emstreeR
#
# let E be an edge in the emst between (disjoint) p and q
# let E* be the shortest edge between p and q
# if E* was not in emst, we could
# delete E and insert E* to get a shorter emst
# which is a contradiction
#
# EMST takes O(n log n) time
# inspecting each edge takes O(n) time
# this is faster than the closest_in_buckets if the point sets are far apart
# if they are close together, this is slower
# there is a cutoff point where this is faster
# the criterion is a function of the number of buckets that are adjacent
# but that in turn is a function of the number of buckets
# which in turn is a function of d which should be scaled to some measure
# of the point set size such as the distance between centroids
# this seems worth benchmarking to find the best cutoff for
# point sets of varying size and separation
closest_pairs_emst <- function(p, q) {
  np <- nrow(p)
  nq <- nrow(q)
  assert_that(np > 0, nq > 0)
  pq <- rbind(p[, c("x", "y")], q[, c("x", "y")])
  emst <- ComputeMST(pq)
  # inspect each edge in emst$edges: the shortest edge with one endpoint in each set is the answer
  closest_pairs_emst_walk(as.integer(emst$from), as.integer(emst$to), emst$dist, as.integer(np)) %>%
    setNames(c("d", "ip", "iq"))
}
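# Illustrative R version of the edge walk; closest_pairs_emst_walk itself is in
# lib.rs (not shown). Assumes 1-based vertex indices where rows 1..np of pq come
# from p and rows np+1.. come from q; the Rust version may index differently.
closest_pairs_emst_walk_r <- function(from, to, dist, np) {
  cross <- (from <= np) != (to <= np)        # edges with one endpoint in each set
  i <- which(cross)[which.min(dist[cross])]  # shortest such edge
  list(d = dist[i],
       ip = min(from[i], to[i]),             # the p endpoint has the smaller index
       iq = max(from[i], to[i]) - np)
}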
tic()
do.call(closest_pairs_nn, pq)
toc()
tic()
do.call(closest_pairs_emst, pq)
toc()
# closest pairs via distance transform (wrong)
closest_pairs_dt <- function(p, q) {
  dx <- min(c(p$x, q$x))
  dy <- min(c(p$y, q$y))
  px <- p$x - dx
  py <- p$y - dy
  qx <- q$x - dx
  qy <- q$y - dy
  assert_that(is.integer(px), is.integer(py), is.integer(qx), is.integer(qy),
              all(px >= 0L), all(py >= 0L), all(qx >= 0L), all(qy >= 0L))
  closest_dt(px, py, qx, qy)
}
methods <- c("closest_pairs_hash", "closest_pairs_emst")
```
```{r}
mk_x <- function(n = 2000, delta = 0, width = 100, dim = 2) {
  a <- width * randomLHS(n, dim)
  b <- width * randomLHS(n, dim)
  mode(b) <- mode(a) <- "integer"
  colnames(a) <- colnames(b) <- c("x", "y")
  a[, 'x'] <- a[, 'x'] + as.integer(delta)
  list(as_tibble(a, .name_repair = "unique"), as_tibble(b, .name_repair = "unique"))
}
# race(fs, x) races the functions in fs on the data in x. fs can either be a
# named list of functions, or a list of strings which must be the names of
# functions. In other words, each entry is run as do.call(fs[[i]], x).
#
# Losers of the race are disqualified if they take 1+disqualification times
# the winner's time.
#
# polling_period is the time in seconds to wait between checking for the winner.
#
# the result is a tibble with columns
#
## A tibble: 3 × 8
# f crashed user.self sys.self elapsed user.child sys.child killed
# <chr> <lgl> <dbl> <dbl> <dbl> <dbl> <dbl> <lgl>
# 1 timeout FALSE 0.001 0 0.0210 0 0 FALSE
# 2 closest_pairs_hash FALSE 0.032 0.01 0.0420 0 0 FALSE
# 3 closest_pairs_emst FALSE NA NA 0.0252 NA NA TRUE
race <- function(fs, x, polling_period = 0.05, disqualification = 0.2, timeout = NA) {
  do.call(assert_that, c(is.character(fs) || is.list(fs),
                         is.list(x),
                         imap(fs, ~ {
                           is.character(.y) && (is.function(.x) || exists(.x, mode = "function")) ||
                             is.character(.x) && exists(.x, mode = "function")
                         })))
  if (!is.na(timeout)) fs <- c(timeout = function(...) Sys.sleep(timeout), as.list(fs))
  # fs can also be a list of functions, in which case names(fs) supplies the f column
  ps <- imap(fs, ~ {
    f_str <- if (is.character(.x)) .x else .y
    mcparallel({
      t0 <- proc.time()
      crashed <- tryCatch({ do.call(.x, x); FALSE },
                          error = function(e) TRUE)
      t1 <- proc.time()
      c(list(f = f_str,
             crashed = crashed),
        map2(t1, t0, `-`))
    }, f_str)
  })
  complete <- list()
  while (TRUE) {
    # check each uncompleted process in ps once every polling_period
    if (length(ps) == 0) break
    complete1 <- map(ps, ~ mccollect(., wait = FALSE))
    nulls <- map_lgl(complete1, is.null)
    complete <- c(complete, complete1[!nulls])
    ps <- ps[nulls]
    crashes <- complete1[!nulls] %>% map_lgl(~ .[[1]]$crashed)
    if (any(!crashes)) break
    Sys.sleep(polling_period)
  }
  # either all processes crashed or one completed
  dt <- map_dbl(complete, pluck, 1, "elapsed", .default = Inf) %>% min(na.rm = TRUE)
  complete2 <- mccollect(ps, wait = FALSE, timeout = dt)
  walk(ps, parallel:::mckill, signal = 9) # or signal=15? Does it need to run on parallel:::children()?
  ps <- if (is.null(complete2)) ps else ps[map_lgl(complete2, is.null)]
  t_max <- (1 + disqualification) * dt
  rbind(map_dfr(complete, pluck, 1) %>% mutate(killed = FALSE),
        bind_rows(complete2) %>% mutate(killed = FALSE),
        map_dfr(ps, ~ tibble(f = .$name,
                             elapsed = NA,
                             user.self = NA,
                             sys.self = NA,
                             user.child = NA,
                             sys.child = NA,
                             crashed = FALSE,
                             killed = TRUE))) %>%
    # replace NA in elapsed with the maximum allowed time
    mutate_at("elapsed", ~ ifelse(is.na(.), max(., t_max, na.rm = TRUE), .))
}
# race(c("closest_pairs_hash", "closest_pairs_emst"), mk_x(), timeout=0.02)
# initial space-filling design
d <- xfun::cache_rds({
  randomLHS(500, 3) %>%
    as_tibble() %>%
    transmute(n = V1 * 10000 + 20,
              width = V2 * 1000 + 20,
              delta = V3 * 100) %>%
    rowwise %>%
    mutate(r = list(race(methods, mk_x(n, delta, width)))) %>%
    unnest(r)
})
```
```{r}
ggplot(d %>% filter(!killed), aes(n, elapsed, col = factor(f))) +
  geom_point(aes(size = width), alpha = 0.5) +
  geom_smooth() +
  facet_wrap(~ cut(delta, 3), labeller = "label_both") +
  ggtitle("elapsed time vs n, delta, width")
```
```{r}
# fit gam s(n, delta, width) per algorithm, then use CMA-ES to find the point
# where |fitted difference| - (sum of standard errors) is smallest, i.e. where
# the EPMs are least able to tell the algorithms apart, sample there, and repeat
par0 <- c(1000, 500, 50)
d <- xfun::cache_rds({
  for (i in 1:10) {
    ms <- d %>%
      group_by(f) %>%
      group_map(~ gam(elapsed ~ s(n, delta, width), data = .))
    get_mean_se <- function(i, par)
      predict(ms[[i]],
              newdata = data.frame(n = par[1], delta = par[2], width = par[3]),
              se.fit = TRUE)
    opt_result <- cma_es(par0, function(par) {
      x <- get_mean_se(1, par)
      y <- get_mean_se(2, par)
      abs(x$fit - y$fit) - (x$se.fit + y$se.fit)
    }, lower = c(20, 20, 2), upper = c(10000, 1000, 100), control = list(maxit = 40))
    par0 <- opt_result$par
    print(par0)
    d <- rbind(d, race(methods, mk_x(par0[1], par0[2], par0[3])) %>%
                 within({ n <- par0[1]; delta <- par0[2]; width <- par0[3] }))
  }
  d
})
# https://github.com/tidyverse/ggplot2/issues/5349#issuecomment-1631911060
ggplot(d[nrow(d) - (0:9), ] %>% within(i <- 1:10),
       aes(delta, width, linewidth = n, col = i)) +
  geom_link2(lineend = "round") +
  ggtitle("path taken by CMA-ES")
```
I am unsatisfied with the gam above. I could do k nearest neighbours and then lm or a GP.
lm would give the coefficients directly, which serve as a local gradient:
```{r}
knn_with_pgrad <- function(formula, data, new_x, k = 10) {
  mf <- model.frame(formula, data)
  if (is.numeric(new_x)) new_x <- matrix(new_x, nrow = 1)
  nn <- nn2(as.matrix(mf)[, -1], new_x, k = k)
  m <- lm(formula, data %>% slice(nn$nn.idx[1, ]))
  list(cbind(1, new_x) %*% coef(m), coef(m))
}
knn_with_pgrad(elapsed ~ n + width + delta, d, c(3000, 400, 50))
```
Next I would like to learn level set methods: somehow plotting the prediction volume that encloses the zero contour (i.e. where both models are equal).
This would perhaps give a better active learning strategy: sampling should increase with the curvature of the contour, as well as with the gradient of the time difference normal to the contour. Both factors are unaccounted for in the current method.
Goldman 2005 gives curvature formulas for an implicit surface F(x,y,z) = 0, where F here is the spline difference between the two models. I need `H*(F)`, the adjoint of the Hessian.
RConics::adjoint is equivalent to `det(H)*solve(H)` (https://stat.ethz.ch/pipermail/r-help/2006-August/111922.html), but it seems that RConics uses the cofactors/minors definition, so solve is probably faster.
predict(type="lpmatrix") gives derivatives with respect to the model parameters, not with respect to the new point, so I could compute a numerical Hessian with numDeriv, or use autodiff (https://tensorflow.rstudio.com/guides/tensorflow/autodiff),
or I could just use nn2 and say that the prediction at a point is a weighted sum of its k nearest neighbours.
What is the derivative in that case?
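One possible answer, sketched under my own assumption that the kNN prediction is an inverse-distance-weighted mean of the k neighbours (the weight function and `eps` are choices made for illustration, not fixed above): the gradient then has a closed form from the quotient rule, so no numerical differentiation is needed.
```{r}
# f(x) = sum_i w_i z_i / sum_i w_i with w_i = 1/(|x - x_i|^2 + eps);
# d w_i / d x = 2 w_i^2 (x_i - x), so by the quotient rule
# grad f = (2 / sum(w)) * sum_i (z_i - f) w_i^2 (x_i - x)
knn_idw_grad <- function(xy, z, new_xy, k = 10, eps = 1e-8) {
  xy <- as.matrix(xy)
  nn <- nn2(xy, matrix(new_xy, nrow = 1), k = k)
  xi <- xy[nn$nn.idx[1, ], , drop = FALSE]
  zi <- z[nn$nn.idx[1, ]]
  diffs <- sweep(xi, 2, new_xy)       # rows are x_i - x
  w <- 1 / (rowSums(diffs^2) + eps)
  fit <- sum(w * zi) / sum(w)
  grad <- 2 * colSums((zi - fit) * w^2 * diffs) / sum(w)
  list(fit = fit, grad = grad)
}
# e.g. knn_idw_grad(d %>% select(n, width, delta), d$elapsed, c(3000, 400, 50))
```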
```{r}
# first find a point where the two models are equal
pStar <- optim(c(n = 1000, delta = 100, width = 20),
               function(p) abs(predict(ms[[1]], as.list(p)) - predict(ms[[2]], as.list(p))),
               method = "L-BFGS-B", lower = c(20, 20, 2), upper = c(10000, 1000, 100))
```
```{r}
# this is too slow.
curvature <- function(p) {
  f <- function(q) {
    newdata <- as.list(q) %>% setNames(c("n", "delta", "width"))
    predict(ms[[1]], newdata) - predict(ms[[2]], newdata)
  }
  H <- numDeriv::hessian(f, p)
  G <- numDeriv::grad(f, p)
  # ref/basic.pdf
  G %*% (det(H) * solve(H)) %*% G / (sum(G^2))
}
tic()
curvature(pStar$par)
toc()
```
```{r}
library(pacman)
p_load(mvtnorm, tidyverse, ggplot2, lhs)
source("stat_interp.R")
n <- 200 # Number of observations
theta <- 0.1 # length scale (smaller == more bumpy)
# Generate data using mvtnorm
data <- data.frame(randomLHS(n, 2)) %>% as_tibble %>% setNames(c("x", "y"))
data$z <- as.numeric(rmvnorm(1, sigma = exp(-as.matrix(dist(data)/ theta)) ))
data_reg <- with(data, {
  o <- expand_grid(xo = seq(min(x), max(x), length.out = 100),
                   yo = seq(min(y), max(y), length.out = 100))
  akima::interpp(x, y = y, z, yo = o$yo, xo = o$xo, extrap = T)
}) %>% as_tibble
ggplot(data, aes(x, y, z = z)) + stat_interp_contour_filled() + geom_point()
nn_grad <- function(xy, z, new_xy) {
  if (length(new_xy) == ncol(xy)) new_xy <- matrix(new_xy, ncol = ncol(xy)) # one row
  nn <- nn2(as.matrix(xy), matrix(new_xy, ncol = ncol(xy)), k = ncol(xy) + 1) # or more because what if I don't have a basis?
  nn_svd <- svd(xy[nn$nn.idx[1, ], ])
  # the columns of nn_svd$v are the basis vectors
  # so if I multiply by nn_svd$v I get the transformation
  # back to (1,0) (0, 1) etc.
}
nn_grad(data %>% select(-z), data$z, c(0.5, 0.5))
```