Created
January 12, 2024 16:18
-
-
Save aavogt/595195c7eaf17972c5a85c55fd7bd350 to your computer and use it in GitHub Desktop.
EPM contour algorithm racing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# empirical performance model (EPM) contour active learning | |
I have two or three different algorithms for finding the closest pair of points between two point sets. | |
I want to know which one is faster, depending on the size of the point sets and the distance between them. | |
There are several parts to this project: | |
- [ ] level set method for contour curvature and normal vector / directional derivative of the EPM | |
- [ ] debug the distance transform and | |
- [ ] caret::knn.reg if it exists? | |
- [ ] shape constrained splines (SCAM) instead of mgcv::gam? | |
- [ ] `cma_es` might not be necessary: it is likely a gradient exists, and bfgs-b is probably better with the analytical gradient. predict.gam `The routine can optionally return the matrix by which the model coefficients must be pre-multiplied in order to yield the values of the linear predictor at the supplied covariate values: this is useful for obtaining credible regions for quantities derived from the model (e.g. derivatives of smooths)` | |
- [ ] it is also possible that both algorithms do not need to be sampled at the same locations. If I have M different algorithms, I only need to sample ones where the EPMs cannot say one is better than the other. | |
- [ ] <https://cran.r-project.org/web/packages/shapr/index.html> `ref/1903.10464_shapley.pdf` | |
```{r} | |
library(pacman) | |
p_load(tidyverse, rextendr, glue, assertthat, tictoc, | |
emstreeR, lhs, ggplot2, parallel, mgcv, cmaes, ggforce, tripack, RANN, | |
caret) | |
if (!exists("lib_mtime") || lib_mtime < file.info("lib.rs")$mtime) { | |
# recompile and load the rust code the first time, or if it has changed | |
# since cargo etc. take at least 1 second | |
source("rust_source.R") | |
rust_source("lib.rs") | |
lib_mtime <- file.info("lib.rs")$mtime | |
} | |
# RANN's nn2 is by far the fastest | |
closest_pairs_nn <- function(p, q) { | |
r <- nn2(p, q, k=1) | |
within(list( iq = which.min(r$nn.dists)), | |
{ ip <- r$nn.idx[iq] | |
d <- r$nn.dists[iq] }) | |
} | |
# tripack1 is 10 times slower than emstreeR | |
# tripack is 4 times slower than emstreeR | |
closest_pairs_tripack1 <- function(p, q) { | |
tri <- tri.mesh(p$x, p$y) | |
q %>% | |
mutate(iq = seq_len(n())) %>% | |
rowwise %>% group_map(~ { | |
ns <- tri.find(tri, .x$x, .x$y) %>% as.integer | |
ds <- (tri$x[ns] - .x$x)^2 + (tri$y[ns] - .x$y)^2 | |
within(.x, { | |
ip <- ns[which.min(ds)] | |
d <- sqrt(min(ds)) | |
}) | |
}) %>% | |
bind_rows %>% | |
{ .[which.min(.$d), ] } | |
} | |
mkP <- function(n) randomLHS(n, 2) %>% as_tibble %>% setNames(c("x", "y")) | |
pq <- list(mkP(1000), mkP(1000)) | |
# closest pair algorithm for integer points | |
# https://en.wikipedia.org/wiki/Closest_pair_of_points_problem#Linear-time_randomized_algorithms | |
# Rabin 1976 is for a single point set. | |
# | |
# This adaptation to pairs of point sets is worst-case O(n^2) because | |
# when the two sets are very far apart, d is large and then | |
# all points are in 1-3 buckets, and then we have to inspect all pairs. | |
# | |
# But if the two sets overlap, then d is small and it should be closer to O(n) | |
closest_pairs_hash <- function(p, q) { | |
np <- nrow(p) | |
nq <- nrow(q) | |
assert_that(np > 0, nq > 0) | |
n <- min(np, nq) | |
sp <- sample(n, replace=TRUE) | |
sq <- sample(n, replace=TRUE) | |
d <- (p[ sp, c('x','y')] - q[ sq, c('x','y')])^2 %>% apply(1, sum) %>% sqrt %>% min | |
closest_in_buckets(p$x, p$y, q$x, q$y, d) | |
} | |
# closest pair via emstreeR | |
# | |
# let E be an edge in the emst between (disjoint) p and q | |
# let E* be the shortest edge between p and q | |
# if E* was not in emst, we could | |
# delete E and insert E* to get a shorter emst | |
# which is a contradiction | |
# | |
# EMST takes O(n log n) time | |
# inspecting each edge takes O(n) time | |
# this is faster than the closest_in_buckets if the point sets are far apart | |
# if they are close together, this is slower | |
# there is a cutoff point where this is faster | |
# the criterion is a function of the number of buckets that are adjacent | |
# but that in turn is a function of the number of buckets | |
# which in turn is a function of d which should be scaled to some measure | |
# of the point set size such as the distance between centroids | |
# this seems worth benchmarking to find the best cutoff for | |
# point sets that are are | |
closest_pairs_emst <- function(p, q) { | |
np <- nrow(p) | |
nq <- nrow(q) | |
assert_that(np > 0, nq > 0) | |
pq <- rbind(p[, c("x", "y")], q[, c("x", "y")]) | |
emst <- ComputeMST(pq) | |
# inspect each edge, find the shortest has one point from each set emst$edges | |
closest_pairs_emst_walk(as.integer(emst$from), as.integer(emst$to), emst$dist, as.integer(np)) %>% | |
setNames(c("d", "ip", "iq")) | |
} | |
tic() | |
do.call(closest_pairs_nn, pq) | |
toc() | |
tic() | |
do.call(closest_pairs_emst, pq) | |
toc() | |
# closest pairs via distance transform (wrong) | |
closest_pairs_dt <- function(p, q) { | |
dx <- min(c(p$x, q$x)) | |
dy <- min(c(p$y, q$y)) | |
px <- p$x - dx | |
py <- p$y - dy | |
qx <- q$x - dx | |
qy <- q$y - dy | |
assert_that(is.integer(px), is.integer(py), is.integer(qx), is.integer(qy), | |
all(px >= 0L), all(py >= 0L), all(qx >= 0L), all(qy >= 0L)) | |
closest_dt(px, py, qx, qy) | |
} | |
methods <- c("closest_pairs_hash", "closest_pairs_emst") | |
``` | |
```{r} | |
mk_x <- function(n = 2000, delta = 0, width=100, dim=2) { | |
a <- width * randomLHS(n, dim) | |
b <- width * randomLHS(n, dim) | |
mode(b) <- mode(a) <- "integer" | |
colnames(a) <- colnames(b) <- c("x", "y") | |
a[ , 'x'] <- a[ , 'x' ] + as.integer(delta) | |
ab <- list(as_tibble(a, .name_repair="unique"), as_tibble(b, .name_repair="unique")) | |
} | |
# race(fs, x) races the functions in fs on the data in x. fs can either be a | |
# list of functions with names, or a list of strings which must be the names of | |
# functions. In other words, do.call(fs[i], x) | |
# | |
# Losers of the race are disqualified if they take 1+disqualifiation times | |
# the winner's time. | |
# | |
# polling_period is the time in seconds to wait between checking for the winner. | |
# | |
# the result is a tibble with columns | |
# | |
## A tibble: 3 × 8 | |
# f crashed user.self sys.self elapsed user.child sys.child killed | |
# <chr> <lgl> <dbl> <dbl> <dbl> <dbl> <dbl> <lgl> | |
# 1 timeout FALSE 0.001 0 0.0210 0 0 FALSE | |
# 2 closest_pairs_hash FALSE 0.032 0.01 0.0420 0 0 FALSE | |
# 3 closest_pairs_emst FALSE NA NA 0.0252 NA NA TRUE | |
race <- function(fs, x, polling_period=0.05, disqualification = 0.2, timeout=NA) { | |
do.call(assert_that,c(is.character(fs) || is.list(fs), | |
is.list(x), | |
imap(fs, ~ { is.character(.y) && (is.function(.x) || exists(.x, mode="function")) || | |
is.character(.x) && exists(.x, mode="function") } ))) | |
if (!is.na(timeout)) fs <- c(timeout= function(...) Sys.sleep(timeout), as.list(fs)) | |
# or fs could be a list of functions, and the names(fs) is used for the f column | |
ps <- imap(fs, ~ { | |
f_str <- if (is.character(.x)) .x else .y | |
mcparallel({ | |
t0 <- proc.time() | |
crashed <- tryCatch( { do.call(.x, x); FALSE}, | |
error = function(e) TRUE) | |
t1 <- proc.time() | |
c(list(f= f_str, | |
crashed=crashed), | |
map2(t1, t0, `-`)) | |
}, f_str) | |
}) | |
complete <- list() | |
while (TRUE) { | |
# check each uncompleted process in ps once every every polling_period | |
if (is.null(ps)) break | |
complete1 <- map(ps, ~ mccollect(., wait=F)) | |
nulls <- map_lgl(complete1, is.null) | |
complete <- c(complete, complete1[!nulls]) | |
ps <- ps[nulls] | |
crashes <- complete1[!nulls] %>% map_lgl(~ .[[1]]$crashed) | |
if (any(!crashes)) break | |
Sys.sleep(polling_period) | |
} | |
# either all processes crashed or one completed | |
dt <- map_dbl(complete, pluck, 1, "elapsed", .default=Inf) %>% min(na.rm=TRUE) | |
complete2 <- mccollect(ps, wait=FALSE, timeout = dt) | |
walk(ps, parallel:::mckill, signal=9) # or signal=15? Does it need to run on parallel:::children()? | |
ps <- if (is.null(complete2)) ps else ps[map_lgl(complete2, is.null)] | |
t_max <- (1+disqualification) * dt | |
rbind(map_dfr(complete, pluck, 1) %>% mutate(killed = FALSE), | |
bind_rows(complete2) %>% mutate(killed=FALSE), | |
map_dfr(ps, ~ tibble( f = .$name, | |
elapsed = NA, | |
user.self= NA, | |
sys.self=NA, | |
user.child=NA, | |
sys.child=NA, | |
crashed = FALSE, | |
killed = TRUE ))) %>% | |
# replace NA in elapsed with the maximum time | |
mutate_at("elapsed", ~ ifelse(is.na(.), max(., t_max, na.rm=TRUE), .)) | |
} | |
# race(c("closest_pairs_hash", "closest_pairs_emst"), mk_x(), timeout=0.02) | |
# initial space-filling design | |
d <- xfun::oache_rds({randomLHS(500, 3) %>% | |
as_tibble() %>% | |
transmute(n = V1 * 10000 + 20, | |
width = V2 * 1000 + 20, | |
delta = V3 * 100) %>% | |
rowwise %>% | |
mutate(r = list(race(methods, mk_x(n, delta, width)))) %>% | |
unnest(r) | |
}) | |
``` | |
```{r} | |
ggplot(d %>% filter(!killed), aes(n, elapsed, col=factor(f))) + geom_point(aes(size=width), alpha=0.5) + geom_smooth() + | |
facet_wrap(~ cut(delta, 3), labeller = "label_both") + | |
ggtitle("elapsed time vs n, delta, width") | |
``` | |
```{r} | |
# fit gam s(n, delta, width) | |
par0 <- c(1000, 500, 50) | |
d <- xfun::cache_rds({ | |
for (i in 1:10) { | |
ms <- d %>% | |
group_by(f) %>% | |
group_map( ~ gam(elapsed ~ s(n, delta, width), data= . )) | |
get_mean_se <- function(i, par) predict(ms[[i]], | |
newdata=data.frame(n=par[1], delta=par[2], width=par[3]), | |
se.fit=TRUE) | |
opt_result <- cma_es( par0, function(par) { | |
x <- get_mean_se(1, par) | |
y <- get_mean_se(2, par) | |
abs(x$fit - y$fit) - (x$se.fit + y$se.fit) | |
}, lower=c(20, 20, 2), upper=c(10000, 1000, 100), control=list(maxit=40)) | |
par0 <- opt_result$par | |
print(par0) | |
d <- rbind(d, race(methods, mk_x(par0[1], par0[2], par0[3])) %>% | |
within({ n <- par0[1]; delta <- par0[2]; width <- par0[3] })) | |
} | |
d}) | |
# https://github.com/tidyverse/ggplot2/issues/5349#issuecomment-1631911060 | |
ggplot(d[ nrow(d) - (0:9), ] %>% within(i <- 1:10), | |
aes(delta, width, linewidth=n, col=i)) + | |
geom_link2(lineend="round") + | |
ggtitle("path taken by CMA-ES") | |
``` | |
I am unsatisfied with gam above. I could do k nearest neighbours and then lm or gp. | |
lm would allow the coef | |
```{r} | |
knn_with_pgrad <- function(formula, data, new_x, k = 10) { | |
mf <- model.frame(formula, data) | |
if (is.numeric(new_x)) new_x <- matrix(new_x, nrow=1) | |
nn <- nn2(as.matrix(mf)[, -1], new_x, k = k) | |
m <- lm(formula, data %>% slice(nn$nn.idx[1, ])) | |
list(cbind(1, new_x) %*% coef(m), coef(m)) | |
} | |
knn_with_pgrad(elapsed ~ n + width + delta, d, c(3000, 400, 50)) | |
``` | |
Next I would like to learn level set methods. Somehow plotting the prediction volume that encloses the zero contour (ie. where both models are equal). | |
This would pehaps result in a better active learning strategy: sampling should increase with increasing curvature of the contour, as well as with increasing gradient of the time-difference normal to the contour. Both of these factors are unaccounted for in the current method. | |
Goldman 2005 | |
F(x,y,z) = 0 | |
is the spline | |
I need `H*(F)`, adjoint of hessian | |
RConics::adjoint equivalent to `det(H)*solve(H)` https://stat.ethz.ch/pipermail/r-help/2006-August/111922.html | |
it seems that RConics is using the definition of cofactors/minors etc., so solve is probably faster. | |
predict(type="lpmatrix") is used for derivatives with respect to model parameters not with respect to the new point. I could do a numerical hessian numDeriv. | |
https://tensorflow.rstudio.com/guides/tensorflow/autodiff | |
or I could just use nn2 and say that the point is a weighted sum of k nearest neighbours. | |
what is the derivative in that case? | |
```{r} | |
# first find a point where the two models are equal | |
pStar <- optim(c(n=1000, delta=100, width=20), function(p) abs( predict(ms[[1]], as.list(p)) - predict(ms[[2]], as.list(p))), | |
method="L-BFGS-B", lower=c(20, 20, 2), upper=c(10000, 1000, 100)) | |
``` | |
```{r} | |
# this is too slow. | |
curvature <- function(p) { | |
f <- function(q) { newdata <- as.list(q) %>% setNames(c("n", "delta", "width")) | |
predict(ms[[1]], newdata) - predict(ms[[2]], newdata) | |
} | |
H <- numDeriv::hessian(f, p) | |
G <- numDeriv::grad(f, p) | |
# ref/basic.pdf | |
G %*% (det(H) * solve(H)) %*% G / (sum(G^2)) | |
} | |
tic() | |
curvature(pStar$par) | |
toc() | |
``` | |
```{r} | |
library(pacman) | |
p_load(mvtnorm, tidyverse, ggplot2, lhs) | |
source("stat_interp.R") | |
n <- 200 # Number of observations | |
theta <- 0.1 # length scale (smaller == more bumpy) | |
# Generate data using mvtnorm | |
data <- data.frame(randomLHS(n, 2)) %>% as_tibble %>% setNames(c("x", "y")) | |
data$z <- as.numeric(rmvnorm(1, sigma = exp(-as.matrix(dist(data)/ theta)) )) | |
data_reg <- with(data, { | |
o <- expand_grid(xo=seq(min(x), max(x), length.out=100), yo=seq(min(y), max(y), length.out=100)) | |
akima::interpp(x, y = y, z, yo=o$yo,xo=o$xo, extrap=T) | |
}) %>% as_tibble | |
ggplot(data, aes(x, y, z = z)) + stat_interp_contour_filled() + geom_point() | |
nn_grad <- function(xy, z, new_xy) { | |
if (length(new_xy) == ncol(xy)) new_xy <- matrix(new_xy, ncol=ncol(xy)) # one row | |
nn <- nn2(as.matrix(xy), matrix(new_xy, ncol=ncol(xy)), k = ncol(xy)+1) # or more because what if I don't have a basis? | |
nn_svd <- svd(xy[ nn$nn.idx[1, ] , ]) | |
# the rows of nn_svd$v are the basis vectors | |
# so if I multiply by nn_svd$v I get the transformation | |
# back to (1,0) (0, 1) etc. | |
} | |
nn_grad(data %>% select(-z), data$z, c(0.5, 0.5)) | |
``` | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment