aavogt/active_learning.Rmd

## active_learning.Rmd
# empirical performance model (EPM) contour active learning

I have two or three different algorithms for finding the closest pair of points between two point sets.
I want to know which one is faster, depending on the size of the point sets and the distance between them.
There are several parts to this project:

 - [ ] level set method for contour curvature and normal vector / directional derivative of the EPM
 - [ ] debug the distance transform and
 - [ ] caret::knn.reg if it exists?
 - [ ] shape constrained splines (SCAM) instead of mgcv::gam?
 - [ ] `cma_es` might not be necessary: it is likely a gradient exists, and bfgs-b is probably better with the analytical gradient. predict.gam `The routine can optionally return the matrix by which the model coefficients must be pre-multiplied in order to yield the values of the linear predictor at the supplied covariate values: this is useful for obtaining credible regions for quantities derived from the model (e.g. derivatives of smooths)`
 - [ ] it is also possible that both algorithms do not need to be sampled at the same locations. If I have M different algorithms, I only need to sample ones where the EPMs cannot say one is better than the other.
 - [ ] <https://cran.r-project.org/web/packages/shapr/index.html> `ref/1903.10464_shapley.pdf`


```{r}
library(pacman)
p_load(tidyverse, rextendr, glue, assertthat, tictoc,
       emstreeR, lhs, ggplot2, parallel, mgcv, cmaes, ggforce, tripack, RANN,
        caret)

if (!exists("lib_mtime") || lib_mtime < file.info("lib.rs")$mtime) {
    # recompile and load the rust code the first time, or if it has changed
    # since cargo etc. take at least 1 second
    source("rust_source.R")
    rust_source("lib.rs")
    lib_mtime <- file.info("lib.rs")$mtime
}


# RANN's nn2 is by far the fastest
closest_pairs_nn <- function(p, q) {
        r <- nn2(p, q, k=1)
        within(list( iq = which.min(r$nn.dists)),
               { ip <- r$nn.idx[iq]
                 d <- r$nn.dists[iq] })
}

# tripack1 is 10 times slower than emstreeR
# tripack is 4 times slower than emstreeR
closest_pairs_tripack1 <- function(p, q) {
        tri <- tri.mesh(p$x, p$y)
        q %>%
                mutate(iq = seq_len(n())) %>%
                rowwise %>% group_map(~ {
                ns <- tri.find(tri, .x$x, .x$y) %>% as.integer
                ds <- (tri$x[ns] - .x$x)^2 + (tri$y[ns] - .x$y)^2
                within(.x, {
                       ip <- ns[which.min(ds)]
                       d <- sqrt(min(ds))
                })
          }) %>%
        bind_rows %>%
        { .[which.min(.$d), ] }
}


mkP <- function(n) randomLHS(n, 2) %>% as_tibble %>% setNames(c("x", "y"))

pq <- list(mkP(1000), mkP(1000))

# closest pair algorithm for integer points
# https://en.wikipedia.org/wiki/Closest_pair_of_points_problem#Linear-time_randomized_algorithms
# Rabin 1976 is for a single point set.
#
# This adaptation to pairs of point sets is worst-case O(n^2) because
# when the two sets are very far apart, d is large and then
# all points are in 1-3 buckets, and then we have to inspect all pairs.
#
# But if the two sets overlap, then d is small and it should be closer to O(n)
closest_pairs_hash <- function(p, q) {
        np <- nrow(p)
        nq <- nrow(q)
        assert_that(np > 0, nq > 0)
        n <- min(np, nq)

        sp <- sample(n, replace=TRUE)
        sq <- sample(n, replace=TRUE)
        d <- (p[ sp, c('x','y')] - q[ sq, c('x','y')])^2 %>% apply(1, sum) %>% sqrt %>% min
        closest_in_buckets(p$x, p$y, q$x, q$y, d)
}

# closest pair via emstreeR
#
# let E be an edge in the emst between (disjoint) p and q
# let E* be the shortest edge between p and q
# if E* was not in emst, we could
# delete E and insert E* to get a shorter emst
# which is a contradiction
#
# EMST takes O(n log n) time
# inspecting each edge takes O(n) time
# this is faster than the closest_in_buckets if the point sets are far apart
# if they are close together, this is slower
# there is a cutoff point where this is faster
# the criterion is a function of the number of buckets that are adjacent
# but that in turn is a function of the number of buckets
# which in turn is a function of d which should be scaled to some measure
# of the point set size such as the distance between centroids
# this seems worth benchmarking to find the best cutoff for
# point sets that are are
closest_pairs_emst <- function(p, q) {
        np <- nrow(p)
        nq <- nrow(q)
        assert_that(np > 0, nq > 0)

        pq <- rbind(p[, c("x", "y")], q[, c("x", "y")])

        emst <- ComputeMST(pq)

        # inspect each edge, find the shortest has one point from each set emst$edges
        closest_pairs_emst_walk(as.integer(emst$from), as.integer(emst$to), emst$dist, as.integer(np)) %>%
                setNames(c("d", "ip", "iq"))
}

tic()
do.call(closest_pairs_nn, pq)
toc()
tic()
do.call(closest_pairs_emst, pq)
toc()

# closest pairs via distance transform (wrong)
closest_pairs_dt <- function(p, q) {
        dx <- min(c(p$x, q$x))
        dy <- min(c(p$y, q$y))
        px <- p$x - dx
        py <- p$y - dy
        qx <- q$x - dx
        qy <- q$y - dy
        assert_that(is.integer(px), is.integer(py), is.integer(qx), is.integer(qy),
                all(px >= 0L), all(py >= 0L), all(qx >= 0L), all(qy >= 0L))
        closest_dt(px, py, qx, qy)
}

methods <- c("closest_pairs_hash", "closest_pairs_emst")
```

```{r}
mk_x <- function(n = 2000, delta = 0, width=100, dim=2) {
 a <- width * randomLHS(n, dim)
 b <- width * randomLHS(n, dim)
 mode(b) <- mode(a) <- "integer"
 colnames(a) <- colnames(b) <- c("x", "y")
 a[ , 'x'] <- a[ , 'x' ] + as.integer(delta)
 ab <- list(as_tibble(a, .name_repair="unique"), as_tibble(b, .name_repair="unique"))
}

# race(fs, x) races the functions in fs on the data in x. fs can either be a
# list of functions with names, or a list of strings which must be the names of
# functions. In other words, do.call(fs[i], x)
#
# Losers of the race are disqualified if they take 1+disqualifiation times
# the winner's time.
#
# polling_period is the time in seconds to wait between checking for the winner.
#
# the result is a tibble with columns
#
##  A tibble: 3 × 8
#   f                  crashed user.self sys.self elapsed user.child sys.child killed
#   <chr>              <lgl>       <dbl>    <dbl>   <dbl>      <dbl>     <dbl> <lgl>
# 1 timeout            FALSE       0.001     0     0.0210          0         0 FALSE
# 2 closest_pairs_hash FALSE       0.032     0.01  0.0420          0         0 FALSE
# 3 closest_pairs_emst FALSE      NA        NA     0.0252         NA        NA TRUE
race <- function(fs, x, polling_period=0.05, disqualification = 0.2, timeout=NA) {
 do.call(assert_that,c(is.character(fs) || is.list(fs),
                       is.list(x),
                       imap(fs, ~ { is.character(.y) && (is.function(.x) || exists(.x, mode="function")) ||
                                    is.character(.x) && exists(.x, mode="function") } )))
 if (!is.na(timeout)) fs <- c(timeout= function(...) Sys.sleep(timeout), as.list(fs))
 # or fs could be a list of functions, and the names(fs) is used for the f column
 ps <- imap(fs, ~ {
        f_str <- if (is.character(.x)) .x else .y
        mcparallel({
                t0 <- proc.time()
                crashed <- tryCatch( { do.call(.x, x); FALSE},
                        error = function(e) TRUE)
                t1 <- proc.time()
                c(list(f= f_str,
                     crashed=crashed),
                  map2(t1, t0, `-`))
         }, f_str)
 })
 complete <- list()
 while (TRUE) {
         # check each uncompleted process in ps once every every polling_period
         if (is.null(ps)) break
         complete1 <- map(ps, ~ mccollect(., wait=F))
         nulls <- map_lgl(complete1, is.null)
         complete <- c(complete, complete1[!nulls])
         ps <- ps[nulls]
         crashes <- complete1[!nulls] %>% map_lgl(~ .[[1]]$crashed)
         if (any(!crashes)) break
         Sys.sleep(polling_period)
 }
 # either all processes crashed or one completed
 dt <- map_dbl(complete, pluck, 1, "elapsed", .default=Inf) %>% min(na.rm=TRUE)
 complete2 <- mccollect(ps, wait=FALSE, timeout = dt)
 walk(ps, parallel:::mckill, signal=9) # or signal=15? Does it need to run on parallel:::children()?
 ps <- if (is.null(complete2)) ps  else ps[map_lgl(complete2, is.null)]
 t_max <- (1+disqualification) * dt
 rbind(map_dfr(complete, pluck, 1) %>% mutate(killed = FALSE),
       bind_rows(complete2) %>% mutate(killed=FALSE),
       map_dfr(ps, ~ tibble( f = .$name,
                             elapsed = NA,
                             user.self= NA,
                             sys.self=NA,
                             user.child=NA,
                             sys.child=NA,
                             crashed = FALSE,
                             killed = TRUE ))) %>%
        # replace NA in elapsed with the maximum time
        mutate_at("elapsed", ~ ifelse(is.na(.), max(., t_max, na.rm=TRUE), .))
}

# race(c("closest_pairs_hash", "closest_pairs_emst"), mk_x(), timeout=0.02)

# initial space-filling design
d <- xfun::oache_rds({randomLHS(500, 3) %>%
        as_tibble() %>%
        transmute(n = V1 * 10000 + 20,
                  width = V2 * 1000 + 20,
                  delta = V3 * 100) %>%
        rowwise %>%
        mutate(r = list(race(methods, mk_x(n, delta, width)))) %>%
        unnest(r)
       })
```

```{r}
ggplot(d %>% filter(!killed), aes(n, elapsed, col=factor(f))) + geom_point(aes(size=width), alpha=0.5) + geom_smooth() +
        facet_wrap(~ cut(delta, 3), labeller = "label_both") +
        ggtitle("elapsed time vs n, delta, width")
```

```{r}
# fit gam s(n, delta, width)

par0 <- c(1000, 500, 50)

d <- xfun::cache_rds({
for (i in 1:10) {
  ms <- d %>%
          group_by(f) %>%
          group_map( ~ gam(elapsed ~ s(n, delta, width), data= . ))
  get_mean_se <- function(i, par) predict(ms[[i]],
          newdata=data.frame(n=par[1], delta=par[2], width=par[3]),
          se.fit=TRUE)
  opt_result <- cma_es( par0, function(par) {
                         x <- get_mean_se(1, par)
                         y <- get_mean_se(2, par)
                         abs(x$fit - y$fit) - (x$se.fit + y$se.fit)
          }, lower=c(20, 20, 2), upper=c(10000, 1000, 100), control=list(maxit=40))
  par0 <- opt_result$par
  print(par0)

  d <- rbind(d, race(methods, mk_x(par0[1], par0[2], par0[3])) %>%
                within({ n <- par0[1]; delta <- par0[2]; width <- par0[3] }))
}

d})

# https://github.com/tidyverse/ggplot2/issues/5349#issuecomment-1631911060
ggplot(d[ nrow(d) - (0:9), ] %>% within(i <- 1:10),
       aes(delta, width, linewidth=n, col=i)) +
        geom_link2(lineend="round") +
        ggtitle("path taken by CMA-ES")
```

I am unsatisfied with gam above. I could do k nearest neighbours and then lm or gp.
lm would allow the coef

```{r}
knn_with_pgrad <- function(formula, data, new_x, k = 10) {
        mf <- model.frame(formula, data)
        if (is.numeric(new_x)) new_x <- matrix(new_x, nrow=1)
        nn <- nn2(as.matrix(mf)[, -1], new_x, k = k)
        m <- lm(formula, data %>% slice(nn$nn.idx[1, ]))
        list(cbind(1, new_x) %*% coef(m), coef(m))
}
knn_with_pgrad(elapsed ~ n + width + delta, d, c(3000, 400, 50))
```


Next I would like to learn level set methods. Somehow plotting the prediction volume that encloses the zero contour (ie. where both models are equal).
This would pehaps result in a better active learning strategy: sampling should increase with increasing curvature of the contour, as well as with increasing gradient of the time-difference normal to the contour. Both of these factors are unaccounted for in the current method.

Goldman 2005
F(x,y,z) = 0
is the spline
I need `H*(F)`, adjoint of hessian
RConics::adjoint equivalent to `det(H)*solve(H)` https://stat.ethz.ch/pipermail/r-help/2006-August/111922.html
it seems that RConics is using the definition of cofactors/minors etc., so solve is probably faster.

predict(type="lpmatrix") is used for derivatives with respect to model parameters not with respect to the new point. I could do a numerical hessian numDeriv.

https://tensorflow.rstudio.com/guides/tensorflow/autodiff


or I could just use nn2 and say that the point is a weighted sum of k nearest neighbours.

what is the derivative in that case?


```{r}
# first find a point where the two models are equal

pStar <- optim(c(n=1000, delta=100, width=20), function(p) abs( predict(ms[[1]], as.list(p)) - predict(ms[[2]], as.list(p))),
      method="L-BFGS-B", lower=c(20, 20, 2), upper=c(10000, 1000, 100))
```

```{r}
# this is too slow.
curvature <- function(p) {
        f <- function(q) { newdata <- as.list(q) %>% setNames(c("n", "delta", "width"))
                           predict(ms[[1]], newdata) - predict(ms[[2]], newdata)
        }
       H <- numDeriv::hessian(f, p)
       G <- numDeriv::grad(f, p)
       # ref/basic.pdf
       G %*% (det(H) * solve(H)) %*% G / (sum(G^2))
}
tic()
curvature(pStar$par)
toc()
```

```{r}
library(pacman)
p_load(mvtnorm, tidyverse, ggplot2, lhs)
source("stat_interp.R")
n <- 200  # Number of observations
theta <- 0.1 # length scale (smaller == more bumpy)

# Generate data using mvtnorm
data <- data.frame(randomLHS(n, 2)) %>% as_tibble %>% setNames(c("x", "y"))
data$z <- as.numeric(rmvnorm(1, sigma = exp(-as.matrix(dist(data)/ theta)) ))

data_reg <- with(data, {
   o <- expand_grid(xo=seq(min(x), max(x), length.out=100), yo=seq(min(y), max(y), length.out=100))
   akima::interpp(x, y = y, z, yo=o$yo,xo=o$xo, extrap=T)
      }) %>% as_tibble

ggplot(data, aes(x, y, z = z)) + stat_interp_contour_filled() + geom_point()


nn_grad <- function(xy, z, new_xy) {
        if (length(new_xy) == ncol(xy)) new_xy <- matrix(new_xy, ncol=ncol(xy)) # one row
        nn <- nn2(as.matrix(xy), matrix(new_xy, ncol=ncol(xy)), k = ncol(xy)+1) # or more because what if I don't have a basis?

        nn_svd <- svd(xy[ nn$nn.idx[1, ] , ])
        # the rows of nn_svd$v are the basis vectors
        # so if I multiply by nn_svd$v I get the transformation
        # back to (1,0) (0, 1) etc.

}
nn_grad(data %>% select(-z), data$z, c(0.5, 0.5))
```
	# empirical performance model (EPM) contour active learning

	I have two or three different algorithms for finding the closest pair of points between two point sets.
	I want to know which one is faster, depending on the size of the point sets and the distance between them.
	There are several parts to this project:

	- [ ] level set method for contour curvature and normal vector / directional derivative of the EPM
	- [ ] debug the distance transform and
	- [ ] caret::knn.reg if it exists?
	- [ ] shape constrained splines (SCAM) instead of mgcv::gam?
	- [ ] `cma_es` might not be necessary: it is likely a gradient exists, and bfgs-b is probably better with the analytical gradient. predict.gam `The routine can optionally return the matrix by which the model coefficients must be pre-multiplied in order to yield the values of the linear predictor at the supplied covariate values: this is useful for obtaining credible regions for quantities derived from the model (e.g. derivatives of smooths)`
	- [ ] it is also possible that both algorithms do not need to be sampled at the same locations. If I have M different algorithms, I only need to sample ones where the EPMs cannot say one is better than the other.
	- [ ] <https://cran.r-project.org/web/packages/shapr/index.html> `ref/1903.10464_shapley.pdf`



	```{r}
	library(pacman)
	p_load(tidyverse, rextendr, glue, assertthat, tictoc,
	emstreeR, lhs, ggplot2, parallel, mgcv, cmaes, ggforce, tripack, RANN,
	caret)

	if (!exists("lib_mtime") \|\| lib_mtime < file.info("lib.rs")$mtime) {
	# recompile and load the rust code the first time, or if it has changed
	# since cargo etc. take at least 1 second
	source("rust_source.R")
	rust_source("lib.rs")
	lib_mtime <- file.info("lib.rs")$mtime
	}


	# RANN's nn2 is by far the fastest
	closest_pairs_nn <- function(p, q) {
	r <- nn2(p, q, k=1)
	within(list( iq = which.min(r$nn.dists)),
	{ ip <- r$nn.idx[iq]
	d <- r$nn.dists[iq] })
	}

	# tripack1 is 10 times slower than emstreeR
	# tripack is 4 times slower than emstreeR
	closest_pairs_tripack1 <- function(p, q) {
	tri <- tri.mesh(p$x, p$y)
	q %>%
	mutate(iq = seq_len(n())) %>%
	rowwise %>% group_map(~ {
	ns <- tri.find(tri, .x$x, .x$y) %>% as.integer
	ds <- (tri$x[ns] - .x$x)^2 + (tri$y[ns] - .x$y)^2
	within(.x, {
	ip <- ns[which.min(ds)]
	d <- sqrt(min(ds))
	})
	}) %>%
	bind_rows %>%
	{ .[which.min(.$d), ] }
	}


	mkP <- function(n) randomLHS(n, 2) %>% as_tibble %>% setNames(c("x", "y"))

	pq <- list(mkP(1000), mkP(1000))

	# closest pair algorithm for integer points
	# https://en.wikipedia.org/wiki/Closest_pair_of_points_problem#Linear-time_randomized_algorithms
	# Rabin 1976 is for a single point set.
	#
	# This adaptation to pairs of point sets is worst-case O(n^2) because
	# when the two sets are very far apart, d is large and then
	# all points are in 1-3 buckets, and then we have to inspect all pairs.
	#
	# But if the two sets overlap, then d is small and it should be closer to O(n)
	closest_pairs_hash <- function(p, q) {
	np <- nrow(p)
	nq <- nrow(q)
	assert_that(np > 0, nq > 0)
	n <- min(np, nq)

	sp <- sample(n, replace=TRUE)
	sq <- sample(n, replace=TRUE)
	d <- (p[ sp, c('x','y')] - q[ sq, c('x','y')])^2 %>% apply(1, sum) %>% sqrt %>% min
	closest_in_buckets(p$x, p$y, q$x, q$y, d)
	}

	# closest pair via emstreeR
	#
	# let E be an edge in the emst between (disjoint) p and q
	# let E* be the shortest edge between p and q
	# if E* was not in emst, we could
	# delete E and insert E* to get a shorter emst
	# which is a contradiction
	#
	# EMST takes O(n log n) time
	# inspecting each edge takes O(n) time
	# this is faster than the closest_in_buckets if the point sets are far apart
	# if they are close together, this is slower
	# there is a cutoff point where this is faster
	# the criterion is a function of the number of buckets that are adjacent
	# but that in turn is a function of the number of buckets
	# which in turn is a function of d which should be scaled to some measure
	# of the point set size such as the distance between centroids
	# this seems worth benchmarking to find the best cutoff for
	# point sets that are are
	closest_pairs_emst <- function(p, q) {
	np <- nrow(p)
	nq <- nrow(q)
	assert_that(np > 0, nq > 0)

	pq <- rbind(p[, c("x", "y")], q[, c("x", "y")])

	emst <- ComputeMST(pq)

	# inspect each edge, find the shortest has one point from each set emst$edges
	closest_pairs_emst_walk(as.integer(emst$from), as.integer(emst$to), emst$dist, as.integer(np)) %>%
	setNames(c("d", "ip", "iq"))
	}

	tic()
	do.call(closest_pairs_nn, pq)
	toc()
	tic()
	do.call(closest_pairs_emst, pq)
	toc()

	# closest pairs via distance transform (wrong)
	closest_pairs_dt <- function(p, q) {
	dx <- min(c(p$x, q$x))
	dy <- min(c(p$y, q$y))
	px <- p$x - dx
	py <- p$y - dy
	qx <- q$x - dx
	qy <- q$y - dy
	assert_that(is.integer(px), is.integer(py), is.integer(qx), is.integer(qy),
	all(px >= 0L), all(py >= 0L), all(qx >= 0L), all(qy >= 0L))
	closest_dt(px, py, qx, qy)
	}

	methods <- c("closest_pairs_hash", "closest_pairs_emst")
	```

	```{r}
	mk_x <- function(n = 2000, delta = 0, width=100, dim=2) {
	a <- width * randomLHS(n, dim)
	b <- width * randomLHS(n, dim)
	mode(b) <- mode(a) <- "integer"
	colnames(a) <- colnames(b) <- c("x", "y")
	a[ , 'x'] <- a[ , 'x' ] + as.integer(delta)
	ab <- list(as_tibble(a, .name_repair="unique"), as_tibble(b, .name_repair="unique"))
	}

	# race(fs, x) races the functions in fs on the data in x. fs can either be a
	# list of functions with names, or a list of strings which must be the names of
	# functions. In other words, do.call(fs[i], x)
	#
	# Losers of the race are disqualified if they take 1+disqualifiation times
	# the winner's time.
	#
	# polling_period is the time in seconds to wait between checking for the winner.
	#
	# the result is a tibble with columns
	#
	## A tibble: 3 × 8
	# f crashed user.self sys.self elapsed user.child sys.child killed
	# <chr> <lgl> <dbl> <dbl> <dbl> <dbl> <dbl> <lgl>
	# 1 timeout FALSE 0.001 0 0.0210 0 0 FALSE
	# 2 closest_pairs_hash FALSE 0.032 0.01 0.0420 0 0 FALSE
	# 3 closest_pairs_emst FALSE NA NA 0.0252 NA NA TRUE
	race <- function(fs, x, polling_period=0.05, disqualification = 0.2, timeout=NA) {
	do.call(assert_that,c(is.character(fs) \|\| is.list(fs),
	is.list(x),
	imap(fs, ~ { is.character(.y) && (is.function(.x) \|\| exists(.x, mode="function")) \|\|
	is.character(.x) && exists(.x, mode="function") } )))
	if (!is.na(timeout)) fs <- c(timeout= function(...) Sys.sleep(timeout), as.list(fs))
	# or fs could be a list of functions, and the names(fs) is used for the f column
	ps <- imap(fs, ~ {
	f_str <- if (is.character(.x)) .x else .y
	mcparallel({
	t0 <- proc.time()
	crashed <- tryCatch( { do.call(.x, x); FALSE},
	error = function(e) TRUE)
	t1 <- proc.time()
	c(list(f= f_str,
	crashed=crashed),
	map2(t1, t0, `-`))
	}, f_str)
	})
	complete <- list()
	while (TRUE) {
	# check each uncompleted process in ps once every every polling_period
	if (is.null(ps)) break
	complete1 <- map(ps, ~ mccollect(., wait=F))
	nulls <- map_lgl(complete1, is.null)
	complete <- c(complete, complete1[!nulls])
	ps <- ps[nulls]
	crashes <- complete1[!nulls] %>% map_lgl(~ .[[1]]$crashed)
	if (any(!crashes)) break
	Sys.sleep(polling_period)
	}
	# either all processes crashed or one completed
	dt <- map_dbl(complete, pluck, 1, "elapsed", .default=Inf) %>% min(na.rm=TRUE)
	complete2 <- mccollect(ps, wait=FALSE, timeout = dt)
	walk(ps, parallel:::mckill, signal=9) # or signal=15? Does it need to run on parallel:::children()?
	ps <- if (is.null(complete2)) ps else ps[map_lgl(complete2, is.null)]
	t_max <- (1+disqualification) * dt
	rbind(map_dfr(complete, pluck, 1) %>% mutate(killed = FALSE),
	bind_rows(complete2) %>% mutate(killed=FALSE),
	map_dfr(ps, ~ tibble( f = .$name,
	elapsed = NA,
	user.self= NA,
	sys.self=NA,
	user.child=NA,
	sys.child=NA,
	crashed = FALSE,
	killed = TRUE ))) %>%
	# replace NA in elapsed with the maximum time
	mutate_at("elapsed", ~ ifelse(is.na(.), max(., t_max, na.rm=TRUE), .))
	}

	# race(c("closest_pairs_hash", "closest_pairs_emst"), mk_x(), timeout=0.02)

	# initial space-filling design
	d <- xfun::oache_rds({randomLHS(500, 3) %>%
	as_tibble() %>%
	transmute(n = V1 * 10000 + 20,
	width = V2 * 1000 + 20,
	delta = V3 * 100) %>%
	rowwise %>%
	mutate(r = list(race(methods, mk_x(n, delta, width)))) %>%
	unnest(r)
	})
	```

	```{r}
	ggplot(d %>% filter(!killed), aes(n, elapsed, col=factor(f))) + geom_point(aes(size=width), alpha=0.5) + geom_smooth() +
	facet_wrap(~ cut(delta, 3), labeller = "label_both") +
	ggtitle("elapsed time vs n, delta, width")
	```

	```{r}
	# fit gam s(n, delta, width)

	par0 <- c(1000, 500, 50)

	d <- xfun::cache_rds({
	for (i in 1:10) {
	ms <- d %>%
	group_by(f) %>%
	group_map( ~ gam(elapsed ~ s(n, delta, width), data= . ))
	get_mean_se <- function(i, par) predict(ms[[i]],
	newdata=data.frame(n=par[1], delta=par[2], width=par[3]),
	se.fit=TRUE)
	opt_result <- cma_es( par0, function(par) {
	x <- get_mean_se(1, par)
	y <- get_mean_se(2, par)
	abs(x$fit - y$fit) - (x$se.fit + y$se.fit)
	}, lower=c(20, 20, 2), upper=c(10000, 1000, 100), control=list(maxit=40))
	par0 <- opt_result$par
	print(par0)

	d <- rbind(d, race(methods, mk_x(par0[1], par0[2], par0[3])) %>%
	within({ n <- par0[1]; delta <- par0[2]; width <- par0[3] }))
	}

	d})

	# https://github.com/tidyverse/ggplot2/issues/5349#issuecomment-1631911060
	ggplot(d[ nrow(d) - (0:9), ] %>% within(i <- 1:10),
	aes(delta, width, linewidth=n, col=i)) +
	geom_link2(lineend="round") +
	ggtitle("path taken by CMA-ES")
	```

	I am unsatisfied with gam above. I could do k nearest neighbours and then lm or gp.
	lm would allow the coef

	```{r}
	knn_with_pgrad <- function(formula, data, new_x, k = 10) {
	mf <- model.frame(formula, data)
	if (is.numeric(new_x)) new_x <- matrix(new_x, nrow=1)
	nn <- nn2(as.matrix(mf)[, -1], new_x, k = k)
	m <- lm(formula, data %>% slice(nn$nn.idx[1, ]))
	list(cbind(1, new_x) %*% coef(m), coef(m))
	}
	knn_with_pgrad(elapsed ~ n + width + delta, d, c(3000, 400, 50))
	```


	Next I would like to learn level set methods. Somehow plotting the prediction volume that encloses the zero contour (ie. where both models are equal).
	This would pehaps result in a better active learning strategy: sampling should increase with increasing curvature of the contour, as well as with increasing gradient of the time-difference normal to the contour. Both of these factors are unaccounted for in the current method.

	Goldman 2005
	F(x,y,z) = 0
	is the spline
	I need `H*(F)`, adjoint of hessian
	RConics::adjoint equivalent to `det(H)*solve(H)` https://stat.ethz.ch/pipermail/r-help/2006-August/111922.html
	it seems that RConics is using the definition of cofactors/minors etc., so solve is probably faster.

	predict(type="lpmatrix") is used for derivatives with respect to model parameters not with respect to the new point. I could do a numerical hessian numDeriv.

	https://tensorflow.rstudio.com/guides/tensorflow/autodiff


	or I could just use nn2 and say that the point is a weighted sum of k nearest neighbours.

	what is the derivative in that case?



	```{r}
	# first find a point where the two models are equal

	pStar <- optim(c(n=1000, delta=100, width=20), function(p) abs( predict(ms[[1]], as.list(p)) - predict(ms[[2]], as.list(p))),
	method="L-BFGS-B", lower=c(20, 20, 2), upper=c(10000, 1000, 100))
	```

	```{r}
	# this is too slow.
	curvature <- function(p) {
	f <- function(q) { newdata <- as.list(q) %>% setNames(c("n", "delta", "width"))
	predict(ms[[1]], newdata) - predict(ms[[2]], newdata)
	}
	H <- numDeriv::hessian(f, p)
	G <- numDeriv::grad(f, p)
	# ref/basic.pdf
	G %% (det(H) solve(H)) %*% G / (sum(G^2))
	}
	tic()
	curvature(pStar$par)
	toc()
	```

	```{r}
	library(pacman)
	p_load(mvtnorm, tidyverse, ggplot2, lhs)
	source("stat_interp.R")
	n <- 200 # Number of observations
	theta <- 0.1 # length scale (smaller == more bumpy)

	# Generate data using mvtnorm
	data <- data.frame(randomLHS(n, 2)) %>% as_tibble %>% setNames(c("x", "y"))
	data$z <- as.numeric(rmvnorm(1, sigma = exp(-as.matrix(dist(data)/ theta)) ))

	data_reg <- with(data, {
	o <- expand_grid(xo=seq(min(x), max(x), length.out=100), yo=seq(min(y), max(y), length.out=100))
	akima::interpp(x, y = y, z, yo=o$yo,xo=o$xo, extrap=T)
	}) %>% as_tibble

	ggplot(data, aes(x, y, z = z)) + stat_interp_contour_filled() + geom_point()


	nn_grad <- function(xy, z, new_xy) {
	if (length(new_xy) == ncol(xy)) new_xy <- matrix(new_xy, ncol=ncol(xy)) # one row
	nn <- nn2(as.matrix(xy), matrix(new_xy, ncol=ncol(xy)), k = ncol(xy)+1) # or more because what if I don't have a basis?

	nn_svd <- svd(xy[ nn$nn.idx[1, ] , ])
	# the rows of nn_svd$v are the basis vectors
	# so if I multiply by nn_svd$v I get the transformation
	# back to (1,0) (0, 1) etc.

	}
	nn_grad(data %>% select(-z), data$z, c(0.5, 0.5))
	```