MatsuuraKentaro/calc-scale_best-example.R

## calc-scale_best-example.R
library(ggjoy)
library(dplyr)
library(purrr)

# yutani data ------------------------------------------------------------
set.seed(10)
l <- rerun(26, rnorm(1000, mean = runif(1), sd = sqrt(runif(1))))
names(l) <- LETTERS
l[[2]] <- c(rnorm(500, mean = -1, sd = 0.3), rnorm(500, mean = 1, sd = 0.3))
d <- as.data.frame(l) %>% tidyr::gather(id, value)


target_function <- function(scale) {
  p <- ggplot(d, aes(value, id)) +
    geom_joy(fill = alpha("skyblue4", 0.9), colour = "white", scale = scale)
  d_build <- as.data.frame(ggplot_build(p)$data)
  loss <- loss_function(d_build)
  return(loss)
}

loss_function <- function(d_build, lambda=0.01) {
  d_slim <- d_build %>% select(group, x, ymin, ymax)

  # group = g における分布の最大値と、group = g+1 における分布の底の隙間があると、美しくない。
  # そのため、隙間の合計値を loss1 としている。
  loss1 <- d_slim %>% group_by(group) %>%
    filter(ymax==max(ymax)) %>%
    mutate(space=ymin + 1 - ymax) %>%
    mutate(space=if_else(space < 0, 0, space)) %>%
    ungroup() %>% select(space) %>% unlist() %>% sum()

  # group = g における分布の最大値が どのgroupまで影響ありそうかをymaxの値をfloorして求めている。
  ymax_floor <- d_slim %>% group_by(group) %>%
    filter(ymax==max(ymax)) %>%
    mutate(ymax_floor=floor(ymax)) %>%
    ungroup() %>% select(ymax_floor) %>% unlist() %>% unname()
  ymax_floor <- ifelse(ymax_floor > max(d_slim$group), max(d_slim$group), ymax_floor)

  # 縦にx, 横にgroupが並ぶymaxのmatrixを求めている。
  ymax_mat <- d_slim %>% select(-ymin) %>%
    tidyr::spread(key=group, value=ymax) %>% select(-x)
  N_x <- nrow(ymax_mat)
  N_groups <- ncol(ymax_mat)

  # group = gの分布によって隠される部分が多いと美しくない。
  # そのため、groupがgより大きいところで、隠される分布の高さ（ymax - ymin）の合計をloss2としている。
  loss2 <- sum(sapply(seq_len(N_groups-1), function(i) {
    possible_groups <- i:ymax_floor[i]
    ymax_mat_possible <- ymax_mat[,possible_groups]
    ymin_mat_possible <- matrix(rep(possible_groups, each=N_x), nrow=N_x)
    overlap <- ymax_mat_possible - ymax_mat[,i] < 0.0
    hidden_height <- ymax_mat_possible - ymin_mat_possible
    if (all(!overlap)) 0.0 else sum(hidden_height[overlap])
  }))

  # loss1とloss2に適当な重みをかけて合計する。
  # labmdaは0.1から0.001ぐらいがよさそうだけど、とりあえず0.01固定でよいような。
  loss <- loss1 + lambda*loss2
  return(loss)
}

scale_best <- optim(3.0, target_function, method='Brent', lower=1, upper=10,
  control=list(maxit=100, reltol=0.01))$par

ggplot(d, aes(value, id)) +
  geom_joy(fill = alpha("skyblue4", 0.9), colour = "white", scale = scale_best) +
  theme_minimal()
	library(ggjoy)
	library(dplyr)
	library(purrr)

	# yutani data ------------------------------------------------------------
	set.seed(10)
	l <- rerun(26, rnorm(1000, mean = runif(1), sd = sqrt(runif(1))))
	names(l) <- LETTERS
	l[[2]] <- c(rnorm(500, mean = -1, sd = 0.3), rnorm(500, mean = 1, sd = 0.3))
	d <- as.data.frame(l) %>% tidyr::gather(id, value)


	target_function <- function(scale) {
	p <- ggplot(d, aes(value, id)) +
	geom_joy(fill = alpha("skyblue4", 0.9), colour = "white", scale = scale)
	d_build <- as.data.frame(ggplot_build(p)$data)
	loss <- loss_function(d_build)
	return(loss)
	}

	loss_function <- function(d_build, lambda=0.01) {
	d_slim <- d_build %>% select(group, x, ymin, ymax)

	# group = g における分布の最大値と、group = g+1 における分布の底の隙間があると、美しくない。
	# そのため、隙間の合計値を loss1 としている。
	loss1 <- d_slim %>% group_by(group) %>%
	filter(ymax==max(ymax)) %>%
	mutate(space=ymin + 1 - ymax) %>%
	mutate(space=if_else(space < 0, 0, space)) %>%
	ungroup() %>% select(space) %>% unlist() %>% sum()

	# group = g における分布の最大値がどのgroupまで影響ありそうかをymaxの値をfloorして求めている。
	ymax_floor <- d_slim %>% group_by(group) %>%
	filter(ymax==max(ymax)) %>%
	mutate(ymax_floor=floor(ymax)) %>%
	ungroup() %>% select(ymax_floor) %>% unlist() %>% unname()
	ymax_floor <- ifelse(ymax_floor > max(d_slim$group), max(d_slim$group), ymax_floor)

	# 縦にx, 横にgroupが並ぶymaxのmatrixを求めている。
	ymax_mat <- d_slim %>% select(-ymin) %>%
	tidyr::spread(key=group, value=ymax) %>% select(-x)
	N_x <- nrow(ymax_mat)
	N_groups <- ncol(ymax_mat)

	# group = gの分布によって隠される部分が多いと美しくない。
	# そのため、groupがgより大きいところで、隠される分布の高さ（ymax - ymin）の合計をloss2としている。
	loss2 <- sum(sapply(seq_len(N_groups-1), function(i) {
	possible_groups <- i:ymax_floor[i]
	ymax_mat_possible <- ymax_mat[,possible_groups]
	ymin_mat_possible <- matrix(rep(possible_groups, each=N_x), nrow=N_x)
	overlap <- ymax_mat_possible - ymax_mat[,i] < 0.0
	hidden_height <- ymax_mat_possible - ymin_mat_possible
	if (all(!overlap)) 0.0 else sum(hidden_height[overlap])
	}))

	# loss1とloss2に適当な重みをかけて合計する。
	# labmdaは0.1から0.001ぐらいがよさそうだけど、とりあえず0.01固定でよいような。
	loss <- loss1 + lambda*loss2
	return(loss)
	}

	scale_best <- optim(3.0, target_function, method='Brent', lower=1, upper=10,
	control=list(maxit=100, reltol=0.01))$par

	ggplot(d, aes(value, id)) +
	geom_joy(fill = alpha("skyblue4", 0.9), colour = "white", scale = scale_best) +
	theme_minimal()