Skip to content

Instantly share code, notes, and snippets.

@abikoushi
Created March 24, 2019 21:10
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save abikoushi/d7e443774155de3934b754cf1a66625a to your computer and use it in GitHub Desktop.
Save abikoushi/d7e443774155de3934b754cf1a66625a to your computer and use it in GitHub Desktop.
require(ggplot2)
####utilities
"%||%" <- function(a, b) {
if (!is.null(a)) a else b
}
rbind_dfs <- function(dfs) {
out <- list()
columns <- unique(unlist(lapply(dfs, names)))
nrows <- vapply(dfs, .row_names_info, integer(1), type = 2L)
total <- sum(nrows)
if (length(columns) == 0) return(new_data_frame(list(), total))
allocated <- rep(FALSE, length(columns))
names(allocated) <- columns
col_levels <- list()
for (df in dfs) {
new_columns <- intersect(names(df), columns[!allocated])
for (col in new_columns) {
if (is.factor(df[[col]])) {
all_factors <- all(vapply(dfs, function(df) {
val <- .subset2(df, col)
is.null(val) || is.factor(val)
}, logical(1)))
if (all_factors) {
col_levels[[col]] <- unique(unlist(lapply(dfs, function(df) levels(.subset2(df, col)))))
}
out[[col]] <- rep(NA_character_, total)
} else {
out[[col]] <- rep(.subset2(df, col)[1][NA], total)
}
}
allocated[new_columns] <- TRUE
if (all(allocated)) break
}
pos <- c(cumsum(nrows) - nrows + 1)
for (i in seq_along(dfs)) {
df <- dfs[[i]]
rng <- seq(pos[i], length.out = nrows[i])
for (col in names(df)) {
if (inherits(df[[col]], 'factor')) {
out[[col]][rng] <- as.character(df[[col]])
} else {
out[[col]][rng] <- df[[col]]
}
}
}
for (col in names(col_levels)) {
out[[col]] <- factor(out[[col]], levels = col_levels[[col]])
}
attributes(out) <- list(class = "data.frame", names = names(out), row.names = .set_row_names(total))
out
}
dapply <- function(df, by, fun, ..., drop = TRUE) {
grouping_cols <- .subset(df, by)
ids <- id(grouping_cols, drop = drop)
group_rows <- split(seq_len(nrow(df)), ids)
rbind_dfs(lapply(seq_along(group_rows), function(i) {
cur_data <- df_rows(df, group_rows[[i]])
res <- fun(cur_data, ...)
if (is.null(res)) return(res)
if (length(res) == 0) return(new_data_frame())
vars <- lapply(setNames(by, by), function(col) .subset2(cur_data, col)[1])
if (is.matrix(res)) res <- split_matrix(res)
if (is.null(names(res))) names(res) <- paste0("V", seq_along(res))
new_data_frame(modify_list(unclass(vars), unclass(res)))
}))
}
df_rows <- function(x, i) {
new_data_frame(lapply(x, `[`, i = i))
}
new_data_frame <- function(x = list(), n = NULL) {
if (length(x) != 0 && is.null(names(x))) stop("Elements must be named", call. = FALSE)
lengths <- vapply(x, length, integer(1))
if (is.null(n)) {
n <- if (length(x) == 0) 0 else max(lengths)
}
for (i in seq_along(x)) {
if (lengths[i] == n) next
if (lengths[i] != 1) stop("Elements must equal the number of rows or 1", call. = FALSE)
x[[i]] <- rep(x[[i]], n)
}
class(x) <- "data.frame"
attr(x, "row.names") <- .set_row_names(n)
x
}
modify_list <- function(old, new) {
for (i in names(new)) old[[i]] <- new[[i]]
old
}
id_var <- function(x, drop = FALSE) {
if (length(x) == 0) {
id <- integer()
n = 0L
} else if (!is.null(attr(x, "n")) && !drop) {
return(x)
} else if (is.factor(x) && !drop) {
x <- addNA(x, ifany = TRUE)
id <- as.integer(x)
n <- length(levels(x))
} else {
levels <- sort(unique(x), na.last = TRUE)
id <- match(x, levels)
n <- max(id)
}
attr(id, "n") <- n
id
}
id <- function(.variables, drop = FALSE) {
nrows <- NULL
if (is.data.frame(.variables)) {
nrows <- nrow(.variables)
.variables <- unclass(.variables)
}
lengths <- vapply(.variables, length, integer(1))
.variables <- .variables[lengths != 0]
if (length(.variables) == 0) {
n <- nrows %||% 0L
id <- seq_len(n)
attr(id, "n") <- n
return(id)
}
if (length(.variables) == 1) {
return(id_var(.variables[[1]], drop = drop))
}
ids <- rev(lapply(.variables, id_var, drop = drop))
p <- length(ids)
ndistinct <- vapply(ids, attr, "n", FUN.VALUE = numeric(1), USE.NAMES = FALSE)
n <- prod(ndistinct)
if (n > 2^31) {
char_id <- do.call("paste", c(ids, sep = "\r"))
res <- match(char_id, unique(char_id))
}
else {
combs <- c(1, cumprod(ndistinct[-p]))
mat <- do.call("cbind", ids)
res <- c((mat - 1L) %*% combs + 1L)
}
if (drop) {
id_var(res, drop = TRUE)
}
else {
res <- as.integer(res)
attr(res, "n") <- n
res
}
}
####
draw_key_grid <- function(data, params, size) {
grid::rectGrob(0.5, 0.5, 0.5, 0.5,
gp = grid::gpar(
col = alpha(data$colour, data$alpha),
fill = alpha(data$fill, data$alpha)
)
)
}
geom_grid <- function(mapping = NULL, data = NULL,
position = "identity",
...,
bins = NULL,
binwidth = NULL,
binaxis = "y",
method = "dotdensity",
binpositions = "all",
stackdir = "up",
stackratio = 1,
dotsize = 1,
stackgroups = TRUE,
origin = NULL,
right = TRUE,
width = 0.9,
drop = FALSE,
na.rm = FALSE,
show.legend = NA,
inherit.aes = TRUE) {
# If identical(position, "stack") or position is position_stack(), tell them
# to use stackgroups=TRUE instead. Need to use identical() instead of ==,
# because == will fail if object is position_stack() or position_dodge()
if (!is.null(position) &&
(identical(position, "stack") || (inherits(position, "PositionStack"))))
message("position=\"stack\" doesn't work properly with geom_dotplot. Use stackgroups=TRUE instead.")
if (stackgroups && method == "dotdensity" && binpositions == "bygroup")
message('geom_dotplot called with stackgroups=TRUE and method="dotdensity". You probably want to set binpositions="all"')
layer(
data = data,
mapping = mapping,
stat = StatBingrid,
geom = GeomGrid,
position = position,
show.legend = show.legend,
inherit.aes = inherit.aes,
# Need to make sure that the binaxis goes to both the stat and the geom
params = list(
binaxis = binaxis,
bins = bins,
binwidth = binwidth,
binpositions = binpositions,
method = method,
origin = origin,
right = right,
width = width,
drop = drop,
stackdir = stackdir,
stackratio = stackratio,
dotsize = dotsize,
stackgroups = stackgroups,
na.rm = na.rm,
...
)
)
}
GeomGrid <- ggproto("GeomGrid", Geom,
required_aes = c("x", "y"),
non_missing_aes = c("size", "shape"),
default_aes = aes(colour = "black", fill = "white", alpha = NA, stroke = 1, linetype = "solid"),
setup_data = function(data, params) {
data$width <- data$width %||%
params$width %||% (resolution(data$x, FALSE) * 0.9)
# Set up the stacking function and range
if (is.null(params$stackdir) || params$stackdir == "up") {
stackdots <- function(a) a - .5
stackaxismin <- 0
stackaxismax <- 1
} else if (params$stackdir == "down") {
stackdots <- function(a) -a + .5
stackaxismin <- -1
stackaxismax <- 0
} else if (params$stackdir == "center") {
stackdots <- function(a) a - 1 - max(a - 1) / 2
stackaxismin <- -.5
stackaxismax <- .5
} else if (params$stackdir == "centerwhole") {
stackdots <- function(a) a - 1 - floor(max(a - 1) / 2)
stackaxismin <- -.5
stackaxismax <- .5
}
# Fill the bins: at a given x (or y), if count=3, make 3 entries at that x
data <- data[rep(1:nrow(data), data$count), ]
# Next part will set the position of each dot within each stack
# If stackgroups=TRUE, split only on x (or y) and panel; if not stacking, also split by group
# plyvars <- params$binaxis %||% "x"
plyvars <- params$binaxis
plyvars <- c(plyvars, "x", "PANEL")
if (is.null(params$stackgroups) || !params$stackgroups)
plyvars <- c(plyvars, "group")
# Within each x, or x+group, set countidx=1,2,3, and set stackpos according to stack function
data <- dapply(data, plyvars, function(xx) {
xx$countidx <- 1:nrow(xx)
xx$stackpos <- stackdots(xx$countidx)
xx
})
# Set the bounding boxes for the dots
if (is.null(params$binaxis) || params$binaxis == "x") {
# ymin, ymax, xmin, and xmax define the bounding rectangle for each stack
# Can't do bounding box per dot, because y position isn't real.
# After position code is rewritten, each dot should have its own bounding box.
data$xmin <- data$x - data$binwidth / 2
data$xmax <- data$x + data$binwidth / 2
data$ymin <- stackaxismin
data$ymax <- stackaxismax
data$y <- 0
} else if (params$binaxis == "y") {
# ymin, ymax, xmin, and xmax define the bounding rectangle for each stack
# Can't do bounding box per dot, because x position isn't real.
# xmin and xmax aren't really the x bounds, because of the odd way the grob
# works. They're just set to the standard x +- width/2 so that dot clusters
# can be dodged like other geoms.
# After position code is rewritten, each dot should have its own bounding box.
data <- dapply(data, c("group", "PANEL"), transform,
ymin = min(y) - binwidth[1] / 2,
ymax = max(y) + binwidth[1] / 2)
data$xmin <- data$x + data$width * stackaxismin
data$xmax <- data$x + data$width * stackaxismax
# Unlike with y above, don't change x because it will cause problems with dodging
}
data
},
draw_group = function(data, panel_params, coord, na.rm = FALSE,
binaxis = "x", stackdir = "up", stackratio = 1,
dotsize = 1, stackgroups = FALSE) {
if (!coord$is_linear()) {
warning("geom_dotplot does not work properly with non-linear coordinates.")
}
tdata <- coord$transform(data, panel_params)
# Swap axes if using coord_flip
if (inherits(coord, "CoordFlip"))
binaxis <- ifelse(binaxis == "x", "y", "x")
if (binaxis == "x") {
stackaxis = "y"
dotdianpc <- dotsize * tdata$binwidth[1] / (max(panel_params$x.range) - min(panel_params$x.range))
} else if (binaxis == "y") {
stackaxis = "x"
dotdianpc <- dotsize * tdata$binwidth[1] / (max(panel_params$y.range) - min(panel_params$y.range))
}
ggplot2:::ggname("geom_grid",
rectstackGrob(stackaxis = stackaxis, x = tdata$x, y = tdata$y, dotdia = dotdianpc,
stackposition = tdata$stackpos, stackratio = stackratio,
default.units = "npc",
gp = grid::gpar(col = alpha(tdata$colour, tdata$alpha),
fill = alpha(tdata$fill, tdata$alpha),
lwd = tdata$stroke, lty = tdata$linetype))
)
},
draw_key = draw_key_grid
)
rectstackGrob <- function(
x = unit(0.5, "npc"), # x pos of the dotstack's origin
y = unit(0.5, "npc"), # y pos of the dotstack's origin
stackaxis = "y",
dotdia = unit(1, "npc"), # Dot diameter in the non-stack axis, should be in npc
stackposition = 0, # Position of each dot in the stack, relative to origin
stackratio = 1, # Stacking height of dots (.75 means 25% dot overlap)
default.units = "npc", name = NULL, gp = gpar(), vp = NULL)
{
if (!grid::is.unit(x))
x <- unit(x, default.units)
if (!grid::is.unit(y))
y <- unit(y, default.units)
if (!grid::is.unit(dotdia))
dotdia <- unit(dotdia, default.units)
if (attr(dotdia,"unit") != "npc")
warning("Unit type of dotdia should be 'npc'")
grid::grob(x = x, y = y, stackaxis = stackaxis, dotdia = dotdia,
stackposition = stackposition, stackratio = stackratio,
name = name, gp = gp, vp = vp, cl = "rectstackGrob")
}
makeContext.rectstackGrob <- function(x, recording = TRUE) {
# Need absolute coordinates because when using npc coords with circleGrob,
# the radius is in the _smaller_ of the two axes. We need the radius
# to instead be defined in terms of the non-stack axis.
xmm <- grid::convertX(x$x, "mm", valueOnly = TRUE)
ymm <- grid::convertY(x$y, "mm", valueOnly = TRUE)
if (x$stackaxis == "x") {
dotdiamm <- grid::convertY(x$dotdia, "mm", valueOnly = TRUE)
xpos <- xmm + dotdiamm * (x$stackposition * x$stackratio + (1 - x$stackratio) / 2)
ypos <- ymm
} else if (x$stackaxis == "y") {
dotdiamm <- grid::convertX(x$dotdia, "mm", valueOnly = TRUE)
xpos <- xmm
ypos <- ymm + dotdiamm * (x$stackposition * x$stackratio + (1 - x$stackratio) / 2)
}
grid::rectGrob(
x = xpos, y = ypos, width = dotdiamm , height = dotdiamm ,
default.units = "mm",
name = x$name, gp = x$gp, vp = x$vp
)
}
densitybin2 <- function(x, weight = NULL, bins = NULL, binwidth = NULL, method = method, range = NULL) {
if (length(stats::na.omit(x)) == 0) return(new_data_frame())
if (is.null(weight)) weight <- rep(1, length(x))
weight[is.na(weight)] <- 0
if (is.null(range)) range <- range(x, na.rm = TRUE, finite = TRUE)
if (is.null(binwidth)) binwidth <- diff(range) / bins
# Sort weight and x, by x
weight <- weight[order(x)]
x <- x[order(x)]
cbin <- 0 # Current bin ID
bin <- rep.int(NA, length(x)) # The bin ID for each observation
binend <- -Inf # End position of current bin (scan left to right)
# Scan list and put dots in bins
for (i in 1:length(x)) {
# If past end of bin, start a new bin at this point
if (x[i] >= binend) {
binend <- x[i] + binwidth
cbin <- cbin + 1
}
bin[i] <- cbin
}
results <- new_data_frame(list(
x = x,
bin = bin,
binwidth = binwidth,
weight = weight
), n = length(x))
results <- dapply(results, "bin", function(df) {
df$bincenter = (min(df$x) + max(df$x)) / 2
return(df)
})
return(results)
}
StatBingrid <- ggproto("StatBingrid", Stat,
required_aes = "x",
non_missing_aes = "weight",
default_aes = aes(y = stat(count)),
setup_params = function(data, params) {
if (is.null(params$binwidth) && is.null(params$bins)) {
message("`stat_bingrid()` using `bins = 30`. Pick better value with `binwidth`.")
params$bins <- 30
}
params
},
compute_layer = function(self, data, params, panels) {
data <- remove_missing(data, params$na.rm,
params$binaxis,
ggplot2:::snake_class(self),
finite = TRUE
)
ggproto_parent(Stat, self)$compute_layer(data, params, panels)
},
compute_panel = function(self, data, scales, na.rm = FALSE, bins=NULL, binwidth = NULL,
binaxis = "x", method = "dotdensity",
binpositions = "bygroup", origin = NULL,
width = 0.9, drop = FALSE,
right = TRUE) {
# If using dotdensity and binning over all, we need to find the bin centers
# for all data before it's split into groups.
if (method == "dotdensity" && binpositions == "all") {
if (binaxis == "x") {
newdata <- densitybin2(x = data$x, weight = data$weight, bins = bins, binwidth = binwidth,
method = method)
data <- data[order(data$x), ]
newdata <- newdata[order(newdata$x), ]
} else if (binaxis == "y") {
newdata <- densitybin2(x = data$y, weight = data$weight, bins = bins, binwidth = binwidth,
method = method)
data <- data[order(data$y), ]
newdata <- newdata[order(newdata$x), ]
}
data$bin <- newdata$bin
data$binwidth <- newdata$binwidth
data$weight <- newdata$weight
data$bincenter <- newdata$bincenter
}
ggproto_parent(Stat, self)$compute_panel(data, scales, bins = bins, binwidth = binwidth,
binaxis = binaxis, method = method, binpositions = binpositions,
origin = origin, width = width, drop = drop,
right = right)
},
compute_group = function(self, data, scales, bins=NULL, binwidth = NULL, binaxis = "x",
method = "dotdensity", binpositions = "bygroup",
origin = NULL, width = 0.9, drop = FALSE,
right = TRUE) {
# This function taken from integer help page
is.wholenumber <- function(x, tol = .Machine$double.eps ^ 0.5) {
abs(x - round(x)) < tol
}
# Check that weights are whole numbers (for dots, weights must be whole)
if (!is.null(data$weight) && any(!is.wholenumber(data$weight)) &&
any(data$weight < 0)) {
stop("Weights for stat_bindot must be nonnegative integers.")
}
if (binaxis == "x") {
range <- scales$x$dimension()
values <- data$x
} else if (binaxis == "y") {
range <- scales$y$dimension()
values <- data$y
# The middle of each group, on the stack axis
midline <- mean(range(data$x))
}
if (method == "histodot") {
closed <- if (right) "right" else "left"
if (!is.null(binwidth)) {
bins <- bin_breaks_width(range, binwidth, boundary = origin, closed = closed)
} else {
bins <- bin_breaks_bins(range, 30, boundary = origin, closed = closed)
}
data <- bin_vector(values, bins, weight = data$weight, pad = FALSE)
# Change "width" column to "binwidth" for consistency
names(data)[names(data) == "width"] <- "binwidth"
names(data)[names(data) == "x"] <- "bincenter"
} else if (method == "dotdensity") {
# If bin centers are found by group instead of by all, find the bin centers
# (If binpositions=="all", then we'll already have bin centers.)
if (binpositions == "bygroup")
data <- densitybin2(x = values, weight = data$weight, bins = bins, binwidth = binwidth,
method = method, range = range)
# Collapse each bin and get a count
data <- dapply(data, "bincenter", function(x) {
new_data_frame(list(
binwidth = .subset2(x, "binwidth")[1],
count = sum(.subset2(x, "weight"))
))
})
if (sum(data$count, na.rm = TRUE) != 0) {
data$count[is.na(data$count)] <- 0
data$ncount <- data$count / max(abs(data$count), na.rm = TRUE)
if (drop) data <- subset(data, count > 0)
}
}
if (binaxis == "x") {
names(data)[names(data) == "bincenter"] <- "x"
# For x binning, the width of the geoms is same as the width of the bin
data$width <- data$binwidth
} else if (binaxis == "y") {
names(data)[names(data) == "bincenter"] <- "y"
# For y binning, set the x midline. This is needed for continuous x axis
data$x <- midline
}
return(data)
}
)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment