Last active
January 12, 2019 13:56
-
-
Save eric-czech/bf63a4833cf991a1595e3fc503856c4f to your computer and use it in GitHub Desktop.
R ggplot 2D scatterplot colored by point density (with optional sampling to improve performance as well as edge case handling)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#' Get density associated with points in 2D space | |
#' | |
#' @param x Vector of numeric values | |
#' @param y Vector of numeric values; must have equal length to \code{x} | |
#' @param nbins Grid size used for 2D kde. Larger numbers mean density bins are | |
#' more continuous and at around 50 visible discretization begins to disappear | |
#' in scatterplots (though lower numbers will improve speed) | |
#' @param limit Maximum number of points to use for kde estimation | |
#' @param ... Other arguments for MASS::kde2d | |
#' @return A vector of densities with length equal to length of \code{x} and \code{y}. | |
#' Note that if the given vectors are empty, an empty numeric vector is returned | |
get_density <- function(x, y, nbins=50, limit=Inf, ...) { | |
if (length(x) != length(y)){ | |
stop('Length of x and y vectors must match') | |
} | |
n <- length(x) | |
# Return empty vector if input empty | |
if (n < 1){ | |
return(numeric()) | |
# Return scalar density of 1 if only one point given | |
} else if (n < 2){ | |
return(1) | |
} | |
# Determine data to be used for KDE estimation | |
xs <- x | |
ys <- y | |
if (n > limit){ | |
idx <- base::sample(1:n, limit, replace=FALSE) | |
xs <- x[idx] | |
ys <- y[idx] | |
} | |
# kde2d will attempt to estimate bandwidths using bandwidth.nrd but | |
# this will return 0, causing kde2d to fail, if x or y have very low or no | |
# variance so this check will preempt that condition and return 0s instead | |
bandwidths <- c(MASS::bandwidth.nrd(x), MASS::bandwidth.nrd(y)) | |
if (any(bandwidths <= 0)){ | |
return(rep(0, n)) | |
} | |
# Assign density for each point in original vectors | |
dens <- MASS::kde2d(xs, ys, h=bandwidths, n=nbins, ...) | |
ix <- base::findInterval(x, dens$x, all.inside=TRUE) | |
iy <- base::findInterval(y, dens$y, all.inside=TRUE) | |
ii <- cbind(ix, iy) | |
return(dens$z[ii]) | |
} | |
# To test density colored scatterplot: | |
# data.frame(x=rnorm(25000), y=rnorm(25000)) %>% | |
# dplyr::mutate(density=get_density(x, y)) %>% | |
# ggplot2::ggplot(aes(x=x, y=y, color=density)) + | |
# geom_point(size=.3, alpha=.5) + | |
# scale_color_gradientn(colours = rev(RColorBrewer::brewer.pal(11,"Spectral"))) + | |
# theme_bw() + | |
# theme( | |
# panel.grid.major = element_blank(), | |
# panel.grid.minor = element_blank(), | |
# panel.background = element_blank() | |
# ) | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment