Skip to content

Instantly share code, notes, and snippets.

@jeffreyiacono
Last active January 21, 2016 01:08
Show Gist options
  • Save jeffreyiacono/1bee231519346d82c0bb to your computer and use it in GitHub Desktop.
Save jeffreyiacono/1bee231519346d82c0bb to your computer and use it in GitHub Desktop.
Plot percentiles by boundaries
# Dependencies: ggplot2 for plotting, dplyr for ntile()/group_by()/mutate().
options(repos = c(CRAN = "https://cran.rstudio.com"))
# Install only the packages that are missing, so re-running the script does not
# reinstall them every time.
required_pkgs <- c("ggplot2", "dplyr")
missing_pkgs <- required_pkgs[
  !vapply(required_pkgs, requireNamespace, logical(1), quietly = TRUE)
]
if (length(missing_pkgs) > 0) {
  install.packages(missing_pkgs)
}
# library() stops with an error if a package cannot be attached, whereas
# require() only returns FALSE with a warning.
library(ggplot2)
library(dplyr)
# Number of simulated observations and number of x buckets to summarise over.
points <- 1e4
buckets <- 20
# Probabilities passed to quantile() for the boundary lines; the 99th
# percentile is requested separately where the boundaries are computed.
quantile_splits <- seq(0, 1, by = 0.25)
# One label per boundary line, and one hue per label (named by that label).
ntile_labels <- c(
  "0%-tile", "25%-tile", "50%-tile", "75%-tile", "99%-tile", "100%-tile"
)
ntile_colors <- setNames(
  scales::hue_pal()(length(ntile_labels)),
  ntile_labels
)
# Create sequential Xs and noisy Ys: a downward parabola in x plus normally
# distributed jitter whose spread grows with x.
# Why? Just for fun, and it looks cooler!
x <- seq_len(points) - 1
jitter <- rnorm(length(x), mean = 0, sd = 500)
# create dataframe + ntiles
df <- data.frame(
  x = x,
  scaled_y = (-1 / 10000) * x^2 + jitter * (1 + x / 1000)
)
# Assign every row to one of `buckets` rank-based groups of x.
df$x_ntile <- ntile(df$x, buckets)
# Group by x bucket and calculate: every row gets the percentile boundaries of
# scaled_y for its bucket, plus the quartile band the row itself falls in.
df <- df %>%
  group_by(x_ntile) %>%
  mutate(
    # names = FALSE returns a bare number instead of a vector named e.g. "25%".
    y_ntile_0 = quantile(scaled_y, quantile_splits[1], names = FALSE),
    y_ntile_025 = quantile(scaled_y, quantile_splits[2], names = FALSE),
    y_ntile_05 = quantile(scaled_y, quantile_splits[3], names = FALSE),
    y_ntile_075 = quantile(scaled_y, quantile_splits[4], names = FALSE),
    y_ntile_099 = quantile(scaled_y, 0.99, names = FALSE),
    y_ntile_1 = quantile(scaled_y, quantile_splits[5], names = FALSE),
    y_ntile = factor(
      ntile(scaled_y, 4),
      # Explicit levels keep the labels aligned even if a bucket is too small
      # to contain all four bands.
      levels = 1:4,
      labels = c("0-25%", "25-50%", "50-75%", "75-100%"),
      ordered = TRUE
    )
  ) %>%
  # Drop the grouping so later operations on df are not silently per-bucket.
  ungroup()
# Plot every point coloured by its within-bucket quartile band, overlay one
# line per percentile boundary, and label each line at the right-hand edge.
boundary_columns <- c(
  "y_ntile_0", "y_ntile_025", "y_ntile_05",
  "y_ntile_075", "y_ntile_099", "y_ntile_1"
)
# Each boundary column needs exactly one label and one colour.
stopifnot(
  length(boundary_columns) == length(ntile_labels),
  length(boundary_columns) == length(ntile_colors)
)
# One geom_line() + text annotation pair per boundary, in column order,
# flattened into a single list of layers that can be added to the plot.
boundary_layers <- unlist(
  Map(
    function(column, label, colour) {
      list(
        geom_line(aes(y = .data[[column]]), color = colour),
        annotate(
          geom = "text",
          x = points,
          # The label sits at the boundary value taken from the last row.
          y = df[[column]][nrow(df)],
          label = label,
          hjust = 0
        )
      )
    },
    boundary_columns,
    ntile_labels,
    unname(ntile_colors)
  ),
  recursive = FALSE,
  use.names = FALSE
)
ggplot(data = df, aes(x = x)) +
  geom_point(aes(y = scaled_y, color = y_ntile), alpha = 0.3) +
  boundary_layers +
  scale_color_discrete(
    guide = guide_legend(
      title = "Percentiles",
      # Legend keys are drawn fully opaque even though the points are not.
      override.aes = list(alpha = 1)
    )
  ) +
  scale_x_continuous(
    breaks = seq(0, points, by = points / buckets),
    # x starts at 0, so the lower limit has to include it; a lower limit of 1
    # would drop the first row from the plot.
    limits = c(0, points)
  ) +
  labs(
    x = "Xs",
    y = "some random Ys"
  )
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment