Skip to content

Instantly share code, notes, and snippets.

@nathancday
Created October 24, 2022 11:47
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save nathancday/193138e156129d2535ea6682adf20972 to your computer and use it in GitHub Desktop.
Save nathancday/193138e156129d2535ea6682adf20972 to your computer and use it in GitHub Desktop.
Code snippet for demo and graphic in Probability-Proportional-To-Size-Sampling blogpost
#' ---
#' title: Code snippet for demo and graphic in Probability-Proportional-To-Size-Sampling blogpost
#' author: nathancday--at-Github--
#' date: 2022-10-24
#' ---
# Libs --------------------------------------------------------------------
library(cowplot)
library(tidyverse)
# Helpers -----------------------------------------------------------------
# Draw one sample of queries and plot it on top of the full distribution.
#
# Args:
#   x:      Label appended to the plot title (callers pass a replicate index).
#   weight: FALSE (default) draws a simple random sample (SRS); TRUE samples
#           with probability proportional to `freq` (PPTSS).
#   data:   Data frame with `query` and `freq` columns. Defaults to the
#           script-level `dat`, resolved lazily when the function is called.
#   n:      Number of rows to sample. Defaults to 50.
#
# Returns:
#   A ggplot object: all of `data` as white bars, the sampled rows drawn over
#   them in the default fill, and a text label giving the share of total
#   `freq` covered by the sample.
sample_and_plot = function(x, weight = FALSE, data = dat, n = 50) {
  # slice_sample() supersedes sample_n(); note it truncates silently (rather
  # than erroring) when `n` exceeds the number of rows.
  if (weight) {
    # this is the magic ... weight_by = freq
    sample_dat = slice_sample(data, n = n, weight_by = freq)
    plt_title = paste0("PPTSS #", x)
  } else {
    sample_dat = slice_sample(data, n = n)
    plt_title = paste0("SRS #", x)
  }

  # Share of total traffic captured by the sampled queries.
  sample_pct_lbl = scales::percent(sum(sample_dat$freq) / sum(data$freq))

  ggplot(data, aes(query, freq)) +
    geom_col(fill = "white") +
    geom_col(data = sample_dat) +
    # Label position is fixed in data coordinates (x = 400th discrete level,
    # y = 7000); it was chosen for the demo dataset.
    annotate("text", x = 400, y = 7000,
             label = paste0("Sample covers: ", sample_pct_lbl)) +
    scale_x_discrete(labels = NULL) +
    labs(title = plt_title, y = NULL, x = NULL)
}
# Ingest -----------------------------------------------------------------
# CSV data from "AI Powered Search" by Grainger, Turnbull, Irwin
# https://github.com/o19s/visualizing-signals#step-by-step-setup
#
# Keep only query signals, tally each lower-cased query, and retain the 500
# most frequent. `query` becomes a factor whose levels follow that frequency
# order so the bars plot from most to least frequent.
dat = read_csv("weighted_sampling/signals.csv") %>%
  filter(type == "query") %>%
  count(query = tolower(target), name = "freq") %>%
  arrange(desc(freq)) %>%
  slice_head(n = 500) %>%
  mutate(query = fct_inorder(query))
# Viz ---------------------------------------------------------------------
ylab = "Traffic frequency"
xlab = "Individual queries\nMost frequent ---> Least frequent"

# Top panel: every query, ordered by traffic frequency.
p_all = ggplot(dat, aes(query, freq)) +
  geom_col() +
  scale_x_discrete(labels = NULL) +
  labs(title = "All queries sorted by traffic frequency",
       y = ylab, x = xlab)

# Five replicate draws per sampling strategy, each strategy stacked in its
# own column: uniform (SRS) on the left, weighted (PPTSS) on the right.
p_uni_li = map(1:5, ~ sample_and_plot(.x))
p_uni = cowplot::plot_grid(plotlist = p_uni_li, ncol = 1)
p_wt_li = map(1:5, ~ sample_and_plot(.x, weight = TRUE))
p_wt = cowplot::plot_grid(plotlist = p_wt_li, ncol = 1)
p_samples = cowplot::plot_grid(p_uni, p_wt, nrow = 1)

# Final figure: overview on top, sample grids below (heights roughly 1:2).
p_final = cowplot::plot_grid(p_all, p_samples, ncol = 1,
                             rel_heights = c(0.33, .66))
p_final

# Output ------------------------------------------------------------------
# Pass the composite explicitly instead of relying on ggsave()'s implicit
# last_plot() default, which breaks if any other plot is built in between.
ggsave("sampling.png", plot = p_final, height = 7, width = 8)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment