# nathancday/weighted_sampling_scratch.R

## weighted_sampling_scratch.R
#' ---
#' title: Code snippet for demo and graphic in Probability-Proportional-To-Size-Sampling blogpost
#' author: nathancday--at-Github--
#' date: 2022-10-24
#' ---


# Libs --------------------------------------------------------------------
library(cowplot)
library(tidyverse)


# Helpers -----------------------------------------------------------------

#' Draw one sample of queries and plot it over the full distribution
#'
#' Samples `n` rows from `data`, either uniformly (simple random sample) or
#' with probability proportional to `freq`, then draws the sampled bars on top
#' of white bars for every row of `data`. The share of total `freq` covered by
#' the sample is printed on the plot.
#'
#' @param x Value appended to the plot title after "SRS #" / "PPTSS #".
#' @param weight If `TRUE`, rows are sampled with probability proportional to
#'   `freq`; if `FALSE` (default), every row is equally likely.
#' @param data Data frame with `query` and `freq` columns. Defaults to the
#'   script-level `dat`, which is looked up when the function is called.
#' @param n Number of rows to sample (without replacement). If `data` has
#'   fewer than `n` rows, all rows are returned.
#' @return A ggplot object.
sample_and_plot <- function(x, weight = FALSE, data = dat, n = 50) {
  if (weight) {
    # this is the magic ... weight_by = freq
    sample_dat <- slice_sample(data, n = n, weight_by = freq)
    plt_title <- paste0("PPTSS #", x)
  } else {
    sample_dat <- slice_sample(data, n = n)
    plt_title <- paste0("SRS #", x)
  }

  # Fraction of all traffic accounted for by the sampled queries.
  sample_pct_lbl <- scales::percent(sum(sample_dat$freq) / sum(data$freq))

  ggplot(data, aes(query, freq)) +
    # White bars for every query act as a backdrop for the sampled ones.
    geom_col(fill = "white") +
    geom_col(data = sample_dat) +
    # The label sits at a fixed position in plot coordinates.
    annotate(
      "text",
      x = 400, y = 7000,
      label = paste0("Sample covers: ", sample_pct_lbl)
    ) +
    scale_x_discrete(labels = NULL) +
    labs(title = plt_title, y = NULL, x = NULL)
}


# Ingest -----------------------------------------------------------------

# CSV data from "AI Powered Search" by Grainger, Turnbull, Irwin
# https://github.com/o19s/visualizing-signals#step-by-step-setup
#
# Keep only query signals, count how often each lower-cased query occurs,
# retain the 500 most frequent, and freeze that ordering as factor levels so
# plots show queries from most to least frequent.
dat <- "weighted_sampling/signals.csv" %>%
  read_csv() %>%
  filter(type == "query") %>%
  mutate(query = tolower(target)) %>%
  count(query, name = "freq") %>%
  arrange(desc(freq)) %>%
  head(500) %>%
  mutate(query = fct_inorder(query))


# Viz ---------------------------------------------------------------------

ylab <- "Traffic frequency"
xlab <- "Individual queries\nMost frequent  ---> Least frequent"

# Top panel: every query in `dat` as one bar.
p_all <- ggplot(dat, aes(query, freq)) +
  geom_col() +
  scale_x_discrete(labels = NULL) +
  labs(
    title = "All queries sorted by traffic frequency",
    y = ylab, x = xlab
  )

# Left column: five unweighted samples. Right column: five samples weighted
# by freq. No seed is set, so every run draws different samples.
p_uni_li <- map(1:5, function(i) sample_and_plot(i))
p_uni <- cowplot::plot_grid(plotlist = p_uni_li, ncol = 1)

p_wt_li <- map(1:5, function(i) sample_and_plot(i, weight = TRUE))
p_wt <- cowplot::plot_grid(plotlist = p_wt_li, ncol = 1)

p_samples <- cowplot::plot_grid(p_uni, p_wt, nrow = 1)

# Keep a handle on the composed figure so it can be saved explicitly below.
p_final <- cowplot::plot_grid(
  p_all, p_samples,
  ncol = 1, rel_heights = c(0.33, .66)
)
p_final


# Output ------------------------------------------------------------------

# Pass the figure explicitly: ggsave() otherwise falls back to last_plot(),
# which silently changes if any other ggplot is created in between.
ggsave("sampling.png", plot = p_final, height = 7, width = 8)
	#' ---
	#' title: Code snippet for demo and graphic in Probability-Proportional-To-Size-Sampling blogpost
	#' author: nathancday--at-Github--
	#' date: 2022-10-24
	#' ---


	# Libs --------------------------------------------------------------------
	library(cowplot)
	library(tidyverse)


	# Helpers -----------------------------------------------------------------

	#' Draw one sample of queries and plot it over the full distribution
	#'
	#' Samples `n` rows from `data`, either uniformly (simple random sample) or
	#' with probability proportional to `freq`, then draws the sampled bars on
	#' top of white bars for every row of `data`. The share of total `freq`
	#' covered by the sample is printed on the plot.
	#'
	#' @param x Value appended to the plot title after "SRS #" / "PPTSS #".
	#' @param weight If `TRUE`, rows are sampled with probability proportional
	#'   to `freq`; if `FALSE` (default), every row is equally likely.
	#' @param data Data frame with `query` and `freq` columns. Defaults to the
	#'   script-level `dat`, which is looked up when the function is called.
	#' @param n Number of rows to sample (without replacement). If `data` has
	#'   fewer than `n` rows, all rows are returned.
	#' @return A ggplot object.
	sample_and_plot <- function(x, weight = FALSE, data = dat, n = 50) {
	  if (weight) {
	    # this is the magic ... weight_by = freq
	    sample_dat <- slice_sample(data, n = n, weight_by = freq)
	    plt_title <- paste0("PPTSS #", x)
	  } else {
	    sample_dat <- slice_sample(data, n = n)
	    plt_title <- paste0("SRS #", x)
	  }

	  # Fraction of all traffic accounted for by the sampled queries.
	  sample_pct_lbl <- scales::percent(sum(sample_dat$freq) / sum(data$freq))

	  ggplot(data, aes(query, freq)) +
	    # White bars for every query act as a backdrop for the sampled ones.
	    geom_col(fill = "white") +
	    geom_col(data = sample_dat) +
	    # The label sits at a fixed position in plot coordinates.
	    annotate(
	      "text",
	      x = 400, y = 7000,
	      label = paste0("Sample covers: ", sample_pct_lbl)
	    ) +
	    scale_x_discrete(labels = NULL) +
	    labs(title = plt_title, y = NULL, x = NULL)
	}


	# Ingest -----------------------------------------------------------------

	# CSV data from "AI Powered Search" by Grainger, Turnbull, Irwin
	# https://github.com/o19s/visualizing-signals#step-by-step-setup
	#
	# Keep only query signals, count how often each lower-cased query occurs,
	# retain the 500 most frequent, and freeze that ordering as factor levels
	# so plots show queries from most to least frequent.
	dat <- "weighted_sampling/signals.csv" %>%
	  read_csv() %>%
	  filter(type == "query") %>%
	  mutate(query = tolower(target)) %>%
	  count(query, name = "freq") %>%
	  arrange(desc(freq)) %>%
	  head(500) %>%
	  mutate(query = fct_inorder(query))


	# Viz ---------------------------------------------------------------------

	ylab <- "Traffic frequency"
	xlab <- "Individual queries\nMost frequent ---> Least frequent"

	# Top panel: every query in `dat` as one bar.
	p_all <- ggplot(dat, aes(query, freq)) +
	  geom_col() +
	  scale_x_discrete(labels = NULL) +
	  labs(
	    title = "All queries sorted by traffic frequency",
	    y = ylab, x = xlab
	  )

	# Left column: five unweighted samples. Right column: five samples weighted
	# by freq. No seed is set, so every run draws different samples.
	p_uni_li <- map(1:5, function(i) sample_and_plot(i))
	p_uni <- cowplot::plot_grid(plotlist = p_uni_li, ncol = 1)

	p_wt_li <- map(1:5, function(i) sample_and_plot(i, weight = TRUE))
	p_wt <- cowplot::plot_grid(plotlist = p_wt_li, ncol = 1)

	p_samples <- cowplot::plot_grid(p_uni, p_wt, nrow = 1)

	# Keep a handle on the composed figure so it can be saved explicitly below.
	p_final <- cowplot::plot_grid(
	  p_all, p_samples,
	  ncol = 1, rel_heights = c(0.33, .66)
	)
	p_final


	# Output ------------------------------------------------------------------

	# Pass the figure explicitly: ggsave() otherwise falls back to last_plot(),
	# which silently changes if any other ggplot is created in between.
	ggsave("sampling.png", plot = p_final, height = 7, width = 8)