Created
August 28, 2023 16:26
-
-
Save tomsing1/27cce955193aede83038e4422bbac9e4 to your computer and use it in GitHub Desktop.
R script to create a dataset similar to Yanai and Lercher, Selective attention in hypothesis-driven data analysis, bioRxiv, 2020
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# source: Matt Dray's blog: https://www.rostrum.blog/posts/2021-10-05-gorilla/
library(magick)

# Download the gorilla outline clipart into a temporary .jpg file and read it.
img_file <- tempfile(fileext = ".jpg")
download.file(
  url = paste0(
    "https://classroomclipart.com/images/gallery/",
    "Clipart/Black_and_White_Clipart/Animals/",
    "gorilla-waving-cartoon-black-white-outline-clipart-914.jpg"
  ),
  destfile = img_file,
  # A JPEG is binary data: ask for a binary transfer so that Windows does not
  # treat it as text and rewrite line endings, which would corrupt the image.
  mode = "wb"
)
img <- magick::image_read(img_file)
# Turn the picture into point data, working from the inside out:
#   1. image_quantize(): reduce the image to two colours only (one for the
#      lines, one for the background);
#   2. image_raster(): convert the image into x-y data, one row per pixel;
#   3. as.data.frame(): make sure the result is a plain data frame.
go <- as.data.frame(
  magick::image_raster(
    magick::image_quantize(img, 2)
  )
)
# Identify the colours used in the two-colour image.
colors <- unique(go$col)

# Mirror the y values so the gorilla is right-side up. Each value is mapped to
# its counterpart (smallest <-> largest) directly, so the flip does not depend
# on the order in which the rows happen to be stored.
go$y <- max(go$y) + min(go$y) - go$y

# Keep only the points that represent lines: every pixel whose colour differs
# from the first colour found, which is taken to be the background.
go <- go[go$col != colors[1], ]

# Rescale y and x to create 'Body Mass Index' (BMI) and 'steps' variables:
#   bmi   = 15 + 17 * y / max(y)
#   steps = 15000 * (1 - x / max(x)), i.e. a larger x means fewer steps.
go$bmi <- go$y / max(go$y) * 17 + 15
go$steps <- 15000 - go$x * 15000 / max(go$x)

# Remove the watermark: flag the points in the low-BMI / low-steps corner and
# drop them. The comparison is already a logical vector, so it is stored as is.
go$logo <- go$bmi < 16 & go$steps < 5500
go <- go[!go$logo, ]
# Sample a subset of 1768 points at random, without replacement.
go_smp <- go[sample(nrow(go), 1768), ]

# Split into "F" and "M" groups. Each point's steps value is multiplied by
# (1 + noise), with noise drawn from a normal distribution (mean 0, sd 10);
# the point is labelled "F" when this noisy index falls below the median of
# steps and "M" otherwise.
# NOTE(review): this split was described as weighting the female group towards
# higher step counts, but the chance of `index < median` shrinks as steps
# grows, so "F" leans (weakly, given sd = 10) towards LOWER step counts.
# Confirm which direction is intended before relying on it.
go_smp$rnorm <- rnorm(nrow(go_smp), mean = 0, sd = 10)
go_smp$index <- go_smp$steps * (1 + go_smp$rnorm)
# Explicit levels keep both groups in the factor even if one ends up empty.
go_smp$group <- factor(
  ifelse(go_smp$index < median(go_smp$steps), "F", "M"),
  levels = c("F", "M")
)
# Plot BMI against steps as small filled dots, "M" points in blue and all
# other points in red. The plot is only drawn in an interactive session.
if (interactive()) {
  with(
    go_smp,
    plot(
      steps, bmi,
      xlim = c(0, 15000),
      pch = 16, cex = 0.5,
      col = ifelse(group == "M", "blue", "red"),
      xlab = "Steps", ylab = "BMI"
    )
  )
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment