Last active
September 14, 2023 06:30
-
-
Save MichaelChirico/e7d3a7eb22b6a43916fda85da68be57c to your computer and use it in GitHub Desktop.
EDA on my New York Times Crossword data
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
library(data.table) | |
# Input: CSV produced by https://github.com/mattdodge/nyt-crossword-stats, e.g.
#   python fetch_puzzle_stats.py -u "..." -p "..." -s 1993-11-21
DT <- fread("data.csv")

# Derived columns, both added by reference in a single call:
#   day      - weekday as a factor; Sunday is a bit of its own thing, so it
#              is placed last in the level order.
#   iso_week - ISO-8601 "year-week" label ("%G-%V") of the puzzle date.
DT[, `:=`(
  day = factor(
    day,
    levels = c("Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun")
  ),
  iso_week = format(date, "%G-%V")
)]
# Per-weekday rolling statistics over a window of 26 same-weekday puzzles,
# added to DT by reference as `rolling_median` and `rolling_min`.
# NOTE(review): the rolling window follows row order within each day, so this
# presumably expects DT to already be sorted by date -- confirm for data.csv.
DT[
  ,
  c("rolling_median", "rolling_min") := {
    # Don't get a min like '0' which is a clear outlier/bug. My min time is
    # ~2 minutes. Anything at or below half of the day's 5th percentile is
    # ignored when taking the rolling minimum.
    lower_bound <- 0.5 * quantile(elapsed_seconds, 0.05)
    list(
      frollapply(elapsed_seconds, 26L, median),
      frollapply(elapsed_seconds, 26L, \(x) min(x[x > lower_bound]))
    )
  },
  by = day
]
# TIME SERIES OF SOLVE TIMES BY WEEKDAY
# One panel per weekday on a 3x3 grid. The day order / centering of the grid
# could be tuned further, but this gets close without much fuss.
par(mfrow = c(3, 3))
for (DAY in levels(DT$day)) {
  DT[day == DAY][, {
    # Upper bound is day-specific (1.5x that day's 95th percentile); the lower
    # bound is global (solves must take longer than 120 seconds).
    cutoff <- quantile(elapsed_seconds, 0.95) * 1.5
    keep <- elapsed_seconds < cutoff & elapsed_seconds > 120
    # Raw times plus the two rolling series, converted from seconds to minutes.
    series <- cbind(elapsed_seconds, rolling_median, rolling_min)[
      keep, , drop = FALSE
    ] / 60
    matplot(
      date[keep], series,
      type = "l", lty = 1L, las = 1L, lwd = c(0.5, 2L, 2L), col = c(1, 2, 4),
      xlab = "", ylab = "Solve Time (m)", main = DAY
    )
  }]
}
# DISTRIBUTION OF SOLVE TIMES BY WEEKDAY
# Left panel of a 1x2 layout: notched boxplots of solve time (minutes) per
# weekday, annotated with the lower hinge / median / upper hinge as mm:ss.
# The same three values are also written to stdout, one line per weekday.
par(mfrow = 1:2)
DT[, {
  box <- boxplot(
    elapsed_seconds / 60 ~ day,
    notch = TRUE, ylim = c(2, 30), las = 1L, col = 2:8,
    xlab = "", ylab = "Solve Time (m)", main = "Distribution by Day"
  )
  # Rows 2:4 of `stats` are lower hinge, median, upper hinge; one column per
  # weekday.
  bounds <- box$stats[2:4, , drop = FALSE]
  # Round to whole seconds *before* splitting into minutes and seconds, so a
  # value such as 4.9999 minutes is shown as "05:00" rather than "04:60".
  bounds_seconds <- round(60 * bounds)
  bounds_time <- sprintf(
    "%02d:%02d", bounds_seconds %/% 60, bounds_seconds %% 60
  )
  writeLines(sprintf(
    "%s: %s",
    levels(day),
    apply(
      matrix(bounds_time, nrow = nrow(bounds)), 2L, paste, collapse = " / "
    )
  ))
  # `bounds` is stored column-major, so each box index is repeated once per
  # statistic to line the labels up with their box.
  text(
    rep(seq_len(ncol(bounds)), each = nrow(bounds)), bounds, bounds_time
  )
}]
# STACKED ROLLING MEDIANS
# All weekday rolling medians overlaid on one set of axes.
DT[, {
  # Reshape to one row per ISO week with one rolling-median column per weekday.
  wide <- dcast(.SD, iso_week ~ day, value.var = "rolling_median")
  # x-axis: date of the first row in DT that belongs to each ISO week.
  week_date <- date[match(wide$iso_week, iso_week)]
  # y-axis: every weekday column, converted from seconds to minutes.
  median_minutes <- as.matrix(wide[, !"iso_week"]) / 60
  matplot(
    week_date, median_minutes,
    lty = 1L, type = "l", lwd = 2L, col = 2:8, las = 1L,
    xlab = "", ylab = "Solve Time (m)", main = "Rolling Median by Day"
  )
}]
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment