Skip to content

Instantly share code, notes, and snippets.

@MichaelChirico
Last active September 14, 2023 06:30
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save MichaelChirico/e7d3a7eb22b6a43916fda85da68be57c to your computer and use it in GitHub Desktop.
Save MichaelChirico/e7d3a7eb22b6a43916fda85da68be57c to your computer and use it in GitHub Desktop.
EDA on my New York Times Crossword data
library(data.table)
# output from https://github.com/mattdodge/nyt-crossword-stats
# python fetch_puzzle_stats.py -u "..." -p "..." -s 1993-11-21
# Load the per-puzzle stats and derive the columns every plot below relies on.
DT = fread("data.csv")
# Sunday is a bit of its own thing, so put it last
DT[, day := factor(day, levels = c("Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun"))]
# ISO 8601 year-week label (%G = week-based year, %V = week number), used later
# to line the weekdays up against each other.
# NOTE(review): assumes fread() parsed `date` as a Date-like column -- confirm.
DT[, iso_week := format(date, "%G-%V")]
# Rolling summaries over the last 26 puzzles of the same weekday.
# NOTE(review): the windows follow row order, so this presumably relies on
# data.csv already being sorted by date -- confirm.
DT[, by = day, {
  # Don't get a min like '0' which is a clear outlier/bug. My min time is ~2 minutes.
  lower_bound = 0.5 * quantile(elapsed_seconds, 0.05)
  # Both columns are added by reference in one functional `:=` call; its
  # arguments must be comma-separated.
  `:=`(
    rolling_median = frollapply(elapsed_seconds, 26L, median),
    # Only times above the per-day lower bound are eligible to be the minimum
    rolling_min = frollapply(elapsed_seconds, 26L, \(x) min(x[x > lower_bound]))
  )
}]
# TIME SERIES OF SOLVE TIMES BY WEEKDAY
# One panel per weekday on a 3x3 grid. The day order / grid centering could be
# tuned further, but this gets close without much fuss.
par(mfrow = c(3, 3))
for (DAY in levels(DT$day)) {
  DT[day == DAY][
    # global lower bound (2 minutes), but day-specific upper bound
    elapsed_seconds > 120 & elapsed_seconds < 1.5 * quantile(elapsed_seconds, 0.95),
    {
      # Raw times plus both rolling summaries, converted from seconds to minutes
      series_minutes <- cbind(elapsed_seconds, rolling_median, rolling_min) / 60
      matplot(
        x = date,
        y = series_minutes,
        type = "l",
        lty = 1L,
        lwd = c(0.5, 2, 2),
        col = c(1, 2, 4),
        las = 1L,
        main = DAY,
        xlab = "",
        ylab = "Solve Time (m)"
      )
    }
  ]
}
# DISTRIBUTION OF SOLVE TIMES BY WEEKDAY
# Notched boxplot of solve time (minutes) per weekday. The lower hinge, median
# and upper hinge of each box are printed to the console and drawn on the plot
# as MM:SS labels.
# The device is split into a 1x2 grid; the boxplot fills the first panel.
par(mfrow = 1:2)
DT[, {
  x = boxplot(
    elapsed_seconds/60 ~ day,
    notch = TRUE, ylim = c(2, 30), las = 1L, col = 2:8,
    xlab = "", ylab = "Solve Time (m)", main = "Distribution by Day"
  )
  # Rows 2:4 of the boxplot stats are lower hinge / median / upper hinge,
  # with one column per weekday.
  bounds <- x$stats[2:4, , drop = FALSE]
  # Round to whole seconds *before* splitting into minutes and seconds;
  # rounding only the fractional minute can produce labels like "05:60".
  bounds_seconds <- round(60 * bounds)
  bounds_time <- sprintf("%02d:%02d", bounds_seconds %/% 60, bounds_seconds %% 60)
  writeLines(sprintf(
    "%s: %s",
    levels(day),
    apply(matrix(bounds_time, nrow = 3L), 2L, paste, collapse = " / ")
  ))
  # Column-major order of `bounds` gives three labels per box, so repeat each
  # box's x position three times.
  text(rep(seq_len(ncol(bounds)), each = 3L), bounds, bounds_time)
}]
# STACKED ROLLING MEDIANS
# All weekdays' rolling medians overlaid on one plot, aligned by ISO week.
DT[, {
  # Reshape to one row per ISO week and one column per weekday
  weekly_wide <- dcast(.SD, iso_week ~ day, value.var = "rolling_median")
  # x position of each week: the date of the first row found for that ISO week
  week_dates <- date[match(weekly_wide$iso_week, iso_week)]
  # Drop the key column and convert seconds to minutes
  median_minutes <- as.matrix(weekly_wide[, !"iso_week"]) / 60
  matplot(
    week_dates, median_minutes,
    type = "l", lty = 1L, lwd = 2L, col = 2:8, las = 1L,
    main = "Rolling Median by Day", xlab = "", ylab = "Solve Time (m)"
  )
}]
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment