---
title: "Marathon-Insights"
author: "frank-corrigan"
date: "3/1/2022"
output: html_document
---
```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = TRUE)
library(tidyverse)    # for data wrangling
library(ggplot2)      # for visualization
library(reshape2)     # for wide-to-long reshaping
library(broom)        # for extracting results from nls fits
library(kableExtra)   # for printing tables
library(choroplethr)  # for maps
library(readr)        # for reading data
library(usdata)       # for maps (state abbreviation helpers)
library(usmap)        # for maps
library(ggthemes)     # for maps (theme_map)
```
### Read Parsed Data
```{r read-data}
# read the pre-processed data (processed with python)
data <- read.csv("parsed_marathon_data_4.csv", stringsAsFactors = FALSE)
# remove NAs (small amount of funky data)
data <- data %>% drop_na()
# just a check - how much data?
dim(data)
```
### Clean & Create New Variables
```{r data-cleaning}
# convert variables to desired type
data$age <- as.numeric(data$age)
# remove white space from the ends of runners' names
data$name_clean <- tolower(trimws(data$name))
# filter data to desired range (between 2 hours and 12 hours)
data <- data %>% filter(time_total_minutes <= 720 & time_total_minutes >= 120)
# for this exercise, just looking at male finishers
data <- data %>% filter(gender == "M")
# create standalone marathon title (without the date)
get_title <- function(marathon_name){
  output <- trimws(str_split(marathon_name, fixed("("))[[1]][1])
  return(output)
}
data$marathon_title <- unlist(lapply(data$marathon_name, get_title))
# manually fix typos (column 8 holds the raw time, column 9 the total minutes)
data[data$marathon_name == "Hops Marathon by the Bay (FL - 12/10/00)" &
       data$name == "Scott Burrows ",8] = "2:32:43"
data[data$marathon_name == "Hops Marathon by the Bay (FL - 12/10/00)" &
       data$name == "Scott Burrows ",9] = 153
head(data) %>%
  kbl() %>%
  kable_styling()
```
### Lifespan of Marathons
```{r repeat-marathons}
# how many marathons in dataset?
num_marathons <- length(unique(data$marathon_title))
print(paste("Num of marathons (i.e. NYC would be 1 marathon here):", num_marathons))
# how many times each marathon was hosted
marathon_freq <- data %>%
  select(marathon_year, marathon_title) %>%
  unique() %>%
  group_by(marathon_title) %>%
  summarise(count = n())
# proportion of marathons that occurred x times
prop.table(table(marathon_freq$count))
# weighted average of bucketed hosting counts (weights hard-coded from the table above)
print(paste("Approximate lifespan of a marathon:",
            (0.46*1) + (0.24*5) + (0.16*10) + (0.09*15) + (0.04*20)))
```
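The weights in that last line are typed in by hand from the proportion table. A minimal sketch of the same idea, computed directly from the observed hosting-count distribution (assuming `marathon_freq` from the chunk above; not run by default):
```{r lifespan-direct, eval=FALSE}
# expected number of years a marathon appears in the dataset, straight from the data
props <- prop.table(table(marathon_freq$count))
expected_lifespan <- sum(as.numeric(names(props)) * props)
print(paste("Expected lifespan of a marathon (years hosted):", round(expected_lifespan, 2)))
```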
### Create conversion functions
```{r helper-functions}
# hours:minutes:seconds to seconds
hmsts <- function(time){
  pieces <- str_split(time, ":")
  return(as.numeric(pieces[[1]][1]) * 60 * 60 +
           as.numeric(pieces[[1]][2]) * 60 +
           as.numeric(pieces[[1]][3]))
}
# create a seconds column
data$seconds <- unlist(lapply(data$time, hmsts))
# seconds to hours:minutes:seconds (zero-padded minutes and seconds)
sthms <- function(seconds){
  hours <- floor(seconds / 3600)
  minutes <- floor((seconds - hours * 3600) / 60)
  secs <- floor(seconds - hours * 3600 - minutes * 60)
  return(sprintf("%d:%02d:%02d", hours, minutes, secs))
}
```
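A quick round-trip sanity check on the two helpers (the times below are illustrative values, not from the data; not run by default):
```{r helper-check, eval=FALSE}
hmsts("2:20:00")         # 8400 seconds
sthms(hmsts("2:20:00"))  # "2:20:00"
sthms(hmsts("3:05:07"))  # "3:05:07" (zero-padding preserved)
```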
### Race Counts & Average Times
```{r race-growth, echo=FALSE}
agg_race_stats_df <- data %>%
  group_by(marathon_year) %>%
  summarise(race_count = n_distinct(marathon_name),
            avg_time = median(time_total_minutes)) %>%
  mutate(race_count_pct = (race_count - lag(race_count)) / lag(race_count),
         avg_time_pct = (avg_time - lag(avg_time)) / lag(avg_time)) %>%
  mutate(growth_color = ifelse(race_count_pct > 0, "Growth", "Shrink"))
p <- ggplot(agg_race_stats_df, aes(x=marathon_year, y=race_count, fill=growth_color)) +
  geom_bar(stat = "identity") +
  scale_fill_manual(values=c("#1a9641", "#d7191c")) +
  theme_minimal() +
  theme(legend.position="top", legend.title=element_blank()) +
  labs(x="Year", y="Race Count", title="Marathon Growth",
       subtitle="Race count increased between 2000-2015, but shrank significantly during the pandemic.")
p
```
### General Marathon Stats
```{r overall-stats}
state_list <- c( 'AK', 'AL', 'AR', 'AZ', 'CA', 'CO', 'CT', 'DC', 'DE', 'FL', 'GA',
                 'HI', 'IA', 'ID', 'IL', 'IN', 'KS', 'KY', 'LA', 'MA', 'MD', 'ME',
                 'MI', 'MN', 'MO', 'MS', 'MT', 'NC', 'ND', 'NE', 'NH', 'NJ', 'NM',
                 'NV', 'NY', 'OH', 'OK', 'OR', 'PA', 'RI', 'SC', 'SD', 'TN', 'TX',
                 'UT', 'VA', 'VT', 'WA', 'WI', 'WV', 'WY')
print(paste("Dims of the data:", paste(dim(data), collapse = " x ")))
print(paste("Median age of finishers:", median(data$age)))
print(paste("Median finish time:", sthms(median(data$seconds))))
print(paste("Proportion of results from US races:", round(nrow(data[data$marathon_state %in% state_list,]) / nrow(data), 3)))
```
### How Many Finishers per Race?
```{r finishers-per-race}
finisher_tbl <- data %>%
  group_by(marathon_year, marathon_name) %>%
  summarise(total_finishers = mean(num_finishers)) %>%
  group_by(marathon_year) %>%
  summarise(avg_finishers = round(median(total_finishers)))
color_scale <- c("#ff0000", "#ffc100", "#ffff00", "#d6ff00", "#63ff00")
head(finisher_tbl) %>%
  kbl(caption = "Median # of Finishers per Race", align = "c") %>%
  kable_classic(full_width = F, html_font = "Cambria", position = "center") %>%
  column_spec(2, color = "black", background = spec_color2(finisher_tbl$avg_finishers, palette = color_scale))
```
### Estimate Time by Age
```{r age}
# find median time by age
by_age_avg_time <- data %>%
  filter(age >= 20 & age < 90) %>%
  group_by(age) %>%
  summarise(avg_time = median(seconds),
            min_time = min(seconds),
            max_time = max(seconds)) %>%
  mutate(superfast = hmsts("2:20:00"))
# find best fit function to predict time based on age
# https://douglas-watson.github.io/post/2018-09_exponential_curve_fitting/
fit <- nls(data = data,
           formula = seconds ~ a + b * (age^c),
           start = c(a = 12000, b = 0.01, c = 2),
           control = list(maxiter = 12000))
fit_data <- augment(fit)
# create breaks to display h:m:s instead of raw seconds on the y-axis
min_min <- min(data$seconds)
max_min <- hmsts("8:00:00") # max(data$seconds)
the_seq <- seq(min_min, max_min, 1200)
y_labels <- unlist(lapply(the_seq, sthms))
# plot data
p <- ggplot(data %>% filter(age >= 20 & age < 90 & seconds < hmsts("8:00:00"))) +
  geom_jitter(aes(x=age, y=seconds, color="Result"),
              alpha=0.05) +
  scale_x_continuous(limits = c(20, 84),
                     breaks = seq(20, 84, 2),
                     expand = expansion(mult = c(0.01, 0.01))) +
  scale_y_continuous(labels=y_labels,
                     breaks=the_seq,
                     expand = expansion(mult = c(0.01, 0.01))) +
  geom_step(data = by_age_avg_time,
            aes(x=age, y=avg_time, group=1, color="Average"),
            lwd=2.0, alpha=0.8, inherit.aes = FALSE) +
  geom_line(data = fit_data,
            aes(x=age, y=.fitted, group=1, color="Fitted"),
            lwd=2.0, show.legend=FALSE, inherit.aes = FALSE) +
  geom_line(data = by_age_avg_time,
            aes(x=age, y=superfast, group=1, color="Fast"),
            lwd=2.0, show.legend=FALSE, inherit.aes = FALSE) +
  theme_minimal() +
  scale_color_manual("",
                     values=c("Result" = "black",
                              "Average" = "#5f987a",
                              "Fitted" = "deeppink",
                              "Fast" = "steelblue"),
                     labels = c("Result" = "Result",
                                "Average" = "Average (Median by Age)",
                                "Fitted" = "Fitted",
                                "Fast" = "Super Fast")) +
  theme(legend.position = "top") +
  guides(colour = guide_legend(nrow = 1)) +
  labs(x="Age",
       y="Finish Time (h:m:s)",
       title="Age vs. Speed",
       subtitle="As we get older, we slow down... but the deterioration is gradual.")
p
```
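As a follow-up, the fitted `nls` object can be used to read off an expected finish time at any age; a minimal sketch (the ages below are purely illustrative; not run by default):
```{r age-prediction, eval=FALSE}
# predicted finish time at a few hypothetical ages, using the power-law fit above
new_ages <- data.frame(age = c(38, 50, 65))
pred_seconds <- predict(fit, newdata = new_ages)
data.frame(age = new_ages$age, predicted_time = unlist(lapply(pred_seconds, sthms)))
```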
### Find Your Cluster: Time at Age 38+
```{r time-at-38}
# find individuals with multiple race results in dataset
multiple_marathons <- data %>%
  group_by(name_clean) %>%
  summarise(count = n()) %>%
  filter(count > 1)
# find avg time of individuals with race history
multiple_times <- data %>%
  filter(name_clean %in% multiple_marathons$name_clean) %>%
  group_by(name_clean) %>%
  summarise(multiples_avg_time = mean(time_total_minutes))
# join avg times for racers with history to base data
data <- left_join(data, multiple_times, by="name_clean")
# threshold: if the average over a runner's race history is 162 minutes (2:42) or less, faster; otherwise slower.
# runners with no race history (NA after the join) are labeled Single_Race.
data$history_cat <- ifelse(is.na(data$multiples_avg_time), "Single_Race",
                           ifelse(data$multiples_avg_time <= 162, "History_Faster", "History_Slower"))
# filter to just fast-history racers when they are 38 or older
focus_group_df <- data %>% filter(history_cat == "History_Faster" & age >= 38)
# use our subset to estimate prob of running <= 2:20 at 38+
d <- ecdf(focus_group_df$seconds)
print(paste("Frank, your 'probability' of running faster than 2:20 after the age of 38 is approximately", round(d(hmsts("2:20:00")) * 100, 1), "percent. Good luck!"))
# create histogram to support
min_min <- min(focus_group_df$seconds)
max_min <- max(focus_group_df$seconds)
the_seq <- seq(min_min, max_min, 10*60)
x_labels <- unlist(lapply(the_seq, sthms))
ggplot(focus_group_df, aes(x=seconds)) +
  geom_histogram(fill="steelblue") +
  scale_x_continuous(labels=x_labels,
                     breaks=the_seq) +
  theme_minimal() +
  geom_vline(xintercept = hmsts("2:20:00"), linetype="dotted") +
  labs(x="Finish Time (h:m:s)",
       y="Count",
       title="Age 38+ Finish Times of Historically Fast Individuals")
```
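The same empirical CDF can be queried at other target times; a short sketch (these thresholds are hypothetical, chosen just for illustration; not run by default):
```{r other-thresholds, eval=FALSE}
# probability of the focus group finishing under a few other (illustrative) thresholds
targets <- c("2:30:00", "2:45:00", "3:00:00")
data.frame(target = targets,
           prob_pct = round(d(unlist(lapply(targets, hmsts))) * 100, 1))
```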
### When?
```{r by-month}
# find mean time per month
time_by_month <- data %>%
  group_by(marathon_month) %>%
  summarise(avg_time = mean(seconds))
# create month name as an ordered factor (assumes marathon_month is numeric 1-12)
time_by_month$month_name <- month.abb[time_by_month$marathon_month]
time_by_month$month_name <- factor(time_by_month$month_name, levels = month.abb)
# create variables to identify relatively slower or faster months (converted to minutes)
monthly_time <- mean(time_by_month$avg_time)
time_by_month$variance <- (monthly_time - time_by_month$avg_time) / 60
time_by_month$color <- ifelse(time_by_month$variance < 0, "Slower", "Faster")
# plot it!
p <- ggplot(time_by_month, aes(x=month_name, y=variance, fill=color)) +
  geom_bar(stat = "identity") +
  scale_fill_manual(values=c("#1a9641", "#d7191c")) +
  theme_minimal() +
  theme(legend.position="top", legend.title=element_blank()) +
  labs(x="Month", y="Minutes Relative to the All-Month Average")
p
```
### Now the Maps
```{r by-state}
# helpful:
# https://rkabacoff.github.io/datavis/GeoMaps.html
us_states <- map_data("state")
state_df <- data %>%
  group_by(marathon_state) %>%
  summarise(avg_time = mean(time_total_minutes))
state_df <- state_df %>% filter(marathon_state %in% state_list)
state_df$state_name <- abbr2state(state_df$marathon_state)
state_df$region <- tolower(state_df$state_name)
state_df <- left_join(state_df, us_states, by="region")
# create map in stages
# step 1: feed your data to ggplot
p0 <- ggplot(data = state_df,
             mapping = aes(x = long, y = lat,
                           group = group,
                           fill = avg_time))
# step 2: define your map attributes
p1 <- p0 + geom_polygon(color = "gray90", size = 0.05) +
  coord_map(projection = "albers", lat0 = 39, lat1 = 45)
# step 3: define your colors
p2 <- p1 + scale_fill_viridis_c(option = "A", direction = 1)
# step 4: labels and make it look sharp
p3 <- p2 + theme_map() +
  theme(legend.position = "top",
        strip.background = element_blank()) +
  labs(fill = "Average Minutes",
       title = "State Marathon Avg. Times (Top 100 Finishers)",
       subtitle = "Years included = 2000, 2005, 2010, 2015, 2021")
p3
```
```{r map-facet}
library(maps)
us_states <- map_data("state")
state_facet <- data %>%
  group_by(marathon_month, marathon_state) %>%
  summarise(avg_time = mean(time_total_minutes))
state_facet <- state_facet %>% filter(marathon_state %in% state_list)
state_facet$state_name <- abbr2state(state_facet$marathon_state)
state_facet$region <- tolower(state_facet$state_name)
state_facet <- left_join(state_facet, us_states, by="region")
p0 <- ggplot(data = state_facet,
             mapping = aes(x = long, y = lat,
                           group = group,
                           fill = avg_time))
p1 <- p0 + geom_polygon(color = "gray90", size = 0.05) +
  coord_map(projection = "albers", lat0 = 39, lat1 = 45)
p2 <- p1 + scale_fill_viridis_c(option = "A", direction = 1)
p3 <- p2 + theme_map() + facet_wrap(~ marathon_month, ncol = 3) +
  theme(legend.position = "top",
        strip.background = element_blank()) +
  labs(fill = "Average Minutes",
       title = "State & Month Marathon Avg. Times (Top 100 Finishers)",
       subtitle = "Years included = 2000, 2005, 2010, 2015, 2021")
p3
```
### Specific Marathons to Run
```{r gems}
gems_df <- data %>%
  filter(marathon_year >= 2015 & marathon_state != " -") %>%
  group_by(marathon_title, marathon_state, marathon_month) %>%
  summarise(avg_finishers = round(mean(num_finishers)),
            avg_seconds = median(seconds),
            avg_time = sthms(median(seconds)),
            min_seconds = min(seconds),
            min_time = sthms(min(seconds)),
            ten_pct = quantile(seconds, c(0.10)),
            ten_pct_time = sthms(quantile(seconds, c(0.10))))
gems_df %>% filter(avg_finishers < 2500 &
                     ten_pct <= hmsts("3:00:00") &
                     min_seconds <= hmsts("2:30:00") &
                     marathon_state %in% c("NJ", "NY", "CT", "RI", "VT", "NH", "MA")) %>%
  arrange(ten_pct) %>%
  select(marathon_title, marathon_state, marathon_month,
         avg_finishers, avg_time, min_time, ten_pct_time) %>%
  head(20) %>%
  kbl(caption = "Possible Target Races", align=c("l",rep("c",6))) %>%
  kable_classic(full_width = F, html_font = "Cambria", position = "center")
```
# END
Below is simply a lot of experimenting... code and ideas that didn't make the cut for the blog post.
```{r bayes, echo=FALSE, eval = FALSE}
# bootstrap-style resampling of model parameters
age_param <- c()
month_param <- c()
interaction_param <- c()
for (i in 1:100) {
  s <- data %>% filter(age >= 21 & age <= 90) %>% slice_sample(n = 5000, replace = TRUE)
  m <- lm(data=s, time_total_minutes ~ I(age^2) * marathon_month)
  age_param <- c(age_param, m$coefficients[2])
  month_param <- c(month_param, m$coefficients[3])
  interaction_param <- c(interaction_param, m$coefficients[4])
}
age_param <- c()
for (i in 1:1000) {
  s <- data %>% filter(age >= 21 & age <= 90) %>% slice_sample(n = 5000, replace = TRUE)
  fit <- nls(data = s,
             formula = time_total_minutes ~ a + b * (age^c),
             start=c(a = 200, b = 0.01, c=2),
             control = list(maxiter = 200))
  age_param <- c(age_param, coef(fit)[3])
}
hist(age_param)
```
Mysteries are good - they keep you thinking. However, we also need some helpful information. Running NYC is exhilarating. Coming down the 59th Street Bridge and hearing the roar of the crowd on 1st Avenue is quite a surreal experience. But the race logistics of NYC are nightmarish. Waiting hours at the start line? If I can avoid that, I will. Where are the races that are fast - fast courses - with relatively few participants? Let's look at races with fewer than 500 finishers, sorted by finish times.
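A minimal sketch of that query, using the columns built earlier (`num_finishers`, `seconds`, `marathon_title`); the 500-finisher cutoff comes from the sentence above and the sort uses the median finish time per race (not run by default):
```{r small-fast-races, echo=FALSE, eval=FALSE}
# races with fewer than 500 finishers, ranked by median finish time of the field in the data
data %>%
  filter(num_finishers < 500) %>%
  group_by(marathon_title, marathon_state) %>%
  summarise(avg_finishers = round(mean(num_finishers)),
            median_seconds = median(seconds),
            median_time = sthms(median(seconds))) %>%
  arrange(median_seconds) %>%
  head(20)
```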
```{r participant-average, echo=FALSE, eval = FALSE}
base <- distinct(data %>% select(marathon_name, num_finishers))
m_year <- distinct(data %>% select(marathon_name, marathon_year))
participants <- left_join(base, m_year, by="marathon_name")
participants %>%
  group_by(marathon_year) %>%
  summarise(mean_finishers = mean(num_finishers),
            median_finishers = median(num_finishers))
```
```{r time-trends, echo=FALSE, eval = FALSE}
avg_time <- mean(data$time_total_minutes)
ggplot(data, aes(x=marathon_year, y=time_total_minutes, group=marathon_year)) +
  geom_boxplot(outlier.shape = NA) +
  scale_y_continuous(limits = c(100, 400)) +
  geom_hline(yintercept = avg_time, color="blue") +
  scale_fill_manual(values=c("#1a9641", "#d7191c")) +
  theme_minimal() +
  theme(legend.position="top", legend.title=element_blank()) +
  labs(x="Year", y="Minutes", title="Marathon Times Evolution",
       subtitle="Post pandemic, average marathon times have plateaued but the distribution has widened.")
```
```{r race-size-analysis-1, echo=FALSE, eval = FALSE}
data$race_size <- cut(data$num_finishers, 4, labels = c("Very Small", "Small", "Large", "Very Large"))
time_by_race_size_df <- data %>% group_by(race_size) %>% summarize(avg_time = mean(time_total_minutes))
ggplot(time_by_race_size_df, aes(x=race_size, y=avg_time, fill=avg_time)) +
  geom_bar(stat="identity") +
  # geom_hline(yintercept = avg_time, color="blue") +
  # scale_fill_manual(values=c("#1a9641", "#d7191c")) +
  theme_minimal() +
  theme(legend.position="top") +
  labs(x="Relative Race Size", y="Average Minutes", title="Larger Race, Faster Times", subtitle="Generally, bigger races attract faster runners.")
```
```{r num-finishers-analysis-2, echo=FALSE, eval = FALSE}
# avg time by race size
time_by_finishers_df <- data %>%
  group_by(num_finishers) %>%
  summarise(avg_time = mean(time_total_minutes))
# what is the function? frank manually tries to get close...
# a <- 1
# b <- 2
# y <- 160 + 200*(1-0.002)^time_by_finishers_df$num_finishers
# time_by_finishers_df$pred_time <- y
# ggplot(time_by_finishers_df, aes(x=num_finishers, y=avg_time)) +
#   geom_point() +
#   geom_line(data=time_by_finishers_df, aes(x=num_finishers, y=pred_time), color="red")
fit <- nls(data = time_by_finishers_df,
           formula = avg_time ~ a + b * (1 - c) ^ num_finishers,
           start = c(a = 160, b = 200, c = 0.002),
           control = list(maxiter = 200))
fit_data <- augment(fit)
ggplot(data = fit_data, aes(x=num_finishers, y=avg_time)) +
  geom_point() +
  geom_line(data = fit_data, aes(x=num_finishers, y=.fitted), color="red") +
  theme_minimal() +
  theme(legend.position="top") +
  labs(x="Race Size (# of Finishers)", y="Average Minutes", title="Larger Race, Faster Times", subtitle="Generally, bigger races attract faster runners.")
```
```{r num-finishers-analysis-3, echo=FALSE, eval = FALSE}
library(gridExtra) # for grid.arrange
# avg time by race size (smaller races)
time_by_finishers_tiny <- data %>%
  filter(num_finishers <= 2000) %>%
  group_by(num_finishers) %>%
  summarise(avg_time = mean(time_total_minutes))
fit_tiny <- nls(avg_time ~ SSasymp(num_finishers, yf, y0, log_alpha), data = time_by_finishers_tiny)
fit_tiny_data <- augment(fit_tiny)
p1 <- ggplot(data = fit_tiny_data, aes(x=num_finishers, y=avg_time)) +
  geom_point() +
  geom_line(data = fit_tiny_data, aes(x=num_finishers, y=.fitted), color="red") +
  scale_y_continuous(limits=c(100, 450)) +
  theme_minimal() +
  theme(legend.position="top") +
  labs(x="# of Finishers", y="Average Minutes", title="Smaller Races", subtitle="Generally, bigger races attract faster runners.")
# avg time by race size (larger races)
time_by_finishers_not_tiny <- data %>%
  filter(num_finishers > 2000) %>%
  group_by(num_finishers) %>%
  summarise(avg_time = mean(time_total_minutes))
# https://douglas-watson.github.io/post/2018-09_exponential_curve_fitting/
fit_not_tiny <- nls(avg_time ~ SSasymp(num_finishers, yf, y0, log_alpha), data = time_by_finishers_not_tiny)
fit_not_tiny_data <- augment(fit_not_tiny)
p2 <- ggplot(data = fit_not_tiny_data, aes(x=num_finishers, y=avg_time)) +
  geom_point() +
  geom_line(data = fit_not_tiny_data, aes(x=num_finishers, y=.fitted), color="red") +
  scale_y_continuous(limits=c(100, 450)) +
  theme_minimal() +
  theme(legend.position="top") +
  labs(x="# of Finishers", y="Average Minutes", title="Larger Races", subtitle="Generally, bigger races attract faster runners.")
grid.arrange(p1, p2, ncol = 2)
```
```{r small-races-analysis, echo=FALSE, eval = FALSE}
data$small_race <- ifelse(data$num_finishers < 100, "Small", ifelse(data$num_finishers < 1000, "Medium", "Large"))
data %>%
  group_by(marathon_year, small_race) %>%
  summarise(count = n()) %>%
  group_by(marathon_year) %>%
  mutate(pct = count / sum(count))
```
```{r race-size-times, echo=FALSE, eval = FALSE}
race_size_times_df <- data %>%
  group_by(small_race, marathon_year) %>%
  summarise(avg_time = mean(time_total_minutes))
ggplot(race_size_times_df, aes(x=marathon_year,
                               y=avg_time,
                               group=factor(small_race),
                               color=factor(small_race))) +
  geom_line() +
  # scale_y_continuous(limits=c(100, 450)) +
  theme_minimal() +
  theme(legend.position="top") +
  labs(x="Year", y="Average Minutes", color="Race Size",
       title="Average Times by Race Size over Time",
       subtitle="Generally, bigger races attract faster runners.")
```
```{r age-analysis, fig.width=16, echo=FALSE, eval = FALSE}
ggplot(data %>% filter(age >= 18 & age <= 80), aes(x=age, y=time_total_minutes, group=age)) + geom_boxplot()
```
There's a particular break here between Very Small and Small. Let's find it.
You don't have to know much about statistics to see that this is a crappy fit. Where does that line sort of bottom out? That's where I want to separate Very Small from the rest; a rough sketch for locating that point follows the fit below.
```{r, echo=FALSE, eval = FALSE}
df <- time_by_finishers_df
fitted <- df %>%
  nest() %>%
  mutate(
    fit = map(data, ~nls(avg_time ~ SSasymp(num_finishers, yf, y0, log_alpha), data = .)),
    tidied = map(fit, tidy),
    augmented = map(fit, augment)
  )
augmented <- fitted %>%
  unnest(augmented)
augmented
```
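One way to locate that "bottoming out" point from the augmented fit above; a rough sketch only, and the 1-minute tolerance is an arbitrary assumption (not run by default):
```{r find-break, echo=FALSE, eval=FALSE}
# approximate the break as the smallest race size whose fitted time is
# within 1 minute of the curve's lowest (asymptotic) fitted value
asymptote <- min(augmented$.fitted)
augmented %>%
  filter(.fitted <= asymptote + 1) %>%
  summarise(break_at = min(num_finishers))
```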
The break happens around 1,000 finishers. This means there is a drastic difference between races with fewer than 1,000 runners and those with more than 1,000. Could we fit a better curve if we just looked at races with more than 1,000 participants?
```{r num-finishers-analysis-large, echo=FALSE, eval = FALSE}
# avg time by race size, restricted to races with at least 1,000 finishers
time_by_finishers_df <- data %>%
  filter(num_finishers >= 1000) %>%
  group_by(num_finishers) %>%
  summarise(avg_time = mean(time_total_minutes))
# https://douglas-watson.github.io/post/2018-09_exponential_curve_fitting/
# fit exponential curve
fit <- nls(avg_time ~ SSasymp(num_finishers, yf, y0, log_alpha), data = time_by_finishers_df)
# plot curve
qplot(num_finishers, avg_time, data = augment(fit)) + geom_line(aes(y = .fitted), color = "blue")
```
Definitely a better fit. What does it mean? Overall, the insight here is that smaller races have slower average times. One hypothesis is...
```{r small-races-analysis-2, echo=FALSE, eval = FALSE}
data$small_race <- ifelse(data$num_finishers < 100, "Small", ifelse(data$num_finishers < 1000, "Medium", "Large"))
data %>%
  group_by(marathon_year, small_race) %>%
  summarise(count = n()) %>%
  group_by(marathon_year) %>%
  mutate(pct = count / sum(count))
```
```{r race-size-times-2, echo=FALSE, eval = FALSE}
race_size_times_df <- data %>%
  group_by(small_race, marathon_year) %>%
  summarise(avg_time = mean(time_total_minutes))
ggplot(race_size_times_df, aes(x=marathon_year, y=avg_time, group=factor(small_race), color=factor(small_race))) + geom_line()
```
This first split tells us a few things about the upward trend and plateau in average marathon times. A) The plateau is driven by a growing share of smaller races. B) The top-100 performances in big races in 2021 saw big gains, consistent with the headlines and records broken that year. C) Since bigger races typically have faster times, there's a hint that as big races come back online in 2022, average race times will stay flat or perhaps even improve.
```{r repeat-races, echo=FALSE, eval = FALSE}
repeat_races_df <- data %>% group_by(marathon_title) %>% summarise(count = n_distinct(marathon_year))
# hist(repeat_races_df$count)
repeat_races <- repeat_races_df %>% filter(count >= 4)
repeat_viz <- data %>% filter(marathon_title %in% repeat_races$marathon_title) %>%
  group_by(small_race, marathon_year) %>%
  summarise(race_count = n_distinct(marathon_title), avg_time = mean(time_total_minutes))
ggplot(repeat_viz, aes(x=marathon_year, y=avg_time, group=factor(small_race), color=factor(small_race))) + geom_line()
```
## Marathon Location
```{r by-age-scatter-and-fit, echo=FALSE, eval = FALSE}
by_age <- data %>% filter(age >= 21 & age < 90) %>% group_by(age) %>% summarise(avg_time = mean(time_total_minutes))
ggplot(by_age, aes(x=age, y=avg_time)) + geom_point()
# https://douglas-watson.github.io/post/2018-09_exponential_curve_fitting/
# fit exponential curve
fit <- nls(avg_time ~ SSasymp(age, yf, y0, log_alpha), data = by_age)
# plot curve
qplot(age, avg_time, data = augment(fit)) + geom_line(aes(y = .fitted), color = "blue")
model <- function(data){
  nls(avg_time ~ a + b * (age^c), data = data, start = c(a = 205, b = 0.01, c = 2))
}
hold <- by_age %>% group_by(age) %>%
  nest() %>%
  mutate(mod = map(.x=data, model))
fit <- nls(data = by_age, formula = avg_time ~ a + b * (age^c), start=c(a = 205, b = 0.01, c=2))
fit_data <- augment(fit)
qplot(age, avg_time, data = augment(fit)) + geom_line(aes(y = .fitted), color = "blue") +
  theme_minimal(base_family="Roboto")
# start <- min(by_age$avg_time)
# my_range <- 1:70
# y <- start+5 + 0.01 * my_range ^ 2.40
# y <- start+5 + 0.005 * my_range ^ 2.58
# y <- 214.485465023 + 0.000003259 * my_range ^ 4.104293201
#
# all_ages <- 21:90
# base_data <- data.frame(age = all_ages)
# test <- base_data %>% left_join(by_age, by="age")
# test$fit <- y
#
# ggplot(test, aes(x=age, y=avg_time)) + geom_point() +
#   geom_line(data=test, aes(x=age, y=fit), color="red")
```
```{r sample-data, echo=FALSE, eval = FALSE}
df <- data.frame("index" = numeric(), "age" = numeric(), "time_total_minutes" = numeric(),
                 ".fitted" = numeric(), ".resid" = numeric())
for (i in 1:50) {
  print(i)
  s <- data %>% filter(age >= 21 & age <= 90) %>% slice_sample(n = 1000, replace = TRUE)
  start_value <- (data %>% filter(age < 20) %>% summarize(v = mean(time_total_minutes)))$v
  tryCatch(
    {fit <- nls(data = s,
                formula = time_total_minutes ~ a + b * (age^c),
                start = c(a = start_value, b = 0.01, c = 2),
                control = list(maxiter = 200))
    },
    error = function(e) {
      print(paste("Oops! --> Error in Loop ", i, sep = ""))
    }
  )
  fit_data <- augment(fit)
  fit_data$index <- i
  df <- rbind(df, fit_data)
}
ggplot(data = df, aes(x=age, y=time_total_minutes)) + geom_point() +
  geom_line(data = df, aes(x=age, y=.fitted, group = factor(index), color = factor(index))) +
  theme(legend.position = "none")
```
```{r joys-marathons, echo=FALSE, eval = FALSE}
joys_marathons <- c("REVEL Canyon City Marathon", "Big Cottonwood Marathon & Half", "REVEL Big Bear and Half Marathon")
```