---
title: "Marathon-Insights"
author: "frank-corrigan"
date: "3/1/2022"
output: html_document
---
```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = TRUE)
library(tidyverse)    # for data wrangling
library(ggplot2)      # for visualization
library(reshape2)     # for wide-to-long reshaping
library(broom)        # for extracting results from nls fits
library(kableExtra)   # for printing tables
library(choroplethr)  # for maps
library(readr)        # for reading data
library(usdata)       # for maps (state abbreviation helpers)
library(usmap)        # for maps
library(ggthemes)     # for maps (theme_map)
```
### Read Parsed Data
```{r read-data}
# read the pre-processed data (processed with python)
data <- read.csv("parsed_marathon_data_4.csv", stringsAsFactors = FALSE)
# remove NAs (small amount of funky data)
data <- data %>% drop_na()
# just a check - how much data?
dim(data)
```
### Clean & Create New Variables
```{r data-cleaning}
# convert variables to desired type
data$age <- as.numeric(data$age)
# remove white space from the ends of runners' names
data$name_clean <- tolower(trimws(data$name))
# filter data to desired range (between 2 hours and 12 hours)
data <- data %>% filter(time_total_minutes <= 720 & time_total_minutes >= 120)
# for this exercise, just looking at male finishers
data <- data %>% filter(gender == "M")
# create standalone marathon title (without the date)
get_title <- function(marathon_name){
  output <- trimws(str_split(marathon_name, fixed("("))[[1]][1])
  return(output)
}
data$marathon_title <- unlist(lapply(data$marathon_name, get_title))
# manually fix typos (column 8 holds the raw time, column 9 the total minutes)
data[data$marathon_name == "Hops Marathon by the Bay (FL - 12/10/00)" &
       data$name == "Scott Burrows ",8] = "2:32:43"
data[data$marathon_name == "Hops Marathon by the Bay (FL - 12/10/00)" &
       data$name == "Scott Burrows ",9] = 153
head(data) %>%
  kbl() %>%
  kable_styling()
```
### Lifespan of Marathons
```{r repeat-marathons}
# how many marathons in dataset?
num_marathons <- length(unique(data$marathon_title))
print(paste("Num of marathons (i.e. NYC would be 1 marathon here):", num_marathons))
# how many times each marathon was hosted
marathon_freq <- data %>%
  select(marathon_year, marathon_title) %>%
  unique() %>%
  group_by(marathon_title) %>%
  summarise(count = n())
# proportion of marathons that occurred x times
prop.table(table(marathon_freq$count))
# weighted average of bucketed hosting counts (weights hard-coded from the table above)
print(paste("Approximate lifespan of a marathon:",
            (0.46*1) + (0.24*5) + (0.16*10) + (0.09*15) + (0.04*20)))
```
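The weights in that last line are typed in by hand from the proportion table. A minimal sketch of the same idea, computed directly from the observed hosting-count distribution (assuming `marathon_freq` from the chunk above; not run by default):
```{r lifespan-direct, eval=FALSE}
# expected number of years a marathon appears in the dataset, straight from the data
props <- prop.table(table(marathon_freq$count))
expected_lifespan <- sum(as.numeric(names(props)) * props)
print(paste("Expected lifespan of a marathon (years hosted):", round(expected_lifespan, 2)))
```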
### Create conversion functions
```{r helper-functions}
# hours:minutes:seconds to seconds
hmsts <- function(time){
  pieces <- str_split(time, ":")
  return(as.numeric(pieces[[1]][1]) * 60 * 60 +
           as.numeric(pieces[[1]][2]) * 60 +
           as.numeric(pieces[[1]][3]))
}
# create a seconds column
data$seconds <- unlist(lapply(data$time, hmsts))
# seconds to hours:minutes:seconds (zero-padded minutes and seconds)
sthms <- function(seconds){
  hours <- floor(seconds / 3600)
  minutes <- floor((seconds - hours * 3600) / 60)
  secs <- floor(seconds - hours * 3600 - minutes * 60)
  return(sprintf("%d:%02d:%02d", hours, minutes, secs))
}
```
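A quick round-trip sanity check on the two helpers (the times below are illustrative values, not from the data; not run by default):
```{r helper-check, eval=FALSE}
hmsts("2:20:00")         # 8400 seconds
sthms(hmsts("2:20:00"))  # "2:20:00"
sthms(hmsts("3:05:07"))  # "3:05:07" (zero-padding preserved)
```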
### Race Counts & Average Times
```{r race-growth, echo=FALSE}
agg_race_stats_df <- data %>%
  group_by(marathon_year) %>%
  summarise(race_count = n_distinct(marathon_name),
            avg_time = median(time_total_minutes)) %>%
  mutate(race_count_pct = (race_count - lag(race_count)) / lag(race_count),
         avg_time_pct = (avg_time - lag(avg_time)) / lag(avg_time)) %>%
  mutate(growth_color = ifelse(race_count_pct > 0, "Growth", "Shrink"))
p <- ggplot(agg_race_stats_df, aes(x=marathon_year, y=race_count, fill=growth_color)) +
  geom_bar(stat = "identity") +
  scale_fill_manual(values=c("#1a9641", "#d7191c")) +
  theme_minimal() +
  theme(legend.position="top", legend.title=element_blank()) +
  labs(x="Year", y="Race Count", title="Marathon Growth",
       subtitle="Race count increased between 2000-2015, but shrank significantly during the pandemic.")
p
```
### General Marathon Stats
```{r overall-stats}
state_list <- c( 'AK', 'AL', 'AR', 'AZ', 'CA', 'CO', 'CT', 'DC', 'DE', 'FL', 'GA',
                 'HI', 'IA', 'ID', 'IL', 'IN', 'KS', 'KY', 'LA', 'MA', 'MD', 'ME',
                 'MI', 'MN', 'MO', 'MS', 'MT', 'NC', 'ND', 'NE', 'NH', 'NJ', 'NM',
                 'NV', 'NY', 'OH', 'OK', 'OR', 'PA', 'RI', 'SC', 'SD', 'TN', 'TX',
                 'UT', 'VA', 'VT', 'WA', 'WI', 'WV', 'WY')
print(paste("Dims of the data:", paste(dim(data), collapse = " x ")))
print(paste("Median age of finishers:", median(data$age)))
print(paste("Median finish time:", sthms(median(data$seconds))))
print(paste("Proportion of results from US races:", round(nrow(data[data$marathon_state %in% state_list,]) / nrow(data), 3)))
```
### How Many Finishers per Race?
```{r finishers-per-race}
finisher_tbl <- data %>%
  group_by(marathon_year, marathon_name) %>%
  summarise(total_finishers = mean(num_finishers)) %>%
  group_by(marathon_year) %>%
  summarise(avg_finishers = round(median(total_finishers)))
color_scale <- c("#ff0000", "#ffc100", "#ffff00", "#d6ff00", "#63ff00")
head(finisher_tbl) %>%
  kbl(caption = "Median # of Finishers per Race", align = "c") %>%
  kable_classic(full_width = F, html_font = "Cambria", position = "center") %>%
  column_spec(2, color = "black", background = spec_color2(finisher_tbl$avg_finishers, palette = color_scale))
```
### Estimate Time by Age
```{r age}
# find median time by age
by_age_avg_time <- data %>%
  filter(age >= 20 & age < 90) %>%
  group_by(age) %>%
  summarise(avg_time = median(seconds),
            min_time = min(seconds),
            max_time = max(seconds)) %>%
  mutate(superfast = hmsts("2:20:00"))
# find best fit function to predict time based on age
# https://douglas-watson.github.io/post/2018-09_exponential_curve_fitting/
fit <- nls(data = data,
           formula = seconds ~ a + b * (age^c),
           start = c(a = 12000, b = 0.01, c = 2),
           control = list(maxiter = 12000))
fit_data <- augment(fit)
# create breaks to display h:m:s instead of raw seconds on the y-axis
min_min <- min(data$seconds)
max_min <- hmsts("8:00:00") # max(data$seconds)
the_seq <- seq(min_min, max_min, 1200)
y_labels <- unlist(lapply(the_seq, sthms))
# plot data
p <- ggplot(data %>% filter(age >= 20 & age < 90 & seconds < hmsts("8:00:00"))) +
  geom_jitter(aes(x=age, y=seconds, color="Result"),
              alpha=0.05) +
  scale_x_continuous(limits = c(20, 84),
                     breaks = seq(20, 84, 2),
                     expand = expansion(mult = c(0.01, 0.01))) +
  scale_y_continuous(labels=y_labels,
                     breaks=the_seq,
                     expand = expansion(mult = c(0.01, 0.01))) +
  geom_step(data = by_age_avg_time,
            aes(x=age, y=avg_time, group=1, color="Average"),
            lwd=2.0, alpha=0.8, inherit.aes = FALSE) +
  geom_line(data = fit_data,
            aes(x=age, y=.fitted, group=1, color="Fitted"),
            lwd=2.0, show.legend=FALSE, inherit.aes = FALSE) +
  geom_line(data = by_age_avg_time,
            aes(x=age, y=superfast, group=1, color="Fast"),
            lwd=2.0, show.legend=FALSE, inherit.aes = FALSE) +
  theme_minimal() +
  scale_color_manual("",
                     values=c("Result" = "black",
                              "Average" = "#5f987a",
                              "Fitted" = "deeppink",
                              "Fast" = "steelblue"),
                     labels = c("Result" = "Result",
                                "Average" = "Average (Median by Age)",
                                "Fitted" = "Fitted",
                                "Fast" = "Super Fast")) +
  theme(legend.position = "top") +
  guides(colour = guide_legend(nrow = 1)) +
  labs(x="Age",
       y="Finish Time (h:m:s)",
       title="Age vs. Speed",
       subtitle="As we get older, we slow down... but the deterioration is gradual.")
p
```
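As a follow-up, the fitted `nls` object can be used to read off an expected finish time at any age; a minimal sketch (the ages below are purely illustrative; not run by default):
```{r age-prediction, eval=FALSE}
# predicted finish time at a few hypothetical ages, using the power-law fit above
new_ages <- data.frame(age = c(38, 50, 65))
pred_seconds <- predict(fit, newdata = new_ages)
data.frame(age = new_ages$age, predicted_time = unlist(lapply(pred_seconds, sthms)))
```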
### Find Your Cluster: Time at Age 38+
```{r time-at-38}
# find individuals with multiple race results in dataset
multiple_marathons <- data %>%
  group_by(name_clean) %>%
  summarise(count = n()) %>%
  filter(count > 1)
# find avg time of individuals with race history
multiple_times <- data %>%
  filter(name_clean %in% multiple_marathons$name_clean) %>%
  group_by(name_clean) %>%
  summarise(multiples_avg_time = mean(time_total_minutes))
# join avg times for racers with history to base data
data <- left_join(data, multiple_times, by="name_clean")
# threshold: if the average over a runner's race history is 162 minutes (2:42) or less, faster; otherwise slower.
# runners with no race history (NA after the join) are labeled Single_Race.
data$history_cat <- ifelse(is.na(data$multiples_avg_time), "Single_Race",
                           ifelse(data$multiples_avg_time <= 162, "History_Faster", "History_Slower"))
# filter to just fast-history racers when they are 38 or older
focus_group_df <- data %>% filter(history_cat == "History_Faster" & age >= 38)
# use our subset to estimate prob of running <= 2:20 at 38+
d <- ecdf(focus_group_df$seconds)
print(paste("Frank, your 'probability' of running faster than 2:20 after the age of 38 is approximately", round(d(hmsts("2:20:00")) * 100, 1), "percent. Good luck!"))
# create histogram to support
min_min <- min(focus_group_df$seconds)
max_min <- max(focus_group_df$seconds)
the_seq <- seq(min_min, max_min, 10*60)
x_labels <- unlist(lapply(the_seq, sthms))
ggplot(focus_group_df, aes(x=seconds)) +
  geom_histogram(fill="steelblue") +
  scale_x_continuous(labels=x_labels,
                     breaks=the_seq) +
  theme_minimal() +
  geom_vline(xintercept = hmsts("2:20:00"), linetype="dotted") +
  labs(x="Finish Time (h:m:s)",
       y="Count",
       title="Age 38+ Finish Times of Historically Fast Individuals")
```
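The same empirical CDF can be queried at other target times; a short sketch (these thresholds are hypothetical, chosen just for illustration; not run by default):
```{r other-thresholds, eval=FALSE}
# probability of the focus group finishing under a few other (illustrative) thresholds
targets <- c("2:30:00", "2:45:00", "3:00:00")
data.frame(target = targets,
           prob_pct = round(d(unlist(lapply(targets, hmsts))) * 100, 1))
```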
### When?
```{r by-month}
# find mean time per month
time_by_month <- data %>%
  group_by(marathon_month) %>%
  summarise(avg_time = mean(seconds))
# create month name as an ordered factor (assumes marathon_month is numeric 1-12)
time_by_month$month_name <- month.abb[time_by_month$marathon_month]
time_by_month$month_name <- factor(time_by_month$month_name, levels = month.abb)
# create variables to identify relatively slower or faster months (converted to minutes)
monthly_time <- mean(time_by_month$avg_time)
time_by_month$variance <- (monthly_time - time_by_month$avg_time) / 60
time_by_month$color <- ifelse(time_by_month$variance < 0, "Slower", "Faster")
# plot it!
p <- ggplot(time_by_month, aes(x=month_name, y=variance, fill=color)) +
  geom_bar(stat = "identity") +
  scale_fill_manual(values=c("#1a9641", "#d7191c")) +
  theme_minimal() +
  theme(legend.position="top", legend.title=element_blank()) +
  labs(x="Month", y="Minutes Relative to the All-Month Average")
p
```
### Now the Maps
```{r by-state}
# helpful:
# https://rkabacoff.github.io/datavis/GeoMaps.html
us_states <- map_data("state")
state_df <- data %>%
  group_by(marathon_state) %>%
  summarise(avg_time = mean(time_total_minutes))
state_df <- state_df %>% filter(marathon_state %in% state_list)
state_df$state_name <- abbr2state(state_df$marathon_state)
state_df$region <- tolower(state_df$state_name)
state_df <- left_join(state_df, us_states, by="region")
# create map in stages
# step 1: feed your data to ggplot
p0 <- ggplot(data = state_df,
             mapping = aes(x = long, y = lat,
                           group = group,
                           fill = avg_time))
# step 2: define your map attributes
p1 <- p0 + geom_polygon(color = "gray90", size = 0.05) +
  coord_map(projection = "albers", lat0 = 39, lat1 = 45)
# step 3: define your colors
p2 <- p1 + scale_fill_viridis_c(option = "A", direction = 1)
# step 4: labels and make it look sharp
p3 <- p2 + theme_map() +
  theme(legend.position = "top",
        strip.background = element_blank()) +
  labs(fill = "Average Minutes",
       title = "State Marathon Avg. Times (Top 100 Finishers)",
       subtitle = "Years included = 2000, 2005, 2010, 2015, 2021")
p3
```
```{r map-facet}
library(maps)
us_states <- map_data("state")
state_facet <- data %>%
  group_by(marathon_month, marathon_state) %>%
  summarise(avg_time = mean(time_total_minutes))
state_facet <- state_facet %>% filter(marathon_state %in% state_list)
state_facet$state_name <- abbr2state(state_facet$marathon_state)
state_facet$region <- tolower(state_facet$state_name)
state_facet <- left_join(state_facet, us_states, by="region")
p0 <- ggplot(data = state_facet,
             mapping = aes(x = long, y = lat,
                           group = group,
                           fill = avg_time))
p1 <- p0 + geom_polygon(color = "gray90", size = 0.05) +
  coord_map(projection = "albers", lat0 = 39, lat1 = 45)
p2 <- p1 + scale_fill_viridis_c(option = "A", direction = 1)
p3 <- p2 + theme_map() + facet_wrap(~ marathon_month, ncol = 3) +
  theme(legend.position = "top",
        strip.background = element_blank()) +
  labs(fill = "Average Minutes",
       title = "State & Month Marathon Avg. Times (Top 100 Finishers)",
       subtitle = "Years included = 2000, 2005, 2010, 2015, 2021")
p3
```
### Specific Marathons to Run
```{r gems}
gems_df <- data %>%
  filter(marathon_year >= 2015 & marathon_state != " -") %>%
  group_by(marathon_title, marathon_state, marathon_month) %>%
  summarise(avg_finishers = round(mean(num_finishers)),
            avg_seconds = median(seconds),
            avg_time = sthms(median(seconds)),
            min_seconds = min(seconds),
            min_time = sthms(min(seconds)),
            ten_pct = quantile(seconds, c(0.10)),
            ten_pct_time = sthms(quantile(seconds, c(0.10))))
gems_df %>% filter(avg_finishers < 2500 &
                     ten_pct <= hmsts("3:00:00") &
                     min_seconds <= hmsts("2:30:00") &
                     marathon_state %in% c("NJ", "NY", "CT", "RI", "VT", "NH", "MA")) %>%
  arrange(ten_pct) %>%
  select(marathon_title, marathon_state, marathon_month,
         avg_finishers, avg_time, min_time, ten_pct_time) %>%
  head(20) %>%
  kbl(caption = "Possible Target Races", align=c("l",rep("c",6))) %>%
  kable_classic(full_width = F, html_font = "Cambria", position = "center")
```
# END
Below is simply a lot of experimenting... code and ideas that didn't make the cut for the blog post.
```{r bayes, echo=FALSE, eval = FALSE}
# bootstrap-style resampling of model parameters
age_param <- c()
month_param <- c()
interaction_param <- c()
for (i in 1:100) {
  s <- data %>% filter(age >= 21 & age <= 90) %>% slice_sample(n = 5000, replace = TRUE)
  m <- lm(data=s, time_total_minutes ~ I(age^2) * marathon_month)
  age_param <- c(age_param, m$coefficients[2])
  month_param <- c(month_param, m$coefficients[3])
  interaction_param <- c(interaction_param, m$coefficients[4])
}
age_param <- c()
for (i in 1:1000) {
  s <- data %>% filter(age >= 21 & age <= 90) %>% slice_sample(n = 5000, replace = TRUE)
  fit <- nls(data = s,
             formula = time_total_minutes ~ a + b * (age^c),
             start=c(a = 200, b = 0.01, c=2),
             control = list(maxiter = 200))
  age_param <- c(age_param, coef(fit)[3])
}
hist(age_param)
```
Mysteries are good - they keep you thinking. However, we also need some helpful information. Running NYC is exhilarating. Coming down the 59th Street Bridge and hearing the roar of the crowd on 1st Avenue is quite a surreal experience. But the race logistics of NYC are nightmarish. Waiting hours at the start line? If I can avoid that, I will. Where are the races that are fast - fast courses - with relatively few participants? Let's look at races with fewer than 500 finishers, sorted by finish times.
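A minimal sketch of that query, using the columns built earlier (`num_finishers`, `seconds`, `marathon_title`); the 500-finisher cutoff comes from the sentence above and the sort uses the median finish time per race (not run by default):
```{r small-fast-races, echo=FALSE, eval=FALSE}
# races with fewer than 500 finishers, ranked by median finish time of the field in the data
data %>%
  filter(num_finishers < 500) %>%
  group_by(marathon_title, marathon_state) %>%
  summarise(avg_finishers = round(mean(num_finishers)),
            median_seconds = median(seconds),
            median_time = sthms(median(seconds))) %>%
  arrange(median_seconds) %>%
  head(20)
```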
```{r participant-average, echo=FALSE, eval = FALSE}
base <- distinct(data %>% select(marathon_name, num_finishers))
m_year <- distinct(data %>% select(marathon_name, marathon_year))
participants <- left_join(base, m_year, by="marathon_name")
participants %>%
  group_by(marathon_year) %>%
  summarise(mean_finishers = mean(num_finishers),
            median_finishers = median(num_finishers))
```
```{r time-trends, echo=FALSE, eval = FALSE}
avg_time <- mean(data$time_total_minutes)
ggplot(data, aes(x=marathon_year, y=time_total_minutes, group=marathon_year)) +
  geom_boxplot(outlier.shape = NA) +
  scale_y_continuous(limits = c(100, 400)) +
  geom_hline(yintercept = avg_time, color="blue") +
  scale_fill_manual(values=c("#1a9641", "#d7191c")) +
  theme_minimal() +
  theme(legend.position="top", legend.title=element_blank()) +
  labs(x="Year", y="Minutes", title="Marathon Times Evolution",
       subtitle="Post pandemic, average marathon times have plateaued but the distribution has widened.")
```
```{r race-size-analysis-1, echo=FALSE, eval = FALSE}
data$race_size <- cut(data$num_finishers, 4, labels = c("Very Small", "Small", "Large", "Very Large"))
time_by_race_size_df <- data %>% group_by(race_size) %>% summarize(avg_time = mean(time_total_minutes))
ggplot(time_by_race_size_df, aes(x=race_size, y=avg_time, fill=avg_time)) +
  geom_bar(stat="identity") +
  # geom_hline(yintercept = avg_time, color="blue") +
  # scale_fill_manual(values=c("#1a9641", "#d7191c")) +
  theme_minimal() +
  theme(legend.position="top") +
  labs(x="Relative Race Size", y="Average Minutes", title="Larger Race, Faster Times", subtitle="Generally, bigger races attract faster runners.")
```
```{r num-finishers-analysis-2, echo=FALSE, eval = FALSE}
# avg time by race size
time_by_finishers_df <- data %>%
  group_by(num_finishers) %>%
  summarise(avg_time = mean(time_total_minutes))
# what is the function? frank manually tries to get close...
# a <- 1
# b <- 2
# y <- 160 + 200*(1-0.002)^time_by_finishers_df$num_finishers
# time_by_finishers_df$pred_time <- y
# ggplot(time_by_finishers_df, aes(x=num_finishers, y=avg_time)) +
#   geom_point() +
#   geom_line(data=time_by_finishers_df, aes(x=num_finishers, y=pred_time), color="red")
fit <- nls(data = time_by_finishers_df,
           formula = avg_time ~ a + b * (1 - c) ^ num_finishers,
           start = c(a = 160, b = 200, c = 0.002),
           control = list(maxiter = 200))
fit_data <- augment(fit)
ggplot(data = fit_data, aes(x=num_finishers, y=avg_time)) +
  geom_point() +
  geom_line(data = fit_data, aes(x=num_finishers, y=.fitted), color="red") +
  theme_minimal() +
  theme(legend.position="top") +
  labs(x="Race Size (# of Finishers)", y="Average Minutes", title="Larger Race, Faster Times", subtitle="Generally, bigger races attract faster runners.")
```
```{r num-finishers-analysis-3, echo=FALSE, eval = FALSE}
library(gridExtra) # for grid.arrange
# avg time by race size (smaller races)
time_by_finishers_tiny <- data %>%
  filter(num_finishers <= 2000) %>%
  group_by(num_finishers) %>%
  summarise(avg_time = mean(time_total_minutes))
fit_tiny <- nls(avg_time ~ SSasymp(num_finishers, yf, y0, log_alpha), data = time_by_finishers_tiny)
fit_tiny_data <- augment(fit_tiny)
p1 <- ggplot(data = fit_tiny_data, aes(x=num_finishers, y=avg_time)) +
  geom_point() +
  geom_line(data = fit_tiny_data, aes(x=num_finishers, y=.fitted), color="red") +
  scale_y_continuous(limits=c(100, 450)) +
  theme_minimal() +
  theme(legend.position="top") +
  labs(x="# of Finishers", y="Average Minutes", title="Smaller Races", subtitle="Generally, bigger races attract faster runners.")
# avg time by race size (larger races)
time_by_finishers_not_tiny <- data %>%
  filter(num_finishers > 2000) %>%
  group_by(num_finishers) %>%
  summarise(avg_time = mean(time_total_minutes))
# https://douglas-watson.github.io/post/2018-09_exponential_curve_fitting/
fit_not_tiny <- nls(avg_time ~ SSasymp(num_finishers, yf, y0, log_alpha), data = time_by_finishers_not_tiny)
fit_not_tiny_data <- augment(fit_not_tiny)
p2 <- ggplot(data = fit_not_tiny_data, aes(x=num_finishers, y=avg_time)) +
  geom_point() +
  geom_line(data = fit_not_tiny_data, aes(x=num_finishers, y=.fitted), color="red") +
  scale_y_continuous(limits=c(100, 450)) +
  theme_minimal() +
  theme(legend.position="top") +
  labs(x="# of Finishers", y="Average Minutes", title="Larger Races", subtitle="Generally, bigger races attract faster runners.")
grid.arrange(p1, p2, ncol = 2)
```
```{r small-races-analysis, echo=FALSE, eval = FALSE}
data$small_race <- ifelse(data$num_finishers < 100, "Small", ifelse(data$num_finishers < 1000, "Medium", "Large"))
data %>%
  group_by(marathon_year, small_race) %>%
  summarise(count = n()) %>%
  group_by(marathon_year) %>%
  mutate(pct = count / sum(count))
```
```{r race-size-times, echo=FALSE, eval = FALSE}
race_size_times_df <- data %>%
  group_by(small_race, marathon_year) %>%
  summarise(avg_time = mean(time_total_minutes))
ggplot(race_size_times_df, aes(x=marathon_year,
                               y=avg_time,
                               group=factor(small_race),
                               color=factor(small_race))) +
  geom_line() +
  # scale_y_continuous(limits=c(100, 450)) +
  theme_minimal() +
  theme(legend.position="top") +
  labs(x="Year", y="Average Minutes", color="Race Size",
       title="Average Times by Race Size over Time",
       subtitle="Generally, bigger races attract faster runners.")
```
```{r age-analysis, fig.width=16, echo=FALSE, eval = FALSE}
ggplot(data %>% filter(age >= 18 & age <= 80), aes(x=age, y=time_total_minutes, group=age)) + geom_boxplot()
```
There's a particular break here between Very Small and Small. Let's find it.
You don't have to know much about statistics to see that this is a crappy fit. Where does that line sort of bottom out? That's where I want to separate Very Small from the rest; a rough sketch for locating that point follows the fit below.
```{r, echo=FALSE, eval = FALSE}
df <- time_by_finishers_df
fitted <- df %>%
  nest() %>%
  mutate(
    fit = map(data, ~nls(avg_time ~ SSasymp(num_finishers, yf, y0, log_alpha), data = .)),
    tidied = map(fit, tidy),
    augmented = map(fit, augment)
  )
augmented <- fitted %>%
  unnest(augmented)
augmented
```
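One way to locate that "bottoming out" point from the augmented fit above; a rough sketch only, and the 1-minute tolerance is an arbitrary assumption (not run by default):
```{r find-break, echo=FALSE, eval=FALSE}
# approximate the break as the smallest race size whose fitted time is
# within 1 minute of the curve's lowest (asymptotic) fitted value
asymptote <- min(augmented$.fitted)
augmented %>%
  filter(.fitted <= asymptote + 1) %>%
  summarise(break_at = min(num_finishers))
```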
The break happens around 1,000 finishers. This means there is a drastic difference between races with fewer than 1,000 runners and those with more than 1,000. Could we fit a better curve if we just looked at races with more than 1,000 participants?
```{r num-finishers-analysis-large, echo=FALSE, eval = FALSE}
# avg time by race size, restricted to races with at least 1,000 finishers
time_by_finishers_df <- data %>%
  filter(num_finishers >= 1000) %>%
  group_by(num_finishers) %>%
  summarise(avg_time = mean(time_total_minutes))
# https://douglas-watson.github.io/post/2018-09_exponential_curve_fitting/
# fit exponential curve
fit <- nls(avg_time ~ SSasymp(num_finishers, yf, y0, log_alpha), data = time_by_finishers_df)
# plot curve
qplot(num_finishers, avg_time, data = augment(fit)) + geom_line(aes(y = .fitted), color = "blue")
```
Definitely a better fit. What does it mean? Overall, the insight here is that smaller races have slower average times. One hypothesis is...
```{r small-races-analysis-2, echo=FALSE, eval = FALSE}
data$small_race <- ifelse(data$num_finishers < 100, "Small", ifelse(data$num_finishers < 1000, "Medium", "Large"))
data %>%
  group_by(marathon_year, small_race) %>%
  summarise(count = n()) %>%
  group_by(marathon_year) %>%
  mutate(pct = count / sum(count))
```
```{r race-size-times-2, echo=FALSE, eval = FALSE}
race_size_times_df <- data %>%
  group_by(small_race, marathon_year) %>%
  summarise(avg_time = mean(time_total_minutes))
ggplot(race_size_times_df, aes(x=marathon_year, y=avg_time, group=factor(small_race), color=factor(small_race))) + geom_line()
```
This first split tells us a few things about the upward trend and plateau in average marathon times. A) The plateau is driven by a growing share of smaller races. B) The top-100 performances in big races in 2021 saw big gains, consistent with the headlines and records broken that year. C) Since bigger races typically have faster times, there's a hint that as big races come back online in 2022, average race times will stay flat or perhaps even improve.
```{r repeat-races, echo=FALSE, eval = FALSE}
repeat_races_df <- data %>% group_by(marathon_title) %>% summarise(count = n_distinct(marathon_year))
# hist(repeat_races_df$count)
repeat_races <- repeat_races_df %>% filter(count >= 4)
repeat_viz <- data %>% filter(marathon_title %in% repeat_races$marathon_title) %>%
  group_by(small_race, marathon_year) %>%
  summarise(race_count = n_distinct(marathon_title), avg_time = mean(time_total_minutes))
ggplot(repeat_viz, aes(x=marathon_year, y=avg_time, group=factor(small_race), color=factor(small_race))) + geom_line()
```
## Marathon Location
```{r by-age-scatter-and-fit, echo=FALSE, eval = FALSE}
by_age <- data %>% filter(age >= 21 & age < 90) %>% group_by(age) %>% summarise(avg_time = mean(time_total_minutes))
ggplot(by_age, aes(x=age, y=avg_time)) + geom_point()
# https://douglas-watson.github.io/post/2018-09_exponential_curve_fitting/
# fit exponential curve
fit <- nls(avg_time ~ SSasymp(age, yf, y0, log_alpha), data = by_age)
# plot curve
qplot(age, avg_time, data = augment(fit)) + geom_line(aes(y = .fitted), color = "blue")
model <- function(data){
  nls(avg_time ~ a + b * (age^c), data = data, start = c(a = 205, b = 0.01, c = 2))
}
hold <- by_age %>% group_by(age) %>%
  nest() %>%
  mutate(mod = map(.x=data, model))
fit <- nls(data = by_age, formula = avg_time ~ a + b * (age^c), start=c(a = 205, b = 0.01, c=2))
fit_data <- augment(fit)
qplot(age, avg_time, data = augment(fit)) + geom_line(aes(y = .fitted), color = "blue") +
  theme_minimal(base_family="Roboto")
# start <- min(by_age$avg_time)
# my_range <- 1:70
# y <- start+5 + 0.01 * my_range ^ 2.40
# y <- start+5 + 0.005 * my_range ^ 2.58
# y <- 214.485465023 + 0.000003259 * my_range ^ 4.104293201
#
# all_ages <- 21:90
# base_data <- data.frame(age = all_ages)
# test <- base_data %>% left_join(by_age, by="age")
# test$fit <- y
#
# ggplot(test, aes(x=age, y=avg_time)) + geom_point() +
#   geom_line(data=test, aes(x=age, y=fit), color="red")
```
```{r sample-data, echo=FALSE, eval = FALSE}
df <- data.frame("index" = numeric(), "age" = numeric(), "time_total_minutes" = numeric(),
                 ".fitted" = numeric(), ".resid" = numeric())
for (i in 1:50) {
  print(i)
  s <- data %>% filter(age >= 21 & age <= 90) %>% slice_sample(n = 1000, replace = TRUE)
  start_value <- (data %>% filter(age < 20) %>% summarize(v = mean(time_total_minutes)))$v
  tryCatch(
    {fit <- nls(data = s,
                formula = time_total_minutes ~ a + b * (age^c),
                start = c(a = start_value, b = 0.01, c = 2),
                control = list(maxiter = 200))
    },
    error = function(e) {
      print(paste("Oops! --> Error in Loop ", i, sep = ""))
    }
  )
  fit_data <- augment(fit)
  fit_data$index <- i
  df <- rbind(df, fit_data)
}
ggplot(data = df, aes(x=age, y=time_total_minutes)) + geom_point() +
  geom_line(data = df, aes(x=age, y=.fitted, group = factor(index), color = factor(index))) +
  theme(legend.position = "none")
```
```{r joys-marathons, echo=FALSE, eval = FALSE}
joys_marathons <- c("REVEL Canyon City Marathon", "Big Cottonwood Marathon & Half", "REVEL Big Bear and Half Marathon")
```