Skip to content

Instantly share code, notes, and snippets.

@FrankRuns
Created November 28, 2023 11:59
Show Gist options
  • Save FrankRuns/bc0ba0dad94ab3b99c8932cb445c9ad8 to your computer and use it in GitHub Desktop.
Save FrankRuns/bc0ba0dad94ab3b99c8932cb445c9ad8 to your computer and use it in GitHub Desktop.
Quick inspection and vis of Dr. Who dataset for TidyTuesday 2023-11-28
# load packages
library(tidyverse)
library(tidytuesdayR) # Used for loading datasets from the TidyTuesday project
# load datasets
tuesdata <- tidytuesdayR::tt_load('2023-11-28')
drwho_episodes <- tuesdata$drwho_episodes
drwho_directors <- tuesdata$drwho_directors
drwho_writers <- tuesdata$drwho_writers
# Initialize the correlations data frame
correlations <- data.frame(lag = integer(), correlation = numeric())
# Calculate correlation for 0 lag
corr_0_lag <- cor(drwho_episodes$uk_viewers, drwho_episodes$rating, use="complete.obs")
correlations <- rbind(correlations, data.frame(lag = 0, correlation = corr_0_lag))
# Calculate correlations for lags 1 through 4
for (i in 1:4) {
lag_var <- paste0("rating_lag", i)
drwho_episodes[[lag_var]] <- lag(drwho_episodes$rating, i)
corr <- cor(drwho_episodes$uk_viewers, drwho_episodes[[lag_var]], use="complete.obs")
correlations <- rbind(correlations, data.frame(lag = i, correlation = corr))
}
# Join with writers dataset to analyze average viewership by writer
drwho_episodes <- left_join(drwho_episodes, drwho_writers, by="story_number")
drwho_episodes %>%
group_by(writer) %>%
summarise(avg_viewers = mean(uk_viewers, na.rm = TRUE)) %>%
arrange(desc(avg_viewers))
# Investigating how the previous week's writer influences current week's viewership
drwho_episodes$writer_lag1 <- lag(drwho_episodes$writer, 1)
drwho_episodes %>%
filter(!is.na(writer_lag1)) %>%
group_by(writer_lag1) %>%
summarise(avg_viewers = mean(uk_viewers, na.rm = TRUE)) %>%
arrange(desc(avg_viewers))
# Add a column for color
correlations$color <- ifelse(correlations$lag == 1, "darkblue", "lightblue")
# Plot the correlations with conditional coloring
ggplot(correlations, aes(x = lag, y = correlation, fill = color)) +
geom_bar(stat = "identity", show.legend = FALSE) +
scale_fill_manual(values = c("darkblue", "steelblue")) +
labs(title = "#TidyTuesday: Dr. Who's Viewership",
subtitle = "Correlation of UK Viewers with Current/Lagged Ratings",
x = "Lag (ie. prior week's rating)",
y = "Correlation") +
theme_minimal()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment