Skip to content

Instantly share code, notes, and snippets.

@FrankRuns
Created January 10, 2024 19:47
Show Gist options
  • Save FrankRuns/cfb88f42778e719cd44de0be543fb321 to your computer and use it in GitHub Desktop.
Save FrankRuns/cfb88f42778e719cd44de0be543fb321 to your computer and use it in GitHub Desktop.
Script to simulate TV holiday episode data for the "data analysts make mistakes" article
# Load Required Libraries
if (!require("MASS")) install.packages("MASS")
library(MASS)
# Define TV Shows
# A character vector of TV show titles; each entry later receives its own
# simulated season count and base rating (indexed by position).
# The vector has 94 entries but only 91 distinct titles: "The Crown",
# "The Marvelous Mrs. Maisel" and "The Handmaid's Tale" each appear twice.
# NOTE(review): because downstream code draws values per entry, a repeated
# title can end up with repeated season numbers in the final data frame --
# confirm whether the duplicates are intentional before de-duplicating
# (removing them changes the vector length and therefore the seeded draws).
tv_shows <- c(
"Breaking Bad", "Game of Thrones", "The Wire",
"Stranger Things", "The Crown", "Mad Men",
"The Sopranos", "Friends", "The Office",
"Parks and Recreation", "Sherlock", "Doctor Who",
"Fargo", "The Mandalorian", "Westworld",
"Better Call Saul", "Black Mirror", "The Simpsons",
"Futurama", "Rick and Morty", "The Big Bang Theory",
"Brooklyn Nine-Nine", "The Marvelous Mrs. Maisel", "Succession",
"Ted Lasso", "The Handmaid's Tale", "The Witcher",
"Mindhunter", "Killing Eve", "Ozark",
"Peaky Blinders", "Narcos", "Vikings",
"The Boys", "The Expanse", "Bridgerton",
"Money Heist", "The Umbrella Academy", "Daredevil",
"This Is Us", "Grey's Anatomy", "House of Cards",
"The Haunting of Hill House", "Lost", "Chernobyl",
"Seinfeld", "Arrested Development", "Twin Peaks",
"The Twilight Zone", "South Park", "Archer",
"It's Always Sunny in Philadelphia", "Community", "Fleabag",
"Dexter", "Homeland", "Prison Break",
"House", "The Last Dance", "The Queen's Gambit",
"True Detective", "The Walking Dead", "How I Met Your Mother",
"Curb Your Enthusiasm", "BoJack Horseman", "Band of Brothers",
"The Office (UK)", "Schitt's Creek", "The Americans",
"Battlestar Galactica", "The X-Files", "Lost in Space",
"Star Trek", "Firefly", "The Good Place",
"Atlanta", "Barry", "The Handmaid's Tale",
"Orange Is the New Black", "Downton Abbey", "The West Wing",
"Hannibal", "Luther", "The Marvelous Mrs. Maisel",
"Modern Family", "Scrubs", "Silicon Valley",
"Suits", "True Blood", "Vampire Diaries",
"Gossip Girl", "The Young Pope", "Lupin",
"The Crown"
)
# Simulation Parameters
# Season counts are drawn from a negative binomial distribution and base
# ratings from a normal distribution; all tunable constants live here.
mu <- 3 # Mean number of seasons
size <- 1 # Size parameter (dispersion)
mean_rating <- 7 # Mean of the base-rating distribution
sd_rating <- 1.5 # Standard deviation of the base-rating distribution
# Simulate Season Counts
# Fix the seed first so every run produces the same draws, then draw one
# season count per entry of tv_shows
set.seed(42)
season_counts <- rnbinom(length(tv_shows), size = size, mu = mu)
# Simulate Base Ratings
# Draw one base rating per entry of tv_shows and clamp it to the 0-10 scale
base_ratings <- pmin(
  pmax(rnorm(length(tv_shows), mean = mean_rating, sd = sd_rating), 0),
  10
)
# Function to Generate Season Ratings
# Draws `count` per-season ratings from a normal distribution centred on
# `base_rating` (sd = 0.5) and clamps every draw to the 0-10 scale.
# Returns a numeric vector of length `count`, or an empty numeric vector
# when `count` is not positive.
generate_season_ratings <- function(base_rating, count) {
  # Guard clause: nothing to simulate for a show without seasons
  if (!(count > 0)) {
    return(numeric(0))
  }
  raw_draws <- rnorm(n = count, mean = base_rating, sd = 0.5)
  floored <- pmax(raw_draws, 0) # no rating below 0
  pmin(floored, 10) # no rating above 10
}
# Create a Dataframe of TV Show Seasons and Ratings
# Builds one data frame per entry of tv_shows (one row per simulated season)
# and stacks them with rbind. Entries that drew zero seasons yield NULL,
# which rbind drops, so they contribute no rows.
# Resulting columns: TV_Show, Season_Number, Rating, Group.
tv_show_season_df <- do.call("rbind", lapply(seq_along(tv_shows), function(i) {
  n_seasons <- season_counts[i]
  # Skip shows without seasons; seq_along/seq_len also keep the empty-input
  # case safe (1:0 would iterate over c(1, 0))
  if (n_seasons <= 0) {
    return(NULL)
  }
  # Scalar condition, so plain if/else rather than vectorised ifelse()
  group_label <- if (n_seasons > 1) "multiple_seasons" else "one_season"
  data.frame(
    TV_Show = tv_shows[i],
    Season_Number = seq_len(n_seasons),
    Rating = generate_season_ratings(base_ratings[i], n_seasons),
    Group = rep(group_label, n_seasons)
  )
}))
# Clean-up and Display
# No row filtering is needed: entries with zero seasons were NULL list
# elements that rbind already dropped when the data frame was built. (The
# previous filter applied a per-column is.null() mask as a row index, which
# never removed anything.) Rename columns for clarity.
names(tv_show_season_df) <- c(
  "parent_primary_title", "season_number", "parent_average_rating", "group"
)
# Display a Formatted Sample of the Dataframe
# All formattable helpers are namespace-qualified because the package is
# never attached with library() in this script.
# NOTE(review): `group` is a character column while color_bar is typically
# used on numeric columns -- confirm the intended formatter for it.
tv_show_season_df_head <- head(tv_show_season_df)
formattable::formattable(tv_show_season_df_head, list(
  parent_average_rating = formattable::color_tile("transparent", "lightpink"),
  group = formattable::color_bar("#80ed99")
))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment