Created
April 3, 2023 13:59
-
-
Save bayesball/ea6152b7790963eaec573e13ec367fa4 to your computer and use it in GitHub Desktop.
Some R functions produced by the ChatGPT program
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
--- | |
title: "ChatGPT Examples" | |
format: html | |
editor: visual | |
--- | |
## Collecting Cards | |
_Write R function to illustrate baseball card collection problem._ | |
```{r} | |
#' Simulate the baseball card (coupon collector) problem
#'
#' Draws cards one at a time, uniformly at random with replacement, from a
#' set of `n` distinct cards until `m` distinct cards have been collected.
#'
#' @param n Number of distinct cards in the full set (a positive whole number).
#' @param m Number of distinct cards to collect; must satisfy `0 <= m <= n`.
#' @return The number of cards drawn, as an integer.
baseball_card_problem <- function(n, m) {
  # Collecting more distinct cards than exist can never finish, so reject it
  # up front instead of looping forever.
  if (length(n) != 1 || length(m) != 1 || is.na(n) || is.na(m) ||
      n < 1 || m < 0 || m > n) {
    stop("n must be a positive number and m must satisfy 0 <= m <= n")
  }
  # Track which cards have been seen rather than storing every draw; this
  # avoids growing a vector and re-running unique() on each iteration.
  seen <- logical(n)
  distinct_seen <- 0L
  total_cards <- 0L
  while (distinct_seen < m) {
    new_card <- sample.int(n, 1)
    total_cards <- total_cards + 1L
    if (!seen[new_card]) {
      seen[new_card] <- TRUE
      distinct_seen <- distinct_seen + 1L
    }
  }
  total_cards
}
``` | |
```{r} | |
# Simulation settings
simulations <- 10000 # how many independent collections to simulate
n <- 100 # distinct cards available in the full set
m <- 50 # distinct cards the collector wants to own
# Fix the seed so repeated runs give the same numbers
set.seed(123)
# Run the collection experiment once per simulation
results <- replicate(
  simulations,
  baseball_card_problem(n, m)
)
# Summarise the simulated card counts
mean_result <- mean(results)
sd_result <- sd(results)
# Report the findings
cat("To complete a set of", m, "cards with", n, "unique cards in the set:\n")
cat(
  "The expected number of cards needed is",
  round(mean_result, 2),
  "\n"
)
cat(
  "The standard deviation of the number of cards needed is",
  round(sd_result, 2),
  "\n"
)
``` | |
## Streaks | |
_Write a R function to find the longest streak in a binary sequence._ | |
Note: I had to make a correction in finding the maximum and minimum of the streaks of 1's and 0's in the `find_longest_streak()` function.
```{r} | |
#' Length of the longest streak in a binary sequence
#'
#' Finds the longest run of consecutive identical values among the runs of
#' 1's and the runs of 0's in `x`.
#'
#' @param x A vector coercible to numeric (e.g. 0/1 values or logicals).
#'   Runs of any value other than 0 or 1 (including `NA`) are ignored.
#' @return The length of the longest run of 1's or 0's, or `0L` when `x`
#'   contains no 0's or 1's (for example when it is empty).
find_longest_streak <- function(x) {
  # Convert the input to a numeric vector
  x <- as.numeric(x)
  # Compute the run-length encoding once and reuse it
  runs <- rle(x)
  binary_runs <- runs$lengths[runs$values %in% c(0, 1)]
  # max() of an empty vector is -Inf with a warning, so handle that case
  # explicitly (all-1 input has no 0-runs, empty input has no runs at all).
  if (length(binary_runs) == 0) {
    return(0L)
  }
  max(binary_runs)
}
``` | |
```{r} | |
# Fix the seed so the example sequence is the same on every run
set.seed(123)
# Draw 20 independent 0/1 values
x <- sample(c(0, 1), size = 20, replace = TRUE)
# Show the sequence that was drawn
cat(
  "The binary sequence is:",
  paste(x, collapse = " "),
  "\n"
)
# Longest run of identical values, whether 1's or 0's
max_streak <- find_longest_streak(x)
# Report its length
cat(
  "The length of the longest streak of consecutive 1's or 0's is:",
  max_streak,
  "\n"
)
``` | |
## Simulating a Baseball Season | |
_Write the R function to use the Bradley Terry Model to simulate a baseball season._ | |
```{r} | |
#' Simulate a round-robin baseball season with the Bradley-Terry model
#'
#' Every ordered pair of distinct teams (home, away) meets
#' `games_per_matchup` times. The home team wins each game with probability
#' `strength_home / (strength_home + strength_away)`, and every game is
#' credited to exactly one of the two teams.
#'
#' @param team_names Vector of team names.
#' @param team_strengths Numeric vector of positive, finite Bradley-Terry
#'   strengths, one per team.
#' @param games_per_matchup Number of games played for each ordered
#'   (home, away) pair; a single non-negative whole number. Defaults to 1.
#' @return A data frame with columns `name`, `strength` and `wins`, sorted by
#'   `wins` in decreasing order.
simulate_baseball_season <- function(team_names, team_strengths,
                                     games_per_matchup = 1L) {
  # Check that the input is valid
  if (length(team_names) != length(team_strengths)) {
    stop("Length of team_names and team_strengths must be the same")
  }
  if (!is.numeric(team_strengths) ||
      any(!is.finite(team_strengths)) || any(team_strengths <= 0)) {
    stop("team_strengths must be positive, finite numbers")
  }
  if (length(games_per_matchup) != 1 || !is.numeric(games_per_matchup) ||
      is.na(games_per_matchup) || games_per_matchup < 0 ||
      games_per_matchup != round(games_per_matchup)) {
    stop("games_per_matchup must be a single non-negative whole number")
  }
  num_teams <- length(team_names)
  # prob_home_win[i, j] is the probability that team i (home) beats team j
  prob_home_win <- outer(
    team_strengths, team_strengths,
    function(home, away) home / (home + away)
  )
  # No team plays against itself, so only off-diagonal cells are matchups
  is_matchup <- row(prob_home_win) != col(prob_home_win)
  # Draw one binomial count per matchup, using that matchup's own probability
  home_wins <- matrix(0, num_teams, num_teams)
  home_wins[is_matchup] <- rbinom(
    sum(is_matchup), games_per_matchup, prob_home_win[is_matchup]
  )
  # Games the home team did not win are wins for the away team
  away_wins <- matrix(0, num_teams, num_teams)
  away_wins[is_matchup] <- games_per_matchup - home_wins[is_matchup]
  # Rows index the home team and columns index the away team
  wins <- as.integer(rowSums(home_wins) + colSums(away_wins))
  # Assemble the standings and sort the teams by the number of wins
  teams <- data.frame(name = team_names, strength = team_strengths, wins = wins)
  teams <- teams[order(teams$wins, decreasing = TRUE), ]
  rownames(teams) <- NULL
  teams
}
``` | |
```{r} | |
# Set the random seed so the simulated standings are reproducible
set.seed(123)
# Define the team names and their Bradley-Terry strengths (same order)
team_names <- c("Red Sox", "Yankees", "Blue Jays", "Rays", "Orioles")
team_strengths <- c(1.2, 1.1, 1.0, 0.9, 0.8)
# Simulate the baseball season and print the resulting standings
simulate_baseball_season(team_names, team_strengths)
``` | |
## Regression to the Mean | |
_Write R code with baseball data to illustrate regression to the mean._ | |
```{r} | |
# Load necessary libraries | |
library(Lahman) | |
library(dplyr) | |
library(ggplot2) | |
# Make the Lahman `Batting` table available in the workspace
data("Batting")
# Keep Mike Trout's rows (playerID "troutmi01") and add his batting
# average, hits divided by at-bats, for each row
player_data <- Batting %>%
  filter(playerID == "troutmi01") %>%
  select(yearID, AB, H) %>%
  mutate(BA = H / AB)
# Unweighted mean of the per-row batting averages
mean_BA <- mean(player_data$BA)
# Batting average by year, with the mean drawn as a dashed reference line
ggplot(player_data, aes(x = yearID, y = BA)) +
  geom_point() +
  geom_hline(yintercept = mean_BA, linetype = "dashed") +
  labs(
    title = "Mike Trout's Batting Average by Year",
    x = "Year",
    y = "Batting Average"
  )
# Correlation between each batting average and the one in the preceding row
cor(tail(player_data$BA, -1), head(player_data$BA, -1))
``` | |
## Berkson's Paradox | |
_Illustrate Berkson's paradox using baseball data._ | |
NOTE: Although the description of the paradox was correct, this particular code example was not a good illustration of the paradox. | |
``` | |
library(dplyr) | |
library(ggplot2) | |
# Read in Lahman data
# NOTE(review): assumes read.csv() renames the CSV's "2B"/"3B" headers to
# X2B/X3B (its default check.names = TRUE behaviour) - confirm against the file.
batting <- read.csv("Batting.csv")
# Career totals per player, keeping players with at least 500 career at-bats
batting_sub <- batting %>%
  group_by(playerID) %>%
  summarize(G = sum(G), AB = sum(AB), H = sum(H),
            X2B = sum(X2B), X3B = sum(X3B),
            HR = sum(HR), BB = sum(BB), yearID = min(yearID)) %>%
  filter(AB >= 500)
# Calculate batting average and slugging percentage.
# Slugging is total bases per at-bat: 1B + 2 * 2B + 3 * 3B + 4 * HR, which
# equals H + 2B + 2 * 3B + 3 * HR because H already counts every hit once.
batting_sub <- mutate(batting_sub,
                      AVG = H / AB,
                      SLG = (H + X2B + 2 * X3B + 3 * HR) / AB)
# Plot the relationship between batting average and slugging percentage
ggplot(batting_sub, aes(x = AVG, y = SLG)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE) +
  labs(title = "Relationship between Batting Average and Slugging Percentage",
       x = "Batting Average", y = "Slugging Percentage")
``` | |
``` | |
# Keep only the players whose career games total is above zero
batting_sub_majors <- filter(batting_sub, G > 0)
# Scatter plot of slugging percentage against batting average for that
# subset, with a least-squares line and no confidence band
ggplot(batting_sub_majors, aes(x = AVG, y = SLG)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE) +
  ggtitle("Relationship between Batting Average and Slugging Percentage (Majors Only)") +
  xlab("Batting Average") +
  ylab("Slugging Percentage")
``` | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment