Create a gist now

Instantly share code, notes, and snippets.

What would you like to do?
Code for Pokemon post-evolution CP model blog post
########## FUNCTIONS ###################
sign_flip <- function(x){
flip <- rbinom(1,1,0.5)
x <- ifelse(flip == 0, x, -x)
return(x)
}
sim_CP_new <- function(dat, spec, shape1, shape2, reps = 100){
spec_dat <- filter(dat1, species == eval(spec))
b_spec <- rbeta(10000, shape1, shape2)
b_spec <- sapply(b_spec, FUN = sign_flip)
b_spec <- b_spec + spec_dat$spec_mean_pdcp[1]
sim_spec <- replicate(reps,spec_dat$cp * (1 + sample(b_spec, nrow(spec_dat))))
return(sim_spec)
}
prep_sim_data_for_plot <- function(sim_spec, spec_dat){
sim_spec <- data.frame(sim_spec,
cp_new = spec_dat$cp_new)
spec_melt <- reshape2::melt(sim_spec)
spec_melt$model <- rep(c("sim","data"),
times = c(((ncol(sim_spec)-1)*nrow(sim_spec))
,nrow(sim_spec)))
spec_melt$species <- as.character(unique(spec_dat$species))
return(spec_melt)
}
sim_beta_RMSE <- function(dat, melt, draws_from_beta){
melt <- filter(melt, model == "sim")
melt$cp_new <- rep(dat$cp_new, times = draws_from_beta)
melt$id <- rep(1:nrow(dat), times = draws_from_beta)
beta_rmse <- group_by(melt, id) %>%
summarise(means = mean(value),
sd = sd(value),
rmse = sqrt(mean((cp_new - value)^2)),
mae = mean(abs(cp_new - value)),
species = eval(unique(as.character(dat$species))),
cp_new = mean(cp_new)) %>%
data.frame()
return(beta_rmse)
}
sim_CP_new_by_param <- function(dat, spec, param_dist, reps = 100){
shape_sample <- param_dist[sample(nrow(param_dist),1),]
shape1 <- shape_sample[1,1]
shape2 <- shape_sample[1,2]
spec_dat <- filter(dat1, species == eval(spec))
f_spec <- fitdist(abs(spec_dat$cent_spec_mean_pdcp),"beta")
b_spec <- rbeta(10000, shape1, shape2)
b_spec <- sapply(b_spec, FUN = sign_flip)
b_spec <- b_spec + spec_dat$spec_mean_pdcp[1]
# simulate cp_new of known pokes under [reps] different beta draws
sim_spec <- replicate(reps,spec_dat$cp * (1 + sample(b_spec, nrow(spec_dat))),
simplify = TRUE)
return(sim_spec)
}
sim_param_RMSE <- function(dat, melt, param_draws){
melt$cp_new <- rep(dat$cp_new, param_draws)
sim_rmse <- group_by(melt, Var1) %>%
summarise(means = mean(value),
sd = sd(value),
rmse = sqrt(mean((cp_new - value)^2)),
mae = mean(abs(cp_new - value)),
species = eval(unique(as.character(dat$species))),
cp_new = mean(cp_new)) %>%
data.frame()
}
prep_psim_for_plot <- function(psim_spec_dat, spec_dat){
psim_spec <- matrix(psim_spec_dat, nrow = nrow(spec_dat),
ncol = param_draws * beta_draws,
byrow = FALSE)
psim_spec_melt <- melt(psim_spec)
psim_spec_means <- group_by(psim_spec_melt, Var1) %>%
summarise(means = mean(value)) %>%
arrange(means)
psim_spec_melt$ord <- factor(psim_spec_melt$Var1, levels = psim_spec_means$Var1)
spec_dat$id <- seq(1:nrow(spec_dat))
spec_dat$ord <- factor(spec_dat$id, levels = psim_spec_means$Var1)
return(list(psim_spec_melt, spec_dat))
}
pedict_simulated_cp_new <- function(sim_dat, shape1, shape2, reps = 100){
# replicate a bunch of beta values from parametrized distribution
b_spec <- rbeta(10000, shape1, shape2)
# use sign_flip to randomly flip sign (p = 0.5)
b_spec <- sapply(b_spec, FUN = sign_flip)
# add the pos/neg beta (variation) to the mean pdcp for that species
b_spec <- b_spec + sim_dat$spec_mean_pdcp[1]
# replicate evolutions scenerios sampled from b_spec on simulated pokes
sim_spec <- replicate(reps,sim_dat$cp * (1 + sample(b_spec, nrow(sim_dat))))
return(sim_spec)
}
mround <- function(x,base){
base*round(x/base)
}
sim_new_species <- function(spec_ranges, spec_dat){
spec <- eval(unique(spec_dat$specie))
spec_cp_range <- data.frame(species = eval(unique(spec_dat$species)),
cp = mround(seq(spec_ranges[which(spec_ranges$species == spec),"min_cp"],
spec_ranges[which(spec_ranges$species == spec),"max_cp"],
length.out = length_out), base),
spec_mean_pdcp = spec_dat$spec_mean_pdcp[1])
return(spec_cp_range)
}
make_pred_cp_plot_dat <- function(predicted_spec, spec_cp_range){
spec_cp_new_ranges <- group_by(melt(predicted_spec), Var1) %>%
summarise(
cp_new_min = floor(min(value)),
cp_new_5 = floor(quantile(value, 0.05)),
cp_new_mean = floor(mean(value)),
cp_new_95 = ceiling(quantile(value, 0.95)),
cp_new_max = ceiling(max(value))) %>%
mutate(cp = spec_cp_range$cp) %>%
data.frame()
return(spec_cp_new_ranges)
}
########## END FUNCTIONS ######################
########## Require Libraries ##################
library("dplyr") # for data wrangling
library("corrplot") # for correlation plot
library("fitdistrplus") # for distribution GoF
library("ggplot2") # for plotting
library("ggalt") # for added plotting functions
library("broom") # for tidy
library("tidyr") # for gather()
library("tibble") # for display of data
library("xtable") # for output of tables for blog
library("ggthemes") # for color-blind safe plot colors
library("reshape2") # for melt function
########## END Require Libraries ##############
setwd("~/Dropbox/R/pokemon/")
pokedat <- as_tibble(read.csv("pokemon.csv"))
dat1 <- mutate(pokedat, delta_cp = cp_new - cp) %>% # net change in cp
# the % change in cp from old to new
mutate(pcnt_d_cp = delta_cp / cp) %>%
# group by species to calculate additonal varaibles
group_by(species) %>%
# species grouped mean percent change
mutate(spec_mean_pdcp = mean(pcnt_d_cp)) %>%
# species grouped std dev of % changes from old to new cp
mutate(spec_mean_pdcp_sd = sd(pcnt_d_cp)) %>%
# difference in % delta cp (pdcp) change from species group mean
mutate(cent_spec_mean_pdcp = pcnt_d_cp - spec_mean_pdcp) %>%
# z score for pdcp change within species group
mutate(z_spec_mean_pdcp = cent_spec_mean_pdcp / spec_mean_pdcp_sd) %>%
data.frame()
# Z score to answer the more appropriate question:
# what leads to an above average gain in CP per evolution within a given species?
print(xtable(table(dat1$species)), type = "html")
# plot the full 2k by 2k base pairs() plot for fun
# corr plot
c2 <- cor(dat1[,c(3:8,11,14:20,23,26,28:33)])
corrplot(c2, method = "square", order = "hclust",
tl.col = "black", tl.cex = 1.5)
# mean CP & HP Table
dat1 %>%
group_by(species) %>%
summarize(Mean_CP = round(mean(cp),2),
Mean_HP = round(mean(hp),2)) %>%
xtable(.) %>%
print(., type = "html", include.rownames=FALSE)
# R and R^2 table
dat1 %>%
group_by(species) %>%
summarize(r = round(cor(cp, hp),3)) %>%
mutate(r2 = round(r^2,3)) %>%
xtable(.) %>%
print(., type = "html", include.rownames=FALSE)
# plot linear correlation of CP & HP by species
ggplot(data = dat1, aes(x = cp, y = hp, color = species)) +
geom_point() +
geom_smooth(method = "lm", se = FALSE) +
# scale_color_pokemon("infernape") +
scale_color_colorblind() +
theme_bw() +
labs(title="Correlation of Pokémon CP and HP",
caption="Data: https://www.openintro.org/stat/data/?data=pokemon",
x = "CP",
y = "HP") +
theme(
axis.text.x = element_text(size = 10, family = "Trebuchet MS"),
axis.text.y = element_text(size = 10, family = "Trebuchet MS"),
axis.title = element_text(size = 12, family = "Trebuchet MS"),
plot.caption = element_text(size = 7, hjust=0, margin=margin(t=5),
family = "Trebuchet MS"),
plot.title=element_text(family="TrebuchetMS-Bold"),
panel.grid.major = element_blank(),
panel.grid.minor = element_blank()
)
ggsave("CP_HP_corr.png", width = 6, height = 5)
# plot distriubtion of centered PDCP
ggplot(dat1, aes(x = cent_spec_mean_pdcp, fill = species)) +
geom_density() +
geom_vline(xintercept = 0, color = "gray60", linetype = 2) +
# scale_fill_pokemon("infernape") +
scale_fill_colorblind() +
facet_wrap(~species, nrow = 1) +
theme_bw() +
labs(title="Distribution or Percent Change in CP from Species Mean",
caption="Data: https://www.openintro.org/stat/data/?data=pokemon",
x = "Centered PDCP",
y = "Density") +
theme(
strip.background = element_rect(colour = "white", fill = "white"),
strip.text.x = element_text(colour = "black", size = 10, face = "bold",
family = "Trebuchet MS"),
panel.margin = unit(0, "lines"),
panel.border = element_rect(colour = "gray90"),
axis.text.x = element_text(size = 8, family = "Trebuchet MS"),
axis.text.y = element_text(size = 8, family = "Trebuchet MS"),
axis.title = element_text(size = 10, family = "Trebuchet MS"),
plot.caption = element_text(size = 7, hjust=0, margin=margin(t=5),
family = "Trebuchet MS"),
plot.title=element_text(family="TrebuchetMS-Bold"),
legend.position = "none",
panel.grid.major = element_blank(),
panel.grid.minor = element_blank()
)
ggsave("centered_PDCP.png", width = 6, height = 5)
# resdiuals of centered PDCP
ggplot(data = dat1, aes(x = cp, y = cent_spec_mean_pdcp, color = species)) +
geom_hline(yintercept = 0, color = "gray60", linetype = 2) +
geom_point() +
scale_color_colorblind() +
facet_wrap(~species, nrow = 2) +
theme_bw() +
labs(title="Centered Percent Change in CP from Species Mean",
caption="Data: https://www.openintro.org/stat/data/?data=pokemon",
x = "CP",
y = "Density") +
theme(
strip.background = element_rect(colour = "white", fill = "white"),
strip.text.x = element_text(colour = "black", size = 10, face = "bold",
family = "Trebuchet MS"),
panel.margin = unit(0, "lines"),
panel.border = element_rect(colour = "gray90"),
axis.text.x = element_text(size = 8, family = "Trebuchet MS"),
axis.text.y = element_text(size = 8, family = "Trebuchet MS"),
axis.title = element_text(size = 10, family = "Trebuchet MS"),
plot.caption = element_text(size = 7, hjust=0, margin=margin(t=5),
family = "Trebuchet MS"),
plot.title=element_text(family="TrebuchetMS-Bold"),
legend.position = "none",
panel.grid.major = element_blank(),
panel.grid.minor = element_blank()
)
ggsave("centered_PDCP_over_CP.png", width = 6, height = 5)
# trend of mean species PDCP and variation in PDCP
ggplot(data = dat1, aes(x = spec_mean_pdcp, y = spec_mean_pdcp_sd, color = species)) +
geom_line(color = "gray65") +
geom_point(size = 4) +
scale_color_colorblind() +
theme_bw() +
labs(title="Relationship if Mean and Variation in PDCP by Species",
caption="Data: https://www.openintro.org/stat/data/?data=pokemon",
x = "Mean PDCP",
y = "Std Dev of PDCP") +
theme(
axis.text.x = element_text(size = 8, family = "Trebuchet MS"),
axis.text.y = element_text(size = 8, family = "Trebuchet MS"),
axis.title = element_text(size = 10, family = "Trebuchet MS"),
plot.caption = element_text(size = 7, hjust=0, margin=margin(t=5),
family = "Trebuchet MS"),
plot.title=element_text(family="TrebuchetMS-Bold")
)
ggsave("mean_vs_stdDev_PDCP.png", width = 6, height = 5)
# look for universal deviation from mean
quantile(dat1$cent_spec_mean_pdcp) # difference from mean
# distrobution of Centered PDCP for all species
ggplot(dat1, aes(x = cent_spec_mean_pdcp)) +
geom_vline(xintercept = 0, color = "gray60", linetype = 2) +
geom_density(fill = "skyblue") +
scale_x_continuous(breaks = c(seq(-0.45,0.15,0.05))) +
geom_vline(xintercept = 0, color = "gray60", linetype = 2) +
theme_bw() +
labs(title="Distribution of Deviations from Centered PDCP",
caption="Data: https://www.openintro.org/stat/data/?data=pokemon",
x = "Deviation from Centered PDCP",
y = "Density") +
theme(
axis.text.x = element_text(size = 8, family = "Trebuchet MS"),
axis.text.y = element_text(size = 8, family = "Trebuchet MS"),
axis.title = element_text(size = 10, family = "Trebuchet MS"),
plot.caption = element_text(size = 7, hjust=0, margin=margin(t=5),
family = "Trebuchet MS"),
plot.title=element_text(family="TrebuchetMS-Bold")
)
ggsave("dist_dev_from_centered_PDCP.png", width = 6, height = 5)
# distrobution of absolute centered PDCP for all species
ggplot(dat1, aes(x = abs(cent_spec_mean_pdcp))) +
geom_vline(xintercept = 0, color = "gray60", linetype = 2) +
geom_density(fill = "skyblue") +
scale_x_continuous(breaks = c(seq(0,0.5,0.05))) +
geom_vline(xintercept = 0, color = "gray60", linetype = 2) +
theme_bw() +
labs(title="Distribution of Absolute Deviations from Centered PDCP",
caption="Data: https://www.openintro.org/stat/data/?data=pokemon",
x = "Deviation from Centered PDCP",
y = "Density") +
theme(
axis.text.x = element_text(size = 8, family = "Trebuchet MS"),
axis.text.y = element_text(size = 8, family = "Trebuchet MS"),
axis.title = element_text(size = 10, family = "Trebuchet MS"),
plot.caption = element_text(size = 7, hjust=0, margin=margin(t=5),
family = "Trebuchet MS"),
plot.title=element_text(family="TrebuchetMS-Bold")
)
ggsave("ABS_dist_dev_from_centered_PDCP.png", width = 6, height = 5)
### Fit beta parameters to data via MLE
# look at dist. being parametric about this
f3 <- fitdist(abs(dat1$cent_spec_mean_pdcp),"beta")
# plot boostrapped common distributions reatlive to data for comparison
descdist(abs(dat1$cent_spec_mean_pdcp), boot = 1001)
# save via quartz, may not work on Windows
quartz.save(file = "Cullen_Frey_graph.png", height = 5, width = 6,
type = "png", device = dev.cur(), dpi = 300)
# looking at MLE fit of beta to data
summary(f3)
gofstat(f3) # Goodness-of-Fit statistics
plot(f3) # CDF, QQ< PP, Density plot
# sample form beta distribution based on the MLE estimate of [f3]
b <- rbeta(10000,f3$estimate[1], f3$estimate[2])
# plot empirical data versus samples from parameterized beta distribution
ggplot() +
geom_density(data = data.frame(beta_samp = b),
aes(x = beta_samp), fill = "orange") +
geom_density(data = dat1, aes(x = abs(cent_spec_mean_pdcp)), fill = "skyblue3", alpha = 0.55) +
theme_bw() +
scale_x_continuous(breaks = c(seq(0,0.5,0.05))) +
geom_vline(xintercept = 0, color = "gray60", linetype = 2) +
theme_bw() +
labs(title="Comparirson of Empirical and Simulated Beta Distribution",
subtitle="Orange = Beta(0.57, 15.68), Blue = empirical",
caption="Data: https://www.openintro.org/stat/data/?data=pokemon",
x = "Absolute Deviation from Centered PDCP",
y = "Density") +
theme(
axis.text.x = element_text(size = 8, family = "Trebuchet MS"),
axis.text.y = element_text(size = 8, family = "Trebuchet MS"),
axis.title = element_text(size = 10, family = "Trebuchet MS"),
plot.caption = element_text(size = 7, hjust=0, margin=margin(t=5),
family = "Trebuchet MS"),
plot.title=element_text(family="TrebuchetMS-Bold"),
plot.subtitle=element_text(family="TrebuchetMS-Bold"),
panel.grid.major = element_blank(),
panel.grid.minor = element_blank()
)
ggsave("Beta_vs_empirical_dist_centered_PDCP.png", width = 6, height = 5)
###### Simulate plausible evolution paths for each observed Pokémon in sample
### Sequence to sample shape parameters from parametrized beta
### Apply these beta draws to each pokémon species through
### data generating model
draws_from_beta = 100
shape1 = f3$estimate[1]
shape2 = f3$estimate[2]
# Simulate plausible evolution scenarios based on our model
# the melt data for plotting purposes
# sim_pidgey
sim_pidgey <- sim_CP_new(dat1, "Pidgey", shape1, shape2, draws_from_beta)
pidgey_dat <- filter(dat1, species == "Pidgey")
pidgey_melt <- prep_sim_data_for_plot(sim_pidgey, pidgey_dat)
# sim_eevee
sim_eevee <- sim_CP_new(dat1, "Eevee", shape1, shape2, draws_from_beta)
eevee_dat <- filter(dat1, species == "Eevee")
eevee_melt <- prep_sim_data_for_plot(sim_eevee, eevee_dat)
# sim_caterpie
sim_caterpie <- sim_CP_new(dat1, "Caterpie",shape1, shape2, draws_from_beta)
caterpie_dat <- filter(dat1, species == "Caterpie")
caterpie_melt <- prep_sim_data_for_plot(sim_caterpie, caterpie_dat)
# sim_weedle
sim_weedle <- sim_CP_new(dat1, "Weedle",shape1, shape2, draws_from_beta)
weedle_dat <- filter(dat1, species == "Weedle")
weedle_melt <- prep_sim_data_for_plot(sim_weedle, weedle_dat)
# combine all
sim_pokemon <- rbind(pidgey_melt, eevee_melt, caterpie_melt, weedle_melt)
## plot density of empirical post-evolution CP (red line) over
## density of models simulated evolution scenarios (blue lines)
## scenarios based on model and universal beta shape parameter estimates from [f3]
ggplot(data = sim_pokemon, aes(x = value,group = variable, color = model)) +
geom_line(stat="density", size=0.75, alpha=0.25) +
geom_line(data = filter(sim_pokemon, model == "data"),
aes(x = value), stat="density", color = "red", size = 0.25) +
guides(color = guide_legend(override.aes = list(alpha=1))) +
scale_color_manual(values = c("red", "skyblue")) +
facet_grid(species~., scales = "free") +
theme_bw() +
labs(title="Density of Simulated and Empirical Post-Evolution CP per Sepcies",
caption="Data: https://www.openintro.org/stat/data/?data=pokemon",
x = "Post-Evolution CP",
y = "Density") +
theme(
strip.background = element_rect(colour = "white", fill = "white"),
strip.text.x = element_text(colour = "black", size = 10, face = "bold",
family = "Trebuchet MS"),
strip.text.y = element_text(colour = "black", size = 10, face = "bold",
family = "Trebuchet MS"),
panel.margin = unit(0, "lines"),
panel.border = element_rect(colour = "gray90"),
axis.text.x = element_text(size = 8, family = "Trebuchet MS"),
axis.text.y = element_text(size = 8, family = "Trebuchet MS"),
axis.title = element_text(size = 10, family = "Trebuchet MS"),
plot.caption = element_text(size = 7, hjust=0, margin=margin(t=5),
family = "Trebuchet MS"),
plot.title=element_text(family="TrebuchetMS-Bold"),
legend.position = "bottom",
panel.grid.major = element_blank(),
panel.grid.minor = element_blank()
)
ggsave("Species_simulated_CP_New.png", width = 8, height = 8)
# Check errors via RMSE of simulated evolution scenerios vs data
pidgey_sim_beta_rmse <- sim_beta_RMSE(pidgey_dat, pidgey_melt)
eevee_sim_beta_rmse <- sim_beta_RMSE(eevee_dat, eevee_melt)
caterpie_sim_beta_rmse <- sim_beta_RMSE(caterpie_dat, caterpie_melt)
weedle_sim_beta_rmse <- sim_beta_RMSE(weedle_dat, weedle_melt)
# combine together
species_sim_beta_metrics <- rbind(pidgey_sim_beta_rmse, eevee_sim_beta_rmse,
caterpie_sim_beta_rmse, weedle_sim_beta_rmse)
# plot jittered distribution of RMSE for the mean post-evolution CP
# for each observed pokémon by species
ggplot(species_sim_beta_metrics, aes(x = species, y = rmse,
group = species, color = species)) +
geom_jitter(width = 0.3) +
theme_bw() +
scale_color_colorblind() +
labs(title="RMSE Error for Mean Beta Sample Post-evolution CP ",
subtitle = "Mean of n = 100 beta samples for each species observation",
caption="Data: https://www.openintro.org/stat/data/?data=pokemon",
x = "Species",
y = "RMSE") +
theme(
axis.text.x = element_text(size = 8, family = "Trebuchet MS"),
axis.text.y = element_text(size = 8, family = "Trebuchet MS"),
axis.title = element_text(size = 10, family = "Trebuchet MS"),
plot.caption = element_text(size = 7, hjust=0, margin=margin(t=5),
family = "Trebuchet MS"),
plot.title=element_text(family="TrebuchetMS-Bold"),
legend.position = "right"
)
ggsave("Species_beta_CP_New_mean_RMSE.png", width = 6, height = 3)
# table of RMSE results
sim_beta_rmse_table <- group_by(species_sim_beta_metrics, species) %>%
summarise(cp_new_mean = round(mean(cp_new),2),
sim_cp_new_mean = round(mean(means),2),
sd_mean = round(mean(sd),2),
rmse_mean = round(mean(rmse),2),
mae_mean = round(mean(mae),2)) %>%
mutate(pcnt_mae = round(mae_mean / cp_new_mean,3) * 100) %>%
data.frame()
# xtable for blog
print(xtable(sim_beta_rmse_table), type = "html", include.rownames=FALSE)
##### Look at pluasibe variation in beta shape parameters
### instead of one universal MLE estimate, this samples many plausible
### shape paramers via bootstrap, then estiamtes evolution scenerios for
### a range of models that are coherent with the given data
# Boostrap 1001 different pluasible parameters pairs for beta distribution
beta_boot <- bootdist(f3, niter = 1001)
# extract estimates
boot_est <- beta_boot$estim
# get qunatiles of boostrapped parameter estimates and put in table for blog
shp1_boot_est <- round(quantile(boot_est[,1], c(0.025, 0.25, 0.5, 0.75, 0.95)),2)
shp2_boot_est <- round(quantile(boot_est[,2], c(0.025, 0.25, 0.5, 0.75, 0.95)),2)
boot_est_m <- rbind(shp1_boot_est, shp2_boot_est)
print(xtable(boot_est_m), type = "html")
# plot distribution and density of plausible shape parameter pairs
ggplot(beta_boot$estim, aes(x = shape1, y = shape2)) +
geom_point(alpha = 0.95, color = "skyblue3") +
geom_density_2d(color = "gray50", alpha = 0.85) +
theme_bw() +
labs(title="Distribution of Shape Parameters for Beta Distribution",
subtitle="Boostraps, n = 1001",
caption="Data: https://www.openintro.org/stat/data/?data=pokemon",
x = "Shape 1",
y = "Shape 2") +
theme(
axis.text.x = element_text(size = 8, family = "Trebuchet MS"),
axis.text.y = element_text(size = 8, family = "Trebuchet MS"),
axis.title = element_text(size = 10, family = "Trebuchet MS"),
plot.caption = element_text(size = 7, hjust=0, margin=margin(t=5),
family = "Trebuchet MS"),
plot.title=element_text(family="TrebuchetMS-Bold"),
plot.subtitle=element_text(family="TrebuchetMS-Bold")
)
ggsave("Boostrapped_shape_parameters.png", width = 6, height = 5)
# samples of 100 different pidgeys (e.g. PDCPs), under different parameters consistent with the data
# sim_pidgey with params
param_draws = 100
beta_draws = 1
param_dist <- beta_boot$estim
# Simulate beta_draws post-evolution scenerios from param_draws different
# beta distirbution parameterizations (e.g. other plausbile models that incorprate
# uncertainty in limited number ofobservations)
psim_pidgey <- replicate(param_draws, sim_CP_new_by_param(dat1, "Pidgey", param_dist, beta_draws))
psim_eevee <- replicate(param_draws, sim_CP_new_by_param(dat1, "Eevee", param_dist, beta_draws))
psim_weedle <- replicate(param_draws, sim_CP_new_by_param(dat1, "Weedle", param_dist, beta_draws))
psim_caterpie <- replicate(param_draws, sim_CP_new_by_param(dat1, "Caterpie", param_dist, beta_draws))
# prepare psim data for plotting
psim_pidgey_list <- prep_psim_for_plot(psim_pidgey, pidgey_dat)
psim_eevee_list <- prep_psim_for_plot(psim_eevee, eevee_dat)
psim_weedle_list <- prep_psim_for_plot(psim_weedle, weedle_dat)
psim_caterpie_list <- prep_psim_for_plot(psim_caterpie, caterpie_dat)
# Pidgey parameter Sim plot
ggplot() +
geom_point(data = psim_pidgey_list[[1]], aes(x = ord, y = value),
alpha = 0.50, color = "skyblue3") +
geom_point(data = psim_pidgey_list[[2]], aes(x = ord, y = cp_new), color = "red") +
theme_bw() +
labs(title="Range of Simulated Post Evolution CP - Pidgey",
subtitle="Simulated Beta Parameters, n = 100",
caption="Data: https://www.openintro.org/stat/data/?data=pokemon",
x = "Pokémon Ordered by Mean Simulated cp_new",
y = "Post Evolution CP") +
theme(
axis.text.x = element_blank(),
axis.text.y = element_text(size = 8, family = "Trebuchet MS"),
axis.title = element_text(size = 10, family = "Trebuchet MS"),
plot.caption = element_text(size = 7, hjust=0, margin=margin(t=5),
family = "Trebuchet MS"),
plot.title=element_text(family="TrebuchetMS-Bold"),
plot.subtitle=element_text(family="TrebuchetMS-Bold")
)
ggsave("Simulated_shape_parameters_pidgey.png", width = 6, height = 5)
# Eevee parameter Sim plot
ggplot() +
geom_point(data = psim_eevee_list[[1]], aes(x = ord, y = value),
alpha = 0.50, color = "skyblue3") +
geom_point(data = psim_eevee_list[[2]], aes(x = ord, y = cp_new), color = "red") +
theme_bw() +
labs(title="Range of Simulated Post Evolution CP = Eevee",
subtitle="Simulated Beta Parameters, n = 100",
caption="Data: https://www.openintro.org/stat/data/?data=pokemon",
x = "Pokémon Ordered by Mean Simulated cp_new",
y = "Post Evolution CP") +
theme(
axis.text.x = element_blank(),
axis.text.y = element_text(size = 8, family = "Trebuchet MS"),
axis.title = element_text(size = 10, family = "Trebuchet MS"),
plot.caption = element_text(size = 7, hjust=0, margin=margin(t=5),
family = "Trebuchet MS"),
plot.title=element_text(family="TrebuchetMS-Bold"),
plot.subtitle=element_text(family="TrebuchetMS-Bold")
)
ggsave("Simulated_shape_parameters_eevee.png", width = 6, height = 5)
# Weedle parameter Sim plot
ggplot() +
geom_point(data = psim_weedle_list[[1]], aes(x = ord, y = value),
alpha = 0.50, color = "skyblue3") +
geom_point(data = psim_weedle_list[[2]], aes(x = ord, y = cp_new), color = "red") +
theme_bw() +
labs(title="Range of Simulated Post Evolution CP = Weedle",
subtitle="Simulated Beta Parameters, n = 100",
caption="Data: https://www.openintro.org/stat/data/?data=pokemon",
x = "Pokémon Ordered by Mean Simulated cp_new",
y = "Post Evolution CP") +
theme(
axis.text.x = element_blank(),
axis.text.y = element_text(size = 8, family = "Trebuchet MS"),
axis.title = element_text(size = 10, family = "Trebuchet MS"),
plot.caption = element_text(size = 7, hjust=0, margin=margin(t=5),
family = "Trebuchet MS"),
plot.title=element_text(family="TrebuchetMS-Bold"),
plot.subtitle=element_text(family="TrebuchetMS-Bold")
)
ggsave("Simulated_shape_parameters_weedle.png", width = 6, height = 5)
# Caterpie parameter Sim plot
ggplot() +
geom_point(data = psim_caterpie_list[[1]], aes(x = ord, y = value),
alpha = 0.50, color = "skyblue3") +
geom_point(data = psim_caterpie_list[[2]], aes(x = ord, y = cp_new), color = "red") +
theme_bw() +
labs(title="Range of Simulated Post Evolution CP = Caterpie",
subtitle="Simulated Beta Parameters, n = 100",
caption="Data: https://www.openintro.org/stat/data/?data=pokemon",
x = "Pokémon Ordered by Mean Simulated cp_new",
y = "Post Evolution CP") +
theme(
axis.text.x = element_blank(),
axis.text.y = element_text(size = 8, family = "Trebuchet MS"),
axis.title = element_text(size = 10, family = "Trebuchet MS"),
plot.caption = element_text(size = 7, hjust=0, margin=margin(t=5),
family = "Trebuchet MS"),
plot.title=element_text(family="TrebuchetMS-Bold"),
plot.subtitle=element_text(family="TrebuchetMS-Bold")
)
ggsave("Simulated_shape_parameters_caterpie.png", width = 6, height = 5)
# RMSE results from simualted beta parameters
pidgey_sim_rmse <- sim_param_RMSE(pidgey_dat, psim_pidgey_list[[1]])
eevee_sim_rmse <- sim_param_RMSE(eevee_dat, psim_eevee_list[[1]])
caterpie_sim_rmse <- sim_param_RMSE(caterpie_dat, psim_caterpie_list[[1]])
weedle_sim_rmse <- sim_param_RMSE(weedle_dat, psim_weedle_list[[1]])
# combine RMSE
species_sim_metrics <- rbind(pidgey_sim_rmse, eevee_sim_rmse,
caterpie_sim_rmse, weedle_sim_rmse)
# plot RMSE of simulated models parameters
ggplot(species_sim_metrics, aes(x = species, y = rmse,
group = species, color = species)) +
geom_jitter(width = 0.3) +
theme_bw() +
scale_color_colorblind() +
labs(title="RMSE Error for Mean Simulated Post-evolution CP ",
subtitle = "Mean of n = 100 simulations for each species observation",
caption="Data: https://www.openintro.org/stat/data/?data=pokemon",
x = "Species",
y = "RMSE") +
theme(
axis.text.x = element_text(size = 8, family = "Trebuchet MS"),
axis.text.y = element_text(size = 8, family = "Trebuchet MS"),
axis.title = element_text(size = 10, family = "Trebuchet MS"),
plot.caption = element_text(size = 7, hjust=0, margin=margin(t=5),
family = "Trebuchet MS"),
plot.title=element_text(family="TrebuchetMS-Bold"),
legend.position = "right"
)
ggsave("Species_simulated_CP_New_mean_RMSE.png", width = 6, height = 3)
# put metris into a table
sim_rmse_table <- group_by(species_sim_metrics, species) %>%
summarise(cp_new_mean = round(mean(cp_new),2),
sim_cp_new_mean = round(mean(means),2),
sd_mean = round(mean(sd),2),
rmse_mean = round(mean(rmse),2),
mae_mean = round(mean(mae),2)) %>%
mutate(pcnt_mae = round(mae_mean / cp_new_mean,3) * 100) %>%
data.frame()
# print table HTML for blog
print(xtable(sim_rmse_table), type = "html", include.rownames=FALSE)
#### Model parameters in Full Bayes??!!! Are you crazy?!
## estimation using JAGS
library("rjags") # if you dare!!!
# extract abs_cent_spec_mean_pdcp for jags estimation
y <- abs(dat1$cent_spec_mean_pdcp)
# build model text for JAGE
model_string <- "model{
for(i in 1:length(y)) {
y[i] ~ dbeta(alpha, beta) # y distributed as a beta
}
alpha ~ dt(0,1,1)T(0,) # half cauchy prior
beta ~ dt(0,1,1)T(0,) # half cauchy prior
}"
# initialize model with data
model <- jags.model(textConnection(model_string), data = list(y = y), n.chains = 3, n.adapt= 10000)
update(model, 10000); # Burnin for 10000 samples
# sample alpha (shape1) and beta (shape2) from posterior. Can take a while
mcmc_samples <- coda.samples(model, variable.names=c("alpha", "beta"), n.iter=20000)
# plot(mcmc_samples) # plot chain convergence and parameter density
# model parameter distribution stats
summary(mcmc_samples)
# table of stats and print HTML for blog
jags_sum <- summary(mcmc_samples)$quantiles
print(xtable(jags_sum), type = "html")
# plot shape1 and shape 2 parameters
alpha <- mcmc_samples[[1]][,1]
beta <- mcmc_samples[[1]][,2]
# plot correlated parameters with MLE from earlier param estimation and MAP from posterior
ggplot(data = data.frame(alpha = as.numeric(alpha), beta = as.numeric(beta)),
aes(x = alpha, y = beta)) +
geom_point(alpha = 0.15, color = "skyblue3") +
geom_density_2d(color = "gray50", alpha = 0.85) +
geom_point(aes(x = 0.57, y = 15.68), color = "yellow", size = 2) + # MLE
geom_point(aes(x = mean(alpha), y = mean(beta)), color = "red", size = 2) + #MAP
theme_bw() +
labs(title="Posterior of Beta Distribution Shape Parameters",
subtitle="n = 20K samples, red = MAP, yellow = MLE",
caption="Data: https://www.openintro.org/stat/data/?data=pokemon",
x = "Shape 1",
y = "Shape 2") +
theme(
axis.text.x = element_text(size = 8, family = "Trebuchet MS"),
axis.text.y = element_text(size = 8, family = "Trebuchet MS"),
axis.title = element_text(size = 10, family = "Trebuchet MS"),
plot.caption = element_text(size = 7, hjust=0, margin=margin(t=5),
family = "Trebuchet MS"),
plot.title=element_text(family="TrebuchetMS-Bold"),
plot.subtitle=element_text(family="TrebuchetMS-Bold")
)
ggsave("Bayes_simulated_shape_parameters.png", width = 6, height = 5)
# prior, post, plot
post_plot <- data.frame(value = c(alpha, beta),
parameter = rep(c("alpha", "beta"), each = length(alpha)),
model = "posterior")
alpha_half_cauchy <- rt(nrow(post_plot),1,1)
alpha_half_cauchy <- alpha_half_cauchy[alpha_half_cauchy > 0]
beta_half_cauchy <- rt(nrow(post_plot),1,1)
beta_half_cauchy <- beta_half_cauchy[beta_half_cauchy > 0]
prior_plot <- data.frame(value = c(alpha_half_cauchy,beta_half_cauchy),
parameter = rep(c("alpha", "beta"),
times = c(length(alpha_half_cauchy), length(beta_half_cauchy))),
model = "prior"
)
jags_plot_dat <- rbind(post_plot, prior_plot)
ggplot(data = filter(jags_plot_dat, parameter == "alpha"),
aes(x = value, group = model, color = model)) +
geom_density() +
scale_x_continuous(limits = c(0,10), breaks = seq(0,10,0.5)) +
scale_color_manual(values = c("orange", "purple")) +
theme_bw() +
labs(title="Prior and Posterior of Shape1 Parameter of Beta Distribution",
subtitle="Half Cauchy as T distribution (1,1) > 0",
caption="20K Samples from JAGS model",
x = "Shape1 Parameter",
y = "Density") +
theme(
axis.text.x = element_text(size = 8, family = "Trebuchet MS"),
axis.text.y = element_text(size = 8, family = "Trebuchet MS"),
axis.title = element_text(size = 10, family = "Trebuchet MS"),
plot.caption = element_text(size = 7, hjust=0, margin=margin(t=5),
family = "Trebuchet MS"),
plot.title=element_text(family="TrebuchetMS-Bold"),
plot.subtitle=element_text(family="TrebuchetMS-Bold")
)
ggsave("alpha_plot.png", width = 6, height = 5)
ggplot(data = filter(jags_plot_dat, parameter == "beta"),
aes(x = value, group = model, color = model)) +
geom_density() +
scale_x_continuous(limits = c(0,100), breaks = seq(0,100,5)) +
scale_color_manual(values = c("orange", "purple")) +
theme_bw() +
labs(title="Prior and Posterior of Shape2 Parameter of Beta Distribution",
subtitle="Half Cauchy as T distribution (1,1) > 0",
caption="20K Samples from JAGS model",
x = "Shape2 Parameter",
y = "Density") +
theme(
axis.text.x = element_text(size = 8, family = "Trebuchet MS"),
axis.text.y = element_text(size = 8, family = "Trebuchet MS"),
axis.title = element_text(size = 10, family = "Trebuchet MS"),
plot.caption = element_text(size = 7, hjust=0, margin=margin(t=5),
family = "Trebuchet MS"),
plot.title=element_text(family="TrebuchetMS-Bold"),
plot.subtitle=element_text(family="TrebuchetMS-Bold")
)
ggsave("beta_plot.png", width = 6, height = 5)
###### Generalized predictions
base = 10
sim_reps = 10000
length_out = 10
# don't need all of this for here, but good summary for all species
spec_ranges <- group_by(dat1, species) %>%
summarise(min_cp = min(cp),
max_cp = max(cp),
mean_cp = mean(cp),
mean_cp_new = mean(cp_new),
mean_pdcp = mean(pcnt_d_cp),
mean_abs_cent_spec_mean_pdcp = mean(abs(cent_spec_mean_pdcp))) %>%
data.frame()
print(xtable(spec_ranges), type = "html", include.rownames=FALSE)
## Predict siualted Pokémon species based on empirical range
# Pidgey
pidgey_cp_range <- sim_new_species(spec_ranges, pidgey_dat)
predicted_pidgey <- pedict_simulated_cp_new(pidgey_cp_range, f3$estimate[1], f3$estimate[2], reps = sim_reps)
pidgey_cp_new_ranges <- make_pred_cp_plot_dat(predicted_pidgey, pidgey_cp_range)
# Eevee
eevee_cp_range <- sim_new_species(spec_ranges, eevee_dat)
predicted_eevee <- pedict_simulated_cp_new(eevee_cp_range, f3$estimate[1], f3$estimate[2], reps = sim_reps)
eevee_cp_new_ranges <- make_pred_cp_plot_dat(predicted_eevee, eevee_cp_range)
# Weedle
weedle_cp_range <- sim_new_species(spec_ranges, weedle_dat)
predicted_weedle <- pedict_simulated_cp_new(weedle_cp_range, f3$estimate[1], f3$estimate[2], reps = sim_reps)
weedle_cp_new_ranges <- make_pred_cp_plot_dat(predicted_weedle, weedle_cp_range)
# Caterpie
caterpie_cp_range <- sim_new_species(spec_ranges, caterpie_dat)
predicted_caterpie <- pedict_simulated_cp_new(caterpie_cp_range, f3$estimate[1], f3$estimate[2], reps = sim_reps)
caterpie_cp_new_ranges <- make_pred_cp_plot_dat(predicted_caterpie, caterpie_cp_range)
# line plot with error ribbons, ok, but not too flashy
ggplot(data = pidgey_cp_new_ranges, aes(x = cp, y = cp_new_mean)) +
geom_ribbon(aes(ymin = cp_new_min, ymax = cp_new_max),alpha=0.4, fill = "skyblue") +
geom_ribbon(aes(ymin = cp_new_5, ymax = cp_new_95),alpha=0.75, fill = "skyblue3") +
geom_point(data = pidgey_dat, aes(x = cp, y = cp_new), color = "gray30", alpha = 0.60) +
geom_line(color = "firebrick", size = 0.75) +
scale_x_continuous(breaks = seq(min(pidgey_cp_new_ranges$cp),
max(pidgey_cp_new_ranges$cp),by = 20)) +
scale_y_continuous(breaks = seq(min(pidgey_cp_new_ranges$cp_new_min),
max(pidgey_cp_new_ranges$cp_new_max),by = 50)) +
theme_bw() +
labs(title="Range of Model Predictions for CP_new - Pidgey",
subtitle="Mean, 95% prediction interval & min/max prediction",
caption="Data: https://www.openintro.org/stat/data/?data=pokemon",
x = "Pre-Evolution CP",
y = "Modeled Post-Evolution CP") +
theme(
axis.text.x = element_text(size = 8, family = "Trebuchet MS"),
axis.text.y = element_text(size = 8, family = "Trebuchet MS"),
axis.title = element_text(size = 10, family = "Trebuchet MS"),
plot.caption = element_text(size = 7, hjust=0, margin=margin(t=5),
family = "Trebuchet MS"),
plot.title=element_text(family="TrebuchetMS-Bold"),
plot.subtitle=element_text(family="TrebuchetMS-Bold")
)
ggsave("Predicted_cp_new_pidgey.png", width = 6, height = 5)
# eevee plot
ggplot(data = eevee_cp_new_ranges, aes(x = cp, y = cp_new_mean)) +
geom_ribbon(aes(ymin = cp_new_min, ymax = cp_new_max),alpha=0.4, fill = "skyblue") +
geom_ribbon(aes(ymin = cp_new_5, ymax = cp_new_95),alpha=0.75, fill = "skyblue3") +
geom_point(data = eevee_dat, aes(x = cp, y = cp_new), color = "gray30", alpha = 0.50) +
geom_line(color = "firebrick", size = 0.75) +
scale_x_continuous(breaks = seq(min(eevee_cp_new_ranges$cp),
max(eevee_cp_new_ranges$cp),by = 20)) +
scale_y_continuous(breaks = seq(min(eevee_cp_new_ranges$cp_new_min),
max(eevee_cp_new_ranges$cp_new_max),by = 50)) +
theme_bw() +
labs(title="Range of Model Predictions for CP_new - Eevee",
subtitle="Mean, 95% prediction interval & min/max prediction",
caption="Data: https://www.openintro.org/stat/data/?data=pokemon",
x = "Pre-Evolution CP",
y = "Modeled Post-Evolution CP") +
theme(
axis.text.x = element_text(size = 8, family = "Trebuchet MS"),
axis.text.y = element_text(size = 8, family = "Trebuchet MS"),
axis.title = element_text(size = 10, family = "Trebuchet MS"),
plot.caption = element_text(size = 7, hjust=0, margin=margin(t=5),
family = "Trebuchet MS"),
plot.title=element_text(family="TrebuchetMS-Bold"),
plot.subtitle=element_text(family="TrebuchetMS-Bold")
)
ggsave("Predicted_cp_new_eevee.png", width = 6, height = 5)
# weedle plot
ggplot(data = weedle_cp_new_ranges, aes(x = cp, y = cp_new_mean)) +
geom_ribbon(aes(ymin = cp_new_min, ymax = cp_new_max),alpha=0.4, fill = "skyblue") +
geom_ribbon(aes(ymin = cp_new_5, ymax = cp_new_95),alpha=0.75, fill = "skyblue3") +
geom_point(data = weedle_dat, aes(x = cp, y = cp_new), color = "gray30", alpha = 0.50) +
geom_line(color = "firebrick", size = 0.75) +
scale_x_continuous(breaks = seq(min(weedle_cp_new_ranges$cp),
max(weedle_cp_new_ranges$cp),by = 20)) +
scale_y_continuous(breaks = seq(min(weedle_cp_new_ranges$cp_new_min),
max(weedle_cp_new_ranges$cp_new_max),by = 50)) +
theme_bw() +
labs(title="Range of Model Predictions for CP_new - weedle",
subtitle="Mean, 95% prediction interval & min/max prediction",
caption="Data: https://www.openintro.org/stat/data/?data=pokemon",
x = "Pre-Evolution CP",
y = "Modeled Post-Evolution CP") +
theme(
axis.text.x = element_text(size = 8, family = "Trebuchet MS"),
axis.text.y = element_text(size = 8, family = "Trebuchet MS"),
axis.title = element_text(size = 10, family = "Trebuchet MS"),
plot.caption = element_text(size = 7, hjust=0, margin=margin(t=5),
family = "Trebuchet MS"),
plot.title=element_text(family="TrebuchetMS-Bold"),
plot.subtitle=element_text(family="TrebuchetMS-Bold")
)
ggsave("Predicted_cp_new_weedle.png", width = 6, height = 5)
# caterpie plot
ggplot(data = caterpie_cp_new_ranges, aes(x = cp, y = cp_new_mean)) +
geom_ribbon(aes(ymin = cp_new_min, ymax = cp_new_max),alpha=0.4, fill = "skyblue") +
geom_ribbon(aes(ymin = cp_new_5, ymax = cp_new_95),alpha=0.75, fill = "skyblue3") +
geom_point(data = caterpie_dat, aes(x = cp, y = cp_new), color = "gray30", alpha = 0.50) +
geom_line(color = "firebrick", size = 0.75) +
scale_x_continuous(breaks = seq(min(caterpie_cp_new_ranges$cp),
max(caterpie_cp_new_ranges$cp),by = 20)) +
scale_y_continuous(breaks = seq(min(caterpie_cp_new_ranges$cp_new_min),
max(caterpie_cp_new_ranges$cp_new_max),by = 50)) +
theme_bw() +
labs(title="Range of Model Predictions for CP_new - Caterpie",
subtitle="Mean, 95% prediction interval & min/max prediction",
caption="Data: https://www.openintro.org/stat/data/?data=pokemon",
x = "Pre-Evolution CP",
y = "Modeled Post-Evolution CP") +
theme(
axis.text.x = element_text(size = 8, family = "Trebuchet MS"),
axis.text.y = element_text(size = 8, family = "Trebuchet MS"),
axis.title = element_text(size = 10, family = "Trebuchet MS"),
plot.caption = element_text(size = 7, hjust=0, margin=margin(t=5),
family = "Trebuchet MS"),
plot.title=element_text(family="TrebuchetMS-Bold"),
plot.subtitle=element_text(family="TrebuchetMS-Bold")
)
ggsave("Predicted_cp_new_caterpie.png", width = 6, height = 5)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment