mrecos/Pokemon_CP_evolution_model.R

## Pokemon_CP_evolution_model.R
########## FUNCTIONS ###################
sign_flip <- function(x){
  flip <- rbinom(1,1,0.5)
  x <- ifelse(flip == 0, x, -x)
  return(x)
}
sim_CP_new <- function(dat, spec, shape1, shape2, reps = 100){
  spec_dat <- filter(dat1, species == eval(spec))
  b_spec <- rbeta(10000, shape1, shape2)
  b_spec <- sapply(b_spec, FUN = sign_flip)
  b_spec <- b_spec + spec_dat$spec_mean_pdcp[1]
  sim_spec <- replicate(reps,spec_dat$cp * (1 + sample(b_spec, nrow(spec_dat))))
  return(sim_spec)
}
prep_sim_data_for_plot <- function(sim_spec, spec_dat){
  sim_spec <- data.frame(sim_spec,
                         cp_new = spec_dat$cp_new)
  spec_melt <- reshape2::melt(sim_spec)
  spec_melt$model <- rep(c("sim","data"),
                         times = c(((ncol(sim_spec)-1)*nrow(sim_spec))
                                   ,nrow(sim_spec)))
  spec_melt$species <- as.character(unique(spec_dat$species))
  return(spec_melt)
}
sim_beta_RMSE <- function(dat, melt, draws_from_beta){
  melt <- filter(melt, model == "sim")
  melt$cp_new <- rep(dat$cp_new, times = draws_from_beta)
  melt$id <- rep(1:nrow(dat), times = draws_from_beta)
  beta_rmse <- group_by(melt, id) %>%
    summarise(means = mean(value),
              sd = sd(value),
              rmse = sqrt(mean((cp_new - value)^2)),
              mae = mean(abs(cp_new - value)),
              species = eval(unique(as.character(dat$species))),
              cp_new = mean(cp_new)) %>%
    data.frame()
  return(beta_rmse)
}
sim_CP_new_by_param <- function(dat, spec, param_dist, reps = 100){
  shape_sample <- param_dist[sample(nrow(param_dist),1),]
  shape1 <- shape_sample[1,1]
  shape2 <- shape_sample[1,2]
  spec_dat <- filter(dat1, species == eval(spec))
  f_spec <- fitdist(abs(spec_dat$cent_spec_mean_pdcp),"beta")
  b_spec <- rbeta(10000, shape1, shape2)
  b_spec <- sapply(b_spec, FUN = sign_flip)
  b_spec <- b_spec + spec_dat$spec_mean_pdcp[1]
  # simulate cp_new of known pokes under [reps] different beta draws
  sim_spec <- replicate(reps,spec_dat$cp * (1 + sample(b_spec, nrow(spec_dat))),
                        simplify = TRUE)
  return(sim_spec)
}
sim_param_RMSE <- function(dat, melt, param_draws){
  melt$cp_new <- rep(dat$cp_new, param_draws)
  sim_rmse <- group_by(melt, Var1) %>%
    summarise(means = mean(value),
              sd = sd(value),
              rmse = sqrt(mean((cp_new - value)^2)),
              mae = mean(abs(cp_new - value)),
              species = eval(unique(as.character(dat$species))),
              cp_new = mean(cp_new)) %>%
    data.frame()
}
prep_psim_for_plot <- function(psim_spec_dat, spec_dat){
  psim_spec <- matrix(psim_spec_dat, nrow = nrow(spec_dat),
                      ncol = param_draws * beta_draws,
                      byrow  = FALSE)
  psim_spec_melt <- melt(psim_spec)
  psim_spec_means <- group_by(psim_spec_melt, Var1) %>%
    summarise(means = mean(value)) %>%
    arrange(means)
  psim_spec_melt$ord <- factor(psim_spec_melt$Var1, levels = psim_spec_means$Var1)
  spec_dat$id <- seq(1:nrow(spec_dat))
  spec_dat$ord <- factor(spec_dat$id, levels = psim_spec_means$Var1)
  return(list(psim_spec_melt, spec_dat))
}
pedict_simulated_cp_new <- function(sim_dat, shape1, shape2, reps = 100){
  # replicate a bunch of beta values from parametrized distribution
  b_spec <- rbeta(10000, shape1, shape2)
  # use sign_flip to randomly flip sign (p = 0.5)
  b_spec <- sapply(b_spec, FUN = sign_flip)
  # add the pos/neg beta (variation) to the mean pdcp for that species
  b_spec <- b_spec + sim_dat$spec_mean_pdcp[1]
  # replicate evolutions scenerios sampled from b_spec on simulated pokes
  sim_spec <- replicate(reps,sim_dat$cp * (1 + sample(b_spec, nrow(sim_dat))))
  return(sim_spec)
}
mround <- function(x,base){
  base*round(x/base)
}
sim_new_species <- function(spec_ranges, spec_dat){
  spec <- eval(unique(spec_dat$specie))
  spec_cp_range <-  data.frame(species = eval(unique(spec_dat$species)),
                               cp = mround(seq(spec_ranges[which(spec_ranges$species == spec),"min_cp"],
                                               spec_ranges[which(spec_ranges$species == spec),"max_cp"],
                                               length.out = length_out), base),
                               spec_mean_pdcp = spec_dat$spec_mean_pdcp[1])
  return(spec_cp_range)
}
make_pred_cp_plot_dat <- function(predicted_spec, spec_cp_range){
  spec_cp_new_ranges <- group_by(melt(predicted_spec), Var1) %>%
    summarise(
      cp_new_min = floor(min(value)),
      cp_new_5 = floor(quantile(value, 0.05)),
      cp_new_mean = floor(mean(value)),
      cp_new_95 = ceiling(quantile(value, 0.95)),
      cp_new_max = ceiling(max(value))) %>%
    mutate(cp = spec_cp_range$cp) %>%
    data.frame()
  return(spec_cp_new_ranges)
}
########## END FUNCTIONS ######################
########## Require Libraries ##################
library("dplyr") # for data wrangling
library("corrplot") # for correlation plot
library("fitdistrplus") # for distribution GoF
library("ggplot2") # for plotting
library("ggalt") # for added plotting functions
library("broom") # for tidy
library("tidyr") # for gather()
library("tibble") # for display of data
library("xtable") # for output of tables for blog
library("ggthemes") # for color-blind safe plot colors
library("reshape2") # for melt function
########## END Require Libraries ##############

setwd("~/Dropbox/R/pokemon/")
pokedat <- as_tibble(read.csv("pokemon.csv"))

dat1 <- mutate(pokedat, delta_cp = cp_new - cp) %>% # net change in cp
  # the % change in cp from old to new
        mutate(pcnt_d_cp = delta_cp / cp) %>%
  # group by species to calculate additonal varaibles
        group_by(species) %>%
  # species grouped mean percent change
        mutate(spec_mean_pdcp = mean(pcnt_d_cp)) %>%
  # species grouped std dev of % changes from old to new cp
        mutate(spec_mean_pdcp_sd = sd(pcnt_d_cp)) %>%
  # difference in % delta cp (pdcp) change from species group mean
        mutate(cent_spec_mean_pdcp = pcnt_d_cp - spec_mean_pdcp) %>%
  # z score for pdcp change within species group
        mutate(z_spec_mean_pdcp = cent_spec_mean_pdcp / spec_mean_pdcp_sd) %>%
        data.frame()

 # Z score to answer the more appropriate question:
 #   what leads to an above average gain in CP per evolution within a given species?

print(xtable(table(dat1$species)), type = "html")

# plot the full 2k by 2k base pairs() plot for fun

# corr plot
c2 <- cor(dat1[,c(3:8,11,14:20,23,26,28:33)])
corrplot(c2, method = "square", order = "hclust",
         tl.col = "black", tl.cex = 1.5)

# mean CP & HP Table
dat1 %>%
  group_by(species) %>%
  summarize(Mean_CP = round(mean(cp),2),
            Mean_HP = round(mean(hp),2)) %>%
  xtable(.) %>%
  print(., type = "html", include.rownames=FALSE)

# R and R^2 table
dat1 %>%
  group_by(species) %>%
  summarize(r = round(cor(cp, hp),3)) %>%
  mutate(r2 = round(r^2,3))  %>%
  xtable(.) %>%
  print(., type = "html", include.rownames=FALSE)

# plot linear correlation of CP & HP by species
ggplot(data = dat1, aes(x = cp, y = hp, color = species)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE) +
  # scale_color_pokemon("infernape") +
  scale_color_colorblind() +
  theme_bw() +
  labs(title="Correlation of Pokémon CP and HP",
       caption="Data: https://www.openintro.org/stat/data/?data=pokemon",
       x = "CP",
       y = "HP") +
  theme(
    axis.text.x = element_text(size = 10, family = "Trebuchet MS"),
    axis.text.y = element_text(size = 10, family = "Trebuchet MS"),
    axis.title = element_text(size = 12, family = "Trebuchet MS"),
    plot.caption = element_text(size = 7, hjust=0, margin=margin(t=5),
                                family = "Trebuchet MS"),
    plot.title=element_text(family="TrebuchetMS-Bold"),
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank()
    )
ggsave("CP_HP_corr.png", width = 6, height = 5)

# plot distriubtion of centered PDCP
ggplot(dat1, aes(x = cent_spec_mean_pdcp, fill = species)) +
  geom_density() +
  geom_vline(xintercept = 0, color = "gray60", linetype = 2) +
  # scale_fill_pokemon("infernape") +
  scale_fill_colorblind() +
  facet_wrap(~species, nrow = 1) +
  theme_bw() +
  labs(title="Distribution or Percent Change in CP from Species Mean",
       caption="Data: https://www.openintro.org/stat/data/?data=pokemon",
       x = "Centered PDCP",
       y = "Density") +
  theme(
    strip.background = element_rect(colour = "white", fill = "white"),
    strip.text.x = element_text(colour = "black", size = 10, face = "bold",
                                family = "Trebuchet MS"),
    panel.margin = unit(0, "lines"),
    panel.border = element_rect(colour = "gray90"),
    axis.text.x = element_text(size = 8, family = "Trebuchet MS"),
    axis.text.y = element_text(size = 8, family = "Trebuchet MS"),
    axis.title = element_text(size = 10, family = "Trebuchet MS"),
    plot.caption = element_text(size = 7, hjust=0, margin=margin(t=5),
                                family = "Trebuchet MS"),
    plot.title=element_text(family="TrebuchetMS-Bold"),
    legend.position = "none",
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank()
    )
ggsave("centered_PDCP.png", width = 6, height = 5)

# resdiuals of centered PDCP
ggplot(data = dat1, aes(x = cp, y = cent_spec_mean_pdcp, color = species)) +
  geom_hline(yintercept = 0, color = "gray60", linetype = 2) +
  geom_point() +
  scale_color_colorblind() +
  facet_wrap(~species, nrow = 2) +
  theme_bw() +
  labs(title="Centered Percent Change in CP from Species Mean",
       caption="Data: https://www.openintro.org/stat/data/?data=pokemon",
       x = "CP",
       y = "Density") +
  theme(
    strip.background = element_rect(colour = "white", fill = "white"),
    strip.text.x = element_text(colour = "black", size = 10, face = "bold",
                                family = "Trebuchet MS"),
    panel.margin = unit(0, "lines"),
    panel.border = element_rect(colour = "gray90"),
    axis.text.x = element_text(size = 8, family = "Trebuchet MS"),
    axis.text.y = element_text(size = 8, family = "Trebuchet MS"),
    axis.title = element_text(size = 10, family = "Trebuchet MS"),
    plot.caption = element_text(size = 7, hjust=0, margin=margin(t=5),
                                family = "Trebuchet MS"),
    plot.title=element_text(family="TrebuchetMS-Bold"),
    legend.position = "none",
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank()
  )
ggsave("centered_PDCP_over_CP.png", width = 6, height = 5)

# trend of mean species PDCP and variation in PDCP
ggplot(data = dat1, aes(x = spec_mean_pdcp, y = spec_mean_pdcp_sd, color = species)) +
  geom_line(color = "gray65") +
  geom_point(size = 4) +
  scale_color_colorblind() +
  theme_bw() +
  labs(title="Relationship if Mean and Variation in PDCP by Species",
       caption="Data: https://www.openintro.org/stat/data/?data=pokemon",
       x = "Mean PDCP",
       y = "Std Dev of PDCP") +
  theme(
    axis.text.x = element_text(size = 8, family = "Trebuchet MS"),
    axis.text.y = element_text(size = 8, family = "Trebuchet MS"),
    axis.title = element_text(size = 10, family = "Trebuchet MS"),
    plot.caption = element_text(size = 7, hjust=0, margin=margin(t=5),
                                family = "Trebuchet MS"),
    plot.title=element_text(family="TrebuchetMS-Bold")
    )
ggsave("mean_vs_stdDev_PDCP.png", width = 6, height = 5)

# look for universal deviation from mean
quantile(dat1$cent_spec_mean_pdcp) # difference from mean

# distrobution of Centered PDCP for all species
ggplot(dat1, aes(x = cent_spec_mean_pdcp)) +
  geom_vline(xintercept = 0, color = "gray60", linetype = 2) +
  geom_density(fill = "skyblue") +
  scale_x_continuous(breaks = c(seq(-0.45,0.15,0.05))) +
  geom_vline(xintercept = 0, color = "gray60", linetype = 2) +
  theme_bw() +
  labs(title="Distribution of Deviations from Centered PDCP",
       caption="Data: https://www.openintro.org/stat/data/?data=pokemon",
       x = "Deviation from Centered PDCP",
       y = "Density") +
  theme(
    axis.text.x = element_text(size = 8, family = "Trebuchet MS"),
    axis.text.y = element_text(size = 8, family = "Trebuchet MS"),
    axis.title = element_text(size = 10, family = "Trebuchet MS"),
    plot.caption = element_text(size = 7, hjust=0, margin=margin(t=5),
                                family = "Trebuchet MS"),
    plot.title=element_text(family="TrebuchetMS-Bold")
  )
ggsave("dist_dev_from_centered_PDCP.png", width = 6, height = 5)

# distrobution of absolute centered PDCP for all species
ggplot(dat1, aes(x = abs(cent_spec_mean_pdcp))) +
  geom_vline(xintercept = 0, color = "gray60", linetype = 2) +
  geom_density(fill = "skyblue") +
  scale_x_continuous(breaks = c(seq(0,0.5,0.05))) +
  geom_vline(xintercept = 0, color = "gray60", linetype = 2) +
  theme_bw() +
  labs(title="Distribution of Absolute Deviations from Centered PDCP",
       caption="Data: https://www.openintro.org/stat/data/?data=pokemon",
       x = "Deviation from Centered PDCP",
       y = "Density") +
  theme(
    axis.text.x = element_text(size = 8, family = "Trebuchet MS"),
    axis.text.y = element_text(size = 8, family = "Trebuchet MS"),
    axis.title = element_text(size = 10, family = "Trebuchet MS"),
    plot.caption = element_text(size = 7, hjust=0, margin=margin(t=5),
                                family = "Trebuchet MS"),
    plot.title=element_text(family="TrebuchetMS-Bold")
  )
ggsave("ABS_dist_dev_from_centered_PDCP.png", width = 6, height = 5)

### Fit beta parameters to data via MLE
# look at dist.  being parametric about this
f3 <- fitdist(abs(dat1$cent_spec_mean_pdcp),"beta")

# plot boostrapped common distributions reatlive to data for comparison
descdist(abs(dat1$cent_spec_mean_pdcp), boot = 1001)
# save via quartz, may not work on Windows
quartz.save(file = "Cullen_Frey_graph.png", height = 5, width = 6,
type = "png", device = dev.cur(), dpi = 300)

# looking at MLE fit of beta to data
summary(f3)
gofstat(f3) # Goodness-of-Fit statistics
plot(f3) # CDF, QQ< PP, Density plot

# sample form beta distribution based on the MLE estimate of [f3]
b <- rbeta(10000,f3$estimate[1], f3$estimate[2])

# plot empirical data versus samples from parameterized beta distribution
ggplot() +
  geom_density(data = data.frame(beta_samp = b),
               aes(x = beta_samp), fill = "orange") +
  geom_density(data = dat1, aes(x = abs(cent_spec_mean_pdcp)), fill = "skyblue3", alpha = 0.55) +
  theme_bw() +
  scale_x_continuous(breaks = c(seq(0,0.5,0.05))) +
  geom_vline(xintercept = 0, color = "gray60", linetype = 2) +
  theme_bw() +
  labs(title="Comparirson of Empirical and Simulated Beta Distribution",
       subtitle="Orange = Beta(0.57, 15.68), Blue = empirical",
       caption="Data: https://www.openintro.org/stat/data/?data=pokemon",
       x = "Absolute Deviation from Centered PDCP",
       y = "Density") +
  theme(
    axis.text.x = element_text(size = 8, family = "Trebuchet MS"),
    axis.text.y = element_text(size = 8, family = "Trebuchet MS"),
    axis.title = element_text(size = 10, family = "Trebuchet MS"),
    plot.caption = element_text(size = 7, hjust=0, margin=margin(t=5),
                                family = "Trebuchet MS"),
    plot.title=element_text(family="TrebuchetMS-Bold"),
    plot.subtitle=element_text(family="TrebuchetMS-Bold"),
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank()
  )
ggsave("Beta_vs_empirical_dist_centered_PDCP.png", width = 6, height = 5)

###### Simulate plausible evolution paths for each observed Pokémon in sample
### Sequence to sample shape parameters from parametrized beta
### Apply these beta draws to each pokémon species through
### data generating model

draws_from_beta = 100
shape1 = f3$estimate[1]
shape2 = f3$estimate[2]

# Simulate plausible evolution scenarios based on our model
# the melt data for plotting purposes
# sim_pidgey
sim_pidgey <- sim_CP_new(dat1, "Pidgey", shape1, shape2, draws_from_beta)
pidgey_dat <- filter(dat1, species == "Pidgey")
pidgey_melt <- prep_sim_data_for_plot(sim_pidgey, pidgey_dat)
# sim_eevee
sim_eevee <- sim_CP_new(dat1, "Eevee", shape1, shape2, draws_from_beta)
eevee_dat <- filter(dat1, species == "Eevee")
eevee_melt <- prep_sim_data_for_plot(sim_eevee, eevee_dat)
# sim_caterpie
sim_caterpie <- sim_CP_new(dat1, "Caterpie",shape1, shape2,  draws_from_beta)
caterpie_dat <- filter(dat1, species == "Caterpie")
caterpie_melt <- prep_sim_data_for_plot(sim_caterpie, caterpie_dat)
# sim_weedle
sim_weedle <- sim_CP_new(dat1, "Weedle",shape1, shape2,  draws_from_beta)
weedle_dat <- filter(dat1, species == "Weedle")
weedle_melt <- prep_sim_data_for_plot(sim_weedle, weedle_dat)
# combine all
sim_pokemon <- rbind(pidgey_melt, eevee_melt, caterpie_melt, weedle_melt)

## plot density of empirical post-evolution CP (red line) over
## density of models simulated evolution scenarios (blue lines)
## scenarios based on model and universal beta shape parameter estimates from [f3]
ggplot(data = sim_pokemon, aes(x = value,group = variable, color = model)) +
    geom_line(stat="density", size=0.75, alpha=0.25) +
    geom_line(data = filter(sim_pokemon, model == "data"),
            aes(x = value), stat="density", color = "red", size = 0.25) +
  guides(color = guide_legend(override.aes = list(alpha=1))) +
  scale_color_manual(values = c("red", "skyblue")) +
  facet_grid(species~., scales = "free") +
  theme_bw() +
  labs(title="Density of Simulated and Empirical Post-Evolution CP per Sepcies",
       caption="Data: https://www.openintro.org/stat/data/?data=pokemon",
       x = "Post-Evolution CP",
       y = "Density") +
  theme(
    strip.background = element_rect(colour = "white", fill = "white"),
    strip.text.x = element_text(colour = "black", size = 10, face = "bold",
                                family = "Trebuchet MS"),
    strip.text.y = element_text(colour = "black", size = 10, face = "bold",
                                family = "Trebuchet MS"),
    panel.margin = unit(0, "lines"),
    panel.border = element_rect(colour = "gray90"),
    axis.text.x = element_text(size = 8, family = "Trebuchet MS"),
    axis.text.y = element_text(size = 8, family = "Trebuchet MS"),
    axis.title = element_text(size = 10, family = "Trebuchet MS"),
    plot.caption = element_text(size = 7, hjust=0, margin=margin(t=5),
                                family = "Trebuchet MS"),
    plot.title=element_text(family="TrebuchetMS-Bold"),
    legend.position = "bottom",
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank()
  )
ggsave("Species_simulated_CP_New.png", width = 8, height = 8)

# Check errors via RMSE of simulated evolution scenerios vs data
pidgey_sim_beta_rmse <- sim_beta_RMSE(pidgey_dat, pidgey_melt)
eevee_sim_beta_rmse <- sim_beta_RMSE(eevee_dat, eevee_melt)
caterpie_sim_beta_rmse <- sim_beta_RMSE(caterpie_dat, caterpie_melt)
weedle_sim_beta_rmse <- sim_beta_RMSE(weedle_dat, weedle_melt)
# combine together
species_sim_beta_metrics <- rbind(pidgey_sim_beta_rmse, eevee_sim_beta_rmse,
                             caterpie_sim_beta_rmse, weedle_sim_beta_rmse)

# plot jittered distribution of RMSE for the mean post-evolution CP
# for each observed pokémon by species
ggplot(species_sim_beta_metrics, aes(x = species, y = rmse,
                                group = species, color = species)) +
  geom_jitter(width = 0.3) +
  theme_bw() +
  scale_color_colorblind() +
  labs(title="RMSE Error for Mean Beta Sample Post-evolution CP ",
       subtitle = "Mean of n = 100 beta samples for each species observation",
       caption="Data: https://www.openintro.org/stat/data/?data=pokemon",
       x = "Species",
       y = "RMSE") +
  theme(
    axis.text.x = element_text(size = 8, family = "Trebuchet MS"),
    axis.text.y = element_text(size = 8, family = "Trebuchet MS"),
    axis.title = element_text(size = 10, family = "Trebuchet MS"),
    plot.caption = element_text(size = 7, hjust=0, margin=margin(t=5),
                                family = "Trebuchet MS"),
    plot.title=element_text(family="TrebuchetMS-Bold"),
    legend.position = "right"
  )
ggsave("Species_beta_CP_New_mean_RMSE.png", width = 6, height = 3)

# table of RMSE results
sim_beta_rmse_table <- group_by(species_sim_beta_metrics, species) %>%
  summarise(cp_new_mean = round(mean(cp_new),2),
            sim_cp_new_mean = round(mean(means),2),
            sd_mean = round(mean(sd),2),
            rmse_mean = round(mean(rmse),2),
            mae_mean = round(mean(mae),2)) %>%
  mutate(pcnt_mae = round(mae_mean / cp_new_mean,3) * 100) %>%
  data.frame()
# xtable for blog
print(xtable(sim_beta_rmse_table), type = "html", include.rownames=FALSE)

##### Look at pluasibe variation in beta shape parameters
### instead of one universal MLE estimate, this samples many plausible
### shape paramers via bootstrap, then estiamtes evolution scenerios for
### a range of models that are coherent with the given data

# Boostrap 1001 different pluasible parameters pairs for beta distribution
beta_boot <- bootdist(f3, niter = 1001)
# extract estimates
boot_est <- beta_boot$estim
# get qunatiles of boostrapped parameter estimates and put in table for blog
shp1_boot_est <- round(quantile(boot_est[,1], c(0.025, 0.25, 0.5, 0.75, 0.95)),2)
shp2_boot_est <- round(quantile(boot_est[,2], c(0.025, 0.25, 0.5, 0.75, 0.95)),2)
boot_est_m <- rbind(shp1_boot_est, shp2_boot_est)
print(xtable(boot_est_m), type = "html")

# plot distribution and density of plausible shape parameter pairs
ggplot(beta_boot$estim, aes(x = shape1, y = shape2)) +
  geom_point(alpha = 0.95, color = "skyblue3") +
  geom_density_2d(color = "gray50", alpha = 0.85) +
  theme_bw() +
  labs(title="Distribution of Shape Parameters for Beta Distribution",
       subtitle="Boostraps, n = 1001",
       caption="Data: https://www.openintro.org/stat/data/?data=pokemon",
       x = "Shape 1",
       y = "Shape 2") +
  theme(
    axis.text.x = element_text(size = 8, family = "Trebuchet MS"),
    axis.text.y = element_text(size = 8, family = "Trebuchet MS"),
    axis.title = element_text(size = 10, family = "Trebuchet MS"),
    plot.caption = element_text(size = 7, hjust=0, margin=margin(t=5),
                                family = "Trebuchet MS"),
    plot.title=element_text(family="TrebuchetMS-Bold"),
    plot.subtitle=element_text(family="TrebuchetMS-Bold")
  )
ggsave("Boostrapped_shape_parameters.png", width = 6, height = 5)

# samples of 100 different pidgeys (e.g. PDCPs), under different parameters consistent with the data
# sim_pidgey with params
param_draws = 100
beta_draws = 1
param_dist <- beta_boot$estim

# Simulate beta_draws post-evolution scenerios from param_draws different
# beta distirbution parameterizations (e.g. other plausbile models that incorprate
# uncertainty in limited number ofobservations)
psim_pidgey <- replicate(param_draws, sim_CP_new_by_param(dat1, "Pidgey", param_dist, beta_draws))
psim_eevee <- replicate(param_draws, sim_CP_new_by_param(dat1, "Eevee", param_dist, beta_draws))
psim_weedle <- replicate(param_draws, sim_CP_new_by_param(dat1, "Weedle", param_dist, beta_draws))
psim_caterpie <- replicate(param_draws, sim_CP_new_by_param(dat1, "Caterpie", param_dist, beta_draws))

# prepare psim data for plotting
psim_pidgey_list <- prep_psim_for_plot(psim_pidgey, pidgey_dat)
psim_eevee_list <- prep_psim_for_plot(psim_eevee, eevee_dat)
psim_weedle_list <- prep_psim_for_plot(psim_weedle, weedle_dat)
psim_caterpie_list <- prep_psim_for_plot(psim_caterpie, caterpie_dat)

# Pidgey parameter Sim plot
ggplot() +
  geom_point(data = psim_pidgey_list[[1]], aes(x = ord, y = value),
             alpha = 0.50, color = "skyblue3") +
  geom_point(data = psim_pidgey_list[[2]], aes(x = ord, y = cp_new), color = "red") +
  theme_bw() +
  labs(title="Range of Simulated Post Evolution CP - Pidgey",
       subtitle="Simulated Beta Parameters, n = 100",
       caption="Data: https://www.openintro.org/stat/data/?data=pokemon",
       x = "Pokémon Ordered by Mean Simulated cp_new",
       y = "Post Evolution CP") +
  theme(
    axis.text.x = element_blank(),
    axis.text.y = element_text(size = 8, family = "Trebuchet MS"),
    axis.title = element_text(size = 10, family = "Trebuchet MS"),
    plot.caption = element_text(size = 7, hjust=0, margin=margin(t=5),
                                family = "Trebuchet MS"),
    plot.title=element_text(family="TrebuchetMS-Bold"),
    plot.subtitle=element_text(family="TrebuchetMS-Bold")
  )
ggsave("Simulated_shape_parameters_pidgey.png", width = 6, height = 5)

# Eevee parameter Sim plot
ggplot() +
  geom_point(data = psim_eevee_list[[1]], aes(x = ord, y = value),
             alpha = 0.50, color = "skyblue3") +
  geom_point(data = psim_eevee_list[[2]], aes(x = ord, y = cp_new), color = "red") +
  theme_bw() +
  labs(title="Range of Simulated Post Evolution CP = Eevee",
       subtitle="Simulated Beta Parameters, n = 100",
       caption="Data: https://www.openintro.org/stat/data/?data=pokemon",
       x = "Pokémon Ordered by Mean Simulated cp_new",
       y = "Post Evolution CP") +
  theme(
    axis.text.x = element_blank(),
    axis.text.y = element_text(size = 8, family = "Trebuchet MS"),
    axis.title = element_text(size = 10, family = "Trebuchet MS"),
    plot.caption = element_text(size = 7, hjust=0, margin=margin(t=5),
                                family = "Trebuchet MS"),
    plot.title=element_text(family="TrebuchetMS-Bold"),
    plot.subtitle=element_text(family="TrebuchetMS-Bold")
  )
ggsave("Simulated_shape_parameters_eevee.png", width = 6, height = 5)

# Weedle parameter Sim plot
ggplot() +
  geom_point(data = psim_weedle_list[[1]], aes(x = ord, y = value),
             alpha = 0.50, color = "skyblue3") +
  geom_point(data = psim_weedle_list[[2]], aes(x = ord, y = cp_new), color = "red") +
  theme_bw() +
  labs(title="Range of Simulated Post Evolution CP = Weedle",
       subtitle="Simulated Beta Parameters, n = 100",
       caption="Data: https://www.openintro.org/stat/data/?data=pokemon",
       x = "Pokémon Ordered by Mean Simulated cp_new",
       y = "Post Evolution CP") +
  theme(
    axis.text.x = element_blank(),
    axis.text.y = element_text(size = 8, family = "Trebuchet MS"),
    axis.title = element_text(size = 10, family = "Trebuchet MS"),
    plot.caption = element_text(size = 7, hjust=0, margin=margin(t=5),
                                family = "Trebuchet MS"),
    plot.title=element_text(family="TrebuchetMS-Bold"),
    plot.subtitle=element_text(family="TrebuchetMS-Bold")
  )
ggsave("Simulated_shape_parameters_weedle.png", width = 6, height = 5)

# Caterpie parameter Sim plot
ggplot() +
  geom_point(data = psim_caterpie_list[[1]], aes(x = ord, y = value),
             alpha = 0.50, color = "skyblue3") +
  geom_point(data = psim_caterpie_list[[2]], aes(x = ord, y = cp_new), color = "red") +
  theme_bw() +
  labs(title="Range of Simulated Post Evolution CP = Caterpie",
       subtitle="Simulated Beta Parameters, n = 100",
       caption="Data: https://www.openintro.org/stat/data/?data=pokemon",
       x = "Pokémon Ordered by Mean Simulated cp_new",
       y = "Post Evolution CP") +
  theme(
    axis.text.x = element_blank(),
    axis.text.y = element_text(size = 8, family = "Trebuchet MS"),
    axis.title = element_text(size = 10, family = "Trebuchet MS"),
    plot.caption = element_text(size = 7, hjust=0, margin=margin(t=5),
                                family = "Trebuchet MS"),
    plot.title=element_text(family="TrebuchetMS-Bold"),
    plot.subtitle=element_text(family="TrebuchetMS-Bold")
  )
ggsave("Simulated_shape_parameters_caterpie.png", width = 6, height = 5)

# RMSE results from simualted beta parameters
pidgey_sim_rmse <- sim_param_RMSE(pidgey_dat, psim_pidgey_list[[1]])
eevee_sim_rmse <- sim_param_RMSE(eevee_dat, psim_eevee_list[[1]])
caterpie_sim_rmse <- sim_param_RMSE(caterpie_dat, psim_caterpie_list[[1]])
weedle_sim_rmse <- sim_param_RMSE(weedle_dat, psim_weedle_list[[1]])
# combine RMSE
species_sim_metrics <- rbind(pidgey_sim_rmse, eevee_sim_rmse,
                             caterpie_sim_rmse, weedle_sim_rmse)
# plot RMSE of simulated models parameters
ggplot(species_sim_metrics, aes(x = species, y = rmse,
                                group = species, color = species)) +
  geom_jitter(width = 0.3) +
  theme_bw() +
  scale_color_colorblind() +
  labs(title="RMSE Error for Mean Simulated Post-evolution CP ",
       subtitle = "Mean of n = 100 simulations for each species observation",
       caption="Data: https://www.openintro.org/stat/data/?data=pokemon",
       x = "Species",
       y = "RMSE") +
  theme(
    axis.text.x = element_text(size = 8, family = "Trebuchet MS"),
    axis.text.y = element_text(size = 8, family = "Trebuchet MS"),
    axis.title = element_text(size = 10, family = "Trebuchet MS"),
    plot.caption = element_text(size = 7, hjust=0, margin=margin(t=5),
                                family = "Trebuchet MS"),
    plot.title=element_text(family="TrebuchetMS-Bold"),
    legend.position = "right"
  )
ggsave("Species_simulated_CP_New_mean_RMSE.png", width = 6, height = 3)

# put metris into a table
sim_rmse_table <- group_by(species_sim_metrics, species) %>%
  summarise(cp_new_mean = round(mean(cp_new),2),
            sim_cp_new_mean = round(mean(means),2),
            sd_mean = round(mean(sd),2),
            rmse_mean = round(mean(rmse),2),
            mae_mean = round(mean(mae),2)) %>%
  mutate(pcnt_mae = round(mae_mean / cp_new_mean,3) * 100) %>%
  data.frame()
# print table HTML for blog
print(xtable(sim_rmse_table), type = "html", include.rownames=FALSE)

#### Model parameters in Full Bayes??!!! Are you crazy?!
## estimation using JAGS
library("rjags") # if you dare!!!

# extract abs_cent_spec_mean_pdcp for jags estimation
y <- abs(dat1$cent_spec_mean_pdcp)
# build model text for JAGE
model_string <- "model{
for(i in 1:length(y)) {
    y[i] ~ dbeta(alpha, beta) # y distributed as a beta
  }
  alpha ~ dt(0,1,1)T(0,) # half cauchy prior
  beta  ~ dt(0,1,1)T(0,) # half cauchy prior
}"
# initialize model with data
model <- jags.model(textConnection(model_string), data = list(y = y), n.chains = 3, n.adapt= 10000)
update(model, 10000); # Burnin for 10000 samples
# sample alpha (shape1) and beta (shape2) from posterior. Can take a while
mcmc_samples <- coda.samples(model, variable.names=c("alpha", "beta"), n.iter=20000)
# plot(mcmc_samples) # plot chain convergence and parameter density
# model parameter distribution stats
summary(mcmc_samples)

# table of stats and print HTML for blog
jags_sum <- summary(mcmc_samples)$quantiles
print(xtable(jags_sum), type = "html")

# plot shape1 and shape 2 parameters
alpha <- mcmc_samples[[1]][,1]
beta <- mcmc_samples[[1]][,2]

# plot correlated parameters with MLE from earlier param estimation and MAP from posterior
ggplot(data = data.frame(alpha = as.numeric(alpha), beta = as.numeric(beta)),
       aes(x = alpha, y = beta)) +
  geom_point(alpha = 0.15, color = "skyblue3") +
  geom_density_2d(color = "gray50", alpha = 0.85) +
  geom_point(aes(x = 0.57, y = 15.68), color = "yellow", size = 2) + # MLE
  geom_point(aes(x = mean(alpha), y = mean(beta)), color = "red", size = 2) + #MAP
  theme_bw() +
  labs(title="Posterior of Beta Distribution Shape Parameters",
       subtitle="n = 20K samples, red = MAP, yellow = MLE",
       caption="Data: https://www.openintro.org/stat/data/?data=pokemon",
       x = "Shape 1",
       y = "Shape 2") +
  theme(
    axis.text.x = element_text(size = 8, family = "Trebuchet MS"),
    axis.text.y = element_text(size = 8, family = "Trebuchet MS"),
    axis.title = element_text(size = 10, family = "Trebuchet MS"),
    plot.caption = element_text(size = 7, hjust=0, margin=margin(t=5),
                                family = "Trebuchet MS"),
    plot.title=element_text(family="TrebuchetMS-Bold"),
    plot.subtitle=element_text(family="TrebuchetMS-Bold")
  )
ggsave("Bayes_simulated_shape_parameters.png", width = 6, height = 5)

# prior, post, plot
post_plot <- data.frame(value = c(alpha, beta),
              parameter = rep(c("alpha", "beta"), each = length(alpha)),
              model = "posterior")
alpha_half_cauchy <- rt(nrow(post_plot),1,1)
alpha_half_cauchy <- alpha_half_cauchy[alpha_half_cauchy > 0]
beta_half_cauchy <- rt(nrow(post_plot),1,1)
beta_half_cauchy <- beta_half_cauchy[beta_half_cauchy > 0]
prior_plot <- data.frame(value = c(alpha_half_cauchy,beta_half_cauchy),
              parameter = rep(c("alpha", "beta"),
                    times = c(length(alpha_half_cauchy), length(beta_half_cauchy))),
              model = "prior"
  )
jags_plot_dat <- rbind(post_plot, prior_plot)

ggplot(data = filter(jags_plot_dat, parameter == "alpha"),
                    aes(x = value, group = model, color = model)) +
  geom_density() +
  scale_x_continuous(limits = c(0,10), breaks = seq(0,10,0.5)) +
  scale_color_manual(values = c("orange", "purple")) +
  theme_bw() +
  labs(title="Prior and Posterior of Shape1 Parameter of Beta Distribution",
       subtitle="Half Cauchy as T distribution (1,1) > 0",
       caption="20K Samples from JAGS model",
       x = "Shape1 Parameter",
       y = "Density") +
  theme(
    axis.text.x = element_text(size = 8, family = "Trebuchet MS"),
    axis.text.y = element_text(size = 8, family = "Trebuchet MS"),
    axis.title = element_text(size = 10, family = "Trebuchet MS"),
    plot.caption = element_text(size = 7, hjust=0, margin=margin(t=5),
                                family = "Trebuchet MS"),
    plot.title=element_text(family="TrebuchetMS-Bold"),
    plot.subtitle=element_text(family="TrebuchetMS-Bold")
  )
ggsave("alpha_plot.png", width = 6, height = 5)

ggplot(data = filter(jags_plot_dat, parameter == "beta"),
       aes(x = value, group = model, color = model)) +
  geom_density() +
  scale_x_continuous(limits = c(0,100), breaks = seq(0,100,5)) +
  scale_color_manual(values = c("orange", "purple")) +
  theme_bw() +
  labs(title="Prior and Posterior of Shape2 Parameter of Beta Distribution",
       subtitle="Half Cauchy as T distribution (1,1) > 0",
       caption="20K Samples from JAGS model",
       x = "Shape2 Parameter",
       y = "Density") +
  theme(
    axis.text.x = element_text(size = 8, family = "Trebuchet MS"),
    axis.text.y = element_text(size = 8, family = "Trebuchet MS"),
    axis.title = element_text(size = 10, family = "Trebuchet MS"),
    plot.caption = element_text(size = 7, hjust=0, margin=margin(t=5),
                                family = "Trebuchet MS"),
    plot.title=element_text(family="TrebuchetMS-Bold"),
    plot.subtitle=element_text(family="TrebuchetMS-Bold")
  )
ggsave("beta_plot.png", width = 6, height = 5)


###### Generalized predictions


base = 10
sim_reps = 10000
length_out = 10

# don't need all of this for here, but good summary for all species
spec_ranges <- group_by(dat1, species) %>%
  summarise(min_cp = min(cp),
            max_cp = max(cp),
            mean_cp = mean(cp),
            mean_cp_new = mean(cp_new),
            mean_pdcp = mean(pcnt_d_cp),
            mean_abs_cent_spec_mean_pdcp = mean(abs(cent_spec_mean_pdcp))) %>%
  data.frame()
print(xtable(spec_ranges), type = "html", include.rownames=FALSE)
## Predict siualted Pokémon species based on empirical range
# Pidgey
pidgey_cp_range <- sim_new_species(spec_ranges, pidgey_dat)
predicted_pidgey <-  pedict_simulated_cp_new(pidgey_cp_range, f3$estimate[1], f3$estimate[2], reps = sim_reps)
pidgey_cp_new_ranges <- make_pred_cp_plot_dat(predicted_pidgey, pidgey_cp_range)
# Eevee
eevee_cp_range <- sim_new_species(spec_ranges, eevee_dat)
predicted_eevee <-  pedict_simulated_cp_new(eevee_cp_range, f3$estimate[1], f3$estimate[2], reps = sim_reps)
eevee_cp_new_ranges <- make_pred_cp_plot_dat(predicted_eevee, eevee_cp_range)
# Weedle
weedle_cp_range <- sim_new_species(spec_ranges, weedle_dat)
predicted_weedle <-  pedict_simulated_cp_new(weedle_cp_range, f3$estimate[1], f3$estimate[2], reps = sim_reps)
weedle_cp_new_ranges <- make_pred_cp_plot_dat(predicted_weedle, weedle_cp_range)
# Caterpie
caterpie_cp_range <- sim_new_species(spec_ranges, caterpie_dat)
predicted_caterpie <-  pedict_simulated_cp_new(caterpie_cp_range, f3$estimate[1], f3$estimate[2], reps = sim_reps)
caterpie_cp_new_ranges <- make_pred_cp_plot_dat(predicted_caterpie, caterpie_cp_range)


# line plot with error ribbons, ok, but not too flashy
ggplot(data = pidgey_cp_new_ranges, aes(x = cp, y = cp_new_mean)) +
  geom_ribbon(aes(ymin = cp_new_min, ymax = cp_new_max),alpha=0.4, fill = "skyblue") +
  geom_ribbon(aes(ymin = cp_new_5, ymax = cp_new_95),alpha=0.75, fill = "skyblue3") +
  geom_point(data = pidgey_dat, aes(x = cp, y = cp_new), color = "gray30", alpha = 0.60) +
  geom_line(color = "firebrick", size = 0.75) +
  scale_x_continuous(breaks = seq(min(pidgey_cp_new_ranges$cp),
                                  max(pidgey_cp_new_ranges$cp),by = 20)) +
  scale_y_continuous(breaks = seq(min(pidgey_cp_new_ranges$cp_new_min),
                                  max(pidgey_cp_new_ranges$cp_new_max),by = 50)) +
  theme_bw() +
  labs(title="Range of Model Predictions for CP_new - Pidgey",
       subtitle="Mean, 95% prediction interval & min/max prediction",
       caption="Data: https://www.openintro.org/stat/data/?data=pokemon",
       x = "Pre-Evolution CP",
       y = "Modeled Post-Evolution CP") +
  theme(
    axis.text.x = element_text(size = 8, family = "Trebuchet MS"),
    axis.text.y = element_text(size = 8, family = "Trebuchet MS"),
    axis.title = element_text(size = 10, family = "Trebuchet MS"),
    plot.caption = element_text(size = 7, hjust=0, margin=margin(t=5),
                                family = "Trebuchet MS"),
    plot.title=element_text(family="TrebuchetMS-Bold"),
    plot.subtitle=element_text(family="TrebuchetMS-Bold")
  )
ggsave("Predicted_cp_new_pidgey.png", width = 6, height = 5)

# eevee plot
ggplot(data = eevee_cp_new_ranges, aes(x = cp, y = cp_new_mean)) +
  geom_ribbon(aes(ymin = cp_new_min, ymax = cp_new_max),alpha=0.4, fill = "skyblue") +
  geom_ribbon(aes(ymin = cp_new_5, ymax = cp_new_95),alpha=0.75, fill = "skyblue3") +
  geom_point(data = eevee_dat, aes(x = cp, y = cp_new), color = "gray30", alpha = 0.50) +
  geom_line(color = "firebrick", size = 0.75) +
  scale_x_continuous(breaks = seq(min(eevee_cp_new_ranges$cp),
                                  max(eevee_cp_new_ranges$cp),by = 20)) +
  scale_y_continuous(breaks = seq(min(eevee_cp_new_ranges$cp_new_min),
                                  max(eevee_cp_new_ranges$cp_new_max),by = 50)) +
  theme_bw() +
  labs(title="Range of Model Predictions for CP_new - Eevee",
       subtitle="Mean, 95% prediction interval & min/max prediction",
       caption="Data: https://www.openintro.org/stat/data/?data=pokemon",
       x = "Pre-Evolution CP",
       y = "Modeled Post-Evolution CP") +
  theme(
    axis.text.x = element_text(size = 8, family = "Trebuchet MS"),
    axis.text.y = element_text(size = 8, family = "Trebuchet MS"),
    axis.title = element_text(size = 10, family = "Trebuchet MS"),
    plot.caption = element_text(size = 7, hjust=0, margin=margin(t=5),
                                family = "Trebuchet MS"),
    plot.title=element_text(family="TrebuchetMS-Bold"),
    plot.subtitle=element_text(family="TrebuchetMS-Bold")
  )
ggsave("Predicted_cp_new_eevee.png", width = 6, height = 5)

# weedle plot
ggplot(data = weedle_cp_new_ranges, aes(x = cp, y = cp_new_mean)) +
  geom_ribbon(aes(ymin = cp_new_min, ymax = cp_new_max),alpha=0.4, fill = "skyblue") +
  geom_ribbon(aes(ymin = cp_new_5, ymax = cp_new_95),alpha=0.75, fill = "skyblue3") +
  geom_point(data = weedle_dat, aes(x = cp, y = cp_new), color = "gray30", alpha = 0.50) +
  geom_line(color = "firebrick", size = 0.75) +
  scale_x_continuous(breaks = seq(min(weedle_cp_new_ranges$cp),
                                  max(weedle_cp_new_ranges$cp),by = 20)) +
  scale_y_continuous(breaks = seq(min(weedle_cp_new_ranges$cp_new_min),
                                  max(weedle_cp_new_ranges$cp_new_max),by = 50)) +
  theme_bw() +
  labs(title="Range of Model Predictions for CP_new - weedle",
       subtitle="Mean, 95% prediction interval & min/max prediction",
       caption="Data: https://www.openintro.org/stat/data/?data=pokemon",
       x = "Pre-Evolution CP",
       y = "Modeled Post-Evolution CP") +
  theme(
    axis.text.x = element_text(size = 8, family = "Trebuchet MS"),
    axis.text.y = element_text(size = 8, family = "Trebuchet MS"),
    axis.title = element_text(size = 10, family = "Trebuchet MS"),
    plot.caption = element_text(size = 7, hjust=0, margin=margin(t=5),
                                family = "Trebuchet MS"),
    plot.title=element_text(family="TrebuchetMS-Bold"),
    plot.subtitle=element_text(family="TrebuchetMS-Bold")
  )
ggsave("Predicted_cp_new_weedle.png", width = 6, height = 5)

# caterpie plot
ggplot(data = caterpie_cp_new_ranges, aes(x = cp, y = cp_new_mean)) +
  geom_ribbon(aes(ymin = cp_new_min, ymax = cp_new_max),alpha=0.4, fill = "skyblue") +
  geom_ribbon(aes(ymin = cp_new_5, ymax = cp_new_95),alpha=0.75, fill = "skyblue3") +
  geom_point(data = caterpie_dat, aes(x = cp, y = cp_new), color = "gray30", alpha = 0.50) +
  geom_line(color = "firebrick", size = 0.75) +
  scale_x_continuous(breaks = seq(min(caterpie_cp_new_ranges$cp),
                                  max(caterpie_cp_new_ranges$cp),by = 20)) +
  scale_y_continuous(breaks = seq(min(caterpie_cp_new_ranges$cp_new_min),
                                  max(caterpie_cp_new_ranges$cp_new_max),by = 50)) +
  theme_bw() +
  labs(title="Range of Model Predictions for CP_new - Caterpie",
       subtitle="Mean, 95% prediction interval & min/max prediction",
       caption="Data: https://www.openintro.org/stat/data/?data=pokemon",
       x = "Pre-Evolution CP",
       y = "Modeled Post-Evolution CP") +
  theme(
    axis.text.x = element_text(size = 8, family = "Trebuchet MS"),
    axis.text.y = element_text(size = 8, family = "Trebuchet MS"),
    axis.title = element_text(size = 10, family = "Trebuchet MS"),
    plot.caption = element_text(size = 7, hjust=0, margin=margin(t=5),
                                family = "Trebuchet MS"),
    plot.title=element_text(family="TrebuchetMS-Bold"),
    plot.subtitle=element_text(family="TrebuchetMS-Bold")
  )
ggsave("Predicted_cp_new_caterpie.png", width = 6, height = 5)