padpadpadpad/ASKHelp2.R

## ASKHelp2.R
# example of a VERY simple binomial regression ####

# good coding practice (in my opinion anyway!) ####
# 1. #hashtag your code so you know what it does
# 2. clear workspace and load packages at the top to keep track of what you have loaded
# 3. make sure your working directory is in the right place
# 4. space things out in a way that makes your code readable to you
# 5. google things you do not understand. The answers are out there, go find them
# 6. do not get scared/angry when you get errors. It does get easier.... eventually

# clear workspace #### Good code practice to do first
rm(list = ls())

# set working directory - do not need to do here ####
#setwd("~/where/your/stuff/is")

# load packages ####
# if you do not have these packages - install.packages('package name')
library(dplyr)
library(tidyr)
library(ggplot2)

# create some dummy data ####
years <- 1990:2016
d <- data.frame(infected = rbinom(40, size = 1, prob = 0.5),
                year = sample(years, 40, replace = TRUE)) %>%
  arrange(., year)

# look at the column names of a dataset ####
colnames(d)

# look at the first 6 rows of the dataset ####
head(d)

# rename a value of the data ####
d <- rename(d, transmission = infected)
colnames(d)

# create a new column ####
d <- mutate(d, decade = ifelse(year < 2000, 'nineties', 'noughties'))

# subset the data to just include the nineties
d2 <- filter(d, decade == 'nineties')
unique(d2$year)

# quick plot ###
ggplot() +
  geom_point(aes(year, transmission), d)

# run a simple glm to see if transmission varies with year
fit <- glm(transmission ~ year, family = binomial, data = d)
summary(fit)
# This will change everytime you run it!!!

# predict from our model
# 1. create a new dataframe with dummy x variables
# this must contain all the predictors in the model and their column names need to be THE SAME as in the model
d_new <- data.frame(year = seq(min(d$year), max(d$year), length.out = 50))
# 2. use the model and the new data to predict new y values from our new x values
d_new$pred <- predict(fit, d_new, type = 'response')

# plot data and model ####
ggplot() +
  geom_point(aes(year, transmission), d) +
  geom_line(aes(year, pred), d_new)

# plot does not show us how many points we have at each year
# can do this by creating a new dataframe of counts per year per transmission value (0 or 1)
d3 <- group_by(d, transmission, year) %>%     # tell R we are grouping d by each combination of transmission and year
  summarise(num = n()) %>%                    # count the number in each group
  data.frame()                                # I do not like tibbles


# replot the data ####
ggplot() +
  geom_point(aes(year, transmission, size = num), d3) +
  geom_line(aes(year, pred), linetype = 2, d_new) +
  xlab('Year') +
  ylab('Transmission') +
  theme_bw() +
  scale_size_continuous(breaks = c(1,2, 3), 'Number')

# Final point
# think of checking for temporal autocorrelation as 2000 will be more related to 2001 than to 1996 and you should take this into
# if possible or at least check if it occurs! :)
	# example of a VERY simple binomial regression ####

	# good coding practice (in my opinion anyway!) ####
	# 1. #hashtag your code so you know what it does
	# 2. clear workspace and load packages at the top to keep track of what you have loaded
	# 3. make sure your working directory is in the right place
	# 4. space things out in a way that makes your code readable to you
	# 5. google things you do not understand. The answers are out there, go find them
	# 6. do not get scared/angry when you get errors. It does get easier.... eventually

	# clear workspace #### Good code practice to do first
	rm(list = ls())

	# set working directory - do not need to do here ####
	#setwd("~/where/your/stuff/is")

	# load packages ####
	# if you do not have these packages - install.packages('package name')
	library(dplyr)
	library(tidyr)
	library(ggplot2)

	# create some dummy data ####
	years <- 1990:2016
	d <- data.frame(infected = rbinom(40, size = 1, prob = 0.5),
	year = sample(years, 40, replace = TRUE)) %>%
	arrange(., year)

	# look at the column names of a dataset ####
	colnames(d)

	# look at the first 6 rows of the dataset ####
	head(d)

	# rename a value of the data ####
	d <- rename(d, transmission = infected)
	colnames(d)

	# create a new column ####
	d <- mutate(d, decade = ifelse(year < 2000, 'nineties', 'noughties'))

	# subset the data to just include the nineties
	d2 <- filter(d, decade == 'nineties')
	unique(d2$year)

	# quick plot ###
	ggplot() +
	geom_point(aes(year, transmission), d)

	# run a simple glm to see if transmission varies with year
	fit <- glm(transmission ~ year, family = binomial, data = d)
	summary(fit)
	# This will change everytime you run it!!!

	# predict from our model
	# 1. create a new dataframe with dummy x variables
	# this must contain all the predictors in the model and their column names need to be THE SAME as in the model
	d_new <- data.frame(year = seq(min(d$year), max(d$year), length.out = 50))
	# 2. use the model and the new data to predict new y values from our new x values
	d_new$pred <- predict(fit, d_new, type = 'response')

	# plot data and model ####
	ggplot() +
	geom_point(aes(year, transmission), d) +
	geom_line(aes(year, pred), d_new)

	# plot does not show us how many points we have at each year
	# can do this by creating a new dataframe of counts per year per transmission value (0 or 1)
	d3 <- group_by(d, transmission, year) %>% # tell R we are grouping d by each combination of transmission and year
	summarise(num = n()) %>% # count the number in each group
	data.frame() # I do not like tibbles


	# replot the data ####
	ggplot() +
	geom_point(aes(year, transmission, size = num), d3) +
	geom_line(aes(year, pred), linetype = 2, d_new) +
	xlab('Year') +
	ylab('Transmission') +
	theme_bw() +
	scale_size_continuous(breaks = c(1,2, 3), 'Number')

	# Final point
	# think of checking for temporal autocorrelation as 2000 will be more related to 2001 than to 1996 and you should take this into
	# if possible or at least check if it occurs! :)