Last active
February 2, 2017 22:01
-
-
Save padpadpadpad/6df04c9796927cca384a659a2b499bc9 to your computer and use it in GitHub Desktop.
A simple example of a binomial regression, plotting data and predictions with dplyr and ggplot2
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# example of a VERY simple binomial regression #### | |
# good coding practice (in my opinion anyway!) #### | |
# 1. #hashtag your code so you know what it does | |
# 2. clear workspace and load packages at the top to keep track of what you have loaded | |
# 3. make sure your working directory is in the right place | |
# 4. space things out in a way that makes your code readable to you | |
# 5. google things you do not understand. The answers are out there, go find them | |
# 6. do not get scared/angry when you get errors. It does get easier.... eventually | |
# clear workspace #### Good code practice to do first | |
rm(list = ls()) | |
# set working directory - do not need to do here #### | |
#setwd("~/where/your/stuff/is") | |
# load packages #### | |
# if you do not have these packages - install.packages('package name') | |
library(dplyr) | |
library(tidyr) | |
library(ggplot2) | |
# create some dummy data #### | |
years <- 1990:2016 | |
d <- data.frame(infected = rbinom(40, size = 1, prob = 0.5), | |
year = sample(years, 40, replace = TRUE)) %>% | |
arrange(., year) | |
# look at the column names of a dataset #### | |
colnames(d) | |
# look at the first 6 rows of the dataset #### | |
head(d) | |
# rename a value of the data #### | |
d <- rename(d, transmission = infected) | |
colnames(d) | |
# create a new column #### | |
d <- mutate(d, decade = ifelse(year < 2000, 'nineties', 'noughties')) | |
# subset the data to just include the nineties | |
d2 <- filter(d, decade == 'nineties') | |
unique(d2$year) | |
# quick plot ### | |
ggplot() + | |
geom_point(aes(year, transmission), d) | |
# run a simple glm to see if transmission varies with year | |
fit <- glm(transmission ~ year, family = binomial, data = d) | |
summary(fit) | |
# This will change everytime you run it!!! | |
# predict from our model | |
# 1. create a new dataframe with dummy x variables | |
# this must contain all the predictors in the model and their column names need to be THE SAME as in the model | |
d_new <- data.frame(year = seq(min(d$year), max(d$year), length.out = 50)) | |
# 2. use the model and the new data to predict new y values from our new x values | |
d_new$pred <- predict(fit, d_new, type = 'response') | |
# plot data and model #### | |
ggplot() + | |
geom_point(aes(year, transmission), d) + | |
geom_line(aes(year, pred), d_new) | |
# plot does not show us how many points we have at each year | |
# can do this by creating a new dataframe of counts per year per transmission value (0 or 1) | |
d3 <- group_by(d, transmission, year) %>% # tell R we are grouping d by each combination of transmission and year | |
summarise(num = n()) %>% # count the number in each group | |
data.frame() # I do not like tibbles | |
# replot the data #### | |
ggplot() + | |
geom_point(aes(year, transmission, size = num), d3) + | |
geom_line(aes(year, pred), linetype = 2, d_new) + | |
xlab('Year') + | |
ylab('Transmission') + | |
theme_bw() + | |
scale_size_continuous(breaks = c(1,2, 3), 'Number') | |
# Final point | |
# think of checking for temporal autocorrelation as 2000 will be more related to 2001 than to 1996 and you should take this into | |
# if possible or at least check if it occurs! :) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment