# DomBennett/dplyr_predicts.R

## dplyr_predicts.R
# PREDICTS AND DPLYR
# D. Bennett & L. Graham

# BRIEF:
# PREDICTS is a large dataset of multiple ecological studies that
# have sampled (through counting, trapping, sighting... etc.) a given
# taxonomic group.
# Each study contains information regarding the number of samples taken,
# the taxonomy of the group, the study area etc.
# Here we will read in a small example dataset of PREDICTS and demonstrate
# how you can use dplyr with the question: how many habitats does each study represent?

# LIBS
library(dplyr)

# READ DATA FROM URL AND CONVERT TO A TIBBLE
# NOTE: this step needs network access; read.csv() downloads the file directly.
predicts <- read.csv("http://onlinelibrary.wiley.com/store/10.1002/ece3.1303/asset/supinfo/ece31303-sup-0002-DataS1.csv?v=1&s=f1c0f0c5a047aa08c65fb48a3186cecc18faa8a0")
# as_tibble() replaces tbl_df(), which is deprecated since dplyr 1.0.0
predicts <- as_tibble(predicts)

# INVESTIGATE
# count the column names of the table
length(colnames(predicts))
# count the columns directly -- gives the same number as above
NCOL(predicts)
# print the table; no need to worry, dplyr won't spend hours printing it
print(predicts)

# HOW MANY HABITATS ARE REPRESENTED BY EACH STUDY?

# Method 1: for loop
# Identify the studies by creating a new study ID column (SSID), built from
# the source ID and the study number
predicts$SSID <- paste0(predicts$Source_ID, "_", predicts$Study_number)
stds <- unique(predicts$SSID)
# Loop through and count the number of distinct habitats in each study
nhabitats <- numeric(length(stds))  # preallocate one slot per study
# seq_along() instead of 1:length(): with zero studies, 1:0 would still run
# the loop body (for i = 1 and i = 0) and produce a bogus result
for (i in seq_along(stds)) {
  nhabitats[i] <- length(unique(
    predicts$Predominant_habitat[predicts$SSID == stds[i]]
  ))
}
# One row per study; the columns take the names of the vectors (stds, nhabitats)
res_1 <- data.frame(stds, nhabitats)

# Method 2: group_by + summarise
# Uses the SSID column that Method 1 added to predicts.
# Group the rows by study, then collapse every group to a single row holding
# its number of distinct habitats.
res_2 <- summarise(
  group_by(predicts, SSID),
  N_habitats = n_distinct(Predominant_habitat)
)

# Method 3: Behold, the power of the pipe!
# The same steps as a single chain: build SSID, group by it, count habitats
res_3 <- predicts %>%
  mutate(SSID = paste0(Source_ID, "_", Study_number)) %>%
  group_by(SSID) %>%
  summarise(N_habitats = n_distinct(Predominant_habitat))
	# PREDICTS AND DPLYR
	# D. Bennett & L. Graham

	# BRIEF:
	# PREDICTS is a large dataset of multiple ecological studies that
	# have sampled (through counting, trapping, sighting... etc.) a given
	# taxonomic group.
	# Each study contains information regarding the number of samples taken,
	# the taxonomy of the group, the study area etc.
	# Here we will read in a small example dataset of PREDICTS and demonstrate
	# how you can use dplyr with the question: how many habitats does each study represent?

	# LIBS
	library(dplyr)

	# READ DATA FROM URL AND CONVERT TO A TIBBLE
	# NOTE(review): this tab-indented section repeats the read step earlier in
	# the file, so the data is downloaded a second time -- confirm it is needed.
	predicts <- read.csv("http://onlinelibrary.wiley.com/store/10.1002/ece3.1303/asset/supinfo/ece31303-sup-0002-DataS1.csv?v=1&s=f1c0f0c5a047aa08c65fb48a3186cecc18faa8a0")
	# as_tibble() replaces tbl_df(), which is deprecated since dplyr 1.0.0
	predicts <- as_tibble(predicts)

	# INVESTIGATE
	# count the column names of the table
	length(colnames(predicts))
	# count the columns directly -- gives the same number as above
	NCOL(predicts)
	# print the table; no need to worry, dplyr won't spend hours printing it
	print(predicts)

	# HOW MANY HABITATS ARE REPRESENTED BY EACH STUDY?

	# Method 1: for loop
	# Identify the studies by creating a new study ID column (SSID), built from
	# the source ID and the study number
	predicts$SSID <- paste0(predicts$Source_ID, "_", predicts$Study_number)
	stds <- unique(predicts$SSID)
	# Loop through and count the number of distinct habitats in each study
	nhabitats <- numeric(length(stds))  # preallocate one slot per study
	# seq_along() instead of 1:length(): with zero studies, 1:0 would still run
	# the loop body (for i = 1 and i = 0) and produce a bogus result
	for (i in seq_along(stds)) {
	  nhabitats[i] <- length(unique(
	    predicts$Predominant_habitat[predicts$SSID == stds[i]]
	  ))
	}
	# One row per study; the columns take the names of the vectors (stds, nhabitats)
	res_1 <- data.frame(stds, nhabitats)

	# Method 2: group_by + summarise
	# Uses the SSID column that Method 1 added to predicts.
	# Group the rows by study, then collapse every group to a single row holding
	# its number of distinct habitats.
	res_2 <- summarise(
	  group_by(predicts, SSID),
	  N_habitats = n_distinct(Predominant_habitat)
	)

	# Method 3: Behold, the power of the pipe!
	# The same steps as a single chain: build SSID, group by it, count habitats
	res_3 <- predicts %>%
	  mutate(SSID = paste0(Source_ID, "_", Study_number)) %>%
	  group_by(SSID) %>%
	  summarise(N_habitats = n_distinct(Predominant_habitat))