eclarke/indicspecies_tutorial.R

## indicspecies_tutorial.R
# Using indicspecies with a melted data frame

# Their input is actually not hard to work with. First we need to re-create the count matrix.
# This creates a data frame with the sample ID and study group as the first two columns, then each column after that is an OTU name
# (Replace column names as appropriate)
mat <- melted.df %>% reshape2::dcast(SampleID + StudyGroup ~ otu)

# Next, we actually convert things into inputs
# Hadley's stuff hates rownames, so we have to remake them
rownames(mat) = mat$SampleID
# This creates a separate StudyGroup vector (useful later)
StudyGroup = mat$StudyGroup

# Next, we remove these text columns from the matrix, so that we can (if we want) convert it to a well-formed numeric matrix.
mat$SampleID <- NULL
mat$StudyGroup <- NULL

# Running multipatt:
library(indicspecies)
# They use random permutations to assess p-values, so it's good to set a reproducible random seed
set.seed(42)
# Where the magic happens:
indvals <- multipatt(mat, StudyGroup)
# And here are the results:
print(summary(indvals))

# Next, let's train a random forest model on the top indicator species
library(randomForest)
# Select the most discriminative species:
sig.otus <- indvals$sign %>% mutate(otu = rownames(.)) %>% filter(p.value < 0.05)
sig.otu.mat <- mat[, sig.otus$otu]
# Put it in the right format for RandomForests
sig.otus.rf <- data.frame(StudyGroup, sig.otu.mat)
rf.results <- randomForest(StudyGroup ~ ., data = sig.otus.rf, ntree=100)
# Check out the accuracy
print(rf.results)
# Collect the importance metrics
rf.importance = as.data.frame(importance(rf.results))
rf.importance$otu <- rownames(rf.importance)
rf.importance <- arrange(rf.importance, desc(MeanDecreaseGini))
	# Using indicspecies with a melted data frame

	# Their input is actually not hard to work with. First we need to re-create the count matrix.
	# This creates a data frame with the sample ID and study group as the first two columns, then each column after that is an OTU name
	# (Replace column names as appropriate)
	mat <- melted.df %>% reshape2::dcast(SampleID + StudyGroup ~ otu)

	# Next, we actually convert things into inputs
	# Hadley's stuff hates rownames, so we have to remake them
	rownames(mat) = mat$SampleID
	# This creates a separate StudyGroup vector (useful later)
	StudyGroup = mat$StudyGroup

	# Next, we remove these text columns from the matrix, so that we can (if we want) convert it to a well-formed numeric matrix.
	mat$SampleID <- NULL
	mat$StudyGroup <- NULL

	# Running multipatt:
	library(indicspecies)
	# They use random permutations to assess p-values, so it's good to set a reproducible random seed
	set.seed(42)
	# Where the magic happens:
	indvals <- multipatt(mat, StudyGroup)
	# And here are the results:
	print(summary(indvals))

	# Next, let's train a random forest model on the top indicator species
	library(randomForest)
	# Select the most discriminative species:
	sig.otus <- indvals$sign %>% mutate(otu = rownames(.)) %>% filter(p.value < 0.05)
	sig.otu.mat <- mat[, sig.otus$otu]
	# Put it in the right format for RandomForests
	sig.otus.rf <- data.frame(StudyGroup, sig.otu.mat)
	rf.results <- randomForest(StudyGroup ~ ., data = sig.otus.rf, ntree=100)
	# Check out the accuracy
	print(rf.results)
	# Collect the importance metrics
	rf.importance = as.data.frame(importance(rf.results))
	rf.importance$otu <- rownames(rf.importance)
	rf.importance <- arrange(rf.importance, desc(MeanDecreaseGini))