Skip to content

Instantly share code, notes, and snippets.

@khturner
Created December 5, 2017 17:15
Show Gist options
  • Save khturner/e703e83b1bc62a8612a1e6c1c8cf5759 to your computer and use it in GitHub Desktop.
Save khturner/e703e83b1bc62a8612a1e6c1c8cf5759 to your computer and use it in GitHub Desktop.
library(tidyverse)
shared_file <- read_tsv("data/mothur/kws_final.an.shared") %>%
select(-label, -numOtus) %>%
# melt, onvert to relative abundance
gather("OTU", "abundance", starts_with("Otu")) %>%
group_by(Group) %>%
mutate(abundance = abundance / sum(abundance)) %>% ungroup
tax_file <- read_tsv("data/mothur/kws_final.an.cons.taxonomy") %>% select(-Size) %>%
# Clean up and separate taxonomy
mutate(Taxonomy = gsub("\\([^)]*\\)", "", Taxonomy),
Taxonomy = gsub("_unclassified", "", Taxonomy)) %>%
separate(Taxonomy, c("kingdom", "phylum", "class", "order", "family", "genus"), ";")
# Taxonomy-rank based abundance
tax_abundance <- inner_join(shared_file, tax_file) %>%
gather(rank, value, kingdom:genus) %>%
group_by(Group, rank, value) %>%
summarize(abundance = sum(abundance)) %>% ungroup %>%
filter(value != "") %>% # Don't want to lump together all empties
mutate(value = gsub("[-]", "_", value)) %>% # dashes in column names are bad
mutate(key = paste(rank, value, sep = "_")) %>%
select(Group, key, abundance) %>%
spread(key, abundance)
## Join back on
shared_with_tax <- shared_file %>%
spread(OTU, abundance) %>%
inner_join(tax_abundance)
## IS LOG10 ABD MORE RELEVANT?
shared_with_tax <- shared_with_tax %>%
gather(feat, abundance, -Group) %>%
filter(abundance > 0) %>%
mutate(abundance = log10(abundance)) %>%
spread(feat, abundance, fill = -5.5) # Fill with ~LOD
## Add metadata
meta_file <- read_tsv("data/raw/kws_metadata.tsv")
final_data <- inner_join(meta_file, shared_with_tax, by = c("group" = "Group"))
## nzv?
library(caret)
nzv <- nearZeroVar(final_data, saveMetrics = T)
final_nzv_data <- final_data[,which(!nzv$nzv)] %>%
select(-side, -location, -group, -patient, -gender) %>% # For now let's just try to guess the site
mutate(site = factor(site, levels = c("mucosa", "stool", "exit")))
## First pass dumb model
library(randomForest)
rf <- randomForest(site ~ ., data = final_nzv_data, importance = T)
vi <- tibble(feature = rownames(rf$importance), MeanDecreaseGini = tbl_df(rf$importance)$MeanDecreaseGini) %>%
arrange(desc(MeanDecreaseGini))
## viz top x features?
library(forcats)
final_data %>%
gather(feature, abundance, -(group:gender)) %>%
inner_join(head(vi, 12)) %>%
mutate(feature = fct_reorder(feature, -MeanDecreaseGini)) %>%
ggplot(aes(site, 10^abundance)) +
geom_boxplot() +
scale_y_log10("Relative abundance",
breaks = c(1e-4, 1e-2, 1),
labels = c("0.01%", "1%", "100%")) +
annotation_logticks(sides = "l") +
facet_wrap(~feature)
@khturner
Copy link
Author

khturner commented Dec 5, 2017

top12

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment