khturner/taxfeatures.R

## taxfeatures.R
library(tidyverse)

shared_file <- read_tsv("data/mothur/kws_final.an.shared") %>%
  select(-label, -numOtus) %>%
  # melt, onvert to relative abundance
  gather("OTU", "abundance", starts_with("Otu")) %>%
  group_by(Group) %>%
  mutate(abundance = abundance / sum(abundance)) %>% ungroup

tax_file <- read_tsv("data/mothur/kws_final.an.cons.taxonomy") %>% select(-Size) %>%
  # Clean up and separate taxonomy
  mutate(Taxonomy = gsub("\\([^)]*\\)", "", Taxonomy),
         Taxonomy = gsub("_unclassified", "", Taxonomy)) %>%
  separate(Taxonomy, c("kingdom", "phylum", "class", "order", "family", "genus"), ";")

# Taxonomy-rank based abundance
tax_abundance <- inner_join(shared_file, tax_file) %>%
  gather(rank, value, kingdom:genus) %>%
  group_by(Group, rank, value) %>%
  summarize(abundance = sum(abundance)) %>% ungroup %>%
  filter(value != "") %>% # Don't want to lump together all empties
  mutate(value = gsub("[-]", "_", value)) %>% # dashes in column names are bad
  mutate(key = paste(rank, value, sep = "_")) %>%
  select(Group, key, abundance) %>%
  spread(key, abundance)

## Join back on
shared_with_tax <- shared_file %>%
  spread(OTU, abundance) %>%
  inner_join(tax_abundance)

## IS LOG10 ABD MORE RELEVANT?
shared_with_tax <- shared_with_tax %>%
  gather(feat, abundance, -Group) %>%
  filter(abundance > 0) %>%
  mutate(abundance = log10(abundance)) %>%
  spread(feat, abundance, fill = -5.5) # Fill with ~LOD

## Add metadata
meta_file <- read_tsv("data/raw/kws_metadata.tsv")
final_data <- inner_join(meta_file, shared_with_tax, by = c("group" = "Group"))

## nzv?
library(caret)
nzv <- nearZeroVar(final_data, saveMetrics = T)
final_nzv_data <- final_data[,which(!nzv$nzv)] %>%
  select(-side, -location, -group, -patient, -gender) %>% # For now let's just try to guess the site
  mutate(site = factor(site, levels = c("mucosa", "stool", "exit")))

## First pass dumb model
library(randomForest)
rf <- randomForest(site ~ ., data = final_nzv_data, importance = T)
vi <- tibble(feature = rownames(rf$importance), MeanDecreaseGini = tbl_df(rf$importance)$MeanDecreaseGini) %>%
  arrange(desc(MeanDecreaseGini))

## viz top x features?
library(forcats)
final_data %>%
  gather(feature, abundance, -(group:gender)) %>%
  inner_join(head(vi, 12)) %>%
  mutate(feature = fct_reorder(feature, -MeanDecreaseGini)) %>%
  ggplot(aes(site, 10^abundance)) +
  geom_boxplot() +
  scale_y_log10("Relative abundance",
                breaks = c(1e-4, 1e-2, 1),
                labels = c("0.01%", "1%", "100%")) +
  annotation_logticks(sides = "l") +
  facet_wrap(~feature)
	library(tidyverse)

	shared_file <- read_tsv("data/mothur/kws_final.an.shared") %>%
	select(-label, -numOtus) %>%
	# melt, onvert to relative abundance
	gather("OTU", "abundance", starts_with("Otu")) %>%
	group_by(Group) %>%
	mutate(abundance = abundance / sum(abundance)) %>% ungroup

	tax_file <- read_tsv("data/mothur/kws_final.an.cons.taxonomy") %>% select(-Size) %>%
	# Clean up and separate taxonomy
	mutate(Taxonomy = gsub("\\([^)]*\\)", "", Taxonomy),
	Taxonomy = gsub("_unclassified", "", Taxonomy)) %>%
	separate(Taxonomy, c("kingdom", "phylum", "class", "order", "family", "genus"), ";")

	# Taxonomy-rank based abundance
	tax_abundance <- inner_join(shared_file, tax_file) %>%
	gather(rank, value, kingdom:genus) %>%
	group_by(Group, rank, value) %>%
	summarize(abundance = sum(abundance)) %>% ungroup %>%
	filter(value != "") %>% # Don't want to lump together all empties
	mutate(value = gsub("[-]", "_", value)) %>% # dashes in column names are bad
	mutate(key = paste(rank, value, sep = "_")) %>%
	select(Group, key, abundance) %>%
	spread(key, abundance)

	## Join back on
	shared_with_tax <- shared_file %>%
	spread(OTU, abundance) %>%
	inner_join(tax_abundance)

	## IS LOG10 ABD MORE RELEVANT?
	shared_with_tax <- shared_with_tax %>%
	gather(feat, abundance, -Group) %>%
	filter(abundance > 0) %>%
	mutate(abundance = log10(abundance)) %>%
	spread(feat, abundance, fill = -5.5) # Fill with ~LOD

	## Add metadata
	meta_file <- read_tsv("data/raw/kws_metadata.tsv")
	final_data <- inner_join(meta_file, shared_with_tax, by = c("group" = "Group"))

	## nzv?
	library(caret)
	nzv <- nearZeroVar(final_data, saveMetrics = T)
	final_nzv_data <- final_data[,which(!nzv$nzv)] %>%
	select(-side, -location, -group, -patient, -gender) %>% # For now let's just try to guess the site
	mutate(site = factor(site, levels = c("mucosa", "stool", "exit")))

	## First pass dumb model
	library(randomForest)
	rf <- randomForest(site ~ ., data = final_nzv_data, importance = T)
	vi <- tibble(feature = rownames(rf$importance), MeanDecreaseGini = tbl_df(rf$importance)$MeanDecreaseGini) %>%
	arrange(desc(MeanDecreaseGini))

	## viz top x features?
	library(forcats)
	final_data %>%
	gather(feature, abundance, -(group:gender)) %>%
	inner_join(head(vi, 12)) %>%
	mutate(feature = fct_reorder(feature, -MeanDecreaseGini)) %>%
	ggplot(aes(site, 10^abundance)) +
	geom_boxplot() +
	scale_y_log10("Relative abundance",
	breaks = c(1e-4, 1e-2, 1),
	labels = c("0.01%", "1%", "100%")) +
	annotation_logticks(sides = "l") +
	facet_wrap(~feature)