Skip to content

Instantly share code, notes, and snippets.

@meren
Created January 31, 2014 16:04
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save meren/8735046 to your computer and use it in GitHub Desktop.
Save meren/8735046 to your computer and use it in GitHub Desktop.
library(ggplot2)
library(reshape)
library(reshape2)
library(RColorBrewer)
library("vegan")
library(gtools)
library(grid)
library(gridBase)
library(gridExtra)
library(Biostrings)
multiplot <- function(..., plotlist=NULL, file, cols=1, layout=NULL) {
require(grid)
# Make a list from the ... arguments and plotlist
plots <- c(list(...), plotlist)
numPlots = length(plots)
# If layout is NULL, then use 'cols' to determine layout
if (is.null(layout)) {
# Make the panel
# ncol: Number of columns of plots
# nrow: Number of rows needed, calculated from # of cols
layout <- matrix(seq(1, cols * ceiling(numPlots/cols)),
ncol = cols, nrow = ceiling(numPlots/cols))
}
if (numPlots==1) {
print(plots[[1]])
} else {
# Set up the page
grid.newpage()
pushViewport(viewport(layout = grid.layout(nrow(layout), ncol(layout))))
# Make each plot, in the correct location
for (i in 1:numPlots) {
# Get the i,j matrix positions of the regions that contain this subplot
matchidx <- as.data.frame(which(layout == i, arr.ind = TRUE))
print(plots[[i]], vp = viewport(layout.pos.row = matchidx$row,
layout.pos.col = matchidx$col))
}
}
}
PLOT_SITE <- function(df, genera, site = 'TH', metric='horn', method='ward'){
# get the subset for genera and site
df_sub_genera = df[df$genus %in% genera, ]
df_sub_site <- df_sub_genera[df_sub_genera$site == site, ]
sub_matrix <- acast(df_sub_site, sample~oligo, value.var="abundance", fill = 0)
distance <- metric #"manhattan", "euclidean", "canberra", "bray", "kulczynski", "jaccard", "gower", "morisita", "horn", "mountford", "raup" , "binomial" or "chao"
d <- vegdist(sub_matrix, method=distance)
fit <- hclust(d, method=method) # "ward", "single", "complete", "average", "mcquitty", "median" or "centroid"
samples_order <- fit$labels[fit$order]
# set the order of samples based on clustering results:
df_sub_site$sample <- ordered(df_sub_site$sample,levels=samples_order)
# oh, very cool
df_sub_site$oligo <- reorder(df_sub_site$oligo, df_sub_site$abundance, FUN=sum)
df_sub_site$oligo <- factor(df_sub_site$oligo, levels=levels(df_sub_site$oligo))
df_sub_site <- df_sub_site[order(df_sub_site$oligo), ]
# prepare ggplot object
p <- ggplot(df_sub_site, aes(x=factor(sample), y=abundance, fill = oligo))
p <- p + geom_bar(position="fill", stat = "identity")
p <- p + theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5), axis.ticks.y = element_blank(), legend.position = 'none', panel.grid.major = element_blank(), panel.grid.minor = element_blank())
p <- p + labs(x='', y=paste(genera, collapse=' / '))
p <- p + scale_y_continuous(breaks = NULL)
# color manually:
colors <- sample(rainbow(length(levels(factor(df_sub_site$oligo))), s = 0.5, v = 0.7))
p <- p + scale_fill_manual(limits = levels(factor(df_sub_site$oligo)), values = colors)
print(p)
#
# # start plotting
# #pdf('SINGLE_SITE.pdf', width=16, height=16)
# plot.new()
# gl <- grid.layout(nrow=2, ncol=1)
# vp.1 <- viewport(layout.pos.col=1, layout.pos.row=1)
# vp.2 <- viewport(layout.pos.col=1, layout.pos.row=2)
# pushViewport(viewport(layout=gl))
# pushViewport(vp.1)
# par(new=TRUE)
# plot(fit, cex = 0.7, sub = paste("Distance metric: ",distance,sep=""), xlab = '') # display dendogram
# popViewport()
# pushViewport(vp.2)
# print(p, newpage = FALSE)
# popViewport()
# #dev.off()
}
PLOT_GENUS <- function(df, genera, site='TH', metric='horn', method='ward'){
# get genus
df_sub_genus <- df[df$genus %in% genera, ]
# get clustering based on SITE
df_sub_genus_sub_site <- df_sub_genus[df_sub_genus$site == site, ]
sub_matrix <- acast(df_sub_genus_sub_site, sample~oligo, value.var="abundance", fill = 0)
distance <- metric #"manhattan", "euclidean", "canberra", "bray", "kulczynski", "jaccard", "gower", "morisita", "horn", "mountford", "raup" , "binomial" or "chao"
d <- vegdist(sub_matrix, method=distance)
fit <- hclust(d, method=method) # "ward", "single", "complete", "average", "mcquitty", "median" or "centroid"
samples_order <- fit$labels[fit$order]
# set the order of samples based on clustering results:
df_sub_genus$sample <- ordered(df_sub_genus$sample,levels=samples_order)
# oh, very cool
df_sub_genus$oligo <- reorder(df_sub_genus$oligo, df_sub_genus$abundance, FUN=sum)
df_sub_genus$oligo <- factor(df_sub_genus$oligo, levels=levels(df_sub_genus$oligo))
df_sub_genus <- df_sub_genus[order(df_sub_genus$oligo), ]
p <- ggplot(df_sub_genus, aes(x=factor(sample), y=abundance, factor(oligo), color, fill = oligo))
p <- p + geom_bar(position="fill", stat = "identity", width=0.99)
p <- p + theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5), axis.ticks.y = element_blank(), legend.position = 'none', panel.grid.major = element_blank(), panel.grid.minor = element_blank())
p <- p + labs(x='', y=length(genera))
p <- p + scale_y_continuous(breaks = NULL)
p <- p + facet_grid(site ~ .)
# color manually:
colors <- sample(rainbow(length(levels(factor(df_sub_genus$oligo))), s = 0.5, v = 0.7))
p <- p + scale_fill_manual(limits = levels(factor(df_sub_genus$oligo)), values = colors)
print(p)
#
# plot.new()
# gl <- grid.layout(nrow=2, ncol=1)
# vp.1 <- viewport(layout.pos.col=1, layout.pos.row=1)
# vp.2 <- viewport(layout.pos.col=1, layout.pos.row=2)
# pushViewport(viewport(layout=gl))
# pushViewport(vp.1)
#
# par(new=TRUE)
# plot(fit, cex = 0.7, sub = paste("Distance metric: ",distance,sep=""), xlab = '') # display dendogram
# popViewport()
# pushViewport(vp.2)
# print(p, newpage = FALSE)
# popViewport()
}
PLOT_OLIGO <- function(df, oligo, site = 'TH', metric='horn', method='average', title = 'no title', labels = NA){
# get the subset for genera and site
df_sub_genera = df[df$oligo %in% oligo, ]
df_sub_site <- df_sub_genera[df_sub_genera$site == site, ]
sub_matrix <- acast(df_sub_site, sample~oligo, value.var="abundance", fill = 0)
distance <- metric #"manhattan", "euclidean", "canberra", "bray", "kulczynski", "jaccard", "gower", "morisita", "horn", "mountford", "raup" , "binomial" or "chao"
d <- vegdist(sub_matrix, method=distance)
fit <- hclust(d, method=method) # "ward", "single", "complete", "average", "mcquitty", "median" or "centroid"
samples_order <- fit$labels[fit$order]
# set the order of samples based on clustering results:
df_sub_site$sample <- ordered(df_sub_site$sample,levels=samples_order)
# oh, very cool
df_sub_site$oligo <- reorder(df_sub_site$oligo, df_sub_site$abundance, FUN=sum)
df_sub_site$oligo <- factor(df_sub_site$oligo, levels=levels(df_sub_site$oligo))
df_sub_site <- df_sub_site[order(df_sub_site$oligo), ]
# replace oligos with labels in legend
if(!is.na(labels))
df_sub_site$oligo <- factor(df_sub_site$oligo, levels = oligo, labels = labels)
# color manually:
colors <- sample(rainbow(length(levels(factor(df_sub_site$oligo))), s = 0.5, v = 0.7))
# prepare ggplot object
p <- ggplot(df_sub_site, aes(x=factor(sample), y=abundance, fill = oligo))
p <- p + labs(title=title)
p <- p + geom_bar(position="fill", stat = "identity")
p <- p + theme(axis.text.x = element_blank(), axis.ticks.y = element_blank(), axis.text.y = element_blank(), axis.ticks.x = element_blank(), legend.position = 'top', panel.grid.major = element_blank(), panel.grid.minor = element_blank())
p <- p + guides(fill = guide_legend(nrow = 2, title.position = NULL))
p <- p + scale_y_continuous()
p <- p + labs(x='', y="Ratio of oligotypes")
p <- p + scale_fill_manual(limits = levels(factor(df_sub_site$oligo)), values = colors)
# prepare ggplot object
q <- ggplot(df_sub_site, aes(x=factor(sample), y=abundance))
q <- q + stat_summary(fun.y = sum, geom = "bar")
q <- q + theme(axis.text.x = element_text(angle = 90, size=10, vjust=0, face = 'bold'), legend.position = 'none', axis.ticks.x = element_blank(), panel.grid.major = element_blank(), panel.grid.minor = element_blank())
q <- q + labs(x='', y="Number of reads (square root normalized)")
q <- q + scale_y_reverse()
q <- q + scale_fill_manual(limits = levels(factor(df_sub_site$oligo)), values = colors)
# MAKE SURE THEY HAVE THE SAME WIDTH:
gP <- ggplotGrob(p)
gQ <- ggplotGrob(q)
maxWidth = grid::unit.pmax(gP$widths[2:5], gQ$widths[2:5])
gP$widths[2:5] <- as.list(maxWidth)
gQ$widths[2:5] <- as.list(maxWidth)
grid.arrange(gP, gQ, ncol=1)
}
PLOT_OLIGO_PER_SITE_BOXES <- function(df, oligo, sites = c('BM', 'HP', 'KG', 'PT', 'ST', 'SUBP', 'SUPP', 'SV', 'TD', 'TH'), title = 'no title', labels = NA){
df_sub_genus <- df[df$oligo %in% oligo, ]
df_sub_genus <- df_sub_genus[df_sub_genus$site %in% sites, ]
dfx <- data.frame(sample=character(),
site=character(),
oligo=character(),
abundance=numeric(),
abundance_p=numeric(),
stringsAsFactors=FALSE)
names(dfx) <- c('sample', 'site', 'oligo', 'abundance', 'abundance_p')
N <- 0
for(sample in unique(df_sub_genus$sample)){
sample_reduced <- df_sub_genus[df_sub_genus$sample == sample, ]
for (site in sites){
site_reduced <- sample_reduced[sample_reduced$site == site, ]
total_reads_in_site <- sum(site_reduced$abundance)
for(oligo_in_site in site_reduced$oligo){
N <- N + 1
oligo_reduced <- site_reduced[site_reduced$oligo == oligo_in_site, ]
oligo_in_site_abundance <- oligo_reduced$abundance
dfx[N, ] <- c(sample = sample, site=site, oligo = oligo_in_site, abundance = oligo_in_site_abundance, abundance_p = oligo_in_site_abundance * 100.0 / total_reads_in_site)
}
}
}
dfx$abundance_p <- as.numeric(as.character(dfx$abundance_p))
# replace oligos with labels in legend
if(!is.na(labels))
dfx$oligo <- factor(dfx$oligo, levels = oligo, labels = labels)
p <- ggplot(dfx, aes(x=factor(site), y=abundance_p, color = oligo))
p <- p + geom_boxplot()
p <- p + theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5), axis.ticks.y = element_blank(), legend.position = 'right', panel.grid.major = element_blank(), panel.grid.minor = element_blank())
p <- p + labs(title=title)
# color manually:
colors <- sample(rainbow(length(levels(factor(df_sub_genus$oligo))), s = 0.5, v = 0.7))
p <- p + scale_fill_manual(limits = levels(factor(df_sub_genus$oligo)), values = colors)
print(p)
}
PLOT_OLIGO_PER_SITE_BOXES_PERCENTS <- function(df, oligo, sites = c('BM', 'HP', 'KG', 'PT', 'ST', 'SUBP', 'SUPP', 'SV', 'TD', 'TH'), title = 'no title', labels = NA){
df_sub_genus <- df[df$oligo %in% oligo, ]
df_sub_genus <- df_sub_genus[df_sub_genus$site %in% sites, ]
dfx <- data.frame(sample=character(),
site=character(),
oligo=character(),
abundance=numeric(),
abundance_p=numeric(),
stringsAsFactors=FALSE)
names(dfx) <- c('sample', 'site', 'oligo', 'abundance', 'abundance_p')
N <- 0
for(sample in unique(df_sub_genus$sample)){
sample_reduced <- df_sub_genus[df_sub_genus$sample == sample, ]
for (site in sites){
df_site <- df[df$site == site, ]
df_site_sample <- df_site[df_site$sample == sample, ]
tot_num_reads_in_site <- sum(df_site_sample$abundance)
site_reduced <- sample_reduced[sample_reduced$site == site, ]
total_reads_in_site <- sum(site_reduced$abundance)
for(oligo_in_site in site_reduced$oligo){
oligo_reduced <- site_reduced[site_reduced$oligo == oligo_in_site, ]
oligo_in_site_abundance <- oligo_reduced$abundance
oligo_in_site_abundance_p <- oligo_in_site_abundance * 100.0 / tot_num_reads_in_site
N <- N + 1
dfx[N, ] <- c(sample = sample, site=site, oligo = oligo_in_site, abundance = oligo_in_site_abundance, abundance_p = oligo_in_site_abundance_p)
}
}
}
dfx$abundance_p <- as.numeric(as.character(dfx$abundance_p))
outlier_limit = boxplot.stats(dfx$abundance_p)$stats[c(1, 5)][2]
#dfx <- dfx[dfx$abundance_p < outlier_limit, ]
p <- ggplot(dfx, aes(x=factor(site), y=abundance_p, color = site))
p <- p + geom_boxplot(outlier.shape = NA)
p <- p + geom_jitter(position = position_jitter(width = .05, height = 0), size=3, alpha=0.4)
p <- p + theme(axis.text.y = element_text(angle = 90, hjust = 0.5, vjust = 0.5), axis.text.x = element_text(angle = 90, hjust = 0.5, vjust = 0.5), axis.ticks.x = element_blank(), legend.position = 'none', panel.grid.major = element_blank(), panel.grid.minor = element_blank())
p <- p + ylab(title)
p <- p + xlab('')
p <- p + scale_y_sqrt()
return(p)
}
PLOT_OLIGO_PER_SITE_STACKBAR <- function(df, oligo, sites = c('BM', 'HP', 'KG', 'PT', 'ST', 'SUBP', 'SUPP', 'SV', 'TD', 'TH'), title = 'no title', labels = NA, colors = NA){
# test variables:
#oligo = c('Gammaproteobacteria_ATAGAATTTGTTCTC', 'Gammaproteobacteria_ATAGAATTTGATCTC', 'Gammaproteobacteria_ATAAAGTTTGACCTC', 'Gammaproteobacteria_ATAGAGTTTGATCTC', 'Gammaproteobacteria_ATAGAGTTTGACCTC', 'Gammaproteobacteria_ACAGAATTTAATCTC', 'Gammaproteobacteria_ATAGAATATAATCTC', 'Gammaproteobacteria_ATAGAATTTAGTCTC', 'Gammaproteobacteria_ATAGAATTTAATCTC', 'Gammaproteobacteria_ATAGAGTATAATCTC', 'Gammaproteobacteria_ATAGGGTTTAACCTC', 'Gammaproteobacteria_AAAGAGTTTAACCTC', 'Gammaproteobacteria_ATAGAGTTTAATCTC', 'Gammaproteobacteria_ATAGAGTTTAACCTC')
#labels = c('T. aromaticivorans V5 98.7', 'T. aromaticivorans V4 99.2', 'T. aromaticivorans V3 99.6', 'T. aromaticivorans V2 99.6', 'T. aromaticivorans V1 100', 'H. parainfluenzae V9 98.7', 'H. parainfluenzae V8 98.7', 'H. parainfluenzae V7 98.7', 'H. parainfluenzae V6 99.2', 'H. parainfluenzae V5 99.2', 'H. parainfluenzae V4 99.6', 'H. parainfluenzae V3 99.6', 'H. parainfluenzae V2 99.6', 'H. parainfluenzae V1 100')
#title = 'Distribution of T. aromaticivorans and H. parainfluenzae'
#sites = c( 'SUBP', 'SUPP','BM', 'HP', 'KG', 'PT', 'SV', 'TD', 'TH')
df_sub_genus <- df[df$oligo %in% oligo, ]
df_sub_genus <- df_sub_genus[df_sub_genus$site %in% sites, ]
dfx <- data.frame(
site=character(),
oligo=character(),
abundance=numeric(),
stringsAsFactors=FALSE)
names(dfx) <- c('site', 'oligo', 'abundance')
N <- 0
for (site in sites){
site_reduced <- df_sub_genus[df_sub_genus$site == site, ]
for(oligo_in_site in oligo){
N <- N + 1
oligo_reduced <- site_reduced[site_reduced$oligo == oligo_in_site, ]
oligo_in_site_abundance <- sum(oligo_reduced$abundance)
dfx[N, ] <- c(site=site, oligo = oligo_in_site, abundance = oligo_in_site_abundance)
}
}
# replace oligos with labels in legend
if(!is.na(labels)){
dfx$oligo <- factor(dfx$oligo, levels = oligo, labels = labels)
}
dfx$abundance <- as.numeric(as.character(dfx$abundance))
dfx$oligo <- factor(dfx$oligo)
p <- ggplot(dfx, aes(x=factor(site), y=abundance, group= oligo, fill = oligo))
p <- p + geom_bar(position="fill", stat = "identity", width=0.90, colour = 'black')
p <- p + theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5), axis.ticks.y = element_blank(), legend.position = 'right', panel.grid.major = element_blank(), panel.grid.minor = element_blank())
p <- p + labs(title=title)
# color manually:
if(is.na(colors))
colors <- sample(rainbow(length(levels(dfx$oligo)), s = 0.7, v = 0.9))
p <- p + scale_fill_manual(limits = levels(dfx$oligo), values = colors)
print(p)
return(colors)
}
PLOT_OLIGO_EVERYWHERE <- function(df, oligo, cluster_site='TH', sites = c('BM', 'HP', 'KG', 'PT', 'ST', 'SUBP', 'SUPP', 'SV', 'TD', 'TH'), metric='horn', method='average', title = 'no title', labels = NA, colors = NA){
# df=df_v1v3
# oligo= man_oligos_v1v3
# sites = man_sites_v1v3
# title = 'most abundant Neisseria v1v3'
# labels = man_labels_v1v3
# colors = man_colors_v1v3
# cluster_site = 'SUBP'
# metric = 'horn'
# method = 'ward'
# get genus
df_sub_genus <- df[df$oligo %in% oligo, ]
df_sub_genus <- df_sub_genus[df_sub_genus$site %in% sites, ]
# get clustering based on SITE
df_sub_genus_sub_site <- df_sub_genus[df_sub_genus$site == cluster_site, ]
sub_matrix <- acast(df_sub_genus_sub_site, sample~oligo, value.var="abundance", fill = 0)
distance <- metric #"manhattan", "euclidean", "canberra", "bray", "kulczynski", "jaccard", "gower", "morisita", "horn", "mountford", "raup" , "binomial" or "chao"
d <- vegdist(sub_matrix, method=distance)
fit <- hclust(d, method=method) # "ward", "single", "complete", "average", "mcquitty", "median" or "centroid"
samples_order <- fit$labels[fit$order]
# set the order of samples based on clustering results:
df_sub_genus$sample <- factor(df_sub_genus$sample, levels=samples_order)
# I DON'T KNOW WHAT I'M DOING HERE, THERE ARE NA's INTRODUCED IN THE PREVIOUS LINE:
df_sub_genus <- df_sub_genus[!df_sub_genus$sample %in% c(NA), ]
# oh, very cool
df_sub_genus$oligo <- reorder(df_sub_genus$oligo, df_sub_genus$abundance, FUN=sum)
df_sub_genus$oligo <- factor(df_sub_genus$oligo, levels=levels(df_sub_genus$oligo))
df_sub_genus <- df_sub_genus[order(df_sub_genus$oligo), ]
# replace oligos with labels in legend
if(!is.na(labels)){
df_sub_genus$oligo <- factor(df_sub_genus$oligo, levels = oligo, labels = labels)
}
df_sub_genus <- df_sub_genus[with(df_sub_genus, order(site, oligo)), ]
df_sub_genus$site <- factor(df_sub_genus$site)
p <- ggplot(df_sub_genus, aes(x=factor(sample), y=abundance, labels, color, fill = oligo))
p <- p + geom_bar(position="fill", stat = "identity", width=0.90)
p <- p + theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5), axis.ticks.y = element_blank(), legend.position = 'right', panel.grid.major = element_blank(), panel.grid.minor = element_blank())
p <- p + labs(title=title)
p <- p + labs(x='', y='')
p <- p + scale_y_continuous(breaks = NULL)
p <- p + facet_grid(site ~ .)
# color manually:
if(is.na(colors)){
colors <- sample(rainbow(length(levels(factor(df_sub_genus$oligo))), s = 0.5, v = 0.7))
}
p <- p + scale_fill_manual(limits = levels(factor(df_sub_genus$oligo)), values = colors)
print(p)
#plot.new()
#gl <- grid.layout(nrow=2, ncol=1)
#vp.1 <- viewport(layout.pos.col=1, layout.pos.row=1)
#vp.2 <- viewport(layout.pos.col=1, layout.pos.row=2)
#pushViewport(viewport(layout=gl))
#pushViewport(vp.1)
#par(new=TRUE)
#plot(fit, cex = 0.7, sub = paste("Distance metric: ",distance,sep=""), xlab = '') # display dendogram
#popViewport()
#pushViewport(vp.2)
#print(p, newpage = FALSE)
#popViewport()
return(samples_order)
}
PLOT_OLIGO_PERCENT <- function(df, oligos, samples_order, site = 'KG', title = 'no title'){
# this function plots the percentage of all reads combined in 'oligos' list to all reads
# observed in the 'site'
dfx <- data.frame(sample=character(),
site=character(),
abundance_of_oligos=numeric(),
abundance_of_all=numeric(),
p_abundance_of_oligos=numeric(),
stringsAsFactors=FALSE)
names(dfx) <- c('sample', 'site', 'abundance_of_oligos', 'abundance_of_all', 'p_abundance_of_oligos')
N <- 0
dfx_sub_site <- df[df$site == site, ]
for(sample in samples_order){
dfx_sample_reduced <- dfx_sub_site[dfx_sub_site$sample == sample, ]
dfx_oligo_reduced <- dfx_sample_reduced[dfx_sample_reduced$oligo %in% oligos, ]
abundance_of_all <- sum(dfx_sample_reduced$abundance)
abundance_of_oligos <- sum(dfx_oligo_reduced$abundance)
p_abundance_of_oligos <- abundance_of_oligos * 100.0 / abundance_of_all
N <- N + 1
dfx[N, ] <- c(sample = sample, site=site, abundance_of_oligos = abundance_of_oligos, abundance_of_all = abundance_of_all, p_abundance_of_oligos = p_abundance_of_oligos)
}
dfx$p_abundance_of_oligos <- as.numeric(as.character(dfx$p_abundance_of_oligos))
dfx$sample <- ordered(dfx$sample,levels=samples_order)
# prepare ggplot object
q <- ggplot(dfx, aes(x=factor(sample), y=p_abundance_of_oligos))
q <- q + stat_summary(fun.y = sum, geom = "bar")
q <- q + theme(axis.text.x = element_text(angle = 90, size=10, vjust=0, face = 'bold'), legend.position = 'none', axis.ticks.x = element_blank(), panel.grid.major = element_blank(), panel.grid.minor = element_blank())
q <- q + labs(title=title)
q <- q + labs(x='', y=paste("Percentage in ", site, sep=''))
q <- q + scale_y_sqrt()
print(q)
}
DIST_BETWEEN_TWO_SEQS <- function(s1, s2){
#s1 <- 'AAGGCCTACCAAGGCGACGATCAGTAGCGGGTCTGAGAGGATGATCCGCCACACTGGGACTGAGACACGGCCCAGACTCCTACGGGAGGCAGCAGTGGGGAATTTTGGACAATGGGCGCAAGCCTGATCCAGCCATGCCGCGTGTCTGAAGAAGGCCTTCGGGTTGTAAAGGACTTTTGTCAGGGAAGAAAAGGCTGTTGCTAATATCAGCGGCTGATGACGGTACCTGAAGAATAAGCACCGGCTAACTACGTG'
#s2 <- 'AAGGCCTACCAAGGCGACGATCAGTAGCGGGTCTGAGAGGATGATCCGCCACACTGGGACTGAGACACGGCCCAGACTCCTACGGGAGGCAGCAGTGGGGAATTTTGGACAATGGGCGCAAGCCTGATCCAGCCATGCCGCGTGTCTGAAGAAGGCCTTCGGGTTGTAAAGGACTTTTGTCAGGGAAGAAAAGGCTGTTGCTAATATCGACAGCTGATGACGGTACCTGAAGAATAAGCACCGGCTAACTACGTG'
#s1 <- 'ATAACG'
#s2 <- 'ACCG'
s1 <- gsub("-", "", s1)
s2 <- gsub("-", "", s2)
alignment <- pairwiseAlignment(s1, s2, gapOpening = -2, gapExtension = -8, scoreOnly = FALSE)
s1a <- as.character(pattern(alignment))
s2a <- as.character(subject(alignment))
mismatch_map <- strsplit(c(s1a, s2a), split= '')
num_mismatches <- length(which(mismatch_map[[1]] != mismatch_map[[2]]))
return(100 - (num_mismatches * 100 / nchar(s1a)))
}
OLIGO_DIST <- function(df, oligos, labels=c(), otu_limits=TRUE){
#oligos <- strep_oligotypes_v3v5
#df <- df_v3v5_seqs
#labels <- strep_labels_v3v5
# df = df_v1v3_seqs
# oligos = man_oligos_v1v3
# labels = man_labels_v1v3
df_subset <- df[df$OLIGO %in% oligos, ]
dfx <- data.frame(OLIGO=character(),
REP_SEQ=character(),
stringsAsFactors=FALSE)
names(dfx) <- c('OLIGO', 'REP_SEQ')
N <- 0
for(i in seq(1, length(oligos))){
oligo <- oligos[i]
if(length(labels) > 0){
label <- labels[i]
} else {
label <- oligos[i]
}
N <- N + 1
dfx[N, ] <- c(OLIGO = label, REP_SEQ = as.character(df_subset[df_subset$OLIGO == oligo, ]$REP_SEQ))
}
df_subset <- dfx
if(length(labels) > 0){
df_subset$OLIGO <- ordered(df_subset$OLIGO,levels=labels)
} else {
df_subset$OLIGO <- ordered(df_subset$OLIGO,levels=oligos)
}
df_subset$OLIGO <- factor(df_subset$OLIGO)
num_oligos <- nrow(df_subset)
dist_mat <- matrix(nrow=num_oligos, ncol=num_oligos)
colnames(dist_mat) <- df_subset$OLIGO
rownames(dist_mat) <- df_subset$OLIGO
for(i in 1:num_oligos) {
for(j in 1:num_oligos){
o1 <- df_subset$OLIGO[i]
o2 <- df_subset$OLIGO[j]
dist_mat[i,j] <- DIST_BETWEEN_TWO_SEQS(df_subset[o1, ]$REP_SEQ, df_subset[o2, ]$REP_SEQ)
}
}
updated_df <- melt(dist_mat)
names(updated_df) <- c('OLIGO1', 'OLIGO2', 'DIST')
# find the best order
d <- vegdist(dist_mat, method="canberra")
fit <- hclust(d, method="complete") #, "single", "complete", "average", "mcquitty", "median" or "centroid"
oligos_order <- fit$labels[fit$order]
# set the order of samples based on clustering results:
updated_df$OLIGO1 <- factor(updated_df$OLIGO1, levels=rev(oligos_order))
updated_df$OLIGO2 <- factor(updated_df$OLIGO2, levels=oligos_order)
p <- ggplot(updated_df, aes(OLIGO1, OLIGO2, fill = DIST))
p <- p + geom_tile()
if(otu_limits){
p <- p + scale_fill_gradient2(low = "steelblue", high = "red", mid="white", midpoint=98.5, limits=c(97,100), na.value='#bbfdc6')
} else {
p <- p + scale_fill_gradient2(low = "steelblue", high = "red", mid="white")
}
p <- p + theme(axis.text.x = element_blank(), legend.position = 'right', axis.ticks.x = element_blank(), panel.grid.major = element_blank(), panel.grid.minor = element_blank())
print(p)
}
STANDARD_FIG <- function(df, oligos, sites, title, labels, colors, cluster_site = 'SUBP', metric='horn', method = 'ward'){
PLOT_OLIGO_PER_SITE_STACKBAR(df, oligos, sites = sites, title = title, labels= labels, colors = colors)
samples_order <- PLOT_OLIGO_EVERYWHERE(df, oligos, cluster_site = cluster_site, sites = sites, method = method, title = title, labels = labels, colors = colors, metric = metric)
PLOT_OLIGO_PERCENT(df, oligos, samples_order, site = 'KG', title = paste(title, ' in KG', sep=''))
PLOT_OLIGO_PERCENT(df, oligos, samples_order, site = 'SUBP', title = paste(title, ' in SUBP', sep=''))
PLOT_OLIGO_PERCENT(df, oligos, samples_order, site = 'TD', title = paste(title, ' in TD', sep=''))
}
########################################################################################################
# DATA DATA DATA DATA DATA DATA DATA DATA DATA DATA DATA DATA DATA DATA DATA DATA DATA DATA DATA DATA
########################################################################################################
setwd("~/Dropbox/Paper_Drafts/OralMicrobiota/analysis-v3v5/")
df_v3v5 <- as.data.frame(read.csv('all.env', header=TRUE, sep="\t"))
df_v3v5_seqs <- as.data.frame(read.csv('all-seqs.env', header=TRUE, sep="\t"))
results_v3v5 <- read.table(file = 'results-expanded.txt', header = TRUE , sep = "\t" , quote = '"')
setwd("~/Dropbox/Paper_Drafts/OralMicrobiota/analysis-v1v3/")
df_v1v3 <- as.data.frame(read.csv('all.env', header=TRUE, sep="\t"))
df_v1v3_seqs <- as.data.frame(read.csv('all-seqs.env', header=TRUE, sep="\t"))
all_sites = c('BM', 'HP', 'KG', 'PT', 'SUBP', 'SUPP', 'SV', 'TD', 'TH', 'ST')
oral_sites = c('BM', 'HP', 'KG', 'PT', 'SUBP', 'SUPP', 'SV', 'TD', 'TH')
base_colors <- c('#FF0000', '#FFCC33', '#FFFF00', '#00FF00', '#009900', '#00FFFF', '#3399FF', '#0000CC', '#FF00FF', '#9933FF', '#009999', '#990099', '#999900', '#990000')
levels(df_v3v5$genus)
#"manhattan", "euclidean", "canberra", "bray", "kulczynski", "jaccard", "gower", "morisita", "horn", "mountford", "raup" , "binomial" or "chao"
# "ward", "single", "complete", "average", "mcquitty", "median" or "centroid"
#PLOT_SITE(df, c('Firmicutes'), 'SUBP', 'horn')
#PLOT_GENUS(df, 'Firmicutes', 'SUPP', 'horn')
########################################################################################################
# FIGURES
########################################################################################################
# FIGURE SCATTER V1V3
results <- read.table(file = '~/Dropbox/Paper_Drafts/OralMicrobiota/analysis-v1v3/scatter-plot-data/final_results.txt', header = TRUE , sep = "\t" , quote = "", row.names=1)
results <- results[with(results, order(-OBSERVED_PERCENT_OF_PATIENTS_ORAL_CAVITY)), ]
min(results$BEST_HIT_ID)
pdf('~/scatterx-v1v3.pdf', width=32, height=16)
p <- ggplot(data=results, aes(x=MEAN_ABUNDANCE_IN_ORAL, y=OBSERVED_PERCENT_OF_PATIENTS_ORAL_CAVITY,color=PHYLUM, shape=PHYLUM, fill=PHYLUM))
p <- p + geom_point(size=5)
p <- p + theme(axis.text.x = element_text(hjust = 1, vjust = 1/2), legend.position="right")
p <- p + scale_x_sqrt(breaks=c(0.01,0.1,1,2,4,6,8,10))
p <- p + scale_y_continuous(breaks=c(10,20,30,40,50,60,70,80,90,100))
p <- p + scale_colour_manual(values = c('#FF0000', '#FFCC33', '#444400', '#00FF00', '#009900', '#00DDDD', '#3399FF', '#0000CC', '#FF00FF'))
p <- p + scale_shape_manual(values=c(7, 9, 10, 12, 13, 14, 2, 6, 7))
p <- p + theme_bw()
print(p)
dev.off()
# FIGURE SCATTER V1V3
results <- read.table(file = '~/Dropbox/Paper_Drafts/OralMicrobiota/analysis-v3v5/scatter-plot-data/final_results.txt', header = TRUE , sep = "\t" , quote = "", row.names=1)
results <- results[with(results, order(-OBSERVED_PERCENT_OF_PATIENTS_ORAL_CAVITY)), ]
min(results$BEST_HIT_ID)
pdf('~/scatterx-v3v5.pdf', width=32, height=16)
p <- ggplot(data=results, aes(x=MEAN_ABUNDANCE_IN_ORAL, y=OBSERVED_PERCENT_OF_PATIENTS_ORAL_CAVITY,color=PHYLUM, shape=PHYLUM, fill=PHYLUM))
p <- p + geom_point(size=5)
p <- p + theme(axis.text.x = element_text(hjust = 1, vjust = 1/2), legend.position="right")
p <- p + scale_x_sqrt(breaks=c(0.01,0.1,1,2,4,6,8,10))
p <- p + scale_y_continuous(breaks=c(10,20,30,40,50,60,70,80,90,100))
p <- p + scale_colour_manual(values = c('#FF0000', '#FFCC33', '#444400', '#00FF00', '#009900', '#00DDDD', '#3399FF', '#0000CC', '#FF00FF'))
p <- p + scale_shape_manual(values=c(7, 9, 10, 12, 13, 14, 2, 6, 7))
p <- p + theme_bw()
print(p)
dev.off()
# FINAL STREP FIGURE WITH ALL SITES
final_strep_v3v5 <- c("Firmicutes_GGCTCATTACTTGGTTTCCCTTCTACGT", "Firmicutes_GGCTCATTACTTGGTTTCTCTTCTACGT", "Firmicutes_GGCTCATTGCTTGGTTTCCCTTCTACGT", "Firmicutes_GGTTCGTTATTTGATTCCCCTTCTACGT", "Firmicutes_GGTTCGTTGTTTGACTCCCCTTCTACGT", "Firmicutes_GGTTCATTGTTTGACTCCCCTTCTACGT", "Firmicutes_GGTTCATTGTTTGACTCCTCTTCTACGT", "Firmicutes_GGTTCGTTGTTTGACTTCCCTTCTACGT", "Firmicutes_GGTTCATTGCTTGGCTTCCCTTCTACGT", "Firmicutes_GGTTCGTTGCTTGGCTTCCCTTCTACGT", "Firmicutes_GGTTCGTTGCTTGGCTTCGCTACTACGT")
final_strep_labels <- c("S. mitis / oralis / perosis", "S. infantis", "S. sp. (055).", "S. salivarius / vestibularis", "S. anginosus", "S. parasanguinis / cristatus / australis / sinensis", "S. oligofermentans", "S. intermedius / constellatus", "S. sanguinis / agalactiae", "S. gordonii", "S. mutans")
final_strep_colors <- c("#FF0000", "#FFCC33", "#9933FF", "#FFFF00", "#009999", "#00FF00", "#009900", "#00FFFF", "#0000CC", "#3399FF", "#FF00FF")
pdf('~/final_strep_v3v5.pdf', width=12, height=6)
STANDARD_FIG(df=df_v3v5,
oligos= final_strep_v3v5,
sites = oral_sites,
title = 'Strep all sites',
labels = final_strep_labels,
colors = final_strep_colors,
cluster_site = 'SUBP',
metric = 'horn',
method = 'ward')
OLIGO_DIST(df_v3v5_seqs, final_strep_v3v5, final_strep_labels)
dev.off()
# FINAL FIGURE 4
fuso_l = c("F. nucleatum ss animalis", "F. nucleatum ss vincentii I", "F. nucleatum ss vincentii II", "F. nucleatum ss polymorphum", "F. periodonticum 98.8%", "F. periodonticum")
fuso_o = c("Fusobacteria_ACTGTAGGGCTATGAGAT", "Fusobacteria_ACTGTCGGGCTATGAGAT", "Fusobacteria_ACTGTA-AACTATGAGAT", "Fusobacteria_ACTGTA-AACTATAAGAT", "Fusobacteria_ACTGTAAAGCTAT-AGAT", "Fusobacteria_ACTGTAAAGCTCTAAGAT")
fuso_c = c("#3399FF", "#00FF00", "#009900", "#00FFFF", "#FF0000", "#FFCC33")
porph_l = c("P. sp. (277, 278)", "P. catoniae 98.6%", "P. catoniae 97.2%", "P. catoniae 99.5%", "P. catoniae 98.6%", "P. endodontalis", "P. sp. 98.1% (279)", "P. sp. 98.6% v1 (279)", "P. sp. 99.1% (279)", "P. sp. 99.5% v1 (279)", "P. sp. 98.6% v2 (279)", "P. sp. 99.5% v2 (279)", "P. sp. (279)", "P. sp. 99.1% (279)", "P. gingivalis")
porph_o = c("Bacteroidetes_CAGCT-TAGAGGGTCCTCCCCC", "Bacteroidetes_CAGCT-TAGAAGGTCCGTCCCC", "Bacteroidetes_CCGCT-TAGGAGGTCCGTCCCC", "Bacteroidetes_CAGCT-TAGAAGGTCCATCCCC", "Bacteroidetes_CAGCTTATAAAAGTCTGTCCCC", "Bacteroidetes_CAGCCCATAGCGCTGCTGTCCC", "Bacteroidetes_CCGCT-TTAAGGTTACTCCGTC", "Bacteroidetes_CAGCTTATAAGGTTACCCCGTC", "Bacteroidetes_CAGCTTATAAGGTTACTCCGTC", "Bacteroidetes_CAGCT-TTAAGGTAACTCCGTC", "Bacteroidetes_CAGCT-TTAAGGCAGCTCCGTC", "Bacteroidetes_CAGCT-TTAAGGTTGCTCCGTC", "Bacteroidetes_CAGCT-TTAAGGTTACTCCGTC", "Bacteroidetes_CAGCT-TTAAGGCTGCTCCGTC", "Bacteroidetes_CAGCACAAAGGGAATTGCCGAC")
porph_c = c("#999999", "#9933FF", "#00FFFF", "#3399FF", "#0000CC", "#FF00FF", "#00FF00", "#009900", "#009999", "#990000", "#CC6633", "#FFFF00", "#FFCC33", "#FF0000", "#000000")
camp_l = c("C. showae 99.6%","C. showae 99.2%","C. gracilis","C. concisus 99.6% v1","C. concisus 99.2% v1","C. concicus 99.6% v2","C. concisus 99.2% v2","C. concisus")
camp_o = c("Epsilonproteobacteria_GTTAACGCACGTTG","Epsilonproteobacteria_GCTAACGCACGTTG","Epsilonproteobacteria_GCTGACACCCATTC","Epsilonproteobacteria_GTCAACGCACGTTC","Epsilonproteobacteria_GTCAACGCATACTC","Epsilonproteobacteria_GTCAACGCACACTC","Epsilonproteobacteria_GTCAACGCACACCC","Epsilonproteobacteria_GTCAACGCACATTC")
camp_c = c("#FF0000","#FFCC33","#FFFF00","#FF00FF","#3399FF","#00FF00","#009900","#00FFFF")
# FIGURE MOST ABUNDANT NEISSERIA FIGURE MOST ABUNDANT NEISSERIA FIGURE MOST ABUNDANT NEISSERIA FIGURE MOST ABUNDANT NEISSERIA FIGURE MOST ABUNDANT NEISSERIA FIGURE MOST ABUNDANT NEISSERIA
# V1V3
man_labels_v1v3 = c('Neisseria elongata 99.6% (598)',
'Neisseria flavescens (610)',
'Neisseria flavescens 98.8% (610)',
'Neisseria flavescens 99.6% (610)',
'Neisseria oralis (016, 015, 014)',
'Neisseria polysaccharea / meningitidis (737, 669))',
'Neisseria sicca / mucosa / flava / oralis (764, 682, 609, 014)',
'Neisseria subflava (476)')
man_oligos_v1v3 = c('Betaproteobacteria_ATGCCGAGGCCGTTCAA',
'Betaproteobacteria_ATGCCCTGTTGACGCGA',
'Betaproteobacteria_ATGCCCTGCCGACGTGA',
'Betaproteobacteria_ATGCCCTGTCGACGCGA',
'Betaproteobacteria_ATGCCGCGTCATTCGGA',
'Betaproteobacteria_ATGCCCTGTTAGCGCGA',
'Betaproteobacteria_ATGCCGCGGCCCTTCGA',
'Betaproteobacteria_ATGCCATAGCCCTTCGA')
man_sites_v1v3 = c('KG', 'SUBP', 'PT')
man_colors_v1v3 <- base_colors[1:length(man_oligos_v1v3)]
pdf('~/man_v1v3.pdf', width=12, height=6)
STANDARD_FIG(df=df_v1v3,
oligos= man_oligos_v1v3,
sites = oral_sites,
title = 'most abundant Neisseria v1v3',
labels = man_labels_v1v3,
colors = man_colors_v1v3,
cluster_site = 'SUBP',
metric = 'horn',
method = 'ward')
OLIGO_DIST(df_v1v3_seqs, man_oligos_v1v3, man_labels_v1v3)
dev.off()
# FIGURE MOST ABUNDANT NEISSERIA FIGURE MOST ABUNDANT NEISSERIA FIGURE MOST ABUNDANT NEISSERIA FIGURE MOST ABUNDANT NEISSERIA FIGURE MOST ABUNDANT NEISSERIA FIGURE MOST ABUNDANT NEISSERIA
# V3V5
man_oligos_v3v5 = c("Betaproteobacteria_GAATTTTA-CTCTATCT",
"Betaproteobacteria_GAACTTTA-TTCTATTT",
"Betaproteobacteria_GAACTTTA-TTTCATTT",
"Betaproteobacteria_GAACCTTA-CTCTAACT",
"Betaproteobacteria_GAACCATA-CTCTAACT")
man_labels_v3v5 = c("N. elongata (HOT-598)",
"N. flavescens / subflava (610, 476)",
"N. flavescens / subflava 99.2% (610, 476)",
"N. pharyngis (HOT-729)",
"N. sicca / mucosa / flava (764, 682, 609, 015)")
man_sites_v3v5 = c('KG', 'SUBP', 'PT')
man_colors_v3v5 <- base_colors[1:length(man_oligos_v3v5)]
pdf('~/man_v3v5.pdf', width=12, height=6)
STANDARD_FIG(df=df_v3v5,
oligos= man_oligos_v3v5,
sites = oral_sites,
title = 'most abundant Neisseria v3v5',
labels = man_labels_v3v5,
colors = man_colors_v3v5,
cluster_site = 'SUBP',
metric = 'horn',
method = 'ward')
OLIGO_DIST(df_v3v5_seqs, man_oligos_v3v5, man_labels_v3v5)
dev.off()
# FIGURE HOT-279 V1V3 HOT-279 V1V3 HOT-279 V1V3 HOT-279 V1V3 HOT-279 V1V3 HOT-279 V1V3 HOT-279 V1V3 HOT-279 V1V3 HOT-279 V1V3 HOT-279 V1V3 HOT-279 V1V3 HOT-279 V1V3 HOT-279 V1V3
h2_labels_v1v3 = c("HOT 279 V1 100% ",
"HOT 279 V2 99.5%",
"HOT 279 V3 99.5%",
"HOT 279 V4 99.1%",
"HOT 279 V5 99.1%",
"HOT 279 V6 98.1%",
"HOT 279 V7 98.6%")
h2_oligos_v1v3 = c("Bacteroidetes_CAGCT-TTAAGGTTACTCCGTC",
"Bacteroidetes_CAGCT-TTAAGGTTGCTCCGTC",
"Bacteroidetes_CAGCT-TTAAGGTAACTCCGTC",
"Bacteroidetes_CAGCT-TTAAGGCTGCTCCGTC",
"Bacteroidetes_CAGCTTATAAGGTTACTCCGTC",
"Bacteroidetes_CAGCTTATAAGGTTACCCCGTC",
"Bacteroidetes_CCGCT-TTAAGGTTACTCCGTC")
h2_sites_v1v3 = c('KG', 'SUBP', 'PT')
h2_colors_v1v3 <- base_colors[1:length(h2_oligos_v1v3)]
pdf('~/h2_v1v3.pdf', width=12, height=6)
STANDARD_FIG(df=df_v1v3,
oligos= h2_oligos_v1v3,
sites = oral_sites,
title = 'HOT 279 v1v3',
labels = h2_labels_v1v3,
colors = h2_colors_v1v3,
cluster_site = 'SUBP',
metric = 'horn',
method = 'ward')
OLIGO_DIST(df_v1v3_seqs, h2_oligos_v1v3, h2_labels_v1v3)
dev.off()
# FIGURE HOT-279 V3V5 HOT-279 V3V5 HOT-279 V3V5 HOT-279 V3V5 HOT-279 V3V5 HOT-279 V3V5 HOT-279 V3V5 HOT-279 V3V5 HOT-279 V3V5 HOT-279 V3V5 HOT-279 V3V5 HOT-279 V3V5
h2_oligos_v3v5 = c("Bacteroidetes_AGTATCAGAAGATA-TACAGTAGTTATG", "Bacteroidetes_GGTATCAGAAGATA-TACAGTAGTTATG")
h2_labels_v3v5 = c("HOT 279 V1 100%", "HOT 279 V2 98.3%")
h2_sites_v3v5 = c('KG', 'SUBP', 'PT')
h2_colors_v3v5 <- base_colors[1:length(h2_oligos_v3v5)]
pdf('~/h2_v3v5.pdf', width=12, height=6)
STANDARD_FIG(df=df_v3v5,
oligos= h2_oligos_v3v5,
sites = oral_sites,
title = 'HOT 279 v3v5',
labels = h2_labels_v3v5,
colors = h2_colors_v3v5,
cluster_site = 'KG',
metric = 'horn',
method = 'ward')
OLIGO_DIST(df_v3v5_seqs, h2_oligos_v3v5, h2_labels_v3v5)
dev.off()
# FIGURE Strep mitis variants V1V3
sm_oligos_v1v3 = c("Firmicutes_-CTCAGTGTTGCGAGTGGTAGTTCATACTTCG-",
"Firmicutes_-CTTAGTGTTGCGAGTGGTAGTTCACACTTCG-",
"Firmicutes_-CTCAGTATTGCGAGTGGTAGTTCACACTTCG-",
"Firmicutes_-CTCAGTGTTGCGAGTGGTAGTTCACGCTTCG-",
"Firmicutes_-CTCAGTGTTGTGAGTGGTAGTTCACACTTCG-")
sm_labels_v1v3 = c("S. mitis V1 99.6%",
"S. mitis V2 99.6%",
"S. mitis V3 99.6%",
"S. mitis V4 99.6%",
"S. mitis V5 99.6%")
sm_sites_v1v3 = c('KG', 'SUBP', 'PT')
sm_colors_v1v3 <- base_colors[1:length(sm_oligos_v1v3)]
pdf('~/sm_v1v3.pdf', width=12, height=6)
STANDARD_FIG(df=df_v1v3,
oligos= sm_oligos_v1v3,
sites = oral_sites,
title = 'Strep mitis variants v1v3',
labels = sm_labels_v1v3,
colors = sm_colors_v1v3,
cluster_site = 'KG',
metric = 'horn',
method = 'ward')
OLIGO_DIST(df_v1v3_seqs, sm_oligos_v1v3, sm_labels_v1v3)
dev.off()
# FIGURE STREP OLIGOS V3V5
strep_oligos_v3v5 = c('Firmicutes_GGCTCATTACTTGGTTTCCCTTCTACGT', 'Firmicutes_GGCTCATTACTTGGTTTCTCTTCTACGT', 'Firmicutes_GGTTCGTTATTTGATTCCCCTTCTACGT', 'Firmicutes_GGTTCATTGTTTGACTCCCCTTCTACGT', 'Firmicutes_GGTTCATTGCTTGGCTTCCCTTCTACGT', 'Firmicutes_GGTTCGTTGCTTGGCTTCCCTTCTACGT', 'Firmicutes_GGTTCGTTGTTTGACTTCCCTTCTACGT', 'Firmicutes_GGTTCGTTGCTTGGCTTCGCTACTACGT')
strep_labels_v3v5 = c('S.peroris (728) / S.oralis (707) / S. mitis (677, 398) / S. sp. (431, 423, 071)', 'S. infantis (638) / S. sp. (486, 074, 061, 058)', 'S. salivarius (755) / S. vestibularis (021)', 'S. sinensis (767) / S. parasanguinis I (721) / S. parasanguinis II (411) / S. cristatus (578) / S. australis(073) / S. sp. (069, 067)', 'S. sanguinis (758) / S. agalactiae (537)', 'S. gordonii (622) / S. sp. (056)', 'S. intermedius (644) / S. constellatus (576)', 'S. mutans (686)')
strep_sites_v3v5 = c('SUBP', 'SUPP', 'BM', 'HP', 'KG', 'PT', 'SV', 'TD', 'TH')
strep_colors_v3v5 <- base_colors[1:length(strep_oligos_v3v5)]
pdf('~/strep_oligos_v3v5.pdf', width=12, height=6)
STANDARD_FIG(df=df_v3v5,
oligos= strep_oligos_v3v5,
sites = oral_sites,
title = 'STREP SITE v3v5',
labels = strep_labels_v3v5,
colors = strep_colors_v3v5,
cluster_site = 'SUBP',
metric = 'horn',
method = 'ward')
OLIGO_DIST(df_v3v5_seqs, strep_oligos_v3v5, strep_labels_v3v5)
dev.off()
# FIGURE STREP OLIGOS V1V3
strep_oligos_v1v3 = c('Firmicutes_-CTCAGTGTTGCGAGTGGTAGTTCACACTTCG-',
'Firmicutes_-CTCAATGTTGCGAGTGGTAGTTCACACTTCG-',
'Firmicutes_-CTCTGTCCTCCGAGTGGTAGTTCACACGTCG-',
'Firmicutes_-CTCAGTGTTGCGAGTGGTAGTTCACACATCG-',
'Firmicutes_-CTCAGTCCTGCGAGTGGTAGTTCACACATCG-',
'Firmicutes_-CTCAGTGCTGCGGGTGGTAGTTCACACTTCG-',
'Firmicutes_-CTCTGTATTGCGGGTGGTAGTTCACACTTCG-',
'Firmicutes_-CTCTGTGCTGCGAGTGGTAGTTCATACCACG-',
'Firmicutes_-GTTAGTATTCCGTGTGGTAGTTCACACGTCG-',
'Firmicutes_-CTCAGTCCTGCGAGTGGTAGTTCACGCTTCG-',
'Firmicutes_-CTCAATCCTGCGAGTGGTAGTTCACACATCG-',
'Firmicutes_-CTCTGTCCTGCGAGTGGTAGTTCACACTTCG-')
strep_labels_v1v3 = c('S. mitis I / mitis II / australis / pneumoniae',
'S. oralis / infantis / mitis bv II',
'S. salivarius / S. vestibularis',
'S. parasanguinis I',
'S. parasanguinis II',
'S. sanguinis',
'S. gordonii',
'S. intermedius',
'S. mutans',
'S. peroris',
'S. sp. (HOT-065)',
'S. sp. (HOT-056)')
strep_sites_v1v3 = c('SUBP', 'SUPP', 'BM', 'HP', 'KG', 'PT', 'SV', 'TD', 'TH')
strep_colors_v1v3 <- base_colors[1:length(strep_oligos_v1v3)]
pdf('~/strep_oligos_v1v3.pdf', width=12, height=6)
STANDARD_FIG(df=df_v1v3,
oligos= strep_oligos_v1v3,
sites = oral_sites,
title = 'STREP SITE v1v3',
labels = strep_labels_v1v3,
colors = strep_colors_v1v3,
cluster_site = 'SUBP',
metric = 'horn',
method = 'ward')
OLIGO_DIST(df_v1v3_seqs, strep_oligos_v1v3, strep_labels_v1v3)
dev.off()
# FIGURE 4 FIGURE 4 FIGURE 4 FIGURE 4 FIGURE 4 FIGURE 4 FIGURE 4 FIGURE 4 FIGURE 4 FIGURE 4 FIGURE 4 FIGURE 4 FIGURE 4 FIGURE 4 FIGURE 4 FIGURE 4 FIGURE 4 Simonsiella muelleri
samples_of_interest <- c('158216035', '159227541', '159470302', '160016515', '404239096', '432193348', '533247696', '604812005', '638754422', '737052003', '763880905', '764002286', '764224817', '764649650', '764892411', '765034022', '892969023')
simonsiella_oligos <- c('Betaproteobacteria_GAACTTAA-TGCTAATT', 'Betaproteobacteria_GAGCTTAA-TGCTAATT')
simonsiella_labels <- c('Simonsiella muelleri 98.7%', 'Simonsiella muelleri 98.7%')
simonsiella <- read.table(file = '~/Dropbox/Paper_Drafts/OralMicrobiota/analysis/Simonsiella.env', header = TRUE , sep = "\t" , quote = "")
simonsiella_matrix <- acast(simonsiella, SAMPLE~OLIGO, value.var="ABUNDANCE", fill = 0)
d <- vegdist(simonsiella_matrix, method='horn')
fit <- hclust(d, method='ward') # "ward", "single", "complete", "average", "mcquitty", "median" or "centroid"
samples_order <- fit$labels[fit$order]
simonsiella$SAMPLE <- ordered(simonsiella$SAMPLE,levels=samples_order)
pdf('~/simonsiella.pdf')
p <- ggplot(simonsiella, aes(x=factor(SAMPLE), y=ABUNDANCE, group=OLIGO, fill = OLIGO))
p <- p + geom_bar(position="fill", stat = "identity", width=0.90, colour = 'black')
p <- p + theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5), axis.ticks.y = element_blank(), legend.position = 'right', panel.grid.major = element_blank(), panel.grid.minor = element_blank())
ccc <- sample(heat.colors(2))
p <- p + scale_fill_manual(values = ccc)
print(p)
dev.off()
# FIGURE 2 FIGURE 2 FIGURE 2 FIGURE 2 FIGURE 2 FIGURE 2 FIGURE 2 FIGURE 2 FIGURE 2 FIGURE 2
hot_279_oligos = c('Bacteroidetes_AGTATCAGAAGATA-TACAGTAGTTATG', 'Bacteroidetes_GGTATCAGAAGATA-TACAGTAGTTATG', 'Bacteroidetes_AGTATCAGATGATA-TACAGTAGTTATG', 'Bacteroidetes_AGTATCAGAAGATA-TACGGTAGTTATG')
hot_279_labels = c('HOT 279 V1 100%', 'HOT 279 V2 98.3%', 'HOT 279 V3 99.6%', 'HOT 279 V4 99.6%')
hot_279_title = 'Distribution of HOT 279 Variants'
hot_279_sites = c('KG', 'SUBP', 'PT')
#saveRDS(hot_279_colors, file="~/Dropbox/Paper_Drafts/OralMicrobiota/figures/hot_279_colors")
hot_279_colors <- readRDS("~/Dropbox/Paper_Drafts/OralMicrobiota/figures/hot_279_colors")
pdf('~/oral_stuff_hot_279.pdf', width=20, height=8)
PLOT_OLIGO_PER_SITE_STACKBAR(df_v3v5, hot_279_oligos, sites = strep_sites_v3v5, title = hot_279_labels, labels=hot_279_labels, colors = hot_279_colors)
samples_order <- PLOT_OLIGO_EVERYWHERE(df_v3v5, hot_279_oligos, cluster_site = 'KG', sites = hot_279_sites, method = 'complete', title = hot_279_title, labels = hot_279_labels, colors = hot_279_colors)
PLOT_OLIGO_PERCENT(df_v3v5, hot_279_oligos, samples_order, site = 'KG', title = paste(hot_279_title, ' in KG', sep=''))
PLOT_OLIGO_PERCENT(df_v3v5, hot_279_oligos, samples_order, site = 'SUBP', title = paste(hot_279_title, ' in SUBP', sep=''))
PLOT_OLIGO_PERCENT(df_v3v5, hot_279_oligos, samples_order, site = 'PT', title = paste(hot_279_title, ' in PT', sep=''))
dev.off()
# FIGURE 3 PARAINFLUENZA V3V5
figure_3_oligos = rev(c('Gammaproteobacteria_ATAGAATTTGTTCTC', 'Gammaproteobacteria_ATAGAATTTGATCTC', 'Gammaproteobacteria_ATAAAGTTTGACCTC', 'Gammaproteobacteria_ATAGAGTTTGATCTC', 'Gammaproteobacteria_ATAGAGTTTGACCTC', 'Gammaproteobacteria_ACAGAATTTAATCTC', 'Gammaproteobacteria_ATAGAATATAATCTC', 'Gammaproteobacteria_ATAGAATTTAGTCTC', 'Gammaproteobacteria_ATAGAATTTAATCTC', 'Gammaproteobacteria_ATAGAGTATAATCTC', 'Gammaproteobacteria_ATAGGGTTTAACCTC', 'Gammaproteobacteria_AAAGAGTTTAACCTC', 'Gammaproteobacteria_ATAGAGTTTAATCTC', 'Gammaproteobacteria_ATAGAGTTTAACCTC'))
figure_3_labels = rev(c('T. aromaticivorans V5 98.7', 'T. aromaticivorans V4 99.2', 'T. aromaticivorans V3 99.6', 'T. aromaticivorans V2 99.6', 'T. aromaticivorans V1 100', 'H. parainfluenzae V9 98.7', 'H. parainfluenzae V8 98.7', 'H. parainfluenzae V7 98.7', 'H. parainfluenzae V6 99.2', 'H. parainfluenzae V5 99.2', 'H. parainfluenzae V4 99.6', 'H. parainfluenzae V3 99.6', 'H. parainfluenzae V2 99.6', 'H. parainfluenzae V1 100'))
figure_3_title = 'Distribution of T. aromaticivorans and H. parainfluenzae'
figure_3_sites = c( 'SUBP', 'SUPP','BM', 'HP', 'KG', 'PT', 'SV', 'TD', 'TH')
figure_3_colors <- base_colors[1:length(figure_3_oligos)]
pdf('~/terra-v3v5.pdf', width=12, height=6)
STANDARD_FIG(df=df_v3v5,
oligos= figure_3_oligos,
sites = oral_sites,
title = figure_3_title,
labels = figure_3_labels,
colors = figure_3_colors,
cluster_site = 'SUBP',
metric = 'horn',
method = 'ward')
OLIGO_DIST(df_v3v5_seqs, figure_3_oligos, figure_3_labels)
dev.off()
# FIGURE 5, SUBP - PT vs SUPP vs PT
results <- results_v3v5
subp_preferred <- as.data.frame(read.csv('SUBP-preferred-taxa-v3v5.txt', header=TRUE, sep="\t"))
supp_preferred <- as.data.frame(read.csv('SUPP-preferred-taxa-v3v5.txt', header=TRUE, sep="\t"))
subp_associated <- results[results$OLIGO %in% subp_preferred$OLIGO & grepl("SUBP", results$ABUNDANT_IN) == 1, ]
supp_associated <- results[results$OLIGO %in% supp_preferred$OLIGO & grepl("SUPP", results$ABUNDANT_IN) == 1, ]
nrow(subp_associated)
nrow(supp_associated)
pdf("~/subp_pt.pdf", width=10, height=10)
ggplot(data=subp_associated, aes(x=SUBP, y=PT)) + geom_point(aes(size=SUBP)) + stat_smooth(method='lm', size=2) + scale_size(range = c(2, 10)) + theme_bw() + theme(legend.position="none")
ggplot(data=supp_associated, aes(x=SUPP, y=PT)) + geom_point(aes(size=SUPP)) + stat_smooth(method='lm', size=2) + scale_size(range = c(2, 10)) + theme_bw() + theme(legend.position="none")
dev.off()
subp_fit <- lm(subp_associated$SUBP ~ subp_associated$PT)
summary(subp_fit)
supp_fit <- lm(supp_associated$SUBP ~ supp_associated$PT)
summary(supp_fit)
# FIGURE Plaque oligos
# TAXA that is abundant in SUPP and SUBB, versus everywhere else with most differential mean abundance:
f7_title = "Oligotype Differentially Abundant in Plaque"
f7_labels = c('01 Corynebacterium matruchotii 98.6%', '02 Fusobacterium ss. nucleatum / polymorphum', '03 Streptococcus sanguinis / agalactiae', '04 Actinomyces oris / naeslundii', '05 Fusobacterium naviforme / vincentii', '06 Streptococcus infantis', '07 Streptococcus salivarius / vestibularis', '08 Fusobacterium periodonticum', '09 Neisseria flavescens / subflavai 97.9%', '10 Prevotella scopos / melaninogenica')
f7_oligos = c('Actinobacteria_ATGAGAATCCGT--TTTCG', 'Fusobacteria_TGTGGAACAATA', 'Firmicutes_GGTTCATTGCTTGGCTTCCCTTCTACGT', 'Actinobacteria_ACGAATGCGCCTCGCTCTG', 'Fusobacteria_TATGGAACAATA', 'Firmicutes_GGCTCATTACTTGGTTTCTCTTCTACGT', 'Firmicutes_GGTTCGTTATTTGATTCCCCTTCTACGT', 'Fusobacteria_CATGAAACATTA', 'Betaproteobacteria_GAACTTTA-TTCTATTT', 'Bacteroidetes_AGTGCA-TAACCT--GG-TATCATCTCG')
f7_colors = c('#00FF00', '#22FF22', '#44FF44', '#66FF66', '#88FF88', '#FF0000', '#FF2222', '#FF4444', '#FF6666', '#FF8888')
f7_colors = base_colors[1:10]
f7_sites = c( 'SUBP', 'SUPP','BM', 'HP', 'KG', 'PT', 'SV', 'TD', 'TH')
pdf('~/oral-figure-7-draft.pdf', width=32, height=16)
PLOT_OLIGO_PER_SITE_STACKBAR(df_v3v5, f7_oligos, sites = f7_sites, title = f7_title, labels=f7_labels, colors = f7_colors)
samples_order <- PLOT_OLIGO_EVERYWHERE(df_v3v5, f7_oligos, cluster_site = 'SUBP', sites = strep_sites_v3v5, method = 'complete', title = f7_title, labels = f7_labels, colors = f7_colors)
dev.off()
# FIGURE TONSILS PATHOGENS FIGURE TONSILS PATHOGENS FIGURE TONSILS PATHOGENS FIGURE TONSILS PATHOGENS FIGURE TONSILS PATHOGENS FIGURE TONSILS PATHOGENS FIGURE TONSILS PATHOGENS
t_pat_sites = c('PT', 'TD')
t_pat <- as.data.frame(read.csv('SUBP-preferred-taxa-v3v5.txt', header=TRUE, sep="\t"))
t_pat_title = 'Tonsil pathogens V1V3'
plots <- list()
for(i in seq(1:length(t_pat$OLIGO))){
print(i)
plots[[i]] <- PLOT_OLIGO_PER_SITE_BOXES_PERCENTS(df_v3v5, t_pat$OLIGO[i], title=t_pat$NAME[i], sites = t_pat_sites)
}
pdf(paste('~/t_pat_v3v5_sqrt.pdf', sep=''), width=1.8 * length(plots), height=8)
multiplot(plotlist = plots, cols = length(plots))
dev.off()
# FIGURE PORPHYROMONAS
por_labels = c("Porphyromonas catoniae (284,283,277,275)",
"Porphyromonas catoniae (284,283,277,275) 99.6%",
"Porphyromonas catoniae (284,283,277,275) 98.3%",
"Porphyromonas sp. (285)",
"Porphyromonas endodontalis (273) v1",
"Porphyromonas endodontalis (273) v2",
"Porphyromonas sp. (279)",
"Porphyromonas sp. (279) 99.6% v1",
"Porphyromonas sp. (279) 99.6% v2",
"Porphyromonas sp. (279) 98.3%",
"Porphyromonas gingivalis (619)",
"Porphyromonas gingivalis (619) 99.6%")
por_colors <- base_colors[1:length(por_oligos)]
por_oligos = c("Bacteroidetes_AATATCAGAAGATA-TACAGTAGTTATG",
"Bacteroidetes_AATATCAGACGATA-TACAGTAGTTATG",
"Bacteroidetes_GATATCAGAAGATA-TACAGTAGTTATG",
"Bacteroidetes_AGGATCGGAAAATA-TACAGTAGATTTG",
"Bacteroidetes_AGGATCGGAAAATA-TACTGTAGATTTG",
"Bacteroidetes_AGGATCGGAAAATA-TAGTGTAGATTTG",
"Bacteroidetes_AGTATCAGAAGATA-TACAGTAGTTATG",
"Bacteroidetes_AGTATCAGAAGATA-TACGGTAGTTATG",
"Bacteroidetes_AGTATCAGATGATA-TACAGTAGTTATG",
"Bacteroidetes_GGTATCAGAAGATA-TACAGTAGTTATG",
"Bacteroidetes_GGTATG-GAAGATA-TACCGTAGTTATG",
"Bacteroidetes_GGTATG-GAAGATA-TACCGTAATTATG")
pdf("~/ORAL_POR.pdf", width=12, height=6)
STANDARD_FIG(df=df_v3v5,
oligos= por_oligos,
labels = por_labels,
sites = oral_sites,
title = "Porphyromonas oligos",
colors = por_colors,
cluster_site = "KG",
metric = 'horn',
method = 'ward')
OLIGO_DIST(df_v3v5_seqs, por_oligos, por_labels)
dev.off()
########################################################################################################################
########################################################################################################################
#
# We wanted to see the distribution of oligos found in all genera that resulted in more than 2 oligos that are not ST specific.
# Genera of interest came from the GAST column in the results.txt using using this command. Then, we filtered out the ones that
# are specific to ST, and less than 2 oligos using this command:
# for genus in Prevotella Bacteroides Treponema Streptococcus Leptotrichia Capnocytophaga Haemophilus Fusobacterium \
# Porphyromonas Johnsonella Actinomyces Neisseria Selenomonas Corynebacterium Bergeyella Veillonella \
# Roseburia Parabacteroides Oscillibacter Oribacterium Campylobacter Barnesiella Alistipes Aggregatibacter \
# Ruminococcus Rothia Propionibacterium Parasutterella Paraprevotella Moryella Megasphaera Kingella Atopobium \
# Actinobacillus Tannerella Sutterella Subdoligranulum Peptococcus Paludibacter Moraxella Lachnospira Gemella \
# Faecalibacterium Eubacterium Dialister Derxia Coprococcus Comamonas Catonella Butyrivibrio Blautia Actinobaculum
# do
# echo -n $genus "= "
# grep $genus results-expanded.txt | awk 'BEGIN{FS="\t"}{if($31 != "ST")print $1}' | awk '{printf("\"%s\", ", $1)}'; echo
# done | awk '{if(NF>3) print $0}'
# following assignments are based on the output the previous command generated.
#
########################################################################################################################
########################################################################################################################
genera = c("Prevotella", "Bacteroides", "Treponema", "Streptococcus", "Leptotrichia", "Capnocytophaga", "Haemophilus",
"Fusobacterium", "Porphyromonas", "Johnsonella", "Actinomyces", "Neisseria", "Selenomonas", "Corynebacterium",
"Bergeyella", "Veillonella", "Oribacterium", "Campylobacter", "Aggregatibacter", "Rothia", "Propionibacterium",
"Moryella", "Megasphaera", "Kingella", "Atopobium", "Actinobacillus", "Tannerella", "Peptococcus", "Paludibacter",
"Moraxella", "Lachnospira", "Gemella", "Eubacterium", "Derxia", "Comamonas", "Catonella", "Actinobaculum")
genera_oligos = list(c("Bacteroidetes_AGAGTTATGAC-CT-G--CGTTATCTCG", "Bacteroidetes_AGCGCA-GGGCCTT-G--CGCCGTCTCG", "Bacteroidetes_AGCGCCAGGGT-TT-T--GCCTATCTCG", "Bacteroidetes_AGCGCTAGAGCATT-AT-GTCGGTTTTG", "Bacteroidetes_AGCGCTAGGGCATT-AT-ATTGGTTTTG", "Bacteroidetes_AGCGCTAGGGCGTT-TA-GTCGGTTTTG", "Bacteroidetes_AGCGCTAGGGCTTT-AT-GTCGGTTTTG", "Bacteroidetes_AGCGCTAGGGCTTT-TA-GTCGGTTTTG", "Bacteroidetes_AGGGTTATGAC-CT-A--CGTTATCTCG", "Bacteroidetes_AGGGTTATGAC-CT-G--CGTTATCTCG", "Bacteroidetes_AGTACCATGACTTT-TA-GTTCGTCTCG", "Bacteroidetes_AGTGCA-GAATCCT-A--AATTATCTCG", "Bacteroidetes_AGTGCA-GAATCTT-A--AATTATCTCG", "Bacteroidetes_AGTGCA-GAATCTT-T--AATTATCTCG", "Bacteroidetes_AGTGCA-GGAC-CT-A--TATCATCTCG", "Bacteroidetes_AGTGCA-GGAC-CT-G--TATCATCTCG", "Bacteroidetes_AGTGCA-GGAC-GT-G--TATCATCTCG", "Bacteroidetes_AGTGCA-GGAC-TT-G--TATCATCTCG", "Bacteroidetes_AGTGCA-TAACCT--GG-TATCATCTCG", "Bacteroidetes_AGTGCA-TGAC-CT-G--TATCATCTCG", "Bacteroidetes_AGTGCA-TGAC-TT-G--TATCATCTCG", "Bacteroidetes_AGTGCA-TGGC-CT-G--TATCATCTCG", "Bacteroidetes_AGTGCC-TGAC-CT-A--TATCATCTCG", "Bacteroidetes_AGTGCC-TGAC-CT-G--TATCATCTCG", "Bacteroidetes_AGTGCCAGAAC-AT-TA-AGTCATCTCG", "Bacteroidetes_AGTGCCAGAAC-AT-TA-GGTCATCTCG", "Bacteroidetes_AGTGCCAGAGT-TT-TA-GTTTGTCTCG", "Bacteroidetes_AGTGCCAGAGT-TT-TT-AATGATCTCG", "Bacteroidetes_AGTGCCAGGGT-TT-T--AGTCATCTCG", "Bacteroidetes_AGTGCCAGGGT-TT-T--GCCTATCTCG", "Bacteroidetes_AGTGCCAGGGT-TT-T--GTCTATCTCG", "Bacteroidetes_AGTGCCAGGGTCTT-AT-GGTCGTCTCG", "Bacteroidetes_AGTGCCAGGGTCTT-GC-GGTCGTCTCG", "Bacteroidetes_AGTGCCAGGGTTGT-AT-GGTCGTCTCG", "Bacteroidetes_AGTGCCAGGGTTGT-GT-GGTCGTCTCG", "Bacteroidetes_AGTGCCAGGGTTTG-AT-GGTCGTCTCG", "Bacteroidetes_AGTGCCAGGGTTTT-GA-GGCCGTCTCG", "Bacteroidetes_AGTGCCAGGGTTTT-GT-GGTCGTCTCG", "Bacteroidetes_AGTGCCGGGGT-TA-T--GTCTATCTCG", "Bacteroidetes_AGTGCTTGGGTTAT-AA-AGTCATCTCG", "Bacteroidetes_AGTGCTTGGGTTTT-AA-AGTCATCTCG", "Bacteroidetes_AGTGCTTTGACTTT-AT-GGTCGTCTCG", "Bacteroidetes_AGTGCTTTGACTTTAAT-GGTCGTCTCG", "Bacteroidetes_AGTGCTTTGACTTTTAT-GGTCGTCTCG", "Bacteroidetes_AGTGTCAGAAC-TA-GA-AGTAATTTAG", "Bacteroidetes_AGTGTCAGAAC-TA-GA-TGTAATTTAG", "Bacteroidetes_AGTGTCAGGAC-TA-TA-TGTAGTTTTG", "Bacteroidetes_ATTGCTTGGGT--T-T--AGCCATCTCG", "Bacteroidetes_ATTGCTTGGGT-TT-T--AGCCATCTCG", "Bacteroidetes_CGGGTTTTGGC-CT-G--CGTTATCTCG", "Bacteroidetes_GGCGCCTGGGT-CT-T--GGTCGTCTCG", "Bacteroidetes_GGCGCCTGGGT-TT-C--GGTCGTCTCG", "Bacteroidetes_GGCGCCTGGGT-TT-G--GGTCGTCTCG", "Bacteroidetes_GGCGCCTGGGT-TT-T--GGTCGTCTCG", "Bacteroidetes_GGCGTTGTGTC-CC--T-CGCCATCTCG", "Bacteroidetes_GGCGTTTTGAC-CC-CT-CGCCATCTCG", "Bacteroidetes_GGCGTTTTGAC-CG-CT-CGCCATCTCG", "Bacteroidetes_GGCGTTTTGGC-CC-CT-CGCCATCTCG", "Bacteroidetes_GGCGTTTTGTC--G-CT-CGCCATCTCG", "Bacteroidetes_GGCGTTTTGTC-CC--T-CGCCATCTCG", "Bacteroidetes_GGTGCA-GGGC-TT-TT-ATTCGTCTCG", "Bacteroidetes_GGTGCT-GGGC-TT-TT-ATTCGTCTCG", "Bacteroidetes_TATGTCAGAACATA-CAATGTAATTTAG", "Bacteroidetes_TATGTCAGAACATA-GACAGTAATTTAG", "Bacteroidetes_TATGTCAGAACATA-TAATGTAATTTAG", "Bacteroidetes_TATGTCAGAACATA-TATTGTAATTTAG", "Bacteroidetes_TGTGTCAGAACATA-CAATGTAATTTAG", "Bacteroidetes_TGTGTCAGAACATA-CAGTGTAATTTAG", "Bacteroidetes_TGTGTCAGAACATA-GAATGTAATTTAG", "Bacteroidetes_TGTGTCAGAACATA-GACAGTAATTTAG", "Bacteroidetes_TGTGTCAGAACATA-GAGTGTAATTTAG", "Bacteroidetes_TGTGTCAGAACATA-GATTGTAATTTAG", "Bacteroidetes_TGTGTCAGAACATA-TAATGTAATTTAG", "Bacteroidetes_TGTGTCAGAACATA-TAGTGTAATTTAG", "Bacteroidetes_TGTGTCAGAACATA-TATTGTAATTTAG"),
c("Bacteroidetes_AGTGTCAGAAC-TA-GA-AGTAATTTAG", "Bacteroidetes_AGTGTCAGAAC-TA-GA-TGTAATTTAG", "Bacteroidetes_TATGTCAGAACATA-CAATGTAATTTAG", "Bacteroidetes_TATGTCAGAACATA-GACAGTAATTTAG", "Bacteroidetes_TATGTCAGAACATA-TAATGTAATTTAG", "Bacteroidetes_TATGTCAGAACATA-TATTGTAATTTAG", "Bacteroidetes_TGTGTCAGAACATA-CAATGTAATTTAG", "Bacteroidetes_TGTGTCAGAACATA-CAGTGTAATTTAG", "Bacteroidetes_TGTGTCAGAACATA-GAATGTAATTTAG", "Bacteroidetes_TGTGTCAGAACATA-GACAGTAATTTAG", "Bacteroidetes_TGTGTCAGAACATA-GAGTGTAATTTAG", "Bacteroidetes_TGTGTCAGAACATA-GATTGTAATTTAG", "Bacteroidetes_TGTGTCAGAACATA-TAATGTAATTTAG", "Bacteroidetes_TGTGTCAGAACATA-TAGTGTAATTTAG", "Bacteroidetes_TGTGTCAGAACATA-TATTGTAATTTAG"),
c("Spirochaetes_CACACTAACCAAACC", "Spirochaetes_CGCACCACGTCAGCC", "Spirochaetes_CTTCTGACCTAAATC", "Spirochaetes_GACACTCGATAAACC", "Spirochaetes_GACACTCGCTAAGTC", "Spirochaetes_GACACTTGATAAACC", "Spirochaetes_GACAGTACGTAAATC", "Spirochaetes_GACCCCTGCCAAATC", "Spirochaetes_GACCCTAGCTAAATC", "Spirochaetes_GATACTCGATAAACC", "Spirochaetes_GGCACTCAATAGACC", "Spirochaetes_GGCACTCGATAAACC", "Spirochaetes_GGCACTCGATAGACC", "Spirochaetes_TTTACGACTTAATTC", "Spirochaetes_TTTATGAACTAAATC", "Spirochaetes_TTTATGAACTAAATT", "Spirochaetes_TTTATGAACTGAATC", "Spirochaetes_TTTATGAACTGAATT", "Spirochaetes_TTTATGAACTTAATC", "Spirochaetes_TTTATGACTTAAATC", "Spirochaetes_TTTCTGACTTAAATC", "Spirochaetes_TTTCTGCCCTAAATC", "Spirochaetes_TTTCTGCCTTAAATC", "Spirochaetes_TTTCTGGCTTAAATC", "Spirochaetes_TTTTTGACTTAAATC", "Spirochaetes_TTTTTGCCCTAACTC"),
c("Firmicutes_GGCTCATTACTTGGCTTCCCTTCTACGT", "Firmicutes_GGCTCATTACTTGGTTTCACTTCTACGT", "Firmicutes_GGCTCATTACTTGGTTTCCCTTCTACGT", "Firmicutes_GGCTCATTACTTGGTTTCCCTTCTGCGT", "Firmicutes_GGCTCATTACTTGGTTTCTCTTCTACGT", "Firmicutes_GGCTCATTACTTGGTTTCTCTTCTGCGT", "Firmicutes_GGCTCATTATTTGGTTTCCCTTCTACGT", "Firmicutes_GGCTCATTGCTTGGCTTCCCTTCTACGT", "Firmicutes_GGCTCATTGCTTGGTTTCCCTTCTACGT", "Firmicutes_GGCTCATTGCTTGGTTTCTCTTCTACGT", "Firmicutes_GGCTCATTGTTTGACTCCCCTTCTACGT", "Firmicutes_GGTACGTTATTTGATTCCCCTTCTACGT", "Firmicutes_GGTTCATTACTTGGTTTCCCTTCTACGT", "Firmicutes_GGTTCATTACTTGGTTTCTCTTCTACGT", "Firmicutes_GGTTCATTATTTGATTCCCCTTCTACGT", "Firmicutes_GGTTCATTGCTTGGCTTCCCTTCTACGT", "Firmicutes_GGTTCATTGTTTGACTCCCCTTCTACGT", "Firmicutes_GGTTCATTGTTTGACTCCTCTTCTACGT", "Firmicutes_GGTTCATTGTTTGATTCCCCTTCTACGT", "Firmicutes_GGTTCGTTATTTGATTCCCCTTCTACGT", "Firmicutes_GGTTCGTTATTTGATTCCTCTTCTACGT", "Firmicutes_GGTTCGTTGCTTGGCTTCCCTTCTACGT", "Firmicutes_GGTTCGTTGCTTGGCTTCGCTACTACGT", "Firmicutes_GGTTCGTTGTTTGACTCCCCTTCTACGT", "Firmicutes_GGTTCGTTGTTTGACTTCCCTTCTACGT", "Firmicutes_GGTTCGTTGTTTGATTCCCCTTCTACGT"),
c("Fusobacteria_ACAGGGCCTGGC", "Fusobacteria_ATAGGACCTGGC", "Fusobacteria_ATAGGGCCTGGC", "Fusobacteria_ATTAGGTTTAAT", "Fusobacteria_CAGGGATTCAGC", "Fusobacteria_CAGGGATTCTGC", "Fusobacteria_CCCGGGTTCTGC", "Fusobacteria_CCTAGGTTCTGC", "Fusobacteria_CCTGGGCCCAGC", "Fusobacteria_TAAGGATTTATA", "Fusobacteria_TAAGGGCCTGGC", "Fusobacteria_TAGAGGTTTAGC", "Fusobacteria_TATGGATTCTGC", "Fusobacteria_TATGGATTTTAT", "Fusobacteria_TCTGGGTTCAGC", "Fusobacteria_TGAGGACTTAGC", "Fusobacteria_TGAGGACTTGGC", "Fusobacteria_TGAGGGCTTAGC", "Fusobacteria_TGCGGATTCAGC", "Fusobacteria_TGCGGATTCTGC", "Fusobacteria_TGTGGATTCTGC", "Fusobacteria_TTAGGGCTTAGC", "Fusobacteria_TTTGGGCCCAGC", "Fusobacteria_TTTGGGCCCTGC", "Fusobacteria_TTTGGGCCTTGC", "Fusobacteria_TTTGGGTTCAGC"),
c("Bacteroidetes_AGAAATGGGTC-GT-AA-GCTGGATTCG", "Bacteroidetes_AGGAATAGGTC-GG-AA-ATTGGATTCG", "Bacteroidetes_AGGAGA-GGTC-GC-AA-GCTGGATTCG", "Bacteroidetes_AGGAGA-GGTCGAT-AA-GCTGGATTCG", "Bacteroidetes_AGGAGA-GGTCGAT-AA-GTTGGATTCG", "Bacteroidetes_AGGAGG-GGTC-GA-AA-TTTGGATTCG", "Bacteroidetes_AGGAGG-GGTC-GT-AA-TTTGGATTCG", "Bacteroidetes_AGTAAGGGGTC-GC-GA-TCTGGATTCG", "Bacteroidetes_AGTAAGGGGTC-GT-GA-TCTGGATTCG", "Bacteroidetes_AGTAATAGGTC-GA-AA-TTTGGATTCG", "Bacteroidetes_AGTAATAGGTC-GC-AA-TTTGGATTCG", "Bacteroidetes_AGTAATAGGTC-GC-GA-TTTGGATTCA", "Bacteroidetes_AGTAATAGGTC-GT-AA-ACTGGATTCA", "Bacteroidetes_AGTAATAGGTC-GT-AA-TCTGGATTCA", "Bacteroidetes_AGTAATAGGTC-TA-AG-ATTGGATTCA", "Bacteroidetes_AGTAATAGGTC-TT-AA-ACTGGATTCA", "Bacteroidetes_AGTAATAGGTCGT--AA-ACTGGATTCG", "Bacteroidetes_AGTAATAGGTCGTT-AA-ACTGGATTCG", "Bacteroidetes_AGTAATGGGTC-GC-GA-TTTGGATTCA", "Bacteroidetes_AGTAATGGGTC-GT-AA-TCTGGATTCA", "Bacteroidetes_AGTAATGGGTC-TA-AG-ATTGGATTCA", "Bacteroidetes_AGTAATTGGTC-GC-AA-TCTGGATTCG", "Bacteroidetes_AGTAATTGGTC-GG-AA-TCTGGATTCG", "Bacteroidetes_AGTAATTGGTC-GT-AA-TCTGGATTCG"),
c("Gammaproteobacteria_AAAGAGTTTAACCTC", "Gammaproteobacteria_ACAGAATTTAATCTC", "Gammaproteobacteria_ATAAAGTTTGACCTC", "Gammaproteobacteria_ATAGAATATAATCTC", "Gammaproteobacteria_ATAGAATTTAATCTC", "Gammaproteobacteria_ATAGAATTTAGTCTC", "Gammaproteobacteria_ATAGAATTTGATCTC", "Gammaproteobacteria_ATAGAATTTGTTCTC", "Gammaproteobacteria_ATAGAGTATAATCTC", "Gammaproteobacteria_ATAGAGTTTAACCTC", "Gammaproteobacteria_ATAGAGTTTAATCTC", "Gammaproteobacteria_ATAGAGTTTGACCTC", "Gammaproteobacteria_ATAGAGTTTGATCTC", "Gammaproteobacteria_ATAGGGTTTAACCTC", "Gammaproteobacteria_ATAGGGTTTAACTTC", "Gammaproteobacteria_ATAGGGTTTGACTTC", "Gammaproteobacteria_ATCGAGTTAAACTTC", "Gammaproteobacteria_ATCGGG-TTAACTTC", "Gammaproteobacteria_ATCGGGTTAAACTTC", "Gammaproteobacteria_ATCGGGTTAAATTTC", "Gammaproteobacteria_ATCGGGTTAATCTTC", "Gammaproteobacteria_ATCGGGTTAGACTTC", "Gammaproteobacteria_ATCGGGTTTAATCTC"),
c("Fusobacteria_AATGAAATAATA", "Fusobacteria_CATGAAACATTA", "Fusobacteria_CCTGAAATATTA", "Fusobacteria_GTTGAAATATTA", "Fusobacteria_TATGAAACATTA", "Fusobacteria_TATGAAATAATA", "Fusobacteria_TATGGAACAATA", "Fusobacteria_TATGGAATAATA", "Fusobacteria_TGTAAAACATTA", "Fusobacteria_TGTGAAACAATA", "Fusobacteria_TGTGAAACATTA", "Fusobacteria_TGTGAAATAATA", "Fusobacteria_TGTGGAACAATA"),
c("Bacteroidetes_AATATCAGAAGATA-TACAGTAGTTATG", "Bacteroidetes_AATATCAGACGATA-TACAGTAGTTATG", "Bacteroidetes_AGGATCGGAAAATA-TACAGTAGATTTG", "Bacteroidetes_AGGATCGGAAAATA-TACTGTAGATTTG", "Bacteroidetes_AGGATCGGAAAATA-TAGTGTAGATTTG", "Bacteroidetes_AGTATCAGAAGATA-TACAGTAGTTATG", "Bacteroidetes_AGTATCAGAAGATA-TACGGTAGTTATG", "Bacteroidetes_AGTATCAGATGATA-TACAGTAGTTATG", "Bacteroidetes_GATATCAGAAGATA-TACAGTAGTTATG", "Bacteroidetes_GGTATCAGAAGATA-TACAGTAGTTATG", "Bacteroidetes_GGTATG-GAAGATA-TACCGTAATTATG", "Bacteroidetes_GGTATG-GAAGATA-TACCGTAGTTATG"),
c("Firmicutes_AGAACATTTGCAGCATCCTCAATTTTAA", "Firmicutes_AGAACATTTGCCGCATCCTCAATTTTAA", "Firmicutes_AGAACATTTGCGGCATCCTCAATTTTAA", "Firmicutes_GAATCATTGA-G-TCTCCGCAATTTTAA", "Firmicutes_GGAACATCAG-GCTTTCCTCAATTTTAA", "Firmicutes_GGACCATCAGGTCCTTCCCCTTTTTTAA", "Firmicutes_GGACCATCGGATCCTTCCCCATTTTTAA", "Firmicutes_GGACCATCGGGACCTTCCCCATTTTTAA", "Firmicutes_GGACCATCGGGACCTTCCCCTTTTTTAA", "Firmicutes_GGACCATCGGGTCCTTCCCCATTTTTAA", "Firmicutes_GGACCATCGGGTCCTTCCCCTTTTTTAA", "Firmicutes_GGACCATCGGGTCTTTCCCCATTTTTAA", "Firmicutes_GGACCATCTGGTCCATCCCCTTTTTTAA", "Firmicutes_GGATCATCGAAACTTTCCTCAATTTTAA", "Firmicutes_GGATCATCGAGACTTTCCTCAATTTTAA", "Firmicutes_GGATCATCGGATTTTTCCCCTTTTTTAA"),
c("Actinobacteria_ACGAATGCCT-CG-TTCTG", "Actinobacteria_ACGAATGCGCCTCGCTCTG", "Actinobacteria_ACGAATGCGCCTCGTTCTG", "Actinobacteria_ACGAATGCTC-CG-GTTTG", "Actinobacteria_ACGAATGTCCACT-TTTTG", "Actinobacteria_ACGAATGTTC-CG-GTTTG", "Actinobacteria_ACGAGTGTGCCTCGTTCTG", "Actinobacteria_GCGAATGCGCCTCGCTCTG", "Actinobacteria_GCGAATGCGCCTCGTTCTG", "Actinobacteria_GCGAGTGTTCTCG-CCCTG", "Actinobacteria_GCGAGTGTTCTCG-CTCTG", "Actinobacteria_GTGAACGTCT-CA-ATTTT", "Actinobacteria_GTGAGTGTGT-CG-CTCTG"),
c("Betaproteobacteria_GAACCATA-CTCTAACT", "Betaproteobacteria_GAACCTTA-CTCTAACT", "Betaproteobacteria_GAACCTTA-TGCTATCT", "Betaproteobacteria_GAACTTAA-TGCTAATT", "Betaproteobacteria_GAACTTTA-TTCTATCT", "Betaproteobacteria_GAACTTTA-TTCTATTT", "Betaproteobacteria_GAACTTTA-TTTCATTT", "Betaproteobacteria_GAATTATA-TTCTATTT", "Betaproteobacteria_GAATTTAA-TGATAATT", "Betaproteobacteria_GAATTTTA-CTCTATCT", "Betaproteobacteria_GAATTTTA-TGCTAATT", "Betaproteobacteria_GAATTTTA-TTCTATCT", "Betaproteobacteria_GAGCCTTA-TTCTGACT", "Betaproteobacteria_GAGCTTAA-TGCTAATT", "Betaproteobacteria_GGGCCTTA-TTCTGACT"),
c("Firmicutes_AGAACGTGAGATCCTACCGTTATCGCGC", "Firmicutes_AGACCGTAAGATCCTCCCGTTATCGTAC", "Firmicutes_AGACCGTAAGATCCTTCCGTTATCGTAC", "Firmicutes_AGACCGTAAGATCCTTCCGTTATCGTAT", "Firmicutes_AGATCGTAGGATCCTCCCGTTATTGTAC", "Firmicutes_GGAACGTGAGATCCTACCGTTATCGCGC", "Firmicutes_GGAACGTGAGATCCTACCGTTATCGTAC", "Firmicutes_GGACCGTAAGATCCTCCCGTTATTGTAC", "Firmicutes_GGATCGTAAGATCCTCCCGTTATTGTAC"),
c("Actinobacteria_ATGAGAACAC-CA-CTTCG", "Actinobacteria_ATGAGAATCCGT--TTTCG", "Actinobacteria_ATGAGAATCCGT-TTTTCG", "Actinobacteria_ATGAGAATCT-TT-TTTCG", "Actinobacteria_ATGAGAATCTGT--TTTCG", "Actinobacteria_ATGAGTATGT-TT-AGTCG"),
c("Bacteroidetes_AATATCAGGTT-AT-AT-AATCGATAGG", "Bacteroidetes_AATATCAGGTT-GT-AA-AATCGATAGG", "Bacteroidetes_AATATCAGGTT-GT-AT-AATCGATAGG", "Bacteroidetes_AATATCAGGTT-TT-AG-AATCGATAGG", "Bacteroidetes_AATATCAGGTT-TT-AT-AATCGATAGG", "Bacteroidetes_AATATTTGGTT-TT-AG-TATCGATAGG"),
c("Bacteroidetes_GATGACAGACG-AA-TA-TGTCATTTCT", "Firmicutes_AAAACATAAGATCCTCCCGTTATCATAT", "Firmicutes_AAAACGTAAGATC-TTCCGTTACCACGT", "Firmicutes_AAAACGTAAGATCCTTCCGTTACCACGT", "Firmicutes_AAAACGTAAGATCCTTCCGTTACCGCGT", "Firmicutes_AGAACGTAAGATCCTTCCGTTACCGTGC", "Firmicutes_AGAACGTGAGATCCTACCGTTATCGCGC", "Firmicutes_AGACCGTAAGATCCTCCCGTTATCGTAC", "Firmicutes_AGACCGTAAGATCCTTCCGTTATCGTAC", "Firmicutes_AGACCGTAAGATCCTTCCGTTATCGTAT", "Firmicutes_AGATCGTAGGATCCTCCCGTTATTGTAC", "Firmicutes_GAATCGTAAGATCCTTCCTTTGCCGCGT", "Firmicutes_GGAACGTAAGATCCTTCCGTTACCACGT", "Firmicutes_GGAACGTAAGATCCTTCCGTTACCGCGT", "Firmicutes_GGAACGTGAGATCCTACCGTTATCGCGC", "Firmicutes_GGAACGTGAGATCCTACCGTTATCGTAC", "Firmicutes_GGACCATAAGATCCTCCCGTTACCGCGC", "Firmicutes_GGACCGTAAGATCCTCCCGTTATTGTAC", "Firmicutes_GGATCGTAAGATCCTCCCGTTATTGTAC"),
c("Firmicutes_AGAGCATCTCCGGGATCCACAATTTTAA", "Firmicutes_AGATCATCGGCACTCTCCTCAATTTTAA", "Firmicutes_AGATCATCGGCACTCTCCTCTATTTTAA", "Firmicutes_AGATCATCGGCACTTTCCTCAATTTTAA", "Firmicutes_AGATCATCGGCATTCTCCTCTATTTTAA"),
c("Epsilonproteobacteria_AAT", "Epsilonproteobacteria_ATT", "Epsilonproteobacteria_GAT", "Epsilonproteobacteria_GGC", "Epsilonproteobacteria_GTC", "Epsilonproteobacteria_TCA"),
c("Gammaproteobacteria_ATAGGGTTTGACTTC", "Gammaproteobacteria_ATCGAGTTTAACTTC", "Gammaproteobacteria_ATCGGG-TTAACTTC", "Gammaproteobacteria_ATCGGGCTTGACTAC", "Gammaproteobacteria_ATCGGGTTAGACTTC", "Gammaproteobacteria_ATCGGGTTTAACTAC", "Gammaproteobacteria_ATCGGGTTTGACTTC", "Gammaproteobacteria_ATCGTGTTTGACTTC", "Gammaproteobacteria_ATGGTGTTTGACTTC"),
c("Actinobacteria_ACGAAAACACACA-TTCTG", "Actinobacteria_ATAAGAACACACA-TTCTG", "Actinobacteria_ATGAGAACACACA-TTCTG"),
c("Actinobacteria_ATGAGTGCACACA-TCCTA", "Actinobacteria_ATGAGTGTACACA-TCCTA", "Actinobacteria_ATGAGTGTCCACA-GTCTA"),
c("Firmicutes_AGAACATTTGCAGCATCCTCAATTTTAA", "Firmicutes_AGAACATTTGCCGCATCCTCAATTTTAA", "Firmicutes_AGAACATTTGCGGCATCCTCAATTTTAA", "Firmicutes_GGATCATCGGATTTTTCCCCTTTTTTAA"),
c("Firmicutes_GGAACGTAAGATCCTTCCGTTACCACGT", "Firmicutes_GGAACGTAAGATCCTTCCGTTACCGCGT", "Firmicutes_GGACCATAAGATCCTCCCGTTACCGCGC"),
c("Betaproteobacteria_GAACTTTA-TTCTATCT", "Betaproteobacteria_GAATTATA-TTCTATTT", "Betaproteobacteria_GAATTTAA-TGATAATT", "Betaproteobacteria_GAATTTTA-TGCTAATT", "Betaproteobacteria_GAATTTTA-TTCTATCT"),
c("Actinobacteria_GTGAA-TCAA-A--CTCTG", "Actinobacteria_GTGAA-TCAG-A--CTCTG", "Actinobacteria_GTGAA-TTA--AG-CTCTG"),
c("Gammaproteobacteria_ACAGAATTTAATCTC", "Gammaproteobacteria_ATAGGGTTTAACTTC", "Gammaproteobacteria_ATAGGGTTTGACTTC", "Gammaproteobacteria_ATCGAGTTTAACTTC", "Gammaproteobacteria_ATCGGG-TTAACTTC", "Gammaproteobacteria_ATCGGGTTAAACTTC", "Gammaproteobacteria_ATCGGGTTAAATTTC", "Gammaproteobacteria_ATCGGGTTTAACTAC", "Gammaproteobacteria_ATCGGGTTTGACTTC"),
c("Bacteroidetes_AGTACTAGAAG-TA-TA-TGTAGTTATG", "Bacteroidetes_AGTATTAGAAG-TA-TA-TGTAGTTATG"),
c("Firmicutes_GACTCATCTTTTAAATCCGTAACTGTAC", "Firmicutes_GACTCATTTTTTAAATCCCTAACTGTAC"),
c("Bacteroidetes_AGTAACAGAAC-TA-TA-TGTTGATATA", "Bacteroidetes_AGTATCAGAACATA-TACTGTTGATATA"),
c("Gammaproteobacteria_TAAGGGCATGACGCA", "Gammaproteobacteria_TCAGGTTTTAAAGCA"),
c("Firmicutes_AAATCATTCAGAATGTCCGCTATCACGT", "Firmicutes_AGAACATTGAAATTTTCCACAATTTTAA", "Firmicutes_AGAACATTGAATCTTTCCCCATTTTTAA", "Firmicutes_AGAACATTGAATCTTTCCTCATTTTTAA", "Firmicutes_AGAACATTTGCAGCATCCTCAATTTTAA", "Firmicutes_AGAACATTTGCCGCATCCTCAATTTTAA", "Firmicutes_AGAACATTTGCGGCATCCTCAATTTTAA", "Firmicutes_AGAGCATCTCCGGGATCCACAATTTTAA", "Firmicutes_AGATCATCGGCACTCTCCTCAATTTTAA", "Firmicutes_AGATCATCGGCACTCTCCTCTATTTTAA", "Firmicutes_AGATCATCGGCACTTTCCTCAATTTTAA", "Firmicutes_AGATCATCGGCATTCTCCTCTATTTTAA", "Firmicutes_GAATCATCGA-G-TTTCCGCAATTTTAA", "Firmicutes_GAATCATTGA-G-TCTCCGCAATTTTAA", "Firmicutes_GGAACATCAG-GCTTTCCTCAATTTTAA", "Firmicutes_GGACCATCAGGTCCTTCCCCTTTTTTAA", "Firmicutes_GGACCATCGGATCCTTCCCCATTTTTAA", "Firmicutes_GGACCATCGGGACCTTCCCCATTTTTAA", "Firmicutes_GGACCATCGGGACCTTCCCCTTTTTTAA", "Firmicutes_GGACCATCGGGTCCTTCCCCATTTTTAA", "Firmicutes_GGACCATCGGGTCCTTCCCCTTTTTTAA", "Firmicutes_GGACCATCGGGTCTTTCCCCATTTTTAA", "Firmicutes_GGACCATCTGGTCCATCCCCTTTTTTAA", "Firmicutes_GGATCATCGAAACTTTCCTCAATTTTAA", "Firmicutes_GGATCATCGAGACTTTCCTCAATTTTAA", "Firmicutes_GGATCATCGGATTTTTCCCCTTTTTTAA"),
c("Firmicutes_GGCTCATTTC-AGGATCTCCTTCTACGT", "Firmicutes_GGCTCATTTC-AGGATCTTCTTCTACGT"),
c("Firmicutes_GAATCATCGA-G-TTTCCGCAATTTTAA", "Firmicutes_GAATCATCGG-GGCTTCCCTTTCTCATG", "Firmicutes_GAATCATTGA-G-TCTCCGCAATTTTAA", "Firmicutes_GGACCATCGGATCCTTCCCCATTTTTAA", "Firmicutes_GGACTATCGG-GGCTTCCTTTTCTCATG"),
c("Betaproteobacteria_AAACATAT--TCTGCAT", "Betaproteobacteria_GAACATAT--TCTGCAT"),
c("Betaproteobacteria_GTGCATC--TTTCAATT", "Betaproteobacteria_GTGCATC-TTTTCAATT"),
c("Firmicutes_AGAACATTGAATCTTTCCCCATTTTTAA", "Firmicutes_AGAACATTGAATCTTTCCTCATTTTTAA"),
c("Actinobacteria_ATGAGTGTTCTCA-ATTTT", "Actinobacteria_GTGAACGTCT-CA-ATTTT"))
for(i in seq(1, length(genera))){
print(genera[i])
genera_oligos_vector <- unlist(genera_oligos[i])
sub_df <- df_v3v5[df_v3v5$oligo %in% genera_oligos_vector, ]
x = c()
counter <- 1
for(site in oral_sites){
x[counter] <- sum(sub_df[sub_df$site == site, ]$abundance)
counter = counter + 1
}
pdf(paste("~/ORAL_", sprintf("%04d", i), "_", genera[i], ".pdf", sep = ""), width=12, height=6)
STANDARD_FIG(df=df_v3v5,
oligos= genera_oligos_vector,
labels = genera_oligos_vector,
sites = oral_sites,
title = genera[i],
colors = rainbow(length(genera_oligos_vector)),
cluster_site = oral_sites[x == max(x)],
metric = 'horn',
method = 'ward')
OLIGO_DIST(df_v3v5_seqs, genera_oligos_vector, genera_oligos_vector)
dev.off()
}
########################################################################################################################
########################################################################################################################
v1_v3_num_nucleotides_chosen_per_oligo <- c(21, 22, 17, 14, 33, 18, 13, 17, 15)
v3_v5_num_nucleotides_chosen_per_oligo <- c(19, 28, 17, 3, 28, 12, 15, 15)
mean(v1_v3_num_nucleotides_chosen_per_oligo)
mean(v3_v5_num_nucleotides_chosen_per_oligo)
########################################################################################################################
########################################################################################################################
all_oligos <- levels(df_v3v5$oligo)
pdf('~/ORAL_ALL.pdf', 32, height=16)
for(s in oral_sites){
PLOT_OLIGO_EVERYWHERE(df_v3v5,
all_oligos,
cluster_site = s,
sites = oral_sites,
method = 'ward',
title = s,
labels = all_oligos,
colors = rainbow(length(all_oligos)),
metric = 'euclidean')
}
dev.off()
########################################################################################################################
########################################################################################################################
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment