Skip to content

Instantly share code, notes, and snippets.

@dylanbstorey
Last active July 5, 2016 16:08
Show Gist options
  • Save dylanbstorey/99b9f109031f34c5a9233bf0f5d435d7 to your computer and use it in GitHub Desktop.
Save dylanbstorey/99b9f109031f34c5a9233bf0f5d435d7 to your computer and use it in GitHub Desktop.
General Analyses of a Micro Biome
library(ggplot2)
library(reshape2)
library(vegan)
library(gplots)
library(ggrepel)
library(fossil)
global_S <- NULL
get_species<-function(matrix){
IndicesToGrab<-grep("s__",matrix$ID);
matrix$ID<-as.character(matrix$ID);
Species<-matrix[IndicesToGrab,];
Species$ID<-gsub("^.*s__|","",Species$ID)
row.names(Species)=Species$ID
Species<-Species[,-1];
Species
}
get_genus<-function(matrix){
IndicesToGrab<-grep("g__",matrix$ID);
Genera<-matrix[IndicesToGrab,];
Genera$ID<-gsub("^.*g__|","",Genera$ID);
Genus_and_species<-grep("s__",Genera$ID);
Genera<-Genera[-Genus_and_species,];
row.names(Genera)<-Genera$ID;
Genera<-Genera[,-1];
Genera
}
get_family<-function(matrix){
IndicesToGrab<-grep("f__",matrix$ID);
Families<-matrix[IndicesToGrab,];
Families$ID<-gsub("^.*f__|","", Families$ID);
Family_genus_species_list<-grep("g__", Families$ID);
Families<-Families[-Family_genus_species_list,];
Family_species_list <- grep("s__",Families$ID);
Families<-Families[-Family_species_list,];
row.names(Families)<-Families$ID;
Families<-Families[,-1];
Families
}
barplot_proportional_data <-function(matrix,level,outname){
matrix.norm<-data.matrix(matrix);
matrix.norm<-sweep(matrix.norm,2,colSums(matrix.norm),'/');
matrix.norm<-sweep(matrix.norm,2,colSums(matrix.norm),'/');
colnames(matrix.norm) <- colnames(matrix)
rownames(matrix.norm) <-rownames(matrix)
matrix.norm.melt<-melt(matrix.norm);
ggplot(matrix.norm.melt,aes(x=Var2,y=value,fill=Var1))+geom_bar(stat="identity",position="stack")+theme(legend.position="none",panel.grid.major=element_blank(),panel.grid.minor=element_blank(),panel.border=element_blank(),panel.background=element_blank())+theme(axis.text.x = element_text(angle = 45, hjust = 1))+xlab("Sample Name")+ylab("Proportion of Observations")+ggtitle(paste("Proprotional observations at ",level," level"))
ggsave(filename=paste0(outname,".pdf"),plot=last_plot(),width=16,height=12,units="in",dpi=500);
}
rarefy_here<-function(matrix,name){
if (! file.exists(name)){
minObservations<-min(colSums(matrix))
rarefied<-data.frame(rrarefy(t(matrix),minObservations))
write.table(rarefied,file = name)
}
else{
rarefied <- read.table(name,header=TRUE)
}
rarefied
}
log_heatform<-function(matrix,level,outname){
logxform <-log2(matrix)
logxform[logxform==-Inf]<-0
logxform<-logxform[,apply(logxform, 2, var, na.rm=TRUE) != 0]
pdf(paste0(outname,".pdf"), width=16 , height=32)
heatmap.2(as.matrix(t(logxform)), col=my_palette,keysize = .75, margins=c(15,10), trace="none", xlab="Sample ID",ylab="Taxon",main=paste0(c("Clustering of samples at ", level," level" )),cexRow=0.3,sepwidth=c(0.05,0.05))
dev.off()
}
log_heatform_big<-function(matrix,level,outname){
logxform <-log2(matrix)
logxform[logxform==-Inf]<-0
logxform<-logxform[,apply(logxform, 2, var, na.rm=TRUE) != 0]
pdf(paste0(outname,".pdf"), width=16 , height=32)
heatmap.2(as.matrix(t(logxform)), col=my_palette,keysize = .75, margins=c(15,10), trace="none", xlab="Sample ID",ylab="Taxon",main=paste0(c("Clustering of samples at ", level," level" )),cexRow=1,sepwidth=c(0.05,0.05))
dev.off()
}
alpha_diversity_metrics<-function(matrix,level){
global_S <- NULL
# TAKES RAREFIED DATA
H <- data.frame(diversity(matrix))
simpson <- data.frame(diversity(matrix, "simpson"))
shannon<-data.frame(diversity(matrix , "shannon"))
invsimp <- data.frame(diversity(matrix, "inv"))
alpha <- data.frame(fisher.alpha(matrix))
## Species richness (S) and Pielou's evenness (J):
S <- data.frame(specnumber(matrix))
J <- data.frame(H/log(S))
Diversity<-cbind(simpson, shannon, invsimp, alpha, S, J)
Diversity$Sample<-row.names(Diversity)
names(Diversity)<-c("Simpson", "Shannon", "InvSimpson", "Alpha", 'ClassNo', "Evenness", "Sample")
Diversity<-Diversity[,c(7,1,2,3,4, 5, 6)]
pdf(file=paste0(level,'_estimate_pairs.pdf'))
pairs(cbind(simpson, shannon, alpha, J, S), col="blue")
dev.off()
# Plots of alpha-diversity metrics
ggplot(data=Diversity, aes(x=Sample, y=Simpson)) + geom_bar(colour="black", fill="darkblue", stat="identity") +theme(axis.text.x = element_text(angle = 45, hjust = 1))
ggsave(paste0(level,"_Diversity_Simpson.pdf"),width=5,height=5,units="in",dpi=500)
ggplot(data=Diversity, aes(x=Sample, y=Alpha)) + geom_bar(colour="black", fill="darkblue", stat="identity")+theme(axis.text.x = element_text(angle = 45, hjust = 1))
ggsave(paste0(level,"_Diversity_Alpha.pdf"),width=5,height=5,units="in",dpi=500)
ggplot(data=Diversity, aes(x=Sample, y=Evenness)) + geom_bar(colour="black", fill="darkblue", stat="identity") +theme(axis.text.x = element_text(angle = 45, hjust = 1))
ggsave(paste0(level,"_Diversity_Eveness.pdf"),width=5,height=5,units="in",dpi=500)
ggplot(data=Diversity, aes(x=Sample, y=ClassNo)) + geom_bar(colour="black", fill="darkblue", stat="identity") +theme(axis.text.x = element_text(angle = 45, hjust = 1))
ggsave(paste0(level,"_Diversity_classNo.pdf"),width=5,height=5,units="in",dpi=500)
ggplot(data=Diversity, aes(x=Sample, y=Shannon)) + geom_bar(colour="black", fill="darkblue", stat="identity") +theme(axis.text.x = element_text(angle = 45, hjust = 1))
ggsave(paste0(level,"_Diversity_Shannon.pdf"),width=5,height=5,units="in",dpi=500)
global_S <<- S
}
species_per_sample<-function(matrix,level){
## Rarefaction curves
raremax <- min(rowSums(t(matrix)))
Srare <- rarefy(t(matrix), raremax)
x<-data.frame(global_S,Srare)
pdf(paste0(level,"_rarefaction.pdf"))
plot(x, xlab = paste0("Observed No. of ",level), ylab = paste0("Rarefied No. of ",level))
dev.off();
pdf(paste0("rarefaction_",level,'.pdf'))
rarecurve(t(matrix), step = 1000, col = "blue", cex = 0.5, sample= raremax, xlab = "Sample Size", ylab = level, label = TRUE, lwd = 2)
dev.off()
}
my_palette <- colorRampPalette(c("blue", "white", "red"))(n = 1000)
draw_circle<- function(center = c(0,0),diameter = 1, npoints = 1000){
r = 25
tt <- seq(0,2*pi,length.out = npoints)
xx <- center[1] + r * cos(tt)
yy <- center[2] + r * sin(tt)
return(data.frame(x = xx, y = yy))
}
circle<-draw_circle()
bi_plot <-function(matrix, outfile ){
logxform <-log2(matrix)
logxform[logxform==-Inf]<-0
logxform<-logxform[,apply(logxform, 2, var, na.rm=TRUE) != 0]
#PCA and Scree
pca<-prcomp(logxform , center = TRUE , scale.=TRUE)
std_dev<-pca$sdev
loadings<-pca$rotation
norm_loadings<-sweep(loadings,2,colSums(loadings),"/")
scores<-pca$x
variance<- std_dev^2
variance<-variance/sum(variance) * 100
variance<-as.data.frame(variance)
variance <-cbind(PC = as.numeric(row.names(variance)),variance)
ggplot(variance, aes(y=variance,x=PC))+geom_bar(stat="identity")+ylab("Proportion of Variance")+xlab("Principle Component")
ggsave(filename=paste0(outfile,"_scree.pdf"),plot=last_plot(),width=16,height=12,units="in",dpi=500)
vecs<-data.frame(loadings[,c('PC1','PC2')])
vecs = vecs * 25 * 10
tmp_vecs1a<-vecs[order(vecs[,1]),][1:10,]
tmp_vecs1b<-vecs[rev(order(vecs[,1])),][1:10,]
tmp_vecs2a<-vecs[order(vecs[,2]),][1:10,]
tmp_vecs2b<-vecs[rev(order(vecs[,2])),][1:10,]
vecs <-rbind(tmp_vecs1a,tmp_vecs1b,tmp_vecs2a,tmp_vecs2b)
vecs <-unique(vecs)
pca.df <-as.data.frame(pca$x)
pca.df <- cbind( ID = row.names(pca.df),pca.df)
ggplot(pca.df, aes(x=PC1 , y = PC2)) +
geom_point(
size=1,
)+
geom_text_repel(
size=2.5,
aes(label=ID),
segment.color = 'gray',
point.padding = unit(1.0,"lines"),
box.padding = unit(0.1,"lines"),
force = 1,
)+
geom_point(
size=5,
shape = 3,
aes(x=0,y=0),
color="grey",
)+
geom_path(
data=circle,
aes(x , y),
color="red")+
coord_fixed()+
geom_point(
data=vecs,
aes(x=PC1,y=PC2),
color="red",
size=2
)+
geom_text_repel(
data=vecs,
aes(x=PC1,y=PC2,label=rownames(vecs)),
color = "red",
size=2.5,
segment.color = 'gray',
point.padding = unit(1.0,"lines"),
box.padding = unit(0.1,"lines"),
force = 1
)
ggsave(filename=paste0(outfile,"_pca.pdf"),plot=last_plot(),width=16,height=12,units="in",dpi=500);
}
#Kraken Mapping
map_rates = as.data.frame(read.table(file="stats.tbl",header=TRUE,sep="\t"))
map_rates.melt <- melt(map_rates,id.vars = 'sample')
ggplot(data=map_rates.melt, aes(x=sample,y=value,fill=variable))+geom_bar(stat="identity")+theme(axis.text.x=element_text(angle=90,hjust=1,vjust=0.5))+labs(x= "Sample",y="Proportion Classified",title="Proportion of Classified and Unclassified Reads \n Kraken")+guides(fill=guide_legend(title="Read Type"))
ggsave(filename = "KrakenClassificationProportion.pdf", plot=last_plot(),width=16,height=12,units="in",dpi=500);
#data prep
kraken_mpa <- read.csv("kraken-mpa.merge",sep="\t",header=TRUE)
kraken_mpa$ID<-as.character(kraken_mpa$ID)
#sub sample Bacteria, Viruses , and Archaea
indices<-grep("Bacteria",kraken_mpa$ID)
bacteria_mpa <-kraken_mpa[indices,]
indices<-grep("Viruses",kraken_mpa$ID)
viruses_mpa <-kraken_mpa[indices,]
indices<-grep("Archaea",kraken_mpa$ID)
archaea_mpa <-kraken_mpa[indices,]
dirlist <- c('AllTaxa', 'Bacteria','Archaea','Viruses','Views')
for( i in dirlist){
if (! dir.exists(i)){
dir.create(i)
}
}
setwd('AllTaxa')
kraken_mpa.rarefy.family <-rarefy_here(get_family(kraken_mpa) ,'kraken_mpa_rarefy_family.txt')
kraken_mpa.rarefy.genus <-rarefy_here(get_genus(kraken_mpa) ,'kraken_mpa_rarefy_genus.txt')
kraken_mpa.rarefy.species <-rarefy_here(get_species(kraken_mpa),'kraken_mpa_rarefy_species.txt')
barplot_proportional_data(get_family(kraken_mpa),'family', 'All_taxa_family_proportion')
log_heatform(kraken_mpa.rarefy.family,'family','All_taxa_family_heat')
bi_plot(kraken_mpa.rarefy.family,"All_taxa_family_biplot")
alpha_diversity_metrics(kraken_mpa.rarefy.family,'Families')
species_per_sample(get_family(kraken_mpa),'Families')
barplot_proportional_data(get_genus(kraken_mpa),'genus','All_taxa_genus_proportion')
log_heatform(kraken_mpa.rarefy.genus,'genus','All_taxa_genus_heat')
bi_plot(kraken_mpa.rarefy.genus,'All_taxa_genus_biplot')
alpha_diversity_metrics(kraken_mpa.rarefy.genus,'Genera')
species_per_sample(get_genus(kraken_mpa),'Genera')
barplot_proportional_data(get_species(kraken_mpa),'species','All_taxa_species_proportion')
log_heatform(kraken_mpa.rarefy.species,'species','All_taxa_species_heat')
bi_plot(kraken_mpa.rarefy.species,'All_taxa_species_biplot')
alpha_diversity_metrics(kraken_mpa.rarefy.species,'Species')
species_per_sample(get_species(kraken_mpa),'Species')
setwd('..')
setwd('Bacteria')
bacteria_mpa.rarefy.family <-rarefy_here(get_family(bacteria_mpa) ,'bacteria_mpa_rarefy_family.txt')
bacteria_mpa.rarefy.genus <-rarefy_here(get_genus(bacteria_mpa) ,'bacteria_mpa_rarefy_genus.txt')
bacteria_mpa.rarefy.species <-rarefy_here(get_species(bacteria_mpa),'bacteria_mpa_rarefy_species.txt')
barplot_proportional_data(get_family(bacteria_mpa),'family','Bacteria_familiy_proportion')
log_heatform(bacteria_mpa.rarefy.family,'family','Bacteria_family_heat')
bi_plot(bacteria_mpa.rarefy.family,'Bacteria_family_biplot')
barplot_proportional_data(get_genus(bacteria_mpa),'genus','Bacteria_genus_proportion')
log_heatform(bacteria_mpa.rarefy.genus,'genus','Bacteria_genus_heat')
bi_plot(bacteria_mpa.rarefy.genus,'Bacteria_genus_biplot')
barplot_proportional_data(get_species(bacteria_mpa),'species','Bacteria_species_proportion')
log_heatform(bacteria_mpa.rarefy.species,'species','Bacteria_species_heat')
bi_plot(bacteria_mpa.rarefy.species,'Bacteria_species_biplot')
alpha_diversity_metrics(bacteria_mpa.rarefy.family,'Families')
species_per_sample(get_family(bacteria_mpa),'Families')
alpha_diversity_metrics(bacteria_mpa.rarefy.genus,'Genera')
species_per_sample(get_genus(bacteria_mpa),'Genera')
alpha_diversity_metrics(bacteria_mpa.rarefy.species,'Species')
species_per_sample(get_species(bacteria_mpa),'Species')
setwd('..')
setwd('Viruses')
viruses_mpa.rarefy.family <-rarefy_here(get_family(viruses_mpa) ,'viruses_mpa_rarefy_family.txt')
viruses_mpa.rarefy.genus <-rarefy_here(get_genus(viruses_mpa) ,'viruses_mpa_rarefy_genus.txt')
viruses_mpa.rarefy.species <-rarefy_here(get_species(viruses_mpa),'viruses_mpa_rarefy_species.txt')
barplot_proportional_data(get_family(viruses_mpa),'family','Virus_family_proportion')
log_heatform(viruses_mpa.rarefy.family,'family','Virus_family_heat')
bi_plot(viruses_mpa.rarefy.family,'Viruses_family_biplot')
barplot_proportional_data(get_genus(viruses_mpa),'genus','Virus_genus_proportion')
log_heatform(viruses_mpa.rarefy.genus,'genus','Virus_genus_heat')
bi_plot(viruses_mpa.rarefy.genus,'Viruses_genus_biplot')
barplot_proportional_data(get_species(viruses_mpa),'species','Virus_species_proportion')
log_heatform(viruses_mpa.rarefy.species,'species','Virus_species_heat')
bi_plot(viruses_mpa.rarefy.species,'Virus_species_biplot')
alpha_diversity_metrics(viruses_mpa.rarefy.family,'Families')
species_per_sample(get_family(viruses_mpa),'Families')
alpha_diversity_metrics(viruses_mpa.rarefy.genus,'Genera')
species_per_sample(get_genus(viruses_mpa),'Genera')
alpha_diversity_metrics(viruses_mpa.rarefy.species,'Species')
species_per_sample(get_species(viruses_mpa),'Species')
setwd('..')
setwd('Archaea')
#archaea_mpa.rarefy.family <-rarefy_here(get_family(archaea_mpa) ,'archaea_mpa_rarefy_family.txt')
archaea_mpa.rarefy.genus <-rarefy_here(get_genus(archaea_mpa) ,'archaea_mpa_rarefy_genus.txt')
archaea_mpa.rarefy.species <-rarefy_here(get_species(archaea_mpa),'archaea_mpa_rarefy_species.txt')
#barplot_proportional_data(get_family(archaea_mpa),'family','Archaea_family_proportion')
#log_heatform(archaea_mpa.rarefy.family,'family','Archaea_family_heat')
barplot_proportional_data(get_genus(archaea_mpa),'genus','Archaea_genus_proportion')
log_heatform(archaea_mpa.rarefy.genus,'family','Archaea_genus_heat')
bi_plot(archaea_mpa.rarefy.genus,'Archaea_genus_biplot')
barplot_proportional_data(get_species(archaea_mpa),'species','Archaea_species_proportion')
log_heatform(archaea_mpa.rarefy.species,'species','Archaea_speceis_heat')
bi_plot(archaea_mpa.rarefy.species,'Archaea_species_biplot')
#alpha_diversity_metrics(archaea_mpa.rarefy.family,'Families')
#species_per_sample(get_family(archaea_mpa),'Families')
alpha_diversity_metrics(archaea_mpa.rarefy.genus,'Genera')
species_per_sample(get_genus(archaea_mpa),'Genera')
alpha_diversity_metrics(archaea_mpa.rarefy.species,'Species')
species_per_sample(get_species(archaea_mpa),'Species')
setwd('..')
setwd('Views')
bacteria_genera <- c('g__Salmonella' , 'g__Campylobacter', 'g__Staphylococcus','g__Clostridium','g__Chlamydia','g__Streptococcus','g__Escherichia','g__Enterococcus','g__Enterobacter', 'g__Helicobacter', 'g__Bacillus','g__Listeria','g__Alteromonas','g__Pseudomonas','g__Lactobacillus','g__Lactococcus','g__Altermonas')
views_mpa <- list()
for (i in bacteria_genera){
indices <- grep(i,bacteria_mpa$ID)
views_mpa <- rbind (views_mpa,bacteria_mpa[indices,] )
}
views_mpa.rarefy.genus <- rarefy_here(get_genus(views_mpa),'views_mpa_rarefy_genus.txt')
views_mpa.rarefy.species <-rarefy_here(get_species(views_mpa),'views_mpa_rarefy_species.txt')
log_heatform_big(views_mpa.rarefy.genus,'genus','views_genus_heat')
log_heatform_big(views_mpa.rarefy.species,'species','views_species_heat')
setwd('..')
---
title: "Swine Micro-biome"
author: "Dylan B Storey"
output:
pdf_document:
fig_caption: yes
keep_tex: yes
html_document:
fig_caption: yes
keep_md: yes
---
```{r, library load , echo=FALSE,message=FALSE}
library(ggplot2)
library(reshape2)
library(vegan)
library(gplots)
library(ggrepel)
library(fossil)
library(knitr)
library(pander)
```
```{r , function declaration , echo=FALSE, message=FALSE}
global_S <- NULL
get_species<-function(matrix){
#' Given an matrix where taxon ID is mpa format - gets the species rows and cleans up the name accordingly
#'
#' @param matrix the incomoning mpa matrix
#' @return a subset of \code{matrix} that only contains species rows
#'
IndicesToGrab<-grep("s__",matrix$ID);
matrix$ID<-as.character(matrix$ID);
Species<-matrix[IndicesToGrab,];
Species$ID<-gsub("^.*s__|","",Species$ID)
row.names(Species)=Species$ID
Species<-Species[,-1];
Species
}
get_genus<-function(matrix){
#' Given an matrix where taxon ID is mpa format - gets the genus rows and cleans up the name accordingly
#'
#' @param matrix the incomoning mpa matrix
#' @return a subset of \code{matrix} that only contains genus rows
#'
IndicesToGrab<-grep("g__",matrix$ID);
Genera<-matrix[IndicesToGrab,];
Genera$ID<-gsub("^.*g__|","",Genera$ID);
Genus_and_species<-grep("s__",Genera$ID);
Genera<-Genera[-Genus_and_species,];
row.names(Genera)<-Genera$ID;
Genera<-Genera[,-1];
Genera
}
get_family<-function(matrix){
#' Given an matrix where taxon ID is mpa format - gets the family rows and cleans up the name accordingly
#'
#' @param matrix the incomoning mpa matrix
#' @return a subset of \code{matrix} that only contains family rows
#'
IndicesToGrab<-grep("f__",matrix$ID);
Families<-matrix[IndicesToGrab,];
Families$ID<-gsub("^.*f__|","", Families$ID);
Family_genus_species_list<-grep("g__", Families$ID);
Families<-Families[-Family_genus_species_list,];
Family_species_list <- grep("s__",Families$ID);
Families<-Families[-Family_species_list,];
row.names(Families)<-Families$ID;
Families<-Families[,-1];
Families
}
barplot_proportional_data<-function(matrix,level){
#' Given a matrix of data , creates a stacked bargraph of the data as proportions
#'
#' @param matrix , the incoming MPA matrix
#' @param level , the taxonomic level to put in the title
#'
#' @return the ggplot2 object
matrix.norm<-data.matrix(matrix)
matrix.norm<-sweep(matrix.norm,2,colSums(matrix.norm),'/');
colnames(matrix.norm) <- colnames(matrix)
rownames(matrix.norm) <-rownames(matrix)
matrix.norm.melt<-melt(matrix.norm);
pp1<-ggplot(matrix.norm.melt,aes(x=Var2,y=value,fill=Var1))+geom_bar(stat="identity",position="stack")+theme(legend.position="none",panel.grid.major=element_blank(),panel.grid.minor=element_blank(),panel.border=element_blank(),panel.background=element_blank())+theme(axis.text.x = element_text(angle = 45, hjust = 1,size=4))+xlab("Sample Name")+ylab("Proportion of Observations")+ggtitle(paste("Proprotional observations at ",level," level"))
return(pp1)
}
rarefy_here<-function(matrix,name=NULL){
#' Cached rarefication , because rarefying data is expensive a cached version is useful
#' This function caches both the incoming matrix AND the rarefied matrix
#'
#' @param matrix , the matrix count data to be rarefied
#' @param name (default=NULL i.e. optional) where to cache the data
#'
#' @return the rarefied matrix of \code{matrix}
#'
if (!is.null(name)){
name = paste0(name,'.rare.txt')
original_table_name = paste0(name,'.no_rare.txt')
if (! file.exists(name)){
#file doesn't exist , rarefy and cache
Observations<-colSums(matrix)
rarefied<-data.frame(rrarefy(t(matrix),min(Observations)))
write.table(t(matrix),file=original_table_name)
write.table(rarefied,file = name)
}
else{
#load from disk
rarefied <- read.table(name,header=TRUE)
}
}
else{
rarefied<-data.frame(rrarefy(t(matrix),min(Observations)))
}
return(rarefied)
}
log2_heatmap<-function(matrix,level, main = NULL, cexRow = 0.3 , trace = "none", keysize =1 , xlab = "Sample ID", ylab = NULL, sepwidth=c(0.05,0.05), ColumnColors = NULL,...){
#' creates a heatmap of the matrix after a log2 transformation
#'
#' @param matrix the incoming matrix
#' @param level the taxonomic level (for title)
#'
#' @return heatmap.2 object
#' @example my_plot <- log2_heatmap(data)
#' @example my_plot()
my_palette <- colorRampPalette(c("blue", "white", "red"))(n = 1000)
logxform <-log2(matrix)
logxform[logxform==-Inf]<-0
logxform<-logxform[,apply(logxform, 2, var, na.rm=TRUE) != 0]
hclustfunc <- function(x) hclust(x, method="average")
distfunc <- function(x) dist(x, method="canberra")
# perform clustering on rows and columns
cl.col <- hclustfunc(distfunc(matrix))
col_colors<-ColumnColors[cl.col$order]
plot_heatmap<- function() heatmap.2(as.matrix(t(logxform)),
col=my_palette,
main=main,
distfun = distfunc,
hclustfun = hclustfunc,
cexRow = cexRow,
trace = trace,
keysize = keysize,
xlab = xlab,
ylab = ylab,
sepwidth = sepwidth,
ColSideColors = col_colors ,
...
)
}
draw_circle<- function(center = c(0,0),r = 1, npoints = 100){
#' Draws a circle and places data into a data-frame for later use
#' @param center , where should I center on
#' @param diameter , how wide
#' @param npoints , how many points to draw
#'
#' @return dataframe of x, y co-ordinates
tt <- seq(0,2*pi,length.out = npoints)
xx <- center[1] + r * cos(tt)
yy <- center[2] + r * sin(tt)
return(data.frame(x = xx, y = yy , r = r))
}
```
```{r , echo=FALSE,message=FALSE}
multiplot <- function(..., plotlist=NULL, file, cols=1, layout=NULL) {
require(grid)
plots <- c(list(...), plotlist)
numPlots = length(plots)
if (is.null(layout)) {
layout <- matrix(seq(1, cols * ceiling(numPlots/cols)),
ncol = cols, nrow = ceiling(numPlots/cols))
}
if (numPlots==1) {
print(plots[[1]])
} else {
grid.newpage()
pushViewport(viewport(layout = grid.layout(nrow(layout), ncol(layout))))
for (i in 1:numPlots) {
matchidx <- as.data.frame(which(layout == i, arr.ind = TRUE))
print(plots[[i]], vp = viewport(layout.pos.row = matchidx$row,
layout.pos.col = matchidx$col))
}
}
}
```
```{r, echo=FALSE,message=FALSE}
better_bi_plot <- function(matrix , logtransform = TRUE , PC_selection = c(1,2) , disp_load = 10 , color_by = NULL, vector_scale = 25 , scree = TRUE){
#' Calculates PCA and build a bi-plot using ggrepel for a prettier output
#' @param matrix , the matrix of data
#' @param logtransform , should the data be transformed
#' @param PC_selection , a list of PCs to retain for plotting defaults to c(1,2)
#' @param disp_load , which loadings should be displayed (defaults to the strongest 10)
#'
#' @returns bi_plot data frame , all plots are in bi_plot$plot , screeplot in bi_plot$screeplot , pca in bi_plot$pca
#build circle
r= 1
tt <- seq(0,2*pi,length.out = 1000)
xx <- 0 + r * cos(tt)
yy <- 0 + r * sin(tt)
circle<- (data.frame(x = xx, y = yy , r = r))
#Transform if wished
if (logtransform == TRUE){
matrix <- log2(matrix)
matrix[matrix==-Inf]<-0
matrix<-matrix[,apply(matrix, 2, var, na.rm=TRUE) != 0]
}
#PCA perform pca
pca <- prcomp(matrix, center = TRUE, scale.=TRUE)
#Scree Plot
std_deviation <- pca$sdev
variance<-std_deviation^2
variance<-variance / sum(variance)
variance<-as.data.frame(variance)
variance <-cbind(PC = as.numeric(row.names(variance)),variance)
if (scree == TRUE){
screeplot <- ggplot(variance, aes(y=variance,x=PC))+geom_bar(stat="identity")+ylab("Proportion of Variance")+xlab("Principle Component")
plot(screeplot)
}
loadings <-pca$rotation
loadings <- sweep(loadings , 2,pca
$sdev , FUN="*")
scores <- pca$x
#start building bi-plots for ALL the PC pairs
for( i in 1:length(PC_selection)){
if (i != length(PC_selection)){
j = i+1
vecs <- data.frame(loadings[,c(i,j)])
#get the top and bottom vectors
tmp_vecs1a<-vecs[order(vecs[,1]),][1:disp_load,]
tmp_vecs1b<-vecs[rev(order(vecs[,1])),][1:disp_load,]
tmp_vecs2a<-vecs[order(vecs[,2]),][1:disp_load,]
tmp_vecs2b<-vecs[rev(order(vecs[,2])),][1:disp_load,]
vecs <-rbind(tmp_vecs1a,tmp_vecs1b,tmp_vecs2a,tmp_vecs2b)
vecs <-unique(vecs)
pca.df <- as.data.frame(pca$x[,c(i,j)])
pca.df <- cbind(ID = row.names(pca.df),pca.df)
pca.df <- merge(pca.df , color_by, by=0 , all=TRUE)
pca.df$Row.names<-NULL
pca_plot <- ggplot(as.data.frame(pca.df), aes_string(x=colnames(pca.df)[2] , y=colnames(pca.df)[3], colour='factor'))+
geom_point(size = 2) +
geom_text_repel(
size=2.5,
aes(label=ID),
segment.color = 'gray',
point.padding = unit(1.0,"lines"),
box.padding = unit(0.1,"lines"),
force = 1,
)+
geom_point(
size=5,
shape = 3,
aes(x=0,y=0),
colour = 'black'
)+
scale_color_hue(l=40)
loadings_plot <- ggplot()+
geom_path(
data=circle,
aes(x , y),
color="black")+
coord_fixed()+
geom_point(
data=vecs,
aes_string(x=colnames(vecs)[1] , y=colnames(vecs)[2]),
color="black",
size=2
)+
geom_text_repel(
data=vecs,
aes_string(x=colnames(vecs)[1] , y=colnames(vecs)[2], label='rownames(vecs)'),
color = "black",
size=2.5,
segment.color = 'gray',
point.padding = unit(1.0,"lines"),
box.padding = unit(0.1,"lines"),
force = 1
) +
geom_point(
size=5,
shape = 3,
aes(x=0,y=0),
colour = 'black'
)
multiplot(pca_plot , loadings_plot, cols = 2)
}
}
}
```
```{r load-mapping-rates , echo = FALSE , fig.width=5}
map_rates = as.data.frame(read.table(file="kraken_stats.tbl",header=TRUE,sep="\t"))
map_rates.proportion_melt <- melt(map_rates,id.vars = 'sample' , measure.vars=c('unclassified.proportion','classified.proportion'))
map_rates.count_melt <- melt(map_rates,id.vars = 'sample' , measure.vars=c('unclassified.count','classified.count'))
row.names(map_rates) <- map_rates$sample
map_rates$sample <- NULL
```
## Sequencing reads per sample:
```{r, mapping-bargraph , echo=FALSE, fig.cap="Read counts for all samples"}
ggplot(data=map_rates.count_melt, aes(x=sample,y=value,fill=variable))+geom_bar(stat="identity")+theme(axis.text.x=element_text(angle=45,hjust=1, size=4))+labs(x= "Sample",y="Number Classified",title="Number of Classified and Unclassified Reads \n Kraken")+guides(fill=guide_legend(title="Read Type"))
```
```{r, mapping-rates , echo = FALSE , fig.cap="Proportion of classified and un-classified reads"}
ggplot(data=map_rates.proportion_melt, aes(x=sample,y=value,fill=variable))+geom_bar(stat="identity")+theme(axis.text.x=element_text(angle=45,hjust=1,size=4))+labs(x= "Sample",y="Proportion Classified",title="Proportion of Classified and Unclassified Reads \n Kraken")+guides(fill=guide_legend(title="Read Type"))
```
```{r,data-prep, echo=FALSE}
#Sample Groups
sample_groups = read.table(file="sample_info.csv",sep = ",",header=TRUE)
sample_groups <- subset(sample_groups, sample_groups$sample_name != '30-03-R')
sample_groups <- subset(sample_groups, sample_groups$sample_name != '30-05-I')
sample_groups <- subset(sample_groups, sample_groups$sample_name != '31-13-R')
row.names(sample_groups)<-sample_groups$sample_name
#getting our groupings together
normal_feed <- c('06-10-R','07-13-R','08-11-R','09-03-R','30-11-R','33-12-R')
malnourished <- c('06-13-R','07-09-R','08-06-R','09-05-R','30-10-R','33-10-R')
malnourished_gm <- c('29-13-R','31-02-R','31-03-R','33-05-R','33-08-R')
malnourished_hlz <- c('29-02-R','29-12-R','31-08-R','31-11-R','33-04-R','33-06-R')
malnourished_etec <- c('29-15-R','29-16-R','30-01-R','31-07-R')
malnourished_gm_etec <- c('29-01-R','29-11-R','31-09-R')
malnourished_hlz_etec <- c('29-03-R','29-08-R','30-05-R','31-10-R')
ileum_normal_feed <- c('06-10-I','08-11-I','33-12-I')
ileum_malnourished <- c('07-09-I','09-05-I','30-10-I')
ileum_malnourished_gm <- c('29-13-I','31-02-I','31-03-I','31-13-I')
ileum_malnourished_hlz <- c('29-12-I','31-08-I','33-04-I')
ileum_malnourished_etec <- c('29-15-I','29-16-I')
ileum_malnourished_hlz_etec <- c('29-03-I','31-10-I')
ileum_malnourished_gm_etec <- c('29-11-I','30-03-I','31-09-I')
names = c(normal_feed ,malnourished ,malnourished_gm ,malnourished_hlz,malnourished_etec ,malnourished_gm_etec ,malnourished_hlz_etec ,ileum_normal_feed ,ileum_malnourished ,ileum_malnourished_gm ,ileum_malnourished_hlz,ileum_malnourished_etec ,ileum_malnourished_hlz_etec ,ileum_malnourished_gm_etec)
kraken_mpa <- read.csv("swine-merged.mpa",sep="\t",header=TRUE,check.names = FALSE)
kraken_mpa$ID<-as.character(kraken_mpa$ID)
kraken_mpa$'30-03-R' <- NULL
kraken_mpa$'30-05-I'<-NULL
kraken_mpa$'31-13-R'<-NULL
kraken_mpa <- kraken_mpa[c("ID",names)]
c_normal_feed <- rep( 'green' , length(normal_feed))
c_ileum_normal_feed <- rep( 'green', length(ileum_normal_feed))
c_malnourished <- rep( 'gold',length(malnourished))
c_ileum_malnourished <- rep( 'gold', length(ileum_malnourished))
c_malnourished_gm <- rep( 'cornflowerblue',length(malnourished_gm))
c_ileum_malnourished_gm <- rep( 'cornflowerblue', length(ileum_malnourished_gm))
c_ileum_malnourished_hlz <- rep( 'dodgerblue', length(ileum_malnourished_hlz))
c_malnourished_hlz <- rep( 'dodgerblue',length(malnourished_hlz))
c_malnourished_etec <- rep( 'red4',length(malnourished_etec))
c_ileum_malnourished_etec <- rep( 'red4', length(ileum_malnourished_etec))
c_malnourished_gm_etec <- rep( 'tomato',length(malnourished_gm_etec))
c_ileum_malnourished_gm_etec <- rep( 'tomato', length(ileum_malnourished_gm_etec))
c_ileum_malnourished_hlz_etec <- rep( 'firebrick', length(ileum_malnourished_hlz_etec))
c_malnourished_hlz_etec <- rep( 'firebrick',length(malnourished_hlz_etec))
column_colors <- c(c_normal_feed ,c_malnourished ,c_malnourished_gm ,c_malnourished_hlz,c_malnourished_etec ,c_malnourished_gm_etec ,c_malnourished_hlz_etec ,c_ileum_normal_feed ,c_ileum_malnourished ,c_ileum_malnourished_gm ,c_ileum_malnourished_hlz,c_ileum_malnourished_etec ,c_ileum_malnourished_hlz_etec ,c_ileum_malnourished_gm_etec)
dirlist <- c('all_taxa', 'bacteria','archaea','viruses')
for( i in dirlist){
if (! dir.exists(i)){
dir.create(i)
}
}
```
```{r}
setwd('all_taxa')
kraken_mpa.rarefy.family <-rarefy_here(get_family(kraken_mpa) ,'all_taxa_family')
kraken_mpa.rarefy.genus <-rarefy_here(get_genus(kraken_mpa) ,'all_taxa_genus')
kraken_mpa.rarefy.species <-rarefy_here(get_species(kraken_mpa),'all_taxa_species')
setwd('..')
```
```{r}
indices<-grep("Bacteria",kraken_mpa$ID)
bacteria_mpa <-kraken_mpa[indices,]
setwd('bacteria')
bacteria_mpa.rarefy.family <-rarefy_here(get_family(bacteria_mpa) ,'bacteria_family')
bacteria_mpa.rarefy.genus <-rarefy_here(get_genus(bacteria_mpa) ,'bacteria_genus')
bacteria_mpa.rarefy.species <-rarefy_here(get_species(bacteria_mpa),'bacteria_species')
setwd('..')
indices<-grep("Viruses",kraken_mpa$ID)
virus_mpa <-kraken_mpa[indices,]
setwd('viruses')
virus_mpa.rarefy.family <-rarefy_here(get_family(virus_mpa) ,'viruses_family')
virus_mpa.rarefy.genus <-rarefy_here(get_genus(virus_mpa) ,'viruses_genus')
virus_mpa.rarefy.species <-rarefy_here(get_species(virus_mpa),'viruses_species')
setwd('..')
indices<-grep("Archaea",kraken_mpa$ID)
archaea_mpa <-kraken_mpa[indices,]
setwd('archaea')
archaea_mpa.rarefy.family <-rarefy_here(get_family(kraken_mpa) ,'archeaa_family')
archaea_mpa.rarefy.genus <-rarefy_here(get_genus(kraken_mpa) ,'archeaa_genus')
archaea_mpa.rarefy.species <-rarefy_here(get_species(kraken_mpa),'archeaa_species')
setwd('..')
```
#Proportional Measures
##All Taxa
```{r, echo=FALSE , fig.cap = "Proportion of observations for all taxa at Family level "}
p1<-barplot_proportional_data(get_family(kraken_mpa),'Family')
plot(p1)
```
```{r, echo=FALSE , fig.cap = "Proportion of observations for all taxa at Genus level "}
p1<-barplot_proportional_data(get_genus(kraken_mpa),'Genus')
plot(p1)
```
##Bacteria
```{r, echo=FALSE , fig.cap = "Proportion of observations for Bacteria at Family level "}
p1<-barplot_proportional_data(get_family(bacteria_mpa),'Family')
plot(p1)
```
```{r, echo=FALSE , fig.cap = "Proportion of observations for Bacteria at Genus level ",fig.height=11, fig.width=8}
p1<-barplot_proportional_data(get_genus(bacteria_mpa),'Genus')
plot(p1)
```
## Viral
```{r, echo=FALSE , fig.cap = "Proportion of observations for all taxa at Family level "}
p1<-barplot_proportional_data(get_family(virus_mpa),'Family')
plot(p1)
```
```{r, echo=FALSE , fig.cap = "Proportion of observations for all taxa at Genus level "}
p1<-barplot_proportional_data(get_genus(virus_mpa),'Genus')
plot(p1)
```
#Clustering
##All Taxa
```{r, echo=FALSE , fig.cap = "Clustered heat map of all taxa at Family level ",fig.height=12 , fig.width=12}
p2<-log2_heatmap(kraken_mpa.rarefy.family , ColumnColors = column_colors )
p2()
```
```{r, echo=FALSE , fig.cap = "Clustered heat map of all taxa at Genus level ",fig.height=12 , fig.width=12}
p2<-log2_heatmap(kraken_mpa.rarefy.genus, ColumnColors = column_colors)
p2()
```
##Bacteria
```{r, echo=FALSE , fig.cap = "Clustered heat map of Bacteria at Family level ",fig.height=12 , fig.width=12}
p2<-log2_heatmap(bacteria_mpa.rarefy.family, ColumnColors = column_colors)
p2()
```
```{r, echo=FALSE , fig.cap = "Clustered heat map of Bacteria at Genus level ",fig.height=12 , fig.width=12}
p2<-log2_heatmap(bacteria_mpa.rarefy.genus, ColumnColors = column_colors)
p2()
```
##Viral Ecology
```{r, echo=FALSE , fig.cap = "Clustered heat map of Viruses at Family level ",fig.height=12 , fig.width=12}
p2<-log2_heatmap(virus_mpa.rarefy.genus, ColumnColors = column_colors)
p2()
```
```{r, echo=FALSE , fig.cap = "Clustered heat map of Viruses at Family level ",fig.height=12 , fig.width=12}
p2<-log2_heatmap(virus_mpa.rarefy.family, ColumnColors = column_colors)
p2()
```
#PCA and Loadings
## All Groups
```{r , build-coloring-factors}
groups<-as.data.frame(sample_groups$group)
row.names(groups) <- row.names(sample_groups)
colnames(groups) <-'factor'
nutrition<-as.data.frame(sample_groups$nutrition)
row.names(nutrition) <- row.names(sample_groups)
colnames(nutrition) <-'factor'
collection<-as.data.frame(sample_groups$collection_type)
row.names(collection)<- row.names(sample_groups)
colnames(collection) <- 'factor'
supplement<-as.data.frame(sample_groups$supplement)
row.names(supplement) <- row.names(sample_groups)
colnames(supplement)<-'factor'
etec_challenge <- as.data.frame(sample_groups$etec_challenge)
row.names(etec_challenge) <- row.names(sample_groups)
colnames(etec_challenge)<-'factor'
```
```{r , fig.height = 12 , fig.width = 18 , fig.caption="PCA and Loading plots of based on all measurements, samples colored by treatment"}
better_bi_plot( kraken_mpa.rarefy.genus , color_by = groups)
```
```{r , fig.height = 12 , fig.width = 18 , fig.caption="PCA and Loading plots of based on all measurements, samples colored by collection location"}
better_bi_plot( kraken_mpa.rarefy.genus , color_by = collection , scree = FALSE)
```
```{r , fig.height = 12 , fig.width = 18 , fig.caption="PCA and Loading plots of based on all measurements, samples colored by collection nutrition"}
better_bi_plot( kraken_mpa.rarefy.genus , color_by = nutrition , scree = FALSE)
```
```{r , fig.height = 12 , fig.width = 18 , fig.caption="PCA and Loading plots of based on all measurements, samples colored by supplement"}
better_bi_plot( kraken_mpa.rarefy.genus , color_by = supplement , scree = FALSE)
```
```{r , fig.height = 12 , fig.width = 18 , fig.caption="PCA and Loading plots of based on all measurements, samples colored by challenge"}
better_bi_plot( kraken_mpa.rarefy.genus , color_by = etec_challenge, scree = FALSE)
```
```{r}
fecal<-subset(sample_groups, collection_type == 'fecal', select = sample_name)
illial <-subset(sample_groups, collection_type == 'ileum', select = sample_name)
fecal.rarefy.genus <- kraken_mpa.rarefy.genus[row.names(fecal),]
fecal.groups <- as.data.frame(groups[row.names(fecal),])
row.names(fecal.groups)<-row.names(fecal)
colnames(fecal.groups)<-'factor'
fecal.nutrition <- as.data.frame( nutrition[row.names(fecal),])
fecal.collection <- as.data.frame( collection[row.names(fecal),])
fecal.supplement <- as.data.frame( supplement[row.names(fecal),])
fecal.etec_challenge <- as.data.frame( etec_challenge[row.names(fecal),])
row.names(fecal.nutrition ) <- row.names(fecal)
row.names(fecal.collection ) <- row.names(fecal)
row.names(fecal.supplement ) <- row.names(fecal)
row.names(fecal.etec_challenge) <- row.names(fecal)
colnames(fecal.nutrition )<- 'factor'
colnames(fecal.collection )<- 'factor'
colnames(fecal.supplement )<- 'factor'
colnames(fecal.etec_challenge)<- 'factor'
```
```{r , fig.height = 12 , fig.width = 18 , fig.caption="PCA and Loading plots of fecal samples based on all measurements, samples colored by group"}
better_bi_plot(fecal.rarefy.genus, color_by = fecal.groups)
```
```{r , fig.height = 12 , fig.width = 18 , fig.caption="PCA and loading plots of fecal samples , colored by nutrition"}
better_bi_plot( fecal.rarefy.genus, color_by = fecal.nutrition , scree = FALSE)
```
```{r , fig.height = 12 , fig.width = 18 , fig.caption="PCA and loading plots of fecal samples , colored by collection location"}
better_bi_plot( fecal.rarefy.genus, color_by = fecal.collection , scree = FALSE)
```
```{r , fig.height = 12 , fig.width = 18 , fig.caption="PCA and loading plots of fecal samples , colored by supplement"}
better_bi_plot( fecal.rarefy.genus, color_by = fecal.supplement , scree = FALSE)
```
```{r , fig.height = 12 , fig.width = 18 , fig.caption="PCA and loading plots of fecal samples , colored by challenge"}
better_bi_plot( fecal.rarefy.genus, color_by = fecal.etec_challenge, scree = FALSE)
```
```{r , warning=FALSE}
fecal.rarefy.genus.t <- t(kraken_mpa.rarefy.genus)
fecal.rarefy.genus.t <- fecal.rarefy.genus.t[rowSums(fecal.rarefy.genus.t) != 0, ]
pvalues = c()
bacteria = c()
for( row in row.names(fecal.rarefy.genus.t)){
x<- fecal.rarefy.genus.t[row,row.names(fecal)]
y<- fecal.rarefy.genus.t[row,row.names(illial)]
test <- wilcox.test(x, y , exact = TRUE , conf.int = TRUE )
if(test$p.value <= 0.05){
pvalues <- c (pvalues , test$p.value)
bacteria <- c (bacteria, row)
}
# if(test$p.value < 0.5){
# boxplot(x,y,main = row)
# }
}
pander(data.frame(bacteria , pvalues))
```
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment