dylanbstorey/GeneralAnalyeses

## GeneralAnalyeses
library(ggplot2)
library(reshape2)
library(vegan)
library(gplots)
library(ggrepel)
library(fossil)

global_S <- NULL


get_species<-function(matrix){
  IndicesToGrab<-grep("s__",matrix$ID);
  matrix$ID<-as.character(matrix$ID);
  Species<-matrix[IndicesToGrab,];
  Species$ID<-gsub("^.*s__|","",Species$ID)
  row.names(Species)=Species$ID
  Species<-Species[,-1];
  Species
  }

get_genus<-function(matrix){
  IndicesToGrab<-grep("g__",matrix$ID);
  Genera<-matrix[IndicesToGrab,];
  Genera$ID<-gsub("^.*g__|","",Genera$ID);
  Genus_and_species<-grep("s__",Genera$ID);
  Genera<-Genera[-Genus_and_species,];
  row.names(Genera)<-Genera$ID;
  Genera<-Genera[,-1];
  Genera
  }

get_family<-function(matrix){
  IndicesToGrab<-grep("f__",matrix$ID);
  Families<-matrix[IndicesToGrab,];
  Families$ID<-gsub("^.*f__|","", Families$ID);
  Family_genus_species_list<-grep("g__", Families$ID);
  Families<-Families[-Family_genus_species_list,];
  Family_species_list <- grep("s__",Families$ID);
  Families<-Families[-Family_species_list,];
  row.names(Families)<-Families$ID;
  Families<-Families[,-1];
  Families
  }

barplot_proportional_data <-function(matrix,level,outname){
  matrix.norm<-data.matrix(matrix);
  matrix.norm<-sweep(matrix.norm,2,colSums(matrix.norm),'/');
  matrix.norm<-sweep(matrix.norm,2,colSums(matrix.norm),'/');
  colnames(matrix.norm) <- colnames(matrix)
  rownames(matrix.norm) <-rownames(matrix)
  matrix.norm.melt<-melt(matrix.norm);
  ggplot(matrix.norm.melt,aes(x=Var2,y=value,fill=Var1))+geom_bar(stat="identity",position="stack")+theme(legend.position="none",panel.grid.major=element_blank(),panel.grid.minor=element_blank(),panel.border=element_blank(),panel.background=element_blank())+theme(axis.text.x = element_text(angle = 45, hjust = 1))+xlab("Sample Name")+ylab("Proportion of Observations")+ggtitle(paste("Proprotional  observations at ",level," level"))
  ggsave(filename=paste0(outname,".pdf"),plot=last_plot(),width=16,height=12,units="in",dpi=500);
}


rarefy_here<-function(matrix,name){
  if (! file.exists(name)){
    minObservations<-min(colSums(matrix))
    rarefied<-data.frame(rrarefy(t(matrix),minObservations))
    write.table(rarefied,file = name)
    }
  else{
    rarefied <- read.table(name,header=TRUE)
    }
    rarefied
  }

log_heatform<-function(matrix,level,outname){
  logxform <-log2(matrix)
  logxform[logxform==-Inf]<-0
  logxform<-logxform[,apply(logxform, 2, var, na.rm=TRUE) != 0]
  pdf(paste0(outname,".pdf"), width=16 , height=32)
  heatmap.2(as.matrix(t(logxform)), col=my_palette,keysize = .75, margins=c(15,10), trace="none", xlab="Sample ID",ylab="Taxon",main=paste0(c("Clustering of samples at ", level," level" )),cexRow=0.3,sepwidth=c(0.05,0.05))
  dev.off()
  }

log_heatform_big<-function(matrix,level,outname){
  logxform <-log2(matrix)
  logxform[logxform==-Inf]<-0
  logxform<-logxform[,apply(logxform, 2, var, na.rm=TRUE) != 0]
  pdf(paste0(outname,".pdf"), width=16 , height=32)
  heatmap.2(as.matrix(t(logxform)), col=my_palette,keysize = .75, margins=c(15,10), trace="none", xlab="Sample ID",ylab="Taxon",main=paste0(c("Clustering of samples at ", level," level" )),cexRow=1,sepwidth=c(0.05,0.05))
  dev.off()
}


alpha_diversity_metrics<-function(matrix,level){
  global_S <- NULL
# TAKES RAREFIED DATA
  H <- data.frame(diversity(matrix))
  simpson <- data.frame(diversity(matrix, "simpson"))
  shannon<-data.frame(diversity(matrix , "shannon"))
  invsimp <- data.frame(diversity(matrix, "inv"))
  alpha <- data.frame(fisher.alpha(matrix))
  ## Species richness (S) and Pielou's evenness (J):
  S <- data.frame(specnumber(matrix))
  J <- data.frame(H/log(S))
  Diversity<-cbind(simpson, shannon, invsimp, alpha, S, J)
  Diversity$Sample<-row.names(Diversity)


  names(Diversity)<-c("Simpson", "Shannon",  "InvSimpson", "Alpha", 'ClassNo', "Evenness", "Sample")
  Diversity<-Diversity[,c(7,1,2,3,4, 5, 6)]
  pdf(file=paste0(level,'_estimate_pairs.pdf'))
  pairs(cbind(simpson, shannon, alpha, J, S), col="blue")
  dev.off()
  # Plots of alpha-diversity metrics

  ggplot(data=Diversity, aes(x=Sample, y=Simpson)) + geom_bar(colour="black", fill="darkblue", stat="identity") +theme(axis.text.x = element_text(angle = 45, hjust = 1))
  ggsave(paste0(level,"_Diversity_Simpson.pdf"),width=5,height=5,units="in",dpi=500)
  ggplot(data=Diversity, aes(x=Sample, y=Alpha)) + geom_bar(colour="black", fill="darkblue", stat="identity")+theme(axis.text.x = element_text(angle = 45, hjust = 1))
  ggsave(paste0(level,"_Diversity_Alpha.pdf"),width=5,height=5,units="in",dpi=500)
  ggplot(data=Diversity, aes(x=Sample, y=Evenness)) + geom_bar(colour="black", fill="darkblue", stat="identity") +theme(axis.text.x = element_text(angle = 45, hjust = 1))
  ggsave(paste0(level,"_Diversity_Eveness.pdf"),width=5,height=5,units="in",dpi=500)
  ggplot(data=Diversity, aes(x=Sample, y=ClassNo)) + geom_bar(colour="black", fill="darkblue", stat="identity") +theme(axis.text.x = element_text(angle = 45, hjust = 1))
  ggsave(paste0(level,"_Diversity_classNo.pdf"),width=5,height=5,units="in",dpi=500)
  ggplot(data=Diversity, aes(x=Sample, y=Shannon)) + geom_bar(colour="black", fill="darkblue", stat="identity") +theme(axis.text.x = element_text(angle = 45, hjust = 1))
  ggsave(paste0(level,"_Diversity_Shannon.pdf"),width=5,height=5,units="in",dpi=500)
  global_S <<- S
  }

species_per_sample<-function(matrix,level){
  ## Rarefaction curves
  raremax <- min(rowSums(t(matrix)))
  Srare <- rarefy(t(matrix), raremax)
  x<-data.frame(global_S,Srare)

  pdf(paste0(level,"_rarefaction.pdf"))
  plot(x, xlab = paste0("Observed No. of ",level), ylab = paste0("Rarefied No. of ",level))
  dev.off();

  pdf(paste0("rarefaction_",level,'.pdf'))
  rarecurve(t(matrix), step = 1000, col = "blue", cex = 0.5, sample= raremax, xlab = "Sample Size", ylab = level, label = TRUE, lwd = 2)
  dev.off()


    }
my_palette <- colorRampPalette(c("blue", "white", "red"))(n = 1000)

draw_circle<- function(center = c(0,0),diameter = 1, npoints = 1000){
  r = 25
  tt <- seq(0,2*pi,length.out = npoints)
  xx <- center[1] + r * cos(tt)
  yy <- center[2] + r * sin(tt)
  return(data.frame(x = xx, y = yy))
}

circle<-draw_circle()


bi_plot <-function(matrix, outfile ){
  logxform <-log2(matrix)
  logxform[logxform==-Inf]<-0
  logxform<-logxform[,apply(logxform, 2, var, na.rm=TRUE) != 0]

  #PCA and Scree
  pca<-prcomp(logxform , center = TRUE , scale.=TRUE)
  std_dev<-pca$sdev
  loadings<-pca$rotation
  norm_loadings<-sweep(loadings,2,colSums(loadings),"/")
  scores<-pca$x
  variance<- std_dev^2
  variance<-variance/sum(variance) * 100
  variance<-as.data.frame(variance)
  variance <-cbind(PC = as.numeric(row.names(variance)),variance)
  ggplot(variance, aes(y=variance,x=PC))+geom_bar(stat="identity")+ylab("Proportion of Variance")+xlab("Principle Component")
  ggsave(filename=paste0(outfile,"_scree.pdf"),plot=last_plot(),width=16,height=12,units="in",dpi=500)

  vecs<-data.frame(loadings[,c('PC1','PC2')])
  vecs = vecs * 25 * 10

  tmp_vecs1a<-vecs[order(vecs[,1]),][1:10,]
  tmp_vecs1b<-vecs[rev(order(vecs[,1])),][1:10,]
  tmp_vecs2a<-vecs[order(vecs[,2]),][1:10,]
  tmp_vecs2b<-vecs[rev(order(vecs[,2])),][1:10,]
  vecs <-rbind(tmp_vecs1a,tmp_vecs1b,tmp_vecs2a,tmp_vecs2b)
  vecs <-unique(vecs)


  pca.df <-as.data.frame(pca$x)
  pca.df <- cbind( ID = row.names(pca.df),pca.df)

  ggplot(pca.df, aes(x=PC1 , y = PC2)) +
    geom_point(
      size=1,
    )+
    geom_text_repel(
      size=2.5,
      aes(label=ID),
      segment.color = 'gray',
      point.padding = unit(1.0,"lines"),
      box.padding = unit(0.1,"lines"),
      force = 1,
    )+
    geom_point(
      size=5,
      shape = 3,
      aes(x=0,y=0),
      color="grey",
    )+
    geom_path(
      data=circle,
      aes(x , y),
      color="red")+
    coord_fixed()+
    geom_point(
      data=vecs,
      aes(x=PC1,y=PC2),
      color="red",
      size=2
    )+
    geom_text_repel(
      data=vecs,
      aes(x=PC1,y=PC2,label=rownames(vecs)),
      color = "red",
      size=2.5,
      segment.color = 'gray',
      point.padding = unit(1.0,"lines"),
      box.padding = unit(0.1,"lines"),
      force = 1
    )
  ggsave(filename=paste0(outfile,"_pca.pdf"),plot=last_plot(),width=16,height=12,units="in",dpi=500);
}


#Kraken Mapping
map_rates = as.data.frame(read.table(file="stats.tbl",header=TRUE,sep="\t"))
map_rates.melt <- melt(map_rates,id.vars = 'sample')
ggplot(data=map_rates.melt, aes(x=sample,y=value,fill=variable))+geom_bar(stat="identity")+theme(axis.text.x=element_text(angle=90,hjust=1,vjust=0.5))+labs(x= "Sample",y="Proportion Classified",title="Proportion of Classified and Unclassified Reads \n Kraken")+guides(fill=guide_legend(title="Read Type"))
ggsave(filename = "KrakenClassificationProportion.pdf", plot=last_plot(),width=16,height=12,units="in",dpi=500);
#data prep
kraken_mpa <- read.csv("kraken-mpa.merge",sep="\t",header=TRUE)
kraken_mpa$ID<-as.character(kraken_mpa$ID)

#sub sample Bacteria, Viruses , and Archaea
indices<-grep("Bacteria",kraken_mpa$ID)
bacteria_mpa <-kraken_mpa[indices,]

indices<-grep("Viruses",kraken_mpa$ID)
viruses_mpa <-kraken_mpa[indices,]

indices<-grep("Archaea",kraken_mpa$ID)
archaea_mpa <-kraken_mpa[indices,]

dirlist <- c('AllTaxa', 'Bacteria','Archaea','Viruses','Views')

for( i in dirlist){
  if (! dir.exists(i)){
    dir.create(i)
    }
  }


setwd('AllTaxa')
kraken_mpa.rarefy.family     <-rarefy_here(get_family(kraken_mpa) ,'kraken_mpa_rarefy_family.txt')
kraken_mpa.rarefy.genus      <-rarefy_here(get_genus(kraken_mpa)  ,'kraken_mpa_rarefy_genus.txt')
kraken_mpa.rarefy.species    <-rarefy_here(get_species(kraken_mpa),'kraken_mpa_rarefy_species.txt')

barplot_proportional_data(get_family(kraken_mpa),'family', 'All_taxa_family_proportion')
log_heatform(kraken_mpa.rarefy.family,'family','All_taxa_family_heat')
bi_plot(kraken_mpa.rarefy.family,"All_taxa_family_biplot")
alpha_diversity_metrics(kraken_mpa.rarefy.family,'Families')
species_per_sample(get_family(kraken_mpa),'Families')

barplot_proportional_data(get_genus(kraken_mpa),'genus','All_taxa_genus_proportion')
log_heatform(kraken_mpa.rarefy.genus,'genus','All_taxa_genus_heat')
bi_plot(kraken_mpa.rarefy.genus,'All_taxa_genus_biplot')
alpha_diversity_metrics(kraken_mpa.rarefy.genus,'Genera')
species_per_sample(get_genus(kraken_mpa),'Genera')

barplot_proportional_data(get_species(kraken_mpa),'species','All_taxa_species_proportion')
log_heatform(kraken_mpa.rarefy.species,'species','All_taxa_species_heat')
bi_plot(kraken_mpa.rarefy.species,'All_taxa_species_biplot')

alpha_diversity_metrics(kraken_mpa.rarefy.species,'Species')
species_per_sample(get_species(kraken_mpa),'Species')


setwd('..')

setwd('Bacteria')
bacteria_mpa.rarefy.family   <-rarefy_here(get_family(bacteria_mpa) ,'bacteria_mpa_rarefy_family.txt')
bacteria_mpa.rarefy.genus    <-rarefy_here(get_genus(bacteria_mpa)  ,'bacteria_mpa_rarefy_genus.txt')
bacteria_mpa.rarefy.species  <-rarefy_here(get_species(bacteria_mpa),'bacteria_mpa_rarefy_species.txt')

barplot_proportional_data(get_family(bacteria_mpa),'family','Bacteria_familiy_proportion')
log_heatform(bacteria_mpa.rarefy.family,'family','Bacteria_family_heat')
bi_plot(bacteria_mpa.rarefy.family,'Bacteria_family_biplot')


barplot_proportional_data(get_genus(bacteria_mpa),'genus','Bacteria_genus_proportion')
log_heatform(bacteria_mpa.rarefy.genus,'genus','Bacteria_genus_heat')
bi_plot(bacteria_mpa.rarefy.genus,'Bacteria_genus_biplot')

barplot_proportional_data(get_species(bacteria_mpa),'species','Bacteria_species_proportion')
log_heatform(bacteria_mpa.rarefy.species,'species','Bacteria_species_heat')
bi_plot(bacteria_mpa.rarefy.species,'Bacteria_species_biplot')


alpha_diversity_metrics(bacteria_mpa.rarefy.family,'Families')
species_per_sample(get_family(bacteria_mpa),'Families')
alpha_diversity_metrics(bacteria_mpa.rarefy.genus,'Genera')
species_per_sample(get_genus(bacteria_mpa),'Genera')
alpha_diversity_metrics(bacteria_mpa.rarefy.species,'Species')
species_per_sample(get_species(bacteria_mpa),'Species')


setwd('..')

setwd('Viruses')
viruses_mpa.rarefy.family    <-rarefy_here(get_family(viruses_mpa) ,'viruses_mpa_rarefy_family.txt')
viruses_mpa.rarefy.genus     <-rarefy_here(get_genus(viruses_mpa)  ,'viruses_mpa_rarefy_genus.txt')
viruses_mpa.rarefy.species   <-rarefy_here(get_species(viruses_mpa),'viruses_mpa_rarefy_species.txt')

barplot_proportional_data(get_family(viruses_mpa),'family','Virus_family_proportion')
log_heatform(viruses_mpa.rarefy.family,'family','Virus_family_heat')
bi_plot(viruses_mpa.rarefy.family,'Viruses_family_biplot')


barplot_proportional_data(get_genus(viruses_mpa),'genus','Virus_genus_proportion')
log_heatform(viruses_mpa.rarefy.genus,'genus','Virus_genus_heat')
bi_plot(viruses_mpa.rarefy.genus,'Viruses_genus_biplot')


barplot_proportional_data(get_species(viruses_mpa),'species','Virus_species_proportion')
log_heatform(viruses_mpa.rarefy.species,'species','Virus_species_heat')
bi_plot(viruses_mpa.rarefy.species,'Virus_species_biplot')

alpha_diversity_metrics(viruses_mpa.rarefy.family,'Families')
species_per_sample(get_family(viruses_mpa),'Families')
alpha_diversity_metrics(viruses_mpa.rarefy.genus,'Genera')
species_per_sample(get_genus(viruses_mpa),'Genera')
alpha_diversity_metrics(viruses_mpa.rarefy.species,'Species')
species_per_sample(get_species(viruses_mpa),'Species')

setwd('..')


setwd('Archaea')
#archaea_mpa.rarefy.family    <-rarefy_here(get_family(archaea_mpa) ,'archaea_mpa_rarefy_family.txt')
archaea_mpa.rarefy.genus     <-rarefy_here(get_genus(archaea_mpa)  ,'archaea_mpa_rarefy_genus.txt')
archaea_mpa.rarefy.species   <-rarefy_here(get_species(archaea_mpa),'archaea_mpa_rarefy_species.txt')

#barplot_proportional_data(get_family(archaea_mpa),'family','Archaea_family_proportion')
#log_heatform(archaea_mpa.rarefy.family,'family','Archaea_family_heat')

barplot_proportional_data(get_genus(archaea_mpa),'genus','Archaea_genus_proportion')
log_heatform(archaea_mpa.rarefy.genus,'family','Archaea_genus_heat')
bi_plot(archaea_mpa.rarefy.genus,'Archaea_genus_biplot')

barplot_proportional_data(get_species(archaea_mpa),'species','Archaea_species_proportion')
log_heatform(archaea_mpa.rarefy.species,'species','Archaea_speceis_heat')
bi_plot(archaea_mpa.rarefy.species,'Archaea_species_biplot')


#alpha_diversity_metrics(archaea_mpa.rarefy.family,'Families')
#species_per_sample(get_family(archaea_mpa),'Families')
alpha_diversity_metrics(archaea_mpa.rarefy.genus,'Genera')
species_per_sample(get_genus(archaea_mpa),'Genera')
alpha_diversity_metrics(archaea_mpa.rarefy.species,'Species')
species_per_sample(get_species(archaea_mpa),'Species')

setwd('..')


setwd('Views')
bacteria_genera <- c('g__Salmonella' , 'g__Campylobacter', 'g__Staphylococcus','g__Clostridium','g__Chlamydia','g__Streptococcus','g__Escherichia','g__Enterococcus','g__Enterobacter', 'g__Helicobacter', 'g__Bacillus','g__Listeria','g__Alteromonas','g__Pseudomonas','g__Lactobacillus','g__Lactococcus','g__Altermonas')
views_mpa <- list()
for (i in bacteria_genera){
  indices <- grep(i,bacteria_mpa$ID)
  views_mpa <- rbind (views_mpa,bacteria_mpa[indices,] )
}
views_mpa.rarefy.genus <- rarefy_here(get_genus(views_mpa),'views_mpa_rarefy_genus.txt')
views_mpa.rarefy.species <-rarefy_here(get_species(views_mpa),'views_mpa_rarefy_species.txt')
log_heatform_big(views_mpa.rarefy.genus,'genus','views_genus_heat')
log_heatform_big(views_mpa.rarefy.species,'species','views_species_heat')
setwd('..')

## Microbiom.rmd
---
title: "Swine Micro-biome"
author: "Dylan B Storey"
output:
  pdf_document:
    fig_caption: yes
    keep_tex: yes
  html_document:
    fig_caption: yes
    keep_md: yes
---

```{r, library load , echo=FALSE,message=FALSE}

library(ggplot2)
library(reshape2)
library(vegan)
library(gplots)
library(ggrepel)
library(fossil)
library(knitr)
library(pander)
```

```{r , function declaration , echo=FALSE, message=FALSE}
global_S <- NULL

get_species<-function(matrix){
#' Given an matrix where taxon ID is mpa format - gets the species rows and cleans up the name accordingly
#'
#' @param matrix the incomoning mpa matrix
#' @return a subset of \code{matrix} that only contains species rows
#'
  IndicesToGrab<-grep("s__",matrix$ID);
  matrix$ID<-as.character(matrix$ID);
  Species<-matrix[IndicesToGrab,];
  Species$ID<-gsub("^.*s__|","",Species$ID)
  row.names(Species)=Species$ID
  Species<-Species[,-1];
  Species
  }

get_genus<-function(matrix){
#' Given an matrix where taxon ID is mpa format - gets the genus rows and cleans up the name accordingly
#'
#' @param matrix the incomoning mpa matrix
#' @return a subset of \code{matrix} that only contains genus rows
#'
  IndicesToGrab<-grep("g__",matrix$ID);
  Genera<-matrix[IndicesToGrab,];
  Genera$ID<-gsub("^.*g__|","",Genera$ID);
  Genus_and_species<-grep("s__",Genera$ID);
  Genera<-Genera[-Genus_and_species,];
  row.names(Genera)<-Genera$ID;
  Genera<-Genera[,-1];
  Genera
  }

get_family<-function(matrix){
#' Given an matrix where taxon ID is mpa format - gets the family rows and cleans up the name accordingly
#'
#' @param matrix the incomoning mpa matrix
#' @return a subset of \code{matrix} that only contains family rows
#'
  IndicesToGrab<-grep("f__",matrix$ID);
  Families<-matrix[IndicesToGrab,];
  Families$ID<-gsub("^.*f__|","", Families$ID);
  Family_genus_species_list<-grep("g__", Families$ID);
  Families<-Families[-Family_genus_species_list,];
  Family_species_list <- grep("s__",Families$ID);
  Families<-Families[-Family_species_list,];
  row.names(Families)<-Families$ID;
  Families<-Families[,-1];
  Families
  }

barplot_proportional_data<-function(matrix,level){
#' Given a matrix of data , creates a stacked bargraph of the data as proportions
#'
#' @param matrix , the incoming MPA matrix
#' @param level  , the taxonomic level to put in the title
#'
#' @return the ggplot2 object
	matrix.norm<-data.matrix(matrix)
	matrix.norm<-sweep(matrix.norm,2,colSums(matrix.norm),'/');
  	colnames(matrix.norm) <- colnames(matrix)
  	rownames(matrix.norm) <-rownames(matrix)
  	matrix.norm.melt<-melt(matrix.norm);

	pp1<-ggplot(matrix.norm.melt,aes(x=Var2,y=value,fill=Var1))+geom_bar(stat="identity",position="stack")+theme(legend.position="none",panel.grid.major=element_blank(),panel.grid.minor=element_blank(),panel.border=element_blank(),panel.background=element_blank())+theme(axis.text.x = element_text(angle = 45, hjust = 1,size=4))+xlab("Sample Name")+ylab("Proportion of Observations")+ggtitle(paste("Proprotional  observations at ",level," level"))

	return(pp1)
	}

rarefy_here<-function(matrix,name=NULL){
#' Cached rarefication , because rarefying data is expensive a cached version is useful
#' This function caches both the incoming matrix AND the rarefied matrix
#'
#' @param matrix , the matrix count data to be rarefied
#' @param name (default=NULL i.e. optional) where to cache the data
#'
#' @return the rarefied matrix of \code{matrix}
#'

	if (!is.null(name)){
  		name = paste0(name,'.rare.txt')
  		original_table_name = paste0(name,'.no_rare.txt')
		if (! file.exists(name)){
			#file doesn't exist , rarefy and cache
	    	Observations<-colSums(matrix)
	    	rarefied<-data.frame(rrarefy(t(matrix),min(Observations)))
	    	write.table(t(matrix),file=original_table_name)
	    	write.table(rarefied,file = name)
    		}
   		else{
   			#load from disk
    		rarefied <- read.table(name,header=TRUE)
    		}
    	}
    else{
    	rarefied<-data.frame(rrarefy(t(matrix),min(Observations)))
    	}

    return(rarefied)
	}

log2_heatmap<-function(matrix,level, main = NULL, cexRow = 0.3 , trace = "none", keysize =1 , xlab = "Sample ID", ylab = NULL, sepwidth=c(0.05,0.05), ColumnColors = NULL,...){

#' creates a heatmap of the matrix after a log2 transformation
#'
#' @param matrix the incoming matrix
#' @param level  the taxonomic level (for title)
#'
#' @return heatmap.2 object
#' @example my_plot <- log2_heatmap(data)
#' @example my_plot()

  my_palette <- colorRampPalette(c("blue", "white", "red"))(n = 1000)
  logxform <-log2(matrix)
  logxform[logxform==-Inf]<-0
  logxform<-logxform[,apply(logxform, 2, var, na.rm=TRUE) != 0]

  hclustfunc <- function(x) hclust(x, method="average")
  distfunc <- function(x) dist(x, method="canberra")

  # perform clustering on rows and columns
  cl.col <- hclustfunc(distfunc(matrix))
  col_colors<-ColumnColors[cl.col$order]

  plot_heatmap<- function() heatmap.2(as.matrix(t(logxform)),
  							col=my_palette,
  							main=main,
  							distfun = distfunc,
  							hclustfun = hclustfunc,
  							cexRow = cexRow,
  							trace = trace,
  							keysize = keysize,
  							xlab = xlab,
  							ylab = ylab,
  							sepwidth = sepwidth,
  							ColSideColors = col_colors ,
  							...
  							)


	}

draw_circle<- function(center = c(0,0),r = 1, npoints = 100){
  #' Draws a circle and places data into a data-frame for later use
  #' @param center , where should I center on
  #' @param diameter , how wide
  #' @param npoints , how many points to draw
  #'
  #' @return dataframe of x, y co-ordinates


  tt <- seq(0,2*pi,length.out = npoints)
  xx <- center[1] + r * cos(tt)
  yy <- center[2] + r * sin(tt)
  return(data.frame(x = xx, y = yy , r = r))
}

```

```{r ,  echo=FALSE,message=FALSE}
multiplot <- function(..., plotlist=NULL, file, cols=1, layout=NULL) {
  require(grid)

  plots <- c(list(...), plotlist)

  numPlots = length(plots)

  if (is.null(layout)) {
    layout <- matrix(seq(1, cols * ceiling(numPlots/cols)),
                    ncol = cols, nrow = ceiling(numPlots/cols))
  }

 if (numPlots==1) {
    print(plots[[1]])

  } else {
    grid.newpage()
    pushViewport(viewport(layout = grid.layout(nrow(layout), ncol(layout))))

    for (i in 1:numPlots) {
      matchidx <- as.data.frame(which(layout == i, arr.ind = TRUE))

      print(plots[[i]], vp = viewport(layout.pos.row = matchidx$row,
                                      layout.pos.col = matchidx$col))
    }
  }
}

```


```{r,  echo=FALSE,message=FALSE}

better_bi_plot <- function(matrix , logtransform = TRUE , PC_selection = c(1,2) , disp_load = 10 , color_by = NULL, vector_scale = 25 , scree = TRUE){
#' Calculates PCA and build a bi-plot using ggrepel for a prettier output
#' @param matrix 		, the matrix of data
#' @param logtransform 	, should the data be transformed
#' @param PC_selection 	, a list of PCs to retain for plotting defaults to c(1,2)
#' @param disp_load 	, which loadings should be displayed  (defaults to the strongest 10)
#'
#' @returns bi_plot data frame , all plots are in bi_plot$plot , screeplot in bi_plot$screeplot , pca in bi_plot$pca


#build circle

  r= 1
  tt <- seq(0,2*pi,length.out = 1000)
  xx <- 0 + r * cos(tt)
  yy <- 0 + r * sin(tt)
  circle<- (data.frame(x = xx, y = yy , r = r))

#Transform if wished
	if (logtransform == TRUE){
		matrix <- log2(matrix)
		matrix[matrix==-Inf]<-0
  		matrix<-matrix[,apply(matrix, 2, var, na.rm=TRUE) != 0]
		}

#PCA perform pca

	pca <- prcomp(matrix, center = TRUE, scale.=TRUE)

#Scree Plot
	std_deviation <- pca$sdev
	variance<-std_deviation^2
	variance<-variance / sum(variance)
	variance<-as.data.frame(variance)
	variance <-cbind(PC = as.numeric(row.names(variance)),variance)

	if (scree == TRUE){
		screeplot <- ggplot(variance, aes(y=variance,x=PC))+geom_bar(stat="identity")+ylab("Proportion of Variance")+xlab("Principle Component")
		plot(screeplot)
		}


	loadings <-pca$rotation

	loadings <- sweep(loadings , 2,pca
	                  $sdev , FUN="*")


	scores <- pca$x


#start building bi-plots for ALL the PC pairs
	for( i in 1:length(PC_selection)){

		if (i != length(PC_selection)){
			j = i+1
		    vecs <- data.frame(loadings[,c(i,j)])

		    #get the top and bottom vectors
		    tmp_vecs1a<-vecs[order(vecs[,1]),][1:disp_load,]
		    tmp_vecs1b<-vecs[rev(order(vecs[,1])),][1:disp_load,]
		    tmp_vecs2a<-vecs[order(vecs[,2]),][1:disp_load,]
		    tmp_vecs2b<-vecs[rev(order(vecs[,2])),][1:disp_load,]
		    vecs <-rbind(tmp_vecs1a,tmp_vecs1b,tmp_vecs2a,tmp_vecs2b)
		    vecs <-unique(vecs)


		    pca.df <- as.data.frame(pca$x[,c(i,j)])
		    pca.df <- cbind(ID = row.names(pca.df),pca.df)

  		  pca.df <- merge(pca.df , color_by, by=0 , all=TRUE)
        pca.df$Row.names<-NULL

        pca_plot <- ggplot(as.data.frame(pca.df), aes_string(x=colnames(pca.df)[2] , y=colnames(pca.df)[3], colour='factor'))+
		  geom_point(size = 2) +
			geom_text_repel(
			      size=2.5,
			      aes(label=ID),
			      segment.color = 'gray',
			      point.padding = unit(1.0,"lines"),
			      box.padding = unit(0.1,"lines"),
			      force = 1,
			      )+
			    geom_point(
			      size=5,
			      shape = 3,
			      aes(x=0,y=0),
			      colour = 'black'
			   	  )+
		      scale_color_hue(l=40)

		      loadings_plot <- ggplot()+
			    geom_path(
			      data=circle,
			      aes(x , y),
			      color="black")+
			    coord_fixed()+
			    geom_point(
			      data=vecs,
			      aes_string(x=colnames(vecs)[1] , y=colnames(vecs)[2]),
			      color="black",
			      size=2
			     )+
			    geom_text_repel(
			      data=vecs,
			      aes_string(x=colnames(vecs)[1] , y=colnames(vecs)[2], label='rownames(vecs)'),
			      color = "black",
			      size=2.5,
			      segment.color = 'gray',
			      point.padding = unit(1.0,"lines"),
			      box.padding = unit(0.1,"lines"),
			      force = 1
			      )	+
		        geom_point(
			        size=5,
			        shape = 3,
			        aes(x=0,y=0),
			        colour = 'black'
			      )
		      multiplot(pca_plot , loadings_plot, cols = 2)

		   }
		}
	}


```


```{r load-mapping-rates , echo = FALSE , fig.width=5}
map_rates = as.data.frame(read.table(file="kraken_stats.tbl",header=TRUE,sep="\t"))
map_rates.proportion_melt <- melt(map_rates,id.vars = 'sample' , measure.vars=c('unclassified.proportion','classified.proportion'))
map_rates.count_melt <- melt(map_rates,id.vars = 'sample' , measure.vars=c('unclassified.count','classified.count'))

row.names(map_rates) <- map_rates$sample
map_rates$sample <- NULL
```


## Sequencing reads per sample:

```{r, mapping-bargraph , echo=FALSE, fig.cap="Read counts for all samples"}
ggplot(data=map_rates.count_melt, aes(x=sample,y=value,fill=variable))+geom_bar(stat="identity")+theme(axis.text.x=element_text(angle=45,hjust=1, size=4))+labs(x= "Sample",y="Number Classified",title="Number of Classified and Unclassified Reads \n Kraken")+guides(fill=guide_legend(title="Read Type"))

```

```{r, mapping-rates ,  echo = FALSE , fig.cap="Proportion of classified and un-classified reads"}

ggplot(data=map_rates.proportion_melt, aes(x=sample,y=value,fill=variable))+geom_bar(stat="identity")+theme(axis.text.x=element_text(angle=45,hjust=1,size=4))+labs(x= "Sample",y="Proportion Classified",title="Proportion of Classified and Unclassified Reads \n Kraken")+guides(fill=guide_legend(title="Read Type"))

```


```{r,data-prep, echo=FALSE}


#Sample Groups
sample_groups = read.table(file="sample_info.csv",sep = ",",header=TRUE)

sample_groups <- subset(sample_groups, sample_groups$sample_name != '30-03-R')
sample_groups <- subset(sample_groups, sample_groups$sample_name != '30-05-I')
sample_groups <- subset(sample_groups, sample_groups$sample_name != '31-13-R')

row.names(sample_groups)<-sample_groups$sample_name

#getting our groupings together
normal_feed  				<- c('06-10-R','07-13-R','08-11-R','09-03-R','30-11-R','33-12-R')
malnourished 				<- c('06-13-R','07-09-R','08-06-R','09-05-R','30-10-R','33-10-R')
malnourished_gm 			<- c('29-13-R','31-02-R','31-03-R','33-05-R','33-08-R')
malnourished_hlz			<- c('29-02-R','29-12-R','31-08-R','31-11-R','33-04-R','33-06-R')
malnourished_etec 			<- c('29-15-R','29-16-R','30-01-R','31-07-R')
malnourished_gm_etec 		<- c('29-01-R','29-11-R','31-09-R')
malnourished_hlz_etec 		<- c('29-03-R','29-08-R','30-05-R','31-10-R')

ileum_normal_feed  			<- c('06-10-I','08-11-I','33-12-I')
ileum_malnourished 			<- c('07-09-I','09-05-I','30-10-I')
ileum_malnourished_gm 		<- c('29-13-I','31-02-I','31-03-I','31-13-I')
ileum_malnourished_hlz		<- c('29-12-I','31-08-I','33-04-I')
ileum_malnourished_etec 	<- c('29-15-I','29-16-I')
ileum_malnourished_hlz_etec <- c('29-03-I','31-10-I')
ileum_malnourished_gm_etec	<- c('29-11-I','30-03-I','31-09-I')

names = c(normal_feed  ,malnourished ,malnourished_gm ,malnourished_hlz,malnourished_etec ,malnourished_gm_etec ,malnourished_hlz_etec ,ileum_normal_feed  ,ileum_malnourished ,ileum_malnourished_gm ,ileum_malnourished_hlz,ileum_malnourished_etec ,ileum_malnourished_hlz_etec ,ileum_malnourished_gm_etec)

kraken_mpa <- read.csv("swine-merged.mpa",sep="\t",header=TRUE,check.names = FALSE)
kraken_mpa$ID<-as.character(kraken_mpa$ID)

kraken_mpa$'30-03-R' <- NULL
kraken_mpa$'30-05-I'<-NULL
kraken_mpa$'31-13-R'<-NULL
kraken_mpa <- kraken_mpa[c("ID",names)]

c_normal_feed  					<- rep( 'green' , length(normal_feed))
c_ileum_normal_feed  			<- rep( 'green', length(ileum_normal_feed))
c_malnourished 					<- rep( 'gold',length(malnourished))
c_ileum_malnourished 			<- rep( 'gold', length(ileum_malnourished))

c_malnourished_gm 				<- rep( 'cornflowerblue',length(malnourished_gm))
c_ileum_malnourished_gm 		<- rep( 'cornflowerblue', length(ileum_malnourished_gm))
c_ileum_malnourished_hlz		<- rep( 'dodgerblue', length(ileum_malnourished_hlz))
c_malnourished_hlz				<- rep( 'dodgerblue',length(malnourished_hlz))

c_malnourished_etec 			<- rep( 'red4',length(malnourished_etec))
c_ileum_malnourished_etec 	    <- rep( 'red4', length(ileum_malnourished_etec))

c_malnourished_gm_etec 			<- rep( 'tomato',length(malnourished_gm_etec))
c_ileum_malnourished_gm_etec	<- rep( 'tomato', length(ileum_malnourished_gm_etec))

c_ileum_malnourished_hlz_etec	<- rep( 'firebrick', length(ileum_malnourished_hlz_etec))
c_malnourished_hlz_etec 		<- rep( 'firebrick',length(malnourished_hlz_etec))

column_colors <- c(c_normal_feed  ,c_malnourished ,c_malnourished_gm ,c_malnourished_hlz,c_malnourished_etec ,c_malnourished_gm_etec ,c_malnourished_hlz_etec ,c_ileum_normal_feed  ,c_ileum_malnourished ,c_ileum_malnourished_gm ,c_ileum_malnourished_hlz,c_ileum_malnourished_etec ,c_ileum_malnourished_hlz_etec ,c_ileum_malnourished_gm_etec)


dirlist <- c('all_taxa', 'bacteria','archaea','viruses')

for( i in dirlist){
  if (! dir.exists(i)){
    dir.create(i)
    }
  }


```
```{r}

setwd('all_taxa')
kraken_mpa.rarefy.family     <-rarefy_here(get_family(kraken_mpa) ,'all_taxa_family')
kraken_mpa.rarefy.genus      <-rarefy_here(get_genus(kraken_mpa)  ,'all_taxa_genus')
kraken_mpa.rarefy.species    <-rarefy_here(get_species(kraken_mpa),'all_taxa_species')
setwd('..')
```
```{r}

indices<-grep("Bacteria",kraken_mpa$ID)
bacteria_mpa <-kraken_mpa[indices,]
setwd('bacteria')
bacteria_mpa.rarefy.family     <-rarefy_here(get_family(bacteria_mpa) ,'bacteria_family')
bacteria_mpa.rarefy.genus      <-rarefy_here(get_genus(bacteria_mpa)  ,'bacteria_genus')
bacteria_mpa.rarefy.species    <-rarefy_here(get_species(bacteria_mpa),'bacteria_species')
setwd('..')


indices<-grep("Viruses",kraken_mpa$ID)
virus_mpa <-kraken_mpa[indices,]
setwd('viruses')
virus_mpa.rarefy.family     <-rarefy_here(get_family(virus_mpa) ,'viruses_family')
virus_mpa.rarefy.genus      <-rarefy_here(get_genus(virus_mpa)  ,'viruses_genus')
virus_mpa.rarefy.species    <-rarefy_here(get_species(virus_mpa),'viruses_species')
setwd('..')


indices<-grep("Archaea",kraken_mpa$ID)
archaea_mpa <-kraken_mpa[indices,]

setwd('archaea')
archaea_mpa.rarefy.family     <-rarefy_here(get_family(kraken_mpa) ,'archeaa_family')
archaea_mpa.rarefy.genus      <-rarefy_here(get_genus(kraken_mpa)  ,'archeaa_genus')
archaea_mpa.rarefy.species    <-rarefy_here(get_species(kraken_mpa),'archeaa_species')
setwd('..')


```

#Proportional Measures

##All Taxa
```{r, echo=FALSE , fig.cap = "Proportion of observations for all taxa at Family level "}
p1<-barplot_proportional_data(get_family(kraken_mpa),'Family')
plot(p1)
```

```{r, echo=FALSE , fig.cap = "Proportion of observations for all taxa at Genus level "}
p1<-barplot_proportional_data(get_genus(kraken_mpa),'Genus')
plot(p1)
```

##Bacteria

```{r, echo=FALSE , fig.cap = "Proportion of observations for Bacteria at Family level "}
p1<-barplot_proportional_data(get_family(bacteria_mpa),'Family')
plot(p1)
```

```{r, echo=FALSE , fig.cap = "Proportion of observations for Bacteria at Genus level ",fig.height=11, fig.width=8}
p1<-barplot_proportional_data(get_genus(bacteria_mpa),'Genus')
plot(p1)
```

## Viral

```{r, echo=FALSE , fig.cap = "Proportion of observations for all taxa at Family level "}
p1<-barplot_proportional_data(get_family(virus_mpa),'Family')
plot(p1)
```

```{r, echo=FALSE , fig.cap = "Proportion of observations for all taxa at Genus level "}
p1<-barplot_proportional_data(get_genus(virus_mpa),'Genus')
plot(p1)
```


#Clustering

##All Taxa

```{r, echo=FALSE , fig.cap = "Clustered heat map of all taxa at Family level ",fig.height=12 , fig.width=12}
p2<-log2_heatmap(kraken_mpa.rarefy.family , ColumnColors = column_colors )
p2()
```


```{r, echo=FALSE , fig.cap = "Clustered heat map of all taxa at Genus level ",fig.height=12 , fig.width=12}
p2<-log2_heatmap(kraken_mpa.rarefy.genus, ColumnColors = column_colors)
p2()
```


##Bacteria

```{r, echo=FALSE , fig.cap = "Clustered heat map of Bacteria at Family level ",fig.height=12 , fig.width=12}
p2<-log2_heatmap(bacteria_mpa.rarefy.family, ColumnColors = column_colors)
p2()
```


```{r, echo=FALSE , fig.cap = "Clustered heat map of Bacteria at Genus level ",fig.height=12 , fig.width=12}
p2<-log2_heatmap(bacteria_mpa.rarefy.genus, ColumnColors = column_colors)
p2()
```


##Viral Ecology

```{r, echo=FALSE , fig.cap = "Clustered heat map of Viruses at Family level ",fig.height=12 , fig.width=12}
p2<-log2_heatmap(virus_mpa.rarefy.genus, ColumnColors = column_colors)
p2()
```

```{r, echo=FALSE , fig.cap = "Clustered heat map of Viruses at Family level ",fig.height=12 , fig.width=12}
p2<-log2_heatmap(virus_mpa.rarefy.family, ColumnColors = column_colors)
p2()
```


#PCA and Loadings


## All Groups
```{r , build-coloring-factors}

groups<-as.data.frame(sample_groups$group)
row.names(groups) <- row.names(sample_groups)
colnames(groups) <-'factor'

nutrition<-as.data.frame(sample_groups$nutrition)
row.names(nutrition) <- row.names(sample_groups)
colnames(nutrition) <-'factor'

collection<-as.data.frame(sample_groups$collection_type)
row.names(collection)<- row.names(sample_groups)
colnames(collection) <- 'factor'


supplement<-as.data.frame(sample_groups$supplement)
row.names(supplement) <- row.names(sample_groups)
colnames(supplement)<-'factor'

etec_challenge <- as.data.frame(sample_groups$etec_challenge)
row.names(etec_challenge) <- row.names(sample_groups)
colnames(etec_challenge)<-'factor'

```

```{r , fig.height = 12 , fig.width = 18 , fig.caption="PCA and Loading plots of based on all measurements, samples colored by treatment"}

better_bi_plot( kraken_mpa.rarefy.genus , color_by = groups)

```

```{r , fig.height = 12 , fig.width = 18 , fig.caption="PCA and Loading plots of based on all measurements, samples colored by collection location"}

better_bi_plot( kraken_mpa.rarefy.genus , color_by = collection , scree = FALSE)

```

```{r , fig.height = 12 , fig.width = 18 , fig.caption="PCA and Loading plots of based on all measurements, samples colored by collection nutrition"}

better_bi_plot( kraken_mpa.rarefy.genus , color_by = nutrition , scree = FALSE)

```

```{r , fig.height = 12 , fig.width = 18 , fig.caption="PCA and Loading plots of based on all measurements, samples colored by supplement"}

better_bi_plot( kraken_mpa.rarefy.genus , color_by = supplement , scree = FALSE)

```

```{r , fig.height = 12 , fig.width = 18 , fig.caption="PCA and Loading plots of based on all measurements, samples colored by challenge"}

better_bi_plot( kraken_mpa.rarefy.genus , color_by = etec_challenge, scree = FALSE)

```


```{r}
fecal<-subset(sample_groups, collection_type == 'fecal', select = sample_name)
illial <-subset(sample_groups, collection_type == 'ileum', select = sample_name)
fecal.rarefy.genus <- kraken_mpa.rarefy.genus[row.names(fecal),]


fecal.groups <- as.data.frame(groups[row.names(fecal),])
row.names(fecal.groups)<-row.names(fecal)
colnames(fecal.groups)<-'factor'

fecal.nutrition 		<- as.data.frame( nutrition[row.names(fecal),])
fecal.collection 		<- as.data.frame( collection[row.names(fecal),])
fecal.supplement 		<- as.data.frame( supplement[row.names(fecal),])
fecal.etec_challenge	<- as.data.frame( etec_challenge[row.names(fecal),])

row.names(fecal.nutrition 	) <- row.names(fecal)
row.names(fecal.collection 	) <- row.names(fecal)
row.names(fecal.supplement 	) <- row.names(fecal)
row.names(fecal.etec_challenge) <- row.names(fecal)


colnames(fecal.nutrition 	)<- 'factor'
colnames(fecal.collection 	)<- 'factor'
colnames(fecal.supplement 	)<- 'factor'
colnames(fecal.etec_challenge)<- 'factor'

```

```{r , fig.height = 12 , fig.width = 18 , fig.caption="PCA and Loading plots of fecal samples based on all measurements, samples colored by group"}
better_bi_plot(fecal.rarefy.genus, color_by = fecal.groups)
```

```{r , fig.height = 12 , fig.width = 18 , fig.caption="PCA and loading plots of fecal samples , colored by nutrition"}
better_bi_plot( fecal.rarefy.genus, color_by = fecal.nutrition , scree = FALSE)
```

```{r , fig.height = 12 , fig.width = 18 , fig.caption="PCA and loading plots of fecal samples , colored by collection location"}
better_bi_plot( fecal.rarefy.genus, color_by = fecal.collection , scree = FALSE)
```

```{r , fig.height = 12 , fig.width = 18 , fig.caption="PCA and loading plots of fecal samples , colored by supplement"}
better_bi_plot( fecal.rarefy.genus, color_by = fecal.supplement ,  scree = FALSE)
```

```{r , fig.height = 12 , fig.width = 18 , fig.caption="PCA and loading plots of fecal samples , colored by challenge"}
better_bi_plot( fecal.rarefy.genus, color_by = fecal.etec_challenge, scree = FALSE)
```


```{r , warning=FALSE}

fecal.rarefy.genus.t <- t(kraken_mpa.rarefy.genus)
fecal.rarefy.genus.t <- fecal.rarefy.genus.t[rowSums(fecal.rarefy.genus.t) != 0, ]


pvalues = c()
bacteria = c()

for( row in row.names(fecal.rarefy.genus.t)){
  x<- fecal.rarefy.genus.t[row,row.names(fecal)]
  y<- fecal.rarefy.genus.t[row,row.names(illial)]
  test <- wilcox.test(x, y , exact = TRUE , conf.int =  TRUE )

  if(test$p.value <= 0.05){
  pvalues <- c (pvalues , test$p.value)
  bacteria <- c (bacteria, row)
  }

#  if(test$p.value < 0.5){
#    boxplot(x,y,main = row)
#   }
}
pander(data.frame(bacteria , pvalues))
```
	library(ggplot2)
	library(reshape2)
	library(vegan)
	library(gplots)
	library(ggrepel)
	library(fossil)

	global_S <- NULL



	get_species<-function(matrix){
	IndicesToGrab<-grep("s__",matrix$ID);
	matrix$ID<-as.character(matrix$ID);
	Species<-matrix[IndicesToGrab,];
	Species$ID<-gsub("^.*s__\|","",Species$ID)
	row.names(Species)=Species$ID
	Species<-Species[,-1];
	Species
	}

	get_genus<-function(matrix){
	IndicesToGrab<-grep("g__",matrix$ID);
	Genera<-matrix[IndicesToGrab,];
	Genera$ID<-gsub("^.*g__\|","",Genera$ID);
	Genus_and_species<-grep("s__",Genera$ID);
	Genera<-Genera[-Genus_and_species,];
	row.names(Genera)<-Genera$ID;
	Genera<-Genera[,-1];
	Genera
	}

	get_family<-function(matrix){
	IndicesToGrab<-grep("f__",matrix$ID);
	Families<-matrix[IndicesToGrab,];
	Families$ID<-gsub("^.*f__\|","", Families$ID);
	Family_genus_species_list<-grep("g__", Families$ID);
	Families<-Families[-Family_genus_species_list,];
	Family_species_list <- grep("s__",Families$ID);
	Families<-Families[-Family_species_list,];
	row.names(Families)<-Families$ID;
	Families<-Families[,-1];
	Families
	}

	barplot_proportional_data <-function(matrix,level,outname){
	matrix.norm<-data.matrix(matrix);
	matrix.norm<-sweep(matrix.norm,2,colSums(matrix.norm),'/');
	matrix.norm<-sweep(matrix.norm,2,colSums(matrix.norm),'/');
	colnames(matrix.norm) <- colnames(matrix)
	rownames(matrix.norm) <-rownames(matrix)
	matrix.norm.melt<-melt(matrix.norm);
	ggplot(matrix.norm.melt,aes(x=Var2,y=value,fill=Var1))+geom_bar(stat="identity",position="stack")+theme(legend.position="none",panel.grid.major=element_blank(),panel.grid.minor=element_blank(),panel.border=element_blank(),panel.background=element_blank())+theme(axis.text.x = element_text(angle = 45, hjust = 1))+xlab("Sample Name")+ylab("Proportion of Observations")+ggtitle(paste("Proprotional observations at ",level," level"))
	ggsave(filename=paste0(outname,".pdf"),plot=last_plot(),width=16,height=12,units="in",dpi=500);
	}


	rarefy_here<-function(matrix,name){
	if (! file.exists(name)){
	minObservations<-min(colSums(matrix))
	rarefied<-data.frame(rrarefy(t(matrix),minObservations))
	write.table(rarefied,file = name)
	}
	else{
	rarefied <- read.table(name,header=TRUE)
	}
	rarefied
	}

	log_heatform<-function(matrix,level,outname){
	logxform <-log2(matrix)
	logxform[logxform==-Inf]<-0
	logxform<-logxform[,apply(logxform, 2, var, na.rm=TRUE) != 0]
	pdf(paste0(outname,".pdf"), width=16 , height=32)
	heatmap.2(as.matrix(t(logxform)), col=my_palette,keysize = .75, margins=c(15,10), trace="none", xlab="Sample ID",ylab="Taxon",main=paste0(c("Clustering of samples at ", level," level" )),cexRow=0.3,sepwidth=c(0.05,0.05))
	dev.off()
	}

	log_heatform_big<-function(matrix,level,outname){
	logxform <-log2(matrix)
	logxform[logxform==-Inf]<-0
	logxform<-logxform[,apply(logxform, 2, var, na.rm=TRUE) != 0]
	pdf(paste0(outname,".pdf"), width=16 , height=32)
	heatmap.2(as.matrix(t(logxform)), col=my_palette,keysize = .75, margins=c(15,10), trace="none", xlab="Sample ID",ylab="Taxon",main=paste0(c("Clustering of samples at ", level," level" )),cexRow=1,sepwidth=c(0.05,0.05))
	dev.off()
	}



	alpha_diversity_metrics<-function(matrix,level){
	global_S <- NULL
	# TAKES RAREFIED DATA
	H <- data.frame(diversity(matrix))
	simpson <- data.frame(diversity(matrix, "simpson"))
	shannon<-data.frame(diversity(matrix , "shannon"))
	invsimp <- data.frame(diversity(matrix, "inv"))
	alpha <- data.frame(fisher.alpha(matrix))
	## Species richness (S) and Pielou's evenness (J):
	S <- data.frame(specnumber(matrix))
	J <- data.frame(H/log(S))
	Diversity<-cbind(simpson, shannon, invsimp, alpha, S, J)
	Diversity$Sample<-row.names(Diversity)


	names(Diversity)<-c("Simpson", "Shannon", "InvSimpson", "Alpha", 'ClassNo', "Evenness", "Sample")
	Diversity<-Diversity[,c(7,1,2,3,4, 5, 6)]
	pdf(file=paste0(level,'_estimate_pairs.pdf'))
	pairs(cbind(simpson, shannon, alpha, J, S), col="blue")
	dev.off()
	# Plots of alpha-diversity metrics

	ggplot(data=Diversity, aes(x=Sample, y=Simpson)) + geom_bar(colour="black", fill="darkblue", stat="identity") +theme(axis.text.x = element_text(angle = 45, hjust = 1))
	ggsave(paste0(level,"_Diversity_Simpson.pdf"),width=5,height=5,units="in",dpi=500)
	ggplot(data=Diversity, aes(x=Sample, y=Alpha)) + geom_bar(colour="black", fill="darkblue", stat="identity")+theme(axis.text.x = element_text(angle = 45, hjust = 1))
	ggsave(paste0(level,"_Diversity_Alpha.pdf"),width=5,height=5,units="in",dpi=500)
	ggplot(data=Diversity, aes(x=Sample, y=Evenness)) + geom_bar(colour="black", fill="darkblue", stat="identity") +theme(axis.text.x = element_text(angle = 45, hjust = 1))
	ggsave(paste0(level,"_Diversity_Eveness.pdf"),width=5,height=5,units="in",dpi=500)
	ggplot(data=Diversity, aes(x=Sample, y=ClassNo)) + geom_bar(colour="black", fill="darkblue", stat="identity") +theme(axis.text.x = element_text(angle = 45, hjust = 1))
	ggsave(paste0(level,"_Diversity_classNo.pdf"),width=5,height=5,units="in",dpi=500)
	ggplot(data=Diversity, aes(x=Sample, y=Shannon)) + geom_bar(colour="black", fill="darkblue", stat="identity") +theme(axis.text.x = element_text(angle = 45, hjust = 1))
	ggsave(paste0(level,"_Diversity_Shannon.pdf"),width=5,height=5,units="in",dpi=500)
	global_S <<- S
	}

	species_per_sample<-function(matrix,level){
	## Rarefaction curves
	raremax <- min(rowSums(t(matrix)))
	Srare <- rarefy(t(matrix), raremax)
	x<-data.frame(global_S,Srare)

	pdf(paste0(level,"_rarefaction.pdf"))
	plot(x, xlab = paste0("Observed No. of ",level), ylab = paste0("Rarefied No. of ",level))
	dev.off();

	pdf(paste0("rarefaction_",level,'.pdf'))
	rarecurve(t(matrix), step = 1000, col = "blue", cex = 0.5, sample= raremax, xlab = "Sample Size", ylab = level, label = TRUE, lwd = 2)
	dev.off()


	}
	my_palette <- colorRampPalette(c("blue", "white", "red"))(n = 1000)

	draw_circle<- function(center = c(0,0),diameter = 1, npoints = 1000){
	r = 25
	tt <- seq(0,2*pi,length.out = npoints)
	xx <- center[1] + r * cos(tt)
	yy <- center[2] + r * sin(tt)
	return(data.frame(x = xx, y = yy))
	}

	circle<-draw_circle()


	bi_plot <-function(matrix, outfile ){
	logxform <-log2(matrix)
	logxform[logxform==-Inf]<-0
	logxform<-logxform[,apply(logxform, 2, var, na.rm=TRUE) != 0]

	#PCA and Scree
	pca<-prcomp(logxform , center = TRUE , scale.=TRUE)
	std_dev<-pca$sdev
	loadings<-pca$rotation
	norm_loadings<-sweep(loadings,2,colSums(loadings),"/")
	scores<-pca$x
	variance<- std_dev^2
	variance<-variance/sum(variance) * 100
	variance<-as.data.frame(variance)
	variance <-cbind(PC = as.numeric(row.names(variance)),variance)
	ggplot(variance, aes(y=variance,x=PC))+geom_bar(stat="identity")+ylab("Proportion of Variance")+xlab("Principle Component")
	ggsave(filename=paste0(outfile,"_scree.pdf"),plot=last_plot(),width=16,height=12,units="in",dpi=500)

	vecs<-data.frame(loadings[,c('PC1','PC2')])
	vecs = vecs * 25 * 10

	tmp_vecs1a<-vecs[order(vecs[,1]),][1:10,]
	tmp_vecs1b<-vecs[rev(order(vecs[,1])),][1:10,]
	tmp_vecs2a<-vecs[order(vecs[,2]),][1:10,]
	tmp_vecs2b<-vecs[rev(order(vecs[,2])),][1:10,]
	vecs <-rbind(tmp_vecs1a,tmp_vecs1b,tmp_vecs2a,tmp_vecs2b)
	vecs <-unique(vecs)


	pca.df <-as.data.frame(pca$x)
	pca.df <- cbind( ID = row.names(pca.df),pca.df)

	ggplot(pca.df, aes(x=PC1 , y = PC2)) +
	geom_point(
	size=1,
	)+
	geom_text_repel(
	size=2.5,
	aes(label=ID),
	segment.color = 'gray',
	point.padding = unit(1.0,"lines"),
	box.padding = unit(0.1,"lines"),
	force = 1,
	)+
	geom_point(
	size=5,
	shape = 3,
	aes(x=0,y=0),
	color="grey",
	)+
	geom_path(
	data=circle,
	aes(x , y),
	color="red")+
	coord_fixed()+
	geom_point(
	data=vecs,
	aes(x=PC1,y=PC2),
	color="red",
	size=2
	)+
	geom_text_repel(
	data=vecs,
	aes(x=PC1,y=PC2,label=rownames(vecs)),
	color = "red",
	size=2.5,
	segment.color = 'gray',
	point.padding = unit(1.0,"lines"),
	box.padding = unit(0.1,"lines"),
	force = 1
	)
	ggsave(filename=paste0(outfile,"_pca.pdf"),plot=last_plot(),width=16,height=12,units="in",dpi=500);
	}


	#Kraken Mapping
	map_rates = as.data.frame(read.table(file="stats.tbl",header=TRUE,sep="\t"))
	map_rates.melt <- melt(map_rates,id.vars = 'sample')
	ggplot(data=map_rates.melt, aes(x=sample,y=value,fill=variable))+geom_bar(stat="identity")+theme(axis.text.x=element_text(angle=90,hjust=1,vjust=0.5))+labs(x= "Sample",y="Proportion Classified",title="Proportion of Classified and Unclassified Reads \n Kraken")+guides(fill=guide_legend(title="Read Type"))
	ggsave(filename = "KrakenClassificationProportion.pdf", plot=last_plot(),width=16,height=12,units="in",dpi=500);
	#data prep
	kraken_mpa <- read.csv("kraken-mpa.merge",sep="\t",header=TRUE)
	kraken_mpa$ID<-as.character(kraken_mpa$ID)

	#sub sample Bacteria, Viruses , and Archaea
	indices<-grep("Bacteria",kraken_mpa$ID)
	bacteria_mpa <-kraken_mpa[indices,]

	indices<-grep("Viruses",kraken_mpa$ID)
	viruses_mpa <-kraken_mpa[indices,]

	indices<-grep("Archaea",kraken_mpa$ID)
	archaea_mpa <-kraken_mpa[indices,]

	dirlist <- c('AllTaxa', 'Bacteria','Archaea','Viruses','Views')

	for( i in dirlist){
	if (! dir.exists(i)){
	dir.create(i)
	}
	}



	setwd('AllTaxa')
	kraken_mpa.rarefy.family <-rarefy_here(get_family(kraken_mpa) ,'kraken_mpa_rarefy_family.txt')
	kraken_mpa.rarefy.genus <-rarefy_here(get_genus(kraken_mpa) ,'kraken_mpa_rarefy_genus.txt')
	kraken_mpa.rarefy.species <-rarefy_here(get_species(kraken_mpa),'kraken_mpa_rarefy_species.txt')

	barplot_proportional_data(get_family(kraken_mpa),'family', 'All_taxa_family_proportion')
	log_heatform(kraken_mpa.rarefy.family,'family','All_taxa_family_heat')
	bi_plot(kraken_mpa.rarefy.family,"All_taxa_family_biplot")
	alpha_diversity_metrics(kraken_mpa.rarefy.family,'Families')
	species_per_sample(get_family(kraken_mpa),'Families')

	barplot_proportional_data(get_genus(kraken_mpa),'genus','All_taxa_genus_proportion')
	log_heatform(kraken_mpa.rarefy.genus,'genus','All_taxa_genus_heat')
	bi_plot(kraken_mpa.rarefy.genus,'All_taxa_genus_biplot')
	alpha_diversity_metrics(kraken_mpa.rarefy.genus,'Genera')
	species_per_sample(get_genus(kraken_mpa),'Genera')

	barplot_proportional_data(get_species(kraken_mpa),'species','All_taxa_species_proportion')
	log_heatform(kraken_mpa.rarefy.species,'species','All_taxa_species_heat')
	bi_plot(kraken_mpa.rarefy.species,'All_taxa_species_biplot')

	alpha_diversity_metrics(kraken_mpa.rarefy.species,'Species')
	species_per_sample(get_species(kraken_mpa),'Species')


	setwd('..')

	setwd('Bacteria')
	bacteria_mpa.rarefy.family <-rarefy_here(get_family(bacteria_mpa) ,'bacteria_mpa_rarefy_family.txt')
	bacteria_mpa.rarefy.genus <-rarefy_here(get_genus(bacteria_mpa) ,'bacteria_mpa_rarefy_genus.txt')
	bacteria_mpa.rarefy.species <-rarefy_here(get_species(bacteria_mpa),'bacteria_mpa_rarefy_species.txt')

	barplot_proportional_data(get_family(bacteria_mpa),'family','Bacteria_familiy_proportion')
	log_heatform(bacteria_mpa.rarefy.family,'family','Bacteria_family_heat')
	bi_plot(bacteria_mpa.rarefy.family,'Bacteria_family_biplot')


	barplot_proportional_data(get_genus(bacteria_mpa),'genus','Bacteria_genus_proportion')
	log_heatform(bacteria_mpa.rarefy.genus,'genus','Bacteria_genus_heat')
	bi_plot(bacteria_mpa.rarefy.genus,'Bacteria_genus_biplot')

	barplot_proportional_data(get_species(bacteria_mpa),'species','Bacteria_species_proportion')
	log_heatform(bacteria_mpa.rarefy.species,'species','Bacteria_species_heat')
	bi_plot(bacteria_mpa.rarefy.species,'Bacteria_species_biplot')


	alpha_diversity_metrics(bacteria_mpa.rarefy.family,'Families')
	species_per_sample(get_family(bacteria_mpa),'Families')
	alpha_diversity_metrics(bacteria_mpa.rarefy.genus,'Genera')
	species_per_sample(get_genus(bacteria_mpa),'Genera')
	alpha_diversity_metrics(bacteria_mpa.rarefy.species,'Species')
	species_per_sample(get_species(bacteria_mpa),'Species')


	setwd('..')

	setwd('Viruses')
	viruses_mpa.rarefy.family <-rarefy_here(get_family(viruses_mpa) ,'viruses_mpa_rarefy_family.txt')
	viruses_mpa.rarefy.genus <-rarefy_here(get_genus(viruses_mpa) ,'viruses_mpa_rarefy_genus.txt')
	viruses_mpa.rarefy.species <-rarefy_here(get_species(viruses_mpa),'viruses_mpa_rarefy_species.txt')

	barplot_proportional_data(get_family(viruses_mpa),'family','Virus_family_proportion')
	log_heatform(viruses_mpa.rarefy.family,'family','Virus_family_heat')
	bi_plot(viruses_mpa.rarefy.family,'Viruses_family_biplot')


	barplot_proportional_data(get_genus(viruses_mpa),'genus','Virus_genus_proportion')
	log_heatform(viruses_mpa.rarefy.genus,'genus','Virus_genus_heat')
	bi_plot(viruses_mpa.rarefy.genus,'Viruses_genus_biplot')


	barplot_proportional_data(get_species(viruses_mpa),'species','Virus_species_proportion')
	log_heatform(viruses_mpa.rarefy.species,'species','Virus_species_heat')
	bi_plot(viruses_mpa.rarefy.species,'Virus_species_biplot')

	alpha_diversity_metrics(viruses_mpa.rarefy.family,'Families')
	species_per_sample(get_family(viruses_mpa),'Families')
	alpha_diversity_metrics(viruses_mpa.rarefy.genus,'Genera')
	species_per_sample(get_genus(viruses_mpa),'Genera')
	alpha_diversity_metrics(viruses_mpa.rarefy.species,'Species')
	species_per_sample(get_species(viruses_mpa),'Species')

	setwd('..')


	setwd('Archaea')
	#archaea_mpa.rarefy.family <-rarefy_here(get_family(archaea_mpa) ,'archaea_mpa_rarefy_family.txt')
	archaea_mpa.rarefy.genus <-rarefy_here(get_genus(archaea_mpa) ,'archaea_mpa_rarefy_genus.txt')
	archaea_mpa.rarefy.species <-rarefy_here(get_species(archaea_mpa),'archaea_mpa_rarefy_species.txt')

	#barplot_proportional_data(get_family(archaea_mpa),'family','Archaea_family_proportion')
	#log_heatform(archaea_mpa.rarefy.family,'family','Archaea_family_heat')

	barplot_proportional_data(get_genus(archaea_mpa),'genus','Archaea_genus_proportion')
	log_heatform(archaea_mpa.rarefy.genus,'family','Archaea_genus_heat')
	bi_plot(archaea_mpa.rarefy.genus,'Archaea_genus_biplot')

	barplot_proportional_data(get_species(archaea_mpa),'species','Archaea_species_proportion')
	log_heatform(archaea_mpa.rarefy.species,'species','Archaea_speceis_heat')
	bi_plot(archaea_mpa.rarefy.species,'Archaea_species_biplot')


	#alpha_diversity_metrics(archaea_mpa.rarefy.family,'Families')
	#species_per_sample(get_family(archaea_mpa),'Families')
	alpha_diversity_metrics(archaea_mpa.rarefy.genus,'Genera')
	species_per_sample(get_genus(archaea_mpa),'Genera')
	alpha_diversity_metrics(archaea_mpa.rarefy.species,'Species')
	species_per_sample(get_species(archaea_mpa),'Species')

	setwd('..')


	setwd('Views')
	bacteria_genera <- c('g__Salmonella' , 'g__Campylobacter', 'g__Staphylococcus','g__Clostridium','g__Chlamydia','g__Streptococcus','g__Escherichia','g__Enterococcus','g__Enterobacter', 'g__Helicobacter', 'g__Bacillus','g__Listeria','g__Alteromonas','g__Pseudomonas','g__Lactobacillus','g__Lactococcus','g__Altermonas')
	views_mpa <- list()
	for (i in bacteria_genera){
	indices <- grep(i,bacteria_mpa$ID)
	views_mpa <- rbind (views_mpa,bacteria_mpa[indices,] )
	}
	views_mpa.rarefy.genus <- rarefy_here(get_genus(views_mpa),'views_mpa_rarefy_genus.txt')
	views_mpa.rarefy.species <-rarefy_here(get_species(views_mpa),'views_mpa_rarefy_species.txt')
	log_heatform_big(views_mpa.rarefy.genus,'genus','views_genus_heat')
	log_heatform_big(views_mpa.rarefy.species,'species','views_species_heat')
	setwd('..')