robiwangriff/ngs2gel.r

## ngs2gel.r
#################################################################################################################
#install
#library(devtools)
#source_gist("https://gist.github.com/robiwangriff/62ee1fd1adfcb50cfd5520075ce9da86")

#spc= a species(cols) by samples (rows) data frame
#env= an environmental metadata data frame, aligned with spc. Used to order and label samples on x axis (#the order loaded on gel)
#tax= a taxonomy file with sequences/OTU ids as row names - normally would match colnames of spc df. Though shouldnt matter.

#subs= Unlike real gels, we cant plot the abundance of every species. This sets a relative abundance threshold for taxa to plot.
#band_multiplier= band phatness
#order_samps= column(s) in env to order x axis, outermost first eg c("Fertiliser","block")
#label_samps= column(s) in env to label x axis, outermost first eg c("Fertiliser","block")
#y_labs_remover=1 We cant label every taxon on y axis, this sets an abundance threshold for taxa to label as: y_labs_remover*subs
#y_labs_size= text size y labels
#band_col= band colour
#label_tax_1= column in tax denoting the facetted taxonomic labels on right hand size - eg go broad, "Phylum"...or "phylum"
#label_tax_2= column in tax denoting the taxonomic labels on left hand side eg more resolved - "order"

#ISSUES: very hacky, some code redundant and much can be improved:
  #1. optimising ggrepel  params for plotting many y axis labs.
  #2. making the function more generalizable
  #3. indicator labelling
  #...
##########################################################################################################################

ngs2gel<-function(spc,env,tax,subs=0.005,band_multiplier=200,order_samps,label_samps,label_tax_1="phylum",
label_tax_2="order", band_col="grey",y_labs_remover=2,y_labs_size=4){

#libs used
require(reshape2)
require(vegan)
require(ggh4x)
library(cowplot)
library(ggrepel)

#convert species df to proportions (0 to 1 scale)
spc<-decostand(spc,"total")

#remove species whose proportional abundance is less than "subs" argument & recode new "0" to NA
spc[spc<subs]<-0
spc<-spc[,colSums(spc)>0]
spc[spc==0]<-NA

#######################################################################
#reorder species randomly (NOT DONE)
#set.seed(32)
#ord<-sample(ncol(spc))
#spc<-spc[,ord]
#colnames(spc)<-1:ncol(spc)

########################################################################
#sort species according to taxonomy
#index the species names so they are no longer long sequences

taxa.sel<-colnames(spc)#separate seqs

#pull tax info for the subsetted taxa
tax_df<-cbind(tax[match(taxa.sel,rownames(tax)),],taxa.sel)

#add in unknown classification
tax_df[is.na(tax_df)]<-"Unknown"

#Now sort alphabetically by taxonomy (other sorts could be implemented eg phylogeny)
tax_df<-tax_df[do.call(order, tax_df), ]

#sort spc df according to ordered tax df
spc<-spc[,match(rownames(tax_df), colnames(spc))]
identical(rownames(tax_df),colnames(spc))#check

#add in a unique ID (index) for each taxon based on the taxonomy sort (this is to help order the y axis in plot)
colnames(spc)<-sprintf("%03d",1:ncol(spc))
tax_df$names_spc<-sprintf("%03d",1:nrow(tax_df))# create matching index for tax_df taxa

#################################################################################
#reorder rows (samples) according to "order_samps" argument
spc<-spc[with(env,do.call(order,mget(order_samps))),]
env<-env[with(env,do.call(order,mget(order_samps))),]
env<-env[,order_samps]
env$x<-1:nrow(env)# this is just a vector which could order the samples along x in final plot, but also allows spacing between "lanes"

##################################################################################
#make long df for ggplotting,  with xy coords and abundances (for band fatness)
dat.w<-data.frame(spc,env,check.names=F)
dat.l<-melt(dat.w,id.vars=c(order_samps,"x"))

names(dat.l)[names(dat.l) == 'variable'] <- 'y'
names(dat.l)[names(dat.l) == 'value'] <- 'abund'

#add in selected taxon levels for labelling taxa to long df
dat.l$tax_1<-tax_df[match(dat.l$y,tax_df$names_spc),][,label_tax_1]
dat.l$tax_2<-tax_df[match(dat.l$y,tax_df$names_spc),][,label_tax_2]

#make a new name for labelling taxa
dat.l$id<-paste(dat.l$tax_2,dat.l$y,sep="_")

##############################################################
#we'll be plotting facets later, and want to add some padding to each facet at top and bottom
#min
dat.l$y_min<--1

#max which will be number of taxa per tax_1 designation (the facet) plus some padding
#count number of taxa per tax_1 taxonomic designation
tab<-data.frame(table(tax_df[,label_tax_1]))
dat.l$y_max<-tab[match(dat.l$tax_1,tab$Var1),]$Freq*1

############################################################################################
#we also wont have room on y axis to label all species, so need to subset somehow

#first need make a column containing labels for plotting using ggrepel.
#Because our plotting df is "long format", we need only one label name for each y value - ie non duplicated

dat.l$labels<-dat.l$id
dat.l$labels[duplicated(dat.l$labels)] <- "" #remove duplicates due to long df
dat.l$labels<-sub("_[^_]+$", "", dat.l$labels) #remove numeric index in name

#We can further filter these eg by abundance if the y is looking too busy
# This could be expanded to only include "indicators" or anything really of interest

#make table with abundances each taxon (ie make long df wide again)
tab<-data.frame(aggregate(dat.l$abund, by = list(dat.l$y), max,na.rm = TRUE))
tab$col<-ifelse(tab$x>y_labs_remover*subs,"black","transparent") # set colour code so low abund labels transparent
dat.l$y_lab_col<-tab[match(dat.l$y,tab$Group.1),]$col #add to long df

#not used but potentially useful for advanced labelling
#require(ggtext)
#dat.l$id2<-ifelse(dat.l$y_lab_col=="transparent",paste0("<span style='font-size: 0pt;color:NA'>", dat.l$id, "</span>"),dat.l$id)

#########################################################################################################
# Now plot. Two plots will be created, one being the plot (pl), the other the taxon labels (axis).
# They then get bound by cowplot

pl<-ggplot(dat.l,aes_string(x=paste0("interaction(x,",paste(rev(label_samps), collapse = ','),")") ))+
geom_blank()+
scale_x_discrete(NULL, guide = "axis_nested")  +
geom_segment(aes(x = x-0.3, y = y, xend = x+0.3, yend = y),col=band_col,lineend = "round",linewidth=band_multiplier*dat.l$abund)+
theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank(),
panel.background = element_blank(), axis.line = element_blank(),strip.background=element_rect(fill="grey96"))+
scale_y_discrete(drop=TRUE,expand=c(0,0),,labels = NULL, name = NULL)+
theme(strip.text.y.right = element_text(angle = 0),panel.spacing.y = unit(0.2, "lines"),
axis.text.y = element_text(size=y_labs_size, angle=30),axis.ticks.y = element_blank(),plot.margin = ggplot2::margin(0, 0, 2, 0, "pt"))+
facet_grid2(tax_1~., scales = "free", space = "free_y",strip = strip_vanilla(size = "variable",clip = "off"))+
theme(strip.text.y = element_text(size = 6))+
geom_blank(aes(y = y_min-2)) +
geom_blank(aes(y = y_max+2))

axis<-ggplot(dat.l,aes(x=0, y=y))+
geom_blank()+
theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank(),
panel.background = element_blank(), axis.line = element_blank(),strip.background=element_rect(fill="grey96"))+
scale_y_discrete(drop=TRUE,expand=c(0,0),labels = NULL, name = NULL)+
theme(panel.spacing.y = unit(0.2, "lines"),
axis.text.y = element_blank(),axis.ticks.y = element_blank(),strip.background = element_blank(),
  strip.text.y = element_blank())+
facet_grid2(tax_1~., scales = "free", space = "free_y",strip = strip_vanilla(size = "variable",clip = "off"))+
geom_text_repel(label=dat.l$labels,y=dat.l$y,min.segment.length = grid::unit(0, "pt"),
                 color = dat.l$y_lab_col,  ## ggplot2 theme_grey() axis text
                 size = y_labs_size,  ## ggplot2 theme_grey() axis text
                  max.overlaps=Inf,
    seed         = 42,
    force        = 4,
    force_pull   = 0.0,
    nudge_x      = -1,
    direction    = "x",
    hjust        = 0,
    segment.size = 0.0001,
    segment.alpha = 0.1,
    box.padding  = 0.000
)+
scale_x_continuous(limits = c(-1, 0), expand = c(0, 0),
                     breaks = NULL, labels = NULL, name = NULL)+
  theme(panel.background = element_blank(),
        plot.margin = ggplot2::margin(0, 0, 2, 0, "pt"))+
 geom_blank(aes(y = y_min-2)) +
 geom_blank(aes(y = y_max+2))

#align and plot
aligned <- align_plots(axis, pl, align = "h", axis = "l")
plot_grid(plotlist = aligned, nrow = 1, rel_widths = c(1, 5))

}
	#################################################################################################################
	#install
	#library(devtools)
	#source_gist("https://gist.github.com/robiwangriff/62ee1fd1adfcb50cfd5520075ce9da86")

	#spc= a species(cols) by samples (rows) data frame
	#env= an environmental metadata data frame, aligned with spc. Used to order and label samples on x axis (#the order loaded on gel)
	#tax= a taxonomy file with sequences/OTU ids as row names - normally would match colnames of spc df. Though shouldnt matter.

	#subs= Unlike real gels, we cant plot the abundance of every species. This sets a relative abundance threshold for taxa to plot.
	#band_multiplier= band phatness
	#order_samps= column(s) in env to order x axis, outermost first eg c("Fertiliser","block")
	#label_samps= column(s) in env to label x axis, outermost first eg c("Fertiliser","block")
	#y_labs_remover=1 We cant label every taxon on y axis, this sets an abundance threshold for taxa to label as: y_labs_remover*subs
	#y_labs_size= text size y labels
	#band_col= band colour
	#label_tax_1= column in tax denoting the facetted taxonomic labels on right hand size - eg go broad, "Phylum"...or "phylum"
	#label_tax_2= column in tax denoting the taxonomic labels on left hand side eg more resolved - "order"

	#ISSUES: very hacky, some code redundant and much can be improved:
	#1. optimising ggrepel params for plotting many y axis labs.
	#2. making the function more generalizable
	#3. indicator labelling
	#...
	##########################################################################################################################

	ngs2gel<-function(spc,env,tax,subs=0.005,band_multiplier=200,order_samps,label_samps,label_tax_1="phylum",
	label_tax_2="order", band_col="grey",y_labs_remover=2,y_labs_size=4){

	#libs used
	require(reshape2)
	require(vegan)
	require(ggh4x)
	library(cowplot)
	library(ggrepel)

	#convert species df to proportions (0 to 1 scale)
	spc<-decostand(spc,"total")

	#remove species whose proportional abundance is less than "subs" argument & recode new "0" to NA
	spc[spc<subs]<-0
	spc<-spc[,colSums(spc)>0]
	spc[spc==0]<-NA

	#######################################################################
	#reorder species randomly (NOT DONE)
	#set.seed(32)
	#ord<-sample(ncol(spc))
	#spc<-spc[,ord]
	#colnames(spc)<-1:ncol(spc)

	########################################################################
	#sort species according to taxonomy
	#index the species names so they are no longer long sequences

	taxa.sel<-colnames(spc)#separate seqs

	#pull tax info for the subsetted taxa
	tax_df<-cbind(tax[match(taxa.sel,rownames(tax)),],taxa.sel)

	#add in unknown classification
	tax_df[is.na(tax_df)]<-"Unknown"

	#Now sort alphabetically by taxonomy (other sorts could be implemented eg phylogeny)
	tax_df<-tax_df[do.call(order, tax_df), ]

	#sort spc df according to ordered tax df
	spc<-spc[,match(rownames(tax_df), colnames(spc))]
	identical(rownames(tax_df),colnames(spc))#check

	#add in a unique ID (index) for each taxon based on the taxonomy sort (this is to help order the y axis in plot)
	colnames(spc)<-sprintf("%03d",1:ncol(spc))
	tax_df$names_spc<-sprintf("%03d",1:nrow(tax_df))# create matching index for tax_df taxa

	#################################################################################
	#reorder rows (samples) according to "order_samps" argument
	spc<-spc[with(env,do.call(order,mget(order_samps))),]
	env<-env[with(env,do.call(order,mget(order_samps))),]
	env<-env[,order_samps]
	env$x<-1:nrow(env)# this is just a vector which could order the samples along x in final plot, but also allows spacing between "lanes"

	##################################################################################
	#make long df for ggplotting, with xy coords and abundances (for band fatness)
	dat.w<-data.frame(spc,env,check.names=F)
	dat.l<-melt(dat.w,id.vars=c(order_samps,"x"))

	names(dat.l)[names(dat.l) == 'variable'] <- 'y'
	names(dat.l)[names(dat.l) == 'value'] <- 'abund'

	#add in selected taxon levels for labelling taxa to long df
	dat.l$tax_1<-tax_df[match(dat.l$y,tax_df$names_spc),][,label_tax_1]
	dat.l$tax_2<-tax_df[match(dat.l$y,tax_df$names_spc),][,label_tax_2]

	#make a new name for labelling taxa
	dat.l$id<-paste(dat.l$tax_2,dat.l$y,sep="_")

	##############################################################
	#we'll be plotting facets later, and want to add some padding to each facet at top and bottom
	#min
	dat.l$y_min<--1

	#max which will be number of taxa per tax_1 designation (the facet) plus some padding
	#count number of taxa per tax_1 taxonomic designation
	tab<-data.frame(table(tax_df[,label_tax_1]))
	dat.l$y_max<-tab[match(dat.l$tax_1,tab$Var1),]$Freq*1

	############################################################################################
	#we also wont have room on y axis to label all species, so need to subset somehow

	#first need make a column containing labels for plotting using ggrepel.
	#Because our plotting df is "long format", we need only one label name for each y value - ie non duplicated

	dat.l$labels<-dat.l$id
	dat.l$labels[duplicated(dat.l$labels)] <- "" #remove duplicates due to long df
	dat.l$labels<-sub("_[^_]+$", "", dat.l$labels) #remove numeric index in name

	#We can further filter these eg by abundance if the y is looking too busy
	# This could be expanded to only include "indicators" or anything really of interest

	#make table with abundances each taxon (ie make long df wide again)
	tab<-data.frame(aggregate(dat.l$abund, by = list(dat.l$y), max,na.rm = TRUE))
	tab$col<-ifelse(tab$x>y_labs_remover*subs,"black","transparent") # set colour code so low abund labels transparent
	dat.l$y_lab_col<-tab[match(dat.l$y,tab$Group.1),]$col #add to long df

	#not used but potentially useful for advanced labelling
	#require(ggtext)
	#dat.l$id2<-ifelse(dat.l$y_lab_col=="transparent",paste0("<span style='font-size: 0pt;color:NA'>", dat.l$id, "</span>"),dat.l$id)

	#########################################################################################################
	# Now plot. Two plots will be created, one being the plot (pl), the other the taxon labels (axis).
	# They then get bound by cowplot

	pl<-ggplot(dat.l,aes_string(x=paste0("interaction(x,",paste(rev(label_samps), collapse = ','),")") ))+
	geom_blank()+
	scale_x_discrete(NULL, guide = "axis_nested") +
	geom_segment(aes(x = x-0.3, y = y, xend = x+0.3, yend = y),col=band_col,lineend = "round",linewidth=band_multiplier*dat.l$abund)+
	theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank(),
	panel.background = element_blank(), axis.line = element_blank(),strip.background=element_rect(fill="grey96"))+
	scale_y_discrete(drop=TRUE,expand=c(0,0),,labels = NULL, name = NULL)+
	theme(strip.text.y.right = element_text(angle = 0),panel.spacing.y = unit(0.2, "lines"),
	axis.text.y = element_text(size=y_labs_size, angle=30),axis.ticks.y = element_blank(),plot.margin = ggplot2::margin(0, 0, 2, 0, "pt"))+
	facet_grid2(tax_1~., scales = "free", space = "free_y",strip = strip_vanilla(size = "variable",clip = "off"))+
	theme(strip.text.y = element_text(size = 6))+
	geom_blank(aes(y = y_min-2)) +
	geom_blank(aes(y = y_max+2))

	axis<-ggplot(dat.l,aes(x=0, y=y))+
	geom_blank()+
	theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank(),
	panel.background = element_blank(), axis.line = element_blank(),strip.background=element_rect(fill="grey96"))+
	scale_y_discrete(drop=TRUE,expand=c(0,0),labels = NULL, name = NULL)+
	theme(panel.spacing.y = unit(0.2, "lines"),
	axis.text.y = element_blank(),axis.ticks.y = element_blank(),strip.background = element_blank(),
	strip.text.y = element_blank())+
	facet_grid2(tax_1~., scales = "free", space = "free_y",strip = strip_vanilla(size = "variable",clip = "off"))+
	geom_text_repel(label=dat.l$labels,y=dat.l$y,min.segment.length = grid::unit(0, "pt"),
	color = dat.l$y_lab_col, ## ggplot2 theme_grey() axis text
	size = y_labs_size, ## ggplot2 theme_grey() axis text
	max.overlaps=Inf,
	seed = 42,
	force = 4,
	force_pull = 0.0,
	nudge_x = -1,
	direction = "x",
	hjust = 0,
	segment.size = 0.0001,
	segment.alpha = 0.1,
	box.padding = 0.000
	)+
	scale_x_continuous(limits = c(-1, 0), expand = c(0, 0),
	breaks = NULL, labels = NULL, name = NULL)+
	theme(panel.background = element_blank(),
	plot.margin = ggplot2::margin(0, 0, 2, 0, "pt"))+
	geom_blank(aes(y = y_min-2)) +
	geom_blank(aes(y = y_max+2))

	#align and plot
	aligned <- align_plots(axis, pl, align = "h", axis = "l")
	plot_grid(plotlist = aligned, nrow = 1, rel_widths = c(1, 5))

	}