robiwangriff/amplicon_help.r

## amplicon_help.r
##source_gist("https://gist.github.com/robiwangriff/79633a738b128e64226bf58381232da7")
#' # 1. Merge and filter

#'A small function to merge and align, and output a list containing the aligned $spc, $env, and $tax
#' files. The main part of this is merge() and requires specifying two "common reference" vectors in both datasets.

aligner<-function(spc,env,tax=NULL, spc_id="row.names",env_id="row.names",tax_id="row.names",transpose_spc=FALSE){

#transpose the spc file so sample IDs are rows (and row names)
spc.t<-spc
if(transpose_spc==TRUE){
spc.t=t(spc)}
#merge
spc_env_merged<- merge(spc.t, env, by.x=spc_id, by.y=env_id)

#fix the added Row.name column to be row.names (a nuance of merge())
rownames(spc_env_merged)=spc_env_merged$Row.names
spc_env_merged=spc_env_merged[2:length(spc_env_merged)]#get rid of Row.names

#split again into spc and env files
y<-ncol(spc.t)#counts how many ASVs - needed to split in right place
spc.out<-spc_env_merged[,1:y]
env.out<-spc_env_merged[,(y+1):ncol(spc_env_merged)]

#check tax file is aligned
if(!is.null(tax)){
if(tax_id=="row.names"){
	tx<-row.names(tax)}else{
	tx<-tax[,tax_id]
rownames(tax)<-tx #just adding tax identifiers to rownames, makes other things work
}

if(identical(colnames(spc.out),tx)==FALSE)  {
warning("spc colnames and tax rownames are not ordered same. I fixed it for you.")
tax<-tax[order(tx,colnames(spc.t)),]
}

return(list(spc=spc.out,env=env.out,tax=tax))}else{
	return(list(spc=spc.out,env=env.out))}
}

########################################################################################################
#' # 2. Get rid of samples with low reads
# A function to drop samples from a list containing spc env and tax(tax is left unchanged) and nreads.

drop_samples<-function(mylist, read_threshold=1000){
mylist$env<-mylist$env[rowSums(mylist$spc)>read_threshold,]
mylist$spc<-mylist$spc[rowSums(mylist$spc)>read_threshold,]
return(mylist)
}


#####################################################################################################################
#' # 3.  Create another list element with spc containing  standard number reads


#A function to rarify samples from a list containing spc env and tax.
#test
#mylist<-bac$spc
#resample_to=1000
rarefy_samples<-function(mylist,resample_to=1000){
require(vegan)
set.seed(35)#to reproduce
mylist$spc<-rrarefy(mylist$spc,resample_to)
	if(!is.null(mylist$tax)){
mylist$tax<-mylist$tax[colSums(mylist$spc)>0,]
		}
mylist$spc<-mylist$spc[,colSums(mylist$spc)>0]

return(mylist)
}

#############################################################################################################
#' # 4.  A nice looking ordination plot showing 2 main effects

#test
#ord<-mod
#env<-mg.l$rare$env
#group1<-"plot"
#group2<-"season"
#titl<-"blah"

ncplot <- function(ord,env,group1,group2,titl,group1_cols,group2_cols){

require(ggordiplots)
require(ggnewscale)

p.b<-gg_ordiplot(ord, groups =env[,group1],ellipse=F,spider=T,label=F,pt.size = 1,plot=FALSE)
p.h<-gg_ordiplot(ord, groups = env[,group2],hull=T,spider=T,label=T,plot=FALSE)

ord.data<-p.b$df_ord
names(ord.data)<-c("x","y", group1)
ord.data[,group2]<-p.h$df_ord[,3]
hulls<-p.h$df_hull
names(hulls)<-c("Group","x1","y1")
plot.b<-ggplot(data=ord.data, aes_string(x=ord$points[,1], y=ord$points[,2],color=group1))+
geom_point(aes_string(shape=group2),size=2) +

#if group_cols specified
#scale_color_manual(values=group2_cols)+
#theme(legend.position="none")+
geom_segment(data=p.b$df_spiders, aes(x=cntr.x, xend=x, y=cntr.y, yend=y, color=Group),
show.legend = TRUE)+
geom_label(data=p.b$df_mean.ord, aes(x=x, y=y, label=Group, color=Group),
                   fontface="bold", size=3,
                   label.size = 1,
                   label.r = unit(0.3, "lines"), show.legend = TRUE)+
geom_polygon(data=hulls,aes(x=x1, y=y1, fill=Group),inherit.aes = FALSE,alpha=0.05,
show.legend = TRUE)+
new_scale_colour()+
#if group cols specified
#scale_fill_manual(values=group1_cols)+
theme_bw()+ theme(legend.position="none")+
theme(axis.title.x=element_blank(),
        axis.text.x=element_blank(),
        axis.ticks.x=element_blank(),axis.title.y=element_blank(),
        axis.text.y=element_blank(),
        axis.ticks.y=element_blank(),
panel.grid.major = element_blank(), panel.grid.minor = element_blank())+
geom_text(data=p.h$df_mean.ord, aes(x=x, y=y, label=Group),inherit.aes = FALSE,
                   fontface="bold", size=3,
#colour=c(group1_cols),
                   show.legend = TRUE)+ ggtitle(titl)

return(plot.b)
}
###############################################################################################################
#' # 5. Aggregate counts of taxa

#aggreg8 function
#requires
#spc: species data.frame samples as rows
#ann: annotation file, ordered so first column is most aggregated, subsequent are incrementally less
#fac: the column name in ann by which to aggregate


aggreg8<-function(spc,ann,fac){
#aggreg8 function
#requires
#spc: species data.frame samples as rows, cols are species
#ann: annotation file, row names should match spc colnames
#fac: the column name in ann by which to aggregate


ann[is.na(ann)] <-"" # just clear any NAs
stopifnot("spc colnames and ann rownames should be the same, fool!" = identical(colnames(spc),rownames(ann)))
tabfac = factor(ann[,fac])
tab = apply(t(spc), MARGIN = 2, function(x) {
  tapply(x, INDEX = tabfac, FUN = sum, na.rm = TRUE, simplify = TRUE)
})

spc.out<-t(tab)
return(spc.out)
}

# eg bac.order<-aggregator(bac.final$rare$spc,bac.final$rare$tax,fac="Family")

###############################################################################################################
#' # 6. Remove duplicates by optimal number of reads. Sometimes duplicates get run, and you want to take them out.

#requires alignment of spc and env
rm_dups_x_reads<-function(spc,env, duplicated_var){
nreads<-rowSums(spc)
env.red<-env[which(abs(nreads) == ave(nreads, env[,duplicated_var],FUN=function(x) max(abs(x)))), ]
spc.red<-spc[which(abs(nreads) == ave(nreads, env[,duplicated_var],FUN=function(x) max(abs(x)))), ]
	return(list(spc=spc.red,env=env.red))
				      }

###############################################################################################################
#' # 7.Drop and reorder samples in spc env list by vector.
#eg sometimes you may want to align across multiple data lists (bact, fungi mg...)
# sub_vec_common_samples<-Reduce(intersect, list(mg.l$sub$env$REP_ID19,rr.l$sub$env$REP_ID19,rp1.l$sub$env$REP_ID19))

drop_order<-function(mylist, env_vec,sub_vec ){
mylist$spc<-mylist$spc[match(sub_vec,mylist$env[,env_vec]),]
mylist$env<-mylist$env[match(sub_vec,mylist$env[,env_vec]),]
return(mylist)
}

###############################################################################################################
#' # 8.  A gel like image of community data

ngs2gel<-function(spc,env,tax,subs=0.005,band_multiplier=200,order_samps,label_samps,label_tax_1="phylum",
label_tax_2="order", indie_seq_id,indie_group, indie_col, band_col="grey",y_labs_remover=2,y_labs_size=4){

#libs used
require(reshape2)
require(vegan)
require(ggh4x)
library(cowplot)
library(ggrepel)

#convert species df to proportions (0 to 1 scale)
spc<-decostand(spc,"total")

#remove species whose proportional abundance is less than "subs" argument & recode new "0" to NA
spc[spc<subs]<-0
spc<-spc[,colSums(spc)>0]
spc[spc==0]<-NA


#######################################################################
#reorder species randomly (NOT DONE)
#set.seed(32)
#ord<-sample(ncol(spc))
#spc<-spc[,ord]
#colnames(spc)<-1:ncol(spc)

########################################################################
#sort species according to taxonomy
#index the species names so they are no longer long sequences

taxa.sel<-colnames(spc)#separate seqs

#pull tax info for the subsetted taxa
tax_df<-cbind(tax[match(taxa.sel,rownames(tax)),],taxa.sel)

#add in unknown classification
tax_df[is.na(tax_df)]<-"Unknown"

#Now sort alphabetically by taxonomy (other sorts could be implemented eg phylogeny)
tax_df<-tax_df[do.call(order, tax_df), ]

#sort spc df according to ordered tax df
spc<-spc[,match(rownames(tax_df), colnames(spc))]
identical(rownames(tax_df),colnames(spc))#check

#add in a unique ID (index) for each taxon based on the taxonomy sort (this is to help order the y axis in plot)
colnames(spc)<-sprintf("%03d",1:ncol(spc))
tax_df$names_spc<-sprintf("%03d",1:nrow(tax_df))# create matching index for tax_df taxa

#add in indicator label (if requested)
if (hasArg(indie_seq_id)){
tax_df$indicator<-indie_group[match(rownames(tax_df),indie_seq_id)]
	}
#################################################################################
#reorder rows (samples) according to "order_samps" argument
spc<-spc[with(env,do.call(order,mget(order_samps))),]
env<-env[with(env,do.call(order,mget(order_samps))),]
env<-env[,order_samps]
env$x<-1:nrow(env)# this is just a vector which could order the samples along x in final plot, but also allows spacing between "lanes"

##################################################################################
#make long df for ggplotting,  with xy coords and abundances (for band fatness)
dat.w<-data.frame(spc,env,check.names=F)
dat.l<-melt(dat.w,id.vars=c(order_samps,"x"))

names(dat.l)[names(dat.l) == 'variable'] <- 'y'
names(dat.l)[names(dat.l) == 'value'] <- 'abund'

#add in selected taxon levels for labelling taxa to long df
dat.l$tax_1<-tax_df[match(dat.l$y,tax_df$names_spc),][,label_tax_1]
dat.l$tax_2<-tax_df[match(dat.l$y,tax_df$names_spc),][,label_tax_2]

#add in indicator designation (if requested)

if (hasArg(indie_seq_id)){
dat.l$indicator<-tax_df[match(dat.l$y,tax_df$names_spc),][,"indicator"]
}

#make a new name for labelling taxa
dat.l$id<-paste(dat.l$tax_2,dat.l$y,sep="_")


##############################################################
#we'll be plotting facets later, and want to add some padding to each facet at top and bottom
#min
dat.l$y_min<--1

#max which will be number of taxa per tax_1 designation (the facet) plus some padding
#count number of taxa per tax_1 taxonomic designation
tab<-data.frame(table(tax_df[,label_tax_1]))
dat.l$y_max<-tab[match(dat.l$tax_1,tab$Var1),]$Freq*1


############################################################################################
#we also wont have room on y axis to label all species, so need to subset somehow

#first need make a column containing labels for plotting using ggrepel.
#Because our plotting df is "long format", we need only one label name for each y value - ie non duplicated

dat.l$labels<-dat.l$id
dat.l$labels[duplicated(dat.l$labels)] <- "" #remove duplicates due to long df
dat.l$labels<-sub("_[^_]+$", "", dat.l$labels) #remove numeric index in name

#We can further filter these eg by abundance if the y is looking too busy
# This could be expanded to only include "indicators" or anything really of interest

#make table with abundances each taxon (ie make long df wide again)
tab<-data.frame(aggregate(dat.l$abund, by = list(dat.l$y), max,na.rm = TRUE))
tab$col<-ifelse(tab$x>y_labs_remover*subs,"black","transparent") # set colour code so low abund labels transparent

#add in group indicator colour codes (if indicator argument requested)

if (hasArg(indie_seq_id)) {
tab$indie_group<-tax_df[match(tab$Group.1,tax_df$names_spc),]$indicator
tab2<-data.frame(x=levels(as.factor(indie_group)),y=indie_col)
tab$indie_col<-tab2[match(tab$indie_group,tab2$x),]$y
tab$col<-ifelse(tab$col=="black"&tab$indie_group%in%indie_group,tab$indie_col, "transparent")
  }


dat.l$y_lab_col<-tab[match(dat.l$y,tab$Group.1),]$col #add to long df

#also set the band colour
dat.l$band_col<-band_col
if (hasArg(indie_seq_id)) {
dat.l$band_col<-tab[match(dat.l$y,tab$Group.1),]$indie_col #add to long df
dat.l$band_col<-ifelse(is.na(dat.l$band_col),band_col,dat.l$band_col)
				}

#not used but potentially useful for advanced labelling
#require(ggtext)
#dat.l$id2<-ifelse(dat.l$y_lab_col=="transparent",paste0("<span style='font-size: 0pt;color:NA'>", dat.l$id, "</span>"),dat.l$id)

#########################################################################################################
# Now plot. Two plots will be created, one being the plot (pl), the other the taxon labels (axis).
# They then get bound by cowplot

pl<-ggplot(dat.l,aes_string(x=paste0("interaction(x,",paste(rev(label_samps), collapse = ','),")") ))+
geom_blank()+
scale_x_discrete(NULL, guide = "axis_nested")  +
geom_segment(aes(x = x-0.3, y = y, xend = x+0.3, yend = y),col=dat.l$band_col,lineend = "round",linewidth=band_multiplier*dat.l$abund)+
theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank(),
panel.background = element_blank(), axis.line = element_blank(),strip.background=element_rect(fill="grey96"))+
scale_y_discrete(drop=TRUE,expand=c(0,0),,labels = NULL, name = NULL)+
theme(strip.text.y.right = element_text(angle = 0),panel.spacing.y = unit(0.2, "lines"),
axis.text.y = element_text(size=y_labs_size, angle=30),axis.ticks.y = element_blank(),plot.margin = margin(0, 0, 2, 0, "pt"))+
facet_grid2(tax_1~., scales = "free", space = "free_y",strip = strip_vanilla(size = "variable",clip = "off"))+
theme(strip.text.y = element_text(size = 6))+
geom_blank(aes(y = y_min-2)) +
geom_blank(aes(y = y_max+2))+
theme(axis.ticks.x = element_line(color = indie_col[env[,order_samps[1]]],linewidth=2))


axis<-ggplot(dat.l,aes(x=0, y=y))+
geom_blank()+
theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank(),
panel.background = element_blank(), axis.line = element_blank(),strip.background=element_rect(fill="grey96"))+
scale_y_discrete(drop=TRUE,expand=c(0,0),labels = NULL, name = NULL)+
theme(panel.spacing.y = unit(0.2, "lines"),
axis.text.y = element_blank(),axis.ticks.y = element_blank(),strip.background = element_blank(),
  strip.text.y = element_blank())+
facet_grid2(tax_1~., scales = "free", space = "free_y",strip = strip_vanilla(size = "variable",clip = "off"))+
geom_text_repel(label=dat.l$labels,y=dat.l$y,min.segment.length = grid::unit(0, "pt"),
                 color = dat.l$y_lab_col,  ## ggplot2 theme_grey() axis text
                 size = y_labs_size,  ## ggplot2 theme_grey() axis text
                  max.overlaps=Inf,
    seed         = 42,
    force        = 4,
    force_pull   = 0.0,
    nudge_x      = -1,
    direction    = "x",
    hjust        = 0,
    segment.size = 0.0001,
    segment.alpha = 0.1,
    box.padding  = 0.000
)+
scale_x_continuous(limits = c(-1, 0), expand = c(0, 0),
                     breaks = NULL, labels = NULL, name = NULL)+
  theme(panel.background = element_blank(),
        plot.margin = margin(0, 0, 2, 0, "pt"))+
 geom_blank(aes(y = y_min-2)) +
 geom_blank(aes(y = y_max+2))

#align and plot
aligned <- align_plots(axis, pl, align = "h", axis = "l")
plot_grid(plotlist = aligned, nrow = 1, rel_widths = c(2, 5))

}


#######USE

#without indies
#ngs2gel(bac$rare$spc,bac$rare$env,bac$rare$tax,subs=0.002,band_multiplier=200,order_samps=c("Fertiliser","block","pH"),label_samps=c("Fertiliser","block"),label_tax_1="phylum",
#label_tax_2="order", band_col="grey",y_labs_remover=1,y_labs_size=2)

#with indies
#ngs2gel(bac$rare$spc,bac$rare$env,bac$rare$tax,subs=0.001,band_multiplier=200,order_samps=c("Fertiliser","block","pH"),label_samps=c("Fertiliser","block"),label_tax_1="phylum",
#label_tax_2="order", indie_seq_id=inds[,"Row.names"],indie_group=factor(inds[,"group_name"],levels=c("NL","FL","L")),
#indie_col=c("red","blue","green"), band_col="grey",y_labs_remover=1,y_labs_size=2)
	##source_gist("https://gist.github.com/robiwangriff/79633a738b128e64226bf58381232da7")
	#' # 1. Merge and filter

	#'A small function to merge and align, and output a list containing the aligned $spc, $env, and $tax
	#' files. The main part of this is merge() and requires specifying two "common reference" vectors in both datasets.

	aligner<-function(spc,env,tax=NULL, spc_id="row.names",env_id="row.names",tax_id="row.names",transpose_spc=FALSE){

	#transpose the spc file so sample IDs are rows (and row names)
	spc.t<-spc
	if(transpose_spc==TRUE){
	spc.t=t(spc)}
	#merge
	spc_env_merged<- merge(spc.t, env, by.x=spc_id, by.y=env_id)

	#fix the added Row.name column to be row.names (a nuance of merge())
	rownames(spc_env_merged)=spc_env_merged$Row.names
	spc_env_merged=spc_env_merged[2:length(spc_env_merged)]#get rid of Row.names

	#split again into spc and env files
	y<-ncol(spc.t)#counts how many ASVs - needed to split in right place
	spc.out<-spc_env_merged[,1:y]
	env.out<-spc_env_merged[,(y+1):ncol(spc_env_merged)]

	#check tax file is aligned
	if(!is.null(tax)){
	if(tax_id=="row.names"){
	tx<-row.names(tax)}else{
	tx<-tax[,tax_id]
	rownames(tax)<-tx #just adding tax identifiers to rownames, makes other things work
	}

	if(identical(colnames(spc.out),tx)==FALSE) {
	warning("spc colnames and tax rownames are not ordered same. I fixed it for you.")
	tax<-tax[order(tx,colnames(spc.t)),]
	}

	return(list(spc=spc.out,env=env.out,tax=tax))}else{
	return(list(spc=spc.out,env=env.out))}
	}

	########################################################################################################
	#' # 2. Get rid of samples with low reads
	# A function to drop samples from a list containing spc env and tax(tax is left unchanged) and nreads.

	drop_samples<-function(mylist, read_threshold=1000){
	mylist$env<-mylist$env[rowSums(mylist$spc)>read_threshold,]
	mylist$spc<-mylist$spc[rowSums(mylist$spc)>read_threshold,]
	return(mylist)
	}


	#####################################################################################################################
	#' # 3. Create another list element with spc containing standard number reads


	#A function to rarify samples from a list containing spc env and tax.
	#test
	#mylist<-bac$spc
	#resample_to=1000
	rarefy_samples<-function(mylist,resample_to=1000){
	require(vegan)
	set.seed(35)#to reproduce
	mylist$spc<-rrarefy(mylist$spc,resample_to)
	if(!is.null(mylist$tax)){
	mylist$tax<-mylist$tax[colSums(mylist$spc)>0,]
	}
	mylist$spc<-mylist$spc[,colSums(mylist$spc)>0]

	return(mylist)
	}

	#############################################################################################################
	#' # 4. A nice looking ordination plot showing 2 main effects

	#test
	#ord<-mod
	#env<-mg.l$rare$env
	#group1<-"plot"
	#group2<-"season"
	#titl<-"blah"

	ncplot <- function(ord,env,group1,group2,titl,group1_cols,group2_cols){

	require(ggordiplots)
	require(ggnewscale)

	p.b<-gg_ordiplot(ord, groups =env[,group1],ellipse=F,spider=T,label=F,pt.size = 1,plot=FALSE)
	p.h<-gg_ordiplot(ord, groups = env[,group2],hull=T,spider=T,label=T,plot=FALSE)

	ord.data<-p.b$df_ord
	names(ord.data)<-c("x","y", group1)
	ord.data[,group2]<-p.h$df_ord[,3]
	hulls<-p.h$df_hull
	names(hulls)<-c("Group","x1","y1")
	plot.b<-ggplot(data=ord.data, aes_string(x=ord$points[,1], y=ord$points[,2],color=group1))+
	geom_point(aes_string(shape=group2),size=2) +

	#if group_cols specified
	#scale_color_manual(values=group2_cols)+
	#theme(legend.position="none")+
	geom_segment(data=p.b$df_spiders, aes(x=cntr.x, xend=x, y=cntr.y, yend=y, color=Group),
	show.legend = TRUE)+
	geom_label(data=p.b$df_mean.ord, aes(x=x, y=y, label=Group, color=Group),
	fontface="bold", size=3,
	label.size = 1,
	label.r = unit(0.3, "lines"), show.legend = TRUE)+
	geom_polygon(data=hulls,aes(x=x1, y=y1, fill=Group),inherit.aes = FALSE,alpha=0.05,
	show.legend = TRUE)+
	new_scale_colour()+
	#if group cols specified
	#scale_fill_manual(values=group1_cols)+
	theme_bw()+ theme(legend.position="none")+
	theme(axis.title.x=element_blank(),
	axis.text.x=element_blank(),
	axis.ticks.x=element_blank(),axis.title.y=element_blank(),
	axis.text.y=element_blank(),
	axis.ticks.y=element_blank(),
	panel.grid.major = element_blank(), panel.grid.minor = element_blank())+
	geom_text(data=p.h$df_mean.ord, aes(x=x, y=y, label=Group),inherit.aes = FALSE,
	fontface="bold", size=3,
	#colour=c(group1_cols),
	show.legend = TRUE)+ ggtitle(titl)

	return(plot.b)
	}
	###############################################################################################################
	#' # 5. Aggregate counts of taxa

	#aggreg8 function
	#requires
	#spc: species data.frame samples as rows
	#ann: annotation file, ordered so first column is most aggregated, subsequent are incrementally less
	#fac: the column name in ann by which to aggregate



	aggreg8<-function(spc,ann,fac){
	#aggreg8 function
	#requires
	#spc: species data.frame samples as rows, cols are species
	#ann: annotation file, row names should match spc colnames
	#fac: the column name in ann by which to aggregate


	ann[is.na(ann)] <-"" # just clear any NAs
	stopifnot("spc colnames and ann rownames should be the same, fool!" = identical(colnames(spc),rownames(ann)))
	tabfac = factor(ann[,fac])
	tab = apply(t(spc), MARGIN = 2, function(x) {
	tapply(x, INDEX = tabfac, FUN = sum, na.rm = TRUE, simplify = TRUE)
	})

	spc.out<-t(tab)
	return(spc.out)
	}

	# eg bac.order<-aggregator(bac.final$rare$spc,bac.final$rare$tax,fac="Family")

	###############################################################################################################
	#' # 6. Remove duplicates by optimal number of reads. Sometimes duplicates get run, and you want to take them out.

	#requires alignment of spc and env
	rm_dups_x_reads<-function(spc,env, duplicated_var){
	nreads<-rowSums(spc)
	env.red<-env[which(abs(nreads) == ave(nreads, env[,duplicated_var],FUN=function(x) max(abs(x)))), ]
	spc.red<-spc[which(abs(nreads) == ave(nreads, env[,duplicated_var],FUN=function(x) max(abs(x)))), ]
	return(list(spc=spc.red,env=env.red))
	}

	###############################################################################################################
	#' # 7.Drop and reorder samples in spc env list by vector.
	#eg sometimes you may want to align across multiple data lists (bact, fungi mg...)
	# sub_vec_common_samples<-Reduce(intersect, list(mg.l$sub$env$REP_ID19,rr.l$sub$env$REP_ID19,rp1.l$sub$env$REP_ID19))

	drop_order<-function(mylist, env_vec,sub_vec ){
	mylist$spc<-mylist$spc[match(sub_vec,mylist$env[,env_vec]),]
	mylist$env<-mylist$env[match(sub_vec,mylist$env[,env_vec]),]
	return(mylist)
	}

	###############################################################################################################
	#' # 8. A gel like image of community data

	ngs2gel<-function(spc,env,tax,subs=0.005,band_multiplier=200,order_samps,label_samps,label_tax_1="phylum",
	label_tax_2="order", indie_seq_id,indie_group, indie_col, band_col="grey",y_labs_remover=2,y_labs_size=4){

	#libs used
	require(reshape2)
	require(vegan)
	require(ggh4x)
	library(cowplot)
	library(ggrepel)

	#convert species df to proportions (0 to 1 scale)
	spc<-decostand(spc,"total")

	#remove species whose proportional abundance is less than "subs" argument & recode new "0" to NA
	spc[spc<subs]<-0
	spc<-spc[,colSums(spc)>0]
	spc[spc==0]<-NA


	#######################################################################
	#reorder species randomly (NOT DONE)
	#set.seed(32)
	#ord<-sample(ncol(spc))
	#spc<-spc[,ord]
	#colnames(spc)<-1:ncol(spc)

	########################################################################
	#sort species according to taxonomy
	#index the species names so they are no longer long sequences

	taxa.sel<-colnames(spc)#separate seqs

	#pull tax info for the subsetted taxa
	tax_df<-cbind(tax[match(taxa.sel,rownames(tax)),],taxa.sel)

	#add in unknown classification
	tax_df[is.na(tax_df)]<-"Unknown"

	#Now sort alphabetically by taxonomy (other sorts could be implemented eg phylogeny)
	tax_df<-tax_df[do.call(order, tax_df), ]

	#sort spc df according to ordered tax df
	spc<-spc[,match(rownames(tax_df), colnames(spc))]
	identical(rownames(tax_df),colnames(spc))#check

	#add in a unique ID (index) for each taxon based on the taxonomy sort (this is to help order the y axis in plot)
	colnames(spc)<-sprintf("%03d",1:ncol(spc))
	tax_df$names_spc<-sprintf("%03d",1:nrow(tax_df))# create matching index for tax_df taxa

	#add in indicator label (if requested)
	if (hasArg(indie_seq_id)){
	tax_df$indicator<-indie_group[match(rownames(tax_df),indie_seq_id)]
	}
	#################################################################################
	#reorder rows (samples) according to "order_samps" argument
	spc<-spc[with(env,do.call(order,mget(order_samps))),]
	env<-env[with(env,do.call(order,mget(order_samps))),]
	env<-env[,order_samps]
	env$x<-1:nrow(env)# this is just a vector which could order the samples along x in final plot, but also allows spacing between "lanes"

	##################################################################################
	#make long df for ggplotting, with xy coords and abundances (for band fatness)
	dat.w<-data.frame(spc,env,check.names=F)
	dat.l<-melt(dat.w,id.vars=c(order_samps,"x"))

	names(dat.l)[names(dat.l) == 'variable'] <- 'y'
	names(dat.l)[names(dat.l) == 'value'] <- 'abund'

	#add in selected taxon levels for labelling taxa to long df
	dat.l$tax_1<-tax_df[match(dat.l$y,tax_df$names_spc),][,label_tax_1]
	dat.l$tax_2<-tax_df[match(dat.l$y,tax_df$names_spc),][,label_tax_2]

	#add in indicator designation (if requested)

	if (hasArg(indie_seq_id)){
	dat.l$indicator<-tax_df[match(dat.l$y,tax_df$names_spc),][,"indicator"]
	}

	#make a new name for labelling taxa
	dat.l$id<-paste(dat.l$tax_2,dat.l$y,sep="_")


	##############################################################
	#we'll be plotting facets later, and want to add some padding to each facet at top and bottom
	#min
	dat.l$y_min<--1

	#max which will be number of taxa per tax_1 designation (the facet) plus some padding
	#count number of taxa per tax_1 taxonomic designation
	tab<-data.frame(table(tax_df[,label_tax_1]))
	dat.l$y_max<-tab[match(dat.l$tax_1,tab$Var1),]$Freq*1


	############################################################################################
	#we also wont have room on y axis to label all species, so need to subset somehow

	#first need make a column containing labels for plotting using ggrepel.
	#Because our plotting df is "long format", we need only one label name for each y value - ie non duplicated

	dat.l$labels<-dat.l$id
	dat.l$labels[duplicated(dat.l$labels)] <- "" #remove duplicates due to long df
	dat.l$labels<-sub("_[^_]+$", "", dat.l$labels) #remove numeric index in name

	#We can further filter these eg by abundance if the y is looking too busy
	# This could be expanded to only include "indicators" or anything really of interest

	#make table with abundances each taxon (ie make long df wide again)
	tab<-data.frame(aggregate(dat.l$abund, by = list(dat.l$y), max,na.rm = TRUE))
	tab$col<-ifelse(tab$x>y_labs_remover*subs,"black","transparent") # set colour code so low abund labels transparent

	#add in group indicator colour codes (if indicator argument requested)

	if (hasArg(indie_seq_id)) {
	tab$indie_group<-tax_df[match(tab$Group.1,tax_df$names_spc),]$indicator
	tab2<-data.frame(x=levels(as.factor(indie_group)),y=indie_col)
	tab$indie_col<-tab2[match(tab$indie_group,tab2$x),]$y
	tab$col<-ifelse(tab$col=="black"&tab$indie_group%in%indie_group,tab$indie_col, "transparent")
	}


	dat.l$y_lab_col<-tab[match(dat.l$y,tab$Group.1),]$col #add to long df

	#also set the band colour
	dat.l$band_col<-band_col
	if (hasArg(indie_seq_id)) {
	dat.l$band_col<-tab[match(dat.l$y,tab$Group.1),]$indie_col #add to long df
	dat.l$band_col<-ifelse(is.na(dat.l$band_col),band_col,dat.l$band_col)
	}

	#not used but potentially useful for advanced labelling
	#require(ggtext)
	#dat.l$id2<-ifelse(dat.l$y_lab_col=="transparent",paste0("<span style='font-size: 0pt;color:NA'>", dat.l$id, "</span>"),dat.l$id)

	#########################################################################################################
	# Now plot. Two plots will be created, one being the plot (pl), the other the taxon labels (axis).
	# They then get bound by cowplot

	pl<-ggplot(dat.l,aes_string(x=paste0("interaction(x,",paste(rev(label_samps), collapse = ','),")") ))+
	geom_blank()+
	scale_x_discrete(NULL, guide = "axis_nested") +
	geom_segment(aes(x = x-0.3, y = y, xend = x+0.3, yend = y),col=dat.l$band_col,lineend = "round",linewidth=band_multiplier*dat.l$abund)+
	theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank(),
	panel.background = element_blank(), axis.line = element_blank(),strip.background=element_rect(fill="grey96"))+
	scale_y_discrete(drop=TRUE,expand=c(0,0),,labels = NULL, name = NULL)+
	theme(strip.text.y.right = element_text(angle = 0),panel.spacing.y = unit(0.2, "lines"),
	axis.text.y = element_text(size=y_labs_size, angle=30),axis.ticks.y = element_blank(),plot.margin = margin(0, 0, 2, 0, "pt"))+
	facet_grid2(tax_1~., scales = "free", space = "free_y",strip = strip_vanilla(size = "variable",clip = "off"))+
	theme(strip.text.y = element_text(size = 6))+
	geom_blank(aes(y = y_min-2)) +
	geom_blank(aes(y = y_max+2))+
	theme(axis.ticks.x = element_line(color = indie_col[env[,order_samps[1]]],linewidth=2))


	axis<-ggplot(dat.l,aes(x=0, y=y))+
	geom_blank()+
	theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank(),
	panel.background = element_blank(), axis.line = element_blank(),strip.background=element_rect(fill="grey96"))+
	scale_y_discrete(drop=TRUE,expand=c(0,0),labels = NULL, name = NULL)+
	theme(panel.spacing.y = unit(0.2, "lines"),
	axis.text.y = element_blank(),axis.ticks.y = element_blank(),strip.background = element_blank(),
	strip.text.y = element_blank())+
	facet_grid2(tax_1~., scales = "free", space = "free_y",strip = strip_vanilla(size = "variable",clip = "off"))+
	geom_text_repel(label=dat.l$labels,y=dat.l$y,min.segment.length = grid::unit(0, "pt"),
	color = dat.l$y_lab_col, ## ggplot2 theme_grey() axis text
	size = y_labs_size, ## ggplot2 theme_grey() axis text
	max.overlaps=Inf,
	seed = 42,
	force = 4,
	force_pull = 0.0,
	nudge_x = -1,
	direction = "x",
	hjust = 0,
	segment.size = 0.0001,
	segment.alpha = 0.1,
	box.padding = 0.000
	)+
	scale_x_continuous(limits = c(-1, 0), expand = c(0, 0),
	breaks = NULL, labels = NULL, name = NULL)+
	theme(panel.background = element_blank(),
	plot.margin = margin(0, 0, 2, 0, "pt"))+
	geom_blank(aes(y = y_min-2)) +
	geom_blank(aes(y = y_max+2))

	#align and plot
	aligned <- align_plots(axis, pl, align = "h", axis = "l")
	plot_grid(plotlist = aligned, nrow = 1, rel_widths = c(2, 5))

	}


	#######USE

	#without indies
	#ngs2gel(bac$rare$spc,bac$rare$env,bac$rare$tax,subs=0.002,band_multiplier=200,order_samps=c("Fertiliser","block","pH"),label_samps=c("Fertiliser","block"),label_tax_1="phylum",
	#label_tax_2="order", band_col="grey",y_labs_remover=1,y_labs_size=2)

	#with indies
	#ngs2gel(bac$rare$spc,bac$rare$env,bac$rare$tax,subs=0.001,band_multiplier=200,order_samps=c("Fertiliser","block","pH"),label_samps=c("Fertiliser","block"),label_tax_1="phylum",
	#label_tax_2="order", indie_seq_id=inds[,"Row.names"],indie_group=factor(inds[,"group_name"],levels=c("NL","FL","L")),
	#indie_col=c("red","blue","green"), band_col="grey",y_labs_remover=1,y_labs_size=2)