Skip to content

Instantly share code, notes, and snippets.

@robiwangriff
Last active September 22, 2023 10:38
Show Gist options
  • Save robiwangriff/62ee1fd1adfcb50cfd5520075ce9da86 to your computer and use it in GitHub Desktop.
Save robiwangriff/62ee1fd1adfcb50cfd5520075ce9da86 to your computer and use it in GitHub Desktop.
do you remember Spangles? have you visualized microbes on acrylamide gels? do you want to view amplicon data as a gel image?
#################################################################################################################
#install
#library(devtools)
#source_gist("https://gist.github.com/robiwangriff/62ee1fd1adfcb50cfd5520075ce9da86")
#spc= a species(cols) by samples (rows) data frame
#env= an environmental metadata data frame, aligned with spc. Used to order and label samples on x axis (#the order loaded on gel)
#tax= a taxonomy file with sequences/OTU ids as row names - normally would match colnames of spc df. Though shouldnt matter.
#subs= Unlike real gels, we cant plot the abundance of every species. This sets a relative abundance threshold for taxa to plot.
#band_multiplier= band phatness
#order_samps= column(s) in env to order x axis, outermost first eg c("Fertiliser","block")
#label_samps= column(s) in env to label x axis, outermost first eg c("Fertiliser","block")
#y_labs_remover=1 We cant label every taxon on y axis, this sets an abundance threshold for taxa to label as: y_labs_remover*subs
#y_labs_size= text size y labels
#band_col= band colour
#label_tax_1= column in tax denoting the facetted taxonomic labels on right hand size - eg go broad, "Phylum"...or "phylum"
#label_tax_2= column in tax denoting the taxonomic labels on left hand side eg more resolved - "order"
#ISSUES: very hacky, some code redundant and much can be improved:
#1. optimising ggrepel params for plotting many y axis labs.
#2. making the function more generalizable
#3. indicator labelling
#...
##########################################################################################################################
ngs2gel<-function(spc,env,tax,subs=0.005,band_multiplier=200,order_samps,label_samps,label_tax_1="phylum",
label_tax_2="order", band_col="grey",y_labs_remover=2,y_labs_size=4){
#libs used
require(reshape2)
require(vegan)
require(ggh4x)
library(cowplot)
library(ggrepel)
#convert species df to proportions (0 to 1 scale)
spc<-decostand(spc,"total")
#remove species whose proportional abundance is less than "subs" argument & recode new "0" to NA
spc[spc<subs]<-0
spc<-spc[,colSums(spc)>0]
spc[spc==0]<-NA
#######################################################################
#reorder species randomly (NOT DONE)
#set.seed(32)
#ord<-sample(ncol(spc))
#spc<-spc[,ord]
#colnames(spc)<-1:ncol(spc)
########################################################################
#sort species according to taxonomy
#index the species names so they are no longer long sequences
taxa.sel<-colnames(spc)#separate seqs
#pull tax info for the subsetted taxa
tax_df<-cbind(tax[match(taxa.sel,rownames(tax)),],taxa.sel)
#add in unknown classification
tax_df[is.na(tax_df)]<-"Unknown"
#Now sort alphabetically by taxonomy (other sorts could be implemented eg phylogeny)
tax_df<-tax_df[do.call(order, tax_df), ]
#sort spc df according to ordered tax df
spc<-spc[,match(rownames(tax_df), colnames(spc))]
identical(rownames(tax_df),colnames(spc))#check
#add in a unique ID (index) for each taxon based on the taxonomy sort (this is to help order the y axis in plot)
colnames(spc)<-sprintf("%03d",1:ncol(spc))
tax_df$names_spc<-sprintf("%03d",1:nrow(tax_df))# create matching index for tax_df taxa
#################################################################################
#reorder rows (samples) according to "order_samps" argument
spc<-spc[with(env,do.call(order,mget(order_samps))),]
env<-env[with(env,do.call(order,mget(order_samps))),]
env<-env[,order_samps]
env$x<-1:nrow(env)# this is just a vector which could order the samples along x in final plot, but also allows spacing between "lanes"
##################################################################################
#make long df for ggplotting, with xy coords and abundances (for band fatness)
dat.w<-data.frame(spc,env,check.names=F)
dat.l<-melt(dat.w,id.vars=c(order_samps,"x"))
names(dat.l)[names(dat.l) == 'variable'] <- 'y'
names(dat.l)[names(dat.l) == 'value'] <- 'abund'
#add in selected taxon levels for labelling taxa to long df
dat.l$tax_1<-tax_df[match(dat.l$y,tax_df$names_spc),][,label_tax_1]
dat.l$tax_2<-tax_df[match(dat.l$y,tax_df$names_spc),][,label_tax_2]
#make a new name for labelling taxa
dat.l$id<-paste(dat.l$tax_2,dat.l$y,sep="_")
##############################################################
#we'll be plotting facets later, and want to add some padding to each facet at top and bottom
#min
dat.l$y_min<--1
#max which will be number of taxa per tax_1 designation (the facet) plus some padding
#count number of taxa per tax_1 taxonomic designation
tab<-data.frame(table(tax_df[,label_tax_1]))
dat.l$y_max<-tab[match(dat.l$tax_1,tab$Var1),]$Freq*1
############################################################################################
#we also wont have room on y axis to label all species, so need to subset somehow
#first need make a column containing labels for plotting using ggrepel.
#Because our plotting df is "long format", we need only one label name for each y value - ie non duplicated
dat.l$labels<-dat.l$id
dat.l$labels[duplicated(dat.l$labels)] <- "" #remove duplicates due to long df
dat.l$labels<-sub("_[^_]+$", "", dat.l$labels) #remove numeric index in name
#We can further filter these eg by abundance if the y is looking too busy
# This could be expanded to only include "indicators" or anything really of interest
#make table with abundances each taxon (ie make long df wide again)
tab<-data.frame(aggregate(dat.l$abund, by = list(dat.l$y), max,na.rm = TRUE))
tab$col<-ifelse(tab$x>y_labs_remover*subs,"black","transparent") # set colour code so low abund labels transparent
dat.l$y_lab_col<-tab[match(dat.l$y,tab$Group.1),]$col #add to long df
#not used but potentially useful for advanced labelling
#require(ggtext)
#dat.l$id2<-ifelse(dat.l$y_lab_col=="transparent",paste0("<span style='font-size: 0pt;color:NA'>", dat.l$id, "</span>"),dat.l$id)
#########################################################################################################
# Now plot. Two plots will be created, one being the plot (pl), the other the taxon labels (axis).
# They then get bound by cowplot
pl<-ggplot(dat.l,aes_string(x=paste0("interaction(x,",paste(rev(label_samps), collapse = ','),")") ))+
geom_blank()+
scale_x_discrete(NULL, guide = "axis_nested") +
geom_segment(aes(x = x-0.3, y = y, xend = x+0.3, yend = y),col=band_col,lineend = "round",linewidth=band_multiplier*dat.l$abund)+
theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank(),
panel.background = element_blank(), axis.line = element_blank(),strip.background=element_rect(fill="grey96"))+
scale_y_discrete(drop=TRUE,expand=c(0,0),,labels = NULL, name = NULL)+
theme(strip.text.y.right = element_text(angle = 0),panel.spacing.y = unit(0.2, "lines"),
axis.text.y = element_text(size=y_labs_size, angle=30),axis.ticks.y = element_blank(),plot.margin = ggplot2::margin(0, 0, 2, 0, "pt"))+
facet_grid2(tax_1~., scales = "free", space = "free_y",strip = strip_vanilla(size = "variable",clip = "off"))+
theme(strip.text.y = element_text(size = 6))+
geom_blank(aes(y = y_min-2)) +
geom_blank(aes(y = y_max+2))
axis<-ggplot(dat.l,aes(x=0, y=y))+
geom_blank()+
theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank(),
panel.background = element_blank(), axis.line = element_blank(),strip.background=element_rect(fill="grey96"))+
scale_y_discrete(drop=TRUE,expand=c(0,0),labels = NULL, name = NULL)+
theme(panel.spacing.y = unit(0.2, "lines"),
axis.text.y = element_blank(),axis.ticks.y = element_blank(),strip.background = element_blank(),
strip.text.y = element_blank())+
facet_grid2(tax_1~., scales = "free", space = "free_y",strip = strip_vanilla(size = "variable",clip = "off"))+
geom_text_repel(label=dat.l$labels,y=dat.l$y,min.segment.length = grid::unit(0, "pt"),
color = dat.l$y_lab_col, ## ggplot2 theme_grey() axis text
size = y_labs_size, ## ggplot2 theme_grey() axis text
max.overlaps=Inf,
seed = 42,
force = 4,
force_pull = 0.0,
nudge_x = -1,
direction = "x",
hjust = 0,
segment.size = 0.0001,
segment.alpha = 0.1,
box.padding = 0.000
)+
scale_x_continuous(limits = c(-1, 0), expand = c(0, 0),
breaks = NULL, labels = NULL, name = NULL)+
theme(panel.background = element_blank(),
plot.margin = ggplot2::margin(0, 0, 2, 0, "pt"))+
geom_blank(aes(y = y_min-2)) +
geom_blank(aes(y = y_max+2))
#align and plot
aligned <- align_plots(axis, pl, align = "h", axis = "l")
plot_grid(plotlist = aligned, nrow = 1, rel_widths = c(1, 5))
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment