Last active
September 22, 2023 10:38
-
-
Save robiwangriff/62ee1fd1adfcb50cfd5520075ce9da86 to your computer and use it in GitHub Desktop.
do you remember Spangles? have you visualized microbes on acrylamide gels? do you want to view amplicon data as a gel image?
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
################################################################################################################# | |
#install | |
#library(devtools) | |
#source_gist("https://gist.github.com/robiwangriff/62ee1fd1adfcb50cfd5520075ce9da86") | |
#spc= a species(cols) by samples (rows) data frame | |
#env= an environmental metadata data frame, aligned with spc. Used to order and label samples on x axis (#the order loaded on gel) | |
#tax= a taxonomy file with sequences/OTU ids as row names - normally would match colnames of spc df. Though shouldnt matter. | |
#subs= Unlike real gels, we cant plot the abundance of every species. This sets a relative abundance threshold for taxa to plot. | |
#band_multiplier= band phatness | |
#order_samps= column(s) in env to order x axis, outermost first eg c("Fertiliser","block") | |
#label_samps= column(s) in env to label x axis, outermost first eg c("Fertiliser","block") | |
#y_labs_remover=1 We cant label every taxon on y axis, this sets an abundance threshold for taxa to label as: y_labs_remover*subs | |
#y_labs_size= text size y labels | |
#band_col= band colour | |
#label_tax_1= column in tax denoting the facetted taxonomic labels on right hand size - eg go broad, "Phylum"...or "phylum" | |
#label_tax_2= column in tax denoting the taxonomic labels on left hand side eg more resolved - "order" | |
#ISSUES: very hacky, some code redundant and much can be improved: | |
#1. optimising ggrepel params for plotting many y axis labs. | |
#2. making the function more generalizable | |
#3. indicator labelling | |
#... | |
########################################################################################################################## | |
ngs2gel<-function(spc,env,tax,subs=0.005,band_multiplier=200,order_samps,label_samps,label_tax_1="phylum", | |
label_tax_2="order", band_col="grey",y_labs_remover=2,y_labs_size=4){ | |
#libs used | |
require(reshape2) | |
require(vegan) | |
require(ggh4x) | |
library(cowplot) | |
library(ggrepel) | |
#convert species df to proportions (0 to 1 scale) | |
spc<-decostand(spc,"total") | |
#remove species whose proportional abundance is less than "subs" argument & recode new "0" to NA | |
spc[spc<subs]<-0 | |
spc<-spc[,colSums(spc)>0] | |
spc[spc==0]<-NA | |
####################################################################### | |
#reorder species randomly (NOT DONE) | |
#set.seed(32) | |
#ord<-sample(ncol(spc)) | |
#spc<-spc[,ord] | |
#colnames(spc)<-1:ncol(spc) | |
######################################################################## | |
#sort species according to taxonomy | |
#index the species names so they are no longer long sequences | |
taxa.sel<-colnames(spc)#separate seqs | |
#pull tax info for the subsetted taxa | |
tax_df<-cbind(tax[match(taxa.sel,rownames(tax)),],taxa.sel) | |
#add in unknown classification | |
tax_df[is.na(tax_df)]<-"Unknown" | |
#Now sort alphabetically by taxonomy (other sorts could be implemented eg phylogeny) | |
tax_df<-tax_df[do.call(order, tax_df), ] | |
#sort spc df according to ordered tax df | |
spc<-spc[,match(rownames(tax_df), colnames(spc))] | |
identical(rownames(tax_df),colnames(spc))#check | |
#add in a unique ID (index) for each taxon based on the taxonomy sort (this is to help order the y axis in plot) | |
colnames(spc)<-sprintf("%03d",1:ncol(spc)) | |
tax_df$names_spc<-sprintf("%03d",1:nrow(tax_df))# create matching index for tax_df taxa | |
################################################################################# | |
#reorder rows (samples) according to "order_samps" argument | |
spc<-spc[with(env,do.call(order,mget(order_samps))),] | |
env<-env[with(env,do.call(order,mget(order_samps))),] | |
env<-env[,order_samps] | |
env$x<-1:nrow(env)# this is just a vector which could order the samples along x in final plot, but also allows spacing between "lanes" | |
################################################################################## | |
#make long df for ggplotting, with xy coords and abundances (for band fatness) | |
dat.w<-data.frame(spc,env,check.names=F) | |
dat.l<-melt(dat.w,id.vars=c(order_samps,"x")) | |
names(dat.l)[names(dat.l) == 'variable'] <- 'y' | |
names(dat.l)[names(dat.l) == 'value'] <- 'abund' | |
#add in selected taxon levels for labelling taxa to long df | |
dat.l$tax_1<-tax_df[match(dat.l$y,tax_df$names_spc),][,label_tax_1] | |
dat.l$tax_2<-tax_df[match(dat.l$y,tax_df$names_spc),][,label_tax_2] | |
#make a new name for labelling taxa | |
dat.l$id<-paste(dat.l$tax_2,dat.l$y,sep="_") | |
############################################################## | |
#we'll be plotting facets later, and want to add some padding to each facet at top and bottom | |
#min | |
dat.l$y_min<--1 | |
#max which will be number of taxa per tax_1 designation (the facet) plus some padding | |
#count number of taxa per tax_1 taxonomic designation | |
tab<-data.frame(table(tax_df[,label_tax_1])) | |
dat.l$y_max<-tab[match(dat.l$tax_1,tab$Var1),]$Freq*1 | |
############################################################################################ | |
#we also wont have room on y axis to label all species, so need to subset somehow | |
#first need make a column containing labels for plotting using ggrepel. | |
#Because our plotting df is "long format", we need only one label name for each y value - ie non duplicated | |
dat.l$labels<-dat.l$id | |
dat.l$labels[duplicated(dat.l$labels)] <- "" #remove duplicates due to long df | |
dat.l$labels<-sub("_[^_]+$", "", dat.l$labels) #remove numeric index in name | |
#We can further filter these eg by abundance if the y is looking too busy | |
# This could be expanded to only include "indicators" or anything really of interest | |
#make table with abundances each taxon (ie make long df wide again) | |
tab<-data.frame(aggregate(dat.l$abund, by = list(dat.l$y), max,na.rm = TRUE)) | |
tab$col<-ifelse(tab$x>y_labs_remover*subs,"black","transparent") # set colour code so low abund labels transparent | |
dat.l$y_lab_col<-tab[match(dat.l$y,tab$Group.1),]$col #add to long df | |
#not used but potentially useful for advanced labelling | |
#require(ggtext) | |
#dat.l$id2<-ifelse(dat.l$y_lab_col=="transparent",paste0("<span style='font-size: 0pt;color:NA'>", dat.l$id, "</span>"),dat.l$id) | |
######################################################################################################### | |
# Now plot. Two plots will be created, one being the plot (pl), the other the taxon labels (axis). | |
# They then get bound by cowplot | |
pl<-ggplot(dat.l,aes_string(x=paste0("interaction(x,",paste(rev(label_samps), collapse = ','),")") ))+ | |
geom_blank()+ | |
scale_x_discrete(NULL, guide = "axis_nested") + | |
geom_segment(aes(x = x-0.3, y = y, xend = x+0.3, yend = y),col=band_col,lineend = "round",linewidth=band_multiplier*dat.l$abund)+ | |
theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank(), | |
panel.background = element_blank(), axis.line = element_blank(),strip.background=element_rect(fill="grey96"))+ | |
scale_y_discrete(drop=TRUE,expand=c(0,0),,labels = NULL, name = NULL)+ | |
theme(strip.text.y.right = element_text(angle = 0),panel.spacing.y = unit(0.2, "lines"), | |
axis.text.y = element_text(size=y_labs_size, angle=30),axis.ticks.y = element_blank(),plot.margin = ggplot2::margin(0, 0, 2, 0, "pt"))+ | |
facet_grid2(tax_1~., scales = "free", space = "free_y",strip = strip_vanilla(size = "variable",clip = "off"))+ | |
theme(strip.text.y = element_text(size = 6))+ | |
geom_blank(aes(y = y_min-2)) + | |
geom_blank(aes(y = y_max+2)) | |
axis<-ggplot(dat.l,aes(x=0, y=y))+ | |
geom_blank()+ | |
theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank(), | |
panel.background = element_blank(), axis.line = element_blank(),strip.background=element_rect(fill="grey96"))+ | |
scale_y_discrete(drop=TRUE,expand=c(0,0),labels = NULL, name = NULL)+ | |
theme(panel.spacing.y = unit(0.2, "lines"), | |
axis.text.y = element_blank(),axis.ticks.y = element_blank(),strip.background = element_blank(), | |
strip.text.y = element_blank())+ | |
facet_grid2(tax_1~., scales = "free", space = "free_y",strip = strip_vanilla(size = "variable",clip = "off"))+ | |
geom_text_repel(label=dat.l$labels,y=dat.l$y,min.segment.length = grid::unit(0, "pt"), | |
color = dat.l$y_lab_col, ## ggplot2 theme_grey() axis text | |
size = y_labs_size, ## ggplot2 theme_grey() axis text | |
max.overlaps=Inf, | |
seed = 42, | |
force = 4, | |
force_pull = 0.0, | |
nudge_x = -1, | |
direction = "x", | |
hjust = 0, | |
segment.size = 0.0001, | |
segment.alpha = 0.1, | |
box.padding = 0.000 | |
)+ | |
scale_x_continuous(limits = c(-1, 0), expand = c(0, 0), | |
breaks = NULL, labels = NULL, name = NULL)+ | |
theme(panel.background = element_blank(), | |
plot.margin = ggplot2::margin(0, 0, 2, 0, "pt"))+ | |
geom_blank(aes(y = y_min-2)) + | |
geom_blank(aes(y = y_max+2)) | |
#align and plot | |
aligned <- align_plots(axis, pl, align = "h", axis = "l") | |
plot_grid(plotlist = aligned, nrow = 1, rel_widths = c(1, 5)) | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment