Skip to content

Instantly share code, notes, and snippets.

@robiwangriff
Last active October 19, 2023 10:41
Show Gist options
  • Save robiwangriff/79633a738b128e64226bf58381232da7 to your computer and use it in GitHub Desktop.
Save robiwangriff/79633a738b128e64226bf58381232da7 to your computer and use it in GitHub Desktop.
amplicon_helpers
##source_gist("https://gist.github.com/robiwangriff/79633a738b128e64226bf58381232da7")
#' # 1. Merge and filter
#'A small function to merge and align, and output a list containing the aligned $spc, $env, and $tax
#' files. The main part of this is merge() and requires specifying two "common reference" vectors in both datasets.
aligner<-function(spc,env,tax=NULL, spc_id="row.names",env_id="row.names",tax_id="row.names",transpose_spc=FALSE){
#transpose the spc file so sample IDs are rows (and row names)
spc.t<-spc
if(transpose_spc==TRUE){
spc.t=t(spc)}
#merge
spc_env_merged<- merge(spc.t, env, by.x=spc_id, by.y=env_id)
#fix the added Row.name column to be row.names (a nuance of merge())
rownames(spc_env_merged)=spc_env_merged$Row.names
spc_env_merged=spc_env_merged[2:length(spc_env_merged)]#get rid of Row.names
#split again into spc and env files
y<-ncol(spc.t)#counts how many ASVs - needed to split in right place
spc.out<-spc_env_merged[,1:y]
env.out<-spc_env_merged[,(y+1):ncol(spc_env_merged)]
#check tax file is aligned
if(!is.null(tax)){
if(tax_id=="row.names"){
tx<-row.names(tax)}else{
tx<-tax[,tax_id]
rownames(tax)<-tx #just adding tax identifiers to rownames, makes other things work
}
if(identical(colnames(spc.out),tx)==FALSE) {
warning("spc colnames and tax rownames are not ordered same. I fixed it for you.")
tax<-tax[order(tx,colnames(spc.t)),]
}
return(list(spc=spc.out,env=env.out,tax=tax))}else{
return(list(spc=spc.out,env=env.out))}
}
########################################################################################################
#' # 2. Get rid of samples with low reads
# A function to drop samples from a list containing spc env and tax(tax is left unchanged) and nreads.
drop_samples<-function(mylist, read_threshold=1000){
mylist$env<-mylist$env[rowSums(mylist$spc)>read_threshold,]
mylist$spc<-mylist$spc[rowSums(mylist$spc)>read_threshold,]
return(mylist)
}
#####################################################################################################################
#' # 3. Create another list element with spc containing standard number reads
#A function to rarify samples from a list containing spc env and tax.
#test
#mylist<-bac$spc
#resample_to=1000
rarefy_samples<-function(mylist,resample_to=1000){
require(vegan)
set.seed(35)#to reproduce
mylist$spc<-rrarefy(mylist$spc,resample_to)
if(!is.null(mylist$tax)){
mylist$tax<-mylist$tax[colSums(mylist$spc)>0,]
}
mylist$spc<-mylist$spc[,colSums(mylist$spc)>0]
return(mylist)
}
#############################################################################################################
#' # 4. A nice looking ordination plot showing 2 main effects
#test
#ord<-mod
#env<-mg.l$rare$env
#group1<-"plot"
#group2<-"season"
#titl<-"blah"
ncplot <- function(ord,env,group1,group2,titl,group1_cols,group2_cols){
require(ggordiplots)
require(ggnewscale)
p.b<-gg_ordiplot(ord, groups =env[,group1],ellipse=F,spider=T,label=F,pt.size = 1,plot=FALSE)
p.h<-gg_ordiplot(ord, groups = env[,group2],hull=T,spider=T,label=T,plot=FALSE)
ord.data<-p.b$df_ord
names(ord.data)<-c("x","y", group1)
ord.data[,group2]<-p.h$df_ord[,3]
hulls<-p.h$df_hull
names(hulls)<-c("Group","x1","y1")
plot.b<-ggplot(data=ord.data, aes_string(x=ord$points[,1], y=ord$points[,2],color=group1))+
geom_point(aes_string(shape=group2),size=2) +
#if group_cols specified
#scale_color_manual(values=group2_cols)+
#theme(legend.position="none")+
geom_segment(data=p.b$df_spiders, aes(x=cntr.x, xend=x, y=cntr.y, yend=y, color=Group),
show.legend = TRUE)+
geom_label(data=p.b$df_mean.ord, aes(x=x, y=y, label=Group, color=Group),
fontface="bold", size=3,
label.size = 1,
label.r = unit(0.3, "lines"), show.legend = TRUE)+
geom_polygon(data=hulls,aes(x=x1, y=y1, fill=Group),inherit.aes = FALSE,alpha=0.05,
show.legend = TRUE)+
new_scale_colour()+
#if group cols specified
#scale_fill_manual(values=group1_cols)+
theme_bw()+ theme(legend.position="none")+
theme(axis.title.x=element_blank(),
axis.text.x=element_blank(),
axis.ticks.x=element_blank(),axis.title.y=element_blank(),
axis.text.y=element_blank(),
axis.ticks.y=element_blank(),
panel.grid.major = element_blank(), panel.grid.minor = element_blank())+
geom_text(data=p.h$df_mean.ord, aes(x=x, y=y, label=Group),inherit.aes = FALSE,
fontface="bold", size=3,
#colour=c(group1_cols),
show.legend = TRUE)+ ggtitle(titl)
return(plot.b)
}
###############################################################################################################
#' # 5. Aggregate counts of taxa
#aggreg8 function
#requires
#spc: species data.frame samples as rows
#ann: annotation file, ordered so first column is most aggregated, subsequent are incrementally less
#fac: the column name in ann by which to aggregate
aggreg8<-function(spc,ann,fac){
#aggreg8 function
#requires
#spc: species data.frame samples as rows, cols are species
#ann: annotation file, row names should match spc colnames
#fac: the column name in ann by which to aggregate
ann[is.na(ann)] <-"" # just clear any NAs
stopifnot("spc colnames and ann rownames should be the same, fool!" = identical(colnames(spc),rownames(ann)))
tabfac = factor(ann[,fac])
tab = apply(t(spc), MARGIN = 2, function(x) {
tapply(x, INDEX = tabfac, FUN = sum, na.rm = TRUE, simplify = TRUE)
})
spc.out<-t(tab)
return(spc.out)
}
# eg bac.order<-aggregator(bac.final$rare$spc,bac.final$rare$tax,fac="Family")
###############################################################################################################
#' # 6. Remove duplicates by optimal number of reads. Sometimes duplicates get run, and you want to take them out.
#requires alignment of spc and env
rm_dups_x_reads<-function(spc,env, duplicated_var){
nreads<-rowSums(spc)
env.red<-env[which(abs(nreads) == ave(nreads, env[,duplicated_var],FUN=function(x) max(abs(x)))), ]
spc.red<-spc[which(abs(nreads) == ave(nreads, env[,duplicated_var],FUN=function(x) max(abs(x)))), ]
return(list(spc=spc.red,env=env.red))
}
###############################################################################################################
#' # 7.Drop and reorder samples in spc env list by vector.
#eg sometimes you may want to align across multiple data lists (bact, fungi mg...)
# sub_vec_common_samples<-Reduce(intersect, list(mg.l$sub$env$REP_ID19,rr.l$sub$env$REP_ID19,rp1.l$sub$env$REP_ID19))
drop_order<-function(mylist, env_vec,sub_vec ){
mylist$spc<-mylist$spc[match(sub_vec,mylist$env[,env_vec]),]
mylist$env<-mylist$env[match(sub_vec,mylist$env[,env_vec]),]
return(mylist)
}
###############################################################################################################
#' # 8. A gel like image of community data
ngs2gel<-function(spc,env,tax,subs=0.005,band_multiplier=200,order_samps,label_samps,label_tax_1="phylum",
label_tax_2="order", indie_seq_id,indie_group, indie_col, band_col="grey",y_labs_remover=2,y_labs_size=4){
#libs used
require(reshape2)
require(vegan)
require(ggh4x)
library(cowplot)
library(ggrepel)
#convert species df to proportions (0 to 1 scale)
spc<-decostand(spc,"total")
#remove species whose proportional abundance is less than "subs" argument & recode new "0" to NA
spc[spc<subs]<-0
spc<-spc[,colSums(spc)>0]
spc[spc==0]<-NA
#######################################################################
#reorder species randomly (NOT DONE)
#set.seed(32)
#ord<-sample(ncol(spc))
#spc<-spc[,ord]
#colnames(spc)<-1:ncol(spc)
########################################################################
#sort species according to taxonomy
#index the species names so they are no longer long sequences
taxa.sel<-colnames(spc)#separate seqs
#pull tax info for the subsetted taxa
tax_df<-cbind(tax[match(taxa.sel,rownames(tax)),],taxa.sel)
#add in unknown classification
tax_df[is.na(tax_df)]<-"Unknown"
#Now sort alphabetically by taxonomy (other sorts could be implemented eg phylogeny)
tax_df<-tax_df[do.call(order, tax_df), ]
#sort spc df according to ordered tax df
spc<-spc[,match(rownames(tax_df), colnames(spc))]
identical(rownames(tax_df),colnames(spc))#check
#add in a unique ID (index) for each taxon based on the taxonomy sort (this is to help order the y axis in plot)
colnames(spc)<-sprintf("%03d",1:ncol(spc))
tax_df$names_spc<-sprintf("%03d",1:nrow(tax_df))# create matching index for tax_df taxa
#add in indicator label (if requested)
if (hasArg(indie_seq_id)){
tax_df$indicator<-indie_group[match(rownames(tax_df),indie_seq_id)]
}
#################################################################################
#reorder rows (samples) according to "order_samps" argument
spc<-spc[with(env,do.call(order,mget(order_samps))),]
env<-env[with(env,do.call(order,mget(order_samps))),]
env<-env[,order_samps]
env$x<-1:nrow(env)# this is just a vector which could order the samples along x in final plot, but also allows spacing between "lanes"
##################################################################################
#make long df for ggplotting, with xy coords and abundances (for band fatness)
dat.w<-data.frame(spc,env,check.names=F)
dat.l<-melt(dat.w,id.vars=c(order_samps,"x"))
names(dat.l)[names(dat.l) == 'variable'] <- 'y'
names(dat.l)[names(dat.l) == 'value'] <- 'abund'
#add in selected taxon levels for labelling taxa to long df
dat.l$tax_1<-tax_df[match(dat.l$y,tax_df$names_spc),][,label_tax_1]
dat.l$tax_2<-tax_df[match(dat.l$y,tax_df$names_spc),][,label_tax_2]
#add in indicator designation (if requested)
if (hasArg(indie_seq_id)){
dat.l$indicator<-tax_df[match(dat.l$y,tax_df$names_spc),][,"indicator"]
}
#make a new name for labelling taxa
dat.l$id<-paste(dat.l$tax_2,dat.l$y,sep="_")
##############################################################
#we'll be plotting facets later, and want to add some padding to each facet at top and bottom
#min
dat.l$y_min<--1
#max which will be number of taxa per tax_1 designation (the facet) plus some padding
#count number of taxa per tax_1 taxonomic designation
tab<-data.frame(table(tax_df[,label_tax_1]))
dat.l$y_max<-tab[match(dat.l$tax_1,tab$Var1),]$Freq*1
############################################################################################
#we also wont have room on y axis to label all species, so need to subset somehow
#first need make a column containing labels for plotting using ggrepel.
#Because our plotting df is "long format", we need only one label name for each y value - ie non duplicated
dat.l$labels<-dat.l$id
dat.l$labels[duplicated(dat.l$labels)] <- "" #remove duplicates due to long df
dat.l$labels<-sub("_[^_]+$", "", dat.l$labels) #remove numeric index in name
#We can further filter these eg by abundance if the y is looking too busy
# This could be expanded to only include "indicators" or anything really of interest
#make table with abundances each taxon (ie make long df wide again)
tab<-data.frame(aggregate(dat.l$abund, by = list(dat.l$y), max,na.rm = TRUE))
tab$col<-ifelse(tab$x>y_labs_remover*subs,"black","transparent") # set colour code so low abund labels transparent
#add in group indicator colour codes (if indicator argument requested)
if (hasArg(indie_seq_id)) {
tab$indie_group<-tax_df[match(tab$Group.1,tax_df$names_spc),]$indicator
tab2<-data.frame(x=levels(as.factor(indie_group)),y=indie_col)
tab$indie_col<-tab2[match(tab$indie_group,tab2$x),]$y
tab$col<-ifelse(tab$col=="black"&tab$indie_group%in%indie_group,tab$indie_col, "transparent")
}
dat.l$y_lab_col<-tab[match(dat.l$y,tab$Group.1),]$col #add to long df
#also set the band colour
dat.l$band_col<-band_col
if (hasArg(indie_seq_id)) {
dat.l$band_col<-tab[match(dat.l$y,tab$Group.1),]$indie_col #add to long df
dat.l$band_col<-ifelse(is.na(dat.l$band_col),band_col,dat.l$band_col)
}
#not used but potentially useful for advanced labelling
#require(ggtext)
#dat.l$id2<-ifelse(dat.l$y_lab_col=="transparent",paste0("<span style='font-size: 0pt;color:NA'>", dat.l$id, "</span>"),dat.l$id)
#########################################################################################################
# Now plot. Two plots will be created, one being the plot (pl), the other the taxon labels (axis).
# They then get bound by cowplot
pl<-ggplot(dat.l,aes_string(x=paste0("interaction(x,",paste(rev(label_samps), collapse = ','),")") ))+
geom_blank()+
scale_x_discrete(NULL, guide = "axis_nested") +
geom_segment(aes(x = x-0.3, y = y, xend = x+0.3, yend = y),col=dat.l$band_col,lineend = "round",linewidth=band_multiplier*dat.l$abund)+
theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank(),
panel.background = element_blank(), axis.line = element_blank(),strip.background=element_rect(fill="grey96"))+
scale_y_discrete(drop=TRUE,expand=c(0,0),,labels = NULL, name = NULL)+
theme(strip.text.y.right = element_text(angle = 0),panel.spacing.y = unit(0.2, "lines"),
axis.text.y = element_text(size=y_labs_size, angle=30),axis.ticks.y = element_blank(),plot.margin = margin(0, 0, 2, 0, "pt"))+
facet_grid2(tax_1~., scales = "free", space = "free_y",strip = strip_vanilla(size = "variable",clip = "off"))+
theme(strip.text.y = element_text(size = 6))+
geom_blank(aes(y = y_min-2)) +
geom_blank(aes(y = y_max+2))+
theme(axis.ticks.x = element_line(color = indie_col[env[,order_samps[1]]],linewidth=2))
axis<-ggplot(dat.l,aes(x=0, y=y))+
geom_blank()+
theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank(),
panel.background = element_blank(), axis.line = element_blank(),strip.background=element_rect(fill="grey96"))+
scale_y_discrete(drop=TRUE,expand=c(0,0),labels = NULL, name = NULL)+
theme(panel.spacing.y = unit(0.2, "lines"),
axis.text.y = element_blank(),axis.ticks.y = element_blank(),strip.background = element_blank(),
strip.text.y = element_blank())+
facet_grid2(tax_1~., scales = "free", space = "free_y",strip = strip_vanilla(size = "variable",clip = "off"))+
geom_text_repel(label=dat.l$labels,y=dat.l$y,min.segment.length = grid::unit(0, "pt"),
color = dat.l$y_lab_col, ## ggplot2 theme_grey() axis text
size = y_labs_size, ## ggplot2 theme_grey() axis text
max.overlaps=Inf,
seed = 42,
force = 4,
force_pull = 0.0,
nudge_x = -1,
direction = "x",
hjust = 0,
segment.size = 0.0001,
segment.alpha = 0.1,
box.padding = 0.000
)+
scale_x_continuous(limits = c(-1, 0), expand = c(0, 0),
breaks = NULL, labels = NULL, name = NULL)+
theme(panel.background = element_blank(),
plot.margin = margin(0, 0, 2, 0, "pt"))+
geom_blank(aes(y = y_min-2)) +
geom_blank(aes(y = y_max+2))
#align and plot
aligned <- align_plots(axis, pl, align = "h", axis = "l")
plot_grid(plotlist = aligned, nrow = 1, rel_widths = c(2, 5))
}
#######USE
#without indies
#ngs2gel(bac$rare$spc,bac$rare$env,bac$rare$tax,subs=0.002,band_multiplier=200,order_samps=c("Fertiliser","block","pH"),label_samps=c("Fertiliser","block"),label_tax_1="phylum",
#label_tax_2="order", band_col="grey",y_labs_remover=1,y_labs_size=2)
#with indies
#ngs2gel(bac$rare$spc,bac$rare$env,bac$rare$tax,subs=0.001,band_multiplier=200,order_samps=c("Fertiliser","block","pH"),label_samps=c("Fertiliser","block"),label_tax_1="phylum",
#label_tax_2="order", indie_seq_id=inds[,"Row.names"],indie_group=factor(inds[,"group_name"],levels=c("NL","FL","L")),
#indie_col=c("red","blue","green"), band_col="grey",y_labs_remover=1,y_labs_size=2)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment