Last active
October 19, 2023 10:41
-
-
Save robiwangriff/79633a738b128e64226bf58381232da7 to your computer and use it in GitHub Desktop.
amplicon_helpers
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
##source_gist("https://gist.github.com/robiwangriff/79633a738b128e64226bf58381232da7") | |
#' # 1. Merge and filter | |
#'A small function to merge and align, and output a list containing the aligned $spc, $env, and $tax | |
#' files. The main part of this is merge() and requires specifying two "common reference" vectors in both datasets. | |
aligner<-function(spc,env,tax=NULL, spc_id="row.names",env_id="row.names",tax_id="row.names",transpose_spc=FALSE){ | |
#transpose the spc file so sample IDs are rows (and row names) | |
spc.t<-spc | |
if(transpose_spc==TRUE){ | |
spc.t=t(spc)} | |
#merge | |
spc_env_merged<- merge(spc.t, env, by.x=spc_id, by.y=env_id) | |
#fix the added Row.name column to be row.names (a nuance of merge()) | |
rownames(spc_env_merged)=spc_env_merged$Row.names | |
spc_env_merged=spc_env_merged[2:length(spc_env_merged)]#get rid of Row.names | |
#split again into spc and env files | |
y<-ncol(spc.t)#counts how many ASVs - needed to split in right place | |
spc.out<-spc_env_merged[,1:y] | |
env.out<-spc_env_merged[,(y+1):ncol(spc_env_merged)] | |
#check tax file is aligned | |
if(!is.null(tax)){ | |
if(tax_id=="row.names"){ | |
tx<-row.names(tax)}else{ | |
tx<-tax[,tax_id] | |
rownames(tax)<-tx #just adding tax identifiers to rownames, makes other things work | |
} | |
if(identical(colnames(spc.out),tx)==FALSE) { | |
warning("spc colnames and tax rownames are not ordered same. I fixed it for you.") | |
tax<-tax[order(tx,colnames(spc.t)),] | |
} | |
return(list(spc=spc.out,env=env.out,tax=tax))}else{ | |
return(list(spc=spc.out,env=env.out))} | |
} | |
######################################################################################################## | |
#' # 2. Get rid of samples with low reads | |
# A function to drop samples from a list containing spc env and tax(tax is left unchanged) and nreads. | |
drop_samples<-function(mylist, read_threshold=1000){ | |
mylist$env<-mylist$env[rowSums(mylist$spc)>read_threshold,] | |
mylist$spc<-mylist$spc[rowSums(mylist$spc)>read_threshold,] | |
return(mylist) | |
} | |
##################################################################################################################### | |
#' # 3. Create another list element with spc containing standard number reads | |
#A function to rarify samples from a list containing spc env and tax. | |
#test | |
#mylist<-bac$spc | |
#resample_to=1000 | |
rarefy_samples<-function(mylist,resample_to=1000){ | |
require(vegan) | |
set.seed(35)#to reproduce | |
mylist$spc<-rrarefy(mylist$spc,resample_to) | |
if(!is.null(mylist$tax)){ | |
mylist$tax<-mylist$tax[colSums(mylist$spc)>0,] | |
} | |
mylist$spc<-mylist$spc[,colSums(mylist$spc)>0] | |
return(mylist) | |
} | |
############################################################################################################# | |
#' # 4. A nice looking ordination plot showing 2 main effects | |
#test | |
#ord<-mod | |
#env<-mg.l$rare$env | |
#group1<-"plot" | |
#group2<-"season" | |
#titl<-"blah" | |
ncplot <- function(ord,env,group1,group2,titl,group1_cols,group2_cols){ | |
require(ggordiplots) | |
require(ggnewscale) | |
p.b<-gg_ordiplot(ord, groups =env[,group1],ellipse=F,spider=T,label=F,pt.size = 1,plot=FALSE) | |
p.h<-gg_ordiplot(ord, groups = env[,group2],hull=T,spider=T,label=T,plot=FALSE) | |
ord.data<-p.b$df_ord | |
names(ord.data)<-c("x","y", group1) | |
ord.data[,group2]<-p.h$df_ord[,3] | |
hulls<-p.h$df_hull | |
names(hulls)<-c("Group","x1","y1") | |
plot.b<-ggplot(data=ord.data, aes_string(x=ord$points[,1], y=ord$points[,2],color=group1))+ | |
geom_point(aes_string(shape=group2),size=2) + | |
#if group_cols specified | |
#scale_color_manual(values=group2_cols)+ | |
#theme(legend.position="none")+ | |
geom_segment(data=p.b$df_spiders, aes(x=cntr.x, xend=x, y=cntr.y, yend=y, color=Group), | |
show.legend = TRUE)+ | |
geom_label(data=p.b$df_mean.ord, aes(x=x, y=y, label=Group, color=Group), | |
fontface="bold", size=3, | |
label.size = 1, | |
label.r = unit(0.3, "lines"), show.legend = TRUE)+ | |
geom_polygon(data=hulls,aes(x=x1, y=y1, fill=Group),inherit.aes = FALSE,alpha=0.05, | |
show.legend = TRUE)+ | |
new_scale_colour()+ | |
#if group cols specified | |
#scale_fill_manual(values=group1_cols)+ | |
theme_bw()+ theme(legend.position="none")+ | |
theme(axis.title.x=element_blank(), | |
axis.text.x=element_blank(), | |
axis.ticks.x=element_blank(),axis.title.y=element_blank(), | |
axis.text.y=element_blank(), | |
axis.ticks.y=element_blank(), | |
panel.grid.major = element_blank(), panel.grid.minor = element_blank())+ | |
geom_text(data=p.h$df_mean.ord, aes(x=x, y=y, label=Group),inherit.aes = FALSE, | |
fontface="bold", size=3, | |
#colour=c(group1_cols), | |
show.legend = TRUE)+ ggtitle(titl) | |
return(plot.b) | |
} | |
############################################################################################################### | |
#' # 5. Aggregate counts of taxa | |
#aggreg8 function | |
#requires | |
#spc: species data.frame samples as rows | |
#ann: annotation file, ordered so first column is most aggregated, subsequent are incrementally less | |
#fac: the column name in ann by which to aggregate | |
aggreg8<-function(spc,ann,fac){ | |
#aggreg8 function | |
#requires | |
#spc: species data.frame samples as rows, cols are species | |
#ann: annotation file, row names should match spc colnames | |
#fac: the column name in ann by which to aggregate | |
ann[is.na(ann)] <-"" # just clear any NAs | |
stopifnot("spc colnames and ann rownames should be the same, fool!" = identical(colnames(spc),rownames(ann))) | |
tabfac = factor(ann[,fac]) | |
tab = apply(t(spc), MARGIN = 2, function(x) { | |
tapply(x, INDEX = tabfac, FUN = sum, na.rm = TRUE, simplify = TRUE) | |
}) | |
spc.out<-t(tab) | |
return(spc.out) | |
} | |
# eg bac.order<-aggregator(bac.final$rare$spc,bac.final$rare$tax,fac="Family") | |
############################################################################################################### | |
#' # 6. Remove duplicates by optimal number of reads. Sometimes duplicates get run, and you want to take them out. | |
#requires alignment of spc and env | |
rm_dups_x_reads<-function(spc,env, duplicated_var){ | |
nreads<-rowSums(spc) | |
env.red<-env[which(abs(nreads) == ave(nreads, env[,duplicated_var],FUN=function(x) max(abs(x)))), ] | |
spc.red<-spc[which(abs(nreads) == ave(nreads, env[,duplicated_var],FUN=function(x) max(abs(x)))), ] | |
return(list(spc=spc.red,env=env.red)) | |
} | |
############################################################################################################### | |
#' # 7.Drop and reorder samples in spc env list by vector. | |
#eg sometimes you may want to align across multiple data lists (bact, fungi mg...) | |
# sub_vec_common_samples<-Reduce(intersect, list(mg.l$sub$env$REP_ID19,rr.l$sub$env$REP_ID19,rp1.l$sub$env$REP_ID19)) | |
drop_order<-function(mylist, env_vec,sub_vec ){ | |
mylist$spc<-mylist$spc[match(sub_vec,mylist$env[,env_vec]),] | |
mylist$env<-mylist$env[match(sub_vec,mylist$env[,env_vec]),] | |
return(mylist) | |
} | |
############################################################################################################### | |
#' # 8. A gel like image of community data | |
ngs2gel<-function(spc,env,tax,subs=0.005,band_multiplier=200,order_samps,label_samps,label_tax_1="phylum", | |
label_tax_2="order", indie_seq_id,indie_group, indie_col, band_col="grey",y_labs_remover=2,y_labs_size=4){ | |
#libs used | |
require(reshape2) | |
require(vegan) | |
require(ggh4x) | |
library(cowplot) | |
library(ggrepel) | |
#convert species df to proportions (0 to 1 scale) | |
spc<-decostand(spc,"total") | |
#remove species whose proportional abundance is less than "subs" argument & recode new "0" to NA | |
spc[spc<subs]<-0 | |
spc<-spc[,colSums(spc)>0] | |
spc[spc==0]<-NA | |
####################################################################### | |
#reorder species randomly (NOT DONE) | |
#set.seed(32) | |
#ord<-sample(ncol(spc)) | |
#spc<-spc[,ord] | |
#colnames(spc)<-1:ncol(spc) | |
######################################################################## | |
#sort species according to taxonomy | |
#index the species names so they are no longer long sequences | |
taxa.sel<-colnames(spc)#separate seqs | |
#pull tax info for the subsetted taxa | |
tax_df<-cbind(tax[match(taxa.sel,rownames(tax)),],taxa.sel) | |
#add in unknown classification | |
tax_df[is.na(tax_df)]<-"Unknown" | |
#Now sort alphabetically by taxonomy (other sorts could be implemented eg phylogeny) | |
tax_df<-tax_df[do.call(order, tax_df), ] | |
#sort spc df according to ordered tax df | |
spc<-spc[,match(rownames(tax_df), colnames(spc))] | |
identical(rownames(tax_df),colnames(spc))#check | |
#add in a unique ID (index) for each taxon based on the taxonomy sort (this is to help order the y axis in plot) | |
colnames(spc)<-sprintf("%03d",1:ncol(spc)) | |
tax_df$names_spc<-sprintf("%03d",1:nrow(tax_df))# create matching index for tax_df taxa | |
#add in indicator label (if requested) | |
if (hasArg(indie_seq_id)){ | |
tax_df$indicator<-indie_group[match(rownames(tax_df),indie_seq_id)] | |
} | |
################################################################################# | |
#reorder rows (samples) according to "order_samps" argument | |
spc<-spc[with(env,do.call(order,mget(order_samps))),] | |
env<-env[with(env,do.call(order,mget(order_samps))),] | |
env<-env[,order_samps] | |
env$x<-1:nrow(env)# this is just a vector which could order the samples along x in final plot, but also allows spacing between "lanes" | |
################################################################################## | |
#make long df for ggplotting, with xy coords and abundances (for band fatness) | |
dat.w<-data.frame(spc,env,check.names=F) | |
dat.l<-melt(dat.w,id.vars=c(order_samps,"x")) | |
names(dat.l)[names(dat.l) == 'variable'] <- 'y' | |
names(dat.l)[names(dat.l) == 'value'] <- 'abund' | |
#add in selected taxon levels for labelling taxa to long df | |
dat.l$tax_1<-tax_df[match(dat.l$y,tax_df$names_spc),][,label_tax_1] | |
dat.l$tax_2<-tax_df[match(dat.l$y,tax_df$names_spc),][,label_tax_2] | |
#add in indicator designation (if requested) | |
if (hasArg(indie_seq_id)){ | |
dat.l$indicator<-tax_df[match(dat.l$y,tax_df$names_spc),][,"indicator"] | |
} | |
#make a new name for labelling taxa | |
dat.l$id<-paste(dat.l$tax_2,dat.l$y,sep="_") | |
############################################################## | |
#we'll be plotting facets later, and want to add some padding to each facet at top and bottom | |
#min | |
dat.l$y_min<--1 | |
#max which will be number of taxa per tax_1 designation (the facet) plus some padding | |
#count number of taxa per tax_1 taxonomic designation | |
tab<-data.frame(table(tax_df[,label_tax_1])) | |
dat.l$y_max<-tab[match(dat.l$tax_1,tab$Var1),]$Freq*1 | |
############################################################################################ | |
#we also wont have room on y axis to label all species, so need to subset somehow | |
#first need make a column containing labels for plotting using ggrepel. | |
#Because our plotting df is "long format", we need only one label name for each y value - ie non duplicated | |
dat.l$labels<-dat.l$id | |
dat.l$labels[duplicated(dat.l$labels)] <- "" #remove duplicates due to long df | |
dat.l$labels<-sub("_[^_]+$", "", dat.l$labels) #remove numeric index in name | |
#We can further filter these eg by abundance if the y is looking too busy | |
# This could be expanded to only include "indicators" or anything really of interest | |
#make table with abundances each taxon (ie make long df wide again) | |
tab<-data.frame(aggregate(dat.l$abund, by = list(dat.l$y), max,na.rm = TRUE)) | |
tab$col<-ifelse(tab$x>y_labs_remover*subs,"black","transparent") # set colour code so low abund labels transparent | |
#add in group indicator colour codes (if indicator argument requested) | |
if (hasArg(indie_seq_id)) { | |
tab$indie_group<-tax_df[match(tab$Group.1,tax_df$names_spc),]$indicator | |
tab2<-data.frame(x=levels(as.factor(indie_group)),y=indie_col) | |
tab$indie_col<-tab2[match(tab$indie_group,tab2$x),]$y | |
tab$col<-ifelse(tab$col=="black"&tab$indie_group%in%indie_group,tab$indie_col, "transparent") | |
} | |
dat.l$y_lab_col<-tab[match(dat.l$y,tab$Group.1),]$col #add to long df | |
#also set the band colour | |
dat.l$band_col<-band_col | |
if (hasArg(indie_seq_id)) { | |
dat.l$band_col<-tab[match(dat.l$y,tab$Group.1),]$indie_col #add to long df | |
dat.l$band_col<-ifelse(is.na(dat.l$band_col),band_col,dat.l$band_col) | |
} | |
#not used but potentially useful for advanced labelling | |
#require(ggtext) | |
#dat.l$id2<-ifelse(dat.l$y_lab_col=="transparent",paste0("<span style='font-size: 0pt;color:NA'>", dat.l$id, "</span>"),dat.l$id) | |
######################################################################################################### | |
# Now plot. Two plots will be created, one being the plot (pl), the other the taxon labels (axis). | |
# They then get bound by cowplot | |
pl<-ggplot(dat.l,aes_string(x=paste0("interaction(x,",paste(rev(label_samps), collapse = ','),")") ))+ | |
geom_blank()+ | |
scale_x_discrete(NULL, guide = "axis_nested") + | |
geom_segment(aes(x = x-0.3, y = y, xend = x+0.3, yend = y),col=dat.l$band_col,lineend = "round",linewidth=band_multiplier*dat.l$abund)+ | |
theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank(), | |
panel.background = element_blank(), axis.line = element_blank(),strip.background=element_rect(fill="grey96"))+ | |
scale_y_discrete(drop=TRUE,expand=c(0,0),,labels = NULL, name = NULL)+ | |
theme(strip.text.y.right = element_text(angle = 0),panel.spacing.y = unit(0.2, "lines"), | |
axis.text.y = element_text(size=y_labs_size, angle=30),axis.ticks.y = element_blank(),plot.margin = margin(0, 0, 2, 0, "pt"))+ | |
facet_grid2(tax_1~., scales = "free", space = "free_y",strip = strip_vanilla(size = "variable",clip = "off"))+ | |
theme(strip.text.y = element_text(size = 6))+ | |
geom_blank(aes(y = y_min-2)) + | |
geom_blank(aes(y = y_max+2))+ | |
theme(axis.ticks.x = element_line(color = indie_col[env[,order_samps[1]]],linewidth=2)) | |
axis<-ggplot(dat.l,aes(x=0, y=y))+ | |
geom_blank()+ | |
theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank(), | |
panel.background = element_blank(), axis.line = element_blank(),strip.background=element_rect(fill="grey96"))+ | |
scale_y_discrete(drop=TRUE,expand=c(0,0),labels = NULL, name = NULL)+ | |
theme(panel.spacing.y = unit(0.2, "lines"), | |
axis.text.y = element_blank(),axis.ticks.y = element_blank(),strip.background = element_blank(), | |
strip.text.y = element_blank())+ | |
facet_grid2(tax_1~., scales = "free", space = "free_y",strip = strip_vanilla(size = "variable",clip = "off"))+ | |
geom_text_repel(label=dat.l$labels,y=dat.l$y,min.segment.length = grid::unit(0, "pt"), | |
color = dat.l$y_lab_col, ## ggplot2 theme_grey() axis text | |
size = y_labs_size, ## ggplot2 theme_grey() axis text | |
max.overlaps=Inf, | |
seed = 42, | |
force = 4, | |
force_pull = 0.0, | |
nudge_x = -1, | |
direction = "x", | |
hjust = 0, | |
segment.size = 0.0001, | |
segment.alpha = 0.1, | |
box.padding = 0.000 | |
)+ | |
scale_x_continuous(limits = c(-1, 0), expand = c(0, 0), | |
breaks = NULL, labels = NULL, name = NULL)+ | |
theme(panel.background = element_blank(), | |
plot.margin = margin(0, 0, 2, 0, "pt"))+ | |
geom_blank(aes(y = y_min-2)) + | |
geom_blank(aes(y = y_max+2)) | |
#align and plot | |
aligned <- align_plots(axis, pl, align = "h", axis = "l") | |
plot_grid(plotlist = aligned, nrow = 1, rel_widths = c(2, 5)) | |
} | |
#######USE | |
#without indies | |
#ngs2gel(bac$rare$spc,bac$rare$env,bac$rare$tax,subs=0.002,band_multiplier=200,order_samps=c("Fertiliser","block","pH"),label_samps=c("Fertiliser","block"),label_tax_1="phylum", | |
#label_tax_2="order", band_col="grey",y_labs_remover=1,y_labs_size=2) | |
#with indies | |
#ngs2gel(bac$rare$spc,bac$rare$env,bac$rare$tax,subs=0.001,band_multiplier=200,order_samps=c("Fertiliser","block","pH"),label_samps=c("Fertiliser","block"),label_tax_1="phylum", | |
#label_tax_2="order", indie_seq_id=inds[,"Row.names"],indie_group=factor(inds[,"group_name"],levels=c("NL","FL","L")), | |
#indie_col=c("red","blue","green"), band_col="grey",y_labs_remover=1,y_labs_size=2) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment