Skip to content

Instantly share code, notes, and snippets.

@peterk87
Created March 13, 2014 08:09
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save peterk87/d92f81ae475063792f49 to your computer and use it in GitHub Desktop.
Save peterk87/d92f81ae475063792f49 to your computer and use it in GitHub Desktop.
R: Arcobacter butzleri CGF72 vs CGF40 tanglegrams
# need dendextend(Rcpp) for generating tanglegram plot and untangling trees
require(dendextend)
require(dendextendRcpp)
library(RColorBrewer)
# read CGF40 lab binary data - mostly for names of CGF40 markers in the column headers
df_cgf40 <- read.table('CGF40_tangle_wMarkerNames.txt', sep='\t', header=T, row.names=1, check.names=F)
# read MIST match info
mist72 <- read.csv('mist-results.csv')
# use reshape2 library to take 'long' data and make it 'wide' - table to matrix
library(reshape2)
mat_mist_72_vs_4_ncbi <- acast(mist72, formula=Strain~marker, value.var='Result')
# write MIST CGF72 binary data matrix to CSV
write.csv(mat_mist_72_vs_4_ncbi, 'MIST-Arco_CGF72-4_NCBI_Abutzleri.csv')
# read correct CGF72 lab binary data
df_cgf72 <- read.table('CGF72_tangle_wMarkerNames_fixed.txt', sep='\t', header=T, row.names=1, check.names=F)
# verify that all markers in lab CGF72 data are present in MIST CGF72 data
setdiff(colnames(df_cgf72), colnames(mat_mist_72_vs_4_ncbi))
# and vice versa
setdiff(colnames(mat_mist_72_vs_4_ncbi), colnames(df_cgf72))
# MIST CGF72 matrix to dataframe
df_mist72_4 <- as.data.frame(mat_mist_72_vs_4_ncbi)
# re-order markers in same order as lab data
df_mist72_4 <- df_mist72_4[,colnames(df_cgf72)]
# merge the lab and MIST CGF72 data
df_merged <- rbind(df_cgf72, df_mist72_4)
# verify all CGF40 markers are in CGF72
setdiff(colnames(df_cgf40), colnames(df_cgf72))
mat_72 <- as.matrix(df_merged)
# get CGF40 subset from MIST+lab binary CGF72 data
mat_40 <- as.matrix(df_merged)[,markers_cgf40_order]
# Use a Manhattan distance matrix (approx for Hamming distance) for HC using complete linkage
# CGF72 HC and dendrogram
hc_72 <- hclust(dist(mat_72, method='man'))
dend_72 <- as.dendrogram(hc_72)
# CGF40 HC and dendrogram
hc_40 <- hclust(dist(mat_40, method='man'))
dend_40 <- as.dendrogram(hc_40)
# untangle CGF40 tree respective to CGF72 tree
dend_40_corr <- untangle_step_rotate_1side(dend1=dend_40, dend2_fixed=dend_72)
# get clusters at all distance levels - CGF72 has 72 markers so 0 to 72 differences
cl_72 <- cutree(hc_72, h=0:72)
# get strains in HC order
order_72 <- hc_72$labels[hc_72$order]
# get clusters in HC order
cl_72_order <- cl_72[order_72,]
# get number of clusters at each distance level
cl_72_max <- apply(cl_72_order, 2, max)
# up to 8 differences == 88.9% similiarity threshold for clusters
cl_72_order_h8 <- cl_72_order[,9]
max_72_cl_h8 <- max(cl_72_order_h8)
# get random branch colours for clusters at h=8 using RColorBrewer Set1 colorRampPalette colours
set1_colours <- colorRampPalette(RColorBrewer::brewer.pal(n=9, name='Set1'))
set.seed(1) # set RNG seed for reproducible colours
branch_colours <- sample(set1_colours(max_72_cl_h8), size=max_72_cl_h8, replace=F)[cl_72_order_h8]
# make singleton clusters transparent so multi-strain clusters are more visible
branch_colours[!(branch_colours %in% unique(branch_colours[duplicated(branch_colours)]))] <- rgb(1,1,1,0)
# save tanglegram to PDF
pdf('tanglegram-CGF72_vs_CGF40-h8_CGF72_multi_clusters.pdf', width=10, height=10)
tanglegram(dend_72, dend_40_corr, lab.cex=.5, color_lines=branch_colours, main_left='CGF72', main_right='CGF40')
dev.off()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment