-
-
Save peterk87/d92f81ae475063792f49 to your computer and use it in GitHub Desktop.
R: Arcobacter butzleri CGF72 vs CGF40 tanglegrams
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# need dendextend(Rcpp) for generating tanglegram plot and untangling trees | |
require(dendextend) | |
require(dendextendRcpp) | |
library(RColorBrewer) | |
# read CGF40 lab binary data - mostly for names of CGF40 markers in the column headers | |
df_cgf40 <- read.table('CGF40_tangle_wMarkerNames.txt', sep='\t', header=T, row.names=1, check.names=F) | |
# read MIST match info | |
mist72 <- read.csv('mist-results.csv') | |
# use reshape2 library to take 'long' data and make it 'wide' - table to matrix | |
library(reshape2) | |
mat_mist_72_vs_4_ncbi <- acast(mist72, formula=Strain~marker, value.var='Result') | |
# write MIST CGF72 binary data matrix to CSV | |
write.csv(mat_mist_72_vs_4_ncbi, 'MIST-Arco_CGF72-4_NCBI_Abutzleri.csv') | |
# read correct CGF72 lab binary data | |
df_cgf72 <- read.table('CGF72_tangle_wMarkerNames_fixed.txt', sep='\t', header=T, row.names=1, check.names=F) | |
# verify that all markers in lab CGF72 data are present in MIST CGF72 data | |
setdiff(colnames(df_cgf72), colnames(mat_mist_72_vs_4_ncbi)) | |
# and vice versa | |
setdiff(colnames(mat_mist_72_vs_4_ncbi), colnames(df_cgf72)) | |
# MIST CGF72 matrix to dataframe | |
df_mist72_4 <- as.data.frame(mat_mist_72_vs_4_ncbi) | |
# re-order markers in same order as lab data | |
df_mist72_4 <- df_mist72_4[,colnames(df_cgf72)] | |
# merge the lab and MIST CGF72 data | |
df_merged <- rbind(df_cgf72, df_mist72_4) | |
# verify all CGF40 markers are in CGF72 | |
setdiff(colnames(df_cgf40), colnames(df_cgf72)) | |
mat_72 <- as.matrix(df_merged) | |
# get CGF40 subset from MIST+lab binary CGF72 data | |
mat_40 <- as.matrix(df_merged)[,markers_cgf40_order] | |
# Use a Manhattan distance matrix (approx for Hamming distance) for HC using complete linkage | |
# CGF72 HC and dendrogram | |
hc_72 <- hclust(dist(mat_72, method='man')) | |
dend_72 <- as.dendrogram(hc_72) | |
# CGF40 HC and dendrogram | |
hc_40 <- hclust(dist(mat_40, method='man')) | |
dend_40 <- as.dendrogram(hc_40) | |
# untangle CGF40 tree respective to CGF72 tree | |
dend_40_corr <- untangle_step_rotate_1side(dend1=dend_40, dend2_fixed=dend_72) | |
# get clusters at all distance levels - CGF72 has 72 markers so 0 to 72 differences | |
cl_72 <- cutree(hc_72, h=0:72) | |
# get strains in HC order | |
order_72 <- hc_72$labels[hc_72$order] | |
# get clusters in HC order | |
cl_72_order <- cl_72[order_72,] | |
# get number of clusters at each distance level | |
cl_72_max <- apply(cl_72_order, 2, max) | |
# up to 8 differences == 88.9% similiarity threshold for clusters | |
cl_72_order_h8 <- cl_72_order[,9] | |
max_72_cl_h8 <- max(cl_72_order_h8) | |
# get random branch colours for clusters at h=8 using RColorBrewer Set1 colorRampPalette colours | |
set1_colours <- colorRampPalette(RColorBrewer::brewer.pal(n=9, name='Set1')) | |
set.seed(1) # set RNG seed for reproducible colours | |
branch_colours <- sample(set1_colours(max_72_cl_h8), size=max_72_cl_h8, replace=F)[cl_72_order_h8] | |
# make singleton clusters transparent so multi-strain clusters are more visible | |
branch_colours[!(branch_colours %in% unique(branch_colours[duplicated(branch_colours)]))] <- rgb(1,1,1,0) | |
# save tanglegram to PDF | |
pdf('tanglegram-CGF72_vs_CGF40-h8_CGF72_multi_clusters.pdf', width=10, height=10) | |
tanglegram(dend_72, dend_40_corr, lab.cex=.5, color_lines=branch_colours, main_left='CGF72', main_right='CGF40') | |
dev.off() |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment