peterk87/arco_tanglegrams.R Secret

## arco_tanglegrams.R
# need dendextend(Rcpp) for generating tanglegram plot and untangling trees
require(dendextend)
require(dendextendRcpp)
library(RColorBrewer)

# read CGF40 lab binary data - mostly for names of CGF40 markers in the column headers
df_cgf40 <- read.table('CGF40_tangle_wMarkerNames.txt', sep='\t', header=T, row.names=1, check.names=F)

# read MIST match info
mist72 <- read.csv('mist-results.csv')

# use reshape2 library to take 'long' data and make it 'wide' - table to matrix
library(reshape2)
mat_mist_72_vs_4_ncbi <- acast(mist72, formula=Strain~marker, value.var='Result')
# write MIST CGF72 binary data matrix to CSV
write.csv(mat_mist_72_vs_4_ncbi, 'MIST-Arco_CGF72-4_NCBI_Abutzleri.csv')
# read correct CGF72 lab binary data
df_cgf72 <- read.table('CGF72_tangle_wMarkerNames_fixed.txt', sep='\t', header=T, row.names=1, check.names=F)
# verify that all markers in lab CGF72 data are present in MIST CGF72 data
setdiff(colnames(df_cgf72), colnames(mat_mist_72_vs_4_ncbi))
# and vice versa
setdiff(colnames(mat_mist_72_vs_4_ncbi), colnames(df_cgf72))
# MIST CGF72 matrix to dataframe
df_mist72_4 <- as.data.frame(mat_mist_72_vs_4_ncbi)
# re-order markers in same order as lab data
df_mist72_4 <- df_mist72_4[,colnames(df_cgf72)]
# merge the lab and MIST CGF72 data
df_merged <- rbind(df_cgf72, df_mist72_4)

# verify all CGF40 markers are in CGF72
setdiff(colnames(df_cgf40), colnames(df_cgf72))

mat_72 <- as.matrix(df_merged)
# get CGF40 subset from MIST+lab binary CGF72 data
mat_40 <- as.matrix(df_merged)[,markers_cgf40_order]


# Use a Manhattan distance matrix (approx for Hamming distance) for HC using complete linkage
# CGF72 HC and dendrogram
hc_72 <- hclust(dist(mat_72, method='man'))
dend_72 <- as.dendrogram(hc_72)
# CGF40 HC and dendrogram
hc_40 <- hclust(dist(mat_40, method='man'))
dend_40 <- as.dendrogram(hc_40)

# untangle CGF40 tree respective to CGF72 tree
dend_40_corr <- untangle_step_rotate_1side(dend1=dend_40, dend2_fixed=dend_72)

# get clusters at all distance levels - CGF72 has 72 markers so 0 to 72 differences
cl_72 <- cutree(hc_72, h=0:72)
# get strains in HC order
order_72 <- hc_72$labels[hc_72$order]
# get clusters in HC order
cl_72_order <- cl_72[order_72,]
# get number of clusters at each distance level
cl_72_max <- apply(cl_72_order, 2, max)
# up to 8 differences == 88.9% similiarity threshold for clusters
cl_72_order_h8 <- cl_72_order[,9]
max_72_cl_h8 <- max(cl_72_order_h8)
# get random branch colours for clusters at h=8 using RColorBrewer Set1 colorRampPalette colours
set1_colours <- colorRampPalette(RColorBrewer::brewer.pal(n=9, name='Set1'))
set.seed(1) # set RNG seed for reproducible colours
branch_colours <- sample(set1_colours(max_72_cl_h8), size=max_72_cl_h8, replace=F)[cl_72_order_h8]
# make singleton clusters transparent so multi-strain clusters are more visible
branch_colours[!(branch_colours %in% unique(branch_colours[duplicated(branch_colours)]))] <- rgb(1,1,1,0)
# save tanglegram to PDF
pdf('tanglegram-CGF72_vs_CGF40-h8_CGF72_multi_clusters.pdf', width=10, height=10)
tanglegram(dend_72, dend_40_corr, lab.cex=.5, color_lines=branch_colours, main_left='CGF72', main_right='CGF40')
dev.off()
	# need dendextend(Rcpp) for generating tanglegram plot and untangling trees
	require(dendextend)
	require(dendextendRcpp)
	library(RColorBrewer)

	# read CGF40 lab binary data - mostly for names of CGF40 markers in the column headers
	df_cgf40 <- read.table('CGF40_tangle_wMarkerNames.txt', sep='\t', header=T, row.names=1, check.names=F)

	# read MIST match info
	mist72 <- read.csv('mist-results.csv')

	# use reshape2 library to take 'long' data and make it 'wide' - table to matrix
	library(reshape2)
	mat_mist_72_vs_4_ncbi <- acast(mist72, formula=Strain~marker, value.var='Result')
	# write MIST CGF72 binary data matrix to CSV
	write.csv(mat_mist_72_vs_4_ncbi, 'MIST-Arco_CGF72-4_NCBI_Abutzleri.csv')
	# read correct CGF72 lab binary data
	df_cgf72 <- read.table('CGF72_tangle_wMarkerNames_fixed.txt', sep='\t', header=T, row.names=1, check.names=F)
	# verify that all markers in lab CGF72 data are present in MIST CGF72 data
	setdiff(colnames(df_cgf72), colnames(mat_mist_72_vs_4_ncbi))
	# and vice versa
	setdiff(colnames(mat_mist_72_vs_4_ncbi), colnames(df_cgf72))
	# MIST CGF72 matrix to dataframe
	df_mist72_4 <- as.data.frame(mat_mist_72_vs_4_ncbi)
	# re-order markers in same order as lab data
	df_mist72_4 <- df_mist72_4[,colnames(df_cgf72)]
	# merge the lab and MIST CGF72 data
	df_merged <- rbind(df_cgf72, df_mist72_4)

	# verify all CGF40 markers are in CGF72
	setdiff(colnames(df_cgf40), colnames(df_cgf72))

	mat_72 <- as.matrix(df_merged)
	# get CGF40 subset from MIST+lab binary CGF72 data
	mat_40 <- as.matrix(df_merged)[,markers_cgf40_order]


	# Use a Manhattan distance matrix (approx for Hamming distance) for HC using complete linkage
	# CGF72 HC and dendrogram
	hc_72 <- hclust(dist(mat_72, method='man'))
	dend_72 <- as.dendrogram(hc_72)
	# CGF40 HC and dendrogram
	hc_40 <- hclust(dist(mat_40, method='man'))
	dend_40 <- as.dendrogram(hc_40)

	# untangle CGF40 tree respective to CGF72 tree
	dend_40_corr <- untangle_step_rotate_1side(dend1=dend_40, dend2_fixed=dend_72)

	# get clusters at all distance levels - CGF72 has 72 markers so 0 to 72 differences
	cl_72 <- cutree(hc_72, h=0:72)
	# get strains in HC order
	order_72 <- hc_72$labels[hc_72$order]
	# get clusters in HC order
	cl_72_order <- cl_72[order_72,]
	# get number of clusters at each distance level
	cl_72_max <- apply(cl_72_order, 2, max)
	# up to 8 differences == 88.9% similiarity threshold for clusters
	cl_72_order_h8 <- cl_72_order[,9]
	max_72_cl_h8 <- max(cl_72_order_h8)
	# get random branch colours for clusters at h=8 using RColorBrewer Set1 colorRampPalette colours
	set1_colours <- colorRampPalette(RColorBrewer::brewer.pal(n=9, name='Set1'))
	set.seed(1) # set RNG seed for reproducible colours
	branch_colours <- sample(set1_colours(max_72_cl_h8), size=max_72_cl_h8, replace=F)[cl_72_order_h8]
	# make singleton clusters transparent so multi-strain clusters are more visible
	branch_colours[!(branch_colours %in% unique(branch_colours[duplicated(branch_colours)]))] <- rgb(1,1,1,0)
	# save tanglegram to PDF
	pdf('tanglegram-CGF72_vs_CGF40-h8_CGF72_multi_clusters.pdf', width=10, height=10)
	tanglegram(dend_72, dend_40_corr, lab.cex=.5, color_lines=branch_colours, main_left='CGF72', main_right='CGF40')
	dev.off()