roblanf/concordance.R

## concordance.R
library(viridis)
library(ggplot2)
library(dplyr)
library(ggrepel)
library(GGally)
library(entropy)

# read the data
d = read.delim("concord.cf.stat", header = T, comment.char=‘#')

names(d)[10] = "bootstrap"
names(d)[11] = "branchlength"


# plot the values
ggplot(d, aes(x = gCF, y = sCF)) +
    geom_point(aes(colour = bootstrap)) +
    scale_colour_viridis(direction = -1) +
    xlim(0, 100) +
    ylim(0, 100) +
    geom_abline(slope = 1, intercept = 0, linetype = "dashed")

# label the points
ggplot(d, aes(x = gCF, y = sCF, label = ID)) +
    geom_point(aes(colour = bootstrap)) +
    scale_colour_viridis(direction = -1) +
    xlim(0, 100) +
    ylim(0, 100) +
    geom_abline(slope = 1, intercept = 0, linetype = "dashed") +
    geom_text_repel()


# find a branch of interest
d[d$ID==327,]


# test ILS assumptions

# first we use a slightly modified chisq function
# which behaves nicely when you feed it zeros
chisq = function(DF1, DF2, N){
    tryCatch({
        # converts percentages to counts, runs chisq, gets pvalue
        chisq.test(c(round(DF1*N)/100, round(DF2*N)/100))$p.value
    },
    error = function(err) {
        # errors come if you give chisq two zeros
        # but here we're sure that there's no difference
        return(1.0)
    })
}

e = d %>%
    group_by(ID) %>%
    mutate(gEF_p = chisq(gDF1, gDF2, gN)) %>%
    mutate(sEF_p = chisq(sDF1, sDF2, sN))


subset(data.frame(e), (gEF_p < 0.05 | sEF_p < 0.05))


# calculate internode certainty

IC = function(CF, DF1, DF2, N){

    # convert to counts
    X = CF * N / 100
    Y = max(DF1, DF2) * N / 100

    pX = X/(X+Y)
    pY = Y/(X+Y)

    IC = 1 + pX * log2(pX) +
             pY * log2(pY)

    return(IC)
}


e = e %>%
    group_by(ID) %>%
    mutate(gIC = IC(gCF, gDF1, gDF2, gN)) %>%
    mutate(sIC = IC(sCF, sDF1, sDF2, sN))


# entropy

ENT = function(CF, DF1, DF2, N){

    CF = CF * N / 100
    DF1 = DF1 * N / 100
    DF2 = DF2 * N / 100

    return(entropy(c(CF, DF1, DF2)))

}

ENTC = function(CF, DF1, DF2, N){

    maxent = 1.098612

    CF = CF * N / 100
    DF1 = DF1 * N / 100
    DF2 = DF2 * N / 100

    ent = entropy(c(CF, DF1, DF2))

    entc = 1 - (ent / maxent)

    return(entc)

}

e = e %>%
    group_by(ID) %>%
    mutate(sENT = ENT(sCF, sDF1, sDF2, sN)) %>%
    mutate(sENTC = ENTC(sCF, sDF1, sDF2, sN))

head(e)

# plot it
ggpairs(e, columns = c(2, 6, 10, 12, 13, 14, 15, 16, 17))
	library(viridis)
	library(ggplot2)
	library(dplyr)
	library(ggrepel)
	library(GGally)
	library(entropy)

	# read the data
	d = read.delim("concord.cf.stat", header = T, comment.char=‘#')

	names(d)[10] = "bootstrap"
	names(d)[11] = "branchlength"


	# plot the values
	ggplot(d, aes(x = gCF, y = sCF)) +
	geom_point(aes(colour = bootstrap)) +
	scale_colour_viridis(direction = -1) +
	xlim(0, 100) +
	ylim(0, 100) +
	geom_abline(slope = 1, intercept = 0, linetype = "dashed")

	# label the points
	ggplot(d, aes(x = gCF, y = sCF, label = ID)) +
	geom_point(aes(colour = bootstrap)) +
	scale_colour_viridis(direction = -1) +
	xlim(0, 100) +
	ylim(0, 100) +
	geom_abline(slope = 1, intercept = 0, linetype = "dashed") +
	geom_text_repel()


	# find a branch of interest
	d[d$ID==327,]



	# test ILS assumptions

	# first we use a slightly modified chisq function
	# which behaves nicely when you feed it zeros
	chisq = function(DF1, DF2, N){
	tryCatch({
	# converts percentages to counts, runs chisq, gets pvalue
	chisq.test(c(round(DF1N)/100, round(DF2N)/100))$p.value
	},
	error = function(err) {
	# errors come if you give chisq two zeros
	# but here we're sure that there's no difference
	return(1.0)
	})
	}

	e = d %>%
	group_by(ID) %>%
	mutate(gEF_p = chisq(gDF1, gDF2, gN)) %>%
	mutate(sEF_p = chisq(sDF1, sDF2, sN))


	subset(data.frame(e), (gEF_p < 0.05 \| sEF_p < 0.05))



	# calculate internode certainty

	IC = function(CF, DF1, DF2, N){

	# convert to counts
	X = CF * N / 100
	Y = max(DF1, DF2) * N / 100

	pX = X/(X+Y)
	pY = Y/(X+Y)

	IC = 1 + pX * log2(pX) +
	pY * log2(pY)

	return(IC)
	}


	e = e %>%
	group_by(ID) %>%
	mutate(gIC = IC(gCF, gDF1, gDF2, gN)) %>%
	mutate(sIC = IC(sCF, sDF1, sDF2, sN))


	# entropy

	ENT = function(CF, DF1, DF2, N){

	CF = CF * N / 100
	DF1 = DF1 * N / 100
	DF2 = DF2 * N / 100

	return(entropy(c(CF, DF1, DF2)))

	}

	ENTC = function(CF, DF1, DF2, N){

	maxent = 1.098612

	CF = CF * N / 100
	DF1 = DF1 * N / 100
	DF2 = DF2 * N / 100

	ent = entropy(c(CF, DF1, DF2))

	entc = 1 - (ent / maxent)

	return(entc)

	}

	e = e %>%
	group_by(ID) %>%
	mutate(sENT = ENT(sCF, sDF1, sDF2, sN)) %>%
	mutate(sENTC = ENTC(sCF, sDF1, sDF2, sN))

	head(e)

	# plot it
	ggpairs(e, columns = c(2, 6, 10, 12, 13, 14, 15, 16, 17))