nmatzke/num_unobservable_patterns_v2.R

## num_unobservable_patterns_v2.R
#######################################################
# Calculating the number of unobservable site patterns
# for the Mk-parsimony-informative ascertainment bias
# correction
#
# by Nick Matzke, 2015-2016
#
# Free to re-use; we are preparing an ms on this issue,
# please email nick.matzke@anu.edu.au if interested.
#
#######################################################


# Allman et al. 2010 as Mk pars-inf
#
# Allman ES, Holder MT, Rhodes JA (2010).
# Estimating trees from filtered data: Identifiability of models
# for morphological phylogenetics. Journal of Theoretical Biology
# 263 (1):108-119


#######################################################
# Permutations
# http://stackoverflow.com/questions/7906332/how-to-calculate-combination-and-permutation-in-r
#######################################################
perm <- function(n, k)
	{
	return(factorial(n) / factorial(n-k))
	}

# Number of unobservable patterns in Mk-parsimony-informative ascertainment bias correction
num_unobservable_patterns_ParsInf <- function(ntaxa, nstates, printflag=TRUE)
	{
	defaults='
	ntaxa=5
	nstates=2
	printflag=TRUE
	'

	total_number_of_possible_patterns = nstates^ntaxa
	number_of_unobservable_patterns_given_nstates_observed = 0
	number_of_unobservable_patterns_given_j_states_in_pattern = 0
	number_of_unobservable_patterns_given_j_states_in_pattern_AND_patterns_are_uniform = 0

	# Initialize table
	pattern_counts_table = NULL

	# First, get the number of states in the patterns
	for (i in nstates:1)
		{
		# Then, make a list of the number of combinations of states that are possible
		number_of_combinations_of_states = choose(n=nstates, k=i)

		# For each of these combinations, 1 of the states will be the
		# dominant state, and (i-1) will be autapomorphic states
		# This can happen nstates i choose 1 ways
		number_of_ways_to_pick_the_dominant_state = choose(n=i, k=1)

		# After a dominant state is chosen, there are ntaxa choose (i-1)
		# ways to pick the autapomorphic taxa
		number_of_ways_to_pick_autapomorphic_taxa = choose(n=ntaxa, k=(i-1))

		# Given that the autapomorphic taxa have been picked, there are
		# (i-1) PERMUTE (i-1) ways to assign the autapomorphic states
		number_of_ways_to_assign_autapo_states_to_autapo_taxa = perm(n=(i-1), k=(i-1))

		product = number_of_combinations_of_states * number_of_ways_to_pick_the_dominant_state * number_of_ways_to_pick_autapomorphic_taxa * number_of_ways_to_assign_autapo_states_to_autapo_taxa

		tmprow = c(number_of_combinations_of_states, number_of_ways_to_pick_the_dominant_state, number_of_ways_to_pick_autapomorphic_taxa, number_of_ways_to_assign_autapo_states_to_autapo_taxa, product)

		pattern_counts_table = rbind(pattern_counts_table, tmprow)
		}


	abbreviations = c("ncomb_sts", "npicks_domstate", "npicks_autapo_taxa", "n_assign_autapo_states", "product")
	meaning = c("number of states in the pattern", "number_of_combinations_of_states", "number_of_ways_to_pick_the_dominant_state", "number_of_ways_to_pick_autapomorphic_taxa", "number_of_ways_to_assign_autapo_states_to_autapo_taxa", "number of unobservable patterns given this number of states in the pattern")
	headers_explained = cbind(c("(row names)", abbreviations), meaning)
	headers_explained = as.data.frame(headers_explained, stringsAsFactors=FALSE)
	names(headers_explained) = c("abbreviation", "meaning")
	row.names(headers_explained) = NULL


	# Label the table
	pattern_counts_table = as.data.frame(pattern_counts_table, stringsAsFactors=FALSE)
	names(pattern_counts_table) = abbreviations
	row.names(pattern_counts_table) = paste0("when_nstates=", nstates:1)

	number_of_unobservable_patterns = sum(pattern_counts_table$product)

	percentage_of_patterns_unobservable = number_of_unobservable_patterns / total_number_of_possible_patterns * 100

	summary_table = matrix(data=c(ntaxa, nstates, number_of_unobservable_patterns, total_number_of_possible_patterns, percentage_of_patterns_unobservable), nrow=1)
	summary_table = as.data.frame(summary_table, stringsAsFactors=FALSE)
	names(summary_table) = c("ntaxa", "nstates", "number_of_unobservable_patterns", "total_number_of_possible_patterns", "percentage_of_patterns_unobservable")

	res = NULL
	res$headers_explained = headers_explained
	res$pattern_counts_table = pattern_counts_table
	res$summary_table = summary_table

	if (printflag == TRUE)
		{
		require(BioGeoBEARS)
		cat("\n\nTable counting the number of unobservable patterns:\n\n")
		print(conditional_format_table(pattern_counts_table))
		cat("\n\nExplanation of the table headers:\n\n")
		print(headers_explained)
		cat("\n\nSummary of possible and unobservable patterns:\n\n")
		print(conditional_format_table(summary_table))
		}


	return(res)
	}
	#######################################################
	# Calculating the number of unobservable site patterns
	# for the Mk-parsimony-informative ascertainment bias
	# correction
	#
	# by Nick Matzke, 2015-2016
	#
	# Free to re-use; we are preparing an ms on this issue,
	# please email nick.matzke@anu.edu.au if interested.
	#
	#######################################################


	# Allman et al. 2010 as Mk pars-inf
	#
	# Allman ES, Holder MT, Rhodes JA (2010).
	# Estimating trees from filtered data: Identifiability of models
	# for morphological phylogenetics. Journal of Theoretical Biology
	# 263 (1):108-119


	#######################################################
	# Permutations
	# http://stackoverflow.com/questions/7906332/how-to-calculate-combination-and-permutation-in-r
	#######################################################
	perm <- function(n, k)
	{
	return(factorial(n) / factorial(n-k))
	}

	# Number of unobservable patterns in Mk-parsimony-informative ascertainment bias correction
	num_unobservable_patterns_ParsInf <- function(ntaxa, nstates, printflag=TRUE)
	{
	defaults='
	ntaxa=5
	nstates=2
	printflag=TRUE
	'

	total_number_of_possible_patterns = nstates^ntaxa
	number_of_unobservable_patterns_given_nstates_observed = 0
	number_of_unobservable_patterns_given_j_states_in_pattern = 0
	number_of_unobservable_patterns_given_j_states_in_pattern_AND_patterns_are_uniform = 0

	# Initialize table
	pattern_counts_table = NULL

	# First, get the number of states in the patterns
	for (i in nstates:1)
	{
	# Then, make a list of the number of combinations of states that are possible
	number_of_combinations_of_states = choose(n=nstates, k=i)

	# For each of these combinations, 1 of the states will be the
	# dominant state, and (i-1) will be autapomorphic states
	# This can happen nstates i choose 1 ways
	number_of_ways_to_pick_the_dominant_state = choose(n=i, k=1)

	# After a dominant state is chosen, there are ntaxa choose (i-1)
	# ways to pick the autapomorphic taxa
	number_of_ways_to_pick_autapomorphic_taxa = choose(n=ntaxa, k=(i-1))

	# Given that the autapomorphic taxa have been picked, there are
	# (i-1) PERMUTE (i-1) ways to assign the autapomorphic states
	number_of_ways_to_assign_autapo_states_to_autapo_taxa = perm(n=(i-1), k=(i-1))

	product = number_of_combinations_of_states * number_of_ways_to_pick_the_dominant_state * number_of_ways_to_pick_autapomorphic_taxa * number_of_ways_to_assign_autapo_states_to_autapo_taxa

	tmprow = c(number_of_combinations_of_states, number_of_ways_to_pick_the_dominant_state, number_of_ways_to_pick_autapomorphic_taxa, number_of_ways_to_assign_autapo_states_to_autapo_taxa, product)

	pattern_counts_table = rbind(pattern_counts_table, tmprow)
	}


	abbreviations = c("ncomb_sts", "npicks_domstate", "npicks_autapo_taxa", "n_assign_autapo_states", "product")
	meaning = c("number of states in the pattern", "number_of_combinations_of_states", "number_of_ways_to_pick_the_dominant_state", "number_of_ways_to_pick_autapomorphic_taxa", "number_of_ways_to_assign_autapo_states_to_autapo_taxa", "number of unobservable patterns given this number of states in the pattern")
	headers_explained = cbind(c("(row names)", abbreviations), meaning)
	headers_explained = as.data.frame(headers_explained, stringsAsFactors=FALSE)
	names(headers_explained) = c("abbreviation", "meaning")
	row.names(headers_explained) = NULL


	# Label the table
	pattern_counts_table = as.data.frame(pattern_counts_table, stringsAsFactors=FALSE)
	names(pattern_counts_table) = abbreviations
	row.names(pattern_counts_table) = paste0("when_nstates=", nstates:1)

	number_of_unobservable_patterns = sum(pattern_counts_table$product)

	percentage_of_patterns_unobservable = number_of_unobservable_patterns / total_number_of_possible_patterns * 100

	summary_table = matrix(data=c(ntaxa, nstates, number_of_unobservable_patterns, total_number_of_possible_patterns, percentage_of_patterns_unobservable), nrow=1)
	summary_table = as.data.frame(summary_table, stringsAsFactors=FALSE)
	names(summary_table) = c("ntaxa", "nstates", "number_of_unobservable_patterns", "total_number_of_possible_patterns", "percentage_of_patterns_unobservable")

	res = NULL
	res$headers_explained = headers_explained
	res$pattern_counts_table = pattern_counts_table
	res$summary_table = summary_table

	if (printflag == TRUE)
	{
	require(BioGeoBEARS)
	cat("\n\nTable counting the number of unobservable patterns:\n\n")
	print(conditional_format_table(pattern_counts_table))
	cat("\n\nExplanation of the table headers:\n\n")
	print(headers_explained)
	cat("\n\nSummary of possible and unobservable patterns:\n\n")
	print(conditional_format_table(summary_table))
	}


	return(res)
	}